/** @file
Parser: Charset connection decl.
Copyright (c) 2001-2026 Art. Lebedev Studio (https://www.artlebedev.com)
Authors: Konstantin Morshnev <moko@design.ru>, Alexandr Petrosian <paf@design.ru>
*/
#ifndef PA_CHARSET_H
#define PA_CHARSET_H
#define IDENT_PA_CHARSET_H "$Id: pa_charset.h,v 1.63 2026/04/25 13:38:46 moko Exp $"
#include "pa_exception.h"
#include "pa_common.h"
#include "pa_hash.h"
#include "pa_array.h"
#ifdef HAVE_PCRE2
#include <pcre2.h>
#else
#include <pcre.h>
#endif
// we are using some pcre_internal.h stuff as well
#include "../lib/pcre/pa_pcre_internal.h"
#ifdef XML
#include "libxml/xmlstring.h"
#include "libxml/encoding.h"
#endif
// defines
// NOTE(review): MAX_CHARSETS is not used anywhere in this header; presumably an
// upper bound on the number of charsets registered at once -- confirm against the users.
#define MAX_CHARSETS 10
// Capacity of Charset::Tables::toTable (number of Rec entries it can hold).
#define MAX_CHARSET_UNI_CODES 500
// Fallback definitions of the character types used throughout this header.
// NOTE(review): #ifndef only sees preprocessor macros, so these guards skip the
// typedef only if XMLCh / XMLByte were #define'd earlier, not if they were typedef'ed.
#ifndef XMLCh
typedef unsigned int XMLCh;
#endif
#ifndef XMLByte
typedef unsigned char XMLByte;
#endif
// helpers
// Hash keyed by string with String::Body values; taken by one of the Charset::transcode overloads.
typedef HashString<String::Body> HashStringString;
/**
	Charset: keeps a charset name together with its transcode tables,
	registers libxml transcoders (XML builds only).

	Most members are only declared here; their definitions live outside this header.
*/
class Charset: public PA_Object {
public:

	Charset(Request_charsets* charsets, const String::Body ANAME, const String* afile_spec);

	/// @returns the stored charset name (FNAME)
	const String::Body NAME() const {
		return FNAME;
	}

	/// @returns the stored charset name as a C string (FNAME_CSTR)
	const char* NAME_CSTR() const {
		return FNAME_CSTR;
	}

	/// @returns the stored fisUTF8 flag
	bool isUTF8() const {
		return fisUTF8;
	}

	// transcode: one overload per source container type
	static String::C transcode(const String::C src, const Charset& source_charset, const Charset& dest_charset);
	static String::Body transcode(const String::Body src, const Charset& source_transcoder, const Charset& dest_transcoder);
	/// convenience overload: wraps src into String::Body and forwards to the String::Body overload
	static String::Body transcode(const char* src,const Charset& source_transcoder, const Charset& dest_transcoder){
		return transcode(String::Body(src), source_transcoder, dest_transcoder);
	}
	static String& transcode(const String& src, const Charset& source_transcoder, const Charset& dest_transcoder);
	static void transcode(ArrayString& src, const Charset& source_transcoder, const Charset& dest_transcoder);
	static void transcode(HashStringString& src, const Charset& source_transcoder, const Charset& dest_transcoder);

	// escape: public entry points
	static String::C escape(const String::C src, const Charset& source_charset);
	static String::Body escape(const String::Body src, const Charset& source_charset);
	static String& escape(const String& src, const Charset& source_charset);

	// escape_JSON: public entry points
	static String::C escape_JSON(const String::C src, const Charset& source_charset);
	static String::Body escape_JSON(const String::Body src, const Charset& source_charset);
	static String& escape_JSON(const String& src, const Charset& source_charset);

	void store_Char(XMLByte*& outPtr, XMLCh src, XMLByte not_found);

	/// table buffer sized by tables_length (see the pcre internal header included by this file)
	unsigned char pcre_tables[tables_length];

private:

	void load_definition(Request_charsets& charsets, const String& afile_spec);
	void sort_ToTable();

	const String::C transcodeToUTF8(const String::C src) const;
	const String::C transcodeFromUTF8(const String::C src) const;
	const String::C transcodeToCharset(const String::C src, const Charset& dest_transcoder) const;

public:

	/// per-charset conversion tables
	struct Tables {
		/// one (intCh, extCh) pair of toTable
		struct Rec {
			XMLCh intCh;
			XMLByte extCh;
		};

		/// 0x100 entries, i.e. one XMLCh slot per possible byte value
		XMLCh fromTable[0x100];
		/// storage for up to MAX_CHARSET_UNI_CODES records
		Rec toTable[MAX_CHARSET_UNI_CODES];
		/// presumably the number of toTable records in use -- confirm against load_definition
		uint toTableSize;
	};

	/// table of (from, to) pairs, see UTF8CaseToUpper / UTF8CaseToLower
	struct UTF8CaseTable {
		struct Rec {
			XMLCh from, to;
		};

		uint size;
		Rec* records;
	};

private:

	const String::Body FNAME;
	char* FNAME_CSTR;
	bool fisUTF8;
	Tables tables;

	// escape helpers: length calculation + the actual escaping, UTF-8 and table-driven flavours
	static size_t calc_escaped_length_UTF8(XMLByte* src, size_t src_length);
	static size_t calc_escaped_length(const XMLByte* src, size_t src_length, const Charset::Tables& tables);
	static size_t calc_escaped_length(const String::C src, const Charset& source_charset);
	static size_t escape_UTF8(const XMLByte* src, size_t src_length, XMLByte* dest);
	static size_t escape(const XMLByte* src, size_t src_length, XMLByte* dest, const Charset::Tables& tables);

	// escape_JSON helpers: same set as above
	static size_t calc_JSON_escaped_length_UTF8(XMLByte* src, size_t src_length);
	static size_t calc_JSON_escaped_length(const XMLByte* src, size_t src_length, const Charset::Tables& tables);
	static size_t calc_JSON_escaped_length(const String::C src, const Charset& source_charset);
	static size_t escape_JSON_UTF8(const XMLByte* src, size_t src_length, XMLByte* dest);
	static size_t escape_JSON(const XMLByte* src, size_t src_length, XMLByte* dest, const Charset::Tables& tables);

#ifdef XML

private:

	void addEncoding(char* name_cstr);

public:

	/// xmlChar* null-terminated string -> char*
	String::C transcode_cstr(const xmlChar* s);

	/// xmlChar* null-terminated string -> parser String
	const String& transcode(const xmlChar* s);

	/**
		sized char* -> xmlChar*
		@returns xmlChar* WHICH CALLER SHOULD FREE
	*/
	xmlChar* transcode_buf2xchar(const char* buf, size_t buf_size);

	/// parser String -> xmlChar*
	xmlChar* transcode(const String& s);

	/// parser String::Body -> xmlChar*
	xmlChar* transcode(const String::Body s);

private:

	xmlCharEncodingInputFunc ftranscoder_input;
	xmlCharEncodingOutputFunc ftranscoder_output;

#endif
};
// externs
// Case tables defined outside this header; each holds `size` (from, to) records.
// NOTE(review): judging by the names these map characters to upper / lower case -- confirm at the definitions.
extern Charset::UTF8CaseTable UTF8CaseToUpper;
extern Charset::UTF8CaseTable UTF8CaseToLower;
// Takes a source buffer (srcData, srcLen), a destination buffer (toFill, toFillLen) and a case table.
// NOTE(review): presumably writes the case-converted copy of srcData into toFill -- definition is not in this header.
void change_case_UTF8(const XMLByte* srcData, size_t srcLen,
XMLByte* toFill, size_t toFillLen,
const Charset::UTF8CaseTable& table);
// charPos is a position counted in characters; presumably returns the matching byte offset within [srcBegin, srcEnd) -- confirm.
size_t getUTF8BytePos(const XMLByte* srcBegin, const XMLByte* srcEnd, size_t charPos/*position in characters*/);
// bytePos is a position counted in bytes; presumably returns the matching character index within [srcBegin, srcEnd) -- confirm.
size_t getUTF8CharPos(const XMLByte* srcBegin, const XMLByte* srcEnd, size_t bytePos/*position in bytes*/);
// NOTE(review): presumably the length of [srcBegin, srcEnd) in characters -- confirm.
size_t lengthUTF8(const XMLByte* srcBegin, const XMLByte* srcEnd);
// NOTE(review): presumably the byte length of the UTF-8 sequence that starts with byte c -- confirm.
unsigned int lengthUTF8Char(const XMLByte c);
// NOTE(review): semantics and ownership of the returned pointer are not visible here -- check the definition before use.
const char *fixUTF8(const char *src);
class UTF8_string_iterator {
public:
UTF8_string_iterator(const String& astring): fsrcPtr((XMLByte*)astring.cstr()), fsrcEnd(fsrcPtr + astring.length()) {}
UTF8_string_iterator(XMLByte* asrcPtr, size_t length): fsrcPtr(asrcPtr), fsrcEnd(fsrcPtr + length) {}
bool has_next();
XMLCh next() { return fUTF8Char; }
XMLByte getFirstByte(){ return ffirstByte; }
size_t getCharSize(){ return fcharSize; }
private:
const XMLByte* fsrcPtr;
const XMLByte* fsrcEnd;
size_t fcharSize;
XMLByte ffirstByte;
XMLCh fUTF8Char;
};
#endif
// E-mail: