parser3/src/main/pa_charset.C - view

File: [parser3project] / parser3 / src / main / pa_charset.C
Revision 1.118: download - view: text, annotated - select for diffs - revision graph
Sat Apr 25 13:38:46 2026 UTC (5 weeks, 3 days ago) by moko
Branches: MAIN
CVS tags: HEAD

Copyright year updated, website links changed to https://

/** @file
	Parser: Charset connection implementation.

	Copyright (c) 2001-2026 Art. Lebedev Studio (https://www.artlebedev.com)
	Authors: Konstantin Morshnev <moko@design.ru>, Alexandr Petrosian <paf@design.ru>
*/

#include "pa_charset.h"
#include "pa_charsets.h"

// we are using some pcre_internal.h stuff as well
#include "../lib/pcre/pa_pcre_internal.h"

volatile const char * IDENT_PA_CHARSET_C="$Id: pa_charset.C,v 1.118 2026/04/25 13:38:46 moko Exp $" IDENT_PA_CHARSET_H;

#ifdef XML
#include "libxml/xmlmemory.h"
#include "libxml/encoding.h"
#endif

// reduce memory usage by pre-calculating the utf-8 string length
#define PRECALCULATE_DEST_LENGTH

// globals

Charset::UTF8CaseTable::Rec UTF8CaseToUpperRecords[]={
#include "utf8-to-upper.inc"
};
Charset::UTF8CaseTable UTF8CaseToUpper={
	sizeof(UTF8CaseToUpperRecords)/sizeof(Charset::UTF8CaseTable::Rec),
	UTF8CaseToUpperRecords};

Charset::UTF8CaseTable::Rec UTF8CaseToLowerRecords[]={
#include "utf8-to-lower.inc"
};
Charset::UTF8CaseTable UTF8CaseToLower={
	sizeof(UTF8CaseToLowerRecords)/sizeof(Charset::UTF8CaseTable::Rec),
	UTF8CaseToLowerRecords};

// helpers

/// Initializes the lower-case and flip-case sections of a pcre table set
/// to the identity mapping (every byte maps to itself).
inline void prepare_case_tables(unsigned char *tables) {
	unsigned char *lcc_table=tables+lcc_offset;
	unsigned char *fcc_table=tables+fcc_offset;
	for(int i=0; i<0x100; i++)
		lcc_table[i]=fcc_table[i]=(unsigned char)i;
}

/// Sets @a bit in the ctypes section for byte 0 and ORs it in for every byte of @a cstr.
inline void cstr2ctypes(unsigned char *tables, const unsigned char *cstr, unsigned char bit) {
	unsigned char *ctypes_table=tables+ctypes_offset;

	ctypes_table[0]=bit;
	for(; *cstr; cstr++) {
		unsigned char c=*cstr;
		ctypes_table[c]|=bit;
	}
}

/// Converts a definition-file cell to a character code:
/// empty/NULL -> 0, a single character -> its byte value, otherwise the number parsed by pa_atoui.
inline unsigned int to_wchar_code(const char* cstr) {
	if(!cstr || !*cstr)
		return 0;
	if(cstr[1]==0)
		return(unsigned int)(unsigned char)cstr[0];

	return pa_atoui(cstr,0);
}

/// A definition-file cell is "true" when it is present and non-empty.
inline bool to_bool(const char* cstr) {
	return cstr && *cstr!=0;
}

/// When @a belongs is set, marks byte @a c with @a bit in the ctypes section and,
/// if @a group_offset is non-negative, sets the matching bit in that cbits group.
static void element2ctypes(unsigned char c, bool belongs,
			   unsigned char *tables, unsigned char bit, int group_offset=-1) {
	if(!belongs)
		return;

	unsigned char *ctypes_table=tables+ctypes_offset;
	ctypes_table[c]|=bit;

	if(group_offset>=0)
		tables[cbits_offset+group_offset+c/8] |= 1<<(c%8);
}

/// Records a case pair: @a from lower-cases to @a to, and the two flip into each other.
/// A zero @a to means "no case mapping" and is ignored.
static void element2case(unsigned char from, unsigned char to, unsigned char *tables) {
	if(!to)
		return;

	unsigned char *lcc_table=tables+lcc_offset;
	unsigned char *fcc_table=tables+fcc_offset;
	lcc_table[from]=to;
	fcc_table[from]=to;
	fcc_table[to]=from;
}

/// Writes the optional @a prefix followed by two hex digits of @a c; returns the new end of output.
inline XMLByte *append_hex_8(XMLByte *dest, unsigned char c, const char* prefix=0) {
	if(prefix) {
		strcpy((char *)dest, prefix);
		dest+=strlen(prefix);
	}
	*dest++=hex_digits[c >> 4];
	*dest++=hex_digits[c & 0x0F];
	return dest;
}

/// Writes the optional @a prefix followed by four hex digits of the low 16 bits of @a c;
/// returns the new end of output.
inline XMLByte *append_hex_16(XMLByte *dest, unsigned int c, const char* prefix=0) {
	if(prefix) {
		strcpy((char *)dest, prefix);
		dest+=strlen(prefix);
	}
	*dest++=hex_digits[(c >> 12) & 0x0F];
	*dest++=hex_digits[(c >> 8) & 0x0F];
	*dest++=hex_digits[(c >> 4) & 0x0F];
	*dest++=hex_digits[(c) & 0x0F];
	return dest;
}

/// Appends one unicode<->byte record to tables.toTable, refusing to write past its capacity.
/// NOTE(review): assumes toTable holds MAX_CHARSET_UNI_CODES records (declared in pa_charset.h) - confirm.
static void append_to_table(Charset::Tables& tables, XMLCh unicode, XMLByte c, const String& afile_spec) {
	if(tables.toTableSize>=MAX_CHARSET_UNI_CODES)
		throw Exception(PARSER_RUNTIME,
			&afile_spec,
			"charset must contain not more than %d unicode values", MAX_CHARSET_UNI_CODES);

	tables.toTable[tables.toTableSize].intCh=unicode;
	tables.toTable[tables.toTableSize].extCh=c;
	tables.toTableSize++;
}

// methods

/// With a definition file the charset is a table-driven 8-bit one (and, under XML, gets
/// registered as an XML encoding); without one it is treated as UTF-8.
Charset::Charset(Request_charsets* acharsets, const String::Body ANAME, const String* afile_spec):
	FNAME(ANAME),
	FNAME_CSTR(ANAME.cstrm()) {

	if(afile_spec) {
		fisUTF8=false;
		load_definition(*acharsets, *afile_spec);
#ifdef XML
		addEncoding(FNAME_CSTR);
#endif
	} else {
		fisUTF8=true;
		// grab default ones [for UTF-8 so to be able to make a-z => A-Z]
		memcpy(pcre_tables, pa_pcre_default_tables, sizeof(pcre_tables));
	}
}

/// Fills pcre_tables and the unicode translation tables from a tab-separated definition file.
/// Throws PARSER_RUNTIME when the file declares more unicode values than the table can hold.
void Charset::load_definition(Request_charsets& acharsets, const String& afile_spec) {
	// pcre_tables
	// lowcase, flipcase, bits digit+word+whitespace, masks

	// must not move this inside of prepare_case_tables
	// don't know the size there
	memset(pcre_tables, 0, sizeof(pcre_tables));
	prepare_case_tables(pcre_tables);
	cstr2ctypes(pcre_tables,(const unsigned char *)"*+?{^.$|()[", ctype_meta);

	// charset
	memset(&tables, 0, sizeof(tables));

	// loading text
	char *data=file_read_text(acharsets, afile_spec);

	// ignore header
	getrow(&data);

	// parse cells
	char *row;
	while((row=getrow(&data))) {
		// remove empty&comment lines
		if(!*row || *row=='#')
			continue;

		// columns: char white-space digit hex-digit letter word lowercase unicode1 unicode2
		unsigned char c=0;
		char *cell;
		for(int column=0; (cell=lsplit(&row, '\t')); column++) {
			switch(column) {
				case 0: c=(unsigned char)to_wchar_code(cell); break;
				// pcre_tables
				case 1: element2ctypes(c, to_bool(cell), pcre_tables, ctype_space, cbit_space); break;
				case 2: element2ctypes(c, to_bool(cell), pcre_tables, ctype_digit, cbit_digit); break;
				case 3: element2ctypes(c, to_bool(cell), pcre_tables, ctype_xdigit); break;
				case 4: element2ctypes(c, to_bool(cell), pcre_tables, ctype_letter); break;
				case 5: element2ctypes(c, to_bool(cell), pcre_tables, ctype_word, cbit_word); break;
				case 6: element2case(c, (unsigned char)to_wchar_code(cell), pcre_tables); break;
				case 7:
				case 8: {
					// charset
					XMLCh unicode=(XMLCh)to_wchar_code(cell);
					if(!unicode && column==7/*unicode1 column*/)
						unicode=(XMLCh)c;
					if(unicode) {
						if(!tables.fromTable[c])
							tables.fromTable[c]=unicode;
						// capacity is checked only when a record is really appended,
						// so a file with exactly the maximum number of values is accepted
						append_to_table(tables, unicode, (XMLByte)c, afile_spec);
					}
					break;
				}
			}
		}
	}

	// parser charset tables declare only white-space before 0x20, thus adding the missing chars
	for(uint i=0; i<0x20; i++)
		if(!tables.fromTable[i]){
			tables.fromTable[i]=i;
			append_to_table(tables, i, (XMLByte)i, afile_spec);
		}

	// sort by the Unicode code point
	sort_ToTable();
}

/// qsort comparator ordering translation records by ascending unicode value.
/// Uses comparisons rather than subtraction so large unsigned values cannot wrap.
static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
	const XMLCh left=static_cast<const Charset::Tables::Rec *>(a)->intCh;
	const XMLCh right=static_cast<const Charset::Tables::Rec *>(b)->intCh;
	return (left>right)-(left<right);
}

void Charset::sort_ToTable() {
	qsort(tables.toTable, tables.toTableSize, sizeof(*tables.toTable), sort_cmp_Trans_rec_intCh);
}

// @todo: precache for speedup searching
/// Binary search of the (sorted) toTable for unicode value @a toXlat;
/// returns the charset byte, or @a not_found when the value is not in the table.
static XMLByte xlatOneTo(const XMLCh toXlat, const Charset::Tables& tables, XMLByte not_found) {
	int lo = 0;
	int hi = tables.toTableSize - 1;
	while(lo<=hi) {
		// Calc the mid point of the low and high offset.
		const int i = lo + (hi - lo) / 2;

		XMLCh cur=tables.toTable[i].intCh;
		if(toXlat==cur)
			return tables.toTable[i].extCh;
		if(toXlat>cur)
			lo = i+1;
		else
			hi = i-1;
	}

	return not_found;
}

/// Transcodes @a src between two charsets, picking the conversion by whether each side is UTF-8.
/// An empty source yields an empty string; UTF-8 to UTF-8 returns @a src unchanged.
String::C Charset::transcode(const String::C src, const Charset& source_charset, const Charset& dest_charset) {
	if(!src.length)
		return String::C("", 0);

	switch((source_charset.isUTF8()?0x10:0x00)|(dest_charset.isUTF8()?0x01:0x00)) {
		default: // 0x00
			return source_charset.transcodeToCharset(src, dest_charset);
		case 0x01:
			return source_charset.transcodeToUTF8(src);
		case 0x10:
			return dest_charset.transcodeFromUTF8(src);
		case 0x11:
			return src;
	}
}

// ---------------------------------------------------------------------------
//  Local static data
//
//  gUTFBytes
//      A list of counts of trailing bytes for each initial byte in the input.
//
//  gUTFOffsets
//      A list of values to offset each result char type, according to how
//      many source bytes went into making it.
//
//  gFirstByteMark
//      A list of values to mask onto the first byte of an encoded sequence,
//      indexed by the number of bytes used to create the sequence.
// --------------------------------------------------------------------------- static const XMLByte gUTFBytes[0x100] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 }; static const uint gUTFOffsets[6] = { 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080 }; static const XMLByte gFirstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; static int transcodeToUTF8(const XMLByte* srcData, int& srcLen, XMLByte *toFill, int& toFillLen, const Charset::Tables& tables) { const XMLByte* srcPtr=srcData; const XMLByte* srcEnd=srcData+srcLen; XMLByte* outPtr=toFill; XMLByte* outEnd=toFill+toFillLen; while(srcPtr<srcEnd) { uint curVal = tables.fromTable[*srcPtr]; if(!curVal) { // use the replacement character *outPtr++= '?'; srcPtr++; continue; } // Figure out how many bytes we need unsigned int encodedBytes; if(curVal<0x80) encodedBytes = 1; else if(curVal<0x800) encodedBytes = 2; else if(curVal<0x10000) encodedBytes = 3; else if(curVal<0x200000) encodedBytes = 4; else if(curVal<0x4000000) encodedBytes = 5; else if(curVal<= 0x7FFFFFFF) encodedBytes = 6; else { // use the replacement character *outPtr++= '?'; srcPtr++; continue; } // If we cannot fully get this char into the output buffer if (outPtr + encodedBytes > outEnd) break; // We can do 
it, so update the source index srcPtr++; // And spit out the bytes. We spit them out in reverse order // here, so bump up the output pointer and work down as we go. outPtr+= encodedBytes; switch(encodedBytes) { case 6: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL); curVal>>= 6; case 5: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL); curVal>>= 6; case 4: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL); curVal>>= 6; case 3: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL); curVal>>= 6; case 2: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL); curVal>>= 6; case 1: *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]); } // Add the encoded bytes back in again to indicate we've eaten them outPtr+= encodedBytes; } // Update the bytes eaten srcLen = srcPtr - srcData; // Return the characters read toFillLen = outPtr - toFill; //return srcPtr==srcEnd?(int)toFillLen:-1; /* xmlCharEncodingInputFunc Returns : the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number of ocetes consumed. 
*/ return 0; } /// @todo digital entites only when xml/html output [at output in html/xml mode, in html part of a letter] static int transcodeFromUTF8(const XMLByte* srcData, int& srcLen, XMLByte* toFill, int& toFillLen, const Charset::Tables& tables) { const XMLByte* srcPtr=srcData; const XMLByte* srcEnd=srcData+srcLen; XMLByte* outPtr=toFill; XMLByte* outEnd=toFill+toFillLen; // We now loop until we either run out of input data, or room to store while ((srcPtr < srcEnd) && (outPtr < outEnd)) { // Get the next leading byte out const XMLByte firstByte =* srcPtr; // Special-case ASCII, which is a leading byte value of<= 127 if(firstByte<=127) { *outPtr++= firstByte; srcPtr++; continue; } // See how many trailing src bytes this sequence is going to require const unsigned int trailingBytes = gUTFBytes[firstByte]; // If there are not enough source bytes to do this one, then we // are done. Note that we done>= here because we are implicitly // counting the 1 byte we get no matter what. if(srcPtr+trailingBytes>= srcEnd) break; // Looks ok, so lets build up the value uint tmpVal=0; switch(trailingBytes) { case 5: tmpVal+=*srcPtr++; tmpVal<<=6; case 4: tmpVal+=*srcPtr++; tmpVal<<=6; case 3: tmpVal+=*srcPtr++; tmpVal<<=6; case 2: tmpVal+=*srcPtr++; tmpVal<<=6; case 1: tmpVal+=*srcPtr++; tmpVal<<=6; case 0: tmpVal+=*srcPtr++; break; default: throw Exception(0, 0, "transcodeFromUTF8 error: wrong trailingBytes value(%d)", trailingBytes); // never } tmpVal-=gUTFOffsets[trailingBytes]; // If it will fit into a single char, then put it in. Otherwise // fail [*encode it as a surrogate pair. 
If its not valid, use the // replacement char.*] if(!(tmpVal & 0xFFFF0000)) { if(XMLByte xlat=xlatOneTo(tmpVal, tables, 0)) *outPtr++=xlat; else { outPtr+=sprintf((char *)outPtr, "&#%u;", tmpVal); // &#decimal; } } else { const XMLByte* recoverPtr=srcPtr-trailingBytes-1; for(uint i=0; i<=trailingBytes; i++) outPtr+=sprintf((char*)outPtr, "%%%02X", *recoverPtr++); } } // Update the bytes eaten srcLen = srcPtr - srcData; // Return the characters read toFillLen = outPtr - toFill; //return srcPtr==srcEnd?(int)toFillLen:-1; /* xmlCharEncodingOutputFunc Returns : the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number of ocetes consumed. */ return 0; } static bool need_escape(XMLByte c){ return !( (c<=127) && ( pa_isalnum((unsigned char)c) || strchr("*@-_+./", c)!=0 ) ); } // read one UTF8 char and return length of this char (in bytes) static unsigned int readUTF8Char(const XMLByte*& srcPtr, const XMLByte* srcEnd, XMLByte& firstByte, XMLCh& UTF8Char){ if(!srcPtr || !*srcPtr || srcPtr>=srcEnd) return 0; firstByte=*srcPtr; if(firstByte<=127){ UTF8Char=firstByte; srcPtr++; return 1; } unsigned int trailingBytes=gUTFBytes[firstByte]; if(srcPtr+trailingBytes>=srcEnd){ return 0; // not enough bytes in source string for reading } uint tmpVal=0; switch(trailingBytes){ case 5: tmpVal+=*srcPtr++; tmpVal<<=6; case 4: tmpVal+=*srcPtr++; tmpVal<<=6; case 3: tmpVal+=*srcPtr++; tmpVal<<=6; case 2: tmpVal+=*srcPtr++; tmpVal<<=6; case 1: tmpVal+=*srcPtr++; tmpVal<<=6; case 0: tmpVal+=*srcPtr++; } tmpVal-=gUTFOffsets[trailingBytes]; UTF8Char=tmpVal; return trailingBytes+1; } // skip UTF8 char and return length of this char (in bytes) static unsigned int skipUTF8Char(const XMLByte*& srcPtr, const XMLByte* srcEnd){ if(!srcPtr || !*srcPtr || srcPtr>=srcEnd) return 0; unsigned int 
trailingBytes=gUTFBytes[*srcPtr]+1; srcPtr+=trailingBytes; return trailingBytes; } // read non-UTF8 char, and return number of bytes needed for storing this char in UTF8 static unsigned int readChar(const XMLByte*& srcPtr, const XMLByte* srcEnd, XMLByte& firstByte, XMLCh& UTF8Char, const Charset::Tables& tables){ if(!srcPtr || !*srcPtr || srcPtr>=srcEnd) return 0; firstByte=*srcPtr++; UTF8Char=tables.fromTable[firstByte]; if(UTF8Char<0x80) return 1; else if(UTF8Char<0x800) return 2; else if(UTF8Char<0x10000) return 3; else if(UTF8Char<0x200000) return 4; else if(UTF8Char<0x4000000) return 5; else if(UTF8Char<= 0x7FFFFFFF) return 6; // will use the replacement character '?' firstByte=0; return 1; } size_t Charset::calc_escaped_length_UTF8(XMLByte* src, size_t src_length){ size_t dest_length=0; for(UTF8_string_iterator i(src, src_length); i.has_next(); ){ if(i.getCharSize()==1) dest_length+=!need_escape(i.getFirstByte())?1/*as-is*/:3/*%XX*/; else dest_length+=6; // %uXXXX } return dest_length; } size_t Charset::calc_escaped_length(const XMLByte* src, size_t src_length, const Charset::Tables& tables){ const XMLByte* src_end=src+src_length; XMLByte first_byte; XMLCh UTF8_char; size_t dest_length=0; while(uint char_size=readChar(src, src_end, first_byte, UTF8_char, tables)){ if(char_size==1) dest_length+=(!first_byte/*replacement char '?'*/ || !need_escape(first_byte))?1:3/*'%XX'*/; else dest_length+=6; // %uXXXX } return dest_length; } size_t Charset::calc_escaped_length(const String::C src, const Charset& source_charset){ if(!src.length) return 0; #ifdef PRECALCULATE_DEST_LENGTH if(source_charset.isUTF8()) return calc_escaped_length_UTF8((XMLByte *)src.str, src.length); else return calc_escaped_length((XMLByte *)src.str, src.length, source_charset.tables); #else return src_length*6; // enough for %uXXXX but too memory-hungry #endif } #define escape_char(dest_ptr, char_size, first_byte, UTF8_char) \ if(char_size==1) \ if(first_byte){ \ if(need_escape(first_byte)) \ 
dest_ptr=append_hex_8(dest_ptr, first_byte, "%"); /* %XX */ \ else \ *dest_ptr++=first_byte; /*as is*/ \ } else \ *dest_ptr++='?'; /* replacement char '?' */ \ else \ dest_ptr=append_hex_16(dest_ptr, UTF8_char, "%u"); /* %uXXXX */ size_t Charset::escape_UTF8(const XMLByte* src, size_t src_length, XMLByte* dest) { XMLByte* dest_ptr=dest; // loop until we either run out of input data for(UTF8_string_iterator i((XMLByte *)src, src_length); i.has_next(); ) escape_char(dest_ptr, i.getCharSize(), i.getFirstByte(), i.next()) return dest_ptr - dest; } size_t Charset::escape(const XMLByte* src, size_t src_length, XMLByte* dest, const Charset::Tables& tables) { const XMLByte* src_end=src+src_length; XMLByte* dest_ptr=dest; XMLByte first_byte; XMLCh UTF8_char; uint char_size; while(char_size=readChar(src, src_end, first_byte, UTF8_char, tables)) escape_char(dest_ptr, char_size, first_byte, UTF8_char) return dest_ptr - dest; } String::C Charset::escape(const String::C src, const Charset& source_charset){ if(!src.length) return String::C("", 0); size_t dest_calculated_length=calc_escaped_length(src, source_charset); XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_calculated_length+1/*terminator*/]; size_t dest_length; if(source_charset.isUTF8()) dest_length=escape_UTF8((XMLByte *)src.str, src.length, dest_body); else dest_length=escape((XMLByte *)src.str, src.length, dest_body, source_charset.tables); if(dest_length>dest_calculated_length) throw Exception(0, 0, "Charset::escape buffer overflow"); dest_body[dest_length]=0; // terminator return String::C((char*)dest_body, dest_length); } String::Body Charset::escape(const String::Body src, const Charset& source_charset) { String::C dest=Charset::escape(String::C(src.cstr(), src.length()), source_charset); return String::Body(dest.length ? 
dest.str:0); } String& Charset::escape(const String& src, const Charset& source_charset) { if(src.is_empty()) return *new String(); return *new String(escape((String::Body)src, source_charset), String::L_CLEAN); } inline bool need_json_escape(unsigned char c){ return strchr("\n\"\\/\t\r\b\f", c)!=0; } size_t Charset::calc_JSON_escaped_length_UTF8(XMLByte* src, size_t src_length){ size_t dest_length=0; for(UTF8_string_iterator i(src, src_length); i.has_next(); ){ if(i.getCharSize()==1){ XMLByte first_byte=i.getFirstByte(); dest_length+=need_json_escape(first_byte) ? 2 : (first_byte < 0x20 && first_byte /* 0 replacement char is '?' */) ? 6 : 1; } else dest_length+=6; // \uXXXX } return dest_length; } size_t Charset::calc_JSON_escaped_length(const XMLByte* src, size_t src_length, const Charset::Tables& tables){ const XMLByte* src_end=src+src_length; XMLByte first_byte; XMLCh UTF8_char; size_t dest_length=0; while(uint char_size=readChar(src, src_end, first_byte, UTF8_char, tables)){ if(char_size==1) dest_length+=need_json_escape(first_byte) ? 2 : (first_byte < 0x20 && first_byte /* 0 replacement char is '?' */) ? 
6 : 1; else dest_length+=6; // \uXXXX } return dest_length; } size_t Charset::calc_JSON_escaped_length(const String::C src, const Charset& source_charset){ if(!src.length) return 0; #ifdef PRECALCULATE_DEST_LENGTH if(source_charset.isUTF8()) return calc_JSON_escaped_length_UTF8((XMLByte *)src.str, src.length); else return calc_JSON_escaped_length((XMLByte *)src.str, src.length, source_charset.tables); #else return src_length*6; // enough for \uXXXX but too memory-hungry #endif } #define escape_char_JSON(dest_ptr, char_size, first_byte, UTF8_char) \ if(char_size==1) \ switch(first_byte){ \ case '\n': *dest_ptr++='\\'; *dest_ptr++='n'; break; \ case '"' : *dest_ptr++='\\'; *dest_ptr++='"'; break; \ case '\\': *dest_ptr++='\\'; *dest_ptr++='\\'; break; \ case '/' : *dest_ptr++='\\'; *dest_ptr++='/'; break; \ case '\t': *dest_ptr++='\\'; *dest_ptr++='t'; break; \ case '\r': *dest_ptr++='\\'; *dest_ptr++='r'; break; \ case '\b': *dest_ptr++='\\'; *dest_ptr++='b'; break; \ case '\f': *dest_ptr++='\\'; *dest_ptr++='f'; break; \ case 0 : *dest_ptr++='?'; break; /*replacement char*/ \ default : if(first_byte < 0x20) dest_ptr=append_hex_16(dest_ptr, UTF8_char, "\\u"); \ else *dest_ptr++=first_byte; \ } \ else \ dest_ptr=append_hex_16(dest_ptr, UTF8_char, "\\u"); // \uXXXX size_t Charset::escape_JSON_UTF8(const XMLByte* src, size_t src_length, XMLByte* dest) { XMLByte* dest_ptr=dest; // loop until we either run out of input data for(UTF8_string_iterator i((XMLByte *)src, src_length); i.has_next(); ) escape_char_JSON(dest_ptr, i.getCharSize(), i.getFirstByte(), i.next()) return dest_ptr - dest; } size_t Charset::escape_JSON(const XMLByte* src, size_t src_length, XMLByte* dest, const Charset::Tables& tables) { const XMLByte* src_end=src+src_length; XMLByte* dest_ptr=dest; XMLByte first_byte; XMLCh UTF8_char; uint char_size; while(char_size=readChar(src, src_end, first_byte, UTF8_char, tables)) escape_char_JSON(dest_ptr, char_size, first_byte, UTF8_char) return dest_ptr - dest; 
} String::C Charset::escape_JSON(const String::C src, const Charset& source_charset){ if(!src.length) return String::C("", 0); size_t dest_calculated_length=calc_JSON_escaped_length(src, source_charset); XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_calculated_length+1/*terminator*/]; size_t dest_length; if(source_charset.isUTF8()) dest_length=escape_JSON_UTF8((XMLByte *)src.str, src.length, dest_body); else dest_length=escape_JSON((XMLByte *)src.str, src.length, dest_body, source_charset.tables); if(dest_length>dest_calculated_length) throw Exception(0, 0, "Charset::escape_JSON buffer overflow"); dest_body[dest_length]=0; // terminator return String::C((char*)dest_body, dest_length); } String::Body Charset::escape_JSON(const String::Body src, const Charset& source_charset) { String::C dest=Charset::escape_JSON(String::C(src.cstr(), src.length()), source_charset); return String::Body(dest.length ? dest.str:0); } String& Charset::escape_JSON(const String& src, const Charset& source_charset) { if(src.is_empty()) return *new String(); return *new String(escape_JSON((String::Body)src, source_charset), String::L_CLEAN); } const String::C Charset::transcodeToUTF8(const String::C src) const { int src_length=src.length; #ifdef PRECALCULATE_DEST_LENGTH int dest_length=0; const XMLByte* srcPtr=(XMLByte*)src.str; const XMLByte* srcEnd=srcPtr+src_length; XMLByte firstByte; XMLCh UTF8Char; while(uint charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char, tables)) dest_length+=charSize; #else int dest_length=src_length*6; // so that surly enough (max utf8 seq len=6) but too memory-hungry #endif #ifndef NDEBUG int saved_dest_length=dest_length; #endif XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/]; if(::transcodeToUTF8( (XMLByte *)src.str, src_length, dest_body, dest_length, tables)<0) throw Exception(0, 0, "Charset::transcodeToUTF8 buffer overflow"); assert(dest_length<=saved_dest_length); dest_body[dest_length]=0; // terminator return 
String::C((char*)dest_body, dest_length); } static XMLCh change_case_UTF8(const XMLCh src, const Charset::UTF8CaseTable& table) { int lo = 0; int hi = table.size - 1; while(lo<=hi) { // Calc the mid point of the low and high offset. const unsigned int i = (lo + hi) / 2; XMLCh cur=table.records[i].from; if(src==cur) return table.records[i].to; if(src>cur) lo = i+1; else hi = i-1; } // not found return src; } static void store_UTF8(XMLCh src, XMLByte*& outPtr){ if(!src) { // use the replacement character *outPtr++= '?'; return; } // Figure out how many bytes we need unsigned int encodedBytes; if(src<0x80) encodedBytes = 1; else if(src<0x800) encodedBytes = 2; else if(src<0x10000) encodedBytes = 3; else if(src<0x200000) encodedBytes = 4; else if(src<0x4000000) encodedBytes = 5; else if(src<= 0x7FFFFFFF) encodedBytes = 6; else { // use the replacement character *outPtr++= '?'; return; } // And spit out the bytes. We spit them out in reverse order // here, so bump up the output pointer and work down as we go. 
outPtr+= encodedBytes; switch(encodedBytes) { case 6: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL); src>>= 6; case 5: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL); src>>= 6; case 4: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL); src>>= 6; case 3: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL); src>>= 6; case 2: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL); src>>= 6; case 1: *--outPtr = XMLByte(src | gFirstByteMark[encodedBytes]); } // Add the encoded bytes back in again to indicate we've eaten them outPtr+= encodedBytes; } static void change_case_UTF8(XMLCh src, XMLByte*& outPtr, const Charset::UTF8CaseTable& table) { store_UTF8(change_case_UTF8(src, table), outPtr); } void change_case_UTF8(const XMLByte* srcData, size_t srcLen, XMLByte* toFill, size_t toFillLen, const Charset::UTF8CaseTable& table) { const XMLByte* srcPtr=srcData; const XMLByte* srcEnd=srcData+srcLen; XMLByte* outPtr=toFill; XMLByte* outEnd=toFill+toFillLen; // We now loop until we either run out of input data, or room to store while ((srcPtr < srcEnd) && (outPtr < outEnd)) { // Get the next leading byte out const XMLByte firstByte =* srcPtr; if(firstByte<=127) { change_case_UTF8(firstByte, outPtr, table); srcPtr++; continue; } // See how many trailing src bytes this sequence is going to require const unsigned int trailingBytes = gUTFBytes[firstByte]; // Looks ok, so lets build up the value uint tmpVal=0; switch(trailingBytes) { case 5: tmpVal+=*srcPtr++; tmpVal<<=6; case 4: tmpVal+=*srcPtr++; tmpVal<<=6; case 3: tmpVal+=*srcPtr++; tmpVal<<=6; case 2: tmpVal+=*srcPtr++; tmpVal<<=6; case 1: tmpVal+=*srcPtr++; tmpVal<<=6; case 0: tmpVal+=*srcPtr++; break; default: throw Exception(0, 0, "change_case_UTF8 error: wrong trailingBytes value(%d)", trailingBytes); } tmpVal-=gUTFOffsets[trailingBytes]; // If it will fit into a single char, then put it in. Otherwise // fail [*encode it as a surrogate pair. 
If its not valid, use the // replacement char.*] if(!(tmpVal & 0xFFFF0000)) change_case_UTF8(tmpVal, outPtr, table); else throw Exception(0, 0, "change_case_UTF8 error: too big tmpVal(0x%08X)", tmpVal); } if(srcPtr!=outPtr) throw Exception(0, 0, "change_case_UTF8 error: end pointers do not match"); } static size_t getDecNumLength(XMLCh UTF8Char){ return (UTF8Char < 100) ?2 :(UTF8Char < 1000) ?3 :(UTF8Char < 10000) ?4 :5; } const String::C Charset::transcodeFromUTF8(const String::C src) const { int src_length=src.length; #ifdef PRECALCULATE_DEST_LENGTH int dest_length=0; for(UTF8_string_iterator i((XMLByte *)src.str, src_length); i.has_next(); ){ dest_length += ( i.next() & 0xFFFF0000 ) ? 3*i.getCharSize() // %XX for each byte : ( xlatOneTo(i.next(), tables, 0) != 0 ) ? 1 // can convert it to a single char : 3+getDecNumLength( i.next() ); // print char as &#XX;, &#XXX;, &#XXXX; or &#XXXXX; } #else // so that surly enough, "&#XXX;" has max ratio (huh? 8 bytes needed for '&#XXXXX;') int dest_length=src_length*6; #endif #ifndef NDEBUG int saved_dest_length=dest_length; #endif XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/]; if(::transcodeFromUTF8( (XMLByte *)src.str, src_length, dest_body, dest_length, tables)<0) throw Exception(0, 0, "Charset::transcodeFromUTF8 buffer overflow"); assert(dest_length<=saved_dest_length); dest_body[dest_length]=0; // terminator return String::C((char*)dest_body, dest_length); } /// transcode using both charsets const String::C Charset::transcodeToCharset(const String::C src, const Charset& dest_charset) const { if(&dest_charset==this) return src; else { size_t dest_length=src.length; XMLByte* dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/]; XMLByte* output=dest_body; const XMLByte* input=(XMLByte *)src.str; while(XMLCh c=*input++) { XMLCh curVal = tables.fromTable[c]; *output++=curVal? 
xlatOneTo(curVal, dest_charset.tables, '?') // OK :'?'; // use the replacement character } dest_body[dest_length]=0; // terminator return String::C((char*)dest_body, dest_length); } } void Charset::store_Char(XMLByte*& outPtr, XMLCh src, XMLByte not_found){ if(isUTF8()) store_UTF8(src, outPtr); else if(char ch=xlatOneTo(src, tables, not_found)) *outPtr++=ch; } #ifdef XML static const Charset::Tables* tables[MAX_CHARSETS]; #define declareXml256ioFuncs(i) \ static int xml256CharEncodingInputFunc##i( \ unsigned char *out, int *outlen, \ const unsigned char *in, int *inlen) { \ return transcodeToUTF8( \ in, *inlen, \ out, *outlen, \ *tables[i]); \ } \ static int xml256CharEncodingOutputFunc##i( \ unsigned char *out, int *outlen, \ const unsigned char *in, int *inlen) { \ return transcodeFromUTF8( \ in, *inlen, \ out, *outlen, \ *tables[i]); \ } declareXml256ioFuncs(0) declareXml256ioFuncs(1) declareXml256ioFuncs(2) declareXml256ioFuncs(3) declareXml256ioFuncs(4) declareXml256ioFuncs(5) declareXml256ioFuncs(6) declareXml256ioFuncs(7) declareXml256ioFuncs(8) declareXml256ioFuncs(9) static xmlCharEncodingInputFunc inputFuncs[MAX_CHARSETS]={ xml256CharEncodingInputFunc0, xml256CharEncodingInputFunc1, xml256CharEncodingInputFunc2, xml256CharEncodingInputFunc3, xml256CharEncodingInputFunc4, xml256CharEncodingInputFunc5, xml256CharEncodingInputFunc6, xml256CharEncodingInputFunc7, xml256CharEncodingInputFunc8, xml256CharEncodingInputFunc9 }; static xmlCharEncodingOutputFunc outputFuncs[MAX_CHARSETS]={ xml256CharEncodingOutputFunc0, xml256CharEncodingOutputFunc1, xml256CharEncodingOutputFunc2, xml256CharEncodingOutputFunc3, xml256CharEncodingOutputFunc4, xml256CharEncodingOutputFunc5, xml256CharEncodingOutputFunc6, xml256CharEncodingOutputFunc7, xml256CharEncodingOutputFunc8, xml256CharEncodingOutputFunc9 }; static size_t handlers_count=0; void Charset::addEncoding(char *name_cstr) { if(handlers_count==MAX_CHARSETS) throw Exception(0, 0, "already allocated %d handlers, no space 
for new encoding '%s'", MAX_CHARSETS, name_cstr); ftranscoder_input=inputFuncs[handlers_count]; ftranscoder_output=outputFuncs[handlers_count]; ::tables[handlers_count++]=&tables; xmlCharEncodingHandler* handler=xmlNewCharEncodingHandler(name_cstr, ftranscoder_input, ftranscoder_output); if(!handler) throw Exception(0, new String(name_cstr, String::L_TAINTED), "unable to register XML encoding handler"); xmlRegisterCharEncodingHandler(handler); } String::C Charset::transcode_cstr(const xmlChar* s) { if(!s) return String::C("", 0); int inlen=strlen((const char*)s); int outlen=inlen*6/*strlen("ÿ")*/; // max #ifndef NDEBUG int saved_outlen=outlen; #endif char *out=new(PointerFreeGC) char[outlen+1]; int error; if(!fisUTF8) { error=ftranscoder_output((unsigned char*)out, &outlen, (const unsigned char*)s, &inlen); } else { memcpy(out, s, outlen=inlen); error=0; } if(error<0) throw Exception(0, 0, "transcode_cstr failed (%d)", error); assert(outlen<=saved_outlen); out[outlen]=0; return String::C(out, outlen); } const String& Charset::transcode(const xmlChar* s) { String::C cstr=transcode_cstr(s); return *new String(cstr.str, String::L_TAINTED); } /// @test less memory using -maybe- xmlParserInputBufferCreateMem xmlChar* Charset::transcode_buf2xchar(const char* buf, size_t buf_size) { xmlChar* out; int outlen; int error; #ifndef NDEBUG int saved_outlen; #endif if(!fisUTF8) { outlen=buf_size*6/*max UTF8 bytes per char*/; #ifndef NDEBUG saved_outlen=outlen; #endif out=(xmlChar*)xmlMalloc(outlen+1); error=ftranscoder_input(out, &outlen, (const unsigned char*)buf, (int*)&buf_size); } else { outlen=buf_size; #ifndef NDEBUG saved_outlen=outlen; #endif out=(xmlChar*)xmlMalloc(outlen+1); memcpy(out, buf, outlen); error=0; } if(error<0) throw Exception(0, 0, "transcode_buf failed (%d)", error); assert(outlen<=saved_outlen); out[outlen]=0; return out; } xmlChar* Charset::transcode(const String& s) { String::Body sbody=s.cstr_to_string_body_untaint(String::L_AS_IS); return 
transcode_buf2xchar(sbody.cstr(), sbody.length()); } xmlChar* Charset::transcode(const String::Body s) { return transcode_buf2xchar(s.cstr(), s.length()); } #endif String::Body Charset::transcode(const String::Body src, const Charset& source_transcoder, const Charset& dest_transcoder) { return String::Body(Charset::transcode(String::C(src.cstr(), src.length()), source_transcoder, dest_transcoder)); } String& Charset::transcode(const String& src, const Charset& source_transcoder, const Charset& dest_transcoder) { if(src.is_empty()) return *new String(); return *new String(transcode((String::Body)src, source_transcoder, dest_transcoder), String::L_CLEAN); } void Charset::transcode(ArrayString& src, const Charset& source_transcoder, const Charset& dest_transcoder) { for(size_t i=0; i<src.count(); i++) src.put(i, &transcode(*src[i], source_transcoder, dest_transcoder)); } #ifndef DOXYGEN struct Transcode_pair_info { const Charset* source_transcoder; const Charset* dest_transcoder; }; #endif static void transcode_pair(HashStringValue::key_type /*akey*/, String::Body& avalue, Transcode_pair_info* info) { avalue=Charset::transcode(avalue, *info->source_transcoder, *info->dest_transcoder); } void Charset::transcode(HashStringString& src, const Charset& source_transcoder, const Charset& dest_transcoder) { Transcode_pair_info info={&source_transcoder, &dest_transcoder}; src.for_each_ref<Transcode_pair_info*>(transcode_pair, &info); } size_t getUTF8BytePos(const XMLByte* srcBegin, const XMLByte* srcEnd, size_t charPos){ const XMLByte* ptr=srcBegin; while(charPos-- && skipUTF8Char(ptr, srcEnd)); return ptr-srcBegin; } size_t getUTF8CharPos(const XMLByte* srcBegin, const XMLByte* srcEnd, size_t bytePos){ size_t charPos=0; const XMLByte* ptr=srcBegin; const XMLByte* ptrEnd=srcBegin+bytePos; while(skipUTF8Char(ptr, srcEnd)){ if(ptr>ptrEnd) return charPos; charPos++; } // scan till end but position in bytes still too low throw Exception(0, 0, "Error conversion byte pos to char 
pos"); } size_t lengthUTF8(const XMLByte* srcBegin, const XMLByte* srcEnd){ size_t size=0; while(skipUTF8Char(srcBegin, srcEnd)) size++; return size; } unsigned int lengthUTF8Char(const XMLByte c){ return gUTFBytes[c]+1; } const char *fixUTF8(const char *src){ if(src && *src){ size_t length=strlen(src); int error_offset; if(pa_pcre_valid_utf((unsigned char *)src, length, &error_offset)){ char *result=(char *)pa_malloc_atomic(length+1); char *dst=result; do { if(error_offset){ memcpy(dst, src, error_offset); dst+=error_offset; src+=error_offset; length-=error_offset; } *dst++='?'; src++; length--; } while (length && pa_pcre_valid_utf((unsigned char *)src, length, &error_offset)); if(length){ strcpy(dst, src); } else { *dst='\0'; } return result; } } return src; } bool UTF8_string_iterator::has_next(){ fcharSize=readUTF8Char(fsrcPtr, fsrcEnd, ffirstByte, fUTF8Char); return fcharSize!=0; }