Annotation of parser3/src/main/pa_charset.C, revision 1.115
1.1 paf 1: /** @file
2: Parser: Charset connection implementation.
3:
1.114 moko 4: Copyright (c) 2001-2024 Art. Lebedev Studio (http://www.artlebedev.com)
1.112 moko 5: Authors: Konstantin Morshnev <moko@design.ru>, Alexandr Petrosian <paf@design.ru>
1.27 paf 6: */
1.1 paf 7:
8: #include "pa_charset.h"
1.35 paf 9: #include "pa_charsets.h"
1.1 paf 10:
1.96 moko 11: // we are using some pcre_internal.h stuff as well
12: #include "../lib/pcre/pa_pcre_internal.h"
13:
1.115 ! moko 14: volatile const char * IDENT_PA_CHARSET_C="$Id: pa_charset.C,v 1.114 2024/11/04 03:53:25 moko Exp $" IDENT_PA_CHARSET_H;
1.90 moko 15:
1.1 paf 16: #ifdef XML
1.115 ! moko 17: #include "libxml/xmlmemory.h"
1.8 paf 18: #include "libxml/encoding.h"
1.1 paf 19: #endif
20:
1.46 paf 21: //#define PA_PATCHED_LIBXML_BACKWARD
1.67 misha 22:
23: // reduce memory usage by pre-calculation utf-8 string length
1.60 misha 24: #define PRECALCULATE_DEST_LENGTH
1.46 paf 25:
1.38 paf 26: // globals
27:
28: Charset::UTF8CaseTable::Rec UTF8CaseToUpperRecords[]={
29: #include "utf8-to-upper.inc"
30: };
31: Charset::UTF8CaseTable UTF8CaseToUpper={
32: sizeof(UTF8CaseToUpperRecords)/sizeof(Charset::UTF8CaseTable::Rec),
33: UTF8CaseToUpperRecords};
34:
35: Charset::UTF8CaseTable::Rec UTF8CaseToLowerRecords[]={
36: #include "utf8-to-lower.inc"
37: };
38: Charset::UTF8CaseTable UTF8CaseToLower={
39: sizeof(UTF8CaseToLowerRecords)/sizeof(Charset::UTF8CaseTable::Rec),
40: UTF8CaseToLowerRecords};
41:
1.1 paf 42: // helpers
43:
44: inline void prepare_case_tables(unsigned char *tables) {
45: unsigned char *lcc_table=tables+lcc_offset;
46: unsigned char *fcc_table=tables+fcc_offset;
47: for(int i=0; i<0x100; i++)
1.53 paf 48: lcc_table[i]=fcc_table[i]=(unsigned char)i;
1.1 paf 49: }
1.99 moko 50: inline void cstr2ctypes(unsigned char *tables, const unsigned char *cstr, unsigned char bit) {
1.1 paf 51: unsigned char *ctypes_table=tables+ctypes_offset;
52: ctypes_table[0]=bit;
53: for(; *cstr; cstr++) {
54: unsigned char c=*cstr;
55: ctypes_table[c]|=bit;
56: }
57: }
1.35 paf 58: inline unsigned int to_wchar_code(const char* cstr) {
1.1 paf 59: if(!cstr || !*cstr)
60: return 0;
61: if(cstr[1]==0)
1.4 paf 62: return(unsigned int)(unsigned char)cstr[0];
1.1 paf 63:
1.91 moko 64: return pa_atoui(cstr,0);
1.1 paf 65: }
/// A cell is "true" when it is present and non-empty.
inline bool to_bool(const char* cstr) {
	if(!cstr)
		return false;
	return cstr[0]!='\0';
}
1.99 moko 69: static void element2ctypes(unsigned char c, bool belongs, unsigned char *tables, unsigned char bit, int group_offset=-1) {
1.1 paf 70: if(!belongs)
71: return;
72:
73: unsigned char *ctypes_table=tables+ctypes_offset;
74:
75: ctypes_table[c]|=bit;
76: if(group_offset>=0)
1.4 paf 77: tables[cbits_offset+group_offset+c/8] |= 1<<(c%8);
1.1 paf 78: }
1.99 moko 79: static void element2case(unsigned char from, unsigned char to, unsigned char *tables) {
1.1 paf 80: if(!to)
81: return;
82:
83: unsigned char *lcc_table=tables+lcc_offset;
84: unsigned char *fcc_table=tables+fcc_offset;
85: lcc_table[from]=to;
86: fcc_table[from]=to; fcc_table[to]=from;
87: }
88:
1.95 moko 89: inline XMLByte *append_hex_8(XMLByte *dest, unsigned char c, const char* prefix=0) {
1.93 moko 90: if(prefix) {
1.95 moko 91: strcpy((char *)dest, prefix);
1.93 moko 92: dest+=strlen(prefix);
93: }
94: *dest++=hex_digits[c >> 4];
95: *dest++=hex_digits[c & 0x0F];
1.95 moko 96: return dest;
1.93 moko 97: }
98:
1.95 moko 99: inline XMLByte *append_hex_16(XMLByte *dest, unsigned int c, const char* prefix=0) {
1.93 moko 100: if(prefix) {
1.95 moko 101: strcpy((char *)dest, prefix);
1.93 moko 102: dest+=strlen(prefix);
103: }
104: *dest++=hex_digits[(c >> 12) & 0x0F];
105: *dest++=hex_digits[(c >> 8) & 0x0F];
106: *dest++=hex_digits[(c >> 4) & 0x0F];
107: *dest++=hex_digits[(c) & 0x0F];
1.95 moko 108: return dest;
1.93 moko 109: }
110:
1.1 paf 111: // methods
112:
1.103 moko 113: Charset::Charset(Request_charsets* acharsets, const String::Body ANAME, const String* afile_spec):
1.35 paf 114: FNAME(ANAME),
115: FNAME_CSTR(ANAME.cstrm()) {
1.7 paf 116:
1.35 paf 117: if(afile_spec) {
1.1 paf 118: fisUTF8=false;
1.103 moko 119: load_definition(*acharsets, *afile_spec);
1.1 paf 120: #ifdef XML
1.35 paf 121: addEncoding(FNAME_CSTR);
1.1 paf 122: #endif
123: } else {
124: fisUTF8=true;
1.4 paf 125: // grab default onces [for UTF-8 so to be able to make a-z =>A-Z
1.96 moko 126: memcpy(pcre_tables, pa_pcre_default_tables, sizeof(pcre_tables));
1.1 paf 127: }
128:
129: #ifdef XML
1.35 paf 130: initTranscoder(FNAME, FNAME_CSTR);
1.1 paf 131: #endif
132: }
133:
1.104 moko 134: void Charset::load_definition(Request_charsets& acharsets, const String& afile_spec) {
1.1 paf 135: // pcre_tables
136: // lowcase, flipcase, bits digit+word+whitespace, masks
137:
138: // must not move this inside of prepare_case_tables
139: // don't know the size there
140: memset(pcre_tables, 0, sizeof(pcre_tables));
141: prepare_case_tables(pcre_tables);
1.4 paf 142: cstr2ctypes(pcre_tables,(const unsigned char *)"*+?{^.$|()[", ctype_meta);
1.1 paf 143:
144: // charset
1.35 paf 145: memset(&tables, 0, sizeof(tables));
1.1 paf 146:
147: // loading text
1.104 moko 148: char *data=file_read_text(acharsets, afile_spec);
1.1 paf 149:
150: // ignore header
151: getrow(&data);
152:
153: // parse cells
154: char *row;
1.42 paf 155: while((row=getrow(&data))) {
1.1 paf 156: // remove empty&comment lines
157: if(!*row || *row=='#')
158: continue;
159:
160: // char white-space digit hex-digit letter word lowercase unicode1 unicode2
1.53 paf 161: unsigned char c=0;
1.1 paf 162: char *cell;
1.42 paf 163: for(int column=0; (cell=lsplit(&row, '\t')); column++) {
1.1 paf 164: switch(column) {
1.53 paf 165: case 0: c=(unsigned char)to_wchar_code(cell); break;
1.1 paf 166: // pcre_tables
167: case 1: element2ctypes(c, to_bool(cell), pcre_tables, ctype_space, cbit_space); break;
168: case 2: element2ctypes(c, to_bool(cell), pcre_tables, ctype_digit, cbit_digit); break;
169: case 3: element2ctypes(c, to_bool(cell), pcre_tables, ctype_xdigit); break;
170: case 4: element2ctypes(c, to_bool(cell), pcre_tables, ctype_letter); break;
171: case 5: element2ctypes(c, to_bool(cell), pcre_tables, ctype_word, cbit_word); break;
1.53 paf 172: case 6: element2case(c, (unsigned char)to_wchar_code(cell), pcre_tables); break;
1.1 paf 173: case 7:
174: case 8:
175: // charset
1.10 paf 176: if(tables.toTableSize>MAX_CHARSET_UNI_CODES)
1.113 moko 177: throw Exception(PARSER_RUNTIME, &afile_spec, "charset must contain not more than %d unicode values", MAX_CHARSET_UNI_CODES);
1.1 paf 178:
179: XMLCh unicode=(XMLCh)to_wchar_code(cell);
180: if(!unicode && column==7/*unicode1 column*/)
181: unicode=(XMLCh)c;
182: if(unicode) {
1.10 paf 183: if(!tables.fromTable[c])
184: tables.fromTable[c]=unicode;
185: tables.toTable[tables.toTableSize].intCh=unicode;
186: tables.toTable[tables.toTableSize].extCh=(XMLByte)c;
187: tables.toTableSize++;
1.1 paf 188: }
189: break;
190: }
191: }
192: };
193:
1.87 moko 194: // parser charset tables declare only white-space before 0x20, thus adding the missing chars
195: for(uint i=0; i<0x20; i++)
196: if(!tables.fromTable[i]){
197: tables.fromTable[i]=i;
198: tables.toTable[tables.toTableSize].intCh=i;
199: tables.toTable[tables.toTableSize].extCh=(XMLByte)i;
200: tables.toTableSize++;
201: }
202:
1.1 paf 203: // sort by the Unicode code point
204: sort_ToTable();
205: }
206:
207: static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
208: return
1.38 paf 209: static_cast<const Charset::Tables::Rec *>(a)->intCh-
210: static_cast<const Charset::Tables::Rec *>(b)->intCh;
1.1 paf 211: }
212:
213: void Charset::sort_ToTable() {
1.92 moko 214: qsort(tables.toTable, tables.toTableSize, sizeof(*tables.toTable), sort_cmp_Trans_rec_intCh);
1.1 paf 215: }
216:
1.60 misha 217: // @todo: precache to speed up searching
1.99 moko 218: static XMLByte xlatOneTo(const XMLCh toXlat, const Charset::Tables& tables, XMLByte not_found) {
1.80 misha 219: int lo = 0;
220: int hi = tables.toTableSize - 1;
1.39 paf 221: while(lo<=hi) {
1.35 paf 222: // Calc the mid point of the low and high offset.
1.39 paf 223: const unsigned int i = (lo + hi) / 2;
224:
225: XMLCh cur=tables.toTable[i].intCh;
226: if(toXlat==cur)
227: return tables.toTable[i].extCh;
228: if(toXlat>cur)
229: lo = i+1;
1.1 paf 230: else
1.39 paf 231: hi = i-1;
232: }
1.35 paf 233:
234: return not_found;
1.1 paf 235: }
236:
1.99 moko 237: String::C Charset::transcode(const String::C src, const Charset& source_charset, const Charset& dest_charset) {
1.35 paf 238: if(!src.length)
239: return String::C("", 0);
1.4 paf 240:
1.1 paf 241: switch((source_charset.isUTF8()?0x10:0x00)|(dest_charset.isUTF8()?0x01:0x00)) {
242: default: // 0x00
1.35 paf 243: return source_charset.transcodeToCharset(src, dest_charset);
1.1 paf 244: case 0x01:
1.35 paf 245: return source_charset.transcodeToUTF8(src);
1.1 paf 246: case 0x10:
1.35 paf 247: return dest_charset.transcodeFromUTF8(src);
1.1 paf 248: case 0x11:
1.35 paf 249: return src;
1.1 paf 250: }
251: }
252:
253: // ---------------------------------------------------------------------------
254: // Local static data
255: //
256: // gUTFBytes
257: // A list of counts of trailing bytes for each initial byte in the input.
258: //
259: // gUTFOffsets
260: // A list of values to offset each result char type, according to how
261: // many source bytes went into making it.
262: //
263: // gFirstByteMark
264: // A list of values to mask onto the first byte of an encoded sequence,
265: // indexed by the number of bytes used to create the sequence.
266: // ---------------------------------------------------------------------------
267: static const XMLByte gUTFBytes[0x100] = {
268: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
269: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
270: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
271: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
272: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
273: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
274: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
275: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
276: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
277: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
278: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
279: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
280: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
281: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
282: , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
283: , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
284: };
285:
286: static const uint gUTFOffsets[6] = {
1.80 misha 287: 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
1.1 paf 288: };
289:
290: static const XMLByte gFirstByteMark[7] = {
1.80 misha 291: 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
1.1 paf 292: };
293:
1.99 moko 294: static int transcodeToUTF8(const XMLByte* srcData, int& srcLen, XMLByte *toFill, int& toFillLen, const Charset::Tables& tables) {
1.11 paf 295: const XMLByte* srcPtr=srcData;
296: const XMLByte* srcEnd=srcData+srcLen;
297: XMLByte* outPtr=toFill;
298: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 299:
1.35 paf 300: while(srcPtr<srcEnd) {
301: uint curVal = tables.fromTable[*srcPtr];
1.1 paf 302: if(!curVal) {
1.35 paf 303: // use the replacement character
304: *outPtr++= '?';
305: srcPtr++;
306: continue;
307: }
1.1 paf 308:
1.35 paf 309: // Figure out how many bytes we need
310: unsigned int encodedBytes;
311: if(curVal<0x80)
312: encodedBytes = 1;
313: else if(curVal<0x800)
314: encodedBytes = 2;
315: else if(curVal<0x10000)
316: encodedBytes = 3;
317: else if(curVal<0x200000)
318: encodedBytes = 4;
319: else if(curVal<0x4000000)
320: encodedBytes = 5;
321: else if(curVal<= 0x7FFFFFFF)
322: encodedBytes = 6;
323: else {
324: // use the replacement character
325: *outPtr++= '?';
326: srcPtr++;
327: continue;
328: }
1.11 paf 329:
1.35 paf 330: // If we cannot fully get this char into the output buffer
331: if (outPtr + encodedBytes > outEnd)
332: break;
333:
334: // We can do it, so update the source index
335: srcPtr++;
336:
337: // And spit out the bytes. We spit them out in reverse order
338: // here, so bump up the output pointer and work down as we go.
339: outPtr+= encodedBytes;
340: switch(encodedBytes) {
1.60 misha 341: case 6: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
342: curVal>>= 6;
343: case 5: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
344: curVal>>= 6;
345: case 4: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
346: curVal>>= 6;
347: case 3: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
348: curVal>>= 6;
349: case 2: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
350: curVal>>= 6;
351: case 1: *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]);
1.35 paf 352: }
353:
354: // Add the encoded bytes back in again to indicate we've eaten them
355: outPtr+= encodedBytes;
356: }
357:
358: // Update the bytes eaten
359: srcLen = srcPtr - srcData;
360:
361: // Return the characters read
362: toFillLen = outPtr - toFill;
363:
1.29 paf 364: //return srcPtr==srcEnd?(int)toFillLen:-1;
365: /*
366: xmlCharEncodingInputFunc
367: Returns :
368: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
369: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
370: of ocetes consumed.
371: */
372: return 0;
1.1 paf 373: }
1.26 paf 374: /// @todo digital entites only when xml/html output [at output in html/xml mode, in html part of a letter]
1.99 moko 375: static int transcodeFromUTF8(const XMLByte* srcData, int& srcLen, XMLByte* toFill, int& toFillLen, const Charset::Tables& tables) {
1.11 paf 376: const XMLByte* srcPtr=srcData;
377: const XMLByte* srcEnd=srcData+srcLen;
378: XMLByte* outPtr=toFill;
379: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 380:
1.35 paf 381: // We now loop until we either run out of input data, or room to store
382: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
383: // Get the next leading byte out
384: const XMLByte firstByte =* srcPtr;
385:
386: // Special-case ASCII, which is a leading byte value of<= 127
1.60 misha 387: if(firstByte<=127) {
1.35 paf 388: *outPtr++= firstByte;
389: srcPtr++;
390: continue;
391: }
392:
393: // See how many trailing src bytes this sequence is going to require
394: const unsigned int trailingBytes = gUTFBytes[firstByte];
395:
396: // If there are not enough source bytes to do this one, then we
397: // are done. Note that we done>= here because we are implicitly
398: // counting the 1 byte we get no matter what.
399: if(srcPtr+trailingBytes>= srcEnd)
400: break;
401:
402: // Looks ok, so lets build up the value
403: uint tmpVal=0;
404: switch(trailingBytes) {
405: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
406: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
407: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
408: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
409: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
410: case 0: tmpVal+=*srcPtr++;
411: break;
412:
413: default:
1.100 moko 414: throw Exception(0, 0, "transcodeFromUTF8 error: wrong trailingBytes value(%d)", trailingBytes); // never
1.35 paf 415: }
416: tmpVal-=gUTFOffsets[trailingBytes];
417:
418: // If it will fit into a single char, then put it in. Otherwise
419: // fail [*encode it as a surrogate pair. If its not valid, use the
420: // replacement char.*]
421: if(!(tmpVal & 0xFFFF0000)) {
1.25 paf 422: if(XMLByte xlat=xlatOneTo(tmpVal, tables, 0))
423: *outPtr++=xlat;
1.49 paf 424: else {
1.50 paf 425: outPtr+=sprintf((char *)outPtr, "&#%u;", tmpVal); // &#decimal;
1.49 paf 426: }
427: } else {
428: const XMLByte* recoverPtr=srcPtr-trailingBytes-1;
429: for(uint i=0; i<=trailingBytes; i++)
430: outPtr+=sprintf((char*)outPtr, "%%%02X", *recoverPtr++);
431: }
1.1 paf 432: }
1.35 paf 433:
434: // Update the bytes eaten
435: srcLen = srcPtr - srcData;
436:
437: // Return the characters read
438: toFillLen = outPtr - toFill;
1.11 paf 439:
1.29 paf 440: //return srcPtr==srcEnd?(int)toFillLen:-1;
441: /*
442: xmlCharEncodingOutputFunc
443: Returns :
444: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
445: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
446: of ocetes consumed.
447: */
448: return 0;
1.10 paf 449: }
450:
1.85 misha 451: static bool need_escape(XMLByte c){
1.60 misha 452: return
1.66 misha 453: !(
454: (c<=127)
455: && (
1.89 misha 456: pa_isalnum((unsigned char)c)
1.66 misha 457: || strchr("*@-_+./", c)!=0
458: )
459: );
1.60 misha 460: }
461:
1.70 misha 462: // read one UTF8 char and return length of this char (in bytes)
463: static unsigned int readUTF8Char(const XMLByte*& srcPtr, const XMLByte* srcEnd, XMLByte& firstByte, XMLCh& UTF8Char){
1.60 misha 464: if(!srcPtr || !*srcPtr || srcPtr>=srcEnd)
465: return 0;
466:
467: firstByte=*srcPtr;
468:
469: if(firstByte<=127){
470: UTF8Char=firstByte;
471: srcPtr++;
472: return 1;
473: }
474:
475: unsigned int trailingBytes=gUTFBytes[firstByte];
476:
477: if(srcPtr+trailingBytes>=srcEnd){
478: return 0; // not enough bytes in source string for reading
479: }
480:
481: uint tmpVal=0;
482: switch(trailingBytes){
483: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
484: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
485: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
486: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
487: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
488: case 0: tmpVal+=*srcPtr++;
489: }
490:
491: tmpVal-=gUTFOffsets[trailingBytes];
492: UTF8Char=tmpVal;
493:
494: return trailingBytes+1;
495: }
496:
1.70 misha 497: // skip UTF8 char and return length of this char (in bytes)
498: static unsigned int skipUTF8Char(const XMLByte*& srcPtr, const XMLByte* srcEnd){
1.62 misha 499: if(!srcPtr || !*srcPtr || srcPtr>=srcEnd)
500: return 0;
501:
1.63 misha 502: unsigned int trailingBytes=gUTFBytes[*srcPtr]+1;
503: srcPtr+=trailingBytes;
1.62 misha 504:
505: return trailingBytes;
1.61 misha 506: }
507:
1.85 misha 508: // read non-UTF8 char, and return number of bytes needed for storing this char in UTF8
1.61 misha 509: static unsigned int readChar(const XMLByte*& srcPtr, const XMLByte* srcEnd, XMLByte& firstByte, XMLCh& UTF8Char, const Charset::Tables& tables){
1.60 misha 510: if(!srcPtr || !*srcPtr || srcPtr>=srcEnd)
511: return 0;
512:
513: firstByte=*srcPtr++;
514: UTF8Char=tables.fromTable[firstByte];
515:
516: if(UTF8Char<0x80)
517: return 1;
518: else if(UTF8Char<0x800)
519: return 2;
520: else if(UTF8Char<0x10000)
521: return 3;
522: else if(UTF8Char<0x200000)
523: return 4;
524: else if(UTF8Char<0x4000000)
525: return 5;
526: else if(UTF8Char<= 0x7FFFFFFF)
527: return 6;
528:
529: // will use the replacement character '?'
530: firstByte=0;
531: return 1;
532: }
533:
1.85 misha 534: size_t Charset::calc_escaped_length_UTF8(XMLByte* src, size_t src_length){
535: size_t dest_length=0;
536:
537: for(UTF8_string_iterator i(src, src_length); i.has_next(); ){
538: if(i.getCharSize()==1)
539: dest_length+=!need_escape(i.getFirstByte())?1/*as-is*/:3/*%XX*/;
540: else
541: dest_length+=6; // %uXXXX
1.60 misha 542: }
543:
1.85 misha 544: return dest_length;
1.60 misha 545: }
546:
1.86 moko 547: size_t Charset::calc_escaped_length(const XMLByte* src, size_t src_length, const Charset::Tables& tables){
548: const XMLByte* src_end=src+src_length;
549: XMLByte first_byte;
550: XMLCh UTF8_char;
1.85 misha 551: size_t dest_length=0;
552:
1.86 moko 553: while(uint char_size=readChar(src, src_end, first_byte, UTF8_char, tables)){
1.85 misha 554: if(char_size==1)
555: dest_length+=(!first_byte/*replacement char '?'*/ || !need_escape(first_byte))?1:3/*'%XX'*/;
556: else
557: dest_length+=6; // %uXXXX
1.60 misha 558: }
559:
1.85 misha 560: return dest_length;
561: }
562:
563: size_t Charset::calc_escaped_length(const String::C src, const Charset& source_charset){
1.86 moko 564: if(!src.length)
1.85 misha 565: return 0;
566:
567: #ifdef PRECALCULATE_DEST_LENGTH
568: if(source_charset.isUTF8())
1.86 moko 569: return calc_escaped_length_UTF8((XMLByte *)src.str, src.length);
1.85 misha 570: else
1.86 moko 571: return calc_escaped_length((XMLByte *)src.str, src.length, source_charset.tables);
1.85 misha 572: #else
573: return src_length*6; // enough for %uXXXX but too memory-hungry
574: #endif
575: }
576:
// Writes one escaped character at dest_ptr and advances dest_ptr.
// Expands to a complete if/else statement: no semicolon is needed after the invocation.
// multi-byte: %uXXXX; single byte: %XX, the byte itself or '?' for the zero (replacement) byte
#define escape_char(dest_ptr, char_size, first_byte, UTF8_char) \
	if(char_size!=1) { \
		dest_ptr=append_hex_16(dest_ptr, UTF8_char, "%u"); /* %uXXXX */ \
	} else if(!first_byte) { \
		*dest_ptr++='?'; /* replacement char '?' */ \
	} else if(need_escape(first_byte)) { \
		dest_ptr=append_hex_8(dest_ptr, first_byte, "%"); /* %XX */ \
	} else { \
		*dest_ptr++=first_byte; /* as is */ \
	}
1.85 misha 588:
589:
590: size_t Charset::escape_UTF8(const XMLByte* src, size_t src_length, XMLByte* dest) {
591: XMLByte* dest_ptr=dest;
592:
593: // loop until we either run out of input data
594: for(UTF8_string_iterator i((XMLByte *)src, src_length); i.has_next(); )
595: escape_char(dest_ptr, i.getCharSize(), i.getFirstByte(), i.next())
1.60 misha 596:
1.85 misha 597: return dest_ptr - dest;
1.60 misha 598: }
599:
1.85 misha 600: size_t Charset::escape(const XMLByte* src, size_t src_length, XMLByte* dest, const Charset::Tables& tables) {
601: const XMLByte* src_end=src+src_length;
602: XMLByte* dest_ptr=dest;
603:
604: XMLByte first_byte;
605: XMLCh UTF8_char;
606: uint char_size;
607:
1.86 moko 608: while(char_size=readChar(src, src_end, first_byte, UTF8_char, tables))
1.85 misha 609: escape_char(dest_ptr, char_size, first_byte, UTF8_char)
610:
611: return dest_ptr - dest;
612: }
1.60 misha 613:
614: String::C Charset::escape(const String::C src, const Charset& source_charset){
1.86 moko 615: if(!src.length)
1.60 misha 616: return String::C("", 0);
617:
1.85 misha 618: size_t dest_calculated_length=calc_escaped_length(src, source_charset);
619: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_calculated_length+1/*terminator*/];
620:
621: size_t dest_length;
622: if(source_charset.isUTF8())
1.86 moko 623: dest_length=escape_UTF8((XMLByte *)src.str, src.length, dest_body);
1.85 misha 624: else
1.86 moko 625: dest_length=escape((XMLByte *)src.str, src.length, dest_body, source_charset.tables);
1.85 misha 626:
627: if(dest_length>dest_calculated_length)
628: throw Exception(0, 0, "Charset::escape buffer overflow");
629:
630: dest_body[dest_length]=0; // terminator
631: return String::C((char*)dest_body, dest_length);
632: }
633:
634: String::Body Charset::escape(const String::Body src, const Charset& source_charset) {
1.86 moko 635: String::C dest=Charset::escape(String::C(src.cstr(), src.length()), source_charset);
1.85 misha 636: return String::Body(dest.length ? dest.str:0);
637: }
638:
639: String& Charset::escape(const String& src, const Charset& source_charset) {
640: if(src.is_empty())
641: return *new String();
642:
643: return *new String(escape((String::Body)src, source_charset), String::L_CLEAN);
644: }
645:
/// Tells whether @a c has a two-character JSON escape (\n \" \\ \/ \t \r \b \f).
/// The zero byte is excluded explicitly: strchr() would otherwise match
/// the terminating zero of the character list.
inline bool need_json_escape(unsigned char c){
	return c!=0 && strchr("\n\"\\/\t\r\b\f", c)!=0;
}
649:
650: size_t Charset::calc_JSON_escaped_length_UTF8(XMLByte* src, size_t src_length){
651: size_t dest_length=0;
652:
653: for(UTF8_string_iterator i(src, src_length); i.has_next(); ){
1.93 moko 654: if(i.getCharSize()==1){
655: XMLByte first_byte=i.getFirstByte();
656: dest_length+=need_json_escape(first_byte) ? 2 : (first_byte < 0x20 && first_byte /* 0 replacement char is '?' */) ? 6 : 1;
657: } else
1.85 misha 658: dest_length+=6; // \uXXXX
659: }
660:
661: return dest_length;
662: }
663:
1.86 moko 664: size_t Charset::calc_JSON_escaped_length(const XMLByte* src, size_t src_length, const Charset::Tables& tables){
665: const XMLByte* src_end=src+src_length;
1.85 misha 666: XMLByte first_byte;
667: XMLCh UTF8_char;
1.60 misha 668: size_t dest_length=0;
669:
1.86 moko 670: while(uint char_size=readChar(src, src_end, first_byte, UTF8_char, tables)){
1.85 misha 671: if(char_size==1)
1.93 moko 672: dest_length+=need_json_escape(first_byte) ? 2 : (first_byte < 0x20 && first_byte /* 0 replacement char is '?' */) ? 6 : 1;
1.85 misha 673: else
674: dest_length+=6; // \uXXXX
1.60 misha 675: }
1.85 misha 676:
677: return dest_length;
678: }
679:
680: size_t Charset::calc_JSON_escaped_length(const String::C src, const Charset& source_charset){
1.86 moko 681: if(!src.length)
1.85 misha 682: return 0;
683:
684: #ifdef PRECALCULATE_DEST_LENGTH
685: if(source_charset.isUTF8())
1.86 moko 686: return calc_JSON_escaped_length_UTF8((XMLByte *)src.str, src.length);
1.85 misha 687: else
1.86 moko 688: return calc_JSON_escaped_length((XMLByte *)src.str, src.length, source_charset.tables);
1.60 misha 689: #else
1.85 misha 690: return src_length*6; // enough for \uXXXX but too memory-hungry
1.60 misha 691: #endif
1.85 misha 692: }
693:
// Writes one JSON-escaped character at dest_ptr and advances dest_ptr.
// Expands to a complete if/else statement: no semicolon is needed after the invocation.
// multi-byte characters and bytes below 0x20 without a short escape are written
// as backslash-u plus 4 hex digits, the zero (replacement) byte as '?'
#define escape_char_JSON(dest_ptr, char_size, first_byte, UTF8_char) \
	if(char_size!=1) { \
		dest_ptr=append_hex_16(dest_ptr, UTF8_char, "\\u"); \
	} else { \
		switch(first_byte){ \
			case '\n': *dest_ptr++='\\'; *dest_ptr++='n'; break; \
			case '"' : *dest_ptr++='\\'; *dest_ptr++='"'; break; \
			case '\\': *dest_ptr++='\\'; *dest_ptr++='\\'; break; \
			case '/' : *dest_ptr++='\\'; *dest_ptr++='/'; break; \
			case '\t': *dest_ptr++='\\'; *dest_ptr++='t'; break; \
			case '\r': *dest_ptr++='\\'; *dest_ptr++='r'; break; \
			case '\b': *dest_ptr++='\\'; *dest_ptr++='b'; break; \
			case '\f': *dest_ptr++='\\'; *dest_ptr++='f'; break; \
			case 0   : *dest_ptr++='?'; break; /* replacement char */ \
			default  : \
				if(first_byte < 0x20) \
					dest_ptr=append_hex_16(dest_ptr, UTF8_char, "\\u"); \
				else \
					*dest_ptr++=first_byte; \
		} \
	}
1.85 misha 711:
712:
713: size_t Charset::escape_JSON_UTF8(const XMLByte* src, size_t src_length, XMLByte* dest) {
714: XMLByte* dest_ptr=dest;
715:
716: // loop until we either run out of input data
717: for(UTF8_string_iterator i((XMLByte *)src, src_length); i.has_next(); )
718: escape_char_JSON(dest_ptr, i.getCharSize(), i.getFirstByte(), i.next())
719:
720: return dest_ptr - dest;
721: }
722:
723: size_t Charset::escape_JSON(const XMLByte* src, size_t src_length, XMLByte* dest, const Charset::Tables& tables) {
724: const XMLByte* src_end=src+src_length;
725: XMLByte* dest_ptr=dest;
726:
727: XMLByte first_byte;
728: XMLCh UTF8_char;
729: uint char_size;
730:
1.86 moko 731: while(char_size=readChar(src, src_end, first_byte, UTF8_char, tables))
1.85 misha 732: escape_char_JSON(dest_ptr, char_size, first_byte, UTF8_char)
733:
734: return dest_ptr - dest;
735: }
1.60 misha 736:
1.85 misha 737: String::C Charset::escape_JSON(const String::C src, const Charset& source_charset){
1.86 moko 738: if(!src.length)
1.85 misha 739: return String::C("", 0);
1.60 misha 740:
1.85 misha 741: size_t dest_calculated_length=calc_JSON_escaped_length(src, source_charset);
742: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_calculated_length+1/*terminator*/];
743:
744: size_t dest_length;
745: if(source_charset.isUTF8())
1.86 moko 746: dest_length=escape_JSON_UTF8((XMLByte *)src.str, src.length, dest_body);
1.85 misha 747: else
1.86 moko 748: dest_length=escape_JSON((XMLByte *)src.str, src.length, dest_body, source_charset.tables);
1.60 misha 749:
1.85 misha 750: if(dest_length>dest_calculated_length)
751: throw Exception(0, 0, "Charset::escape_JSON buffer overflow");
1.60 misha 752:
753: dest_body[dest_length]=0; // terminator
754: return String::C((char*)dest_body, dest_length);
755: }
1.85 misha 756:
757: String::Body Charset::escape_JSON(const String::Body src, const Charset& source_charset) {
1.86 moko 758: String::C dest=Charset::escape_JSON(String::C(src.cstr(), src.length()), source_charset);
1.77 misha 759: return String::Body(dest.length ? dest.str:0);
1.64 misha 760: }
761:
1.85 misha 762: String& Charset::escape_JSON(const String& src, const Charset& source_charset) {
1.72 misha 763: if(src.is_empty())
1.73 misha 764: return *new String();
1.64 misha 765:
1.85 misha 766: return *new String(escape_JSON((String::Body)src, source_charset), String::L_CLEAN);
1.64 misha 767: }
1.60 misha 768:
1.35 paf 769: const String::C Charset::transcodeToUTF8(const String::C src) const {
1.71 misha 770: int src_length=src.length;
1.60 misha 771:
772: #ifdef PRECALCULATE_DEST_LENGTH
1.71 misha 773: int dest_length=0;
1.60 misha 774: const XMLByte* srcPtr=(XMLByte*)src.str;
775: const XMLByte* srcEnd=srcPtr+src_length;
1.69 misha 776: XMLByte firstByte;
777: XMLCh UTF8Char;
778: while(uint charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char, tables))
1.60 misha 779: dest_length+=charSize;
780: #else
1.85 misha 781: int dest_length=src_length*6; // so that surly enough (max utf8 seq len=6) but too memory-hungry
1.60 misha 782: #endif
783:
1.35 paf 784: #ifndef NDEBUG
1.71 misha 785: int saved_dest_length=dest_length;
1.35 paf 786: #endif
787: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
1.11 paf 788:
789: if(::transcodeToUTF8(
1.35 paf 790: (XMLByte *)src.str, src_length,
791: dest_body, dest_length,
1.11 paf 792: tables)<0)
1.100 moko 793: throw Exception(0, 0, "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 794:
1.60 misha 795: assert(dest_length<=saved_dest_length);
796: dest_body[dest_length]=0; // terminator
1.35 paf 797: return String::C((char*)dest_body, dest_length);
1.10 paf 798: }
1.38 paf 799:
800: static XMLCh change_case_UTF8(const XMLCh src, const Charset::UTF8CaseTable& table) {
1.80 misha 801: int lo = 0;
802: int hi = table.size - 1;
1.39 paf 803: while(lo<=hi) {
1.38 paf 804: // Calc the mid point of the low and high offset.
1.39 paf 805: const unsigned int i = (lo + hi) / 2;
806:
807: XMLCh cur=table.records[i].from;
808: if(src==cur)
809: return table.records[i].to;
810: if(src>cur)
811: lo = i+1;
1.38 paf 812: else
1.39 paf 813: hi = i-1;
814: }
815:
816: // not found
1.38 paf 817: return src;
818: }
819:
1.58 misha 820: static void store_UTF8(XMLCh src, XMLByte*& outPtr){
1.38 paf 821: if(!src) {
822: // use the replacement character
823: *outPtr++= '?';
824: return;
825: }
826:
827: // Figure out how many bytes we need
828: unsigned int encodedBytes;
829: if(src<0x80)
830: encodedBytes = 1;
831: else if(src<0x800)
832: encodedBytes = 2;
833: else if(src<0x10000)
834: encodedBytes = 3;
835: else if(src<0x200000)
836: encodedBytes = 4;
837: else if(src<0x4000000)
838: encodedBytes = 5;
839: else if(src<= 0x7FFFFFFF)
840: encodedBytes = 6;
841: else {
842: // use the replacement character
843: *outPtr++= '?';
844: return;
845: }
846:
847: // And spit out the bytes. We spit them out in reverse order
848: // here, so bump up the output pointer and work down as we go.
849: outPtr+= encodedBytes;
850: switch(encodedBytes) {
851: case 6: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
852: src>>= 6;
853: case 5: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
854: src>>= 6;
855: case 4: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
856: src>>= 6;
857: case 3: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
858: src>>= 6;
859: case 2: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
860: src>>= 6;
861: case 1: *--outPtr = XMLByte(src | gFirstByteMark[encodedBytes]);
862: }
863:
864: // Add the encoded bytes back in again to indicate we've eaten them
865: outPtr+= encodedBytes;
866: }
867:
1.99 moko 868: static void change_case_UTF8(XMLCh src, XMLByte*& outPtr, const Charset::UTF8CaseTable& table) {
1.38 paf 869: store_UTF8(change_case_UTF8(src, table), outPtr);
1.98 moko 870: }
871:
1.99 moko 872: void change_case_UTF8(const XMLByte* srcData, size_t srcLen, XMLByte* toFill, size_t toFillLen, const Charset::UTF8CaseTable& table) {
1.38 paf 873: const XMLByte* srcPtr=srcData;
1.44 paf 874: const XMLByte* srcEnd=srcData+srcLen;
1.38 paf 875: XMLByte* outPtr=toFill;
1.44 paf 876: XMLByte* outEnd=toFill+toFillLen;
877:
878: // We now loop until we either run out of input data, or room to store
879: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
880: // Get the next leading byte out
881: const XMLByte firstByte =* srcPtr;
1.38 paf 882:
1.60 misha 883: if(firstByte<=127) {
1.38 paf 884: change_case_UTF8(firstByte, outPtr, table);
885: srcPtr++;
886: continue;
887: }
888:
889: // See how many trailing src bytes this sequence is going to require
890: const unsigned int trailingBytes = gUTFBytes[firstByte];
891:
892: // Looks ok, so lets build up the value
893: uint tmpVal=0;
894: switch(trailingBytes) {
895: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
896: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
897: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
898: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
899: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
900: case 0: tmpVal+=*srcPtr++;
901: break;
902:
903: default:
1.100 moko 904: throw Exception(0, 0, "change_case_UTF8 error: wrong trailingBytes value(%d)", trailingBytes);
1.38 paf 905: }
906: tmpVal-=gUTFOffsets[trailingBytes];
907:
908: // If it will fit into a single char, then put it in. Otherwise
909: // fail [*encode it as a surrogate pair. If its not valid, use the
910: // replacement char.*]
911: if(!(tmpVal & 0xFFFF0000))
912: change_case_UTF8(tmpVal, outPtr, table);
913: else
1.100 moko 914: throw Exception(0, 0, "change_case_UTF8 error: too big tmpVal(0x%08X)", tmpVal);
1.38 paf 915: }
916:
917: if(srcPtr!=outPtr)
1.100 moko 918: throw Exception(0, 0, "change_case_UTF8 error: end pointers do not match");
1.38 paf 919: }
920:
1.60 misha 921: static size_t getDecNumLength(XMLCh UTF8Char){
922: return
923: (UTF8Char < 100)
924: ?2
925: :(UTF8Char < 1000)
926: ?3
927: :(UTF8Char < 10000)
928: ?4
929: :5;
930: }
1.38 paf 931:
1.35 paf 932: const String::C Charset::transcodeFromUTF8(const String::C src) const {
1.82 misha 933: int src_length=src.length;
1.60 misha 934: #ifdef PRECALCULATE_DEST_LENGTH
1.71 misha 935: int dest_length=0;
1.82 misha 936: for(UTF8_string_iterator i((XMLByte *)src.str, src_length); i.has_next(); ){
1.88 misha 937: dest_length += ( i.next() & 0xFFFF0000 )
938: ? 3*i.getCharSize() // %XX for each byte
939: : ( xlatOneTo(i.next(), tables, 0) != 0 )
940: ? 1 // can convert it to a single char
941: : 3+getDecNumLength( i.next() ); // print char as &#XX;, &#XXX;, &#XXXX; or &#XXXXX;
1.60 misha 942: }
943: #else
944: // so that surly enough, "&#XXX;" has max ratio (huh? 8 bytes needed for '&#XXXXX;')
1.82 misha 945: int dest_length=src_length*6;
1.60 misha 946: #endif
947:
1.35 paf 948: #ifndef NDEBUG
1.71 misha 949: int saved_dest_length=dest_length;
1.35 paf 950: #endif
951: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
1.11 paf 952:
953: if(::transcodeFromUTF8(
1.82 misha 954: (XMLByte *)src.str, src_length,
1.35 paf 955: dest_body, dest_length,
1.11 paf 956: tables)<0)
1.100 moko 957: throw Exception(0, 0, "Charset::transcodeFromUTF8 buffer overflow");
1.10 paf 958:
1.60 misha 959: assert(dest_length<=saved_dest_length);
960: dest_body[dest_length]=0; // terminator
1.35 paf 961: return String::C((char*)dest_body, dest_length);
1.1 paf 962: }
963:
964: /// transcode using both charsets
1.99 moko 965: const String::C Charset::transcodeToCharset(const String::C src, const Charset& dest_charset) const {
1.35 paf 966: if(&dest_charset==this)
967: return src;
968: else {
969: size_t dest_length=src.length;
970: XMLByte* dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
971:
972: XMLByte* output=dest_body;
973: const XMLByte* input=(XMLByte *)src.str;
974: while(XMLCh c=*input++) {
975: XMLCh curVal = tables.fromTable[c];
976: *output++=curVal?
977: xlatOneTo(curVal, dest_charset.tables, '?') // OK
978: :'?'; // use the replacement character
1.6 paf 979: }
1.1 paf 980:
1.35 paf 981: dest_body[dest_length]=0; // terminator
982: return String::C((char*)dest_body, dest_length);
1.6 paf 983: }
1.1 paf 984: }
985:
1.58 misha 986: void Charset::store_Char(XMLByte*& outPtr, XMLCh src, XMLByte not_found){
1.59 misha 987: if(isUTF8())
1.58 misha 988: store_UTF8(src, outPtr);
1.59 misha 989: else if(char ch=xlatOneTo(src, tables, not_found))
1.58 misha 990: *outPtr++=ch;
1.57 misha 991: }
992:
1.1 paf 993: #ifdef XML
1.10 paf 994:
// Per-slot state for charsets registered with libxml.
// Slot i of `tables` holds the conversion tables read by the generated
// xml256CharEncoding{Input,Output}Func<i> callbacks; slot i of
// `xml_encoding_handlers` is the handler object passed to
// xmlRegisterCharEncodingHandler. Both are filled by Charset::addEncoding.
1.35 paf 995: static const Charset::Tables* tables[MAX_CHARSETS];
1.111 moko 996: static xmlCharEncodingHandler xml_encoding_handlers[MAX_CHARSETS];
1.35 paf 997:
// declareXml256ioFuncs(i) generates a pair of static callbacks bound to slot i:
//   xml256CharEncodingInputFunc<i>  - calls transcodeToUTF8 with *tables[i]
//   xml256CharEncodingOutputFunc<i> - calls transcodeFromUTF8 with *tables[i]
// The callbacks receive no context argument of their own to identify the
// charset, which is why the slot index is baked into each function.
// With PA_PATCHED_LIBXML_BACKWARD the callbacks take one extra, ignored
// void* parameter; otherwise they take the four (out, outlen, in, inlen)
// parameters only.
1.46 paf 998: #ifdef PA_PATCHED_LIBXML_BACKWARD
999:
1000: #define declareXml256ioFuncs(i) \
1001: static int xml256CharEncodingInputFunc##i( \
1002: unsigned char *out, int *outlen, \
1003: const unsigned char *in, int *inlen, void*) { \
1004: return transcodeToUTF8( \
1.71 misha 1005: in, *inlen, \
1006: out, *outlen, \
1.46 paf 1007: *tables[i]); \
1008: } \
1009: static int xml256CharEncodingOutputFunc##i( \
1010: unsigned char *out, int *outlen, \
1011: const unsigned char *in, int *inlen, void*) { \
1012: return transcodeFromUTF8( \
1.71 misha 1013: in, *inlen, \
1014: out, *outlen, \
1.46 paf 1015: *tables[i]); \
1016: }
1017:
1018: #else
1019:
1.35 paf 1020: #define declareXml256ioFuncs(i) \
1021: static int xml256CharEncodingInputFunc##i( \
1022: unsigned char *out, int *outlen, \
1023: const unsigned char *in, int *inlen) { \
1024: return transcodeToUTF8( \
1.71 misha 1025: in, *inlen, \
1026: out, *outlen, \
1.35 paf 1027: *tables[i]); \
1028: } \
1029: static int xml256CharEncodingOutputFunc##i( \
1030: unsigned char *out, int *outlen, \
1031: const unsigned char *in, int *inlen) { \
1032: return transcodeFromUTF8( \
1.71 misha 1033: in, *inlen, \
1034: out, *outlen, \
1.35 paf 1035: *tables[i]); \
1036: }
1037:
1.46 paf 1038: #endif
1039:
1040:
// Instantiate the callback pairs for slots 0..9; inputFuncs/outputFuncs
// below list exactly these ten pairs.
1.35 paf 1041: declareXml256ioFuncs(0) declareXml256ioFuncs(1)
1042: declareXml256ioFuncs(2) declareXml256ioFuncs(3)
1043: declareXml256ioFuncs(4) declareXml256ioFuncs(5)
1044: declareXml256ioFuncs(6) declareXml256ioFuncs(7)
1045: declareXml256ioFuncs(8) declareXml256ioFuncs(9)
1046:
// Slot index -> generated "charset to UTF-8" callback.
// NOTE(review): ten initializers are listed; presumably MAX_CHARSETS is 10 -- confirm.
1047: static xmlCharEncodingInputFunc inputFuncs[MAX_CHARSETS]={
1048: xml256CharEncodingInputFunc0, xml256CharEncodingInputFunc1,
1049: xml256CharEncodingInputFunc2, xml256CharEncodingInputFunc3,
1050: xml256CharEncodingInputFunc4, xml256CharEncodingInputFunc5,
1051: xml256CharEncodingInputFunc6, xml256CharEncodingInputFunc7,
1052: xml256CharEncodingInputFunc8, xml256CharEncodingInputFunc9
1053: };
// Slot index -> generated "UTF-8 to charset" callback.
1054: static xmlCharEncodingOutputFunc outputFuncs[MAX_CHARSETS]={
1055: xml256CharEncodingOutputFunc0, xml256CharEncodingOutputFunc1,
1056: xml256CharEncodingOutputFunc2, xml256CharEncodingOutputFunc3,
1057: xml256CharEncodingOutputFunc4, xml256CharEncodingOutputFunc5,
1058: xml256CharEncodingOutputFunc6, xml256CharEncodingOutputFunc7,
1059: xml256CharEncodingOutputFunc8, xml256CharEncodingOutputFunc9
1060: };
// Number of slots already handed out by Charset::addEncoding.
1061: static size_t handlers_count=0;
1.10 paf 1062:
1063: void Charset::addEncoding(char *name_cstr) {
1.35 paf 1064: if(handlers_count==MAX_CHARSETS)
1.100 moko 1065: throw Exception(0, 0, "already allocated %d handlers, no space for new encoding '%s'", MAX_CHARSETS, name_cstr);
1.35 paf 1066:
1.111 moko 1067: xmlCharEncodingHandler* handler=&xml_encoding_handlers[handlers_count];
1.35 paf 1068: {
1069: handler->name=name_cstr;
1070: handler->input=inputFuncs[handlers_count];
1071: handler->output=outputFuncs[handlers_count];
1072: ::tables[handlers_count]=&tables;
1073: handlers_count++;
1074: }
1.10 paf 1075:
1076: xmlRegisterCharEncodingHandler(handler);
1.35 paf 1077:
1.10 paf 1078: }
1079:
1.37 paf 1080: void Charset::initTranscoder(const String::Body NAME, const char* name_cstr) {
1.15 paf 1081: ftranscoder=xmlFindCharEncodingHandler(name_cstr);
1.35 paf 1082: transcoder(NAME); // check right way
1.15 paf 1083: }
1084:
1.37 paf 1085: xmlCharEncodingHandler& Charset::transcoder(const String::Body NAME) {
1.15 paf 1086: if(!ftranscoder)
1.100 moko 1087: throw Exception(PARSER_RUNTIME, new String(NAME, String::L_TAINTED), "unsupported encoding");
1.35 paf 1088: return *ftranscoder;
1.10 paf 1089: }
1090:
1.54 paf 1091: String::C Charset::transcode_cstr(const xmlChar* s) {
1.13 paf 1092: if(!s)
1.35 paf 1093: return String::C("", 0);
1.8 paf 1094:
1.35 paf 1095: int inlen=strlen((const char*)s);
1.51 paf 1096: int outlen=inlen*6/*strlen("ÿ")*/; // max
1.35 paf 1097: #ifndef NDEBUG
1098: int saved_outlen=outlen;
1099: #endif
1100: char *out=new(PointerFreeGC) char[outlen+1];
1.8 paf 1101:
1.30 paf 1102: int error;
1.35 paf 1103: if(xmlCharEncodingOutputFunc output=transcoder(FNAME).output) {
1.30 paf 1104: error=output(
1.17 paf 1105: (unsigned char*)out, &outlen,
1.46 paf 1106: (const unsigned char*)s, &inlen
1107: #ifdef PA_PATCHED_LIBXML_BACKWARD
1108: ,0
1109: #endif
1110: );
1.30 paf 1111: } else {
1112: memcpy(out, s, outlen=inlen);
1113: error=0;
1114: }
1115: if(error<0)
1.100 moko 1116: throw Exception(0, 0, "transcode_cstr failed (%d)", error);
1.8 paf 1117:
1.35 paf 1118: assert(outlen<=saved_outlen); out[outlen]=0;
1119: return String::C(out, outlen);
1.14 paf 1120: }
1.54 paf 1121: const String& Charset::transcode(const xmlChar* s) {
1.35 paf 1122: String::C cstr=transcode_cstr(s);
1.75 misha 1123: return *new String(cstr.str, String::L_TAINTED);
1.1 paf 1124: }
1125:
1.8 paf 1126: /// @test less memory using -maybe- xmlParserInputBufferCreateMem
1.35 paf 1127: xmlChar* Charset::transcode_buf2xchar(const char* buf, size_t buf_size) {
1128: xmlChar* out;
1.30 paf 1129: int outlen;
1130: int error;
1.35 paf 1131: #ifndef NDEBUG
1132: int saved_outlen;
1133: #endif
1134: if(xmlCharEncodingInputFunc input=transcoder(FNAME).input) {
1.51 paf 1135: outlen=buf_size*6/*max UTF8 bytes per char*/;
1.35 paf 1136: #ifndef NDEBUG
1137: saved_outlen=outlen;
1138: #endif
1.47 paf 1139: out=(xmlChar*)xmlMalloc(outlen+1);
1.30 paf 1140: error=input(
1.17 paf 1141: out, &outlen,
1.46 paf 1142: (const unsigned char*)buf, (int*)&buf_size
1143: #ifdef PA_PATCHED_LIBXML_BACKWARD
1144: ,0
1145: #endif
1146: );
1.30 paf 1147: } else {
1148: outlen=buf_size;
1.35 paf 1149: #ifndef NDEBUG
1150: saved_outlen=outlen;
1151: #endif
1152: out=(xmlChar*)xmlMalloc(outlen+1);
1.30 paf 1153: memcpy(out, buf, outlen);
1154: error=0;
1155: }
1.17 paf 1156:
1.30 paf 1157: if(error<0)
1.100 moko 1158: throw Exception(0, 0, "transcode_buf failed (%d)", error);
1.8 paf 1159:
1.35 paf 1160: assert(outlen<=saved_outlen); out[outlen]=0;
1161: return out;
1.24 paf 1162: }
1.1 paf 1163:
1.79 misha 1164: xmlChar* Charset::transcode(const String& s) {
1165: String::Body sbody=s.cstr_to_string_body_untaint(String::L_AS_IS);
1166: return transcode_buf2xchar(sbody.cstr(), sbody.length());
1.1 paf 1167: }
1.35 paf 1168:
1.79 misha 1169: xmlChar* Charset::transcode(const String::Body s) {
1170: return transcode_buf2xchar(s.cstr(), s.length());
1.35 paf 1171: }
1.36 paf 1172: #endif
1.34 paf 1173:
1.99 moko 1174: String::Body Charset::transcode(const String::Body src, const Charset& source_transcoder, const Charset& dest_transcoder) {
1.101 moko 1175: return String::Body(Charset::transcode(String::C(src.cstr(), src.length()), source_transcoder, dest_transcoder));
1.35 paf 1176: }
1177:
1.99 moko 1178: String& Charset::transcode(const String& src, const Charset& source_transcoder, const Charset& dest_transcoder) {
1.72 misha 1179: if(src.is_empty())
1.73 misha 1180: return *new String();
1.34 paf 1181:
1.37 paf 1182: return *new String(transcode((String::Body)src, source_transcoder, dest_transcoder), String::L_CLEAN);
1.34 paf 1183: }
1184:
1.99 moko 1185: void Charset::transcode(ArrayString& src, const Charset& source_transcoder, const Charset& dest_transcoder) {
1.35 paf 1186: for(size_t i=0; i<src.count(); i++)
1187: src.put(i, &transcode(*src[i], source_transcoder, dest_transcoder));
1.34 paf 1188: }
1189:
1190: #ifndef DOXYGEN
/// Context handed to transcode_pair for every hash entry: the charset pair to convert between.
/// Field order matters: the struct is filled with a positional brace initializer.
1191: struct Transcode_pair_info {
1192: const Charset* source_transcoder;
1193: const Charset* dest_transcoder;
1194: };
1195: #endif
1.99 moko 1196: static void transcode_pair(HashStringValue::key_type /*akey*/, String::Body& avalue, Transcode_pair_info* info) {
1197: avalue=Charset::transcode(avalue, *info->source_transcoder, *info->dest_transcoder);
1.34 paf 1198: }
1.61 misha 1199:
1.99 moko 1200: void Charset::transcode(HashStringString& src, const Charset& source_transcoder, const Charset& dest_transcoder) {
1.35 paf 1201: Transcode_pair_info info={&source_transcoder, &dest_transcoder};
1.55 paf 1202: src.for_each_ref<Transcode_pair_info*>(transcode_pair, &info);
1.34 paf 1203: }
1.61 misha 1204:
1205: size_t getUTF8BytePos(const XMLByte* srcBegin, const XMLByte* srcEnd, size_t charPos){
1206: const XMLByte* ptr=srcBegin;
1.70 misha 1207: while(charPos-- && skipUTF8Char(ptr, srcEnd));
1.61 misha 1208:
1209: return ptr-srcBegin;
1210: }
1211:
1212: size_t getUTF8CharPos(const XMLByte* srcBegin, const XMLByte* srcEnd, size_t bytePos){
1213: size_t charPos=0;
1214: const XMLByte* ptr=srcBegin;
1215: const XMLByte* ptrEnd=srcBegin+bytePos;
1.70 misha 1216: while(skipUTF8Char(ptr, srcEnd)){
1.61 misha 1217: if(ptr>ptrEnd)
1218: return charPos;
1219: charPos++;
1220: }
1221:
1222: // scan till end but position in bytes still too low
1.107 moko 1223: throw Exception(0, 0, "Error conversion byte pos to char pos");
1.61 misha 1224: }
1225:
1226: size_t lengthUTF8(const XMLByte* srcBegin, const XMLByte* srcEnd){
1227: size_t size=0;
1.70 misha 1228: while(skipUTF8Char(srcBegin, srcEnd))
1.61 misha 1229: size++;
1230:
1231: return size;
1232: }
1.80 misha 1233:
1.84 misha 1234: unsigned int lengthUTF8Char(const XMLByte c){
1235: return gUTFBytes[c]+1;
1236: }
1237:
1.94 moko 1238: const char *fixUTF8(const char *src){
1239: if(src && *src){
1240: size_t length=strlen(src);
1241:
1242: int error_offset;
1.96 moko 1243: if(pa_pcre_valid_utf((unsigned char *)src, length, &error_offset)){
1.94 moko 1244:
1245: char *result=(char *)pa_malloc_atomic(length+1);
1246: char *dst=result;
1247:
1248: do {
1249:
1250: if(error_offset){
1.110 moko 1251: memcpy(dst, src, error_offset);
1.94 moko 1252: dst+=error_offset;
1253:
1254: src+=error_offset;
1255: length-=error_offset;
1256:
1257: }
1258:
1259: *dst++='?';
1260: src++;
1261: length--;
1262:
1.96 moko 1263: } while (length && pa_pcre_valid_utf((unsigned char *)src, length, &error_offset));
1.94 moko 1264:
1265: if(length){
1266: strcpy(dst, src);
1267: } else {
1268: *dst='\0';
1269: }
1270:
1271: return result;
1272: }
1273: }
1274: return src;
1275: }
1276:
1.80 misha 1277: bool UTF8_string_iterator::has_next(){
1278: fcharSize=readUTF8Char(fsrcPtr, fsrcEnd, ffirstByte, fUTF8Char);
1279: return fcharSize!=0;
1280: }
E-mail: