Annotation of parser3/src/main/pa_charset.C, revision 1.38
1.1 paf 1: /** @file
2: Parser: Charset connection implementation.
3:
1.35 paf 4: Copyright(c) 2001-2003 ArtLebedev Group (http://www.artlebedev.com)
1.4 paf 5: Author: Alexander Petrosyan<paf@design.ru>(http://paf.design.ru)
1.27 paf 6: */
1.1 paf 7:
// Identification string for this source file; the value has the form of an
// expanded "$Date$" version-control keyword (presumably rewritten on commit).
1.38 ! paf 8: static const char* IDENT_CHARSET_C="$Date: 2003/09/25 09:15:03 $";
1.1 paf 9:
10: #include "pa_charset.h"
1.35 paf 11: #include "pa_charsets.h"
1.1 paf 12:
13: #ifdef XML
1.8 paf 14: #include "libxml/encoding.h"
1.1 paf 15: #endif
16:
1.38 ! paf 17: // globals
! 18:
// Records of the to-upper-case table; the initializers are pulled in from
// the generated include file below.
! 19: Charset::UTF8CaseTable::Rec UTF8CaseToUpperRecords[]={
! 20: #include "utf8-to-upper.inc"
! 21: };
// To-upper-case table: record count followed by the record array.
! 22: Charset::UTF8CaseTable UTF8CaseToUpper={
! 23: sizeof(UTF8CaseToUpperRecords)/sizeof(Charset::UTF8CaseTable::Rec),
! 24: UTF8CaseToUpperRecords};
! 25:
// Records of the to-lower-case table; the initializers are pulled in from
// the generated include file below.
! 26: Charset::UTF8CaseTable::Rec UTF8CaseToLowerRecords[]={
! 27: #include "utf8-to-lower.inc"
! 28: };
// To-lower-case table: record count followed by the record array.
! 29: Charset::UTF8CaseTable UTF8CaseToLower={
! 30: sizeof(UTF8CaseToLowerRecords)/sizeof(Charset::UTF8CaseTable::Rec),
! 31: UTF8CaseToLowerRecords};
! 32:
1.1 paf 33: // helpers
34:
35: inline void prepare_case_tables(unsigned char *tables) {
36: unsigned char *lcc_table=tables+lcc_offset;
37: unsigned char *fcc_table=tables+fcc_offset;
38: for(int i=0; i<0x100; i++)
39: lcc_table[i]=fcc_table[i]=i;
40: }
41: inline void cstr2ctypes(unsigned char *tables, const unsigned char *cstr,
42: unsigned char bit) {
43: unsigned char *ctypes_table=tables+ctypes_offset;
44: ctypes_table[0]=bit;
45: for(; *cstr; cstr++) {
46: unsigned char c=*cstr;
47: ctypes_table[c]|=bit;
48: }
49: }
/// Converts a table cell to a character code.
/// - null or empty cell: 0
/// - single character: the code of that character itself
/// - anything longer: parsed as a number by strtol with base 0
///   (so decimal, 0x-hex and 0-octal notations are accepted)
inline unsigned int to_wchar_code(const char* cstr) {
	if(cstr==0 || cstr[0]=='\0')
		return 0;

	if(cstr[1]=='\0')
		return static_cast<unsigned int>(static_cast<unsigned char>(cstr[0]));

	// the end-of-number position is not needed, hence the null endptr
	return static_cast<unsigned int>(strtol(cstr, 0, 0));
}
/// A cell counts as "true" when it is present and not empty.
inline bool to_bool(const char* cstr) {
	if(!cstr)
		return false;
	return cstr[0]!='\0';
}
62: static void element2ctypes(unsigned char c, bool belongs,
63: unsigned char *tables, unsigned char bit, int group_offset=-1) {
64: if(!belongs)
65: return;
66:
67: unsigned char *ctypes_table=tables+ctypes_offset;
68:
69: ctypes_table[c]|=bit;
70: if(group_offset>=0)
1.4 paf 71: tables[cbits_offset+group_offset+c/8] |= 1<<(c%8);
1.1 paf 72: }
73: static void element2case(unsigned char from, unsigned char to,
74: unsigned char *tables) {
75: if(!to)
76: return;
77:
78: unsigned char *lcc_table=tables+lcc_offset;
79: unsigned char *fcc_table=tables+fcc_offset;
80: lcc_table[from]=to;
81: fcc_table[from]=to; fcc_table[to]=from;
82: }
83:
84: // methods
85:
86: extern "C" unsigned char pcre_default_tables[]; // pcre/chartables.c
1.37 paf 87: Charset::Charset(Request_charsets* charsets, const String::Body ANAME, const String* afile_spec):
1.35 paf 88: FNAME(ANAME),
89: FNAME_CSTR(ANAME.cstrm()) {
1.7 paf 90:
1.35 paf 91: if(afile_spec) {
1.1 paf 92: fisUTF8=false;
1.35 paf 93: load_definition(*charsets, *afile_spec);
1.1 paf 94: #ifdef XML
1.35 paf 95: addEncoding(FNAME_CSTR);
1.1 paf 96: #endif
97: } else {
98: fisUTF8=true;
1.4 paf 99: // grab default onces [for UTF-8 so to be able to make a-z =>A-Z
1.1 paf 100: memcpy(pcre_tables, pcre_default_tables, sizeof(pcre_tables));
101: }
102:
103: #ifdef XML
1.35 paf 104: initTranscoder(FNAME, FNAME_CSTR);
1.1 paf 105: #endif
106: }
107:
1.35 paf 108: void Charset::load_definition(Request_charsets& charsets, const String& afile_spec) {
1.1 paf 109: // pcre_tables
110: // lowcase, flipcase, bits digit+word+whitespace, masks
111:
112: // must not move this inside of prepare_case_tables
113: // don't know the size there
114: memset(pcre_tables, 0, sizeof(pcre_tables));
115: prepare_case_tables(pcre_tables);
1.4 paf 116: cstr2ctypes(pcre_tables,(const unsigned char *)"*+?{^.$|()[", ctype_meta);
1.1 paf 117:
118: // charset
1.35 paf 119: memset(&tables, 0, sizeof(tables));
1.1 paf 120: // strangly vital
1.10 paf 121: tables.toTable[tables.toTableSize].intCh=0;
122: tables.toTable[tables.toTableSize].extCh=(XMLByte)0;
123: tables.toTableSize++;
1.1 paf 124:
125: // loading text
1.35 paf 126: char *data=file_read_text(charsets, afile_spec);
1.1 paf 127:
128: // ignore header
129: getrow(&data);
130:
131: // parse cells
132: char *row;
133: while(row=getrow(&data)) {
134: // remove empty&comment lines
135: if(!*row || *row=='#')
136: continue;
137:
138: // char white-space digit hex-digit letter word lowercase unicode1 unicode2
139: unsigned int c=0;
140: char *cell;
141: for(int column=0; cell=lsplit(&row, '\t'); column++) {
142: switch(column) {
143: case 0: c=to_wchar_code(cell); break;
144: // pcre_tables
145: case 1: element2ctypes(c, to_bool(cell), pcre_tables, ctype_space, cbit_space); break;
146: case 2: element2ctypes(c, to_bool(cell), pcre_tables, ctype_digit, cbit_digit); break;
147: case 3: element2ctypes(c, to_bool(cell), pcre_tables, ctype_xdigit); break;
148: case 4: element2ctypes(c, to_bool(cell), pcre_tables, ctype_letter); break;
149: case 5: element2ctypes(c, to_bool(cell), pcre_tables, ctype_word, cbit_word); break;
150: case 6: element2case(c, to_wchar_code(cell), pcre_tables); break;
151: case 7:
152: case 8:
153: // charset
1.10 paf 154: if(tables.toTableSize>MAX_CHARSET_UNI_CODES)
1.23 paf 155: throw Exception("parser.runtime",
1.35 paf 156: &afile_spec,
1.1 paf 157: "charset must contain not more then %d unicode values", MAX_CHARSET_UNI_CODES);
158:
159: XMLCh unicode=(XMLCh)to_wchar_code(cell);
160: if(!unicode && column==7/*unicode1 column*/)
161: unicode=(XMLCh)c;
162: if(unicode) {
1.10 paf 163: if(!tables.fromTable[c])
164: tables.fromTable[c]=unicode;
165: tables.toTable[tables.toTableSize].intCh=unicode;
166: tables.toTable[tables.toTableSize].extCh=(XMLByte)c;
167: tables.toTableSize++;
1.1 paf 168: }
169: break;
170: }
171: }
172: };
173:
174: // sort by the Unicode code point
175: sort_ToTable();
176: }
177:
178: static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
179: return
1.38 ! paf 180: static_cast<const Charset::Tables::Rec *>(a)->intCh-
! 181: static_cast<const Charset::Tables::Rec *>(b)->intCh;
1.1 paf 182: }
183:
184: void Charset::sort_ToTable() {
1.10 paf 185: _qsort(tables.toTable, tables.toTableSize, sizeof(*tables.toTable),
1.1 paf 186: sort_cmp_Trans_rec_intCh);
187: //FILE *f=fopen("c:\\temp\\a", "wb");
1.10 paf 188: //fwrite(tables.toTable, tables.toTableSize, sizeof(*tables.toTable), f);
1.1 paf 189: //fclose(f);
190: }
191:
1.10 paf 192: static XMLByte xlatOneTo(const XMLCh toXlat,
1.35 paf 193: const Charset::Tables& tables,
194: XMLByte not_found) {
195: unsigned int lowOfs = 0;
196: unsigned int hiOfs = tables.toTableSize - 1;
197: do {
198: // Calc the mid point of the low and high offset.
199: const unsigned int midOfs =((hiOfs - lowOfs) / 2)+lowOfs;
200:
201: // If our test char is greater than the mid point char, then
202: // we move up to the upper half. Else we move to the lower
203: // half. If its equal, then its our guy.
204: if(toXlat>tables.toTable[midOfs].intCh)
205: lowOfs = midOfs;
1.10 paf 206: else if(toXlat<tables.toTable[midOfs].intCh)
1.1 paf 207: hiOfs = midOfs;
208: else
1.10 paf 209: return tables.toTable[midOfs].extCh;
1.4 paf 210: } while(lowOfs+1<hiOfs);
1.35 paf 211:
212: return not_found;
1.1 paf 213: }
214:
1.35 paf 215: String::C Charset::transcode(const String::C src,
216: const Charset& source_charset,
217: const Charset& dest_charset) {
218: if(!src.length)
219: return String::C("", 0);
1.4 paf 220:
1.1 paf 221: switch((source_charset.isUTF8()?0x10:0x00)|(dest_charset.isUTF8()?0x01:0x00)) {
222: default: // 0x00
1.35 paf 223: return source_charset.transcodeToCharset(src, dest_charset);
1.1 paf 224: case 0x01:
1.35 paf 225: return source_charset.transcodeToUTF8(src);
1.1 paf 226: case 0x10:
1.35 paf 227: return dest_charset.transcodeFromUTF8(src);
1.1 paf 228: case 0x11:
1.35 paf 229: return src;
1.1 paf 230: }
231: }
232:
233: // ---------------------------------------------------------------------------
234: // Local static data
235: //
236: // gUTFBytes
237: // A list of counts of trailing bytes for each initial byte in the input.
238: //
239: // gUTFOffsets
240: // A list of values to offset each result char type, according to how
241: // many source bytes went into making it.
242: //
243: // gFirstByteMark
244: // A list of values to mask onto the first byte of an encoded sequence,
245: // indexed by the number of bytes used to create the sequence.
246: // ---------------------------------------------------------------------------
// Number of trailing bytes of a UTF-8 sequence, indexed by its first byte:
// 0x00..0xBF -> 0, 0xC0..0xDF -> 1, 0xE0..0xEF -> 2, 0xF0..0xF7 -> 3,
// 0xF8..0xFB -> 4, 0xFC..0xFF -> 5.
247: static const XMLByte gUTFBytes[0x100] = {
248: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
249: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
250: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
251: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
252: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
253: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
254: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
255: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
256: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
257: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
258: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
259: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
260: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
261: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
262: , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
263: , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
264: };
265:
// Value subtracted from the accumulated sum of a sequence's bytes,
// indexed by the number of trailing bytes (0..5) of that sequence.
266: static const uint gUTFOffsets[6] = {
267: 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
268: };
269:
// Bits OR-ed into the first byte of an encoded sequence,
// indexed by the total number of bytes (1..6) in that sequence; index 0 is unused by the encoders in this file.
270: static const XMLByte gFirstByteMark[7] = {
271: 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
272: };
273:
1.35 paf 274: static int transcodeToUTF8(const XMLByte* srcData, size_t& srcLen,
275: XMLByte *toFill, size_t& toFillLen,
276: const Charset::Tables& tables) {
1.11 paf 277: const XMLByte* srcPtr=srcData;
278: const XMLByte* srcEnd=srcData+srcLen;
279: XMLByte* outPtr=toFill;
280: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 281:
1.35 paf 282: while(srcPtr<srcEnd) {
283: uint curVal = tables.fromTable[*srcPtr];
1.1 paf 284: if(!curVal) {
1.35 paf 285: // use the replacement character
286: *outPtr++= '?';
287: srcPtr++;
288: continue;
289: }
1.1 paf 290:
1.35 paf 291: // Figure out how many bytes we need
292: unsigned int encodedBytes;
293: if(curVal<0x80)
294: encodedBytes = 1;
295: else if(curVal<0x800)
296: encodedBytes = 2;
297: else if(curVal<0x10000)
298: encodedBytes = 3;
299: else if(curVal<0x200000)
300: encodedBytes = 4;
301: else if(curVal<0x4000000)
302: encodedBytes = 5;
303: else if(curVal<= 0x7FFFFFFF)
304: encodedBytes = 6;
305: else {
306: // use the replacement character
307: *outPtr++= '?';
308: srcPtr++;
309: continue;
310: }
1.11 paf 311:
1.35 paf 312: // If we cannot fully get this char into the output buffer
313: if (outPtr + encodedBytes > outEnd)
314: break;
315:
316: // We can do it, so update the source index
317: srcPtr++;
318:
319: // And spit out the bytes. We spit them out in reverse order
320: // here, so bump up the output pointer and work down as we go.
321: outPtr+= encodedBytes;
322: switch(encodedBytes) {
323: case 6: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
324: curVal>>= 6;
325: case 5: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
326: curVal>>= 6;
327: case 4: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
328: curVal>>= 6;
329: case 3: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
330: curVal>>= 6;
331: case 2: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
332: curVal>>= 6;
333: case 1: *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]);
334: }
335:
336: // Add the encoded bytes back in again to indicate we've eaten them
337: outPtr+= encodedBytes;
338: }
339:
340: // Update the bytes eaten
341: srcLen = srcPtr - srcData;
342:
343: // Return the characters read
344: toFillLen = outPtr - toFill;
345:
1.29 paf 346: //return srcPtr==srcEnd?(int)toFillLen:-1;
347: /*
348: xmlCharEncodingInputFunc
349: Returns :
350: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
351: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
352: of ocetes consumed.
353: */
354: return 0;
1.1 paf 355: }
1.26 paf 356: /// @todo digital entites only when xml/html output [at output in html/xml mode, in html part of a letter]
1.35 paf 357: static int transcodeFromUTF8(const XMLByte* srcData, size_t& srcLen,
358: XMLByte* toFill, size_t& toFillLen,
359: const Charset::Tables& tables) {
1.11 paf 360: const XMLByte* srcPtr=srcData;
361: const XMLByte* srcEnd=srcData+srcLen;
362: XMLByte* outPtr=toFill;
363: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 364:
1.35 paf 365: // We now loop until we either run out of input data, or room to store
366: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
367: // Get the next leading byte out
368: const XMLByte firstByte =* srcPtr;
369:
370: // Special-case ASCII, which is a leading byte value of<= 127
371: if(firstByte<= 127) {
372: *outPtr++= firstByte;
373: srcPtr++;
374: continue;
375: }
376:
377: // See how many trailing src bytes this sequence is going to require
378: const unsigned int trailingBytes = gUTFBytes[firstByte];
379:
380: // If there are not enough source bytes to do this one, then we
381: // are done. Note that we done>= here because we are implicitly
382: // counting the 1 byte we get no matter what.
383: if(srcPtr+trailingBytes>= srcEnd)
384: break;
385:
386: // Looks ok, so lets build up the value
387: uint tmpVal=0;
388: switch(trailingBytes) {
389: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
390: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
391: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
392: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
393: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
394: case 0: tmpVal+=*srcPtr++;
395: break;
396:
397: default:
398: throw Exception(0,
399: 0,
400: "transcodeFromUTF8 error: wrong trailingBytes value(%d)", trailingBytes);
401: }
402: tmpVal-=gUTFOffsets[trailingBytes];
403:
404: // If it will fit into a single char, then put it in. Otherwise
405: // fail [*encode it as a surrogate pair. If its not valid, use the
406: // replacement char.*]
407: if(!(tmpVal & 0xFFFF0000)) {
1.25 paf 408: if(XMLByte xlat=xlatOneTo(tmpVal, tables, 0))
409: *outPtr++=xlat;
410: else
411: outPtr+=sprintf((char *)outPtr, "&#%d;", tmpVal); // &#decimal;
412: } else
1.23 paf 413: throw Exception(0,
1.35 paf 414: 0,
415: "transcodeFromUTF8 error: too big tmpVal(0x%08X)", tmpVal);
1.1 paf 416: }
1.35 paf 417:
418: // Update the bytes eaten
419: srcLen = srcPtr - srcData;
420:
421: // Return the characters read
422: toFillLen = outPtr - toFill;
1.11 paf 423:
1.29 paf 424: //return srcPtr==srcEnd?(int)toFillLen:-1;
425: /*
426: xmlCharEncodingOutputFunc
427: Returns :
428: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
429: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
430: of ocetes consumed.
431: */
432: return 0;
1.10 paf 433: }
434:
435: /// @todo not so memory-hungry with prescan
1.35 paf 436: const String::C Charset::transcodeToUTF8(const String::C src) const {
437: size_t src_length=src.length;
438: size_t dest_length=src.length*6/*so that surly enough, max utf8 seq len=6*/;
439: #ifndef NDEBUG
440: size_t saved_dest_length=dest_length;
441: #endif
442: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
1.11 paf 443:
444: if(::transcodeToUTF8(
1.35 paf 445: (XMLByte *)src.str, src_length,
446: dest_body, dest_length,
1.11 paf 447: tables)<0)
1.10 paf 448: throw(0, 0,
449: 0,
1.11 paf 450: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 451:
1.35 paf 452: assert(dest_length<=saved_dest_length); dest_body[dest_length]=0; // terminator
453: return String::C((char*)dest_body, dest_length);
1.10 paf 454: }
1.38 ! paf 455:
! 456: static XMLCh change_case_UTF8(const XMLCh src, const Charset::UTF8CaseTable& table) {
! 457: unsigned int lowOfs = 0;
! 458: unsigned int hiOfs = table.size - 1;
! 459: do {
! 460: // Calc the mid point of the low and high offset.
! 461: const unsigned int midOfs =((hiOfs - lowOfs) / 2)+lowOfs;
! 462:
! 463: // If our test char is greater than the mid point char, then
! 464: // we move up to the upper half. Else we move to the lower
! 465: // half. If its equal, then its our guy.
! 466: if(src>table.records[midOfs].from)
! 467: lowOfs = midOfs;
! 468: else if(src<table.records[midOfs].from)
! 469: hiOfs = midOfs;
! 470: else
! 471: return table.records[midOfs].to;
! 472: } while(lowOfs+1<hiOfs);
! 473:
! 474: return src;
! 475: }
! 476:
! 477: static void store_UTF8(XMLCh src, XMLByte*& outPtr ) {
! 478: if(!src) {
! 479: // use the replacement character
! 480: *outPtr++= '?';
! 481: return;
! 482: }
! 483:
! 484: // Figure out how many bytes we need
! 485: unsigned int encodedBytes;
! 486: if(src<0x80)
! 487: encodedBytes = 1;
! 488: else if(src<0x800)
! 489: encodedBytes = 2;
! 490: else if(src<0x10000)
! 491: encodedBytes = 3;
! 492: else if(src<0x200000)
! 493: encodedBytes = 4;
! 494: else if(src<0x4000000)
! 495: encodedBytes = 5;
! 496: else if(src<= 0x7FFFFFFF)
! 497: encodedBytes = 6;
! 498: else {
! 499: // use the replacement character
! 500: *outPtr++= '?';
! 501: return;
! 502: }
! 503:
! 504: // And spit out the bytes. We spit them out in reverse order
! 505: // here, so bump up the output pointer and work down as we go.
! 506: outPtr+= encodedBytes;
! 507: switch(encodedBytes) {
! 508: case 6: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
! 509: src>>= 6;
! 510: case 5: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
! 511: src>>= 6;
! 512: case 4: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
! 513: src>>= 6;
! 514: case 3: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
! 515: src>>= 6;
! 516: case 2: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
! 517: src>>= 6;
! 518: case 1: *--outPtr = XMLByte(src | gFirstByteMark[encodedBytes]);
! 519: }
! 520:
! 521: // Add the encoded bytes back in again to indicate we've eaten them
! 522: outPtr+= encodedBytes;
! 523: }
! 524:
! 525: static void change_case_UTF8(XMLCh src, XMLByte*& outPtr,
! 526: const Charset::UTF8CaseTable& table) {
! 527: store_UTF8(change_case_UTF8(src, table), outPtr);
! 528: };
! 529: void change_case_UTF8(const XMLByte* srcData, XMLByte* toFill,
! 530: const Charset::UTF8CaseTable& table) {
! 531: const XMLByte* srcPtr=srcData;
! 532: XMLByte* outPtr=toFill;
! 533:
! 534: // Get the next leading byte out
! 535: while (const XMLByte firstByte = *srcPtr) {
! 536: if(firstByte<= 127) {
! 537: change_case_UTF8(firstByte, outPtr, table);
! 538: srcPtr++;
! 539: continue;
! 540: }
! 541:
! 542: // See how many trailing src bytes this sequence is going to require
! 543: const unsigned int trailingBytes = gUTFBytes[firstByte];
! 544:
! 545: // Looks ok, so lets build up the value
! 546: uint tmpVal=0;
! 547: switch(trailingBytes) {
! 548: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
! 549: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
! 550: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
! 551: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
! 552: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
! 553: case 0: tmpVal+=*srcPtr++;
! 554: break;
! 555:
! 556: default:
! 557: throw Exception(0,
! 558: 0,
! 559: "change_case_UTF8 error: wrong trailingBytes value(%d)", trailingBytes);
! 560: }
! 561: tmpVal-=gUTFOffsets[trailingBytes];
! 562:
! 563: // If it will fit into a single char, then put it in. Otherwise
! 564: // fail [*encode it as a surrogate pair. If its not valid, use the
! 565: // replacement char.*]
! 566: if(!(tmpVal & 0xFFFF0000))
! 567: change_case_UTF8(tmpVal, outPtr, table);
! 568: else
! 569: throw Exception(0,
! 570: 0,
! 571: "change_case_UTF8 error: too big tmpVal(0x%08X)", tmpVal);
! 572: }
! 573:
! 574: if(srcPtr!=outPtr)
! 575: throw Exception(0,
! 576: 0,
! 577: "change_case_UTF8 error: end pointers do not match");
! 578: }
! 579:
! 580:
1.35 paf 581: const String::C Charset::transcodeFromUTF8(const String::C src) const {
582: size_t src_length=src.length;
583: size_t dest_length=src.length*6/*so that surly enough, "ÿ" has max ratio */;
584: #ifndef NDEBUG
585: size_t saved_dest_length=dest_length;
586: #endif
587: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
1.11 paf 588:
589: if(::transcodeFromUTF8(
1.35 paf 590: (XMLByte *)src.str, src_length,
591: dest_body, dest_length,
1.11 paf 592: tables)<0)
1.10 paf 593: throw(0, 0,
594: 0,
1.35 paf 595: "Charset::transcodeFromUTF8 buffer overflow");
1.10 paf 596:
1.35 paf 597: assert(dest_length<=saved_dest_length); dest_body[dest_length]=0; // terminator
598: return String::C((char*)dest_body, dest_length);
1.1 paf 599: }
600:
601: /// transcode using both charsets
1.35 paf 602: const String::C Charset::transcodeToCharset(const String::C src,
603: const Charset& dest_charset) const {
604: if(&dest_charset==this)
605: return src;
606: else {
607: size_t dest_length=src.length;
608: XMLByte* dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
609:
610: XMLByte* output=dest_body;
611: const XMLByte* input=(XMLByte *)src.str;
612: while(XMLCh c=*input++) {
613: XMLCh curVal = tables.fromTable[c];
614: *output++=curVal?
615: xlatOneTo(curVal, dest_charset.tables, '?') // OK
616: :'?'; // use the replacement character
1.6 paf 617: }
1.1 paf 618:
1.35 paf 619: dest_body[dest_length]=0; // terminator
620: return String::C((char*)dest_body, dest_length);
1.6 paf 621: }
1.1 paf 622: }
623:
624: #ifdef XML
1.10 paf 625:
1.35 paf 626: static const Charset::Tables* tables[MAX_CHARSETS];
627:
628: #define declareXml256ioFuncs(i) \
629: static int xml256CharEncodingInputFunc##i( \
630: unsigned char *out, int *outlen, \
631: const unsigned char *in, int *inlen) { \
632: return transcodeToUTF8( \
633: in, *(size_t*)inlen, \
634: out, *(size_t*)outlen, \
635: *tables[i]); \
636: } \
637: static int xml256CharEncodingOutputFunc##i( \
638: unsigned char *out, int *outlen, \
639: const unsigned char *in, int *inlen) { \
640: return transcodeFromUTF8( \
641: in, *(size_t*)inlen, \
642: out, *(size_t*)outlen, \
643: *tables[i]); \
644: }
645:
646: declareXml256ioFuncs(0) declareXml256ioFuncs(1)
647: declareXml256ioFuncs(2) declareXml256ioFuncs(3)
648: declareXml256ioFuncs(4) declareXml256ioFuncs(5)
649: declareXml256ioFuncs(6) declareXml256ioFuncs(7)
650: declareXml256ioFuncs(8) declareXml256ioFuncs(9)
651:
652: static xmlCharEncodingInputFunc inputFuncs[MAX_CHARSETS]={
653: xml256CharEncodingInputFunc0, xml256CharEncodingInputFunc1,
654: xml256CharEncodingInputFunc2, xml256CharEncodingInputFunc3,
655: xml256CharEncodingInputFunc4, xml256CharEncodingInputFunc5,
656: xml256CharEncodingInputFunc6, xml256CharEncodingInputFunc7,
657: xml256CharEncodingInputFunc8, xml256CharEncodingInputFunc9
658: };
659: static xmlCharEncodingOutputFunc outputFuncs[MAX_CHARSETS]={
660: xml256CharEncodingOutputFunc0, xml256CharEncodingOutputFunc1,
661: xml256CharEncodingOutputFunc2, xml256CharEncodingOutputFunc3,
662: xml256CharEncodingOutputFunc4, xml256CharEncodingOutputFunc5,
663: xml256CharEncodingOutputFunc6, xml256CharEncodingOutputFunc7,
664: xml256CharEncodingOutputFunc8, xml256CharEncodingOutputFunc9
665: };
666: static size_t handlers_count=0;
1.10 paf 667:
668: void Charset::addEncoding(char *name_cstr) {
1.35 paf 669: if(handlers_count==MAX_CHARSETS)
670: throw Exception(0,
671: 0,
672: "already allocated %d handlers, no space for new encoding '%s'",
673: MAX_CHARSETS, name_cstr);
674:
675: xmlCharEncodingHandler* handler=new(PointerFreeGC) xmlCharEncodingHandler;
676: {
677: handler->name=name_cstr;
678: handler->input=inputFuncs[handlers_count];
679: handler->output=outputFuncs[handlers_count];
680: ::tables[handlers_count]=&tables;
681: handlers_count++;
682: }
1.10 paf 683:
684: xmlRegisterCharEncodingHandler(handler);
1.35 paf 685:
1.10 paf 686: }
687:
1.37 paf 688: void Charset::initTranscoder(const String::Body NAME, const char* name_cstr) {
1.15 paf 689: ftranscoder=xmlFindCharEncodingHandler(name_cstr);
1.35 paf 690: transcoder(NAME); // check right way
1.15 paf 691: }
692:
1.37 paf 693: xmlCharEncodingHandler& Charset::transcoder(const String::Body NAME) {
1.15 paf 694: if(!ftranscoder)
1.23 paf 695: throw Exception("parser.runtime",
1.35 paf 696: new String(NAME, String::L_TAINTED),
1.10 paf 697: "unsupported encoding");
1.35 paf 698: return *ftranscoder;
1.10 paf 699: }
700:
1.35 paf 701: String::C Charset::transcode_cstr(xmlChar* s) {
1.13 paf 702: if(!s)
1.35 paf 703: return String::C("", 0);
1.8 paf 704:
1.35 paf 705: int inlen=strlen((const char*)s);
706: int outlen=inlen; // max
707: #ifndef NDEBUG
708: int saved_outlen=outlen;
709: #endif
710: char *out=new(PointerFreeGC) char[outlen+1];
1.8 paf 711:
1.30 paf 712: int error;
1.35 paf 713: if(xmlCharEncodingOutputFunc output=transcoder(FNAME).output) {
1.30 paf 714: error=output(
1.17 paf 715: (unsigned char*)out, &outlen,
1.35 paf 716: (const unsigned char*)s, &inlen);
1.30 paf 717: } else {
718: memcpy(out, s, outlen=inlen);
719: error=0;
720: }
721: if(error<0)
1.23 paf 722: throw Exception(0,
1.8 paf 723: 0,
1.30 paf 724: "transcode_cstr failed (%d)", error);
1.8 paf 725:
1.35 paf 726: assert(outlen<=saved_outlen); out[outlen]=0;
727: return String::C(out, outlen);
1.14 paf 728: }
1.35 paf 729: const String& Charset::transcode(xmlChar* s) {
730: String::C cstr=transcode_cstr(s);
731: return *new String(cstr.str, cstr.length, true);
732: }
733: String::C Charset::transcode_cstr(GdomeDOMString* s) {
734: return s?transcode_cstr(BAD_CAST s->str)
735: :String::C("", 0);
736: }
737: const String& Charset::transcode(GdomeDOMString* s) {
738: String::C cstr=transcode_cstr(s);
739: return *new String(cstr.str, cstr.length, true);
1.1 paf 740: }
741:
1.8 paf 742: /// @test less memory using -maybe- xmlParserInputBufferCreateMem
1.35 paf 743: xmlChar* Charset::transcode_buf2xchar(const char* buf, size_t buf_size) {
744: xmlChar* out;
1.30 paf 745: int outlen;
746: int error;
1.35 paf 747: #ifndef NDEBUG
748: int saved_outlen;
749: #endif
750: if(xmlCharEncodingInputFunc input=transcoder(FNAME).input) {
1.32 paf 751: outlen=buf_size*6/*max*/;
1.35 paf 752: #ifndef NDEBUG
753: saved_outlen=outlen;
754: #endif
755: out=(xmlChar*)xmlMalloc(outlen+1);
1.30 paf 756: error=input(
1.17 paf 757: out, &outlen,
1.35 paf 758: (const unsigned char*)buf, (int*)&buf_size);
1.30 paf 759: } else {
760: outlen=buf_size;
1.35 paf 761: #ifndef NDEBUG
762: saved_outlen=outlen;
763: #endif
764: out=(xmlChar*)xmlMalloc(outlen+1);
1.30 paf 765: memcpy(out, buf, outlen);
766: error=0;
767: }
1.17 paf 768:
1.30 paf 769: if(error<0)
1.23 paf 770: throw Exception(0,
1.8 paf 771: 0,
1.30 paf 772: "transcode_buf failed (%d)", error);
1.8 paf 773:
1.35 paf 774: assert(outlen<=saved_outlen); out[outlen]=0;
775: return out;
1.24 paf 776: }
1.35 paf 777: GdomeDOMString_auto_ptr Charset::transcode_buf2dom(const char* buf, size_t buf_size) {
778: return GdomeDOMString_auto_ptr(transcode_buf2xchar(buf, buf_size));
1.1 paf 779: }
1.12 paf 780: GdomeDOMString_auto_ptr Charset::transcode(const String& s) {
1.35 paf 781: const char* cstr=s.cstr(String::L_UNSPECIFIED);
1.1 paf 782:
1.24 paf 783: return transcode_buf2dom(cstr, strlen(cstr));
1.1 paf 784: }
1.37 paf 785: GdomeDOMString_auto_ptr Charset::transcode(const String::Body s) {
1.35 paf 786: const char* cstr=s.cstr();
787:
788: return transcode_buf2dom(cstr, s.length());
789: }
1.36 paf 790: #endif
1.34 paf 791:
1.37 paf 792: String::Body Charset::transcode(const String::Body src,
1.34 paf 793: const Charset& source_transcoder,
1.35 paf 794: const Charset& dest_transcoder) {
1.34 paf 795:
1.35 paf 796: const char *src_ptr=src.cstr();
1.34 paf 797: size_t src_size=strlen(src_ptr);
798:
1.35 paf 799: String::C dest=Charset::transcode(String::C(src_ptr, src_size),
800: source_transcoder,
801: dest_transcoder);
1.34 paf 802:
1.37 paf 803: return String::Body(dest.str, dest.length);
1.35 paf 804: }
805:
806: String& Charset::transcode(const String& src,
807: const Charset& source_transcoder,
808: const Charset& dest_transcoder) {
809: if(!src.length())
810: return *new String("", 0, false);
1.34 paf 811:
1.37 paf 812: return *new String(transcode((String::Body)src, source_transcoder, dest_transcoder), String::L_CLEAN);
1.34 paf 813: }
814:
1.35 paf 815: void Charset::transcode(ArrayString& src,
1.34 paf 816: const Charset& source_transcoder,
1.35 paf 817: const Charset& dest_transcoder) {
818: for(size_t i=0; i<src.count(); i++)
819: src.put(i, &transcode(*src[i], source_transcoder, dest_transcoder));
1.34 paf 820: }
821:
822: #ifndef DOXYGEN
823: struct Transcode_pair_info {
824: const Charset* source_transcoder;
825: const Charset* dest_transcoder;
826: };
827: #endif
1.37 paf 828: static void transcode_pair(const String::Body akey,
829: String::Body& avalue,
1.35 paf 830: Transcode_pair_info* info) {
831: avalue=Charset::transcode(avalue,
832: *info->source_transcoder,
833: *info->dest_transcoder);
1.34 paf 834: }
1.35 paf 835: void Charset::transcode(HashStringString& src,
1.34 paf 836: const Charset& source_transcoder,
1.35 paf 837: const Charset& dest_transcoder) {
838: Transcode_pair_info info={&source_transcoder, &dest_transcoder};
839: src.for_each_ref(transcode_pair, &info);
1.34 paf 840: }
E-mail: