Annotation of parser3/src/main/pa_charset.C, revision 1.35
1.1 paf 1: /** @file
2: Parser: Charset connection implementation.
3:
1.35 ! paf 4: Copyright(c) 2001-2003 ArtLebedev Group (http://www.artlebedev.com)
1.4 paf 5: Author: Alexander Petrosyan<paf@design.ru>(http://paf.design.ru)
1.27 paf 6: */
1.1 paf 7:
1.35 ! paf 8: static const char* IDENT_CHARSET_C="$Date: 2003/06/27 08:36:59 $";
1.1 paf 9:
10: #include "pa_charset.h"
1.35 ! paf 11: #include "pa_charsets.h"
1.1 paf 12:
13: #ifdef XML
1.8 paf 14: #include "libxml/encoding.h"
1.1 paf 15: #endif
16:
17: // helpers
18:
19: inline void prepare_case_tables(unsigned char *tables) {
20: unsigned char *lcc_table=tables+lcc_offset;
21: unsigned char *fcc_table=tables+fcc_offset;
22: for(int i=0; i<0x100; i++)
23: lcc_table[i]=fcc_table[i]=i;
24: }
25: inline void cstr2ctypes(unsigned char *tables, const unsigned char *cstr,
26: unsigned char bit) {
27: unsigned char *ctypes_table=tables+ctypes_offset;
28: ctypes_table[0]=bit;
29: for(; *cstr; cstr++) {
30: unsigned char c=*cstr;
31: ctypes_table[c]|=bit;
32: }
33: }
/// Converts a definition-file cell to a character code.
/// - null or empty cell: 0
/// - a single character: the code of that character itself
/// - anything longer: parsed by strtol with base auto-detection
///   (decimal, 0x-hex, 0-octal)
inline unsigned int to_wchar_code(const char* cstr) {
	if(cstr==0 || cstr[0]=='\0')
		return 0;

	const bool single_char=cstr[1]=='\0';
	if(single_char)
		return (unsigned int)(unsigned char)cstr[0];

	// the end position is of no interest here
	return (unsigned int)strtol(cstr, 0, 0);
}
/// A cell counts as "true" when it is present and not empty.
inline bool to_bool(const char* cstr) {
	if(!cstr)
		return false;
	return cstr[0]!='\0';
}
46: static void element2ctypes(unsigned char c, bool belongs,
47: unsigned char *tables, unsigned char bit, int group_offset=-1) {
48: if(!belongs)
49: return;
50:
51: unsigned char *ctypes_table=tables+ctypes_offset;
52:
53: ctypes_table[c]|=bit;
54: if(group_offset>=0)
1.4 paf 55: tables[cbits_offset+group_offset+c/8] |= 1<<(c%8);
1.1 paf 56: }
57: static void element2case(unsigned char from, unsigned char to,
58: unsigned char *tables) {
59: if(!to)
60: return;
61:
62: unsigned char *lcc_table=tables+lcc_offset;
63: unsigned char *fcc_table=tables+fcc_offset;
64: lcc_table[from]=to;
65: fcc_table[from]=to; fcc_table[to]=from;
66: }
67:
68: // methods
69:
70: extern "C" unsigned char pcre_default_tables[]; // pcre/chartables.c
1.35 ! paf 71: Charset::Charset(Request_charsets* charsets, const StringBody ANAME, const String* afile_spec):
! 72: FNAME(ANAME),
! 73: FNAME_CSTR(ANAME.cstrm()) {
1.7 paf 74:
1.35 ! paf 75: if(afile_spec) {
1.1 paf 76: fisUTF8=false;
1.35 ! paf 77: load_definition(*charsets, *afile_spec);
1.1 paf 78: #ifdef XML
1.35 ! paf 79: addEncoding(FNAME_CSTR);
1.1 paf 80: #endif
81: } else {
82: fisUTF8=true;
1.4 paf 83: // grab default onces [for UTF-8 so to be able to make a-z =>A-Z
1.1 paf 84: memcpy(pcre_tables, pcre_default_tables, sizeof(pcre_tables));
85: }
86:
87: #ifdef XML
1.35 ! paf 88: initTranscoder(FNAME, FNAME_CSTR);
1.1 paf 89: #endif
90: }
91:
1.35 ! paf 92: void Charset::load_definition(Request_charsets& charsets, const String& afile_spec) {
1.1 paf 93: // pcre_tables
94: // lowcase, flipcase, bits digit+word+whitespace, masks
95:
96: // must not move this inside of prepare_case_tables
97: // don't know the size there
98: memset(pcre_tables, 0, sizeof(pcre_tables));
99: prepare_case_tables(pcre_tables);
1.4 paf 100: cstr2ctypes(pcre_tables,(const unsigned char *)"*+?{^.$|()[", ctype_meta);
1.1 paf 101:
102: // charset
1.35 ! paf 103: memset(&tables, 0, sizeof(tables));
1.1 paf 104: // strangly vital
1.10 paf 105: tables.toTable[tables.toTableSize].intCh=0;
106: tables.toTable[tables.toTableSize].extCh=(XMLByte)0;
107: tables.toTableSize++;
1.1 paf 108:
109: // loading text
1.35 ! paf 110: char *data=file_read_text(charsets, afile_spec);
1.1 paf 111:
112: // ignore header
113: getrow(&data);
114:
115: // parse cells
116: char *row;
117: while(row=getrow(&data)) {
118: // remove empty&comment lines
119: if(!*row || *row=='#')
120: continue;
121:
122: // char white-space digit hex-digit letter word lowercase unicode1 unicode2
123: unsigned int c=0;
124: char *cell;
125: for(int column=0; cell=lsplit(&row, '\t'); column++) {
126: switch(column) {
127: case 0: c=to_wchar_code(cell); break;
128: // pcre_tables
129: case 1: element2ctypes(c, to_bool(cell), pcre_tables, ctype_space, cbit_space); break;
130: case 2: element2ctypes(c, to_bool(cell), pcre_tables, ctype_digit, cbit_digit); break;
131: case 3: element2ctypes(c, to_bool(cell), pcre_tables, ctype_xdigit); break;
132: case 4: element2ctypes(c, to_bool(cell), pcre_tables, ctype_letter); break;
133: case 5: element2ctypes(c, to_bool(cell), pcre_tables, ctype_word, cbit_word); break;
134: case 6: element2case(c, to_wchar_code(cell), pcre_tables); break;
135: case 7:
136: case 8:
137: // charset
1.10 paf 138: if(tables.toTableSize>MAX_CHARSET_UNI_CODES)
1.23 paf 139: throw Exception("parser.runtime",
1.35 ! paf 140: &afile_spec,
1.1 paf 141: "charset must contain not more then %d unicode values", MAX_CHARSET_UNI_CODES);
142:
143: XMLCh unicode=(XMLCh)to_wchar_code(cell);
144: if(!unicode && column==7/*unicode1 column*/)
145: unicode=(XMLCh)c;
146: if(unicode) {
1.10 paf 147: if(!tables.fromTable[c])
148: tables.fromTable[c]=unicode;
149: tables.toTable[tables.toTableSize].intCh=unicode;
150: tables.toTable[tables.toTableSize].extCh=(XMLByte)c;
151: tables.toTableSize++;
1.1 paf 152: }
153: break;
154: }
155: }
156: };
157:
158: // sort by the Unicode code point
159: sort_ToTable();
160: }
161:
162: static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
163: return
164: static_cast<const Charset_TransRec *>(a)->intCh-
165: static_cast<const Charset_TransRec *>(b)->intCh;
166: }
167:
168: void Charset::sort_ToTable() {
1.10 paf 169: _qsort(tables.toTable, tables.toTableSize, sizeof(*tables.toTable),
1.1 paf 170: sort_cmp_Trans_rec_intCh);
171: //FILE *f=fopen("c:\\temp\\a", "wb");
1.10 paf 172: //fwrite(tables.toTable, tables.toTableSize, sizeof(*tables.toTable), f);
1.1 paf 173: //fclose(f);
174: }
175:
1.10 paf 176: static XMLByte xlatOneTo(const XMLCh toXlat,
1.35 ! paf 177: const Charset::Tables& tables,
! 178: XMLByte not_found) {
! 179: unsigned int lowOfs = 0;
! 180: unsigned int hiOfs = tables.toTableSize - 1;
! 181: XMLByte curByte = 0;
! 182: do {
! 183: // Calc the mid point of the low and high offset.
! 184: const unsigned int midOfs =((hiOfs - lowOfs) / 2)+lowOfs;
! 185:
! 186: // If our test char is greater than the mid point char, then
! 187: // we move up to the upper half. Else we move to the lower
! 188: // half. If its equal, then its our guy.
! 189: if(toXlat>tables.toTable[midOfs].intCh)
! 190: lowOfs = midOfs;
1.10 paf 191: else if(toXlat<tables.toTable[midOfs].intCh)
1.1 paf 192: hiOfs = midOfs;
193: else
1.10 paf 194: return tables.toTable[midOfs].extCh;
1.4 paf 195: } while(lowOfs+1<hiOfs);
1.35 ! paf 196:
! 197: return not_found;
1.1 paf 198: }
199:
1.35 ! paf 200: String::C Charset::transcode(const String::C src,
! 201: const Charset& source_charset,
! 202: const Charset& dest_charset) {
! 203: if(!src.length)
! 204: return String::C("", 0);
1.4 paf 205:
1.1 paf 206: switch((source_charset.isUTF8()?0x10:0x00)|(dest_charset.isUTF8()?0x01:0x00)) {
207: default: // 0x00
1.35 ! paf 208: return source_charset.transcodeToCharset(src, dest_charset);
1.1 paf 209: case 0x01:
1.35 ! paf 210: return source_charset.transcodeToUTF8(src);
1.1 paf 211: case 0x10:
1.35 ! paf 212: return dest_charset.transcodeFromUTF8(src);
1.1 paf 213: case 0x11:
1.35 ! paf 214: return src;
1.1 paf 215: }
216: }
217:
218: // ---------------------------------------------------------------------------
219: // Local static data
220: //
221: // gUTFBytes
222: // A list of counts of trailing bytes for each initial byte in the input.
223: //
224: // gUTFOffsets
225: // A list of values to offset each result char type, according to how
226: // many source bytes when into making it.
227: //
228: // gFirstByteMark
229: // A list of values to mask onto the first byte of an encoded sequence,
230: // indexed by the number of bytes used to create the sequence.
231: // ---------------------------------------------------------------------------
232: static const XMLByte gUTFBytes[0x100] = {
233: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
234: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
235: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
236: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
237: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
238: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
239: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
240: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
241: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
242: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
243: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
244: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
245: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
246: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
247: , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
248: , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
249: };
250:
251: static const uint gUTFOffsets[6] = {
252: 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
253: };
254:
255: static const XMLByte gFirstByteMark[7] = {
256: 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
257: };
258:
1.35 ! paf 259: static int transcodeToUTF8(const XMLByte* srcData, size_t& srcLen,
! 260: XMLByte *toFill, size_t& toFillLen,
! 261: const Charset::Tables& tables) {
1.11 paf 262: const XMLByte* srcPtr=srcData;
263: const XMLByte* srcEnd=srcData+srcLen;
264: XMLByte* outPtr=toFill;
265: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 266:
1.35 ! paf 267: while(srcPtr<srcEnd) {
! 268: uint curVal = tables.fromTable[*srcPtr];
1.1 paf 269: if(!curVal) {
1.35 ! paf 270: // use the replacement character
! 271: *outPtr++= '?';
! 272: srcPtr++;
! 273: continue;
! 274: }
1.1 paf 275:
1.35 ! paf 276: // Figure out how many bytes we need
! 277: unsigned int encodedBytes;
! 278: if(curVal<0x80)
! 279: encodedBytes = 1;
! 280: else if(curVal<0x800)
! 281: encodedBytes = 2;
! 282: else if(curVal<0x10000)
! 283: encodedBytes = 3;
! 284: else if(curVal<0x200000)
! 285: encodedBytes = 4;
! 286: else if(curVal<0x4000000)
! 287: encodedBytes = 5;
! 288: else if(curVal<= 0x7FFFFFFF)
! 289: encodedBytes = 6;
! 290: else {
! 291: // use the replacement character
! 292: *outPtr++= '?';
! 293: srcPtr++;
! 294: continue;
! 295: }
1.11 paf 296:
1.35 ! paf 297: // If we cannot fully get this char into the output buffer
! 298: if (outPtr + encodedBytes > outEnd)
! 299: break;
! 300:
! 301: // We can do it, so update the source index
! 302: srcPtr++;
! 303:
! 304: // And spit out the bytes. We spit them out in reverse order
! 305: // here, so bump up the output pointer and work down as we go.
! 306: outPtr+= encodedBytes;
! 307: switch(encodedBytes) {
! 308: case 6: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
! 309: curVal>>= 6;
! 310: case 5: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
! 311: curVal>>= 6;
! 312: case 4: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
! 313: curVal>>= 6;
! 314: case 3: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
! 315: curVal>>= 6;
! 316: case 2: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
! 317: curVal>>= 6;
! 318: case 1: *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]);
! 319: }
! 320:
! 321: // Add the encoded bytes back in again to indicate we've eaten them
! 322: outPtr+= encodedBytes;
! 323: }
! 324:
! 325: // Update the bytes eaten
! 326: srcLen = srcPtr - srcData;
! 327:
! 328: // Return the characters read
! 329: toFillLen = outPtr - toFill;
! 330:
1.29 paf 331: //return srcPtr==srcEnd?(int)toFillLen:-1;
332: /*
333: xmlCharEncodingInputFunc
334: Returns :
335: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
336: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
337: of ocetes consumed.
338: */
339: return 0;
1.1 paf 340: }
1.26 paf 341: /// @todo digital entites only when xml/html output [at output in html/xml mode, in html part of a letter]
1.35 ! paf 342: static int transcodeFromUTF8(const XMLByte* srcData, size_t& srcLen,
! 343: XMLByte* toFill, size_t& toFillLen,
! 344: const Charset::Tables& tables) {
1.11 paf 345: const XMLByte* srcPtr=srcData;
346: const XMLByte* srcEnd=srcData+srcLen;
347: XMLByte* outPtr=toFill;
348: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 349:
1.35 ! paf 350: // We now loop until we either run out of input data, or room to store
! 351: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
! 352: // Get the next leading byte out
! 353: const XMLByte firstByte =* srcPtr;
! 354:
! 355: // Special-case ASCII, which is a leading byte value of<= 127
! 356: if(firstByte<= 127) {
! 357: *outPtr++= firstByte;
! 358: srcPtr++;
! 359: continue;
! 360: }
! 361:
! 362: // See how many trailing src bytes this sequence is going to require
! 363: const unsigned int trailingBytes = gUTFBytes[firstByte];
! 364:
! 365: // If there are not enough source bytes to do this one, then we
! 366: // are done. Note that we done>= here because we are implicitly
! 367: // counting the 1 byte we get no matter what.
! 368: if(srcPtr+trailingBytes>= srcEnd)
! 369: break;
! 370:
! 371: // Looks ok, so lets build up the value
! 372: uint tmpVal=0;
! 373: switch(trailingBytes) {
! 374: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
! 375: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
! 376: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
! 377: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
! 378: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
! 379: case 0: tmpVal+=*srcPtr++;
! 380: break;
! 381:
! 382: default:
! 383: throw Exception(0,
! 384: 0,
! 385: "transcodeFromUTF8 error: wrong trailingBytes value(%d)", trailingBytes);
! 386: }
! 387: tmpVal-=gUTFOffsets[trailingBytes];
! 388:
! 389: // If it will fit into a single char, then put it in. Otherwise
! 390: // fail [*encode it as a surrogate pair. If its not valid, use the
! 391: // replacement char.*]
! 392: if(!(tmpVal & 0xFFFF0000)) {
1.25 paf 393: if(XMLByte xlat=xlatOneTo(tmpVal, tables, 0))
394: *outPtr++=xlat;
395: else
396: outPtr+=sprintf((char *)outPtr, "&#%d;", tmpVal); // &#decimal;
397: } else
1.23 paf 398: throw Exception(0,
1.35 ! paf 399: 0,
! 400: "transcodeFromUTF8 error: too big tmpVal(0x%08X)", tmpVal);
1.1 paf 401: }
1.35 ! paf 402:
! 403: // Update the bytes eaten
! 404: srcLen = srcPtr - srcData;
! 405:
! 406: // Return the characters read
! 407: toFillLen = outPtr - toFill;
1.11 paf 408:
1.29 paf 409: //return srcPtr==srcEnd?(int)toFillLen:-1;
410: /*
411: xmlCharEncodingOutputFunc
412: Returns :
413: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
414: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
415: of ocetes consumed.
416: */
417: return 0;
1.10 paf 418: }
419:
420: /// @todo not so memory-hungry with prescan
1.35 ! paf 421: const String::C Charset::transcodeToUTF8(const String::C src) const {
! 422: size_t src_length=src.length;
! 423: size_t dest_length=src.length*6/*so that surly enough, max utf8 seq len=6*/;
! 424: #ifndef NDEBUG
! 425: size_t saved_dest_length=dest_length;
! 426: #endif
! 427: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
1.11 paf 428:
429: if(::transcodeToUTF8(
1.35 ! paf 430: (XMLByte *)src.str, src_length,
! 431: dest_body, dest_length,
1.11 paf 432: tables)<0)
1.10 paf 433: throw(0, 0,
434: 0,
1.11 paf 435: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 436:
1.35 ! paf 437: assert(dest_length<=saved_dest_length); dest_body[dest_length]=0; // terminator
! 438: return String::C((char*)dest_body, dest_length);
1.10 paf 439: }
1.35 ! paf 440: const String::C Charset::transcodeFromUTF8(const String::C src) const {
! 441: size_t src_length=src.length;
! 442: size_t dest_length=src.length*6/*so that surly enough, "ÿ" has max ratio */;
! 443: #ifndef NDEBUG
! 444: size_t saved_dest_length=dest_length;
! 445: #endif
! 446: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
1.11 paf 447:
448: if(::transcodeFromUTF8(
1.35 ! paf 449: (XMLByte *)src.str, src_length,
! 450: dest_body, dest_length,
1.11 paf 451: tables)<0)
1.10 paf 452: throw(0, 0,
453: 0,
1.35 ! paf 454: "Charset::transcodeFromUTF8 buffer overflow");
1.10 paf 455:
1.35 ! paf 456: assert(dest_length<=saved_dest_length); dest_body[dest_length]=0; // terminator
! 457: return String::C((char*)dest_body, dest_length);
1.1 paf 458: }
459:
460: /// transcode using both charsets
1.35 ! paf 461: const String::C Charset::transcodeToCharset(const String::C src,
! 462: const Charset& dest_charset) const {
! 463: if(&dest_charset==this)
! 464: return src;
! 465: else {
! 466: size_t dest_length=src.length;
! 467: XMLByte* dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
! 468:
! 469: XMLByte* output=dest_body;
! 470: const XMLByte* input=(XMLByte *)src.str;
! 471: while(XMLCh c=*input++) {
! 472: XMLCh curVal = tables.fromTable[c];
! 473: *output++=curVal?
! 474: xlatOneTo(curVal, dest_charset.tables, '?') // OK
! 475: :'?'; // use the replacement character
1.6 paf 476: }
1.1 paf 477:
1.35 ! paf 478: dest_body[dest_length]=0; // terminator
! 479: return String::C((char*)dest_body, dest_length);
1.6 paf 480: }
1.1 paf 481: }
482:
483: #ifdef XML
1.10 paf 484:
1.35 ! paf 485: static const Charset::Tables* tables[MAX_CHARSETS];
! 486:
! 487: #define declareXml256ioFuncs(i) \
! 488: static int xml256CharEncodingInputFunc##i( \
! 489: unsigned char *out, int *outlen, \
! 490: const unsigned char *in, int *inlen) { \
! 491: return transcodeToUTF8( \
! 492: in, *(size_t*)inlen, \
! 493: out, *(size_t*)outlen, \
! 494: *tables[i]); \
! 495: } \
! 496: static int xml256CharEncodingOutputFunc##i( \
! 497: unsigned char *out, int *outlen, \
! 498: const unsigned char *in, int *inlen) { \
! 499: return transcodeFromUTF8( \
! 500: in, *(size_t*)inlen, \
! 501: out, *(size_t*)outlen, \
! 502: *tables[i]); \
! 503: }
! 504:
! 505: declareXml256ioFuncs(0) declareXml256ioFuncs(1)
! 506: declareXml256ioFuncs(2) declareXml256ioFuncs(3)
! 507: declareXml256ioFuncs(4) declareXml256ioFuncs(5)
! 508: declareXml256ioFuncs(6) declareXml256ioFuncs(7)
! 509: declareXml256ioFuncs(8) declareXml256ioFuncs(9)
! 510:
! 511: static xmlCharEncodingInputFunc inputFuncs[MAX_CHARSETS]={
! 512: xml256CharEncodingInputFunc0, xml256CharEncodingInputFunc1,
! 513: xml256CharEncodingInputFunc2, xml256CharEncodingInputFunc3,
! 514: xml256CharEncodingInputFunc4, xml256CharEncodingInputFunc5,
! 515: xml256CharEncodingInputFunc6, xml256CharEncodingInputFunc7,
! 516: xml256CharEncodingInputFunc8, xml256CharEncodingInputFunc9
! 517: };
! 518: static xmlCharEncodingOutputFunc outputFuncs[MAX_CHARSETS]={
! 519: xml256CharEncodingOutputFunc0, xml256CharEncodingOutputFunc1,
! 520: xml256CharEncodingOutputFunc2, xml256CharEncodingOutputFunc3,
! 521: xml256CharEncodingOutputFunc4, xml256CharEncodingOutputFunc5,
! 522: xml256CharEncodingOutputFunc6, xml256CharEncodingOutputFunc7,
! 523: xml256CharEncodingOutputFunc8, xml256CharEncodingOutputFunc9
! 524: };
! 525: static size_t handlers_count=0;
1.10 paf 526:
527: void Charset::addEncoding(char *name_cstr) {
1.35 ! paf 528: if(handlers_count==MAX_CHARSETS)
! 529: throw Exception(0,
! 530: 0,
! 531: "already allocated %d handlers, no space for new encoding '%s'",
! 532: MAX_CHARSETS, name_cstr);
! 533:
! 534: xmlCharEncodingHandler* handler=new(PointerFreeGC) xmlCharEncodingHandler;
! 535: {
! 536: handler->name=name_cstr;
! 537: handler->input=inputFuncs[handlers_count];
! 538: handler->output=outputFuncs[handlers_count];
! 539: ::tables[handlers_count]=&tables;
! 540: handlers_count++;
! 541: }
1.10 paf 542:
543: xmlRegisterCharEncodingHandler(handler);
1.35 ! paf 544:
1.10 paf 545: }
546:
1.35 ! paf 547: void Charset::initTranscoder(const StringBody NAME, const char* name_cstr) {
1.15 paf 548: ftranscoder=xmlFindCharEncodingHandler(name_cstr);
1.35 ! paf 549: transcoder(NAME); // check right way
1.15 paf 550: }
551:
1.35 ! paf 552: xmlCharEncodingHandler& Charset::transcoder(const StringBody NAME) {
1.15 paf 553: if(!ftranscoder)
1.23 paf 554: throw Exception("parser.runtime",
1.35 ! paf 555: new String(NAME, String::L_TAINTED),
1.10 paf 556: "unsupported encoding");
1.35 ! paf 557: return *ftranscoder;
1.10 paf 558: }
559:
1.35 ! paf 560: String::C Charset::transcode_cstr(xmlChar* s) {
1.13 paf 561: if(!s)
1.35 ! paf 562: return String::C("", 0);
1.8 paf 563:
1.35 ! paf 564: int inlen=strlen((const char*)s);
! 565: int outlen=inlen; // max
! 566: #ifndef NDEBUG
! 567: int saved_outlen=outlen;
! 568: #endif
! 569: char *out=new(PointerFreeGC) char[outlen+1];
1.8 paf 570:
1.30 paf 571: int error;
1.35 ! paf 572: if(xmlCharEncodingOutputFunc output=transcoder(FNAME).output) {
1.30 paf 573: error=output(
1.17 paf 574: (unsigned char*)out, &outlen,
1.35 ! paf 575: (const unsigned char*)s, &inlen);
1.30 paf 576: } else {
577: memcpy(out, s, outlen=inlen);
578: error=0;
579: }
580: if(error<0)
1.23 paf 581: throw Exception(0,
1.8 paf 582: 0,
1.30 paf 583: "transcode_cstr failed (%d)", error);
1.8 paf 584:
1.35 ! paf 585: assert(outlen<=saved_outlen); out[outlen]=0;
! 586: return String::C(out, outlen);
1.14 paf 587: }
1.35 ! paf 588: const String& Charset::transcode(xmlChar* s) {
! 589: String::C cstr=transcode_cstr(s);
! 590: return *new String(cstr.str, cstr.length, true);
! 591: }
! 592: String::C Charset::transcode_cstr(GdomeDOMString* s) {
! 593: return s?transcode_cstr(BAD_CAST s->str)
! 594: :String::C("", 0);
! 595: }
! 596: const String& Charset::transcode(GdomeDOMString* s) {
! 597: String::C cstr=transcode_cstr(s);
! 598: return *new String(cstr.str, cstr.length, true);
1.1 paf 599: }
600:
1.8 paf 601: /// @test less memory using -maybe- xmlParserInputBufferCreateMem
1.35 ! paf 602: xmlChar* Charset::transcode_buf2xchar(const char* buf, size_t buf_size) {
! 603: xmlChar* out;
1.30 paf 604: int outlen;
605: int error;
1.35 ! paf 606: #ifndef NDEBUG
! 607: int saved_outlen;
! 608: #endif
! 609: if(xmlCharEncodingInputFunc input=transcoder(FNAME).input) {
1.32 paf 610: outlen=buf_size*6/*max*/;
1.35 ! paf 611: #ifndef NDEBUG
! 612: saved_outlen=outlen;
! 613: #endif
! 614: out=(xmlChar*)xmlMalloc(outlen+1);
1.30 paf 615: error=input(
1.17 paf 616: out, &outlen,
1.35 ! paf 617: (const unsigned char*)buf, (int*)&buf_size);
1.30 paf 618: } else {
619: outlen=buf_size;
1.35 ! paf 620: #ifndef NDEBUG
! 621: saved_outlen=outlen;
! 622: #endif
! 623: out=(xmlChar*)xmlMalloc(outlen+1);
1.30 paf 624: memcpy(out, buf, outlen);
625: error=0;
626: }
1.17 paf 627:
1.30 paf 628: if(error<0)
1.23 paf 629: throw Exception(0,
1.8 paf 630: 0,
1.30 paf 631: "transcode_buf failed (%d)", error);
1.8 paf 632:
1.35 ! paf 633: assert(outlen<=saved_outlen); out[outlen]=0;
! 634: return out;
1.24 paf 635: }
1.35 ! paf 636: GdomeDOMString_auto_ptr Charset::transcode_buf2dom(const char* buf, size_t buf_size) {
! 637: return GdomeDOMString_auto_ptr(transcode_buf2xchar(buf, buf_size));
1.1 paf 638: }
1.12 paf 639: GdomeDOMString_auto_ptr Charset::transcode(const String& s) {
1.35 ! paf 640: const char* cstr=s.cstr(String::L_UNSPECIFIED);
1.1 paf 641:
1.24 paf 642: return transcode_buf2dom(cstr, strlen(cstr));
1.1 paf 643: }
1.35 ! paf 644: GdomeDOMString_auto_ptr Charset::transcode(const StringBody s) {
! 645: const char* cstr=s.cstr();
! 646:
! 647: return transcode_buf2dom(cstr, s.length());
! 648: }
1.34 paf 649:
1.35 ! paf 650: StringBody Charset::transcode(const StringBody src,
1.34 paf 651: const Charset& source_transcoder,
1.35 ! paf 652: const Charset& dest_transcoder) {
1.34 paf 653:
1.35 ! paf 654: const char *src_ptr=src.cstr();
1.34 paf 655: size_t src_size=strlen(src_ptr);
656:
1.35 ! paf 657: String::C dest=Charset::transcode(String::C(src_ptr, src_size),
! 658: source_transcoder,
! 659: dest_transcoder);
1.34 paf 660:
1.35 ! paf 661: return StringBody(dest.str, dest.length);
! 662: }
! 663:
! 664: String& Charset::transcode(const String& src,
! 665: const Charset& source_transcoder,
! 666: const Charset& dest_transcoder) {
! 667: if(!src.length())
! 668: return *new String("", 0, false);
1.34 paf 669:
1.35 ! paf 670: return *new String(transcode((StringBody)src, source_transcoder, dest_transcoder), String::L_CLEAN);
1.34 paf 671: }
672:
1.35 ! paf 673: void Charset::transcode(ArrayString& src,
1.34 paf 674: const Charset& source_transcoder,
1.35 ! paf 675: const Charset& dest_transcoder) {
! 676: for(size_t i=0; i<src.count(); i++)
! 677: src.put(i, &transcode(*src[i], source_transcoder, dest_transcoder));
1.34 paf 678: }
679:
680: #ifndef DOXYGEN
681: struct Transcode_pair_info {
682: const Charset* source_transcoder;
683: const Charset* dest_transcoder;
684: };
685: #endif
1.35 ! paf 686: static void transcode_pair(const StringBody akey,
! 687: StringBody& avalue,
! 688: Transcode_pair_info* info) {
! 689: avalue=Charset::transcode(avalue,
! 690: *info->source_transcoder,
! 691: *info->dest_transcoder);
1.34 paf 692: }
1.35 ! paf 693: void Charset::transcode(HashStringString& src,
1.34 paf 694: const Charset& source_transcoder,
1.35 ! paf 695: const Charset& dest_transcoder) {
! 696: Transcode_pair_info info={&source_transcoder, &dest_transcoder};
! 697: src.for_each_ref(transcode_pair, &info);
1.34 paf 698: }
1.35 ! paf 699: #endif
E-mail: