Annotation of parser3/src/main/pa_charset.C, revision 1.37
1.1 paf 1: /** @file
2: Parser: Charset connection implementation.
3:
1.35 paf 4: Copyright(c) 2001-2003 ArtLebedev Group (http://www.artlebedev.com)
1.4 paf 5: Author: Alexander Petrosyan<paf@design.ru>(http://paf.design.ru)
1.27 paf 6: */
1.1 paf 7:
1.37 ! paf 8: static const char* IDENT_CHARSET_C="$Date: 2003/09/24 14:32:06 $";
1.1 paf 9:
10: #include "pa_charset.h"
1.35 paf 11: #include "pa_charsets.h"
1.1 paf 12:
13: #ifdef XML
1.8 paf 14: #include "libxml/encoding.h"
1.1 paf 15: #endif
16:
17: // helpers
18:
19: inline void prepare_case_tables(unsigned char *tables) {
20: unsigned char *lcc_table=tables+lcc_offset;
21: unsigned char *fcc_table=tables+fcc_offset;
22: for(int i=0; i<0x100; i++)
23: lcc_table[i]=fcc_table[i]=i;
24: }
25: inline void cstr2ctypes(unsigned char *tables, const unsigned char *cstr,
26: unsigned char bit) {
27: unsigned char *ctypes_table=tables+ctypes_offset;
28: ctypes_table[0]=bit;
29: for(; *cstr; cstr++) {
30: unsigned char c=*cstr;
31: ctypes_table[c]|=bit;
32: }
33: }
/// Converts a definition-file cell to a character code.
/// - null or empty cell: 0
/// - exactly one character: the code of that character itself
/// - anything longer: the number parsed by strtol with base auto-detection
inline unsigned int to_wchar_code(const char* cstr) {
	if(cstr==0 || cstr[0]=='\0')
		return 0;

	const bool single_char=(cstr[1]=='\0');
	if(single_char)
		return (unsigned int)(unsigned char)cstr[0];

	// the end position is of no interest here
	return (unsigned int)strtol(cstr, 0, 0);
}
/// A cell counts as "true" when it is present and not empty.
inline bool to_bool(const char* cstr) {
	if(!cstr)
		return false;
	return cstr[0]!='\0';
}
46: static void element2ctypes(unsigned char c, bool belongs,
47: unsigned char *tables, unsigned char bit, int group_offset=-1) {
48: if(!belongs)
49: return;
50:
51: unsigned char *ctypes_table=tables+ctypes_offset;
52:
53: ctypes_table[c]|=bit;
54: if(group_offset>=0)
1.4 paf 55: tables[cbits_offset+group_offset+c/8] |= 1<<(c%8);
1.1 paf 56: }
57: static void element2case(unsigned char from, unsigned char to,
58: unsigned char *tables) {
59: if(!to)
60: return;
61:
62: unsigned char *lcc_table=tables+lcc_offset;
63: unsigned char *fcc_table=tables+fcc_offset;
64: lcc_table[from]=to;
65: fcc_table[from]=to; fcc_table[to]=from;
66: }
67:
68: // methods
69:
70: extern "C" unsigned char pcre_default_tables[]; // pcre/chartables.c
1.37 ! paf 71: Charset::Charset(Request_charsets* charsets, const String::Body ANAME, const String* afile_spec):
1.35 paf 72: FNAME(ANAME),
73: FNAME_CSTR(ANAME.cstrm()) {
1.7 paf 74:
1.35 paf 75: if(afile_spec) {
1.1 paf 76: fisUTF8=false;
1.35 paf 77: load_definition(*charsets, *afile_spec);
1.1 paf 78: #ifdef XML
1.35 paf 79: addEncoding(FNAME_CSTR);
1.1 paf 80: #endif
81: } else {
82: fisUTF8=true;
1.4 paf 83: // grab default onces [for UTF-8 so to be able to make a-z =>A-Z
1.1 paf 84: memcpy(pcre_tables, pcre_default_tables, sizeof(pcre_tables));
85: }
86:
87: #ifdef XML
1.35 paf 88: initTranscoder(FNAME, FNAME_CSTR);
1.1 paf 89: #endif
90: }
91:
1.35 paf 92: void Charset::load_definition(Request_charsets& charsets, const String& afile_spec) {
1.1 paf 93: // pcre_tables
94: // lowcase, flipcase, bits digit+word+whitespace, masks
95:
96: // must not move this inside of prepare_case_tables
97: // don't know the size there
98: memset(pcre_tables, 0, sizeof(pcre_tables));
99: prepare_case_tables(pcre_tables);
1.4 paf 100: cstr2ctypes(pcre_tables,(const unsigned char *)"*+?{^.$|()[", ctype_meta);
1.1 paf 101:
102: // charset
1.35 paf 103: memset(&tables, 0, sizeof(tables));
1.1 paf 104: // strangly vital
1.10 paf 105: tables.toTable[tables.toTableSize].intCh=0;
106: tables.toTable[tables.toTableSize].extCh=(XMLByte)0;
107: tables.toTableSize++;
1.1 paf 108:
109: // loading text
1.35 paf 110: char *data=file_read_text(charsets, afile_spec);
1.1 paf 111:
112: // ignore header
113: getrow(&data);
114:
115: // parse cells
116: char *row;
117: while(row=getrow(&data)) {
118: // remove empty&comment lines
119: if(!*row || *row=='#')
120: continue;
121:
122: // char white-space digit hex-digit letter word lowercase unicode1 unicode2
123: unsigned int c=0;
124: char *cell;
125: for(int column=0; cell=lsplit(&row, '\t'); column++) {
126: switch(column) {
127: case 0: c=to_wchar_code(cell); break;
128: // pcre_tables
129: case 1: element2ctypes(c, to_bool(cell), pcre_tables, ctype_space, cbit_space); break;
130: case 2: element2ctypes(c, to_bool(cell), pcre_tables, ctype_digit, cbit_digit); break;
131: case 3: element2ctypes(c, to_bool(cell), pcre_tables, ctype_xdigit); break;
132: case 4: element2ctypes(c, to_bool(cell), pcre_tables, ctype_letter); break;
133: case 5: element2ctypes(c, to_bool(cell), pcre_tables, ctype_word, cbit_word); break;
134: case 6: element2case(c, to_wchar_code(cell), pcre_tables); break;
135: case 7:
136: case 8:
137: // charset
1.10 paf 138: if(tables.toTableSize>MAX_CHARSET_UNI_CODES)
1.23 paf 139: throw Exception("parser.runtime",
1.35 paf 140: &afile_spec,
1.1 paf 141: "charset must contain not more then %d unicode values", MAX_CHARSET_UNI_CODES);
142:
143: XMLCh unicode=(XMLCh)to_wchar_code(cell);
144: if(!unicode && column==7/*unicode1 column*/)
145: unicode=(XMLCh)c;
146: if(unicode) {
1.10 paf 147: if(!tables.fromTable[c])
148: tables.fromTable[c]=unicode;
149: tables.toTable[tables.toTableSize].intCh=unicode;
150: tables.toTable[tables.toTableSize].extCh=(XMLByte)c;
151: tables.toTableSize++;
1.1 paf 152: }
153: break;
154: }
155: }
156: };
157:
158: // sort by the Unicode code point
159: sort_ToTable();
160: }
161:
162: static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
163: return
164: static_cast<const Charset_TransRec *>(a)->intCh-
165: static_cast<const Charset_TransRec *>(b)->intCh;
166: }
167:
168: void Charset::sort_ToTable() {
1.10 paf 169: _qsort(tables.toTable, tables.toTableSize, sizeof(*tables.toTable),
1.1 paf 170: sort_cmp_Trans_rec_intCh);
171: //FILE *f=fopen("c:\\temp\\a", "wb");
1.10 paf 172: //fwrite(tables.toTable, tables.toTableSize, sizeof(*tables.toTable), f);
1.1 paf 173: //fclose(f);
174: }
175:
1.10 paf 176: static XMLByte xlatOneTo(const XMLCh toXlat,
1.35 paf 177: const Charset::Tables& tables,
178: XMLByte not_found) {
179: unsigned int lowOfs = 0;
180: unsigned int hiOfs = tables.toTableSize - 1;
181: XMLByte curByte = 0;
182: do {
183: // Calc the mid point of the low and high offset.
184: const unsigned int midOfs =((hiOfs - lowOfs) / 2)+lowOfs;
185:
186: // If our test char is greater than the mid point char, then
187: // we move up to the upper half. Else we move to the lower
188: // half. If its equal, then its our guy.
189: if(toXlat>tables.toTable[midOfs].intCh)
190: lowOfs = midOfs;
1.10 paf 191: else if(toXlat<tables.toTable[midOfs].intCh)
1.1 paf 192: hiOfs = midOfs;
193: else
1.10 paf 194: return tables.toTable[midOfs].extCh;
1.4 paf 195: } while(lowOfs+1<hiOfs);
1.35 paf 196:
197: return not_found;
1.1 paf 198: }
199:
1.35 paf 200: String::C Charset::transcode(const String::C src,
201: const Charset& source_charset,
202: const Charset& dest_charset) {
203: if(!src.length)
204: return String::C("", 0);
1.4 paf 205:
1.1 paf 206: switch((source_charset.isUTF8()?0x10:0x00)|(dest_charset.isUTF8()?0x01:0x00)) {
207: default: // 0x00
1.35 paf 208: return source_charset.transcodeToCharset(src, dest_charset);
1.1 paf 209: case 0x01:
1.35 paf 210: return source_charset.transcodeToUTF8(src);
1.1 paf 211: case 0x10:
1.35 paf 212: return dest_charset.transcodeFromUTF8(src);
1.1 paf 213: case 0x11:
1.35 paf 214: return src;
1.1 paf 215: }
216: }
217:
218: // ---------------------------------------------------------------------------
219: // Local static data
220: //
221: // gUTFBytes
222: // A list of counts of trailing bytes for each initial byte in the input.
223: //
224: // gUTFOffsets
225: // A list of values to offset each result char type, according to how
226: // many source bytes went into making it.
227: //
228: // gFirstByteMark
229: // A list of values to mask onto the first byte of an encoded sequence,
230: // indexed by the number of bytes used to create the sequence.
231: // ---------------------------------------------------------------------------
232: static const XMLByte gUTFBytes[0x100] = {
233: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
234: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
235: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
236: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
237: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
238: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
239: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
240: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
241: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
242: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
243: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
244: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
245: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
246: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
247: , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
248: , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
249: };
250:
251: static const uint gUTFOffsets[6] = {
252: 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
253: };
254:
255: static const XMLByte gFirstByteMark[7] = {
256: 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
257: };
258:
1.35 paf 259: static int transcodeToUTF8(const XMLByte* srcData, size_t& srcLen,
260: XMLByte *toFill, size_t& toFillLen,
261: const Charset::Tables& tables) {
1.11 paf 262: const XMLByte* srcPtr=srcData;
263: const XMLByte* srcEnd=srcData+srcLen;
264: XMLByte* outPtr=toFill;
265: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 266:
1.35 paf 267: while(srcPtr<srcEnd) {
268: uint curVal = tables.fromTable[*srcPtr];
1.1 paf 269: if(!curVal) {
1.35 paf 270: // use the replacement character
271: *outPtr++= '?';
272: srcPtr++;
273: continue;
274: }
1.1 paf 275:
1.35 paf 276: // Figure out how many bytes we need
277: unsigned int encodedBytes;
278: if(curVal<0x80)
279: encodedBytes = 1;
280: else if(curVal<0x800)
281: encodedBytes = 2;
282: else if(curVal<0x10000)
283: encodedBytes = 3;
284: else if(curVal<0x200000)
285: encodedBytes = 4;
286: else if(curVal<0x4000000)
287: encodedBytes = 5;
288: else if(curVal<= 0x7FFFFFFF)
289: encodedBytes = 6;
290: else {
291: // use the replacement character
292: *outPtr++= '?';
293: srcPtr++;
294: continue;
295: }
1.11 paf 296:
1.35 paf 297: // If we cannot fully get this char into the output buffer
298: if (outPtr + encodedBytes > outEnd)
299: break;
300:
301: // We can do it, so update the source index
302: srcPtr++;
303:
304: // And spit out the bytes. We spit them out in reverse order
305: // here, so bump up the output pointer and work down as we go.
306: outPtr+= encodedBytes;
307: switch(encodedBytes) {
308: case 6: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
309: curVal>>= 6;
310: case 5: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
311: curVal>>= 6;
312: case 4: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
313: curVal>>= 6;
314: case 3: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
315: curVal>>= 6;
316: case 2: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
317: curVal>>= 6;
318: case 1: *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]);
319: }
320:
321: // Add the encoded bytes back in again to indicate we've eaten them
322: outPtr+= encodedBytes;
323: }
324:
325: // Update the bytes eaten
326: srcLen = srcPtr - srcData;
327:
328: // Return the characters read
329: toFillLen = outPtr - toFill;
330:
1.29 paf 331: //return srcPtr==srcEnd?(int)toFillLen:-1;
332: /*
333: xmlCharEncodingInputFunc
334: Returns :
335: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
336: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
337: of ocetes consumed.
338: */
339: return 0;
1.1 paf 340: }
1.26 paf 341: /// @todo digital entites only when xml/html output [at output in html/xml mode, in html part of a letter]
1.35 paf 342: static int transcodeFromUTF8(const XMLByte* srcData, size_t& srcLen,
343: XMLByte* toFill, size_t& toFillLen,
344: const Charset::Tables& tables) {
1.11 paf 345: const XMLByte* srcPtr=srcData;
346: const XMLByte* srcEnd=srcData+srcLen;
347: XMLByte* outPtr=toFill;
348: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 349:
1.35 paf 350: // We now loop until we either run out of input data, or room to store
351: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
352: // Get the next leading byte out
353: const XMLByte firstByte =* srcPtr;
354:
355: // Special-case ASCII, which is a leading byte value of<= 127
356: if(firstByte<= 127) {
357: *outPtr++= firstByte;
358: srcPtr++;
359: continue;
360: }
361:
362: // See how many trailing src bytes this sequence is going to require
363: const unsigned int trailingBytes = gUTFBytes[firstByte];
364:
365: // If there are not enough source bytes to do this one, then we
366: // are done. Note that we done>= here because we are implicitly
367: // counting the 1 byte we get no matter what.
368: if(srcPtr+trailingBytes>= srcEnd)
369: break;
370:
371: // Looks ok, so lets build up the value
372: uint tmpVal=0;
373: switch(trailingBytes) {
374: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
375: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
376: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
377: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
378: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
379: case 0: tmpVal+=*srcPtr++;
380: break;
381:
382: default:
383: throw Exception(0,
384: 0,
385: "transcodeFromUTF8 error: wrong trailingBytes value(%d)", trailingBytes);
386: }
387: tmpVal-=gUTFOffsets[trailingBytes];
388:
389: // If it will fit into a single char, then put it in. Otherwise
390: // fail [*encode it as a surrogate pair. If its not valid, use the
391: // replacement char.*]
392: if(!(tmpVal & 0xFFFF0000)) {
1.25 paf 393: if(XMLByte xlat=xlatOneTo(tmpVal, tables, 0))
394: *outPtr++=xlat;
395: else
396: outPtr+=sprintf((char *)outPtr, "&#%d;", tmpVal); // &#decimal;
397: } else
1.23 paf 398: throw Exception(0,
1.35 paf 399: 0,
400: "transcodeFromUTF8 error: too big tmpVal(0x%08X)", tmpVal);
1.1 paf 401: }
1.35 paf 402:
403: // Update the bytes eaten
404: srcLen = srcPtr - srcData;
405:
406: // Return the characters read
407: toFillLen = outPtr - toFill;
1.11 paf 408:
1.29 paf 409: //return srcPtr==srcEnd?(int)toFillLen:-1;
410: /*
411: xmlCharEncodingOutputFunc
412: Returns :
413: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
414: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
415: of ocetes consumed.
416: */
417: return 0;
1.10 paf 418: }
419:
420: /// @todo not so memory-hungry with prescan
1.35 paf 421: const String::C Charset::transcodeToUTF8(const String::C src) const {
422: size_t src_length=src.length;
423: size_t dest_length=src.length*6/*so that surly enough, max utf8 seq len=6*/;
424: #ifndef NDEBUG
425: size_t saved_dest_length=dest_length;
426: #endif
427: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
1.11 paf 428:
429: if(::transcodeToUTF8(
1.35 paf 430: (XMLByte *)src.str, src_length,
431: dest_body, dest_length,
1.11 paf 432: tables)<0)
1.10 paf 433: throw(0, 0,
434: 0,
1.11 paf 435: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 436:
1.35 paf 437: assert(dest_length<=saved_dest_length); dest_body[dest_length]=0; // terminator
438: return String::C((char*)dest_body, dest_length);
1.10 paf 439: }
1.35 paf 440: const String::C Charset::transcodeFromUTF8(const String::C src) const {
441: size_t src_length=src.length;
442: size_t dest_length=src.length*6/*so that surly enough, "ÿ" has max ratio */;
443: #ifndef NDEBUG
444: size_t saved_dest_length=dest_length;
445: #endif
446: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
1.11 paf 447:
448: if(::transcodeFromUTF8(
1.35 paf 449: (XMLByte *)src.str, src_length,
450: dest_body, dest_length,
1.11 paf 451: tables)<0)
1.10 paf 452: throw(0, 0,
453: 0,
1.35 paf 454: "Charset::transcodeFromUTF8 buffer overflow");
1.10 paf 455:
1.35 paf 456: assert(dest_length<=saved_dest_length); dest_body[dest_length]=0; // terminator
457: return String::C((char*)dest_body, dest_length);
1.1 paf 458: }
459:
460: /// transcode using both charsets
1.35 paf 461: const String::C Charset::transcodeToCharset(const String::C src,
462: const Charset& dest_charset) const {
463: if(&dest_charset==this)
464: return src;
465: else {
466: size_t dest_length=src.length;
467: XMLByte* dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
468:
469: XMLByte* output=dest_body;
470: const XMLByte* input=(XMLByte *)src.str;
471: while(XMLCh c=*input++) {
472: XMLCh curVal = tables.fromTable[c];
473: *output++=curVal?
474: xlatOneTo(curVal, dest_charset.tables, '?') // OK
475: :'?'; // use the replacement character
1.6 paf 476: }
1.1 paf 477:
1.35 paf 478: dest_body[dest_length]=0; // terminator
479: return String::C((char*)dest_body, dest_length);
1.6 paf 480: }
1.1 paf 481: }
482:
483: #ifdef XML
1.10 paf 484:
1.35 paf 485: static const Charset::Tables* tables[MAX_CHARSETS];
486:
487: #define declareXml256ioFuncs(i) \
488: static int xml256CharEncodingInputFunc##i( \
489: unsigned char *out, int *outlen, \
490: const unsigned char *in, int *inlen) { \
491: return transcodeToUTF8( \
492: in, *(size_t*)inlen, \
493: out, *(size_t*)outlen, \
494: *tables[i]); \
495: } \
496: static int xml256CharEncodingOutputFunc##i( \
497: unsigned char *out, int *outlen, \
498: const unsigned char *in, int *inlen) { \
499: return transcodeFromUTF8( \
500: in, *(size_t*)inlen, \
501: out, *(size_t*)outlen, \
502: *tables[i]); \
503: }
504:
505: declareXml256ioFuncs(0) declareXml256ioFuncs(1)
506: declareXml256ioFuncs(2) declareXml256ioFuncs(3)
507: declareXml256ioFuncs(4) declareXml256ioFuncs(5)
508: declareXml256ioFuncs(6) declareXml256ioFuncs(7)
509: declareXml256ioFuncs(8) declareXml256ioFuncs(9)
510:
511: static xmlCharEncodingInputFunc inputFuncs[MAX_CHARSETS]={
512: xml256CharEncodingInputFunc0, xml256CharEncodingInputFunc1,
513: xml256CharEncodingInputFunc2, xml256CharEncodingInputFunc3,
514: xml256CharEncodingInputFunc4, xml256CharEncodingInputFunc5,
515: xml256CharEncodingInputFunc6, xml256CharEncodingInputFunc7,
516: xml256CharEncodingInputFunc8, xml256CharEncodingInputFunc9
517: };
518: static xmlCharEncodingOutputFunc outputFuncs[MAX_CHARSETS]={
519: xml256CharEncodingOutputFunc0, xml256CharEncodingOutputFunc1,
520: xml256CharEncodingOutputFunc2, xml256CharEncodingOutputFunc3,
521: xml256CharEncodingOutputFunc4, xml256CharEncodingOutputFunc5,
522: xml256CharEncodingOutputFunc6, xml256CharEncodingOutputFunc7,
523: xml256CharEncodingOutputFunc8, xml256CharEncodingOutputFunc9
524: };
525: static size_t handlers_count=0;
1.10 paf 526:
527: void Charset::addEncoding(char *name_cstr) {
1.35 paf 528: if(handlers_count==MAX_CHARSETS)
529: throw Exception(0,
530: 0,
531: "already allocated %d handlers, no space for new encoding '%s'",
532: MAX_CHARSETS, name_cstr);
533:
534: xmlCharEncodingHandler* handler=new(PointerFreeGC) xmlCharEncodingHandler;
535: {
536: handler->name=name_cstr;
537: handler->input=inputFuncs[handlers_count];
538: handler->output=outputFuncs[handlers_count];
539: ::tables[handlers_count]=&tables;
540: handlers_count++;
541: }
1.10 paf 542:
543: xmlRegisterCharEncodingHandler(handler);
1.35 paf 544:
1.10 paf 545: }
546:
1.37 ! paf 547: void Charset::initTranscoder(const String::Body NAME, const char* name_cstr) {
1.15 paf 548: ftranscoder=xmlFindCharEncodingHandler(name_cstr);
1.35 paf 549: transcoder(NAME); // check right way
1.15 paf 550: }
551:
1.37 ! paf 552: xmlCharEncodingHandler& Charset::transcoder(const String::Body NAME) {
1.15 paf 553: if(!ftranscoder)
1.23 paf 554: throw Exception("parser.runtime",
1.35 paf 555: new String(NAME, String::L_TAINTED),
1.10 paf 556: "unsupported encoding");
1.35 paf 557: return *ftranscoder;
1.10 paf 558: }
559:
1.35 paf 560: String::C Charset::transcode_cstr(xmlChar* s) {
1.13 paf 561: if(!s)
1.35 paf 562: return String::C("", 0);
1.8 paf 563:
1.35 paf 564: int inlen=strlen((const char*)s);
565: int outlen=inlen; // max
566: #ifndef NDEBUG
567: int saved_outlen=outlen;
568: #endif
569: char *out=new(PointerFreeGC) char[outlen+1];
1.8 paf 570:
1.30 paf 571: int error;
1.35 paf 572: if(xmlCharEncodingOutputFunc output=transcoder(FNAME).output) {
1.30 paf 573: error=output(
1.17 paf 574: (unsigned char*)out, &outlen,
1.35 paf 575: (const unsigned char*)s, &inlen);
1.30 paf 576: } else {
577: memcpy(out, s, outlen=inlen);
578: error=0;
579: }
580: if(error<0)
1.23 paf 581: throw Exception(0,
1.8 paf 582: 0,
1.30 paf 583: "transcode_cstr failed (%d)", error);
1.8 paf 584:
1.35 paf 585: assert(outlen<=saved_outlen); out[outlen]=0;
586: return String::C(out, outlen);
1.14 paf 587: }
1.35 paf 588: const String& Charset::transcode(xmlChar* s) {
589: String::C cstr=transcode_cstr(s);
590: return *new String(cstr.str, cstr.length, true);
591: }
592: String::C Charset::transcode_cstr(GdomeDOMString* s) {
593: return s?transcode_cstr(BAD_CAST s->str)
594: :String::C("", 0);
595: }
596: const String& Charset::transcode(GdomeDOMString* s) {
597: String::C cstr=transcode_cstr(s);
598: return *new String(cstr.str, cstr.length, true);
1.1 paf 599: }
600:
1.8 paf 601: /// @test less memory using -maybe- xmlParserInputBufferCreateMem
1.35 paf 602: xmlChar* Charset::transcode_buf2xchar(const char* buf, size_t buf_size) {
603: xmlChar* out;
1.30 paf 604: int outlen;
605: int error;
1.35 paf 606: #ifndef NDEBUG
607: int saved_outlen;
608: #endif
609: if(xmlCharEncodingInputFunc input=transcoder(FNAME).input) {
1.32 paf 610: outlen=buf_size*6/*max*/;
1.35 paf 611: #ifndef NDEBUG
612: saved_outlen=outlen;
613: #endif
614: out=(xmlChar*)xmlMalloc(outlen+1);
1.30 paf 615: error=input(
1.17 paf 616: out, &outlen,
1.35 paf 617: (const unsigned char*)buf, (int*)&buf_size);
1.30 paf 618: } else {
619: outlen=buf_size;
1.35 paf 620: #ifndef NDEBUG
621: saved_outlen=outlen;
622: #endif
623: out=(xmlChar*)xmlMalloc(outlen+1);
1.30 paf 624: memcpy(out, buf, outlen);
625: error=0;
626: }
1.17 paf 627:
1.30 paf 628: if(error<0)
1.23 paf 629: throw Exception(0,
1.8 paf 630: 0,
1.30 paf 631: "transcode_buf failed (%d)", error);
1.8 paf 632:
1.35 paf 633: assert(outlen<=saved_outlen); out[outlen]=0;
634: return out;
1.24 paf 635: }
1.35 paf 636: GdomeDOMString_auto_ptr Charset::transcode_buf2dom(const char* buf, size_t buf_size) {
637: return GdomeDOMString_auto_ptr(transcode_buf2xchar(buf, buf_size));
1.1 paf 638: }
1.12 paf 639: GdomeDOMString_auto_ptr Charset::transcode(const String& s) {
1.35 paf 640: const char* cstr=s.cstr(String::L_UNSPECIFIED);
1.1 paf 641:
1.24 paf 642: return transcode_buf2dom(cstr, strlen(cstr));
1.1 paf 643: }
1.37 ! paf 644: GdomeDOMString_auto_ptr Charset::transcode(const String::Body s) {
1.35 paf 645: const char* cstr=s.cstr();
646:
647: return transcode_buf2dom(cstr, s.length());
648: }
1.36 paf 649: #endif
1.34 paf 650:
1.37 ! paf 651: String::Body Charset::transcode(const String::Body src,
1.34 paf 652: const Charset& source_transcoder,
1.35 paf 653: const Charset& dest_transcoder) {
1.34 paf 654:
1.35 paf 655: const char *src_ptr=src.cstr();
1.34 paf 656: size_t src_size=strlen(src_ptr);
657:
1.35 paf 658: String::C dest=Charset::transcode(String::C(src_ptr, src_size),
659: source_transcoder,
660: dest_transcoder);
1.34 paf 661:
1.37 ! paf 662: return String::Body(dest.str, dest.length);
1.35 paf 663: }
664:
665: String& Charset::transcode(const String& src,
666: const Charset& source_transcoder,
667: const Charset& dest_transcoder) {
668: if(!src.length())
669: return *new String("", 0, false);
1.34 paf 670:
1.37 ! paf 671: return *new String(transcode((String::Body)src, source_transcoder, dest_transcoder), String::L_CLEAN);
1.34 paf 672: }
673:
1.35 paf 674: void Charset::transcode(ArrayString& src,
1.34 paf 675: const Charset& source_transcoder,
1.35 paf 676: const Charset& dest_transcoder) {
677: for(size_t i=0; i<src.count(); i++)
678: src.put(i, &transcode(*src[i], source_transcoder, dest_transcoder));
1.34 paf 679: }
680:
681: #ifndef DOXYGEN
682: struct Transcode_pair_info {
683: const Charset* source_transcoder;
684: const Charset* dest_transcoder;
685: };
686: #endif
1.37 ! paf 687: static void transcode_pair(const String::Body akey,
! 688: String::Body& avalue,
1.35 paf 689: Transcode_pair_info* info) {
690: avalue=Charset::transcode(avalue,
691: *info->source_transcoder,
692: *info->dest_transcoder);
1.34 paf 693: }
1.35 paf 694: void Charset::transcode(HashStringString& src,
1.34 paf 695: const Charset& source_transcoder,
1.35 paf 696: const Charset& dest_transcoder) {
697: Transcode_pair_info info={&source_transcoder, &dest_transcoder};
698: src.for_each_ref(transcode_pair, &info);
1.34 paf 699: }
E-mail: