Annotation of parser3/src/main/pa_charset.C, revision 1.34
1.1 paf 1: /** @file
2: Parser: Charset connection implementation.
3:
1.33 paf 4: Copyright(c) 2001, 2003 ArtLebedev Group (http://www.artlebedev.com)
1.4 paf 5: Author: Alexander Petrosyan<paf@design.ru>(http://paf.design.ru)
1.27 paf 6: */
1.1 paf 7:
1.34 ! paf 8: static const char* IDENT_CHARSET_C="$Date: 2003/01/21 15:51:13 $";
1.1 paf 9:
10: #include "pa_charset.h"
1.34 ! paf 11: #include "pa_array.h"
! 12: #include "pa_hash.h"
1.1 paf 13:
14: #ifdef XML
1.8 paf 15: #include "libxml/encoding.h"
1.1 paf 16: #endif
17:
18: // globals
19:
20:
21: // consts
22:
23: #define MAX_CHARSET_UNI_CODES 500
24:
25: // helpers
26:
27: inline void prepare_case_tables(unsigned char *tables) {
28: unsigned char *lcc_table=tables+lcc_offset;
29: unsigned char *fcc_table=tables+fcc_offset;
30: for(int i=0; i<0x100; i++)
31: lcc_table[i]=fcc_table[i]=i;
32: }
33: inline void cstr2ctypes(unsigned char *tables, const unsigned char *cstr,
34: unsigned char bit) {
35: unsigned char *ctypes_table=tables+ctypes_offset;
36: ctypes_table[0]=bit;
37: for(; *cstr; cstr++) {
38: unsigned char c=*cstr;
39: ctypes_table[c]|=bit;
40: }
41: }
/// Converts a table cell to a character code.
/// - null or empty cell: 0
/// - exactly one character: the code of that character (so "5" yields the code of '5', not 5)
/// - anything longer: parsed by strtol with base auto-detection (decimal, 0x.., 0..)
inline unsigned int to_wchar_code(const char *cstr) {
	if(cstr && *cstr) {
		// a lone character stands for itself
		if(!cstr[1])
			return(unsigned int)(unsigned char)*cstr;

		char *parse_end;
		return(unsigned int)strtol(cstr, &parse_end, 0);
	}
	return 0;
}
/// A cell counts as true when it is present and non-empty.
inline bool to_bool(const char *cstr) {
	if(!cstr)
		return false;
	return cstr[0]!=0;
}
54: static void element2ctypes(unsigned char c, bool belongs,
55: unsigned char *tables, unsigned char bit, int group_offset=-1) {
56: if(!belongs)
57: return;
58:
59: unsigned char *ctypes_table=tables+ctypes_offset;
60:
61: ctypes_table[c]|=bit;
62: if(group_offset>=0)
1.4 paf 63: tables[cbits_offset+group_offset+c/8] |= 1<<(c%8);
1.1 paf 64: }
65: static void element2case(unsigned char from, unsigned char to,
66: unsigned char *tables) {
67: if(!to)
68: return;
69:
70: unsigned char *lcc_table=tables+lcc_offset;
71: unsigned char *fcc_table=tables+fcc_offset;
72: lcc_table[from]=to;
73: fcc_table[from]=to; fcc_table[to]=from;
74: }
75:
76: // methods
77:
78: extern "C" unsigned char pcre_default_tables[]; // pcre/chartables.c
1.7 paf 79: Charset::Charset(Pool& apool, const String& aname, const String *request_file_spec): Pooled(apool),
80: fname(aname) {
1.1 paf 81:
1.10 paf 82: char *name_cstr=fname.cstr();
83: for(char *c=name_cstr; *c; c++)
84: *c = toupper(*c);
1.7 paf 85:
86: if(request_file_spec) {
1.1 paf 87: fisUTF8=false;
1.7 paf 88: loadDefinition(*request_file_spec);
1.1 paf 89: #ifdef XML
90: addEncoding(name_cstr);
91: #endif
92: } else {
93: fisUTF8=true;
1.4 paf 94: // grab default onces [for UTF-8 so to be able to make a-z =>A-Z
1.1 paf 95: memcpy(pcre_tables, pcre_default_tables, sizeof(pcre_tables));
96: }
97:
98: #ifdef XML
99: initTranscoder(&aname, name_cstr);
100: #endif
101: }
102:
103: Charset::~Charset() {
104: #ifdef XML
1.9 paf 105: // not deleting transcoder, that's not our business
1.1 paf 106: #endif
107: }
108:
1.7 paf 109: void Charset::loadDefinition(const String& request_file_spec) {
1.1 paf 110: // pcre_tables
111: // lowcase, flipcase, bits digit+word+whitespace, masks
112:
113: // must not move this inside of prepare_case_tables
114: // don't know the size there
115: memset(pcre_tables, 0, sizeof(pcre_tables));
116: prepare_case_tables(pcre_tables);
1.4 paf 117: cstr2ctypes(pcre_tables,(const unsigned char *)"*+?{^.$|()[", ctype_meta);
1.1 paf 118:
119: // charset
1.10 paf 120: memset(tables.fromTable, 0, sizeof(tables.fromTable));
121: tables.toTable=(Charset_TransRec *)calloc(sizeof(Charset_TransRec)*MAX_CHARSET_UNI_CODES);
122: tables.toTableSize=0;
1.1 paf 123: // strangly vital
1.10 paf 124: tables.toTable[tables.toTableSize].intCh=0;
125: tables.toTable[tables.toTableSize].extCh=(XMLByte)0;
126: tables.toTableSize++;
1.1 paf 127:
128: // loading text
1.7 paf 129: char *data=file_read_text(pool(), request_file_spec);
1.1 paf 130:
131: // ignore header
132: getrow(&data);
133:
134: // parse cells
135: char *row;
136: while(row=getrow(&data)) {
137: // remove empty&comment lines
138: if(!*row || *row=='#')
139: continue;
140:
141: // char white-space digit hex-digit letter word lowercase unicode1 unicode2
142: unsigned int c=0;
143: char *cell;
144: for(int column=0; cell=lsplit(&row, '\t'); column++) {
145: switch(column) {
146: case 0: c=to_wchar_code(cell); break;
147: // pcre_tables
148: case 1: element2ctypes(c, to_bool(cell), pcre_tables, ctype_space, cbit_space); break;
149: case 2: element2ctypes(c, to_bool(cell), pcre_tables, ctype_digit, cbit_digit); break;
150: case 3: element2ctypes(c, to_bool(cell), pcre_tables, ctype_xdigit); break;
151: case 4: element2ctypes(c, to_bool(cell), pcre_tables, ctype_letter); break;
152: case 5: element2ctypes(c, to_bool(cell), pcre_tables, ctype_word, cbit_word); break;
153: case 6: element2case(c, to_wchar_code(cell), pcre_tables); break;
154: case 7:
155: case 8:
156: // charset
1.10 paf 157: if(tables.toTableSize>MAX_CHARSET_UNI_CODES)
1.23 paf 158: throw Exception("parser.runtime",
1.7 paf 159: &request_file_spec,
1.1 paf 160: "charset must contain not more then %d unicode values", MAX_CHARSET_UNI_CODES);
161:
162: XMLCh unicode=(XMLCh)to_wchar_code(cell);
163: if(!unicode && column==7/*unicode1 column*/)
164: unicode=(XMLCh)c;
165: if(unicode) {
1.10 paf 166: if(!tables.fromTable[c])
167: tables.fromTable[c]=unicode;
168: tables.toTable[tables.toTableSize].intCh=unicode;
169: tables.toTable[tables.toTableSize].extCh=(XMLByte)c;
170: tables.toTableSize++;
1.1 paf 171: }
172: break;
173: }
174: }
175: };
176:
177: // sort by the Unicode code point
178: sort_ToTable();
179: }
180:
181: static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
182: return
183: static_cast<const Charset_TransRec *>(a)->intCh-
184: static_cast<const Charset_TransRec *>(b)->intCh;
185: }
186:
187: void Charset::sort_ToTable() {
1.10 paf 188: _qsort(tables.toTable, tables.toTableSize, sizeof(*tables.toTable),
1.1 paf 189: sort_cmp_Trans_rec_intCh);
190: //FILE *f=fopen("c:\\temp\\a", "wb");
1.10 paf 191: //fwrite(tables.toTable, tables.toTableSize, sizeof(*tables.toTable), f);
1.1 paf 192: //fclose(f);
193: }
194:
1.10 paf 195: static XMLByte xlatOneTo(const XMLCh toXlat,
1.25 paf 196: const Charset::Tables& tables,
197: XMLByte not_found) {
1.1 paf 198: unsigned int lowOfs = 0;
1.10 paf 199: unsigned int hiOfs = tables.toTableSize - 1;
1.1 paf 200: XMLByte curByte = 0;
201: do {
202: // Calc the mid point of the low and high offset.
1.4 paf 203: const unsigned int midOfs =((hiOfs - lowOfs) / 2)+lowOfs;
1.1 paf 204:
205: // If our test char is greater than the mid point char, then
206: // we move up to the upper half. Else we move to the lower
207: // half. If its equal, then its our guy.
1.10 paf 208: if(toXlat>tables.toTable[midOfs].intCh)
1.1 paf 209: lowOfs = midOfs;
1.10 paf 210: else if(toXlat<tables.toTable[midOfs].intCh)
1.1 paf 211: hiOfs = midOfs;
212: else
1.10 paf 213: return tables.toTable[midOfs].extCh;
1.4 paf 214: } while(lowOfs+1<hiOfs);
1.1 paf 215:
1.25 paf 216: return not_found;
1.1 paf 217: }
218:
219: void Charset::transcode(Pool& pool,
220: const Charset& source_charset, const void *source_body, size_t source_content_length,
221: const Charset& dest_charset, const void *& dest_body, size_t& dest_content_length
222: ) {
1.4 paf 223: if(!source_content_length) {
224: dest_body=0;
225: dest_content_length=0;
226: return;
227: }
228:
1.1 paf 229: switch((source_charset.isUTF8()?0x10:0x00)|(dest_charset.isUTF8()?0x01:0x00)) {
230: default: // 0x00
231: source_charset.transcodeToCharset(pool, dest_charset,
232: source_body, source_content_length,
233: dest_body, dest_content_length);
234: break;
235: case 0x01:
236: source_charset.transcodeToUTF8(pool,
237: source_body, source_content_length,
238: dest_body, dest_content_length);
239: break;
240: case 0x10:
241: dest_charset.transcodeFromUTF8(pool,
242: source_body, source_content_length,
243: dest_body, dest_content_length);
244: break;
245: case 0x11:
246: dest_body=source_body;
247: dest_content_length=source_content_length;
248: break;
249: }
250: }
251:
252: // ---------------------------------------------------------------------------
253: // Local static data
254: //
255: // gUTFBytes
256: // A list of counts of trailing bytes for each initial byte in the input.
257: //
258: // gUTFOffsets
259: // A list of values to offset each result char type, according to how
260: // many source bytes when into making it.
261: //
262: // gFirstByteMark
263: // A list of values to mask onto the first byte of an encoded sequence,
264: // indexed by the number of bytes used to create the sequence.
265: // ---------------------------------------------------------------------------
266: static const XMLByte gUTFBytes[0x100] = {
267: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
268: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
269: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
270: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
271: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
272: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
273: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
274: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
275: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
276: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
277: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
278: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
279: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
280: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
281: , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
282: , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
283: };
284:
285: static const uint gUTFOffsets[6] = {
286: 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
287: };
288:
289: static const XMLByte gFirstByteMark[7] = {
290: 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
291: };
292:
1.11 paf 293: static int transcodeToUTF8(
294: const XMLByte* srcData, size_t& srcLen,
295: XMLByte *toFill, size_t& toFillLen,
1.10 paf 296: const Charset::Tables& tables) {
1.11 paf 297: const XMLByte* srcPtr=srcData;
298: const XMLByte* srcEnd=srcData+srcLen;
299: XMLByte* outPtr=toFill;
300: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 301:
1.4 paf 302: while(srcPtr<srcEnd) {
1.10 paf 303: uint curVal = tables.fromTable[*srcPtr];
1.1 paf 304: if(!curVal) {
305: // use the replacement character
1.4 paf 306: *outPtr++= '?';
307: srcPtr++;
1.1 paf 308: continue;
309: }
310:
311: // Figure out how many bytes we need
312: unsigned int encodedBytes;
1.4 paf 313: if(curVal<0x80)
1.1 paf 314: encodedBytes = 1;
1.4 paf 315: else if(curVal<0x800)
1.1 paf 316: encodedBytes = 2;
1.4 paf 317: else if(curVal<0x10000)
1.1 paf 318: encodedBytes = 3;
1.4 paf 319: else if(curVal<0x200000)
1.1 paf 320: encodedBytes = 4;
1.4 paf 321: else if(curVal<0x4000000)
1.1 paf 322: encodedBytes = 5;
1.4 paf 323: else if(curVal<= 0x7FFFFFFF)
1.1 paf 324: encodedBytes = 6;
325: else {
326: // use the replacement character
1.4 paf 327: *outPtr++= '?';
328: srcPtr++;
1.1 paf 329: continue;
330: }
331:
1.10 paf 332: // If we cannot fully get this char into the output buffer
333: if (outPtr + encodedBytes > outEnd)
334: break;
1.1 paf 335:
336: // We can do it, so update the source index
337: srcPtr++;
338:
339: // And spit out the bytes. We spit them out in reverse order
340: // here, so bump up the output pointer and work down as we go.
1.4 paf 341: outPtr+= encodedBytes;
1.1 paf 342: switch(encodedBytes) {
1.18 paf 343: case 6: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 344: curVal>>= 6;
1.18 paf 345: case 5: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 346: curVal>>= 6;
1.18 paf 347: case 4: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 348: curVal>>= 6;
1.18 paf 349: case 3: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 350: curVal>>= 6;
1.18 paf 351: case 2: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 352: curVal>>= 6;
1.18 paf 353: case 1: *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]);
1.1 paf 354: }
355:
356: // Add the encoded bytes back in again to indicate we've eaten them
1.4 paf 357: outPtr+= encodedBytes;
1.1 paf 358: }
359:
1.11 paf 360: // Update the bytes eaten
361: srcLen = srcPtr - srcData;
362:
363: // Return the characters read
364: toFillLen = outPtr - toFill;
365:
1.29 paf 366: //return srcPtr==srcEnd?(int)toFillLen:-1;
367: /*
368: xmlCharEncodingInputFunc
369: Returns :
370: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
371: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
372: of ocetes consumed.
373: */
374: return 0;
1.1 paf 375: }
1.26 paf 376: /// @todo digital entites only when xml/html output [at output in html/xml mode, in html part of a letter]
1.30 paf 377: static int transcodeFromUTF8(
1.11 paf 378: const XMLByte *srcData, size_t& srcLen,
379: XMLByte* toFill, size_t& toFillLen,
380: const Charset::Tables& tables) {
381: const XMLByte* srcPtr=srcData;
382: const XMLByte* srcEnd=srcData+srcLen;
383: XMLByte* outPtr=toFill;
384: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 385:
1.10 paf 386: // We now loop until we either run out of input data, or room to store
387: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
1.1 paf 388: // Get the next leading byte out
389: const XMLByte firstByte = *srcPtr;
390:
1.4 paf 391: // Special-case ASCII, which is a leading byte value of<= 127
392: if(firstByte<= 127) {
393: *outPtr++= firstByte;
1.1 paf 394: srcPtr++;
395: continue;
396: }
397:
398: // See how many trailing src bytes this sequence is going to require
399: const unsigned int trailingBytes = gUTFBytes[firstByte];
400:
401: // If there are not enough source bytes to do this one, then we
1.4 paf 402: // are done. Note that we done>= here because we are implicitly
1.1 paf 403: // counting the 1 byte we get no matter what.
1.4 paf 404: if(srcPtr+trailingBytes>= srcEnd)
1.1 paf 405: break;
406:
407: // Looks ok, so lets build up the value
408: uint tmpVal=0;
409: switch(trailingBytes) {
410: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
411: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
412: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
413: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
414: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
415: case 0: tmpVal+=*srcPtr++;
416: break;
417:
418: default:
1.23 paf 419: throw Exception(0,
1.1 paf 420: 0,
1.4 paf 421: "transcodeFromUTF8 error: wrong trailingBytes value(%d)", trailingBytes);
1.1 paf 422: }
423: tmpVal-=gUTFOffsets[trailingBytes];
424:
425: // If it will fit into a single char, then put it in. Otherwise
426: // fail [*encode it as a surrogate pair. If its not valid, use the
427: // replacement char.*]
1.25 paf 428: if(!(tmpVal & 0xFFFF0000)) {
429: if(XMLByte xlat=xlatOneTo(tmpVal, tables, 0))
430: *outPtr++=xlat;
431: else
432: outPtr+=sprintf((char *)outPtr, "&#%d;", tmpVal); // &#decimal;
433: } else
1.23 paf 434: throw Exception(0,
1.1 paf 435: 0,
1.4 paf 436: "transcodeFromUTF8 error: too big tmpVal(0x%08X)", tmpVal);
1.1 paf 437: }
438:
1.11 paf 439: // Update the bytes eaten
440: srcLen = srcPtr - srcData;
441:
442: // Return the characters read
443: toFillLen = outPtr - toFill;
444:
1.29 paf 445: //return srcPtr==srcEnd?(int)toFillLen:-1;
446: /*
447: xmlCharEncodingOutputFunc
448: Returns :
449: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
450: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
451: of ocetes consumed.
452: */
453: return 0;
1.10 paf 454: }
455:
456: /// @todo not so memory-hungry with prescan
457: void Charset::transcodeToUTF8(Pool& pool,
458: const void *source_body, size_t source_content_length,
1.11 paf 459: const void *& adest_body, size_t& dest_content_length) const {
1.25 paf 460: dest_content_length=source_content_length*6/*so that surly enough, max utf8 seq len=6*/;
1.11 paf 461: XMLByte *dest_body=(XMLByte*)pool.malloc(dest_content_length);
462:
463: if(::transcodeToUTF8(
464: (XMLByte *)source_body, source_content_length,
465: dest_body, dest_content_length,
466: tables)<0)
1.10 paf 467: throw(0, 0,
468: 0,
1.11 paf 469: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 470:
1.1 paf 471: // return
472: adest_body=dest_body;
1.10 paf 473: }
474: void Charset::transcodeFromUTF8(Pool& pool,
475: const void *source_body, size_t source_content_length,
1.11 paf 476: const void *& adest_body, size_t& dest_content_length) const {
1.25 paf 477: dest_content_length=source_content_length*6/*so that surly enough, "ÿ" has max ratio */;
1.11 paf 478: XMLByte *dest_body=(XMLByte*)pool.malloc(dest_content_length);
479:
480: if(::transcodeFromUTF8(
481: (XMLByte *)source_body, source_content_length,
482: dest_body, dest_content_length,
483: tables)<0)
1.10 paf 484: throw(0, 0,
485: 0,
1.11 paf 486: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 487:
488: // return
489: adest_body=dest_body;
1.1 paf 490: }
491:
492: /// transcode using both charsets
493: void Charset::transcodeToCharset(Pool& pool,
494: const Charset& dest_charset,
495: const void *source_body, size_t source_content_length,
1.6 paf 496: const void *& adest_body, size_t& adest_content_length) const {
1.3 paf 497: if(&dest_charset==this) {
1.6 paf 498: adest_body=source_body;
499: adest_content_length=source_content_length;
500: } else {
501: size_t dest_content_length=source_content_length;
502: unsigned char *dest_body=(unsigned char *)pool.malloc(dest_content_length);
503:
1.11 paf 504: const XMLByte* srcPtr=(XMLByte *)source_body;
505: const XMLByte* srcEnd=(XMLByte *)source_body+source_content_length;
1.6 paf 506:
507: for(XMLByte* outPtr=dest_body; srcPtr<srcEnd; srcPtr++) {
1.10 paf 508: XMLCh curVal = tables.fromTable[*srcPtr];
1.6 paf 509: if(curVal)
1.25 paf 510: *outPtr++=xlatOneTo(curVal, dest_charset.tables, '?');
1.6 paf 511: else {
512: // use the replacement character
513: *outPtr++= '?';
514: }
515: }
1.1 paf 516:
1.6 paf 517: adest_body=dest_body;
518: adest_content_length=dest_content_length;
519: }
1.1 paf 520: }
521:
522: #ifdef XML
1.10 paf 523: static int xml256CharEncodingInputFunc (
524: unsigned char *out,
525: int *outlen,
526: const unsigned char *in,
527: int *inlen,
528: void *info) {
529: return transcodeToUTF8(
1.21 paf 530: in, *(size_t*)inlen,
531: out, *(size_t*)outlen,
1.10 paf 532: *(const Charset::Tables *)info);
533: }
534:
535: static int xml256CharEncodingOutputFunc (
536: unsigned char *out,
537: int *outlen,
538: const unsigned char *in,
539: int *inlen,
540: void *info) {
541: return transcodeFromUTF8(
1.21 paf 542: in, *(size_t*)inlen,
543: out, *(size_t*)outlen,
1.10 paf 544: *(const Charset::Tables *)info);
545: }
546:
547:
548: void Charset::addEncoding(char *name_cstr) {
549: xmlCharEncodingHandler *handler=
550: (xmlCharEncodingHandler *)malloc(sizeof(xmlCharEncodingHandler));
551: handler->name=name_cstr;
552: handler->input=xml256CharEncodingInputFunc; handler->inputInfo=&tables;
553: handler->output=xml256CharEncodingOutputFunc; handler->outputInfo=&tables;
554:
555: xmlRegisterCharEncodingHandler(handler);
556: }
557:
558: void Charset::initTranscoder(const String *source, const char *name_cstr) {
1.15 paf 559: ftranscoder=xmlFindCharEncodingHandler(name_cstr);
560: transcoder(source); // check right way
561: }
562:
563: xmlCharEncodingHandler *Charset::transcoder(const String *source) {
564: if(!ftranscoder)
1.23 paf 565: throw Exception("parser.runtime",
1.10 paf 566: source,
567: "unsupported encoding");
1.15 paf 568: return ftranscoder;
1.10 paf 569: }
570:
1.14 paf 571: const char *Charset::transcode_cstr(xmlChar *s) {
1.13 paf 572: if(!s)
1.14 paf 573: return "";
1.8 paf 574:
1.14 paf 575: int inlen=strlen((const char *)s);
1.8 paf 576: int outlen=inlen+1; // max
577: char *out=(char *)malloc(outlen*sizeof(char));
578:
1.30 paf 579: int error;
1.17 paf 580: if(xmlCharEncodingOutputFunc output=transcoder(0)->output) {
1.30 paf 581: error=output(
1.17 paf 582: (unsigned char*)out, &outlen,
583: (const unsigned char*)s, &inlen,
584: transcoder(0)->outputInfo);
1.30 paf 585: } else {
586: memcpy(out, s, outlen=inlen);
587: error=0;
588: }
589: if(error<0)
1.23 paf 590: throw Exception(0,
1.8 paf 591: 0,
1.30 paf 592: "transcode_cstr failed (%d)", error);
1.8 paf 593:
1.30 paf 594: out[outlen/*surely would be less then on input*/]=0;
1.8 paf 595: return out;
1.14 paf 596: }
1.31 paf 597: String& Charset::transcode(xmlChar *s
598: #ifndef NO_STRING_ORIGIN
599: , const String *origin
600: #endif
601: ) {
602: String& result=*NEW String(pool());
603: result.APPEND_CLEAN(transcode_cstr(s), 0/*auto-size*/, origin->origin().file, origin->origin().line);
604: return result;
1.14 paf 605: }
606: const char *Charset::transcode_cstr(GdomeDOMString *s) {
607: return s?transcode_cstr(BAD_CAST s->str):"";
1.1 paf 608: }
1.31 paf 609: String& Charset::transcode(GdomeDOMString *s
610: #ifndef NO_STRING_ORIGIN
611: , const String *origin
612: #endif
613: ) {
614: String& result=*NEW String(pool());
615: result.APPEND_CLEAN(transcode_cstr(s), 0/*auto-size*/, origin->origin().file, origin->origin().line);
616: return result;
1.1 paf 617: }
618:
1.8 paf 619: /// @test less memory using -maybe- xmlParserInputBufferCreateMem
1.24 paf 620: xmlChar *Charset::transcode_buf2xchar(const char *buf, size_t buf_size) {
1.30 paf 621: unsigned char *out;
622: int outlen;
623: int error;
1.17 paf 624: if(xmlCharEncodingInputFunc input=transcoder(0)->input) {
1.32 paf 625: outlen=buf_size*6/*max*/;
626: out=(unsigned char*)malloc((outlen+1)*sizeof(unsigned char));
1.30 paf 627: error=input(
1.17 paf 628: out, &outlen,
629: (const unsigned char *)buf, (int *)&buf_size,
630: transcoder(0)->inputInfo);
1.30 paf 631: } else {
632: outlen=buf_size;
1.32 paf 633: out=(unsigned char*)malloc((outlen+1)*sizeof(unsigned char));
1.30 paf 634: memcpy(out, buf, outlen);
635: error=0;
636: }
1.17 paf 637:
1.30 paf 638: if(error<0)
1.23 paf 639: throw Exception(0,
1.8 paf 640: 0,
1.30 paf 641: "transcode_buf failed (%d)", error);
1.8 paf 642:
1.30 paf 643: out[outlen/*surely would be less then on input*/]=0;
1.24 paf 644: return (xmlChar *)out;
645: }
646: GdomeDOMString_auto_ptr Charset::transcode_buf2dom(const char *buf, size_t buf_size) {
647: return GdomeDOMString_auto_ptr((gchar*)transcode_buf2xchar(buf, buf_size));
1.1 paf 648: }
1.12 paf 649: GdomeDOMString_auto_ptr Charset::transcode(const String& s) {
1.1 paf 650: const char *cstr=s.cstr(String::UL_UNSPECIFIED);
651:
1.24 paf 652: return transcode_buf2dom(cstr, strlen(cstr));
1.1 paf 653: }
654: #endif
1.34 ! paf 655:
! 656: String& Charset::transcode(Pool& pool,
! 657: const Charset& source_transcoder,
! 658: const Charset& dest_transcoder,
! 659: const String& src) {
! 660:
! 661: const char *src_ptr=src.cstr(String::UL_UNSPECIFIED);
! 662: size_t src_size=strlen(src_ptr);
! 663:
! 664: const void *dest_ptr;
! 665: size_t dest_size;
! 666:
! 667: Charset::transcode(pool,
! 668: source_transcoder, (const void*)src_ptr, src_size,
! 669: dest_transcoder, dest_ptr, dest_size);
! 670:
! 671: return *new(pool) String(pool, (const char*)dest_ptr, dest_size);
! 672: }
! 673:
! 674: void Charset::transcode(Pool& pool,
! 675: const Charset& source_transcoder,
! 676: const Charset& dest_transcoder,
! 677: Array& src) {
! 678: for(int i=0; i<src.size(); i++)
! 679: src.put(i, &transcode(pool, source_transcoder, dest_transcoder, *src.get_string(i)));
! 680: }
! 681:
! 682: #ifndef DOXYGEN
! 683: struct Transcode_pair_info {
! 684: Pool* pool;
! 685: const Charset* source_transcoder;
! 686: const Charset* dest_transcoder;
! 687: };
! 688: #endif
! 689: static void transcode_pair(const Hash::Key& key, Hash::Val *& value, void *raw_info) {
! 690: Transcode_pair_info& info=*static_cast<Transcode_pair_info*>(raw_info);
! 691: value=&Charset::transcode(*info.pool,
! 692: *info.source_transcoder,
! 693: *info.dest_transcoder,
! 694: *static_cast<String*>(value));
! 695: }
! 696: void Charset::transcode(Pool& pool,
! 697: const Charset& source_transcoder,
! 698: const Charset& dest_transcoder,
! 699: Hash& src) {
! 700: Transcode_pair_info info={&pool, &source_transcoder, &dest_transcoder};
! 701: src.for_each(transcode_pair, &info);
! 702: }
E-mail: