Annotation of parser3/src/main/pa_charset.C, revision 1.30
1.1 paf 1: /** @file
2: Parser: Charset connection implementation.
3:
1.20 paf 4: Copyright(c) 2001, 2002 ArtLebedev Group (http://www.artlebedev.com)
1.4 paf 5: Author: Alexander Petrosyan<paf@design.ru>(http://paf.design.ru)
1.27 paf 6: */
1.1 paf 7:
// Date stamp string for this source file; not referenced anywhere in the visible code.
// NOTE(review): presumably maintained by version-control keyword expansion - confirm.
1.30 ! paf 8: static const char* IDENT_CHARSET_C="$Date: 2002/09/10 08:41:00 $";
1.1 paf 9:
10: #include "pa_charset.h"
11:
12: #ifdef XML
1.8 paf 13: #include "libxml/encoding.h"
1.1 paf 14: #endif
15:
16: // globals
17:
18:
19: // consts
20:
// Number of Charset_TransRec records allocated for tables.toTable in Charset::loadDefinition;
// also reported in the error raised there when a charset file defines too many unicode values.
21: #define MAX_CHARSET_UNI_CODES 500
22:
23: // helpers
24:
25: inline void prepare_case_tables(unsigned char *tables) {
26: unsigned char *lcc_table=tables+lcc_offset;
27: unsigned char *fcc_table=tables+fcc_offset;
28: for(int i=0; i<0x100; i++)
29: lcc_table[i]=fcc_table[i]=i;
30: }
31: inline void cstr2ctypes(unsigned char *tables, const unsigned char *cstr,
32: unsigned char bit) {
33: unsigned char *ctypes_table=tables+ctypes_offset;
34: ctypes_table[0]=bit;
35: for(; *cstr; cstr++) {
36: unsigned char c=*cstr;
37: ctypes_table[c]|=bit;
38: }
39: }
/// Converts the text of a table cell to a character code.
/// @return 0 for a null or empty string; the byte value of the character
/// for a one-character string; otherwise the number parsed by strtol()
/// with automatic base detection (decimal, 0x hex, leading-0 octal).
inline unsigned int to_wchar_code(const char *cstr) {
	// nothing to convert
	if(!cstr || cstr[0]==0)
		return 0;

	// exactly one character: the character itself is the code
	if(cstr[1]==0)
		return (unsigned int)(unsigned char)cstr[0];

	// longer text: a number; where parsing stops is of no interest
	return (unsigned int)strtol(cstr, 0, 0);
}
/// A cell counts as "true" when it is present and not empty.
inline bool to_bool(const char *cstr) {
	if(!cstr)
		return false;
	return cstr[0]!='\0';
}
52: static void element2ctypes(unsigned char c, bool belongs,
53: unsigned char *tables, unsigned char bit, int group_offset=-1) {
54: if(!belongs)
55: return;
56:
57: unsigned char *ctypes_table=tables+ctypes_offset;
58:
59: ctypes_table[c]|=bit;
60: if(group_offset>=0)
1.4 paf 61: tables[cbits_offset+group_offset+c/8] |= 1<<(c%8);
1.1 paf 62: }
63: static void element2case(unsigned char from, unsigned char to,
64: unsigned char *tables) {
65: if(!to)
66: return;
67:
68: unsigned char *lcc_table=tables+lcc_offset;
69: unsigned char *fcc_table=tables+fcc_offset;
70: lcc_table[from]=to;
71: fcc_table[from]=to; fcc_table[to]=from;
72: }
73:
74: // methods
75:
76: extern "C" unsigned char pcre_default_tables[]; // pcre/chartables.c
1.7 paf 77: Charset::Charset(Pool& apool, const String& aname, const String *request_file_spec): Pooled(apool),
78: fname(aname) {
1.1 paf 79:
1.10 paf 80: char *name_cstr=fname.cstr();
81: for(char *c=name_cstr; *c; c++)
82: *c = toupper(*c);
1.7 paf 83:
84: if(request_file_spec) {
1.1 paf 85: fisUTF8=false;
1.7 paf 86: loadDefinition(*request_file_spec);
1.1 paf 87: #ifdef XML
88: addEncoding(name_cstr);
89: #endif
90: } else {
91: fisUTF8=true;
1.4 paf 92: // grab default onces [for UTF-8 so to be able to make a-z =>A-Z
1.1 paf 93: memcpy(pcre_tables, pcre_default_tables, sizeof(pcre_tables));
94: }
95:
96: #ifdef XML
97: initTranscoder(&aname, name_cstr);
98: #endif
99: }
100:
101: Charset::~Charset() {
102: #ifdef XML
1.9 paf 103: // not deleting transcoder, that's not our business
1.1 paf 104: #endif
105: }
106:
1.7 paf 107: void Charset::loadDefinition(const String& request_file_spec) {
1.1 paf 108: // pcre_tables
109: // lowcase, flipcase, bits digit+word+whitespace, masks
110:
111: // must not move this inside of prepare_case_tables
112: // don't know the size there
113: memset(pcre_tables, 0, sizeof(pcre_tables));
114: prepare_case_tables(pcre_tables);
1.4 paf 115: cstr2ctypes(pcre_tables,(const unsigned char *)"*+?{^.$|()[", ctype_meta);
1.1 paf 116:
117: // charset
1.10 paf 118: memset(tables.fromTable, 0, sizeof(tables.fromTable));
119: tables.toTable=(Charset_TransRec *)calloc(sizeof(Charset_TransRec)*MAX_CHARSET_UNI_CODES);
120: tables.toTableSize=0;
1.1 paf 121: // strangly vital
1.10 paf 122: tables.toTable[tables.toTableSize].intCh=0;
123: tables.toTable[tables.toTableSize].extCh=(XMLByte)0;
124: tables.toTableSize++;
1.1 paf 125:
126: // loading text
1.7 paf 127: char *data=file_read_text(pool(), request_file_spec);
1.1 paf 128:
129: // ignore header
130: getrow(&data);
131:
132: // parse cells
133: char *row;
134: while(row=getrow(&data)) {
135: // remove empty&comment lines
136: if(!*row || *row=='#')
137: continue;
138:
139: // char white-space digit hex-digit letter word lowercase unicode1 unicode2
140: unsigned int c=0;
141: char *cell;
142: for(int column=0; cell=lsplit(&row, '\t'); column++) {
143: switch(column) {
144: case 0: c=to_wchar_code(cell); break;
145: // pcre_tables
146: case 1: element2ctypes(c, to_bool(cell), pcre_tables, ctype_space, cbit_space); break;
147: case 2: element2ctypes(c, to_bool(cell), pcre_tables, ctype_digit, cbit_digit); break;
148: case 3: element2ctypes(c, to_bool(cell), pcre_tables, ctype_xdigit); break;
149: case 4: element2ctypes(c, to_bool(cell), pcre_tables, ctype_letter); break;
150: case 5: element2ctypes(c, to_bool(cell), pcre_tables, ctype_word, cbit_word); break;
151: case 6: element2case(c, to_wchar_code(cell), pcre_tables); break;
152: case 7:
153: case 8:
154: // charset
1.10 paf 155: if(tables.toTableSize>MAX_CHARSET_UNI_CODES)
1.23 paf 156: throw Exception("parser.runtime",
1.7 paf 157: &request_file_spec,
1.1 paf 158: "charset must contain not more then %d unicode values", MAX_CHARSET_UNI_CODES);
159:
160: XMLCh unicode=(XMLCh)to_wchar_code(cell);
161: if(!unicode && column==7/*unicode1 column*/)
162: unicode=(XMLCh)c;
163: if(unicode) {
1.10 paf 164: if(!tables.fromTable[c])
165: tables.fromTable[c]=unicode;
166: tables.toTable[tables.toTableSize].intCh=unicode;
167: tables.toTable[tables.toTableSize].extCh=(XMLByte)c;
168: tables.toTableSize++;
1.1 paf 169: }
170: break;
171: }
172: }
173: };
174:
175: // sort by the Unicode code point
176: sort_ToTable();
177: }
178:
179: static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
180: return
181: static_cast<const Charset_TransRec *>(a)->intCh-
182: static_cast<const Charset_TransRec *>(b)->intCh;
183: }
184:
185: void Charset::sort_ToTable() {
1.10 paf 186: _qsort(tables.toTable, tables.toTableSize, sizeof(*tables.toTable),
1.1 paf 187: sort_cmp_Trans_rec_intCh);
188: //FILE *f=fopen("c:\\temp\\a", "wb");
1.10 paf 189: //fwrite(tables.toTable, tables.toTableSize, sizeof(*tables.toTable), f);
1.1 paf 190: //fclose(f);
191: }
192:
1.10 paf 193: static XMLByte xlatOneTo(const XMLCh toXlat,
1.25 paf 194: const Charset::Tables& tables,
195: XMLByte not_found) {
1.1 paf 196: unsigned int lowOfs = 0;
1.10 paf 197: unsigned int hiOfs = tables.toTableSize - 1;
1.1 paf 198: XMLByte curByte = 0;
199: do {
200: // Calc the mid point of the low and high offset.
1.4 paf 201: const unsigned int midOfs =((hiOfs - lowOfs) / 2)+lowOfs;
1.1 paf 202:
203: // If our test char is greater than the mid point char, then
204: // we move up to the upper half. Else we move to the lower
205: // half. If its equal, then its our guy.
1.10 paf 206: if(toXlat>tables.toTable[midOfs].intCh)
1.1 paf 207: lowOfs = midOfs;
1.10 paf 208: else if(toXlat<tables.toTable[midOfs].intCh)
1.1 paf 209: hiOfs = midOfs;
210: else
1.10 paf 211: return tables.toTable[midOfs].extCh;
1.4 paf 212: } while(lowOfs+1<hiOfs);
1.1 paf 213:
1.25 paf 214: return not_found;
1.1 paf 215: }
216:
217: void Charset::transcode(Pool& pool,
218: const Charset& source_charset, const void *source_body, size_t source_content_length,
219: const Charset& dest_charset, const void *& dest_body, size_t& dest_content_length
220: ) {
1.4 paf 221: if(!source_content_length) {
222: dest_body=0;
223: dest_content_length=0;
224: return;
225: }
226:
1.1 paf 227: switch((source_charset.isUTF8()?0x10:0x00)|(dest_charset.isUTF8()?0x01:0x00)) {
228: default: // 0x00
229: source_charset.transcodeToCharset(pool, dest_charset,
230: source_body, source_content_length,
231: dest_body, dest_content_length);
232: break;
233: case 0x01:
234: source_charset.transcodeToUTF8(pool,
235: source_body, source_content_length,
236: dest_body, dest_content_length);
237: break;
238: case 0x10:
239: dest_charset.transcodeFromUTF8(pool,
240: source_body, source_content_length,
241: dest_body, dest_content_length);
242: break;
243: case 0x11:
244: dest_body=source_body;
245: dest_content_length=source_content_length;
246: break;
247: }
248: }
249:
250: // ---------------------------------------------------------------------------
251: // Local static data
252: //
253: // gUTFBytes
254: // A list of counts of trailing bytes for each initial byte in the input.
255: //
256: // gUTFOffsets
257: // A list of values to offset each result char type, according to how
258: // many source bytes when into making it.
259: //
260: // gFirstByteMark
261: // A list of values to mask onto the first byte of an encoded sequence,
262: // indexed by the number of bytes used to create the sequence.
263: // ---------------------------------------------------------------------------
264: static const XMLByte gUTFBytes[0x100] = {
265: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
266: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
267: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
268: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
269: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
270: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
271: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
272: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
273: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
274: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
275: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
276: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
277: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
278: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
279: , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
280: , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
281: };
282:
283: static const uint gUTFOffsets[6] = {
284: 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
285: };
286:
287: static const XMLByte gFirstByteMark[7] = {
288: 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
289: };
290:
1.11 paf 291: static int transcodeToUTF8(
292: const XMLByte* srcData, size_t& srcLen,
293: XMLByte *toFill, size_t& toFillLen,
1.10 paf 294: const Charset::Tables& tables) {
1.11 paf 295: const XMLByte* srcPtr=srcData;
296: const XMLByte* srcEnd=srcData+srcLen;
297: XMLByte* outPtr=toFill;
298: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 299:
1.4 paf 300: while(srcPtr<srcEnd) {
1.10 paf 301: uint curVal = tables.fromTable[*srcPtr];
1.1 paf 302: if(!curVal) {
303: // use the replacement character
1.4 paf 304: *outPtr++= '?';
305: srcPtr++;
1.1 paf 306: continue;
307: }
308:
309: // Figure out how many bytes we need
310: unsigned int encodedBytes;
1.4 paf 311: if(curVal<0x80)
1.1 paf 312: encodedBytes = 1;
1.4 paf 313: else if(curVal<0x800)
1.1 paf 314: encodedBytes = 2;
1.4 paf 315: else if(curVal<0x10000)
1.1 paf 316: encodedBytes = 3;
1.4 paf 317: else if(curVal<0x200000)
1.1 paf 318: encodedBytes = 4;
1.4 paf 319: else if(curVal<0x4000000)
1.1 paf 320: encodedBytes = 5;
1.4 paf 321: else if(curVal<= 0x7FFFFFFF)
1.1 paf 322: encodedBytes = 6;
323: else {
324: // use the replacement character
1.4 paf 325: *outPtr++= '?';
326: srcPtr++;
1.1 paf 327: continue;
328: }
329:
1.10 paf 330: // If we cannot fully get this char into the output buffer
331: if (outPtr + encodedBytes > outEnd)
332: break;
1.1 paf 333:
334: // We can do it, so update the source index
335: srcPtr++;
336:
337: // And spit out the bytes. We spit them out in reverse order
338: // here, so bump up the output pointer and work down as we go.
1.4 paf 339: outPtr+= encodedBytes;
1.1 paf 340: switch(encodedBytes) {
1.18 paf 341: case 6: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 342: curVal>>= 6;
1.18 paf 343: case 5: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 344: curVal>>= 6;
1.18 paf 345: case 4: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 346: curVal>>= 6;
1.18 paf 347: case 3: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 348: curVal>>= 6;
1.18 paf 349: case 2: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 350: curVal>>= 6;
1.18 paf 351: case 1: *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]);
1.1 paf 352: }
353:
354: // Add the encoded bytes back in again to indicate we've eaten them
1.4 paf 355: outPtr+= encodedBytes;
1.1 paf 356: }
357:
1.11 paf 358: // Update the bytes eaten
359: srcLen = srcPtr - srcData;
360:
361: // Return the characters read
362: toFillLen = outPtr - toFill;
363:
1.29 paf 364: //return srcPtr==srcEnd?(int)toFillLen:-1;
365: /*
366: xmlCharEncodingInputFunc
367: Returns :
368: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
369: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
370: of ocetes consumed.
371: */
372: return 0;
1.1 paf 373: }
1.26 paf 374: /// @todo digital entites only when xml/html output [at output in html/xml mode, in html part of a letter]
1.30 ! paf 375: static int transcodeFromUTF8(
1.11 paf 376: const XMLByte *srcData, size_t& srcLen,
377: XMLByte* toFill, size_t& toFillLen,
378: const Charset::Tables& tables) {
379: const XMLByte* srcPtr=srcData;
380: const XMLByte* srcEnd=srcData+srcLen;
381: XMLByte* outPtr=toFill;
382: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 383:
1.10 paf 384: // We now loop until we either run out of input data, or room to store
385: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
1.1 paf 386: // Get the next leading byte out
387: const XMLByte firstByte = *srcPtr;
388:
1.4 paf 389: // Special-case ASCII, which is a leading byte value of<= 127
390: if(firstByte<= 127) {
391: *outPtr++= firstByte;
1.1 paf 392: srcPtr++;
393: continue;
394: }
395:
396: // See how many trailing src bytes this sequence is going to require
397: const unsigned int trailingBytes = gUTFBytes[firstByte];
398:
399: // If there are not enough source bytes to do this one, then we
1.4 paf 400: // are done. Note that we done>= here because we are implicitly
1.1 paf 401: // counting the 1 byte we get no matter what.
1.4 paf 402: if(srcPtr+trailingBytes>= srcEnd)
1.1 paf 403: break;
404:
405: // Looks ok, so lets build up the value
406: uint tmpVal=0;
407: switch(trailingBytes) {
408: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
409: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
410: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
411: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
412: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
413: case 0: tmpVal+=*srcPtr++;
414: break;
415:
416: default:
1.23 paf 417: throw Exception(0,
1.1 paf 418: 0,
1.4 paf 419: "transcodeFromUTF8 error: wrong trailingBytes value(%d)", trailingBytes);
1.1 paf 420: }
421: tmpVal-=gUTFOffsets[trailingBytes];
422:
423: // If it will fit into a single char, then put it in. Otherwise
424: // fail [*encode it as a surrogate pair. If its not valid, use the
425: // replacement char.*]
1.25 paf 426: if(!(tmpVal & 0xFFFF0000)) {
427: if(XMLByte xlat=xlatOneTo(tmpVal, tables, 0))
428: *outPtr++=xlat;
429: else
430: outPtr+=sprintf((char *)outPtr, "&#%d;", tmpVal); // &#decimal;
431: } else
1.23 paf 432: throw Exception(0,
1.1 paf 433: 0,
1.4 paf 434: "transcodeFromUTF8 error: too big tmpVal(0x%08X)", tmpVal);
1.1 paf 435: }
436:
1.11 paf 437: // Update the bytes eaten
438: srcLen = srcPtr - srcData;
439:
440: // Return the characters read
441: toFillLen = outPtr - toFill;
442:
1.29 paf 443: //return srcPtr==srcEnd?(int)toFillLen:-1;
444: /*
445: xmlCharEncodingOutputFunc
446: Returns :
447: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
448: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
449: of ocetes consumed.
450: */
451: return 0;
1.10 paf 452: }
453:
454: /// @todo not so memory-hungry with prescan
455: void Charset::transcodeToUTF8(Pool& pool,
456: const void *source_body, size_t source_content_length,
1.11 paf 457: const void *& adest_body, size_t& dest_content_length) const {
1.25 paf 458: dest_content_length=source_content_length*6/*so that surly enough, max utf8 seq len=6*/;
1.11 paf 459: XMLByte *dest_body=(XMLByte*)pool.malloc(dest_content_length);
460:
461: if(::transcodeToUTF8(
462: (XMLByte *)source_body, source_content_length,
463: dest_body, dest_content_length,
464: tables)<0)
1.10 paf 465: throw(0, 0,
466: 0,
1.11 paf 467: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 468:
1.1 paf 469: // return
470: adest_body=dest_body;
1.10 paf 471: }
472: void Charset::transcodeFromUTF8(Pool& pool,
473: const void *source_body, size_t source_content_length,
1.11 paf 474: const void *& adest_body, size_t& dest_content_length) const {
1.25 paf 475: dest_content_length=source_content_length*6/*so that surly enough, "ÿ" has max ratio */;
1.11 paf 476: XMLByte *dest_body=(XMLByte*)pool.malloc(dest_content_length);
477:
478: if(::transcodeFromUTF8(
479: (XMLByte *)source_body, source_content_length,
480: dest_body, dest_content_length,
481: tables)<0)
1.10 paf 482: throw(0, 0,
483: 0,
1.11 paf 484: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 485:
486: // return
487: adest_body=dest_body;
1.1 paf 488: }
489:
490: /// transcode using both charsets
491: void Charset::transcodeToCharset(Pool& pool,
492: const Charset& dest_charset,
493: const void *source_body, size_t source_content_length,
1.6 paf 494: const void *& adest_body, size_t& adest_content_length) const {
1.3 paf 495: if(&dest_charset==this) {
1.6 paf 496: adest_body=source_body;
497: adest_content_length=source_content_length;
498: } else {
499: size_t dest_content_length=source_content_length;
500: unsigned char *dest_body=(unsigned char *)pool.malloc(dest_content_length);
501:
1.11 paf 502: const XMLByte* srcPtr=(XMLByte *)source_body;
503: const XMLByte* srcEnd=(XMLByte *)source_body+source_content_length;
1.6 paf 504:
505: for(XMLByte* outPtr=dest_body; srcPtr<srcEnd; srcPtr++) {
1.10 paf 506: XMLCh curVal = tables.fromTable[*srcPtr];
1.6 paf 507: if(curVal)
1.25 paf 508: *outPtr++=xlatOneTo(curVal, dest_charset.tables, '?');
1.6 paf 509: else {
510: // use the replacement character
511: *outPtr++= '?';
512: }
513: }
1.1 paf 514:
1.6 paf 515: adest_body=dest_body;
516: adest_content_length=dest_content_length;
517: }
1.1 paf 518: }
519:
520: #ifdef XML
1.10 paf 521: static int xml256CharEncodingInputFunc (
522: unsigned char *out,
523: int *outlen,
524: const unsigned char *in,
525: int *inlen,
526: void *info) {
527: return transcodeToUTF8(
1.21 paf 528: in, *(size_t*)inlen,
529: out, *(size_t*)outlen,
1.10 paf 530: *(const Charset::Tables *)info);
531: }
532:
533: static int xml256CharEncodingOutputFunc (
534: unsigned char *out,
535: int *outlen,
536: const unsigned char *in,
537: int *inlen,
538: void *info) {
539: return transcodeFromUTF8(
1.21 paf 540: in, *(size_t*)inlen,
541: out, *(size_t*)outlen,
1.10 paf 542: *(const Charset::Tables *)info);
543: }
544:
545:
546: void Charset::addEncoding(char *name_cstr) {
547: xmlCharEncodingHandler *handler=
548: (xmlCharEncodingHandler *)malloc(sizeof(xmlCharEncodingHandler));
549: handler->name=name_cstr;
550: handler->input=xml256CharEncodingInputFunc; handler->inputInfo=&tables;
551: handler->output=xml256CharEncodingOutputFunc; handler->outputInfo=&tables;
552:
553: xmlRegisterCharEncodingHandler(handler);
554: }
555:
556: void Charset::initTranscoder(const String *source, const char *name_cstr) {
1.15 paf 557: ftranscoder=xmlFindCharEncodingHandler(name_cstr);
558: transcoder(source); // check right way
559: }
560:
561: xmlCharEncodingHandler *Charset::transcoder(const String *source) {
562: if(!ftranscoder)
1.23 paf 563: throw Exception("parser.runtime",
1.10 paf 564: source,
565: "unsupported encoding");
1.15 paf 566: return ftranscoder;
1.10 paf 567: }
568:
1.14 paf 569: const char *Charset::transcode_cstr(xmlChar *s) {
1.13 paf 570: if(!s)
1.14 paf 571: return "";
1.8 paf 572:
1.14 paf 573: int inlen=strlen((const char *)s);
1.8 paf 574: int outlen=inlen+1; // max
575: char *out=(char *)malloc(outlen*sizeof(char));
576:
1.30 ! paf 577: int error;
1.17 paf 578: if(xmlCharEncodingOutputFunc output=transcoder(0)->output) {
1.30 ! paf 579: error=output(
1.17 paf 580: (unsigned char*)out, &outlen,
581: (const unsigned char*)s, &inlen,
582: transcoder(0)->outputInfo);
1.30 ! paf 583: } else {
! 584: memcpy(out, s, outlen=inlen);
! 585: error=0;
! 586: }
! 587: if(error<0)
1.23 paf 588: throw Exception(0,
1.8 paf 589: 0,
1.30 ! paf 590: "transcode_cstr failed (%d)", error);
1.8 paf 591:
1.30 ! paf 592: out[outlen/*surely would be less then on input*/]=0;
1.8 paf 593: return out;
1.14 paf 594: }
595: String& Charset::transcode(xmlChar *s) {
596: return *NEW String(pool(), transcode_cstr(s));
597: }
598: const char *Charset::transcode_cstr(GdomeDOMString *s) {
599: return s?transcode_cstr(BAD_CAST s->str):"";
1.1 paf 600: }
1.8 paf 601: String& Charset::transcode(GdomeDOMString *s) {
1.1 paf 602: return *NEW String(pool(), transcode_cstr(s));
603: }
604:
1.8 paf 605: /// @test less memory using -maybe- xmlParserInputBufferCreateMem
1.24 paf 606: xmlChar *Charset::transcode_buf2xchar(const char *buf, size_t buf_size) {
1.30 ! paf 607: unsigned char *out;
! 608: int outlen;
! 609: int error;
1.17 paf 610: if(xmlCharEncodingInputFunc input=transcoder(0)->input) {
1.30 ! paf 611: outlen=buf_size*6/*max*/+1;
! 612: out=(unsigned char*)malloc(outlen*sizeof(unsigned char));
! 613: error=input(
1.17 paf 614: out, &outlen,
615: (const unsigned char *)buf, (int *)&buf_size,
616: transcoder(0)->inputInfo);
1.30 ! paf 617: } else {
! 618: outlen=buf_size;
! 619: out=(unsigned char*)malloc(outlen*sizeof(unsigned char));
! 620: memcpy(out, buf, outlen);
! 621: error=0;
! 622: }
1.17 paf 623:
1.30 ! paf 624: if(error<0)
1.23 paf 625: throw Exception(0,
1.8 paf 626: 0,
1.30 ! paf 627: "transcode_buf failed (%d)", error);
1.8 paf 628:
1.30 ! paf 629: out[outlen/*surely would be less then on input*/]=0;
1.24 paf 630: return (xmlChar *)out;
631: }
632: GdomeDOMString_auto_ptr Charset::transcode_buf2dom(const char *buf, size_t buf_size) {
633: return GdomeDOMString_auto_ptr((gchar*)transcode_buf2xchar(buf, buf_size));
1.1 paf 634: }
1.12 paf 635: GdomeDOMString_auto_ptr Charset::transcode(const String& s) {
1.1 paf 636: const char *cstr=s.cstr(String::UL_UNSPECIFIED);
637:
1.24 paf 638: return transcode_buf2dom(cstr, strlen(cstr));
1.1 paf 639: }
640: #endif
E-mail: