Annotation of parser3/src/main/pa_charset.C, revision 1.33.2.3
1.1 paf 1: /** @file
2: Parser: Charset connection implementation.
3:
1.33 paf 4: Copyright(c) 2001, 2003 ArtLebedev Group (http://www.artlebedev.com)
1.4 paf 5: Author: Alexander Petrosyan<paf@design.ru>(http://paf.design.ru)
1.27 paf 6: */
1.1 paf 7:
1.33.2.3! paf 8: static const char* IDENT_CHARSET_C="$Date: 2003/01/28 11:38:50 $";
1.1 paf 9:
10: #include "pa_charset.h"
1.33.2.2 paf 11: //#include "pa_globals.h"
1.1 paf 12:
13: #ifdef XML
1.8 paf 14: #include "libxml/encoding.h"
1.1 paf 15: #endif
16:
17: // globals
18:
1.33.2.2 paf 19: #define CHARSET_UTF8_NAME "UTF-8"
20:
21: CharsetPtr UTF8_charset(new Charset(ConstStringPtr(new String(CHARSET_UTF8_NAME)),
1.33.2.3! paf 22: ConstStringPtrZero/*no file=system*/));
1.1 paf 23:
24: // consts
25:
26: #define MAX_CHARSET_UNI_CODES 500
27:
28: // helpers
29:
30: inline void prepare_case_tables(unsigned char *tables) {
31: unsigned char *lcc_table=tables+lcc_offset;
32: unsigned char *fcc_table=tables+fcc_offset;
33: for(int i=0; i<0x100; i++)
34: lcc_table[i]=fcc_table[i]=i;
35: }
36: inline void cstr2ctypes(unsigned char *tables, const unsigned char *cstr,
37: unsigned char bit) {
38: unsigned char *ctypes_table=tables+ctypes_offset;
39: ctypes_table[0]=bit;
40: for(; *cstr; cstr++) {
41: unsigned char c=*cstr;
42: ctypes_table[c]|=bit;
43: }
44: }
/// Converts a definition-file cell to a character code.
///
/// - null or empty cell: 0
/// - exactly one character: the code of that character itself
///   (so "5" yields the code of the digit '5', not the number 5)
/// - anything longer: a number parsed with base auto-detection
///   (decimal, 0x... hexadecimal, 0... octal)
///
/// Parsing is unsigned so that large codes are not clamped at LONG_MAX
/// on platforms where long is 32 bits wide.
inline unsigned int to_wchar_code(const char *cstr) {
	if(!cstr || !*cstr)
		return 0;
	if(cstr[1]==0)
		return (unsigned int)(unsigned char)cstr[0];

	// end position is not needed: trailing garbage is ignored, as before
	return (unsigned int)strtoul(cstr, 0, 0);
}
/// A cell counts as "true" when it is present and non-empty,
/// whatever its content is.
inline bool to_bool(const char *cstr) {
	if(!cstr)
		return false;
	return cstr[0]!='\0';
}
57: static void element2ctypes(unsigned char c, bool belongs,
58: unsigned char *tables, unsigned char bit, int group_offset=-1) {
59: if(!belongs)
60: return;
61:
62: unsigned char *ctypes_table=tables+ctypes_offset;
63:
64: ctypes_table[c]|=bit;
65: if(group_offset>=0)
1.4 paf 66: tables[cbits_offset+group_offset+c/8] |= 1<<(c%8);
1.1 paf 67: }
68: static void element2case(unsigned char from, unsigned char to,
69: unsigned char *tables) {
70: if(!to)
71: return;
72:
73: unsigned char *lcc_table=tables+lcc_offset;
74: unsigned char *fcc_table=tables+fcc_offset;
75: lcc_table[from]=to;
76: fcc_table[from]=to; fcc_table[to]=from;
77: }
78:
79: // methods
80:
81: extern "C" unsigned char pcre_default_tables[]; // pcre/chartables.c
1.33.2.3! paf 82: Charset::Charset(ConstStringPtr aname, ConstStringPtr afile_spec):
1.33.2.1 paf 83: fname(aname),
84: fname_cstr(aname->cstr()) {
1.1 paf 85:
1.33.2.1 paf 86: for(char *c=fname_cstr; *c; c++)
1.10 paf 87: *c = toupper(*c);
1.7 paf 88:
1.33.2.3! paf 89: if(afile_spec) {
1.1 paf 90: fisUTF8=false;
1.33.2.3! paf 91: load_definition(fname);
1.1 paf 92: #ifdef XML
1.33.2.1 paf 93: addEncoding(fname_cstr);
1.1 paf 94: #endif
95: } else {
96: fisUTF8=true;
1.4 paf 97: // grab default onces [for UTF-8 so to be able to make a-z =>A-Z
1.1 paf 98: memcpy(pcre_tables, pcre_default_tables, sizeof(pcre_tables));
99: }
100:
101: #ifdef XML
1.33.2.1 paf 102: initTranscoder(fname, fname_cstr);
1.1 paf 103: #endif
104: }
105:
106: Charset::~Charset() {
1.33.2.1 paf 107: // @todonow unregister encodings
1.1 paf 108: #ifdef XML
1.9 paf 109: // not deleting transcoder, that's not our business
1.1 paf 110: #endif
111: }
112:
1.33.2.3! paf 113: void Charset::load_definition(ConstStringPtr afile_spec) {
1.1 paf 114: // pcre_tables
115: // lowcase, flipcase, bits digit+word+whitespace, masks
116:
117: // must not move this inside of prepare_case_tables
118: // don't know the size there
119: memset(pcre_tables, 0, sizeof(pcre_tables));
120: prepare_case_tables(pcre_tables);
1.4 paf 121: cstr2ctypes(pcre_tables,(const unsigned char *)"*+?{^.$|()[", ctype_meta);
1.1 paf 122:
123: // charset
1.10 paf 124: memset(tables.fromTable, 0, sizeof(tables.fromTable));
1.33.2.3! paf 125: tables.toTable=(Charset_TransRec *)pool_for_load.calloc(sizeof(Charset_TransRec)*MAX_CHARSET_UNI_CODES);
1.10 paf 126: tables.toTableSize=0;
1.1 paf 127: // strangly vital
1.10 paf 128: tables.toTable[tables.toTableSize].intCh=0;
129: tables.toTable[tables.toTableSize].extCh=(XMLByte)0;
130: tables.toTableSize++;
1.1 paf 131:
132: // loading text
1.33.2.3! paf 133: char *data=file_read_text(pool_for_load, *UTF8_charset, afile_spec);
1.1 paf 134:
135: // ignore header
136: getrow(&data);
137:
138: // parse cells
139: char *row;
140: while(row=getrow(&data)) {
141: // remove empty&comment lines
142: if(!*row || *row=='#')
143: continue;
144:
145: // char white-space digit hex-digit letter word lowercase unicode1 unicode2
146: unsigned int c=0;
147: char *cell;
148: for(int column=0; cell=lsplit(&row, '\t'); column++) {
149: switch(column) {
150: case 0: c=to_wchar_code(cell); break;
151: // pcre_tables
152: case 1: element2ctypes(c, to_bool(cell), pcre_tables, ctype_space, cbit_space); break;
153: case 2: element2ctypes(c, to_bool(cell), pcre_tables, ctype_digit, cbit_digit); break;
154: case 3: element2ctypes(c, to_bool(cell), pcre_tables, ctype_xdigit); break;
155: case 4: element2ctypes(c, to_bool(cell), pcre_tables, ctype_letter); break;
156: case 5: element2ctypes(c, to_bool(cell), pcre_tables, ctype_word, cbit_word); break;
157: case 6: element2case(c, to_wchar_code(cell), pcre_tables); break;
158: case 7:
159: case 8:
160: // charset
1.10 paf 161: if(tables.toTableSize>MAX_CHARSET_UNI_CODES)
1.23 paf 162: throw Exception("parser.runtime",
1.33.2.1 paf 163: afile_spec,
1.1 paf 164: "charset must contain not more then %d unicode values", MAX_CHARSET_UNI_CODES);
165:
166: XMLCh unicode=(XMLCh)to_wchar_code(cell);
167: if(!unicode && column==7/*unicode1 column*/)
168: unicode=(XMLCh)c;
169: if(unicode) {
1.10 paf 170: if(!tables.fromTable[c])
171: tables.fromTable[c]=unicode;
172: tables.toTable[tables.toTableSize].intCh=unicode;
173: tables.toTable[tables.toTableSize].extCh=(XMLByte)c;
174: tables.toTableSize++;
1.1 paf 175: }
176: break;
177: }
178: }
179: };
180:
181: // sort by the Unicode code point
182: sort_ToTable();
183: }
184:
185: static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
186: return
187: static_cast<const Charset_TransRec *>(a)->intCh-
188: static_cast<const Charset_TransRec *>(b)->intCh;
189: }
190:
191: void Charset::sort_ToTable() {
1.10 paf 192: _qsort(tables.toTable, tables.toTableSize, sizeof(*tables.toTable),
1.1 paf 193: sort_cmp_Trans_rec_intCh);
194: //FILE *f=fopen("c:\\temp\\a", "wb");
1.10 paf 195: //fwrite(tables.toTable, tables.toTableSize, sizeof(*tables.toTable), f);
1.1 paf 196: //fclose(f);
197: }
198:
1.10 paf 199: static XMLByte xlatOneTo(const XMLCh toXlat,
1.25 paf 200: const Charset::Tables& tables,
201: XMLByte not_found) {
1.1 paf 202: unsigned int lowOfs = 0;
1.10 paf 203: unsigned int hiOfs = tables.toTableSize - 1;
1.1 paf 204: XMLByte curByte = 0;
205: do {
206: // Calc the mid point of the low and high offset.
1.4 paf 207: const unsigned int midOfs =((hiOfs - lowOfs) / 2)+lowOfs;
1.1 paf 208:
209: // If our test char is greater than the mid point char, then
210: // we move up to the upper half. Else we move to the lower
211: // half. If its equal, then its our guy.
1.10 paf 212: if(toXlat>tables.toTable[midOfs].intCh)
1.1 paf 213: lowOfs = midOfs;
1.10 paf 214: else if(toXlat<tables.toTable[midOfs].intCh)
1.1 paf 215: hiOfs = midOfs;
216: else
1.10 paf 217: return tables.toTable[midOfs].extCh;
1.4 paf 218: } while(lowOfs+1<hiOfs);
1.1 paf 219:
1.25 paf 220: return not_found;
1.1 paf 221: }
222:
223: void Charset::transcode(Pool& pool,
224: const Charset& source_charset, const void *source_body, size_t source_content_length,
225: const Charset& dest_charset, const void *& dest_body, size_t& dest_content_length
226: ) {
1.4 paf 227: if(!source_content_length) {
228: dest_body=0;
229: dest_content_length=0;
230: return;
231: }
232:
1.1 paf 233: switch((source_charset.isUTF8()?0x10:0x00)|(dest_charset.isUTF8()?0x01:0x00)) {
234: default: // 0x00
235: source_charset.transcodeToCharset(pool, dest_charset,
236: source_body, source_content_length,
237: dest_body, dest_content_length);
238: break;
239: case 0x01:
240: source_charset.transcodeToUTF8(pool,
241: source_body, source_content_length,
242: dest_body, dest_content_length);
243: break;
244: case 0x10:
245: dest_charset.transcodeFromUTF8(pool,
246: source_body, source_content_length,
247: dest_body, dest_content_length);
248: break;
249: case 0x11:
250: dest_body=source_body;
251: dest_content_length=source_content_length;
252: break;
253: }
254: }
255:
256: // ---------------------------------------------------------------------------
257: // Local static data
258: //
259: // gUTFBytes
260: // A list of counts of trailing bytes for each initial byte in the input.
261: //
262: // gUTFOffsets
263: // A list of values to offset each result char type, according to how
264: // many source bytes when into making it.
265: //
266: // gFirstByteMark
267: // A list of values to mask onto the first byte of an encoded sequence,
268: // indexed by the number of bytes used to create the sequence.
269: // ---------------------------------------------------------------------------
270: static const XMLByte gUTFBytes[0x100] = {
271: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
272: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
273: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
274: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
275: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
276: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
277: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
278: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
279: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
280: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
281: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
282: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
283: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
284: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
285: , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
286: , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
287: };
288:
289: static const uint gUTFOffsets[6] = {
290: 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
291: };
292:
293: static const XMLByte gFirstByteMark[7] = {
294: 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
295: };
296:
1.11 paf 297: static int transcodeToUTF8(
298: const XMLByte* srcData, size_t& srcLen,
299: XMLByte *toFill, size_t& toFillLen,
1.10 paf 300: const Charset::Tables& tables) {
1.11 paf 301: const XMLByte* srcPtr=srcData;
302: const XMLByte* srcEnd=srcData+srcLen;
303: XMLByte* outPtr=toFill;
304: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 305:
1.4 paf 306: while(srcPtr<srcEnd) {
1.10 paf 307: uint curVal = tables.fromTable[*srcPtr];
1.1 paf 308: if(!curVal) {
309: // use the replacement character
1.4 paf 310: *outPtr++= '?';
311: srcPtr++;
1.1 paf 312: continue;
313: }
314:
315: // Figure out how many bytes we need
316: unsigned int encodedBytes;
1.4 paf 317: if(curVal<0x80)
1.1 paf 318: encodedBytes = 1;
1.4 paf 319: else if(curVal<0x800)
1.1 paf 320: encodedBytes = 2;
1.4 paf 321: else if(curVal<0x10000)
1.1 paf 322: encodedBytes = 3;
1.4 paf 323: else if(curVal<0x200000)
1.1 paf 324: encodedBytes = 4;
1.4 paf 325: else if(curVal<0x4000000)
1.1 paf 326: encodedBytes = 5;
1.4 paf 327: else if(curVal<= 0x7FFFFFFF)
1.1 paf 328: encodedBytes = 6;
329: else {
330: // use the replacement character
1.4 paf 331: *outPtr++= '?';
332: srcPtr++;
1.1 paf 333: continue;
334: }
335:
1.10 paf 336: // If we cannot fully get this char into the output buffer
337: if (outPtr + encodedBytes > outEnd)
338: break;
1.1 paf 339:
340: // We can do it, so update the source index
341: srcPtr++;
342:
343: // And spit out the bytes. We spit them out in reverse order
344: // here, so bump up the output pointer and work down as we go.
1.4 paf 345: outPtr+= encodedBytes;
1.1 paf 346: switch(encodedBytes) {
1.18 paf 347: case 6: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 348: curVal>>= 6;
1.18 paf 349: case 5: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 350: curVal>>= 6;
1.18 paf 351: case 4: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 352: curVal>>= 6;
1.18 paf 353: case 3: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 354: curVal>>= 6;
1.18 paf 355: case 2: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 356: curVal>>= 6;
1.18 paf 357: case 1: *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]);
1.1 paf 358: }
359:
360: // Add the encoded bytes back in again to indicate we've eaten them
1.4 paf 361: outPtr+= encodedBytes;
1.1 paf 362: }
363:
1.11 paf 364: // Update the bytes eaten
365: srcLen = srcPtr - srcData;
366:
367: // Return the characters read
368: toFillLen = outPtr - toFill;
369:
1.29 paf 370: //return srcPtr==srcEnd?(int)toFillLen:-1;
371: /*
372: xmlCharEncodingInputFunc
373: Returns :
374: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
375: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
376: of ocetes consumed.
377: */
378: return 0;
1.1 paf 379: }
1.26 paf 380: /// @todo digital entites only when xml/html output [at output in html/xml mode, in html part of a letter]
1.30 paf 381: static int transcodeFromUTF8(
1.11 paf 382: const XMLByte *srcData, size_t& srcLen,
383: XMLByte* toFill, size_t& toFillLen,
384: const Charset::Tables& tables) {
385: const XMLByte* srcPtr=srcData;
386: const XMLByte* srcEnd=srcData+srcLen;
387: XMLByte* outPtr=toFill;
388: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 389:
1.10 paf 390: // We now loop until we either run out of input data, or room to store
391: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
1.1 paf 392: // Get the next leading byte out
393: const XMLByte firstByte = *srcPtr;
394:
1.4 paf 395: // Special-case ASCII, which is a leading byte value of<= 127
396: if(firstByte<= 127) {
397: *outPtr++= firstByte;
1.1 paf 398: srcPtr++;
399: continue;
400: }
401:
402: // See how many trailing src bytes this sequence is going to require
403: const unsigned int trailingBytes = gUTFBytes[firstByte];
404:
405: // If there are not enough source bytes to do this one, then we
1.4 paf 406: // are done. Note that we done>= here because we are implicitly
1.1 paf 407: // counting the 1 byte we get no matter what.
1.4 paf 408: if(srcPtr+trailingBytes>= srcEnd)
1.1 paf 409: break;
410:
411: // Looks ok, so lets build up the value
412: uint tmpVal=0;
413: switch(trailingBytes) {
414: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
415: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
416: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
417: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
418: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
419: case 0: tmpVal+=*srcPtr++;
420: break;
421:
422: default:
1.23 paf 423: throw Exception(0,
1.33.2.1 paf 424: Exception::undefined_source,
1.4 paf 425: "transcodeFromUTF8 error: wrong trailingBytes value(%d)", trailingBytes);
1.1 paf 426: }
427: tmpVal-=gUTFOffsets[trailingBytes];
428:
429: // If it will fit into a single char, then put it in. Otherwise
430: // fail [*encode it as a surrogate pair. If its not valid, use the
431: // replacement char.*]
1.25 paf 432: if(!(tmpVal & 0xFFFF0000)) {
433: if(XMLByte xlat=xlatOneTo(tmpVal, tables, 0))
434: *outPtr++=xlat;
435: else
436: outPtr+=sprintf((char *)outPtr, "&#%d;", tmpVal); // &#decimal;
437: } else
1.23 paf 438: throw Exception(0,
1.33.2.1 paf 439: Exception::undefined_source,
1.4 paf 440: "transcodeFromUTF8 error: too big tmpVal(0x%08X)", tmpVal);
1.1 paf 441: }
442:
1.11 paf 443: // Update the bytes eaten
444: srcLen = srcPtr - srcData;
445:
446: // Return the characters read
447: toFillLen = outPtr - toFill;
448:
1.29 paf 449: //return srcPtr==srcEnd?(int)toFillLen:-1;
450: /*
451: xmlCharEncodingOutputFunc
452: Returns :
453: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
454: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
455: of ocetes consumed.
456: */
457: return 0;
1.10 paf 458: }
459:
460: /// @todo not so memory-hungry with prescan
461: void Charset::transcodeToUTF8(Pool& pool,
462: const void *source_body, size_t source_content_length,
1.11 paf 463: const void *& adest_body, size_t& dest_content_length) const {
1.25 paf 464: dest_content_length=source_content_length*6/*so that surly enough, max utf8 seq len=6*/;
1.11 paf 465: XMLByte *dest_body=(XMLByte*)pool.malloc(dest_content_length);
466:
467: if(::transcodeToUTF8(
468: (XMLByte *)source_body, source_content_length,
469: dest_body, dest_content_length,
470: tables)<0)
1.10 paf 471: throw(0, 0,
472: 0,
1.11 paf 473: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 474:
1.1 paf 475: // return
476: adest_body=dest_body;
1.10 paf 477: }
478: void Charset::transcodeFromUTF8(Pool& pool,
479: const void *source_body, size_t source_content_length,
1.11 paf 480: const void *& adest_body, size_t& dest_content_length) const {
1.25 paf 481: dest_content_length=source_content_length*6/*so that surly enough, "ÿ" has max ratio */;
1.11 paf 482: XMLByte *dest_body=(XMLByte*)pool.malloc(dest_content_length);
483:
484: if(::transcodeFromUTF8(
485: (XMLByte *)source_body, source_content_length,
486: dest_body, dest_content_length,
487: tables)<0)
1.10 paf 488: throw(0, 0,
489: 0,
1.11 paf 490: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 491:
492: // return
493: adest_body=dest_body;
1.1 paf 494: }
495:
496: /// transcode using both charsets
497: void Charset::transcodeToCharset(Pool& pool,
498: const Charset& dest_charset,
499: const void *source_body, size_t source_content_length,
1.6 paf 500: const void *& adest_body, size_t& adest_content_length) const {
1.3 paf 501: if(&dest_charset==this) {
1.6 paf 502: adest_body=source_body;
503: adest_content_length=source_content_length;
504: } else {
505: size_t dest_content_length=source_content_length;
506: unsigned char *dest_body=(unsigned char *)pool.malloc(dest_content_length);
507:
1.11 paf 508: const XMLByte* srcPtr=(XMLByte *)source_body;
509: const XMLByte* srcEnd=(XMLByte *)source_body+source_content_length;
1.6 paf 510:
511: for(XMLByte* outPtr=dest_body; srcPtr<srcEnd; srcPtr++) {
1.10 paf 512: XMLCh curVal = tables.fromTable[*srcPtr];
1.6 paf 513: if(curVal)
1.25 paf 514: *outPtr++=xlatOneTo(curVal, dest_charset.tables, '?');
1.6 paf 515: else {
516: // use the replacement character
517: *outPtr++= '?';
518: }
519: }
1.1 paf 520:
1.6 paf 521: adest_body=dest_body;
522: adest_content_length=dest_content_length;
523: }
1.1 paf 524: }
525:
526: #ifdef XML
1.10 paf 527: static int xml256CharEncodingInputFunc (
528: unsigned char *out,
529: int *outlen,
530: const unsigned char *in,
531: int *inlen,
532: void *info) {
533: return transcodeToUTF8(
1.21 paf 534: in, *(size_t*)inlen,
535: out, *(size_t*)outlen,
1.10 paf 536: *(const Charset::Tables *)info);
537: }
538:
539: static int xml256CharEncodingOutputFunc (
540: unsigned char *out,
541: int *outlen,
542: const unsigned char *in,
543: int *inlen,
544: void *info) {
545: return transcodeFromUTF8(
1.21 paf 546: in, *(size_t*)inlen,
547: out, *(size_t*)outlen,
1.10 paf 548: *(const Charset::Tables *)info);
549: }
550:
551:
552: void Charset::addEncoding(char *name_cstr) {
553: xmlCharEncodingHandler *handler=
554: (xmlCharEncodingHandler *)malloc(sizeof(xmlCharEncodingHandler));
555: handler->name=name_cstr;
556: handler->input=xml256CharEncodingInputFunc; handler->inputInfo=&tables;
557: handler->output=xml256CharEncodingOutputFunc; handler->outputInfo=&tables;
558:
559: xmlRegisterCharEncodingHandler(handler);
560: }
561:
562: void Charset::initTranscoder(const String *source, const char *name_cstr) {
1.15 paf 563: ftranscoder=xmlFindCharEncodingHandler(name_cstr);
564: transcoder(source); // check right way
565: }
566:
567: xmlCharEncodingHandler *Charset::transcoder(const String *source) {
568: if(!ftranscoder)
1.23 paf 569: throw Exception("parser.runtime",
1.10 paf 570: source,
571: "unsupported encoding");
1.15 paf 572: return ftranscoder;
1.10 paf 573: }
574:
1.14 paf 575: const char *Charset::transcode_cstr(xmlChar *s) {
1.13 paf 576: if(!s)
1.14 paf 577: return "";
1.8 paf 578:
1.14 paf 579: int inlen=strlen((const char *)s);
1.8 paf 580: int outlen=inlen+1; // max
581: char *out=(char *)malloc(outlen*sizeof(char));
582:
1.30 paf 583: int error;
1.17 paf 584: if(xmlCharEncodingOutputFunc output=transcoder(0)->output) {
1.30 paf 585: error=output(
1.17 paf 586: (unsigned char*)out, &outlen,
587: (const unsigned char*)s, &inlen,
588: transcoder(0)->outputInfo);
1.30 paf 589: } else {
590: memcpy(out, s, outlen=inlen);
591: error=0;
592: }
593: if(error<0)
1.23 paf 594: throw Exception(0,
1.8 paf 595: 0,
1.30 paf 596: "transcode_cstr failed (%d)", error);
1.8 paf 597:
1.30 paf 598: out[outlen/*surely would be less then on input*/]=0;
1.8 paf 599: return out;
1.14 paf 600: }
1.31 paf 601: String& Charset::transcode(xmlChar *s
602: #ifndef NO_STRING_ORIGIN
603: , const String *origin
604: #endif
605: ) {
606: String& result=*NEW String(pool());
607: result.APPEND_CLEAN(transcode_cstr(s), 0/*auto-size*/, origin->origin().file, origin->origin().line);
608: return result;
1.14 paf 609: }
610: const char *Charset::transcode_cstr(GdomeDOMString *s) {
611: return s?transcode_cstr(BAD_CAST s->str):"";
1.1 paf 612: }
1.31 paf 613: String& Charset::transcode(GdomeDOMString *s
614: #ifndef NO_STRING_ORIGIN
615: , const String *origin
616: #endif
617: ) {
618: String& result=*NEW String(pool());
619: result.APPEND_CLEAN(transcode_cstr(s), 0/*auto-size*/, origin->origin().file, origin->origin().line);
620: return result;
1.1 paf 621: }
622:
1.8 paf 623: /// @test less memory using -maybe- xmlParserInputBufferCreateMem
1.24 paf 624: xmlChar *Charset::transcode_buf2xchar(const char *buf, size_t buf_size) {
1.30 paf 625: unsigned char *out;
626: int outlen;
627: int error;
1.17 paf 628: if(xmlCharEncodingInputFunc input=transcoder(0)->input) {
1.32 paf 629: outlen=buf_size*6/*max*/;
630: out=(unsigned char*)malloc((outlen+1)*sizeof(unsigned char));
1.30 paf 631: error=input(
1.17 paf 632: out, &outlen,
633: (const unsigned char *)buf, (int *)&buf_size,
634: transcoder(0)->inputInfo);
1.30 paf 635: } else {
636: outlen=buf_size;
1.32 paf 637: out=(unsigned char*)malloc((outlen+1)*sizeof(unsigned char));
1.30 paf 638: memcpy(out, buf, outlen);
639: error=0;
640: }
1.17 paf 641:
1.30 paf 642: if(error<0)
1.23 paf 643: throw Exception(0,
1.8 paf 644: 0,
1.30 paf 645: "transcode_buf failed (%d)", error);
1.8 paf 646:
1.30 paf 647: out[outlen/*surely would be less then on input*/]=0;
1.24 paf 648: return (xmlChar *)out;
649: }
650: GdomeDOMString_auto_ptr Charset::transcode_buf2dom(const char *buf, size_t buf_size) {
651: return GdomeDOMString_auto_ptr((gchar*)transcode_buf2xchar(buf, buf_size));
1.1 paf 652: }
1.12 paf 653: GdomeDOMString_auto_ptr Charset::transcode(const String& s) {
1.1 paf 654: const char *cstr=s.cstr(String::UL_UNSPECIFIED);
655:
1.24 paf 656: return transcode_buf2dom(cstr, strlen(cstr));
1.1 paf 657: }
658: #endif
E-mail: