Annotation of parser3/src/main/pa_charset.C, revision 1.33.2.7
1.1 paf 1: /** @file
2: Parser: Charset connection implementation.
3:
1.33.2.6 paf 4: Copyright(c) 2001-2003 ArtLebedev Group (http://www.artlebedev.com)
1.4 paf 5: Author: Alexander Petrosyan<paf@design.ru>(http://paf.design.ru)
1.27 paf 6: */
1.1 paf 7:
1.33.2.7! paf 8: static const char* IDENT_CHARSET_C="$Date: 2003/01/31 12:34:35 $";
1.1 paf 9:
10: #include "pa_charset.h"
11:
12: #ifdef XML
1.8 paf 13: #include "libxml/encoding.h"
1.1 paf 14: #endif
15:
16: // globals
17:
1.33.2.2 paf 18: #define CHARSET_UTF8_NAME "UTF-8"
19:
1.33.2.5 paf 20: CharsetPtr UTF8_charset(new Charset(StringPtr(new String(CHARSET_UTF8_NAME)),
21: StringPtr(0)/*no file=system*/));
1.1 paf 22:
23: // consts
24:
25: #define MAX_CHARSET_UNI_CODES 500
26:
27: // helpers
28:
29: inline void prepare_case_tables(unsigned char *tables) {
30: unsigned char *lcc_table=tables+lcc_offset;
31: unsigned char *fcc_table=tables+fcc_offset;
32: for(int i=0; i<0x100; i++)
33: lcc_table[i]=fcc_table[i]=i;
34: }
35: inline void cstr2ctypes(unsigned char *tables, const unsigned char *cstr,
36: unsigned char bit) {
37: unsigned char *ctypes_table=tables+ctypes_offset;
38: ctypes_table[0]=bit;
39: for(; *cstr; cstr++) {
40: unsigned char c=*cstr;
41: ctypes_table[c]|=bit;
42: }
43: }
/// Converts a definition-file cell to a character code.
/// - null or empty cell: 0
/// - exactly one character: the code of that character itself
///   (so "5" yields 53, not 5)
/// - anything longer: parsed by strtol with base 0
///   (decimal, 0x-prefixed hex or 0-prefixed octal)
inline unsigned int to_wchar_code(const char* cstr) {
	const bool is_empty=cstr==0 || cstr[0]==0;
	if(is_empty)
		return 0;

	const bool is_single_char=cstr[1]==0;
	if(!is_single_char)
		return (unsigned int)strtol(cstr, 0/*end position not needed*/, 0);

	return (unsigned int)(unsigned char)cstr[0];
}
/// A cell counts as "true" when it is present and non-empty;
/// its actual text does not matter.
inline bool to_bool(const char* cstr) {
	if(!cstr)
		return false;
	return cstr[0]!=0;
}
56: static void element2ctypes(unsigned char c, bool belongs,
57: unsigned char *tables, unsigned char bit, int group_offset=-1) {
58: if(!belongs)
59: return;
60:
61: unsigned char *ctypes_table=tables+ctypes_offset;
62:
63: ctypes_table[c]|=bit;
64: if(group_offset>=0)
1.4 paf 65: tables[cbits_offset+group_offset+c/8] |= 1<<(c%8);
1.1 paf 66: }
67: static void element2case(unsigned char from, unsigned char to,
68: unsigned char *tables) {
69: if(!to)
70: return;
71:
72: unsigned char *lcc_table=tables+lcc_offset;
73: unsigned char *fcc_table=tables+fcc_offset;
74: lcc_table[from]=to;
75: fcc_table[from]=to; fcc_table[to]=from;
76: }
77:
78: // methods
79:
80: extern "C" unsigned char pcre_default_tables[]; // pcre/chartables.c
1.33.2.5 paf 81: Charset::Charset(StringPtr aname, StringPtr afile_spec):
1.33.2.1 paf 82: fname(aname),
83: fname_cstr(aname->cstr()) {
1.1 paf 84:
1.33.2.1 paf 85: for(char *c=fname_cstr; *c; c++)
1.10 paf 86: *c = toupper(*c);
1.7 paf 87:
1.33.2.3 paf 88: if(afile_spec) {
1.1 paf 89: fisUTF8=false;
1.33.2.3 paf 90: load_definition(fname);
1.1 paf 91: #ifdef XML
1.33.2.1 paf 92: addEncoding(fname_cstr);
1.1 paf 93: #endif
94: } else {
95: fisUTF8=true;
1.4 paf 96: // grab default onces [for UTF-8 so to be able to make a-z =>A-Z
1.1 paf 97: memcpy(pcre_tables, pcre_default_tables, sizeof(pcre_tables));
98: }
99:
100: #ifdef XML
1.33.2.1 paf 101: initTranscoder(fname, fname_cstr);
1.1 paf 102: #endif
103: }
104:
105: Charset::~Charset() {
1.33.2.1 paf 106: // @todonow unregister encodings
1.1 paf 107: #ifdef XML
1.9 paf 108: // not deleting transcoder, that's not our business
1.1 paf 109: #endif
110: }
111:
1.33.2.5 paf 112: void Charset::load_definition(StringPtr afile_spec) {
1.1 paf 113: // pcre_tables
114: // lowcase, flipcase, bits digit+word+whitespace, masks
115:
116: // must not move this inside of prepare_case_tables
117: // don't know the size there
118: memset(pcre_tables, 0, sizeof(pcre_tables));
119: prepare_case_tables(pcre_tables);
1.4 paf 120: cstr2ctypes(pcre_tables,(const unsigned char *)"*+?{^.$|()[", ctype_meta);
1.1 paf 121:
122: // charset
1.10 paf 123: memset(tables.fromTable, 0, sizeof(tables.fromTable));
1.33.2.3 paf 124: tables.toTable=(Charset_TransRec *)pool_for_load.calloc(sizeof(Charset_TransRec)*MAX_CHARSET_UNI_CODES);
1.10 paf 125: tables.toTableSize=0;
1.1 paf 126: // strangly vital
1.10 paf 127: tables.toTable[tables.toTableSize].intCh=0;
128: tables.toTable[tables.toTableSize].extCh=(XMLByte)0;
129: tables.toTableSize++;
1.1 paf 130:
131: // loading text
1.33.2.3 paf 132: char *data=file_read_text(pool_for_load, *UTF8_charset, afile_spec);
1.1 paf 133:
134: // ignore header
135: getrow(&data);
136:
137: // parse cells
138: char *row;
139: while(row=getrow(&data)) {
140: // remove empty&comment lines
141: if(!*row || *row=='#')
142: continue;
143:
144: // char white-space digit hex-digit letter word lowercase unicode1 unicode2
145: unsigned int c=0;
146: char *cell;
147: for(int column=0; cell=lsplit(&row, '\t'); column++) {
148: switch(column) {
149: case 0: c=to_wchar_code(cell); break;
150: // pcre_tables
151: case 1: element2ctypes(c, to_bool(cell), pcre_tables, ctype_space, cbit_space); break;
152: case 2: element2ctypes(c, to_bool(cell), pcre_tables, ctype_digit, cbit_digit); break;
153: case 3: element2ctypes(c, to_bool(cell), pcre_tables, ctype_xdigit); break;
154: case 4: element2ctypes(c, to_bool(cell), pcre_tables, ctype_letter); break;
155: case 5: element2ctypes(c, to_bool(cell), pcre_tables, ctype_word, cbit_word); break;
156: case 6: element2case(c, to_wchar_code(cell), pcre_tables); break;
157: case 7:
158: case 8:
159: // charset
1.10 paf 160: if(tables.toTableSize>MAX_CHARSET_UNI_CODES)
1.23 paf 161: throw Exception("parser.runtime",
1.33.2.1 paf 162: afile_spec,
1.1 paf 163: "charset must contain not more then %d unicode values", MAX_CHARSET_UNI_CODES);
164:
165: XMLCh unicode=(XMLCh)to_wchar_code(cell);
166: if(!unicode && column==7/*unicode1 column*/)
167: unicode=(XMLCh)c;
168: if(unicode) {
1.10 paf 169: if(!tables.fromTable[c])
170: tables.fromTable[c]=unicode;
171: tables.toTable[tables.toTableSize].intCh=unicode;
172: tables.toTable[tables.toTableSize].extCh=(XMLByte)c;
173: tables.toTableSize++;
1.1 paf 174: }
175: break;
176: }
177: }
178: };
179:
180: // sort by the Unicode code point
181: sort_ToTable();
182: }
183:
184: static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
185: return
186: static_cast<const Charset_TransRec *>(a)->intCh-
187: static_cast<const Charset_TransRec *>(b)->intCh;
188: }
189:
190: void Charset::sort_ToTable() {
1.10 paf 191: _qsort(tables.toTable, tables.toTableSize, sizeof(*tables.toTable),
1.1 paf 192: sort_cmp_Trans_rec_intCh);
193: //FILE *f=fopen("c:\\temp\\a", "wb");
1.10 paf 194: //fwrite(tables.toTable, tables.toTableSize, sizeof(*tables.toTable), f);
1.1 paf 195: //fclose(f);
196: }
197:
1.10 paf 198: static XMLByte xlatOneTo(const XMLCh toXlat,
1.25 paf 199: const Charset::Tables& tables,
200: XMLByte not_found) {
1.1 paf 201: unsigned int lowOfs = 0;
1.10 paf 202: unsigned int hiOfs = tables.toTableSize - 1;
1.1 paf 203: XMLByte curByte = 0;
204: do {
205: // Calc the mid point of the low and high offset.
1.4 paf 206: const unsigned int midOfs =((hiOfs - lowOfs) / 2)+lowOfs;
1.1 paf 207:
208: // If our test char is greater than the mid point char, then
209: // we move up to the upper half. Else we move to the lower
210: // half. If its equal, then its our guy.
1.10 paf 211: if(toXlat>tables.toTable[midOfs].intCh)
1.1 paf 212: lowOfs = midOfs;
1.10 paf 213: else if(toXlat<tables.toTable[midOfs].intCh)
1.1 paf 214: hiOfs = midOfs;
215: else
1.10 paf 216: return tables.toTable[midOfs].extCh;
1.4 paf 217: } while(lowOfs+1<hiOfs);
1.1 paf 218:
1.25 paf 219: return not_found;
1.1 paf 220: }
221:
222: void Charset::transcode(Pool& pool,
223: const Charset& source_charset, const void *source_body, size_t source_content_length,
224: const Charset& dest_charset, const void *& dest_body, size_t& dest_content_length
225: ) {
1.4 paf 226: if(!source_content_length) {
227: dest_body=0;
228: dest_content_length=0;
229: return;
230: }
231:
1.1 paf 232: switch((source_charset.isUTF8()?0x10:0x00)|(dest_charset.isUTF8()?0x01:0x00)) {
233: default: // 0x00
234: source_charset.transcodeToCharset(pool, dest_charset,
235: source_body, source_content_length,
236: dest_body, dest_content_length);
237: break;
238: case 0x01:
239: source_charset.transcodeToUTF8(pool,
240: source_body, source_content_length,
241: dest_body, dest_content_length);
242: break;
243: case 0x10:
244: dest_charset.transcodeFromUTF8(pool,
245: source_body, source_content_length,
246: dest_body, dest_content_length);
247: break;
248: case 0x11:
249: dest_body=source_body;
250: dest_content_length=source_content_length;
251: break;
252: }
253: }
254:
255: // ---------------------------------------------------------------------------
256: // Local static data
257: //
258: // gUTFBytes
259: // A list of counts of trailing bytes for each initial byte in the input.
260: //
261: // gUTFOffsets
262: // A list of values to offset each result char type, according to how
263: // many source bytes when into making it.
264: //
265: // gFirstByteMark
266: // A list of values to mask onto the first byte of an encoded sequence,
267: // indexed by the number of bytes used to create the sequence.
268: // ---------------------------------------------------------------------------
269: static const XMLByte gUTFBytes[0x100] = {
270: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
271: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
272: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
273: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
274: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
275: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
276: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
277: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
278: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
279: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
280: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
281: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
282: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
283: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
284: , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
285: , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
286: };
287:
288: static const uint gUTFOffsets[6] = {
289: 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
290: };
291:
292: static const XMLByte gFirstByteMark[7] = {
293: 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
294: };
295:
1.11 paf 296: static int transcodeToUTF8(
297: const XMLByte* srcData, size_t& srcLen,
298: XMLByte *toFill, size_t& toFillLen,
1.10 paf 299: const Charset::Tables& tables) {
1.11 paf 300: const XMLByte* srcPtr=srcData;
301: const XMLByte* srcEnd=srcData+srcLen;
302: XMLByte* outPtr=toFill;
303: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 304:
1.4 paf 305: while(srcPtr<srcEnd) {
1.10 paf 306: uint curVal = tables.fromTable[*srcPtr];
1.1 paf 307: if(!curVal) {
308: // use the replacement character
1.4 paf 309: *outPtr++= '?';
310: srcPtr++;
1.1 paf 311: continue;
312: }
313:
314: // Figure out how many bytes we need
315: unsigned int encodedBytes;
1.4 paf 316: if(curVal<0x80)
1.1 paf 317: encodedBytes = 1;
1.4 paf 318: else if(curVal<0x800)
1.1 paf 319: encodedBytes = 2;
1.4 paf 320: else if(curVal<0x10000)
1.1 paf 321: encodedBytes = 3;
1.4 paf 322: else if(curVal<0x200000)
1.1 paf 323: encodedBytes = 4;
1.4 paf 324: else if(curVal<0x4000000)
1.1 paf 325: encodedBytes = 5;
1.4 paf 326: else if(curVal<= 0x7FFFFFFF)
1.1 paf 327: encodedBytes = 6;
328: else {
329: // use the replacement character
1.4 paf 330: *outPtr++= '?';
331: srcPtr++;
1.1 paf 332: continue;
333: }
334:
1.10 paf 335: // If we cannot fully get this char into the output buffer
336: if (outPtr + encodedBytes > outEnd)
337: break;
1.1 paf 338:
339: // We can do it, so update the source index
340: srcPtr++;
341:
342: // And spit out the bytes. We spit them out in reverse order
343: // here, so bump up the output pointer and work down as we go.
1.4 paf 344: outPtr+= encodedBytes;
1.1 paf 345: switch(encodedBytes) {
1.18 paf 346: case 6: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 347: curVal>>= 6;
1.18 paf 348: case 5: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 349: curVal>>= 6;
1.18 paf 350: case 4: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 351: curVal>>= 6;
1.18 paf 352: case 3: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 353: curVal>>= 6;
1.18 paf 354: case 2: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 355: curVal>>= 6;
1.18 paf 356: case 1: *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]);
1.1 paf 357: }
358:
359: // Add the encoded bytes back in again to indicate we've eaten them
1.4 paf 360: outPtr+= encodedBytes;
1.1 paf 361: }
362:
1.11 paf 363: // Update the bytes eaten
364: srcLen = srcPtr - srcData;
365:
366: // Return the characters read
367: toFillLen = outPtr - toFill;
368:
1.29 paf 369: //return srcPtr==srcEnd?(int)toFillLen:-1;
370: /*
371: xmlCharEncodingInputFunc
372: Returns :
373: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
374: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
375: of ocetes consumed.
376: */
377: return 0;
1.1 paf 378: }
1.26 paf 379: /// @todo digital entites only when xml/html output [at output in html/xml mode, in html part of a letter]
1.30 paf 380: static int transcodeFromUTF8(
1.11 paf 381: const XMLByte *srcData, size_t& srcLen,
382: XMLByte* toFill, size_t& toFillLen,
383: const Charset::Tables& tables) {
384: const XMLByte* srcPtr=srcData;
385: const XMLByte* srcEnd=srcData+srcLen;
386: XMLByte* outPtr=toFill;
387: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 388:
1.10 paf 389: // We now loop until we either run out of input data, or room to store
390: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
1.1 paf 391: // Get the next leading byte out
392: const XMLByte firstByte = *srcPtr;
393:
1.4 paf 394: // Special-case ASCII, which is a leading byte value of<= 127
395: if(firstByte<= 127) {
396: *outPtr++= firstByte;
1.1 paf 397: srcPtr++;
398: continue;
399: }
400:
401: // See how many trailing src bytes this sequence is going to require
402: const unsigned int trailingBytes = gUTFBytes[firstByte];
403:
404: // If there are not enough source bytes to do this one, then we
1.4 paf 405: // are done. Note that we done>= here because we are implicitly
1.1 paf 406: // counting the 1 byte we get no matter what.
1.4 paf 407: if(srcPtr+trailingBytes>= srcEnd)
1.1 paf 408: break;
409:
410: // Looks ok, so lets build up the value
411: uint tmpVal=0;
412: switch(trailingBytes) {
413: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
414: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
415: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
416: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
417: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
418: case 0: tmpVal+=*srcPtr++;
419: break;
420:
421: default:
1.23 paf 422: throw Exception(0,
1.33.2.1 paf 423: Exception::undefined_source,
1.4 paf 424: "transcodeFromUTF8 error: wrong trailingBytes value(%d)", trailingBytes);
1.1 paf 425: }
426: tmpVal-=gUTFOffsets[trailingBytes];
427:
428: // If it will fit into a single char, then put it in. Otherwise
429: // fail [*encode it as a surrogate pair. If its not valid, use the
430: // replacement char.*]
1.25 paf 431: if(!(tmpVal & 0xFFFF0000)) {
432: if(XMLByte xlat=xlatOneTo(tmpVal, tables, 0))
433: *outPtr++=xlat;
434: else
435: outPtr+=sprintf((char *)outPtr, "&#%d;", tmpVal); // &#decimal;
436: } else
1.23 paf 437: throw Exception(0,
1.33.2.1 paf 438: Exception::undefined_source,
1.4 paf 439: "transcodeFromUTF8 error: too big tmpVal(0x%08X)", tmpVal);
1.1 paf 440: }
441:
1.11 paf 442: // Update the bytes eaten
443: srcLen = srcPtr - srcData;
444:
445: // Return the characters read
446: toFillLen = outPtr - toFill;
447:
1.29 paf 448: //return srcPtr==srcEnd?(int)toFillLen:-1;
449: /*
450: xmlCharEncodingOutputFunc
451: Returns :
452: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
453: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
454: of ocetes consumed.
455: */
456: return 0;
1.10 paf 457: }
458:
459: /// @todo not so memory-hungry with prescan
460: void Charset::transcodeToUTF8(Pool& pool,
461: const void *source_body, size_t source_content_length,
1.11 paf 462: const void *& adest_body, size_t& dest_content_length) const {
1.25 paf 463: dest_content_length=source_content_length*6/*so that surly enough, max utf8 seq len=6*/;
1.11 paf 464: XMLByte *dest_body=(XMLByte*)pool.malloc(dest_content_length);
465:
466: if(::transcodeToUTF8(
467: (XMLByte *)source_body, source_content_length,
468: dest_body, dest_content_length,
469: tables)<0)
1.10 paf 470: throw(0, 0,
471: 0,
1.11 paf 472: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 473:
1.1 paf 474: // return
475: adest_body=dest_body;
1.10 paf 476: }
477: void Charset::transcodeFromUTF8(Pool& pool,
478: const void *source_body, size_t source_content_length,
1.11 paf 479: const void *& adest_body, size_t& dest_content_length) const {
1.25 paf 480: dest_content_length=source_content_length*6/*so that surly enough, "ÿ" has max ratio */;
1.11 paf 481: XMLByte *dest_body=(XMLByte*)pool.malloc(dest_content_length);
482:
483: if(::transcodeFromUTF8(
484: (XMLByte *)source_body, source_content_length,
485: dest_body, dest_content_length,
486: tables)<0)
1.10 paf 487: throw(0, 0,
488: 0,
1.11 paf 489: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 490:
491: // return
492: adest_body=dest_body;
1.1 paf 493: }
494:
495: /// transcode using both charsets
496: void Charset::transcodeToCharset(Pool& pool,
497: const Charset& dest_charset,
498: const void *source_body, size_t source_content_length,
1.6 paf 499: const void *& adest_body, size_t& adest_content_length) const {
1.3 paf 500: if(&dest_charset==this) {
1.6 paf 501: adest_body=source_body;
502: adest_content_length=source_content_length;
503: } else {
504: size_t dest_content_length=source_content_length;
505: unsigned char *dest_body=(unsigned char *)pool.malloc(dest_content_length);
506:
1.11 paf 507: const XMLByte* srcPtr=(XMLByte *)source_body;
508: const XMLByte* srcEnd=(XMLByte *)source_body+source_content_length;
1.6 paf 509:
510: for(XMLByte* outPtr=dest_body; srcPtr<srcEnd; srcPtr++) {
1.10 paf 511: XMLCh curVal = tables.fromTable[*srcPtr];
1.6 paf 512: if(curVal)
1.25 paf 513: *outPtr++=xlatOneTo(curVal, dest_charset.tables, '?');
1.6 paf 514: else {
515: // use the replacement character
516: *outPtr++= '?';
517: }
518: }
1.1 paf 519:
1.6 paf 520: adest_body=dest_body;
521: adest_content_length=dest_content_length;
522: }
1.1 paf 523: }
524:
525: #ifdef XML
1.10 paf 526: static int xml256CharEncodingInputFunc (
527: unsigned char *out,
528: int *outlen,
529: const unsigned char *in,
530: int *inlen,
531: void *info) {
532: return transcodeToUTF8(
1.21 paf 533: in, *(size_t*)inlen,
534: out, *(size_t*)outlen,
1.10 paf 535: *(const Charset::Tables *)info);
536: }
537:
538: static int xml256CharEncodingOutputFunc (
539: unsigned char *out,
540: int *outlen,
541: const unsigned char *in,
542: int *inlen,
543: void *info) {
544: return transcodeFromUTF8(
1.21 paf 545: in, *(size_t*)inlen,
546: out, *(size_t*)outlen,
1.10 paf 547: *(const Charset::Tables *)info);
548: }
549:
550:
551: void Charset::addEncoding(char *name_cstr) {
552: xmlCharEncodingHandler *handler=
553: (xmlCharEncodingHandler *)malloc(sizeof(xmlCharEncodingHandler));
554: handler->name=name_cstr;
555: handler->input=xml256CharEncodingInputFunc; handler->inputInfo=&tables;
556: handler->output=xml256CharEncodingOutputFunc; handler->outputInfo=&tables;
557:
558: xmlRegisterCharEncodingHandler(handler);
559: }
560:
1.33.2.6 paf 561: void Charset::initTranscoder(const String *source, const char* name_cstr) {
1.15 paf 562: ftranscoder=xmlFindCharEncodingHandler(name_cstr);
563: transcoder(source); // check right way
564: }
565:
566: xmlCharEncodingHandler *Charset::transcoder(const String *source) {
567: if(!ftranscoder)
1.23 paf 568: throw Exception("parser.runtime",
1.10 paf 569: source,
570: "unsupported encoding");
1.15 paf 571: return ftranscoder;
1.10 paf 572: }
573:
1.33.2.6 paf 574: const char* Charset::transcode_cstr(xmlChar *s) {
1.13 paf 575: if(!s)
1.14 paf 576: return "";
1.8 paf 577:
1.33.2.6 paf 578: int inlen=strlen((const char* )s);
1.8 paf 579: int outlen=inlen+1; // max
580: char *out=(char *)malloc(outlen*sizeof(char));
581:
1.30 paf 582: int error;
1.17 paf 583: if(xmlCharEncodingOutputFunc output=transcoder(0)->output) {
1.30 paf 584: error=output(
1.17 paf 585: (unsigned char*)out, &outlen,
586: (const unsigned char*)s, &inlen,
587: transcoder(0)->outputInfo);
1.30 paf 588: } else {
589: memcpy(out, s, outlen=inlen);
590: error=0;
591: }
592: if(error<0)
1.23 paf 593: throw Exception(0,
1.8 paf 594: 0,
1.30 paf 595: "transcode_cstr failed (%d)", error);
1.8 paf 596:
1.30 paf 597: out[outlen/*surely would be less then on input*/]=0;
1.8 paf 598: return out;
1.14 paf 599: }
1.31 paf 600: String& Charset::transcode(xmlChar *s
601: #ifndef NO_STRING_ORIGIN
602: , const String *origin
603: #endif
604: ) {
605: String& result=*NEW String(pool());
606: result.APPEND_CLEAN(transcode_cstr(s), 0/*auto-size*/, origin->origin().file, origin->origin().line);
607: return result;
1.14 paf 608: }
1.33.2.6 paf 609: const char* Charset::transcode_cstr(GdomeDOMString *s) {
1.14 paf 610: return s?transcode_cstr(BAD_CAST s->str):"";
1.1 paf 611: }
1.31 paf 612: String& Charset::transcode(GdomeDOMString *s
613: #ifndef NO_STRING_ORIGIN
614: , const String *origin
615: #endif
616: ) {
617: String& result=*NEW String(pool());
618: result.APPEND_CLEAN(transcode_cstr(s), 0/*auto-size*/, origin->origin().file, origin->origin().line);
619: return result;
1.1 paf 620: }
621:
1.8 paf 622: /// @test less memory using -maybe- xmlParserInputBufferCreateMem
1.33.2.6 paf 623: xmlChar *Charset::transcode_buf2xchar(const char* buf, size_t buf_size) {
1.30 paf 624: unsigned char *out;
625: int outlen;
626: int error;
1.17 paf 627: if(xmlCharEncodingInputFunc input=transcoder(0)->input) {
1.32 paf 628: outlen=buf_size*6/*max*/;
629: out=(unsigned char*)malloc((outlen+1)*sizeof(unsigned char));
1.30 paf 630: error=input(
1.17 paf 631: out, &outlen,
632: (const unsigned char *)buf, (int *)&buf_size,
633: transcoder(0)->inputInfo);
1.30 paf 634: } else {
635: outlen=buf_size;
1.32 paf 636: out=(unsigned char*)malloc((outlen+1)*sizeof(unsigned char));
1.30 paf 637: memcpy(out, buf, outlen);
638: error=0;
639: }
1.17 paf 640:
1.30 paf 641: if(error<0)
1.23 paf 642: throw Exception(0,
1.8 paf 643: 0,
1.30 paf 644: "transcode_buf failed (%d)", error);
1.8 paf 645:
1.30 paf 646: out[outlen/*surely would be less then on input*/]=0;
1.24 paf 647: return (xmlChar *)out;
648: }
1.33.2.6 paf 649: GdomeDOMString_auto_ptr Charset::transcode_buf2dom(const char* buf, size_t buf_size) {
1.24 paf 650: return GdomeDOMString_auto_ptr((gchar*)transcode_buf2xchar(buf, buf_size));
1.1 paf 651: }
1.12 paf 652: GdomeDOMString_auto_ptr Charset::transcode(const String& s) {
1.33.2.6 paf 653: const char* cstr=s.cstr(String::UL_UNSPECIFIED);
1.1 paf 654:
1.24 paf 655: return transcode_buf2dom(cstr, strlen(cstr));
1.1 paf 656: }
657: #endif
E-mail: