Annotation of parser3/src/main/pa_charset.C, revision 1.33.2.19.2.12
1.1 paf 1: /** @file
2: Parser: Charset connection implementation.
3:
1.33.2.6 paf 4: Copyright(c) 2001-2003 ArtLebedev Group (http://www.artlebedev.com)
1.4 paf 5: Author: Alexander Petrosyan<paf@design.ru>(http://paf.design.ru)
1.27 paf 6: */
1.1 paf 7:
1.33.2.19.2.1 (paf 8:): static const char* IDENT_CHARSET_C="$Date: 2003/03/25 12:39:14 $";
1.1 paf 9:
10: #include "pa_charset.h"
1.33.2.13 paf 11: #include "pa_charsets.h"
1.1 paf 12:
13: #ifdef XML
1.8 paf 14: #include "libxml/encoding.h"
1.1 paf 15: #endif
16:
17: // helpers
18:
19: inline void prepare_case_tables(unsigned char *tables) {
20: unsigned char *lcc_table=tables+lcc_offset;
21: unsigned char *fcc_table=tables+fcc_offset;
22: for(int i=0; i<0x100; i++)
23: lcc_table[i]=fcc_table[i]=i;
24: }
25: inline void cstr2ctypes(unsigned char *tables, const unsigned char *cstr,
26: unsigned char bit) {
27: unsigned char *ctypes_table=tables+ctypes_offset;
28: ctypes_table[0]=bit;
29: for(; *cstr; cstr++) {
30: unsigned char c=*cstr;
31: ctypes_table[c]|=bit;
32: }
33: }
/// Converts a table cell to a character code.
/// - null or empty string: 0
/// - exactly one character: that character's byte value
/// - anything longer: parsed by strtol with base 0 (decimal, 0x-hex or 0-octal)
inline unsigned int to_wchar_code(const char* cstr) {
	if(cstr==0 || cstr[0]=='\0')
		return 0;

	const bool single_char=cstr[1]=='\0';
	if(single_char)
		return (unsigned int)(unsigned char)cstr[0];

	// the end position is of no interest, so no end pointer is requested
	return (unsigned int)strtol(cstr, 0, 0);
}
/// A cell counts as "true" when it is a non-null, non-empty string.
inline bool to_bool(const char* cstr) {
	if(!cstr)
		return false;
	return cstr[0]!='\0';
}
46: static void element2ctypes(unsigned char c, bool belongs,
47: unsigned char *tables, unsigned char bit, int group_offset=-1) {
48: if(!belongs)
49: return;
50:
51: unsigned char *ctypes_table=tables+ctypes_offset;
52:
53: ctypes_table[c]|=bit;
54: if(group_offset>=0)
1.4 paf 55: tables[cbits_offset+group_offset+c/8] |= 1<<(c%8);
1.1 paf 56: }
57: static void element2case(unsigned char from, unsigned char to,
58: unsigned char *tables) {
59: if(!to)
60: return;
61:
62: unsigned char *lcc_table=tables+lcc_offset;
63: unsigned char *fcc_table=tables+fcc_offset;
64: lcc_table[from]=to;
65: fcc_table[from]=to; fcc_table[to]=from;
66: }
67:
68: // methods
69:
70: extern "C" unsigned char pcre_default_tables[]; // pcre/chartables.c
1.33.2.19.2.5 (paf 71:: Charset::Charset(const String& aname, const String* afile_spec):
1.33.2.1 paf 72: fname(aname),
1.33.2.19.2.5 (paf 73:: fname_cstr(aname.cstrm()) {
1.1 paf 74:
1.33.2.1 paf 75: for(char *c=fname_cstr; *c; c++)
1.33.2.19.2.5 (paf 76:: *c = toupper(*c);
1.7 paf 77:
1.33.2.3 paf 78: if(afile_spec) {
1.1 paf 79: fisUTF8=false;
1.33.2.19.2.5 (paf 80:: load_definition(*afile_spec);
1.1 paf 81: #ifdef XML
1.33.2.1 paf 82: addEncoding(fname_cstr);
1.1 paf 83: #endif
84: } else {
85: fisUTF8=true;
1.4 paf 86: // grab default onces [for UTF-8 so to be able to make a-z =>A-Z
1.1 paf 87: memcpy(pcre_tables, pcre_default_tables, sizeof(pcre_tables));
88: }
89:
90: #ifdef XML
1.33.2.1 paf 91: initTranscoder(fname, fname_cstr);
1.1 paf 92: #endif
93: }
94:
1.33.2.19.2.1 (paf 95:: void Charset::load_definition(const String& afile_spec) {
1.1 paf 96: // pcre_tables
97: // lowcase, flipcase, bits digit+word+whitespace, masks
98:
99: // must not move this inside of prepare_case_tables
100: // don't know the size there
101: memset(pcre_tables, 0, sizeof(pcre_tables));
102: prepare_case_tables(pcre_tables);
1.4 paf 103: cstr2ctypes(pcre_tables,(const unsigned char *)"*+?{^.$|()[", ctype_meta);
1.1 paf 104:
105: // charset
1.33.2.12 paf 106: memset(&tables, 0, sizeof(tables));
1.1 paf 107: // strangly vital
1.10 paf 108: tables.toTable[tables.toTableSize].intCh=0;
109: tables.toTable[tables.toTableSize].extCh=(XMLByte)0;
110: tables.toTableSize++;
1.1 paf 111:
112: // loading text
1.33.2.19.2.5 (paf 113:: char *data=file_read_text(UTF8_charset, afile_spec);
1.1 paf 114:
115: // ignore header
116: getrow(&data);
117:
118: // parse cells
119: char *row;
120: while(row=getrow(&data)) {
121: // remove empty&comment lines
122: if(!*row || *row=='#')
123: continue;
124:
125: // char white-space digit hex-digit letter word lowercase unicode1 unicode2
126: unsigned int c=0;
127: char *cell;
128: for(int column=0; cell=lsplit(&row, '\t'); column++) {
129: switch(column) {
130: case 0: c=to_wchar_code(cell); break;
131: // pcre_tables
132: case 1: element2ctypes(c, to_bool(cell), pcre_tables, ctype_space, cbit_space); break;
133: case 2: element2ctypes(c, to_bool(cell), pcre_tables, ctype_digit, cbit_digit); break;
134: case 3: element2ctypes(c, to_bool(cell), pcre_tables, ctype_xdigit); break;
135: case 4: element2ctypes(c, to_bool(cell), pcre_tables, ctype_letter); break;
136: case 5: element2ctypes(c, to_bool(cell), pcre_tables, ctype_word, cbit_word); break;
137: case 6: element2case(c, to_wchar_code(cell), pcre_tables); break;
138: case 7:
139: case 8:
140: // charset
1.10 paf 141: if(tables.toTableSize>MAX_CHARSET_UNI_CODES)
1.23 paf 142: throw Exception("parser.runtime",
1.33.2.19.2.5 (paf 143:: &afile_spec,
1.1 paf 144: "charset must contain not more then %d unicode values", MAX_CHARSET_UNI_CODES);
145:
146: XMLCh unicode=(XMLCh)to_wchar_code(cell);
147: if(!unicode && column==7/*unicode1 column*/)
148: unicode=(XMLCh)c;
149: if(unicode) {
1.10 paf 150: if(!tables.fromTable[c])
151: tables.fromTable[c]=unicode;
152: tables.toTable[tables.toTableSize].intCh=unicode;
153: tables.toTable[tables.toTableSize].extCh=(XMLByte)c;
154: tables.toTableSize++;
1.1 paf 155: }
156: break;
157: }
158: }
159: };
160:
161: // sort by the Unicode code point
162: sort_ToTable();
163: }
164:
165: static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
166: return
167: static_cast<const Charset_TransRec *>(a)->intCh-
168: static_cast<const Charset_TransRec *>(b)->intCh;
169: }
170:
171: void Charset::sort_ToTable() {
1.10 paf 172: _qsort(tables.toTable, tables.toTableSize, sizeof(*tables.toTable),
1.1 paf 173: sort_cmp_Trans_rec_intCh);
174: //FILE *f=fopen("c:\\temp\\a", "wb");
1.10 paf 175: //fwrite(tables.toTable, tables.toTableSize, sizeof(*tables.toTable), f);
1.1 paf 176: //fclose(f);
177: }
178:
1.10 paf 179: static XMLByte xlatOneTo(const XMLCh toXlat,
1.25 paf 180: const Charset::Tables& tables,
181: XMLByte not_found) {
1.1 paf 182: unsigned int lowOfs = 0;
1.10 paf 183: unsigned int hiOfs = tables.toTableSize - 1;
1.1 paf 184: XMLByte curByte = 0;
185: do {
186: // Calc the mid point of the low and high offset.
1.4 paf 187: const unsigned int midOfs =((hiOfs - lowOfs) / 2)+lowOfs;
1.1 paf 188:
189: // If our test char is greater than the mid point char, then
190: // we move up to the upper half. Else we move to the lower
191: // half. If its equal, then its our guy.
1.10 paf 192: if(toXlat>tables.toTable[midOfs].intCh)
1.1 paf 193: lowOfs = midOfs;
1.10 paf 194: else if(toXlat<tables.toTable[midOfs].intCh)
1.1 paf 195: hiOfs = midOfs;
196: else
1.10 paf 197: return tables.toTable[midOfs].extCh;
1.4 paf 198: } while(lowOfs+1<hiOfs);
1.1 paf 199:
1.25 paf 200: return not_found;
1.1 paf 201: }
202:
1.33.2.19.2.1 (paf 203:: void Charset::transcode(
1.33.2.14 paf 204: const Charset& source_charset, const void* source_body, size_t source_content_length,
1.1 paf 205: const Charset& dest_charset, const void *& dest_body, size_t& dest_content_length
206: ) {
1.4 paf 207: if(!source_content_length) {
208: dest_body=0;
209: dest_content_length=0;
210: return;
211: }
212:
1.1 paf 213: switch((source_charset.isUTF8()?0x10:0x00)|(dest_charset.isUTF8()?0x01:0x00)) {
214: default: // 0x00
1.33.2.19.2.3 (paf 215:: source_charset.transcodeToCharset(dest_charset,
1.1 paf 216: source_body, source_content_length,
217: dest_body, dest_content_length);
218: break;
219: case 0x01:
1.33.2.19.2.5 (paf 220:: source_charset.transcodeToUTF8(
1.1 paf 221: source_body, source_content_length,
222: dest_body, dest_content_length);
223: break;
224: case 0x10:
1.33.2.19.2.5 (paf 225:: dest_charset.transcodeFromUTF8(
1.1 paf 226: source_body, source_content_length,
227: dest_body, dest_content_length);
228: break;
229: case 0x11:
1.33.2.19.2.5 (paf 230:: dest_body=source_body;
231:: dest_content_length=source_content_length;
1.1 paf 232: break;
233: }
234: }
235:
236: // ---------------------------------------------------------------------------
237: // Local static data
238: //
239: // gUTFBytes
240: // A list of counts of trailing bytes for each initial byte in the input.
241: //
242: // gUTFOffsets
243: // A list of values to offset each result char type, according to how
244: // many source bytes when into making it.
245: //
246: // gFirstByteMark
247: // A list of values to mask onto the first byte of an encoded sequence,
248: // indexed by the number of bytes used to create the sequence.
249: // ---------------------------------------------------------------------------
250: static const XMLByte gUTFBytes[0x100] = {
251: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
252: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
253: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
254: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
255: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
256: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
257: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
258: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
259: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
260: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
261: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
262: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
263: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
264: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
265: , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
266: , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
267: };
268:
269: static const uint gUTFOffsets[6] = {
270: 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
271: };
272:
273: static const XMLByte gFirstByteMark[7] = {
274: 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
275: };
276:
1.11 paf 277: static int transcodeToUTF8(
278: const XMLByte* srcData, size_t& srcLen,
279: XMLByte *toFill, size_t& toFillLen,
1.10 paf 280: const Charset::Tables& tables) {
1.11 paf 281: const XMLByte* srcPtr=srcData;
282: const XMLByte* srcEnd=srcData+srcLen;
283: XMLByte* outPtr=toFill;
284: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 285:
1.4 paf 286: while(srcPtr<srcEnd) {
1.10 paf 287: uint curVal = tables.fromTable[*srcPtr];
1.1 paf 288: if(!curVal) {
289: // use the replacement character
1.4 paf 290: *outPtr++= '?';
291: srcPtr++;
1.1 paf 292: continue;
293: }
294:
295: // Figure out how many bytes we need
296: unsigned int encodedBytes;
1.4 paf 297: if(curVal<0x80)
1.1 paf 298: encodedBytes = 1;
1.4 paf 299: else if(curVal<0x800)
1.1 paf 300: encodedBytes = 2;
1.4 paf 301: else if(curVal<0x10000)
1.1 paf 302: encodedBytes = 3;
1.4 paf 303: else if(curVal<0x200000)
1.1 paf 304: encodedBytes = 4;
1.4 paf 305: else if(curVal<0x4000000)
1.1 paf 306: encodedBytes = 5;
1.4 paf 307: else if(curVal<= 0x7FFFFFFF)
1.1 paf 308: encodedBytes = 6;
309: else {
310: // use the replacement character
1.4 paf 311: *outPtr++= '?';
312: srcPtr++;
1.1 paf 313: continue;
314: }
315:
1.10 paf 316: // If we cannot fully get this char into the output buffer
317: if (outPtr + encodedBytes > outEnd)
318: break;
1.1 paf 319:
320: // We can do it, so update the source index
321: srcPtr++;
322:
323: // And spit out the bytes. We spit them out in reverse order
324: // here, so bump up the output pointer and work down as we go.
1.4 paf 325: outPtr+= encodedBytes;
1.1 paf 326: switch(encodedBytes) {
1.18 paf 327: case 6: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 328: curVal>>= 6;
1.18 paf 329: case 5: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 330: curVal>>= 6;
1.18 paf 331: case 4: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 332: curVal>>= 6;
1.18 paf 333: case 3: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 334: curVal>>= 6;
1.18 paf 335: case 2: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 336: curVal>>= 6;
1.18 paf 337: case 1: *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]);
1.1 paf 338: }
339:
340: // Add the encoded bytes back in again to indicate we've eaten them
1.4 paf 341: outPtr+= encodedBytes;
1.1 paf 342: }
343:
1.11 paf 344: // Update the bytes eaten
345: srcLen = srcPtr - srcData;
346:
347: // Return the characters read
348: toFillLen = outPtr - toFill;
349:
1.29 paf 350: //return srcPtr==srcEnd?(int)toFillLen:-1;
351: /*
352: xmlCharEncodingInputFunc
353: Returns :
354: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
355: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
356: of ocetes consumed.
357: */
358: return 0;
1.1 paf 359: }
1.26 paf 360: /// @todo digital entites only when xml/html output [at output in html/xml mode, in html part of a letter]
1.30 paf 361: static int transcodeFromUTF8(
1.33.2.14 paf 362: const XMLByte* srcData, size_t& srcLen,
1.11 paf 363: XMLByte* toFill, size_t& toFillLen,
364: const Charset::Tables& tables) {
365: const XMLByte* srcPtr=srcData;
366: const XMLByte* srcEnd=srcData+srcLen;
367: XMLByte* outPtr=toFill;
368: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 369:
1.10 paf 370: // We now loop until we either run out of input data, or room to store
371: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
1.1 paf 372: // Get the next leading byte out
1.33.2.14 paf 373: const XMLByte firstByte =* srcPtr;
1.1 paf 374:
1.4 paf 375: // Special-case ASCII, which is a leading byte value of<= 127
376: if(firstByte<= 127) {
377: *outPtr++= firstByte;
1.1 paf 378: srcPtr++;
379: continue;
380: }
381:
382: // See how many trailing src bytes this sequence is going to require
383: const unsigned int trailingBytes = gUTFBytes[firstByte];
384:
385: // If there are not enough source bytes to do this one, then we
1.4 paf 386: // are done. Note that we done>= here because we are implicitly
1.1 paf 387: // counting the 1 byte we get no matter what.
1.4 paf 388: if(srcPtr+trailingBytes>= srcEnd)
1.1 paf 389: break;
390:
391: // Looks ok, so lets build up the value
392: uint tmpVal=0;
393: switch(trailingBytes) {
394: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
395: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
396: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
397: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
398: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
399: case 0: tmpVal+=*srcPtr++;
400: break;
401:
402: default:
1.23 paf 403: throw Exception(0,
1.33.2.19.2.2 (paf 404:: 0,
1.4 paf 405: "transcodeFromUTF8 error: wrong trailingBytes value(%d)", trailingBytes);
1.1 paf 406: }
407: tmpVal-=gUTFOffsets[trailingBytes];
408:
409: // If it will fit into a single char, then put it in. Otherwise
410: // fail [*encode it as a surrogate pair. If its not valid, use the
411: // replacement char.*]
1.25 paf 412: if(!(tmpVal & 0xFFFF0000)) {
413: if(XMLByte xlat=xlatOneTo(tmpVal, tables, 0))
414: *outPtr++=xlat;
415: else
416: outPtr+=sprintf((char *)outPtr, "&#%d;", tmpVal); // &#decimal;
417: } else
1.23 paf 418: throw Exception(0,
1.33.2.19.2.2 (paf 419:: 0,
1.4 paf 420: "transcodeFromUTF8 error: too big tmpVal(0x%08X)", tmpVal);
1.1 paf 421: }
422:
1.11 paf 423: // Update the bytes eaten
424: srcLen = srcPtr - srcData;
425:
426: // Return the characters read
427: toFillLen = outPtr - toFill;
428:
1.29 paf 429: //return srcPtr==srcEnd?(int)toFillLen:-1;
430: /*
431: xmlCharEncodingOutputFunc
432: Returns :
433: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
434: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
435: of ocetes consumed.
436: */
437: return 0;
1.10 paf 438: }
439:
440: /// @todo not so memory-hungry with prescan
1.33.2.19.2.1 (paf 441:: void Charset::transcodeToUTF8(
1.33.2.14 paf 442: const void* source_body, size_t source_content_length,
1.11 paf 443: const void *& adest_body, size_t& dest_content_length) const {
1.25 paf 444: dest_content_length=source_content_length*6/*so that surly enough, max utf8 seq len=6*/;
1.33.2.19.2.1 (paf 445:): XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_content_length];
1.11 paf 446:
447: if(::transcodeToUTF8(
448: (XMLByte *)source_body, source_content_length,
449: dest_body, dest_content_length,
450: tables)<0)
1.10 paf 451: throw(0, 0,
452: 0,
1.11 paf 453: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 454:
1.1 paf 455: // return
456: adest_body=dest_body;
1.10 paf 457: }
1.33.2.19.2.1 (paf 458:: void Charset::transcodeFromUTF8(
1.33.2.14 paf 459: const void* source_body, size_t source_content_length,
1.11 paf 460: const void *& adest_body, size_t& dest_content_length) const {
1.25 paf 461: dest_content_length=source_content_length*6/*so that surly enough, "ÿ" has max ratio */;
1.33.2.19.2.1 (paf 462:): XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_content_length];
1.11 paf 463:
464: if(::transcodeFromUTF8(
465: (XMLByte *)source_body, source_content_length,
466: dest_body, dest_content_length,
467: tables)<0)
1.10 paf 468: throw(0, 0,
469: 0,
1.11 paf 470: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 471:
472: // return
473: adest_body=dest_body;
1.1 paf 474: }
475:
476: /// transcode using both charsets
1.33.2.19.2.1 (paf 477:: void Charset::transcodeToCharset(
478:): const Charset& dest_charset,
479:): const void* source_body, size_t source_content_length,
480:): const void *& adest_body, size_t& adest_content_length) const {
1.3 paf 481: if(&dest_charset==this) {
1.33.2.19.2.5 (paf 482:: adest_body=source_body;
483:: adest_content_length=source_content_length;
1.6 paf 484: } else {
485: size_t dest_content_length=source_content_length;
1.33.2.19.2.1 (paf 486:): unsigned char *dest_body=new(PointerFreeGC) unsigned char[dest_content_length];
1.6 paf 487:
1.11 paf 488: const XMLByte* srcPtr=(XMLByte *)source_body;
489: const XMLByte* srcEnd=(XMLByte *)source_body+source_content_length;
1.6 paf 490:
491: for(XMLByte* outPtr=dest_body; srcPtr<srcEnd; srcPtr++) {
1.10 paf 492: XMLCh curVal = tables.fromTable[*srcPtr];
1.6 paf 493: if(curVal)
1.25 paf 494: *outPtr++=xlatOneTo(curVal, dest_charset.tables, '?');
1.6 paf 495: else {
496: // use the replacement character
497: *outPtr++= '?';
498: }
499: }
1.1 paf 500:
1.6 paf 501: adest_body=dest_body;
502: adest_content_length=dest_content_length;
503: }
1.1 paf 504: }
505:
506: #ifdef XML
1.10 paf 507: static int xml256CharEncodingInputFunc (
508: unsigned char *out,
509: int *outlen,
510: const unsigned char *in,
511: int *inlen,
512: void *info) {
513: return transcodeToUTF8(
1.33.2.19.2.9 (paf 514:: in, *(size_t*)inlen,
515:: out, *(size_t*)outlen,
516:: *(const Charset::Tables *)info);
1.10 paf 517: }
518:
519: static int xml256CharEncodingOutputFunc (
520: unsigned char *out,
521: int *outlen,
522: const unsigned char *in,
523: int *inlen,
524: void *info) {
525: return transcodeFromUTF8(
1.33.2.19.2.9 (paf 526:: in, *(size_t*)inlen,
527:: out, *(size_t*)outlen,
528:: *(const Charset::Tables *)info);
1.10 paf 529: }
530:
531:
532: void Charset::addEncoding(char *name_cstr) {
1.33.2.19.2.9 (paf 533:: xmlCharEncodingHandler* handler=new(PointerFreeGC) xmlCharEncodingHandler;
1.10 paf 534: handler->name=name_cstr;
535: handler->input=xml256CharEncodingInputFunc; handler->inputInfo=&tables;
536: handler->output=xml256CharEncodingOutputFunc; handler->outputInfo=&tables;
537:
538: xmlRegisterCharEncodingHandler(handler);
539: }
540:
1.33.2.19.2.7 (paf 541:: void Charset::initTranscoder(const String& name, const char* name_cstr) {
1.15 paf 542: ftranscoder=xmlFindCharEncodingHandler(name_cstr);
1.33.2.19.2.7 (paf 543:: transcoder(name); // check right way
1.15 paf 544: }
545:
1.33.2.19.2.7 (paf 546:: xmlCharEncodingHandler& Charset::transcoder(const String& name) {
1.15 paf 547: if(!ftranscoder)
1.23 paf 548: throw Exception("parser.runtime",
1.33.2.19.2.7 (paf 549:: &name,
1.10 paf 550: "unsupported encoding");
1.33.2.14 paf 551: return *ftranscoder;
1.10 paf 552: }
553:
1.33.2.19.2.7 (paf 554:: const char* Charset::transcode_cstr(xmlChar* s) {
1.13 paf 555: if(!s)
1.14 paf 556: return "";
1.8 paf 557:
1.33.2.6 paf 558: int inlen=strlen((const char* )s);
1.8 paf 559: int outlen=inlen+1; // max
1.33.2.19.2.6 (paf 560:: char *out=new(PointerFreeGC) char[outlen];
1.8 paf 561:
1.30 paf 562: int error;
1.33.2.19.2.7 (paf 563:: if(xmlCharEncodingOutputFunc output=transcoder(fname).output) {
1.30 paf 564: error=output(
1.17 paf 565: (unsigned char*)out, &outlen,
566: (const unsigned char*)s, &inlen,
1.33.2.19.2.7 (paf 567:: transcoder(fname).outputInfo);
1.30 paf 568: } else {
569: memcpy(out, s, outlen=inlen);
570: error=0;
571: }
572: if(error<0)
1.33.2.19.2.2 (paf 573:: throw Exception(0,
574:: 0,
1.30 paf 575: "transcode_cstr failed (%d)", error);
1.8 paf 576:
1.30 paf 577: out[outlen/*surely would be less then on input*/]=0;
1.8 paf 578: return out;
1.14 paf 579: }
1.33.2.19.2.7 (paf 580:: const String& Charset::transcode(xmlChar* s) {
1.33.2.19.2.1 (paf 581:): return *new String(transcode_cstr(s), 0/*auto-size*/, true);
1.14 paf 582: }
1.33.2.19.2.7 (paf 583:: const char* Charset::transcode_cstr(GdomeDOMString* s) {
1.33.2.19.2.3 (paf 584:: return s?transcode_cstr(BAD_CAST s->str):"";
1.1 paf 585: }
1.33.2.19.2.7 (paf 586:: const String& Charset::transcode(GdomeDOMString* s) {
1.33.2.19.2.1 (paf 587:): return *new String(transcode_cstr(s), 0/*auto-size*/, true);
1.1 paf 588: }
589:
1.8 paf 590: /// @test less memory using -maybe- xmlParserInputBufferCreateMem
1.33.2.16 paf 591: void* Charset::transcode_buf2mchar(transcode_buf_malloc_func malloc_func,
1.33.2.19.2.8 (paf 592:: const char* buf, size_t buf_size) {
1.33.2.14 paf 593: unsigned char* out;
1.30 paf 594: int outlen;
595: int error;
1.33.2.19.2.7 (paf 596:: if(xmlCharEncodingInputFunc input=transcoder(fname).input) {
1.32 paf 597: outlen=buf_size*6/*max*/;
1.33.2.16 paf 598: out=(unsigned char*)malloc_func(outlen+1);
1.30 paf 599: error=input(
1.17 paf 600: out, &outlen,
601: (const unsigned char *)buf, (int *)&buf_size,
1.33.2.19.2.7 (paf 602:: transcoder(fname).inputInfo);
1.30 paf 603: } else {
604: outlen=buf_size;
1.33.2.16 paf 605: out=(unsigned char*)malloc_func(outlen+1);
1.30 paf 606: memcpy(out, buf, outlen);
607: error=0;
608: }
1.17 paf 609:
1.30 paf 610: if(error<0)
1.33.2.19.2.2 (paf 611:: throw Exception(0,
612:: 0,
1.30 paf 613: "transcode_buf failed (%d)", error);
1.8 paf 614:
1.30 paf 615: out[outlen/*surely would be less then on input*/]=0;
1.33.2.16 paf 616: return out;
617: }
618:
619: xmlChar* Charset::transcode_buf2xchar(const char* buf, size_t buf_size) {
620: return static_cast<xmlChar*>(transcode_buf2mchar(xmlMalloc, buf, buf_size));
621: }
622: static void* g_malloc_wrapper(size_t size) {
623: return g_malloc(size);
624: }
625: gchar* Charset::transcode_buf2gchar(const char* buf, size_t buf_size) {
626: return static_cast<gchar*>(transcode_buf2mchar(g_malloc_wrapper, buf, buf_size));
1.24 paf 627: }
1.33.2.6 paf 628: GdomeDOMString_auto_ptr Charset::transcode_buf2dom(const char* buf, size_t buf_size) {
1.33.2.16 paf 629: return GdomeDOMString_auto_ptr(transcode_buf2gchar(buf, buf_size));
1.1 paf 630: }
1.33.2.19.2.1 (paf 631:: GdomeDOMString_auto_ptr Charset::transcode(const String& s) {
1.33.2.19.2.7 (paf 632:: const char* cstr=s.cstr(String::L_UNSPECIFIED);
1.1 paf 633:
1.24 paf 634: return transcode_buf2dom(cstr, strlen(cstr));
1.33.2.19.2.8 (paf 635:: }
636:: GdomeDOMString_auto_ptr Charset::transcode(const StringBody s) {
637:: const char* cstr=s.cstr();
638::
639:: return transcode_buf2dom(cstr, s.length());
1.1 paf 640: }
641: #endif
E-mail: