Annotation of parser3/src/main/pa_charset.C, revision 1.33.2.14
1.1 paf 1: /** @file
2: Parser: Charset connection implementation.
3:
1.33.2.6 paf 4: Copyright(c) 2001-2003 ArtLebedev Group (http://www.artlebedev.com)
1.4 paf 5: Author: Alexander Petrosyan<paf@design.ru>(http://paf.design.ru)
1.27 paf 6: */
1.1 paf 7:
1.33.2.14! paf 8: static const char* IDENT_CHARSET_C="$Date: 2003/03/06 08:21:09 $";
1.1 paf 9:
10: #include "pa_charset.h"
1.33.2.13 paf 11: #include "pa_charsets.h"
1.1 paf 12:
13: #ifdef XML
1.8 paf 14: #include "libxml/encoding.h"
1.1 paf 15: #endif
16:
17: // helpers
18:
19: inline void prepare_case_tables(unsigned char *tables) {
20: unsigned char *lcc_table=tables+lcc_offset;
21: unsigned char *fcc_table=tables+fcc_offset;
22: for(int i=0; i<0x100; i++)
23: lcc_table[i]=fcc_table[i]=i;
24: }
25: inline void cstr2ctypes(unsigned char *tables, const unsigned char *cstr,
26: unsigned char bit) {
27: unsigned char *ctypes_table=tables+ctypes_offset;
28: ctypes_table[0]=bit;
29: for(; *cstr; cstr++) {
30: unsigned char c=*cstr;
31: ctypes_table[c]|=bit;
32: }
33: }
/**
	Converts a table cell to a character code.

	@param cstr cell text, may be NULL
	@return 0 for a NULL or empty cell;
		the code of the character for a one-character cell;
		otherwise the number parsed by strtol with automatic base detection
*/
inline unsigned int to_wchar_code(const char* cstr) {
	// nothing to convert
	if(cstr==0 || cstr[0]=='\0')
		return 0;

	// exactly one character: its own code
	if(cstr[1]=='\0')
		return static_cast<unsigned int>(static_cast<unsigned char>(cstr[0]));

	// longer text: a number; where parsing stopped is of no interest
	const long parsed=strtol(cstr, 0, 0);
	return static_cast<unsigned int>(parsed);
}
/// A table cell counts as true when it is present and not empty.
inline bool to_bool(const char* cstr) {
	if(!cstr)
		return false;
	return cstr[0]!='\0';
}
46: static void element2ctypes(unsigned char c, bool belongs,
47: unsigned char *tables, unsigned char bit, int group_offset=-1) {
48: if(!belongs)
49: return;
50:
51: unsigned char *ctypes_table=tables+ctypes_offset;
52:
53: ctypes_table[c]|=bit;
54: if(group_offset>=0)
1.4 paf 55: tables[cbits_offset+group_offset+c/8] |= 1<<(c%8);
1.1 paf 56: }
57: static void element2case(unsigned char from, unsigned char to,
58: unsigned char *tables) {
59: if(!to)
60: return;
61:
62: unsigned char *lcc_table=tables+lcc_offset;
63: unsigned char *fcc_table=tables+fcc_offset;
64: lcc_table[from]=to;
65: fcc_table[from]=to; fcc_table[to]=from;
66: }
67:
68: // methods
69:
70: extern "C" unsigned char pcre_default_tables[]; // pcre/chartables.c
1.33.2.5 paf 71: Charset::Charset(StringPtr aname, StringPtr afile_spec):
1.33.2.1 paf 72: fname(aname),
73: fname_cstr(aname->cstr()) {
1.1 paf 74:
1.33.2.1 paf 75: for(char *c=fname_cstr; *c; c++)
1.10 paf 76: *c = toupper(*c);
1.7 paf 77:
1.33.2.3 paf 78: if(afile_spec) {
1.1 paf 79: fisUTF8=false;
1.33.2.9 paf 80: load_definition(afile_spec);
1.1 paf 81: #ifdef XML
1.33.2.1 paf 82: addEncoding(fname_cstr);
1.1 paf 83: #endif
84: } else {
85: fisUTF8=true;
1.4 paf 86: // grab default onces [for UTF-8 so to be able to make a-z =>A-Z
1.1 paf 87: memcpy(pcre_tables, pcre_default_tables, sizeof(pcre_tables));
88: }
89:
90: #ifdef XML
1.33.2.1 paf 91: initTranscoder(fname, fname_cstr);
1.1 paf 92: #endif
93: }
94:
95: Charset::~Charset() {
1.33.2.1 paf 96: // @todonow unregister encodings
1.1 paf 97: #ifdef XML
1.9 paf 98: // not deleting transcoder, that's not our business
1.1 paf 99: #endif
100: }
101:
1.33.2.5 paf 102: void Charset::load_definition(StringPtr afile_spec) {
1.1 paf 103: // pcre_tables
104: // lowcase, flipcase, bits digit+word+whitespace, masks
105:
106: // must not move this inside of prepare_case_tables
107: // don't know the size there
108: memset(pcre_tables, 0, sizeof(pcre_tables));
109: prepare_case_tables(pcre_tables);
1.4 paf 110: cstr2ctypes(pcre_tables,(const unsigned char *)"*+?{^.$|()[", ctype_meta);
1.1 paf 111:
112: // charset
1.33.2.12 paf 113: memset(&tables, 0, sizeof(tables));
1.1 paf 114: // strangly vital
1.10 paf 115: tables.toTable[tables.toTableSize].intCh=0;
116: tables.toTable[tables.toTableSize].extCh=(XMLByte)0;
117: tables.toTableSize++;
1.1 paf 118:
119: // loading text
1.33.2.10 paf 120: Pool pool_for_load_only;
121: char *data=file_read_text(pool_for_load_only, *UTF8_charset, afile_spec);
1.1 paf 122:
123: // ignore header
124: getrow(&data);
125:
126: // parse cells
127: char *row;
128: while(row=getrow(&data)) {
129: // remove empty&comment lines
130: if(!*row || *row=='#')
131: continue;
132:
133: // char white-space digit hex-digit letter word lowercase unicode1 unicode2
134: unsigned int c=0;
135: char *cell;
136: for(int column=0; cell=lsplit(&row, '\t'); column++) {
137: switch(column) {
138: case 0: c=to_wchar_code(cell); break;
139: // pcre_tables
140: case 1: element2ctypes(c, to_bool(cell), pcre_tables, ctype_space, cbit_space); break;
141: case 2: element2ctypes(c, to_bool(cell), pcre_tables, ctype_digit, cbit_digit); break;
142: case 3: element2ctypes(c, to_bool(cell), pcre_tables, ctype_xdigit); break;
143: case 4: element2ctypes(c, to_bool(cell), pcre_tables, ctype_letter); break;
144: case 5: element2ctypes(c, to_bool(cell), pcre_tables, ctype_word, cbit_word); break;
145: case 6: element2case(c, to_wchar_code(cell), pcre_tables); break;
146: case 7:
147: case 8:
148: // charset
1.10 paf 149: if(tables.toTableSize>MAX_CHARSET_UNI_CODES)
1.23 paf 150: throw Exception("parser.runtime",
1.33.2.1 paf 151: afile_spec,
1.1 paf 152: "charset must contain not more then %d unicode values", MAX_CHARSET_UNI_CODES);
153:
154: XMLCh unicode=(XMLCh)to_wchar_code(cell);
155: if(!unicode && column==7/*unicode1 column*/)
156: unicode=(XMLCh)c;
157: if(unicode) {
1.10 paf 158: if(!tables.fromTable[c])
159: tables.fromTable[c]=unicode;
160: tables.toTable[tables.toTableSize].intCh=unicode;
161: tables.toTable[tables.toTableSize].extCh=(XMLByte)c;
162: tables.toTableSize++;
1.1 paf 163: }
164: break;
165: }
166: }
167: };
168:
169: // sort by the Unicode code point
170: sort_ToTable();
171: }
172:
173: static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
174: return
175: static_cast<const Charset_TransRec *>(a)->intCh-
176: static_cast<const Charset_TransRec *>(b)->intCh;
177: }
178:
179: void Charset::sort_ToTable() {
1.10 paf 180: _qsort(tables.toTable, tables.toTableSize, sizeof(*tables.toTable),
1.1 paf 181: sort_cmp_Trans_rec_intCh);
182: //FILE *f=fopen("c:\\temp\\a", "wb");
1.10 paf 183: //fwrite(tables.toTable, tables.toTableSize, sizeof(*tables.toTable), f);
1.1 paf 184: //fclose(f);
185: }
186:
1.10 paf 187: static XMLByte xlatOneTo(const XMLCh toXlat,
1.25 paf 188: const Charset::Tables& tables,
189: XMLByte not_found) {
1.1 paf 190: unsigned int lowOfs = 0;
1.10 paf 191: unsigned int hiOfs = tables.toTableSize - 1;
1.1 paf 192: XMLByte curByte = 0;
193: do {
194: // Calc the mid point of the low and high offset.
1.4 paf 195: const unsigned int midOfs =((hiOfs - lowOfs) / 2)+lowOfs;
1.1 paf 196:
197: // If our test char is greater than the mid point char, then
198: // we move up to the upper half. Else we move to the lower
199: // half. If its equal, then its our guy.
1.10 paf 200: if(toXlat>tables.toTable[midOfs].intCh)
1.1 paf 201: lowOfs = midOfs;
1.10 paf 202: else if(toXlat<tables.toTable[midOfs].intCh)
1.1 paf 203: hiOfs = midOfs;
204: else
1.10 paf 205: return tables.toTable[midOfs].extCh;
1.4 paf 206: } while(lowOfs+1<hiOfs);
1.1 paf 207:
1.25 paf 208: return not_found;
1.1 paf 209: }
210:
211: void Charset::transcode(Pool& pool,
1.33.2.14! paf 212: const Charset& source_charset, const void* source_body, size_t source_content_length,
1.1 paf 213: const Charset& dest_charset, const void *& dest_body, size_t& dest_content_length
214: ) {
1.4 paf 215: if(!source_content_length) {
216: dest_body=0;
217: dest_content_length=0;
218: return;
219: }
220:
1.1 paf 221: switch((source_charset.isUTF8()?0x10:0x00)|(dest_charset.isUTF8()?0x01:0x00)) {
222: default: // 0x00
223: source_charset.transcodeToCharset(pool, dest_charset,
224: source_body, source_content_length,
225: dest_body, dest_content_length);
226: break;
227: case 0x01:
228: source_charset.transcodeToUTF8(pool,
229: source_body, source_content_length,
230: dest_body, dest_content_length);
231: break;
232: case 0x10:
233: dest_charset.transcodeFromUTF8(pool,
234: source_body, source_content_length,
235: dest_body, dest_content_length);
236: break;
237: case 0x11:
238: dest_body=source_body;
239: dest_content_length=source_content_length;
240: break;
241: }
242: }
243:
244: // ---------------------------------------------------------------------------
245: // Local static data
246: //
247: // gUTFBytes
248: // A list of counts of trailing bytes for each initial byte in the input.
249: //
250: // gUTFOffsets
251: // A list of values to offset each result char type, according to how
252: // many source bytes went into making it.
253: //
254: // gFirstByteMark
255: // A list of values to mask onto the first byte of an encoded sequence,
256: // indexed by the number of bytes used to create the sequence.
257: // ---------------------------------------------------------------------------
258: static const XMLByte gUTFBytes[0x100] = {
259: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
260: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
261: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
262: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
263: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
264: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
265: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
266: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
267: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
268: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
269: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
270: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
271: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
272: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
273: , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
274: , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
275: };
276:
277: static const uint gUTFOffsets[6] = {
278: 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
279: };
280:
281: static const XMLByte gFirstByteMark[7] = {
282: 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
283: };
284:
1.11 paf 285: static int transcodeToUTF8(
286: const XMLByte* srcData, size_t& srcLen,
287: XMLByte *toFill, size_t& toFillLen,
1.10 paf 288: const Charset::Tables& tables) {
1.11 paf 289: const XMLByte* srcPtr=srcData;
290: const XMLByte* srcEnd=srcData+srcLen;
291: XMLByte* outPtr=toFill;
292: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 293:
1.4 paf 294: while(srcPtr<srcEnd) {
1.10 paf 295: uint curVal = tables.fromTable[*srcPtr];
1.1 paf 296: if(!curVal) {
297: // use the replacement character
1.4 paf 298: *outPtr++= '?';
299: srcPtr++;
1.1 paf 300: continue;
301: }
302:
303: // Figure out how many bytes we need
304: unsigned int encodedBytes;
1.4 paf 305: if(curVal<0x80)
1.1 paf 306: encodedBytes = 1;
1.4 paf 307: else if(curVal<0x800)
1.1 paf 308: encodedBytes = 2;
1.4 paf 309: else if(curVal<0x10000)
1.1 paf 310: encodedBytes = 3;
1.4 paf 311: else if(curVal<0x200000)
1.1 paf 312: encodedBytes = 4;
1.4 paf 313: else if(curVal<0x4000000)
1.1 paf 314: encodedBytes = 5;
1.4 paf 315: else if(curVal<= 0x7FFFFFFF)
1.1 paf 316: encodedBytes = 6;
317: else {
318: // use the replacement character
1.4 paf 319: *outPtr++= '?';
320: srcPtr++;
1.1 paf 321: continue;
322: }
323:
1.10 paf 324: // If we cannot fully get this char into the output buffer
325: if (outPtr + encodedBytes > outEnd)
326: break;
1.1 paf 327:
328: // We can do it, so update the source index
329: srcPtr++;
330:
331: // And spit out the bytes. We spit them out in reverse order
332: // here, so bump up the output pointer and work down as we go.
1.4 paf 333: outPtr+= encodedBytes;
1.1 paf 334: switch(encodedBytes) {
1.18 paf 335: case 6: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 336: curVal>>= 6;
1.18 paf 337: case 5: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 338: curVal>>= 6;
1.18 paf 339: case 4: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 340: curVal>>= 6;
1.18 paf 341: case 3: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 342: curVal>>= 6;
1.18 paf 343: case 2: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 344: curVal>>= 6;
1.18 paf 345: case 1: *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]);
1.1 paf 346: }
347:
348: // Add the encoded bytes back in again to indicate we've eaten them
1.4 paf 349: outPtr+= encodedBytes;
1.1 paf 350: }
351:
1.11 paf 352: // Update the bytes eaten
353: srcLen = srcPtr - srcData;
354:
355: // Return the characters read
356: toFillLen = outPtr - toFill;
357:
1.29 paf 358: //return srcPtr==srcEnd?(int)toFillLen:-1;
359: /*
360: xmlCharEncodingInputFunc
361: Returns :
362: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
363: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
364: of ocetes consumed.
365: */
366: return 0;
1.1 paf 367: }
1.26 paf 368: /// @todo digital entites only when xml/html output [at output in html/xml mode, in html part of a letter]
1.30 paf 369: static int transcodeFromUTF8(
1.33.2.14! paf 370: const XMLByte* srcData, size_t& srcLen,
1.11 paf 371: XMLByte* toFill, size_t& toFillLen,
372: const Charset::Tables& tables) {
373: const XMLByte* srcPtr=srcData;
374: const XMLByte* srcEnd=srcData+srcLen;
375: XMLByte* outPtr=toFill;
376: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 377:
1.10 paf 378: // We now loop until we either run out of input data, or room to store
379: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
1.1 paf 380: // Get the next leading byte out
1.33.2.14! paf 381: const XMLByte firstByte =* srcPtr;
1.1 paf 382:
1.4 paf 383: // Special-case ASCII, which is a leading byte value of<= 127
384: if(firstByte<= 127) {
385: *outPtr++= firstByte;
1.1 paf 386: srcPtr++;
387: continue;
388: }
389:
390: // See how many trailing src bytes this sequence is going to require
391: const unsigned int trailingBytes = gUTFBytes[firstByte];
392:
393: // If there are not enough source bytes to do this one, then we
1.4 paf 394: // are done. Note that we done>= here because we are implicitly
1.1 paf 395: // counting the 1 byte we get no matter what.
1.4 paf 396: if(srcPtr+trailingBytes>= srcEnd)
1.1 paf 397: break;
398:
399: // Looks ok, so lets build up the value
400: uint tmpVal=0;
401: switch(trailingBytes) {
402: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
403: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
404: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
405: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
406: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
407: case 0: tmpVal+=*srcPtr++;
408: break;
409:
410: default:
1.23 paf 411: throw Exception(0,
1.33.2.1 paf 412: Exception::undefined_source,
1.4 paf 413: "transcodeFromUTF8 error: wrong trailingBytes value(%d)", trailingBytes);
1.1 paf 414: }
415: tmpVal-=gUTFOffsets[trailingBytes];
416:
417: // If it will fit into a single char, then put it in. Otherwise
418: // fail [*encode it as a surrogate pair. If its not valid, use the
419: // replacement char.*]
1.25 paf 420: if(!(tmpVal & 0xFFFF0000)) {
421: if(XMLByte xlat=xlatOneTo(tmpVal, tables, 0))
422: *outPtr++=xlat;
423: else
424: outPtr+=sprintf((char *)outPtr, "&#%d;", tmpVal); // &#decimal;
425: } else
1.23 paf 426: throw Exception(0,
1.33.2.1 paf 427: Exception::undefined_source,
1.4 paf 428: "transcodeFromUTF8 error: too big tmpVal(0x%08X)", tmpVal);
1.1 paf 429: }
430:
1.11 paf 431: // Update the bytes eaten
432: srcLen = srcPtr - srcData;
433:
434: // Return the characters read
435: toFillLen = outPtr - toFill;
436:
1.29 paf 437: //return srcPtr==srcEnd?(int)toFillLen:-1;
438: /*
439: xmlCharEncodingOutputFunc
440: Returns :
441: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
442: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
443: of ocetes consumed.
444: */
445: return 0;
1.10 paf 446: }
447:
448: /// @todo not so memory-hungry with prescan
449: void Charset::transcodeToUTF8(Pool& pool,
1.33.2.14! paf 450: const void* source_body, size_t source_content_length,
1.11 paf 451: const void *& adest_body, size_t& dest_content_length) const {
1.25 paf 452: dest_content_length=source_content_length*6/*so that surly enough, max utf8 seq len=6*/;
1.33.2.8 paf 453: XMLByte *dest_body=new(pool) XMLByte[dest_content_length];
1.11 paf 454:
455: if(::transcodeToUTF8(
456: (XMLByte *)source_body, source_content_length,
457: dest_body, dest_content_length,
458: tables)<0)
1.10 paf 459: throw(0, 0,
460: 0,
1.11 paf 461: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 462:
1.1 paf 463: // return
464: adest_body=dest_body;
1.10 paf 465: }
466: void Charset::transcodeFromUTF8(Pool& pool,
1.33.2.14! paf 467: const void* source_body, size_t source_content_length,
1.11 paf 468: const void *& adest_body, size_t& dest_content_length) const {
1.25 paf 469: dest_content_length=source_content_length*6/*so that surly enough, "ÿ" has max ratio */;
1.33.2.8 paf 470: XMLByte *dest_body=new(pool) XMLByte[dest_content_length];
1.11 paf 471:
472: if(::transcodeFromUTF8(
473: (XMLByte *)source_body, source_content_length,
474: dest_body, dest_content_length,
475: tables)<0)
1.10 paf 476: throw(0, 0,
477: 0,
1.11 paf 478: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 479:
480: // return
481: adest_body=dest_body;
1.1 paf 482: }
483:
484: /// transcode using both charsets
485: void Charset::transcodeToCharset(Pool& pool,
486: const Charset& dest_charset,
1.33.2.14! paf 487: const void* source_body, size_t source_content_length,
1.6 paf 488: const void *& adest_body, size_t& adest_content_length) const {
1.3 paf 489: if(&dest_charset==this) {
1.6 paf 490: adest_body=source_body;
491: adest_content_length=source_content_length;
492: } else {
493: size_t dest_content_length=source_content_length;
1.33.2.8 paf 494: unsigned char *dest_body=new(pool) unsigned char[dest_content_length];
1.6 paf 495:
1.11 paf 496: const XMLByte* srcPtr=(XMLByte *)source_body;
497: const XMLByte* srcEnd=(XMLByte *)source_body+source_content_length;
1.6 paf 498:
499: for(XMLByte* outPtr=dest_body; srcPtr<srcEnd; srcPtr++) {
1.10 paf 500: XMLCh curVal = tables.fromTable[*srcPtr];
1.6 paf 501: if(curVal)
1.25 paf 502: *outPtr++=xlatOneTo(curVal, dest_charset.tables, '?');
1.6 paf 503: else {
504: // use the replacement character
505: *outPtr++= '?';
506: }
507: }
1.1 paf 508:
1.6 paf 509: adest_body=dest_body;
510: adest_content_length=dest_content_length;
511: }
1.1 paf 512: }
513:
514: #ifdef XML
1.10 paf 515: static int xml256CharEncodingInputFunc (
516: unsigned char *out,
517: int *outlen,
518: const unsigned char *in,
519: int *inlen,
520: void *info) {
521: return transcodeToUTF8(
1.21 paf 522: in, *(size_t*)inlen,
523: out, *(size_t*)outlen,
1.10 paf 524: *(const Charset::Tables *)info);
525: }
526:
527: static int xml256CharEncodingOutputFunc (
528: unsigned char *out,
529: int *outlen,
530: const unsigned char *in,
531: int *inlen,
532: void *info) {
533: return transcodeFromUTF8(
1.21 paf 534: in, *(size_t*)inlen,
535: out, *(size_t*)outlen,
1.10 paf 536: *(const Charset::Tables *)info);
537: }
538:
539:
540: void Charset::addEncoding(char *name_cstr) {
1.33.2.14! paf 541: xmlCharEncodingHandler* handler=new xmlCharEncodingHandler;
! 542: fcreated_handler=xmlCharEncodingHandlerPtr(fcreated_handler);
! 543:
1.10 paf 544: handler->name=name_cstr;
545: handler->input=xml256CharEncodingInputFunc; handler->inputInfo=&tables;
546: handler->output=xml256CharEncodingOutputFunc; handler->outputInfo=&tables;
547:
548: xmlRegisterCharEncodingHandler(handler);
549: }
550:
1.33.2.14! paf 551: void Charset::initTranscoder(StringPtr source, const char* name_cstr) {
1.15 paf 552: ftranscoder=xmlFindCharEncodingHandler(name_cstr);
553: transcoder(source); // check right way
554: }
555:
1.33.2.14! paf 556: xmlCharEncodingHandler& Charset::transcoder(StringPtr source) {
1.15 paf 557: if(!ftranscoder)
1.23 paf 558: throw Exception("parser.runtime",
1.10 paf 559: source,
560: "unsupported encoding");
1.33.2.14! paf 561: return *ftranscoder;
1.10 paf 562: }
563:
1.33.2.14! paf 564: CharPtr Charset::transcode_cstr(xmlChar* s) {
1.13 paf 565: if(!s)
1.14 paf 566: return "";
1.8 paf 567:
1.33.2.6 paf 568: int inlen=strlen((const char* )s);
1.8 paf 569: int outlen=inlen+1; // max
1.33.2.10 paf 570: char *out=new char[outlen];
1.8 paf 571:
1.30 paf 572: int error;
1.33.2.14! paf 573: if(xmlCharEncodingOutputFunc output=transcoder(Exception::undefined_source).output) {
1.30 paf 574: error=output(
1.17 paf 575: (unsigned char*)out, &outlen,
576: (const unsigned char*)s, &inlen,
1.33.2.14! paf 577: transcoder(Exception::undefined_source).outputInfo);
1.30 paf 578: } else {
579: memcpy(out, s, outlen=inlen);
580: error=0;
581: }
582: if(error<0)
1.33.2.14! paf 583: throw Exception(Exception::undefined_type,
! 584: Exception::undefined_source,
1.30 paf 585: "transcode_cstr failed (%d)", error);
1.8 paf 586:
1.30 paf 587: out[outlen/*surely would be less then on input*/]=0;
1.8 paf 588: return out;
1.14 paf 589: }
1.33.2.14! paf 590: StringPtr Charset::transcode(xmlChar* s
1.31 paf 591: #ifndef NO_STRING_ORIGIN
1.33.2.14! paf 592: , StringPtr origin
1.31 paf 593: #endif
594: ) {
1.33.2.14! paf 595: StringPtr result(new String());
! 596: result->APPEND_CLEAN(transcode_cstr(s), 0/*auto-size*/, origin->origin().file, origin->origin().line);
1.31 paf 597: return result;
1.14 paf 598: }
1.33.2.14! paf 599: const char* Charset::transcode_cstr(GdomeDOMString* s) {
1.14 paf 600: return s?transcode_cstr(BAD_CAST s->str):"";
1.1 paf 601: }
1.33.2.14! paf 602: StringPtr Charset::transcode(GdomeDOMString* s
1.31 paf 603: #ifndef NO_STRING_ORIGIN
604: , const String *origin
605: #endif
606: ) {
1.33.2.14! paf 607: StringPtr result(new String());
! 608: result->APPEND_CLEAN(transcode_cstr(s), 0/*auto-size*/, origin->origin().file, origin->origin().line);
1.31 paf 609: return result;
1.1 paf 610: }
611:
1.8 paf 612: /// @test less memory using -maybe- xmlParserInputBufferCreateMem
1.33.2.14! paf 613: xmlChar* Charset::transcode_buf2xchar(const char* buf, size_t buf_size) {
! 614: unsigned char* out;
1.30 paf 615: int outlen;
616: int error;
1.33.2.14! paf 617: if(xmlCharEncodingInputFunc input=transcoder(Exception::undefined_source).input) {
1.32 paf 618: outlen=buf_size*6/*max*/;
1.33.2.14! paf 619: out=(unsigned char*)malloc(sizeof(xmlChar)*(outlen+1));
1.30 paf 620: error=input(
1.17 paf 621: out, &outlen,
622: (const unsigned char *)buf, (int *)&buf_size,
1.33.2.14! paf 623: transcoder(Exception::undefined_source).inputInfo);
1.30 paf 624: } else {
625: outlen=buf_size;
1.33.2.14! paf 626: out=(unsigned char*)malloc(sizeof(xmlChar)*(outlen+1));
1.30 paf 627: memcpy(out, buf, outlen);
628: error=0;
629: }
1.17 paf 630:
1.30 paf 631: if(error<0)
1.33.2.14! paf 632: throw Exception(Exception::undefined_type,
! 633: Exception::undefined_source,
1.30 paf 634: "transcode_buf failed (%d)", error);
1.8 paf 635:
1.30 paf 636: out[outlen/*surely would be less then on input*/]=0;
1.33.2.14! paf 637: return (xmlChar*)out;
1.24 paf 638: }
1.33.2.6 paf 639: GdomeDOMString_auto_ptr Charset::transcode_buf2dom(const char* buf, size_t buf_size) {
1.24 paf 640: return GdomeDOMString_auto_ptr((gchar*)transcode_buf2xchar(buf, buf_size));
1.1 paf 641: }
1.33.2.14! paf 642: GdomeDOMString_auto_ptr Charset::transcode(StringPtr s) {
! 643: CharPtr cstr=s->cstr(String::UL_UNSPECIFIED);
1.1 paf 644:
1.24 paf 645: return transcode_buf2dom(cstr, strlen(cstr));
1.1 paf 646: }
647: #endif
E-mail: