Annotation of parser3/src/main/pa_charset.C, revision 1.33.2.19.2.10
1.1 paf 1: /** @file
2: Parser: Charset connection implementation.
3:
1.33.2.6 paf 4: Copyright(c) 2001-2003 ArtLebedev Group (http://www.artlebedev.com)
1.4 paf 5: Author: Alexander Petrosyan<paf@design.ru>(http://paf.design.ru)
1.27 paf 6: */
1.1 paf 7:
1.33.2.19.2.1 (paf 8:): static const char* IDENT_CHARSET_C="$Date: 2003/03/25 10:25:25 $";
1.1 paf 9:
10: #include "pa_charset.h"
1.33.2.13 paf 11: #include "pa_charsets.h"
1.1 paf 12:
13: #ifdef XML
1.8 paf 14: #include "libxml/encoding.h"
1.1 paf 15: #endif
16:
17: // helpers
18:
19: inline void prepare_case_tables(unsigned char *tables) {
20: unsigned char *lcc_table=tables+lcc_offset;
21: unsigned char *fcc_table=tables+fcc_offset;
22: for(int i=0; i<0x100; i++)
23: lcc_table[i]=fcc_table[i]=i;
24: }
25: inline void cstr2ctypes(unsigned char *tables, const unsigned char *cstr,
26: unsigned char bit) {
27: unsigned char *ctypes_table=tables+ctypes_offset;
28: ctypes_table[0]=bit;
29: for(; *cstr; cstr++) {
30: unsigned char c=*cstr;
31: ctypes_table[c]|=bit;
32: }
33: }
/**
	Converts a definition-file cell to a character code.

	@param cstr cell text, may be null
	@return
		- 0 for a null or empty cell;
		- the byte value of the character for a one-character cell
		  (so "7" yields the code of the character '7', not the number 7);
		- otherwise the number parsed by strtol with base 0,
		  i.e. decimal, 0x-prefixed hexadecimal or 0-prefixed octal.
*/
inline unsigned int to_wchar_code(const char* cstr) {
	if(!cstr || !*cstr)
		return 0;
	if(cstr[1]==0)
		return (unsigned int)(unsigned char)cstr[0];

	// the end position was never examined, so do not ask strtol for it
	return (unsigned int)strtol(cstr, 0, 0);
}
/// A cell counts as "true" when it is present and not empty.
inline bool to_bool(const char* cstr) {
	if(!cstr)
		return false;
	return cstr[0]!='\0';
}
46: static void element2ctypes(unsigned char c, bool belongs,
47: unsigned char *tables, unsigned char bit, int group_offset=-1) {
48: if(!belongs)
49: return;
50:
51: unsigned char *ctypes_table=tables+ctypes_offset;
52:
53: ctypes_table[c]|=bit;
54: if(group_offset>=0)
1.4 paf 55: tables[cbits_offset+group_offset+c/8] |= 1<<(c%8);
1.1 paf 56: }
57: static void element2case(unsigned char from, unsigned char to,
58: unsigned char *tables) {
59: if(!to)
60: return;
61:
62: unsigned char *lcc_table=tables+lcc_offset;
63: unsigned char *fcc_table=tables+fcc_offset;
64: lcc_table[from]=to;
65: fcc_table[from]=to; fcc_table[to]=from;
66: }
67:
68: // methods
69:
70: extern "C" unsigned char pcre_default_tables[]; // pcre/chartables.c
1.33.2.19.2.5 (paf 71:: Charset::Charset(const String& aname, const String* afile_spec):
1.33.2.1 paf 72: fname(aname),
1.33.2.19.2.5 (paf 73:: fname_cstr(aname.cstrm()) {
1.1 paf 74:
1.33.2.1 paf 75: for(char *c=fname_cstr; *c; c++)
1.33.2.19.2.5 (paf 76:: *c = toupper(*c);
1.7 paf 77:
1.33.2.3 paf 78: if(afile_spec) {
1.1 paf 79: fisUTF8=false;
1.33.2.19.2.5 (paf 80:: load_definition(*afile_spec);
1.1 paf 81: #ifdef XML
1.33.2.1 paf 82: addEncoding(fname_cstr);
1.1 paf 83: #endif
84: } else {
85: fisUTF8=true;
1.4 paf 86: // grab default onces [for UTF-8 so to be able to make a-z =>A-Z
1.1 paf 87: memcpy(pcre_tables, pcre_default_tables, sizeof(pcre_tables));
88: }
89:
90: #ifdef XML
1.33.2.1 paf 91: initTranscoder(fname, fname_cstr);
1.1 paf 92: #endif
93: }
94:
95: Charset::~Charset() {
1.33.2.1 paf 96: // @todonow unregister encodings
1.1 paf 97: #ifdef XML
1.9 paf 98: // not deleting transcoder, that's not our business
1.1 paf 99: #endif
100: }
101:
1.33.2.19.2.1 (paf 102:: void Charset::load_definition(const String& afile_spec) {
1.1 paf 103: // pcre_tables
104: // lowcase, flipcase, bits digit+word+whitespace, masks
105:
106: // must not move this inside of prepare_case_tables
107: // don't know the size there
108: memset(pcre_tables, 0, sizeof(pcre_tables));
109: prepare_case_tables(pcre_tables);
1.4 paf 110: cstr2ctypes(pcre_tables,(const unsigned char *)"*+?{^.$|()[", ctype_meta);
1.1 paf 111:
112: // charset
1.33.2.12 paf 113: memset(&tables, 0, sizeof(tables));
1.1 paf 114: // strangly vital
1.10 paf 115: tables.toTable[tables.toTableSize].intCh=0;
116: tables.toTable[tables.toTableSize].extCh=(XMLByte)0;
117: tables.toTableSize++;
1.1 paf 118:
119: // loading text
1.33.2.19.2.5 (paf 120:: char *data=file_read_text(UTF8_charset, afile_spec);
1.1 paf 121:
122: // ignore header
123: getrow(&data);
124:
125: // parse cells
126: char *row;
127: while(row=getrow(&data)) {
128: // remove empty&comment lines
129: if(!*row || *row=='#')
130: continue;
131:
132: // char white-space digit hex-digit letter word lowercase unicode1 unicode2
133: unsigned int c=0;
134: char *cell;
135: for(int column=0; cell=lsplit(&row, '\t'); column++) {
136: switch(column) {
137: case 0: c=to_wchar_code(cell); break;
138: // pcre_tables
139: case 1: element2ctypes(c, to_bool(cell), pcre_tables, ctype_space, cbit_space); break;
140: case 2: element2ctypes(c, to_bool(cell), pcre_tables, ctype_digit, cbit_digit); break;
141: case 3: element2ctypes(c, to_bool(cell), pcre_tables, ctype_xdigit); break;
142: case 4: element2ctypes(c, to_bool(cell), pcre_tables, ctype_letter); break;
143: case 5: element2ctypes(c, to_bool(cell), pcre_tables, ctype_word, cbit_word); break;
144: case 6: element2case(c, to_wchar_code(cell), pcre_tables); break;
145: case 7:
146: case 8:
147: // charset
1.10 paf 148: if(tables.toTableSize>MAX_CHARSET_UNI_CODES)
1.23 paf 149: throw Exception("parser.runtime",
1.33.2.19.2.5 (paf 150:: &afile_spec,
1.1 paf 151: "charset must contain not more then %d unicode values", MAX_CHARSET_UNI_CODES);
152:
153: XMLCh unicode=(XMLCh)to_wchar_code(cell);
154: if(!unicode && column==7/*unicode1 column*/)
155: unicode=(XMLCh)c;
156: if(unicode) {
1.10 paf 157: if(!tables.fromTable[c])
158: tables.fromTable[c]=unicode;
159: tables.toTable[tables.toTableSize].intCh=unicode;
160: tables.toTable[tables.toTableSize].extCh=(XMLByte)c;
161: tables.toTableSize++;
1.1 paf 162: }
163: break;
164: }
165: }
166: };
167:
168: // sort by the Unicode code point
169: sort_ToTable();
170: }
171:
172: static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
173: return
174: static_cast<const Charset_TransRec *>(a)->intCh-
175: static_cast<const Charset_TransRec *>(b)->intCh;
176: }
177:
178: void Charset::sort_ToTable() {
1.10 paf 179: _qsort(tables.toTable, tables.toTableSize, sizeof(*tables.toTable),
1.1 paf 180: sort_cmp_Trans_rec_intCh);
181: //FILE *f=fopen("c:\\temp\\a", "wb");
1.10 paf 182: //fwrite(tables.toTable, tables.toTableSize, sizeof(*tables.toTable), f);
1.1 paf 183: //fclose(f);
184: }
185:
1.10 paf 186: static XMLByte xlatOneTo(const XMLCh toXlat,
1.25 paf 187: const Charset::Tables& tables,
188: XMLByte not_found) {
1.1 paf 189: unsigned int lowOfs = 0;
1.10 paf 190: unsigned int hiOfs = tables.toTableSize - 1;
1.1 paf 191: XMLByte curByte = 0;
192: do {
193: // Calc the mid point of the low and high offset.
1.4 paf 194: const unsigned int midOfs =((hiOfs - lowOfs) / 2)+lowOfs;
1.1 paf 195:
196: // If our test char is greater than the mid point char, then
197: // we move up to the upper half. Else we move to the lower
198: // half. If its equal, then its our guy.
1.10 paf 199: if(toXlat>tables.toTable[midOfs].intCh)
1.1 paf 200: lowOfs = midOfs;
1.10 paf 201: else if(toXlat<tables.toTable[midOfs].intCh)
1.1 paf 202: hiOfs = midOfs;
203: else
1.10 paf 204: return tables.toTable[midOfs].extCh;
1.4 paf 205: } while(lowOfs+1<hiOfs);
1.1 paf 206:
1.25 paf 207: return not_found;
1.1 paf 208: }
209:
1.33.2.19.2.1 (paf 210:: void Charset::transcode(
1.33.2.14 paf 211: const Charset& source_charset, const void* source_body, size_t source_content_length,
1.1 paf 212: const Charset& dest_charset, const void *& dest_body, size_t& dest_content_length
213: ) {
1.4 paf 214: if(!source_content_length) {
215: dest_body=0;
216: dest_content_length=0;
217: return;
218: }
219:
1.1 paf 220: switch((source_charset.isUTF8()?0x10:0x00)|(dest_charset.isUTF8()?0x01:0x00)) {
221: default: // 0x00
1.33.2.19.2.3 (paf 222:: source_charset.transcodeToCharset(dest_charset,
1.1 paf 223: source_body, source_content_length,
224: dest_body, dest_content_length);
225: break;
226: case 0x01:
1.33.2.19.2.5 (paf 227:: source_charset.transcodeToUTF8(
1.1 paf 228: source_body, source_content_length,
229: dest_body, dest_content_length);
230: break;
231: case 0x10:
1.33.2.19.2.5 (paf 232:: dest_charset.transcodeFromUTF8(
1.1 paf 233: source_body, source_content_length,
234: dest_body, dest_content_length);
235: break;
236: case 0x11:
1.33.2.19.2.5 (paf 237:: dest_body=source_body;
238:: dest_content_length=source_content_length;
1.1 paf 239: break;
240: }
241: }
242:
243: // ---------------------------------------------------------------------------
244: // Local static data
245: //
246: // gUTFBytes
247: // A list of counts of trailing bytes for each initial byte in the input.
248: //
249: // gUTFOffsets
//      A list of values to offset each result char type, according to how
//      many source bytes went into making it.
252: //
253: // gFirstByteMark
254: // A list of values to mask onto the first byte of an encoded sequence,
255: // indexed by the number of bytes used to create the sequence.
256: // ---------------------------------------------------------------------------
257: static const XMLByte gUTFBytes[0x100] = {
258: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
259: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
260: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
261: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
262: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
263: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
264: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
265: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
266: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
267: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
268: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
269: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
270: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
271: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
272: , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
273: , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
274: };
275:
276: static const uint gUTFOffsets[6] = {
277: 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
278: };
279:
280: static const XMLByte gFirstByteMark[7] = {
281: 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
282: };
283:
1.11 paf 284: static int transcodeToUTF8(
285: const XMLByte* srcData, size_t& srcLen,
286: XMLByte *toFill, size_t& toFillLen,
1.10 paf 287: const Charset::Tables& tables) {
1.11 paf 288: const XMLByte* srcPtr=srcData;
289: const XMLByte* srcEnd=srcData+srcLen;
290: XMLByte* outPtr=toFill;
291: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 292:
1.4 paf 293: while(srcPtr<srcEnd) {
1.10 paf 294: uint curVal = tables.fromTable[*srcPtr];
1.1 paf 295: if(!curVal) {
296: // use the replacement character
1.4 paf 297: *outPtr++= '?';
298: srcPtr++;
1.1 paf 299: continue;
300: }
301:
302: // Figure out how many bytes we need
303: unsigned int encodedBytes;
1.4 paf 304: if(curVal<0x80)
1.1 paf 305: encodedBytes = 1;
1.4 paf 306: else if(curVal<0x800)
1.1 paf 307: encodedBytes = 2;
1.4 paf 308: else if(curVal<0x10000)
1.1 paf 309: encodedBytes = 3;
1.4 paf 310: else if(curVal<0x200000)
1.1 paf 311: encodedBytes = 4;
1.4 paf 312: else if(curVal<0x4000000)
1.1 paf 313: encodedBytes = 5;
1.4 paf 314: else if(curVal<= 0x7FFFFFFF)
1.1 paf 315: encodedBytes = 6;
316: else {
317: // use the replacement character
1.4 paf 318: *outPtr++= '?';
319: srcPtr++;
1.1 paf 320: continue;
321: }
322:
1.10 paf 323: // If we cannot fully get this char into the output buffer
324: if (outPtr + encodedBytes > outEnd)
325: break;
1.1 paf 326:
327: // We can do it, so update the source index
328: srcPtr++;
329:
330: // And spit out the bytes. We spit them out in reverse order
331: // here, so bump up the output pointer and work down as we go.
1.4 paf 332: outPtr+= encodedBytes;
1.1 paf 333: switch(encodedBytes) {
1.18 paf 334: case 6: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 335: curVal>>= 6;
1.18 paf 336: case 5: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 337: curVal>>= 6;
1.18 paf 338: case 4: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 339: curVal>>= 6;
1.18 paf 340: case 3: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 341: curVal>>= 6;
1.18 paf 342: case 2: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 343: curVal>>= 6;
1.18 paf 344: case 1: *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]);
1.1 paf 345: }
346:
347: // Add the encoded bytes back in again to indicate we've eaten them
1.4 paf 348: outPtr+= encodedBytes;
1.1 paf 349: }
350:
1.11 paf 351: // Update the bytes eaten
352: srcLen = srcPtr - srcData;
353:
354: // Return the characters read
355: toFillLen = outPtr - toFill;
356:
1.29 paf 357: //return srcPtr==srcEnd?(int)toFillLen:-1;
358: /*
359: xmlCharEncodingInputFunc
360: Returns :
361: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
362: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
363: of ocetes consumed.
364: */
365: return 0;
1.1 paf 366: }
1.26 paf 367: /// @todo digital entites only when xml/html output [at output in html/xml mode, in html part of a letter]
1.30 paf 368: static int transcodeFromUTF8(
1.33.2.14 paf 369: const XMLByte* srcData, size_t& srcLen,
1.11 paf 370: XMLByte* toFill, size_t& toFillLen,
371: const Charset::Tables& tables) {
372: const XMLByte* srcPtr=srcData;
373: const XMLByte* srcEnd=srcData+srcLen;
374: XMLByte* outPtr=toFill;
375: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 376:
1.10 paf 377: // We now loop until we either run out of input data, or room to store
378: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
1.1 paf 379: // Get the next leading byte out
1.33.2.14 paf 380: const XMLByte firstByte =* srcPtr;
1.1 paf 381:
1.4 paf 382: // Special-case ASCII, which is a leading byte value of<= 127
383: if(firstByte<= 127) {
384: *outPtr++= firstByte;
1.1 paf 385: srcPtr++;
386: continue;
387: }
388:
389: // See how many trailing src bytes this sequence is going to require
390: const unsigned int trailingBytes = gUTFBytes[firstByte];
391:
392: // If there are not enough source bytes to do this one, then we
1.4 paf 393: // are done. Note that we done>= here because we are implicitly
1.1 paf 394: // counting the 1 byte we get no matter what.
1.4 paf 395: if(srcPtr+trailingBytes>= srcEnd)
1.1 paf 396: break;
397:
398: // Looks ok, so lets build up the value
399: uint tmpVal=0;
400: switch(trailingBytes) {
401: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
402: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
403: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
404: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
405: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
406: case 0: tmpVal+=*srcPtr++;
407: break;
408:
409: default:
1.23 paf 410: throw Exception(0,
1.33.2.19.2.2 (paf 411:: 0,
1.4 paf 412: "transcodeFromUTF8 error: wrong trailingBytes value(%d)", trailingBytes);
1.1 paf 413: }
414: tmpVal-=gUTFOffsets[trailingBytes];
415:
416: // If it will fit into a single char, then put it in. Otherwise
417: // fail [*encode it as a surrogate pair. If its not valid, use the
418: // replacement char.*]
1.25 paf 419: if(!(tmpVal & 0xFFFF0000)) {
420: if(XMLByte xlat=xlatOneTo(tmpVal, tables, 0))
421: *outPtr++=xlat;
422: else
423: outPtr+=sprintf((char *)outPtr, "&#%d;", tmpVal); // &#decimal;
424: } else
1.23 paf 425: throw Exception(0,
1.33.2.19.2.2 (paf 426:: 0,
1.4 paf 427: "transcodeFromUTF8 error: too big tmpVal(0x%08X)", tmpVal);
1.1 paf 428: }
429:
1.11 paf 430: // Update the bytes eaten
431: srcLen = srcPtr - srcData;
432:
433: // Return the characters read
434: toFillLen = outPtr - toFill;
435:
1.29 paf 436: //return srcPtr==srcEnd?(int)toFillLen:-1;
437: /*
438: xmlCharEncodingOutputFunc
439: Returns :
440: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
441: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
442: of ocetes consumed.
443: */
444: return 0;
1.10 paf 445: }
446:
447: /// @todo not so memory-hungry with prescan
1.33.2.19.2.1 (paf 448:: void Charset::transcodeToUTF8(
1.33.2.14 paf 449: const void* source_body, size_t source_content_length,
1.11 paf 450: const void *& adest_body, size_t& dest_content_length) const {
1.25 paf 451: dest_content_length=source_content_length*6/*so that surly enough, max utf8 seq len=6*/;
1.33.2.19.2.1 (paf 452:: XMLByte *dest_body=new XMLByte[dest_content_length];
1.11 paf 453:
454: if(::transcodeToUTF8(
455: (XMLByte *)source_body, source_content_length,
456: dest_body, dest_content_length,
457: tables)<0)
1.10 paf 458: throw(0, 0,
459: 0,
1.11 paf 460: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 461:
1.1 paf 462: // return
463: adest_body=dest_body;
1.10 paf 464: }
1.33.2.19.2.1 (paf 465:: void Charset::transcodeFromUTF8(
1.33.2.14 paf 466: const void* source_body, size_t source_content_length,
1.11 paf 467: const void *& adest_body, size_t& dest_content_length) const {
1.25 paf 468: dest_content_length=source_content_length*6/*so that surly enough, "ÿ" has max ratio */;
1.33.2.19.2.1 (paf 469:: XMLByte *dest_body=new XMLByte[dest_content_length];
1.11 paf 470:
471: if(::transcodeFromUTF8(
472: (XMLByte *)source_body, source_content_length,
473: dest_body, dest_content_length,
474: tables)<0)
1.10 paf 475: throw(0, 0,
476: 0,
1.11 paf 477: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 478:
479: // return
480: adest_body=dest_body;
1.1 paf 481: }
482:
483: /// transcode using both charsets
1.33.2.19.2.1 (paf 484:: void Charset::transcodeToCharset(
1.1 paf 485: const Charset& dest_charset,
1.33.2.14 paf 486: const void* source_body, size_t source_content_length,
1.6 paf 487: const void *& adest_body, size_t& adest_content_length) const {
1.3 paf 488: if(&dest_charset==this) {
1.33.2.19.2.5 (paf 489:: adest_body=source_body;
490:: adest_content_length=source_content_length;
1.6 paf 491: } else {
492: size_t dest_content_length=source_content_length;
1.33.2.19.2.1 (paf 493:: unsigned char *dest_body=new unsigned char[dest_content_length];
1.6 paf 494:
1.11 paf 495: const XMLByte* srcPtr=(XMLByte *)source_body;
496: const XMLByte* srcEnd=(XMLByte *)source_body+source_content_length;
1.6 paf 497:
498: for(XMLByte* outPtr=dest_body; srcPtr<srcEnd; srcPtr++) {
1.10 paf 499: XMLCh curVal = tables.fromTable[*srcPtr];
1.6 paf 500: if(curVal)
1.25 paf 501: *outPtr++=xlatOneTo(curVal, dest_charset.tables, '?');
1.6 paf 502: else {
503: // use the replacement character
504: *outPtr++= '?';
505: }
506: }
1.1 paf 507:
1.6 paf 508: adest_body=dest_body;
509: adest_content_length=dest_content_length;
510: }
1.1 paf 511: }
512:
513: #ifdef XML
1.10 paf 514: static int xml256CharEncodingInputFunc (
515: unsigned char *out,
516: int *outlen,
517: const unsigned char *in,
518: int *inlen,
519: void *info) {
520: return transcodeToUTF8(
1.33.2.19.2.9 (paf 521:: in, *(size_t*)inlen,
522:: out, *(size_t*)outlen,
523:: *(const Charset::Tables *)info);
1.10 paf 524: }
525:
526: static int xml256CharEncodingOutputFunc (
527: unsigned char *out,
528: int *outlen,
529: const unsigned char *in,
530: int *inlen,
531: void *info) {
532: return transcodeFromUTF8(
1.33.2.19.2.9 (paf 533:: in, *(size_t*)inlen,
534:: out, *(size_t*)outlen,
535:: *(const Charset::Tables *)info);
1.10 paf 536: }
537:
538:
539: void Charset::addEncoding(char *name_cstr) {
1.33.2.19.2.9 (paf 540:: xmlCharEncodingHandler* handler=new(PointerFreeGC) xmlCharEncodingHandler;
1.10 paf 541: handler->name=name_cstr;
542: handler->input=xml256CharEncodingInputFunc; handler->inputInfo=&tables;
543: handler->output=xml256CharEncodingOutputFunc; handler->outputInfo=&tables;
544:
545: xmlRegisterCharEncodingHandler(handler);
546: }
547:
1.33.2.19.2.7 (paf 548:: void Charset::initTranscoder(const String& name, const char* name_cstr) {
1.15 paf 549: ftranscoder=xmlFindCharEncodingHandler(name_cstr);
1.33.2.19.2.7 (paf 550:: transcoder(name); // check right way
1.15 paf 551: }
552:
1.33.2.19.2.7 (paf 553:: xmlCharEncodingHandler& Charset::transcoder(const String& name) {
1.15 paf 554: if(!ftranscoder)
1.23 paf 555: throw Exception("parser.runtime",
1.33.2.19.2.7 (paf 556:: &name,
1.10 paf 557: "unsupported encoding");
1.33.2.14 paf 558: return *ftranscoder;
1.10 paf 559: }
560:
1.33.2.19.2.7 (paf 561:: const char* Charset::transcode_cstr(xmlChar* s) {
1.13 paf 562: if(!s)
1.14 paf 563: return "";
1.8 paf 564:
1.33.2.6 paf 565: int inlen=strlen((const char* )s);
1.8 paf 566: int outlen=inlen+1; // max
1.33.2.19.2.6 (paf 567:: char *out=new(PointerFreeGC) char[outlen];
1.8 paf 568:
1.30 paf 569: int error;
1.33.2.19.2.7 (paf 570:: if(xmlCharEncodingOutputFunc output=transcoder(fname).output) {
1.30 paf 571: error=output(
1.17 paf 572: (unsigned char*)out, &outlen,
573: (const unsigned char*)s, &inlen,
1.33.2.19.2.7 (paf 574:: transcoder(fname).outputInfo);
1.30 paf 575: } else {
576: memcpy(out, s, outlen=inlen);
577: error=0;
578: }
579: if(error<0)
1.33.2.19.2.2 (paf 580:: throw Exception(0,
581:: 0,
1.30 paf 582: "transcode_cstr failed (%d)", error);
1.8 paf 583:
1.30 paf 584: out[outlen/*surely would be less then on input*/]=0;
1.8 paf 585: return out;
1.14 paf 586: }
1.33.2.19.2.7 (paf 587:: const String& Charset::transcode(xmlChar* s) {
1.33.2.19.2.1 (paf 588:): return *new String(transcode_cstr(s), 0/*auto-size*/, true);
1.14 paf 589: }
1.33.2.19.2.7 (paf 590:: const char* Charset::transcode_cstr(GdomeDOMString* s) {
1.33.2.19.2.3 (paf 591:: return s?transcode_cstr(BAD_CAST s->str):"";
1.1 paf 592: }
1.33.2.19.2.7 (paf 593:: const String& Charset::transcode(GdomeDOMString* s) {
1.33.2.19.2.1 (paf 594:): return *new String(transcode_cstr(s), 0/*auto-size*/, true);
1.1 paf 595: }
596:
1.8 paf 597: /// @test less memory using -maybe- xmlParserInputBufferCreateMem
1.33.2.16 paf 598: void* Charset::transcode_buf2mchar(transcode_buf_malloc_func malloc_func,
1.33.2.19.2.8 (paf 599:: const char* buf, size_t buf_size) {
1.33.2.14 paf 600: unsigned char* out;
1.30 paf 601: int outlen;
602: int error;
1.33.2.19.2.7 (paf 603:: if(xmlCharEncodingInputFunc input=transcoder(fname).input) {
1.32 paf 604: outlen=buf_size*6/*max*/;
1.33.2.16 paf 605: out=(unsigned char*)malloc_func(outlen+1);
1.30 paf 606: error=input(
1.17 paf 607: out, &outlen,
608: (const unsigned char *)buf, (int *)&buf_size,
1.33.2.19.2.7 (paf 609:: transcoder(fname).inputInfo);
1.30 paf 610: } else {
611: outlen=buf_size;
1.33.2.16 paf 612: out=(unsigned char*)malloc_func(outlen+1);
1.30 paf 613: memcpy(out, buf, outlen);
614: error=0;
615: }
1.17 paf 616:
1.30 paf 617: if(error<0)
1.33.2.19.2.2 (paf 618:: throw Exception(0,
619:: 0,
1.30 paf 620: "transcode_buf failed (%d)", error);
1.8 paf 621:
1.30 paf 622: out[outlen/*surely would be less then on input*/]=0;
1.33.2.16 paf 623: return out;
624: }
625:
626: xmlChar* Charset::transcode_buf2xchar(const char* buf, size_t buf_size) {
627: return static_cast<xmlChar*>(transcode_buf2mchar(xmlMalloc, buf, buf_size));
628: }
629: static void* g_malloc_wrapper(size_t size) {
630: return g_malloc(size);
631: }
632: gchar* Charset::transcode_buf2gchar(const char* buf, size_t buf_size) {
633: return static_cast<gchar*>(transcode_buf2mchar(g_malloc_wrapper, buf, buf_size));
1.24 paf 634: }
1.33.2.6 paf 635: GdomeDOMString_auto_ptr Charset::transcode_buf2dom(const char* buf, size_t buf_size) {
1.33.2.16 paf 636: return GdomeDOMString_auto_ptr(transcode_buf2gchar(buf, buf_size));
1.1 paf 637: }
1.33.2.19.2.1 (paf 638:: GdomeDOMString_auto_ptr Charset::transcode(const String& s) {
1.33.2.19.2.7 (paf 639:: const char* cstr=s.cstr(String::L_UNSPECIFIED);
1.1 paf 640:
1.24 paf 641: return transcode_buf2dom(cstr, strlen(cstr));
1.33.2.19.2.8 (paf 642:: }
643:: GdomeDOMString_auto_ptr Charset::transcode(const StringBody s) {
644:: const char* cstr=s.cstr();
645::
646:: return transcode_buf2dom(cstr, s.length());
1.1 paf 647: }
648: #endif
E-mail: