Annotation of parser3/src/main/pa_charset.C, revision 1.33.2.19
1.1 paf 1: /** @file
2: Parser: Charset connection implementation.
3:
1.33.2.6 paf 4: Copyright(c) 2001-2003 ArtLebedev Group (http://www.artlebedev.com)
1.4 paf 5: Author: Alexander Petrosyan<paf@design.ru>(http://paf.design.ru)
1.27 paf 6: */
1.1 paf 7:
1.33.2.19! paf 8: static const char* IDENT_CHARSET_C="$Date: 2003/03/07 15:01:04 $";
1.1 paf 9:
10: #include "pa_charset.h"
1.33.2.13 paf 11: #include "pa_charsets.h"
1.1 paf 12:
13: #ifdef XML
1.8 paf 14: #include "libxml/encoding.h"
1.1 paf 15: #endif
16:
17: // helpers
18:
19: inline void prepare_case_tables(unsigned char *tables) {
20: unsigned char *lcc_table=tables+lcc_offset;
21: unsigned char *fcc_table=tables+fcc_offset;
22: for(int i=0; i<0x100; i++)
23: lcc_table[i]=fcc_table[i]=i;
24: }
25: inline void cstr2ctypes(unsigned char *tables, const unsigned char *cstr,
26: unsigned char bit) {
27: unsigned char *ctypes_table=tables+ctypes_offset;
28: ctypes_table[0]=bit;
29: for(; *cstr; cstr++) {
30: unsigned char c=*cstr;
31: ctypes_table[c]|=bit;
32: }
33: }
/// Converts a definition-file cell to a character code.
/// Null or empty cell gives 0; a one-character cell gives the code of that
/// character itself; anything longer is parsed by strtol with base 0
/// (so decimal, 0x-hex and 0-octal notations are accepted).
inline unsigned int to_wchar_code(const char* cstr) {
	const bool is_empty=(cstr==0) || (cstr[0]==0);
	if(is_empty)
		return 0;

	const bool is_single_char=(cstr[1]==0);
	if(is_single_char) {
		const unsigned char literal=(unsigned char)cstr[0];
		return (unsigned int)literal;
	}

	char *parse_end=0;
	const long parsed=strtol(cstr, &parse_end, 0);
	return (unsigned int)parsed;
}
/// A cell counts as true when it is present and not empty.
inline bool to_bool(const char* cstr) {
	if(!cstr)
		return false;
	return cstr[0]!='\0';
}
46: static void element2ctypes(unsigned char c, bool belongs,
47: unsigned char *tables, unsigned char bit, int group_offset=-1) {
48: if(!belongs)
49: return;
50:
51: unsigned char *ctypes_table=tables+ctypes_offset;
52:
53: ctypes_table[c]|=bit;
54: if(group_offset>=0)
1.4 paf 55: tables[cbits_offset+group_offset+c/8] |= 1<<(c%8);
1.1 paf 56: }
57: static void element2case(unsigned char from, unsigned char to,
58: unsigned char *tables) {
59: if(!to)
60: return;
61:
62: unsigned char *lcc_table=tables+lcc_offset;
63: unsigned char *fcc_table=tables+fcc_offset;
64: lcc_table[from]=to;
65: fcc_table[from]=to; fcc_table[to]=from;
66: }
67:
68: // methods
69:
70: extern "C" unsigned char pcre_default_tables[]; // pcre/chartables.c
1.33.2.5 paf 71: Charset::Charset(StringPtr aname, StringPtr afile_spec):
1.33.2.1 paf 72: fname(aname),
73: fname_cstr(aname->cstr()) {
1.1 paf 74:
1.33.2.1 paf 75: for(char *c=fname_cstr; *c; c++)
1.10 paf 76: *c = toupper(*c);
1.7 paf 77:
1.33.2.3 paf 78: if(afile_spec) {
1.1 paf 79: fisUTF8=false;
1.33.2.9 paf 80: load_definition(afile_spec);
1.1 paf 81: #ifdef XML
1.33.2.1 paf 82: addEncoding(fname_cstr);
1.1 paf 83: #endif
84: } else {
85: fisUTF8=true;
1.4 paf 86: // grab default onces [for UTF-8 so to be able to make a-z =>A-Z
1.1 paf 87: memcpy(pcre_tables, pcre_default_tables, sizeof(pcre_tables));
88: }
89:
90: #ifdef XML
1.33.2.1 paf 91: initTranscoder(fname, fname_cstr);
1.1 paf 92: #endif
93: }
94:
95: Charset::~Charset() {
1.33.2.1 paf 96: // @todonow unregister encodings
1.1 paf 97: #ifdef XML
1.9 paf 98: // not deleting transcoder, that's not our business
1.1 paf 99: #endif
100: }
101:
1.33.2.5 paf 102: void Charset::load_definition(StringPtr afile_spec) {
1.1 paf 103: // pcre_tables
104: // lowcase, flipcase, bits digit+word+whitespace, masks
105:
106: // must not move this inside of prepare_case_tables
107: // don't know the size there
108: memset(pcre_tables, 0, sizeof(pcre_tables));
109: prepare_case_tables(pcre_tables);
1.4 paf 110: cstr2ctypes(pcre_tables,(const unsigned char *)"*+?{^.$|()[", ctype_meta);
1.1 paf 111:
112: // charset
1.33.2.12 paf 113: memset(&tables, 0, sizeof(tables));
1.1 paf 114: // strangly vital
1.10 paf 115: tables.toTable[tables.toTableSize].intCh=0;
116: tables.toTable[tables.toTableSize].extCh=(XMLByte)0;
117: tables.toTableSize++;
1.1 paf 118:
119: // loading text
1.33.2.10 paf 120: Pool pool_for_load_only;
121: char *data=file_read_text(pool_for_load_only, *UTF8_charset, afile_spec);
1.1 paf 122:
123: // ignore header
124: getrow(&data);
125:
126: // parse cells
127: char *row;
128: while(row=getrow(&data)) {
129: // remove empty&comment lines
130: if(!*row || *row=='#')
131: continue;
132:
133: // char white-space digit hex-digit letter word lowercase unicode1 unicode2
134: unsigned int c=0;
135: char *cell;
136: for(int column=0; cell=lsplit(&row, '\t'); column++) {
137: switch(column) {
138: case 0: c=to_wchar_code(cell); break;
139: // pcre_tables
140: case 1: element2ctypes(c, to_bool(cell), pcre_tables, ctype_space, cbit_space); break;
141: case 2: element2ctypes(c, to_bool(cell), pcre_tables, ctype_digit, cbit_digit); break;
142: case 3: element2ctypes(c, to_bool(cell), pcre_tables, ctype_xdigit); break;
143: case 4: element2ctypes(c, to_bool(cell), pcre_tables, ctype_letter); break;
144: case 5: element2ctypes(c, to_bool(cell), pcre_tables, ctype_word, cbit_word); break;
145: case 6: element2case(c, to_wchar_code(cell), pcre_tables); break;
146: case 7:
147: case 8:
148: // charset
1.10 paf 149: if(tables.toTableSize>MAX_CHARSET_UNI_CODES)
1.23 paf 150: throw Exception("parser.runtime",
1.33.2.1 paf 151: afile_spec,
1.1 paf 152: "charset must contain not more then %d unicode values", MAX_CHARSET_UNI_CODES);
153:
154: XMLCh unicode=(XMLCh)to_wchar_code(cell);
155: if(!unicode && column==7/*unicode1 column*/)
156: unicode=(XMLCh)c;
157: if(unicode) {
1.10 paf 158: if(!tables.fromTable[c])
159: tables.fromTable[c]=unicode;
160: tables.toTable[tables.toTableSize].intCh=unicode;
161: tables.toTable[tables.toTableSize].extCh=(XMLByte)c;
162: tables.toTableSize++;
1.1 paf 163: }
164: break;
165: }
166: }
167: };
168:
169: // sort by the Unicode code point
170: sort_ToTable();
171: }
172:
173: static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
174: return
175: static_cast<const Charset_TransRec *>(a)->intCh-
176: static_cast<const Charset_TransRec *>(b)->intCh;
177: }
178:
179: void Charset::sort_ToTable() {
1.10 paf 180: _qsort(tables.toTable, tables.toTableSize, sizeof(*tables.toTable),
1.1 paf 181: sort_cmp_Trans_rec_intCh);
182: //FILE *f=fopen("c:\\temp\\a", "wb");
1.10 paf 183: //fwrite(tables.toTable, tables.toTableSize, sizeof(*tables.toTable), f);
1.1 paf 184: //fclose(f);
185: }
186:
1.10 paf 187: static XMLByte xlatOneTo(const XMLCh toXlat,
1.25 paf 188: const Charset::Tables& tables,
189: XMLByte not_found) {
1.1 paf 190: unsigned int lowOfs = 0;
1.10 paf 191: unsigned int hiOfs = tables.toTableSize - 1;
1.1 paf 192: XMLByte curByte = 0;
193: do {
194: // Calc the mid point of the low and high offset.
1.4 paf 195: const unsigned int midOfs =((hiOfs - lowOfs) / 2)+lowOfs;
1.1 paf 196:
197: // If our test char is greater than the mid point char, then
198: // we move up to the upper half. Else we move to the lower
199: // half. If its equal, then its our guy.
1.10 paf 200: if(toXlat>tables.toTable[midOfs].intCh)
1.1 paf 201: lowOfs = midOfs;
1.10 paf 202: else if(toXlat<tables.toTable[midOfs].intCh)
1.1 paf 203: hiOfs = midOfs;
204: else
1.10 paf 205: return tables.toTable[midOfs].extCh;
1.4 paf 206: } while(lowOfs+1<hiOfs);
1.1 paf 207:
1.25 paf 208: return not_found;
1.1 paf 209: }
210:
211: void Charset::transcode(Pool& pool,
1.33.2.14 paf 212: const Charset& source_charset, const void* source_body, size_t source_content_length,
1.1 paf 213: const Charset& dest_charset, const void *& dest_body, size_t& dest_content_length
214: ) {
1.4 paf 215: if(!source_content_length) {
216: dest_body=0;
217: dest_content_length=0;
218: return;
219: }
220:
1.1 paf 221: switch((source_charset.isUTF8()?0x10:0x00)|(dest_charset.isUTF8()?0x01:0x00)) {
222: default: // 0x00
223: source_charset.transcodeToCharset(pool, dest_charset,
224: source_body, source_content_length,
225: dest_body, dest_content_length);
226: break;
227: case 0x01:
228: source_charset.transcodeToUTF8(pool,
229: source_body, source_content_length,
230: dest_body, dest_content_length);
231: break;
232: case 0x10:
233: dest_charset.transcodeFromUTF8(pool,
234: source_body, source_content_length,
235: dest_body, dest_content_length);
236: break;
237: case 0x11:
1.33.2.19! paf 238: dest_body=pool.copy((char*)source_body, dest_content_length=source_content_length);
1.1 paf 239: break;
240: }
241: }
242:
243: // ---------------------------------------------------------------------------
244: // Local static data
245: //
246: // gUTFBytes
247: // A list of counts of trailing bytes for each initial byte in the input.
248: //
249: // gUTFOffsets
250: // A list of values to offset each result char type, according to how
251: // many source bytes went into making it.
252: //
253: // gFirstByteMark
254: // A list of values to mask onto the first byte of an encoded sequence,
255: // indexed by the number of bytes used to create the sequence.
256: // ---------------------------------------------------------------------------
257: static const XMLByte gUTFBytes[0x100] = {
258: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
259: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
260: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
261: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
262: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
263: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
264: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
265: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
266: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
267: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
268: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
269: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
270: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
271: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
272: , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
273: , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
274: };
275:
276: static const uint gUTFOffsets[6] = {
277: 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
278: };
279:
280: static const XMLByte gFirstByteMark[7] = {
281: 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
282: };
283:
1.11 paf 284: static int transcodeToUTF8(
285: const XMLByte* srcData, size_t& srcLen,
286: XMLByte *toFill, size_t& toFillLen,
1.10 paf 287: const Charset::Tables& tables) {
1.11 paf 288: const XMLByte* srcPtr=srcData;
289: const XMLByte* srcEnd=srcData+srcLen;
290: XMLByte* outPtr=toFill;
291: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 292:
1.4 paf 293: while(srcPtr<srcEnd) {
1.10 paf 294: uint curVal = tables.fromTable[*srcPtr];
1.1 paf 295: if(!curVal) {
296: // use the replacement character
1.4 paf 297: *outPtr++= '?';
298: srcPtr++;
1.1 paf 299: continue;
300: }
301:
302: // Figure out how many bytes we need
303: unsigned int encodedBytes;
1.4 paf 304: if(curVal<0x80)
1.1 paf 305: encodedBytes = 1;
1.4 paf 306: else if(curVal<0x800)
1.1 paf 307: encodedBytes = 2;
1.4 paf 308: else if(curVal<0x10000)
1.1 paf 309: encodedBytes = 3;
1.4 paf 310: else if(curVal<0x200000)
1.1 paf 311: encodedBytes = 4;
1.4 paf 312: else if(curVal<0x4000000)
1.1 paf 313: encodedBytes = 5;
1.4 paf 314: else if(curVal<= 0x7FFFFFFF)
1.1 paf 315: encodedBytes = 6;
316: else {
317: // use the replacement character
1.4 paf 318: *outPtr++= '?';
319: srcPtr++;
1.1 paf 320: continue;
321: }
322:
1.10 paf 323: // If we cannot fully get this char into the output buffer
324: if (outPtr + encodedBytes > outEnd)
325: break;
1.1 paf 326:
327: // We can do it, so update the source index
328: srcPtr++;
329:
330: // And spit out the bytes. We spit them out in reverse order
331: // here, so bump up the output pointer and work down as we go.
1.4 paf 332: outPtr+= encodedBytes;
1.1 paf 333: switch(encodedBytes) {
1.18 paf 334: case 6: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 335: curVal>>= 6;
1.18 paf 336: case 5: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 337: curVal>>= 6;
1.18 paf 338: case 4: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 339: curVal>>= 6;
1.18 paf 340: case 3: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 341: curVal>>= 6;
1.18 paf 342: case 2: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
1.4 paf 343: curVal>>= 6;
1.18 paf 344: case 1: *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]);
1.1 paf 345: }
346:
347: // Add the encoded bytes back in again to indicate we've eaten them
1.4 paf 348: outPtr+= encodedBytes;
1.1 paf 349: }
350:
1.11 paf 351: // Update the bytes eaten
352: srcLen = srcPtr - srcData;
353:
354: // Return the characters read
355: toFillLen = outPtr - toFill;
356:
1.29 paf 357: //return srcPtr==srcEnd?(int)toFillLen:-1;
358: /*
359: xmlCharEncodingInputFunc
360: Returns :
361: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
362: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
363: of ocetes consumed.
364: */
365: return 0;
1.1 paf 366: }
1.26 paf 367: /// @todo digital entites only when xml/html output [at output in html/xml mode, in html part of a letter]
1.30 paf 368: static int transcodeFromUTF8(
1.33.2.14 paf 369: const XMLByte* srcData, size_t& srcLen,
1.11 paf 370: XMLByte* toFill, size_t& toFillLen,
371: const Charset::Tables& tables) {
372: const XMLByte* srcPtr=srcData;
373: const XMLByte* srcEnd=srcData+srcLen;
374: XMLByte* outPtr=toFill;
375: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 376:
1.10 paf 377: // We now loop until we either run out of input data, or room to store
378: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
1.1 paf 379: // Get the next leading byte out
1.33.2.14 paf 380: const XMLByte firstByte =* srcPtr;
1.1 paf 381:
1.4 paf 382: // Special-case ASCII, which is a leading byte value of<= 127
383: if(firstByte<= 127) {
384: *outPtr++= firstByte;
1.1 paf 385: srcPtr++;
386: continue;
387: }
388:
389: // See how many trailing src bytes this sequence is going to require
390: const unsigned int trailingBytes = gUTFBytes[firstByte];
391:
392: // If there are not enough source bytes to do this one, then we
1.4 paf 393: // are done. Note that we done>= here because we are implicitly
1.1 paf 394: // counting the 1 byte we get no matter what.
1.4 paf 395: if(srcPtr+trailingBytes>= srcEnd)
1.1 paf 396: break;
397:
398: // Looks ok, so lets build up the value
399: uint tmpVal=0;
400: switch(trailingBytes) {
401: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
402: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
403: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
404: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
405: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
406: case 0: tmpVal+=*srcPtr++;
407: break;
408:
409: default:
1.23 paf 410: throw Exception(0,
1.33.2.1 paf 411: Exception::undefined_source,
1.4 paf 412: "transcodeFromUTF8 error: wrong trailingBytes value(%d)", trailingBytes);
1.1 paf 413: }
414: tmpVal-=gUTFOffsets[trailingBytes];
415:
416: // If it will fit into a single char, then put it in. Otherwise
417: // fail [*encode it as a surrogate pair. If its not valid, use the
418: // replacement char.*]
1.25 paf 419: if(!(tmpVal & 0xFFFF0000)) {
420: if(XMLByte xlat=xlatOneTo(tmpVal, tables, 0))
421: *outPtr++=xlat;
422: else
423: outPtr+=sprintf((char *)outPtr, "&#%d;", tmpVal); // &#decimal;
424: } else
1.23 paf 425: throw Exception(0,
1.33.2.1 paf 426: Exception::undefined_source,
1.4 paf 427: "transcodeFromUTF8 error: too big tmpVal(0x%08X)", tmpVal);
1.1 paf 428: }
429:
1.11 paf 430: // Update the bytes eaten
431: srcLen = srcPtr - srcData;
432:
433: // Return the characters read
434: toFillLen = outPtr - toFill;
435:
1.29 paf 436: //return srcPtr==srcEnd?(int)toFillLen:-1;
437: /*
438: xmlCharEncodingOutputFunc
439: Returns :
440: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
441: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
442: of ocetes consumed.
443: */
444: return 0;
1.10 paf 445: }
446:
447: /// @todo not so memory-hungry with prescan
448: void Charset::transcodeToUTF8(Pool& pool,
1.33.2.14 paf 449: const void* source_body, size_t source_content_length,
1.11 paf 450: const void *& adest_body, size_t& dest_content_length) const {
1.25 paf 451: dest_content_length=source_content_length*6/*so that surly enough, max utf8 seq len=6*/;
1.33.2.8 paf 452: XMLByte *dest_body=new(pool) XMLByte[dest_content_length];
1.11 paf 453:
454: if(::transcodeToUTF8(
455: (XMLByte *)source_body, source_content_length,
456: dest_body, dest_content_length,
457: tables)<0)
1.10 paf 458: throw(0, 0,
459: 0,
1.11 paf 460: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 461:
1.1 paf 462: // return
463: adest_body=dest_body;
1.10 paf 464: }
465: void Charset::transcodeFromUTF8(Pool& pool,
1.33.2.14 paf 466: const void* source_body, size_t source_content_length,
1.11 paf 467: const void *& adest_body, size_t& dest_content_length) const {
1.25 paf 468: dest_content_length=source_content_length*6/*so that surly enough, "ÿ" has max ratio */;
1.33.2.8 paf 469: XMLByte *dest_body=new(pool) XMLByte[dest_content_length];
1.11 paf 470:
471: if(::transcodeFromUTF8(
472: (XMLByte *)source_body, source_content_length,
473: dest_body, dest_content_length,
474: tables)<0)
1.10 paf 475: throw(0, 0,
476: 0,
1.11 paf 477: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 478:
479: // return
480: adest_body=dest_body;
1.1 paf 481: }
482:
483: /// transcode using both charsets
484: void Charset::transcodeToCharset(Pool& pool,
485: const Charset& dest_charset,
1.33.2.14 paf 486: const void* source_body, size_t source_content_length,
1.6 paf 487: const void *& adest_body, size_t& adest_content_length) const {
1.3 paf 488: if(&dest_charset==this) {
1.33.2.19! paf 489: adest_body=pool.copy((char*)source_body, adest_content_length=source_content_length);
1.6 paf 490: } else {
491: size_t dest_content_length=source_content_length;
1.33.2.8 paf 492: unsigned char *dest_body=new(pool) unsigned char[dest_content_length];
1.6 paf 493:
1.11 paf 494: const XMLByte* srcPtr=(XMLByte *)source_body;
495: const XMLByte* srcEnd=(XMLByte *)source_body+source_content_length;
1.6 paf 496:
497: for(XMLByte* outPtr=dest_body; srcPtr<srcEnd; srcPtr++) {
1.10 paf 498: XMLCh curVal = tables.fromTable[*srcPtr];
1.6 paf 499: if(curVal)
1.25 paf 500: *outPtr++=xlatOneTo(curVal, dest_charset.tables, '?');
1.6 paf 501: else {
502: // use the replacement character
503: *outPtr++= '?';
504: }
505: }
1.1 paf 506:
1.6 paf 507: adest_body=dest_body;
508: adest_content_length=dest_content_length;
509: }
1.1 paf 510: }
511:
512: #ifdef XML
1.10 paf 513: static int xml256CharEncodingInputFunc (
514: unsigned char *out,
515: int *outlen,
516: const unsigned char *in,
517: int *inlen,
518: void *info) {
519: return transcodeToUTF8(
1.21 paf 520: in, *(size_t*)inlen,
521: out, *(size_t*)outlen,
1.10 paf 522: *(const Charset::Tables *)info);
523: }
524:
525: static int xml256CharEncodingOutputFunc (
526: unsigned char *out,
527: int *outlen,
528: const unsigned char *in,
529: int *inlen,
530: void *info) {
531: return transcodeFromUTF8(
1.21 paf 532: in, *(size_t*)inlen,
533: out, *(size_t*)outlen,
1.10 paf 534: *(const Charset::Tables *)info);
535: }
536:
537:
538: void Charset::addEncoding(char *name_cstr) {
1.33.2.14 paf 539: xmlCharEncodingHandler* handler=new xmlCharEncodingHandler;
1.33.2.18 paf 540: fcreated_handler=xmlCharEncodingHandlerPtr(handler);
1.33.2.14 paf 541:
1.10 paf 542: handler->name=name_cstr;
543: handler->input=xml256CharEncodingInputFunc; handler->inputInfo=&tables;
544: handler->output=xml256CharEncodingOutputFunc; handler->outputInfo=&tables;
545:
546: xmlRegisterCharEncodingHandler(handler);
547: }
548:
1.33.2.14 paf 549: void Charset::initTranscoder(StringPtr source, const char* name_cstr) {
1.15 paf 550: ftranscoder=xmlFindCharEncodingHandler(name_cstr);
551: transcoder(source); // check right way
552: }
553:
1.33.2.14 paf 554: xmlCharEncodingHandler& Charset::transcoder(StringPtr source) {
1.15 paf 555: if(!ftranscoder)
1.23 paf 556: throw Exception("parser.runtime",
1.10 paf 557: source,
558: "unsupported encoding");
1.33.2.14 paf 559: return *ftranscoder;
1.10 paf 560: }
561:
1.33.2.17 paf 562: const char* Charset::transcode_cstr(Pool& pool, xmlChar* s) {
1.13 paf 563: if(!s)
1.14 paf 564: return "";
1.8 paf 565:
1.33.2.6 paf 566: int inlen=strlen((const char* )s);
1.8 paf 567: int outlen=inlen+1; // max
1.33.2.17 paf 568: char *out=new(pool) char[outlen];
1.8 paf 569:
1.30 paf 570: int error;
1.33.2.14 paf 571: if(xmlCharEncodingOutputFunc output=transcoder(Exception::undefined_source).output) {
1.30 paf 572: error=output(
1.17 paf 573: (unsigned char*)out, &outlen,
574: (const unsigned char*)s, &inlen,
1.33.2.14 paf 575: transcoder(Exception::undefined_source).outputInfo);
1.30 paf 576: } else {
577: memcpy(out, s, outlen=inlen);
578: error=0;
579: }
580: if(error<0)
1.33.2.14 paf 581: throw Exception(Exception::undefined_type,
582: Exception::undefined_source,
1.30 paf 583: "transcode_cstr failed (%d)", error);
1.8 paf 584:
1.30 paf 585: out[outlen/*surely would be less then on input*/]=0;
1.8 paf 586: return out;
1.14 paf 587: }
1.33.2.17 paf 588: StringPtr Charset::transcode(Pool& pool, xmlChar* s
1.31 paf 589: #ifndef NO_STRING_ORIGIN
1.33.2.14 paf 590: , StringPtr origin
1.31 paf 591: #endif
592: ) {
1.33.2.14 paf 593: StringPtr result(new String());
1.33.2.17 paf 594: result->APPEND_CLEAN(
595: transcode_cstr(pool, s), 0/*auto-size*/,
596: origin->origin().file, origin->origin().line);
1.31 paf 597: return result;
1.14 paf 598: }
1.33.2.17 paf 599: const char* Charset::transcode_cstr(Pool& pool, GdomeDOMString* s) {
600: return s?transcode_cstr(pool, BAD_CAST s->str):"";
1.1 paf 601: }
1.33.2.17 paf 602: StringPtr Charset::transcode(Pool& pool, GdomeDOMString* s
1.31 paf 603: #ifndef NO_STRING_ORIGIN
1.33.2.15 paf 604: , StringPtr origin
1.31 paf 605: #endif
606: ) {
1.33.2.14 paf 607: StringPtr result(new String());
1.33.2.17 paf 608: result->APPEND_CLEAN(transcode_cstr(pool, s), 0/*auto-size*/, origin->origin().file, origin->origin().line);
1.31 paf 609: return result;
1.1 paf 610: }
611:
1.8 paf 612: /// @test less memory using -maybe- xmlParserInputBufferCreateMem
1.33.2.16 paf 613: void* Charset::transcode_buf2mchar(transcode_buf_malloc_func malloc_func,
614: const char* buf, size_t buf_size) {
1.33.2.14 paf 615: unsigned char* out;
1.30 paf 616: int outlen;
617: int error;
1.33.2.14 paf 618: if(xmlCharEncodingInputFunc input=transcoder(Exception::undefined_source).input) {
1.32 paf 619: outlen=buf_size*6/*max*/;
1.33.2.16 paf 620: out=(unsigned char*)malloc_func(outlen+1);
1.30 paf 621: error=input(
1.17 paf 622: out, &outlen,
623: (const unsigned char *)buf, (int *)&buf_size,
1.33.2.14 paf 624: transcoder(Exception::undefined_source).inputInfo);
1.30 paf 625: } else {
626: outlen=buf_size;
1.33.2.16 paf 627: out=(unsigned char*)malloc_func(outlen+1);
1.30 paf 628: memcpy(out, buf, outlen);
629: error=0;
630: }
1.17 paf 631:
1.30 paf 632: if(error<0)
1.33.2.14 paf 633: throw Exception(Exception::undefined_type,
634: Exception::undefined_source,
1.30 paf 635: "transcode_buf failed (%d)", error);
1.8 paf 636:
1.30 paf 637: out[outlen/*surely would be less then on input*/]=0;
1.33.2.16 paf 638: return out;
639: }
640:
641: xmlChar* Charset::transcode_buf2xchar(const char* buf, size_t buf_size) {
642: return static_cast<xmlChar*>(transcode_buf2mchar(xmlMalloc, buf, buf_size));
643: }
644: static void* g_malloc_wrapper(size_t size) {
645: return g_malloc(size);
646: }
647: gchar* Charset::transcode_buf2gchar(const char* buf, size_t buf_size) {
648: return static_cast<gchar*>(transcode_buf2mchar(g_malloc_wrapper, buf, buf_size));
1.24 paf 649: }
1.33.2.6 paf 650: GdomeDOMString_auto_ptr Charset::transcode_buf2dom(const char* buf, size_t buf_size) {
1.33.2.16 paf 651: return GdomeDOMString_auto_ptr(transcode_buf2gchar(buf, buf_size));
1.1 paf 652: }
1.33.2.14 paf 653: GdomeDOMString_auto_ptr Charset::transcode(StringPtr s) {
654: CharPtr cstr=s->cstr(String::UL_UNSPECIFIED);
1.1 paf 655:
1.24 paf 656: return transcode_buf2dom(cstr, strlen(cstr));
1.1 paf 657: }
658: #endif
E-mail: