Annotation of parser3/src/main/pa_charset.C, revision 1.33.2.19.2.19
1.1 paf 1: /** @file
2: Parser: Charset connection implementation.
3:
1.33.2.6 paf 4: Copyright(c) 2001-2003 ArtLebedev Group (http://www.artlebedev.com)
1.4 paf 5: Author: Alexander Petrosyan<paf@design.ru>(http://paf.design.ru)
1.27 paf 6: */
1.1 paf 7:
1.33.2.19.2.1 (paf 8:): static const char* IDENT_CHARSET_C="$Date: 2003/04/09 08:01:20 $";
1.1 paf 9:
10: #include "pa_charset.h"
1.33.2.13 paf 11: #include "pa_charsets.h"
1.1 paf 12:
13: #ifdef XML
1.8 paf 14: #include "libxml/encoding.h"
1.1 paf 15: #endif
16:
17: // helpers
18:
19: inline void prepare_case_tables(unsigned char *tables) {
20: unsigned char *lcc_table=tables+lcc_offset;
21: unsigned char *fcc_table=tables+fcc_offset;
22: for(int i=0; i<0x100; i++)
23: lcc_table[i]=fcc_table[i]=i;
24: }
25: inline void cstr2ctypes(unsigned char *tables, const unsigned char *cstr,
26: unsigned char bit) {
27: unsigned char *ctypes_table=tables+ctypes_offset;
28: ctypes_table[0]=bit;
29: for(; *cstr; cstr++) {
30: unsigned char c=*cstr;
31: ctypes_table[c]|=bit;
32: }
33: }
/// Converts a definition-file cell to a character code.
/// - null or empty cell: 0
/// - exactly one character: the (unsigned) byte value of that character
/// - anything longer: parsed by strtol with base auto-detection (0x.., 0.., decimal)
inline unsigned int to_wchar_code(const char* cstr) {
	if(cstr==0 || cstr[0]=='\0')
		return 0;

	// longer than one character => numeric notation
	if(cstr[1]!='\0')
		return (unsigned int)strtol(cstr, 0, 0);

	// single character stands for itself
	return (unsigned int)(unsigned char)cstr[0];
}
/// A definition-file cell counts as "true" when it is present and non-empty.
inline bool to_bool(const char* cstr) {
	if(cstr==0)
		return false;
	return cstr[0]!='\0';
}
46: static void element2ctypes(unsigned char c, bool belongs,
47: unsigned char *tables, unsigned char bit, int group_offset=-1) {
48: if(!belongs)
49: return;
50:
51: unsigned char *ctypes_table=tables+ctypes_offset;
52:
53: ctypes_table[c]|=bit;
54: if(group_offset>=0)
1.4 paf 55: tables[cbits_offset+group_offset+c/8] |= 1<<(c%8);
1.1 paf 56: }
57: static void element2case(unsigned char from, unsigned char to,
58: unsigned char *tables) {
59: if(!to)
60: return;
61:
62: unsigned char *lcc_table=tables+lcc_offset;
63: unsigned char *fcc_table=tables+fcc_offset;
64: lcc_table[from]=to;
65: fcc_table[from]=to; fcc_table[to]=from;
66: }
67:
68: // methods
69:
70: extern "C" unsigned char pcre_default_tables[]; // pcre/chartables.c
1.33.2.19.2.1 (paf 71:): Charset::Charset(Request_charsets* charsets, const StringBody ANAME, const String* afile_spec):
72:): FNAME(ANAME),
73:): FNAME_CSTR(ANAME.cstrm()) {
1.7 paf 74:
1.33.2.3 paf 75: if(afile_spec) {
1.1 paf 76: fisUTF8=false;
1.33.2.19.2.1 (paf 77:): load_definition(*charsets, *afile_spec);
1.1 paf 78: #ifdef XML
1.33.2.19.2.1 (paf 79:): addEncoding(FNAME_CSTR);
1.1 paf 80: #endif
81: } else {
82: fisUTF8=true;
1.4 paf 83: // grab default onces [for UTF-8 so to be able to make a-z =>A-Z
1.1 paf 84: memcpy(pcre_tables, pcre_default_tables, sizeof(pcre_tables));
85: }
86:
87: #ifdef XML
1.33.2.19.2.1 (paf 88:): initTranscoder(FNAME, FNAME_CSTR);
1.1 paf 89: #endif
90: }
91:
1.33.2.19.2.1 (paf 92:): void Charset::load_definition(Request_charsets& charsets, const String& afile_spec) {
1.1 paf 93: // pcre_tables
94: // lowcase, flipcase, bits digit+word+whitespace, masks
95:
96: // must not move this inside of prepare_case_tables
97: // don't know the size there
98: memset(pcre_tables, 0, sizeof(pcre_tables));
99: prepare_case_tables(pcre_tables);
1.4 paf 100: cstr2ctypes(pcre_tables,(const unsigned char *)"*+?{^.$|()[", ctype_meta);
1.1 paf 101:
102: // charset
1.33.2.12 paf 103: memset(&tables, 0, sizeof(tables));
1.1 paf 104: // strangly vital
1.10 paf 105: tables.toTable[tables.toTableSize].intCh=0;
106: tables.toTable[tables.toTableSize].extCh=(XMLByte)0;
107: tables.toTableSize++;
1.1 paf 108:
109: // loading text
1.33.2.19.2.1 (paf 110:): char *data=file_read_text(charsets, afile_spec);
1.1 paf 111:
112: // ignore header
113: getrow(&data);
114:
115: // parse cells
116: char *row;
117: while(row=getrow(&data)) {
118: // remove empty&comment lines
119: if(!*row || *row=='#')
120: continue;
121:
122: // char white-space digit hex-digit letter word lowercase unicode1 unicode2
123: unsigned int c=0;
124: char *cell;
125: for(int column=0; cell=lsplit(&row, '\t'); column++) {
126: switch(column) {
127: case 0: c=to_wchar_code(cell); break;
128: // pcre_tables
129: case 1: element2ctypes(c, to_bool(cell), pcre_tables, ctype_space, cbit_space); break;
130: case 2: element2ctypes(c, to_bool(cell), pcre_tables, ctype_digit, cbit_digit); break;
131: case 3: element2ctypes(c, to_bool(cell), pcre_tables, ctype_xdigit); break;
132: case 4: element2ctypes(c, to_bool(cell), pcre_tables, ctype_letter); break;
133: case 5: element2ctypes(c, to_bool(cell), pcre_tables, ctype_word, cbit_word); break;
134: case 6: element2case(c, to_wchar_code(cell), pcre_tables); break;
135: case 7:
136: case 8:
137: // charset
1.10 paf 138: if(tables.toTableSize>MAX_CHARSET_UNI_CODES)
1.23 paf 139: throw Exception("parser.runtime",
1.33.2.19.2.5 (paf 140:: &afile_spec,
1.1 paf 141: "charset must contain not more then %d unicode values", MAX_CHARSET_UNI_CODES);
142:
143: XMLCh unicode=(XMLCh)to_wchar_code(cell);
144: if(!unicode && column==7/*unicode1 column*/)
145: unicode=(XMLCh)c;
146: if(unicode) {
1.10 paf 147: if(!tables.fromTable[c])
148: tables.fromTable[c]=unicode;
149: tables.toTable[tables.toTableSize].intCh=unicode;
150: tables.toTable[tables.toTableSize].extCh=(XMLByte)c;
151: tables.toTableSize++;
1.1 paf 152: }
153: break;
154: }
155: }
156: };
157:
158: // sort by the Unicode code point
159: sort_ToTable();
160: }
161:
162: static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
163: return
164: static_cast<const Charset_TransRec *>(a)->intCh-
165: static_cast<const Charset_TransRec *>(b)->intCh;
166: }
167:
168: void Charset::sort_ToTable() {
1.10 paf 169: _qsort(tables.toTable, tables.toTableSize, sizeof(*tables.toTable),
1.1 paf 170: sort_cmp_Trans_rec_intCh);
171: //FILE *f=fopen("c:\\temp\\a", "wb");
1.10 paf 172: //fwrite(tables.toTable, tables.toTableSize, sizeof(*tables.toTable), f);
1.1 paf 173: //fclose(f);
174: }
175:
1.10 paf 176: static XMLByte xlatOneTo(const XMLCh toXlat,
1.33.2.19.2.1 (paf 177:): const Charset::Tables& tables,
178:): XMLByte not_found) {
179:): unsigned int lowOfs = 0;
180:): unsigned int hiOfs = tables.toTableSize - 1;
181:): XMLByte curByte = 0;
182:): do {
183:): // Calc the mid point of the low and high offset.
184:): const unsigned int midOfs =((hiOfs - lowOfs) / 2)+lowOfs;
185:):
186:): // If our test char is greater than the mid point char, then
187:): // we move up to the upper half. Else we move to the lower
188:): // half. If its equal, then its our guy.
189:): if(toXlat>tables.toTable[midOfs].intCh)
190:): lowOfs = midOfs;
1.10 paf 191: else if(toXlat<tables.toTable[midOfs].intCh)
1.1 paf 192: hiOfs = midOfs;
193: else
1.10 paf 194: return tables.toTable[midOfs].extCh;
1.4 paf 195: } while(lowOfs+1<hiOfs);
1.33.2.19.2.1 (paf 196:):
197:): return not_found;
1.1 paf 198: }
199:
1.33.2.19.2.1 (paf 200:): String::C Charset::transcode(const String::C src,
201:): const Charset& source_charset,
202:): const Charset& dest_charset) {
203:): if(!src.length)
204:): return String::C(0, 0);
1.4 paf 205:
1.1 paf 206: switch((source_charset.isUTF8()?0x10:0x00)|(dest_charset.isUTF8()?0x01:0x00)) {
207: default: // 0x00
1.33.2.19.2.1 (paf 208:): return source_charset.transcodeToCharset(src, dest_charset);
1.1 paf 209: case 0x01:
1.33.2.19.2.1 (paf 210:): return source_charset.transcodeToUTF8(src);
1.1 paf 211: case 0x10:
1.33.2.19.2.1 (paf 212:): return dest_charset.transcodeFromUTF8(src);
1.1 paf 213: case 0x11:
1.33.2.19.2.1 (paf 214:): return src;
1.1 paf 215: }
216: }
217:
218: // ---------------------------------------------------------------------------
219: // Local static data
220: //
221: // gUTFBytes
222: // A list of counts of trailing bytes for each initial byte in the input.
223: //
224: // gUTFOffsets
225: // A list of values to offset each result char type, according to how
226: // many source bytes when into making it.
227: //
228: // gFirstByteMark
229: // A list of values to mask onto the first byte of an encoded sequence,
230: // indexed by the number of bytes used to create the sequence.
231: // ---------------------------------------------------------------------------
232: static const XMLByte gUTFBytes[0x100] = {
233: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
234: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
235: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
236: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
237: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
238: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
239: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
240: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
241: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
242: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
243: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
244: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
245: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
246: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
247: , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
248: , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
249: };
250:
251: static const uint gUTFOffsets[6] = {
252: 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
253: };
254:
255: static const XMLByte gFirstByteMark[7] = {
256: 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
257: };
258:
1.11 paf 259: static int transcodeToUTF8(
1.33.2.19.2.1 (paf 260:): const XMLByte* srcData, size_t& srcLen,
261:): XMLByte *toFill, size_t& toFillLen,
262:): const Charset::Tables& tables) {
1.11 paf 263: const XMLByte* srcPtr=srcData;
264: const XMLByte* srcEnd=srcData+srcLen;
265: XMLByte* outPtr=toFill;
266: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 267:
1.33.2.19.2.1 (paf 268:): while(srcPtr<srcEnd) {
269:): uint curVal = tables.fromTable[*srcPtr];
1.1 paf 270: if(!curVal) {
1.33.2.19.2.1 (paf 271:): // use the replacement character
272:): *outPtr++= '?';
273:): srcPtr++;
274:): continue;
275:): }
1.11 paf 276:
1.33.2.19.2.1 (paf 277:): // Figure out how many bytes we need
278:): unsigned int encodedBytes;
279:): if(curVal<0x80)
280:): encodedBytes = 1;
281:): else if(curVal<0x800)
282:): encodedBytes = 2;
283:): else if(curVal<0x10000)
284:): encodedBytes = 3;
285:): else if(curVal<0x200000)
286:): encodedBytes = 4;
287:): else if(curVal<0x4000000)
288:): encodedBytes = 5;
289:): else if(curVal<= 0x7FFFFFFF)
290:): encodedBytes = 6;
291:): else {
292:): // use the replacement character
293:): *outPtr++= '?';
294:): srcPtr++;
295:): continue;
296:): }
1.11 paf 297:
1.33.2.19.2.1 (paf 298:): // If we cannot fully get this char into the output buffer
299:): if (outPtr + encodedBytes > outEnd)
300:): break;
301:):
302:): // We can do it, so update the source index
303:): srcPtr++;
304:):
305:): // And spit out the bytes. We spit them out in reverse order
306:): // here, so bump up the output pointer and work down as we go.
307:): outPtr+= encodedBytes;
308:): switch(encodedBytes) {
309:): case 6: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
310:): curVal>>= 6;
311:): case 5: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
312:): curVal>>= 6;
313:): case 4: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
314:): curVal>>= 6;
315:): case 3: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
316:): curVal>>= 6;
317:): case 2: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
318:): curVal>>= 6;
319:): case 1: *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]);
320:): }
321:):
322:): // Add the encoded bytes back in again to indicate we've eaten them
323:): outPtr+= encodedBytes;
324:): }
325:):
326:): // Update the bytes eaten
327:): srcLen = srcPtr - srcData;
328:):
329:): // Return the characters read
330:): toFillLen = outPtr - toFill;
331:):
1.29 paf 332: //return srcPtr==srcEnd?(int)toFillLen:-1;
333: /*
334: xmlCharEncodingInputFunc
335: Returns :
336: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
337: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
338: of ocetes consumed.
339: */
340: return 0;
1.1 paf 341: }
1.26 paf 342: /// @todo digital entites only when xml/html output [at output in html/xml mode, in html part of a letter]
1.30 paf 343: static int transcodeFromUTF8(
1.33.2.14 paf 344: const XMLByte* srcData, size_t& srcLen,
1.11 paf 345: XMLByte* toFill, size_t& toFillLen,
346: const Charset::Tables& tables) {
347: const XMLByte* srcPtr=srcData;
348: const XMLByte* srcEnd=srcData+srcLen;
349: XMLByte* outPtr=toFill;
350: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 351:
1.10 paf 352: // We now loop until we either run out of input data, or room to store
353: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
1.1 paf 354: // Get the next leading byte out
1.33.2.14 paf 355: const XMLByte firstByte =* srcPtr;
1.1 paf 356:
1.4 paf 357: // Special-case ASCII, which is a leading byte value of<= 127
358: if(firstByte<= 127) {
359: *outPtr++= firstByte;
1.1 paf 360: srcPtr++;
361: continue;
362: }
363:
364: // See how many trailing src bytes this sequence is going to require
365: const unsigned int trailingBytes = gUTFBytes[firstByte];
366:
367: // If there are not enough source bytes to do this one, then we
1.4 paf 368: // are done. Note that we done>= here because we are implicitly
1.1 paf 369: // counting the 1 byte we get no matter what.
1.4 paf 370: if(srcPtr+trailingBytes>= srcEnd)
1.1 paf 371: break;
372:
373: // Looks ok, so lets build up the value
374: uint tmpVal=0;
375: switch(trailingBytes) {
376: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
377: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
378: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
379: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
380: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
381: case 0: tmpVal+=*srcPtr++;
382: break;
383:
384: default:
1.23 paf 385: throw Exception(0,
1.33.2.19.2.2 (paf 386:: 0,
1.4 paf 387: "transcodeFromUTF8 error: wrong trailingBytes value(%d)", trailingBytes);
1.1 paf 388: }
389: tmpVal-=gUTFOffsets[trailingBytes];
390:
391: // If it will fit into a single char, then put it in. Otherwise
392: // fail [*encode it as a surrogate pair. If its not valid, use the
393: // replacement char.*]
1.25 paf 394: if(!(tmpVal & 0xFFFF0000)) {
395: if(XMLByte xlat=xlatOneTo(tmpVal, tables, 0))
396: *outPtr++=xlat;
397: else
398: outPtr+=sprintf((char *)outPtr, "&#%d;", tmpVal); // &#decimal;
399: } else
1.23 paf 400: throw Exception(0,
1.33.2.19.2.2 (paf 401:: 0,
1.4 paf 402: "transcodeFromUTF8 error: too big tmpVal(0x%08X)", tmpVal);
1.1 paf 403: }
404:
1.11 paf 405: // Update the bytes eaten
406: srcLen = srcPtr - srcData;
407:
408: // Return the characters read
409: toFillLen = outPtr - toFill;
410:
1.29 paf 411: //return srcPtr==srcEnd?(int)toFillLen:-1;
412: /*
413: xmlCharEncodingOutputFunc
414: Returns :
415: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
416: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
417: of ocetes consumed.
418: */
419: return 0;
1.10 paf 420: }
421:
422: /// @todo not so memory-hungry with prescan
1.33.2.19.2.1 (paf 423:): const String::C Charset::transcodeToUTF8(const String::C src) const {
424:): size_t src_length=src.length;
425:): size_t dest_length=src.length*6/*so that surly enough, max utf8 seq len=6*/;
426:): XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
1.11 paf 427:
428: if(::transcodeToUTF8(
1.33.2.19.2.1 (paf 429:): (XMLByte *)src.str, src_length,
430:): dest_body, dest_length,
1.11 paf 431: tables)<0)
1.10 paf 432: throw(0, 0,
433: 0,
1.11 paf 434: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 435:
1.33.2.19.2.1 (paf 436:): dest_body[dest_length]=0; // terminator
437:): return String::C((char*)dest_body, dest_length);
1.10 paf 438: }
1.33.2.19.2.1 (paf 439:): const String::C Charset::transcodeFromUTF8(const String::C src) const {
440:): size_t src_length=src.length;
441:): size_t dest_length=src.length*6/*so that surly enough, "ÿ" has max ratio */;
442:): XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
1.11 paf 443:
444: if(::transcodeFromUTF8(
1.33.2.19.2.1 (paf 445:): (XMLByte *)src.str, src_length,
446:): dest_body, dest_length,
1.11 paf 447: tables)<0)
1.10 paf 448: throw(0, 0,
449: 0,
1.33.2.19.2.1 (paf 450:): "Charset::transcodeFromUTF8 buffer overflow");
1.10 paf 451:
1.33.2.19.2.1 (paf 452:): dest_body[dest_length]=0; // terminator
453:): return String::C((char*)dest_body, dest_length);
1.1 paf 454: }
455:
456: /// transcode using both charsets
1.33.2.19.2.1 (paf 457:): const String::C Charset::transcodeToCharset(const String::C src,
458:): const Charset& dest_charset) const {
459:): if(&dest_charset==this)
460:): return src;
461:): else {
462:): size_t dest_length=src.length;
463:): XMLByte* dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
464:):
465:): XMLByte* output=dest_body;
466:): const XMLByte* input=(XMLByte *)src.str;
467:): while(XMLCh c=*input++) {
468:): XMLCh curVal = tables.fromTable[c];
469:): *output++=curVal?
470:): xlatOneTo(curVal, dest_charset.tables, '?') // OK
471:): :'?'; // use the replacement character
1.6 paf 472: }
1.1 paf 473:
1.33.2.19.2.1 (paf 474:): dest_body[dest_length]=0; // terminator
475:): return String::C((char*)dest_body, dest_length);
1.6 paf 476: }
1.1 paf 477: }
478:
479: #ifdef XML
1.10 paf 480:
1.33.2.19.2.1 (paf 481:): static const Charset::Tables* tables[MAX_CHARSETS];
482:):
483:): #define declareXml256ioFuncs(i) \
484:): static int xml256CharEncodingInputFunc##i( \
485:): unsigned char *out, int *outlen, \
486:): const unsigned char *in, int *inlen) { \
487:): return transcodeToUTF8( \
488:): in, *(size_t*)inlen, \
489:): out, *(size_t*)outlen, \
490:): *tables[i]); \
491:): } \
492:): static int xml256CharEncodingOutputFunc##i( \
493:): unsigned char *out, int *outlen, \
494:): const unsigned char *in, int *inlen) { \
495:): return transcodeFromUTF8( \
496:): in, *(size_t*)inlen, \
497:): out, *(size_t*)outlen, \
498:): *tables[i]); \
499:): }
500:):
501:): declareXml256ioFuncs(0) declareXml256ioFuncs(1)
502:): declareXml256ioFuncs(2) declareXml256ioFuncs(3)
503:): declareXml256ioFuncs(4) declareXml256ioFuncs(5)
504:): declareXml256ioFuncs(6) declareXml256ioFuncs(7)
505:): declareXml256ioFuncs(8) declareXml256ioFuncs(9)
506:):
507:): static xmlCharEncodingInputFunc inputFuncs[MAX_CHARSETS]={
508:): xml256CharEncodingInputFunc0, xml256CharEncodingInputFunc1,
509:): xml256CharEncodingInputFunc2, xml256CharEncodingInputFunc3,
510:): xml256CharEncodingInputFunc4, xml256CharEncodingInputFunc5,
511:): xml256CharEncodingInputFunc6, xml256CharEncodingInputFunc7,
512:): xml256CharEncodingInputFunc8, xml256CharEncodingInputFunc9
513:): };
514:): static xmlCharEncodingOutputFunc outputFuncs[MAX_CHARSETS]={
515:): xml256CharEncodingOutputFunc0, xml256CharEncodingOutputFunc1,
516:): xml256CharEncodingOutputFunc2, xml256CharEncodingOutputFunc3,
517:): xml256CharEncodingOutputFunc4, xml256CharEncodingOutputFunc5,
518:): xml256CharEncodingOutputFunc6, xml256CharEncodingOutputFunc7,
519:): xml256CharEncodingOutputFunc8, xml256CharEncodingOutputFunc9
520:): };
521:): static size_t handlers_count=0;
1.10 paf 522:
523: void Charset::addEncoding(char *name_cstr) {
1.33.2.19.2.1 (paf 524:): if(handlers_count==MAX_CHARSETS)
525:): throw Exception(0,
526:): 0,
527:): "already allocated %d handlers, no space for new encoding '%s'",
528:): MAX_CHARSETS, name_cstr);
529:):
1.33.2.19.2.9 (paf 530:: xmlCharEncodingHandler* handler=new(PointerFreeGC) xmlCharEncodingHandler;
1.33.2.19.2.1 (paf 531:): {
532:): handler->name=name_cstr;
533:): handler->input=inputFuncs[handlers_count];
534:): handler->output=outputFuncs[handlers_count];
535:): ::tables[handlers_count]=&tables;
536:): handlers_count++;
537:): }
1.10 paf 538:
539: xmlRegisterCharEncodingHandler(handler);
1.33.2.19.2.1 (paf 540:):
1.10 paf 541: }
542:
1.33.2.19.2.1 (paf 543:): void Charset::initTranscoder(const StringBody NAME, const char* name_cstr) {
1.15 paf 544: ftranscoder=xmlFindCharEncodingHandler(name_cstr);
1.33.2.19.2.1 (paf 545:): transcoder(NAME); // check right way
1.15 paf 546: }
547:
1.33.2.19.2.1 (paf 548:): xmlCharEncodingHandler& Charset::transcoder(const StringBody NAME) {
1.15 paf 549: if(!ftranscoder)
1.23 paf 550: throw Exception("parser.runtime",
1.33.2.19.2.1 (paf 551:): new String(NAME, String::L_TAINTED),
1.10 paf 552: "unsupported encoding");
1.33.2.14 paf 553: return *ftranscoder;
1.10 paf 554: }
555:
1.33.2.19.2.7 (paf 556:: const char* Charset::transcode_cstr(xmlChar* s) {
1.13 paf 557: if(!s)
1.14 paf 558: return "";
1.8 paf 559:
1.33.2.6 paf 560: int inlen=strlen((const char* )s);
1.8 paf 561: int outlen=inlen+1; // max
1.33.2.19.2.6 (paf 562:: char *out=new(PointerFreeGC) char[outlen];
1.8 paf 563:
1.30 paf 564: int error;
1.33.2.19.2.1 (paf 565:): if(xmlCharEncodingOutputFunc output=transcoder(FNAME).output) {
1.30 paf 566: error=output(
1.17 paf 567: (unsigned char*)out, &outlen,
1.33.2.19.2.1 (paf 568:): (const unsigned char*)s, &inlen);
1.30 paf 569: } else {
570: memcpy(out, s, outlen=inlen);
571: error=0;
572: }
573: if(error<0)
1.33.2.19.2.2 (paf 574:: throw Exception(0,
575:: 0,
1.30 paf 576: "transcode_cstr failed (%d)", error);
1.8 paf 577:
1.30 paf 578: out[outlen/*surely would be less then on input*/]=0;
1.8 paf 579: return out;
1.14 paf 580: }
1.33.2.19.2.7 (paf 581:: const String& Charset::transcode(xmlChar* s) {
1.33.2.19.2.1 (paf 582:): return *new String(transcode_cstr(s), 0/*auto-size*/, true);
1.14 paf 583: }
1.33.2.19.2.7 (paf 584:: const char* Charset::transcode_cstr(GdomeDOMString* s) {
1.33.2.19.2.3 (paf 585:: return s?transcode_cstr(BAD_CAST s->str):"";
1.1 paf 586: }
1.33.2.19.2.7 (paf 587:: const String& Charset::transcode(GdomeDOMString* s) {
1.33.2.19.2.1 (paf 588:): return *new String(transcode_cstr(s), 0/*auto-size*/, true);
1.1 paf 589: }
590:
1.8 paf 591: /// @test less memory using -maybe- xmlParserInputBufferCreateMem
1.33.2.16 paf 592: void* Charset::transcode_buf2mchar(transcode_buf_malloc_func malloc_func,
1.33.2.19.2.8 (paf 593:: const char* buf, size_t buf_size) {
1.33.2.14 paf 594: unsigned char* out;
1.30 paf 595: int outlen;
596: int error;
1.33.2.19.2.1 (paf 597:): if(xmlCharEncodingInputFunc input=transcoder(FNAME).input) {
1.32 paf 598: outlen=buf_size*6/*max*/;
1.33.2.16 paf 599: out=(unsigned char*)malloc_func(outlen+1);
1.30 paf 600: } else {
601: outlen=buf_size;
1.33.2.16 paf 602: out=(unsigned char*)malloc_func(outlen+1);
1.30 paf 603: memcpy(out, buf, outlen);
604: error=0;
605: }
1.17 paf 606:
1.30 paf 607: if(error<0)
1.33.2.19.2.2 (paf 608:: throw Exception(0,
609:: 0,
1.30 paf 610: "transcode_buf failed (%d)", error);
1.8 paf 611:
1.30 paf 612: out[outlen/*surely would be less then on input*/]=0;
1.33.2.16 paf 613: return out;
614: }
615:
616: xmlChar* Charset::transcode_buf2xchar(const char* buf, size_t buf_size) {
617: return static_cast<xmlChar*>(transcode_buf2mchar(xmlMalloc, buf, buf_size));
618: }
619: static void* g_malloc_wrapper(size_t size) {
1.33.2.19.2.1 (paf 620:): if(void* out=g_malloc(size))
621:): return out;
622:):
623:): return pa_fail_alloc("g_malloc_wrapper", size);
1.33.2.16 paf 624: }
625: gchar* Charset::transcode_buf2gchar(const char* buf, size_t buf_size) {
626: return static_cast<gchar*>(transcode_buf2mchar(g_malloc_wrapper, buf, buf_size));
1.24 paf 627: }
1.33.2.6 paf 628: GdomeDOMString_auto_ptr Charset::transcode_buf2dom(const char* buf, size_t buf_size) {
1.33.2.16 paf 629: return GdomeDOMString_auto_ptr(transcode_buf2gchar(buf, buf_size));
1.1 paf 630: }
1.33.2.19.2.1 (paf 631:: GdomeDOMString_auto_ptr Charset::transcode(const String& s) {
1.33.2.19.2.7 (paf 632:: const char* cstr=s.cstr(String::L_UNSPECIFIED);
1.1 paf 633:
1.24 paf 634: return transcode_buf2dom(cstr, strlen(cstr));
1.33.2.19.2.8 (paf 635:: }
636:: GdomeDOMString_auto_ptr Charset::transcode(const StringBody s) {
637:: const char* cstr=s.cstr();
638::
639:: return transcode_buf2dom(cstr, s.length());
1.1 paf 640: }
641: #endif
E-mail: