Annotation of parser3/src/main/pa_charset.C, revision 1.61
1.1 paf 1: /** @file
2: Parser: Charset connection implementation.
3:
1.52 paf 4: Copyright(c) 2001-2005 ArtLebedev Group (http://www.artlebedev.com)
1.4 paf 5: Author: Alexander Petrosyan<paf@design.ru>(http://paf.design.ru)
1.27 paf 6: */
1.1 paf 7:
1.61 ! misha 8: static const char * const IDENT_CHARSET_C="$Date: 2008-07-15 12:53:10 $";
1.1 paf 9:
10: #include "pa_charset.h"
1.35 paf 11: #include "pa_charsets.h"
1.1 paf 12:
13: #ifdef XML
1.8 paf 14: #include "libxml/encoding.h"
1.1 paf 15: #endif
16:
1.46 paf 17: //#define PA_PATCHED_LIBXML_BACKWARD
1.60 misha 18: #define PRECALCULATE_DEST_LENGTH
1.46 paf 19:
1.38 paf 20: // globals
21:
// Records used for UTF-8 conversion to upper case; the initializers are
// pulled in from "utf8-to-upper.inc".
22: Charset::UTF8CaseTable::Rec UTF8CaseToUpperRecords[]={
23: #include "utf8-to-upper.inc"
24: };
// Table descriptor, initialized with the number of records (derived from the
// array size at compile time) and the record array itself.
25: Charset::UTF8CaseTable UTF8CaseToUpper={
26: sizeof(UTF8CaseToUpperRecords)/sizeof(Charset::UTF8CaseTable::Rec),
27: UTF8CaseToUpperRecords};
28:
// Records used for UTF-8 conversion to lower case; the initializers are
// pulled in from "utf8-to-lower.inc".
29: Charset::UTF8CaseTable::Rec UTF8CaseToLowerRecords[]={
30: #include "utf8-to-lower.inc"
31: };
// Table descriptor, initialized with the number of records (derived from the
// array size at compile time) and the record array itself.
32: Charset::UTF8CaseTable UTF8CaseToLower={
33: sizeof(UTF8CaseToLowerRecords)/sizeof(Charset::UTF8CaseTable::Rec),
34: UTF8CaseToLowerRecords};
35:
1.1 paf 36: // helpers
37:
38: inline void prepare_case_tables(unsigned char *tables) {
39: unsigned char *lcc_table=tables+lcc_offset;
40: unsigned char *fcc_table=tables+fcc_offset;
41: for(int i=0; i<0x100; i++)
1.53 paf 42: lcc_table[i]=fcc_table[i]=(unsigned char)i;
1.1 paf 43: }
44: inline void cstr2ctypes(unsigned char *tables, const unsigned char *cstr,
45: unsigned char bit) {
46: unsigned char *ctypes_table=tables+ctypes_offset;
47: ctypes_table[0]=bit;
48: for(; *cstr; cstr++) {
49: unsigned char c=*cstr;
50: ctypes_table[c]|=bit;
51: }
52: }
/// Converts a text cell into a character code.
/// - null or empty cell: 0
/// - exactly one character: the value of that byte
/// - anything longer: the cell parsed as a number by strtol with base 0
inline unsigned int to_wchar_code(const char* cstr) {
	if(cstr==0 || cstr[0]==0)
		return 0;

	const bool single_char=cstr[1]==0;
	if(single_char)
		return (unsigned int)(unsigned char)cstr[0];

	// the end position reported by strtol is not examined
	char *stop;
	const long parsed=strtol(cstr, &stop, 0);
	return (unsigned int)parsed;
}
/// A cell counts as "true" when it is present and not empty.
inline bool to_bool(const char* cstr) {
	if(!cstr)
		return false;
	return cstr[0]!='\0';
}
65: static void element2ctypes(unsigned char c, bool belongs,
66: unsigned char *tables, unsigned char bit, int group_offset=-1) {
67: if(!belongs)
68: return;
69:
70: unsigned char *ctypes_table=tables+ctypes_offset;
71:
72: ctypes_table[c]|=bit;
73: if(group_offset>=0)
1.4 paf 74: tables[cbits_offset+group_offset+c/8] |= 1<<(c%8);
1.1 paf 75: }
76: static void element2case(unsigned char from, unsigned char to,
77: unsigned char *tables) {
78: if(!to)
79: return;
80:
81: unsigned char *lcc_table=tables+lcc_offset;
82: unsigned char *fcc_table=tables+fcc_offset;
83: lcc_table[from]=to;
84: fcc_table[from]=to; fcc_table[to]=from;
85: }
86:
87: // methods
88:
89: extern "C" unsigned char pcre_default_tables[]; // pcre/chartables.c
1.37 paf 90: Charset::Charset(Request_charsets* charsets, const String::Body ANAME, const String* afile_spec):
1.35 paf 91: FNAME(ANAME),
92: FNAME_CSTR(ANAME.cstrm()) {
1.7 paf 93:
1.35 paf 94: if(afile_spec) {
1.1 paf 95: fisUTF8=false;
1.35 paf 96: load_definition(*charsets, *afile_spec);
1.1 paf 97: #ifdef XML
1.35 paf 98: addEncoding(FNAME_CSTR);
1.1 paf 99: #endif
100: } else {
101: fisUTF8=true;
1.4 paf 102: // grab default onces [for UTF-8 so to be able to make a-z =>A-Z
1.1 paf 103: memcpy(pcre_tables, pcre_default_tables, sizeof(pcre_tables));
104: }
105:
106: #ifdef XML
1.35 paf 107: initTranscoder(FNAME, FNAME_CSTR);
1.1 paf 108: #endif
109: }
110:
1.35 paf 111: void Charset::load_definition(Request_charsets& charsets, const String& afile_spec) {
1.1 paf 112: // pcre_tables
113: // lowcase, flipcase, bits digit+word+whitespace, masks
114:
115: // must not move this inside of prepare_case_tables
116: // don't know the size there
117: memset(pcre_tables, 0, sizeof(pcre_tables));
118: prepare_case_tables(pcre_tables);
1.4 paf 119: cstr2ctypes(pcre_tables,(const unsigned char *)"*+?{^.$|()[", ctype_meta);
1.1 paf 120:
121: // charset
1.35 paf 122: memset(&tables, 0, sizeof(tables));
1.1 paf 123:
124: // loading text
1.35 paf 125: char *data=file_read_text(charsets, afile_spec);
1.1 paf 126:
127: // ignore header
128: getrow(&data);
129:
130: // parse cells
131: char *row;
1.42 paf 132: while((row=getrow(&data))) {
1.1 paf 133: // remove empty&comment lines
134: if(!*row || *row=='#')
135: continue;
136:
137: // char white-space digit hex-digit letter word lowercase unicode1 unicode2
1.53 paf 138: unsigned char c=0;
1.1 paf 139: char *cell;
1.42 paf 140: for(int column=0; (cell=lsplit(&row, '\t')); column++) {
1.1 paf 141: switch(column) {
1.53 paf 142: case 0: c=(unsigned char)to_wchar_code(cell); break;
1.1 paf 143: // pcre_tables
144: case 1: element2ctypes(c, to_bool(cell), pcre_tables, ctype_space, cbit_space); break;
145: case 2: element2ctypes(c, to_bool(cell), pcre_tables, ctype_digit, cbit_digit); break;
146: case 3: element2ctypes(c, to_bool(cell), pcre_tables, ctype_xdigit); break;
147: case 4: element2ctypes(c, to_bool(cell), pcre_tables, ctype_letter); break;
148: case 5: element2ctypes(c, to_bool(cell), pcre_tables, ctype_word, cbit_word); break;
1.53 paf 149: case 6: element2case(c, (unsigned char)to_wchar_code(cell), pcre_tables); break;
1.1 paf 150: case 7:
151: case 8:
152: // charset
1.10 paf 153: if(tables.toTableSize>MAX_CHARSET_UNI_CODES)
1.56 misha 154: throw Exception(PARSER_RUNTIME,
1.35 paf 155: &afile_spec,
1.1 paf 156: "charset must contain not more then %d unicode values", MAX_CHARSET_UNI_CODES);
157:
158: XMLCh unicode=(XMLCh)to_wchar_code(cell);
159: if(!unicode && column==7/*unicode1 column*/)
160: unicode=(XMLCh)c;
161: if(unicode) {
1.10 paf 162: if(!tables.fromTable[c])
163: tables.fromTable[c]=unicode;
164: tables.toTable[tables.toTableSize].intCh=unicode;
165: tables.toTable[tables.toTableSize].extCh=(XMLByte)c;
166: tables.toTableSize++;
1.1 paf 167: }
168: break;
169: }
170: }
171: };
172:
173: // sort by the Unicode code point
174: sort_ToTable();
175: }
176:
177: static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
178: return
1.38 paf 179: static_cast<const Charset::Tables::Rec *>(a)->intCh-
180: static_cast<const Charset::Tables::Rec *>(b)->intCh;
1.1 paf 181: }
182:
183: void Charset::sort_ToTable() {
1.10 paf 184: _qsort(tables.toTable, tables.toTableSize, sizeof(*tables.toTable),
1.1 paf 185: sort_cmp_Trans_rec_intCh);
186: //FILE *f=fopen("c:\\temp\\a", "wb");
1.10 paf 187: //fwrite(tables.toTable, tables.toTableSize, sizeof(*tables.toTable), f);
1.1 paf 188: //fclose(f);
189: }
190:
1.60 misha 191: // @todo: precache to speed up searching
1.10 paf 192: static XMLByte xlatOneTo(const XMLCh toXlat,
1.35 paf 193: const Charset::Tables& tables,
194: XMLByte not_found) {
1.39 paf 195: int lo = 0;
196: int hi = tables.toTableSize - 1;
197: while(lo<=hi) {
1.35 paf 198: // Calc the mid point of the low and high offset.
1.39 paf 199: const unsigned int i = (lo + hi) / 2;
200:
201: XMLCh cur=tables.toTable[i].intCh;
202: if(toXlat==cur)
203: return tables.toTable[i].extCh;
204: if(toXlat>cur)
205: lo = i+1;
1.1 paf 206: else
1.39 paf 207: hi = i-1;
208: }
1.35 paf 209:
210: return not_found;
1.1 paf 211: }
212:
1.35 paf 213: String::C Charset::transcode(const String::C src,
214: const Charset& source_charset,
215: const Charset& dest_charset) {
216: if(!src.length)
217: return String::C("", 0);
1.4 paf 218:
1.1 paf 219: switch((source_charset.isUTF8()?0x10:0x00)|(dest_charset.isUTF8()?0x01:0x00)) {
220: default: // 0x00
1.35 paf 221: return source_charset.transcodeToCharset(src, dest_charset);
1.1 paf 222: case 0x01:
1.35 paf 223: return source_charset.transcodeToUTF8(src);
1.1 paf 224: case 0x10:
1.35 paf 225: return dest_charset.transcodeFromUTF8(src);
1.1 paf 226: case 0x11:
1.35 paf 227: return src;
1.1 paf 228: }
229: }
230:
231: // ---------------------------------------------------------------------------
232: // Local static data
233: //
234: // gUTFBytes
235: // A list of counts of trailing bytes for each initial byte in the input.
236: //
237: // gUTFOffsets
238: // A list of values to offset each result char type, according to how
239: // many source bytes went into making it.
240: //
241: // gFirstByteMark
242: // A list of values to mask onto the first byte of an encoded sequence,
243: // indexed by the number of bytes used to create the sequence.
244: // ---------------------------------------------------------------------------
// Indexed by the first byte of a sequence; gives the number of bytes that
// follow it. Rows are 16 entries each: 0x00-0xBF => 0, 0xC0-0xDF => 1,
// 0xE0-0xEF => 2, 0xF0-0xF7 => 3, 0xF8-0xFB => 4, 0xFC-0xFF => 5.
245: static const XMLByte gUTFBytes[0x100] = {
246: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
247: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
248: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
249: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
250: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
251: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
252: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
253: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
254: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
255: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
256: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
257: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
258: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
259: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
260: , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
261: , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
262: };
263:
// Indexed by the trailing-byte count; the decoders in this file subtract the
// entry from the value accumulated out of the raw sequence bytes.
264: static const uint gUTFOffsets[6] = {
265: 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
266: };
267:
// Indexed by the total length of an encoded sequence (1..6; entry 0 is not
// used by the encoders here); OR-ed into the first byte when encoding.
268: static const XMLByte gFirstByteMark[7] = {
269: 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
270: };
271:
1.35 paf 272: static int transcodeToUTF8(const XMLByte* srcData, size_t& srcLen,
273: XMLByte *toFill, size_t& toFillLen,
274: const Charset::Tables& tables) {
1.11 paf 275: const XMLByte* srcPtr=srcData;
276: const XMLByte* srcEnd=srcData+srcLen;
277: XMLByte* outPtr=toFill;
278: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 279:
1.35 paf 280: while(srcPtr<srcEnd) {
281: uint curVal = tables.fromTable[*srcPtr];
1.1 paf 282: if(!curVal) {
1.35 paf 283: // use the replacement character
284: *outPtr++= '?';
285: srcPtr++;
286: continue;
287: }
1.1 paf 288:
1.35 paf 289: // Figure out how many bytes we need
290: unsigned int encodedBytes;
291: if(curVal<0x80)
292: encodedBytes = 1;
293: else if(curVal<0x800)
294: encodedBytes = 2;
295: else if(curVal<0x10000)
296: encodedBytes = 3;
297: else if(curVal<0x200000)
298: encodedBytes = 4;
299: else if(curVal<0x4000000)
300: encodedBytes = 5;
301: else if(curVal<= 0x7FFFFFFF)
302: encodedBytes = 6;
303: else {
304: // use the replacement character
305: *outPtr++= '?';
306: srcPtr++;
307: continue;
308: }
1.11 paf 309:
1.35 paf 310: // If we cannot fully get this char into the output buffer
311: if (outPtr + encodedBytes > outEnd)
312: break;
313:
314: // We can do it, so update the source index
315: srcPtr++;
316:
317: // And spit out the bytes. We spit them out in reverse order
318: // here, so bump up the output pointer and work down as we go.
319: outPtr+= encodedBytes;
320: switch(encodedBytes) {
1.60 misha 321: case 6: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
322: curVal>>= 6;
323: case 5: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
324: curVal>>= 6;
325: case 4: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
326: curVal>>= 6;
327: case 3: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
328: curVal>>= 6;
329: case 2: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
330: curVal>>= 6;
331: case 1: *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]);
1.35 paf 332: }
333:
334: // Add the encoded bytes back in again to indicate we've eaten them
335: outPtr+= encodedBytes;
336: }
337:
338: // Update the bytes eaten
339: srcLen = srcPtr - srcData;
340:
341: // Return the characters read
342: toFillLen = outPtr - toFill;
343:
1.29 paf 344: //return srcPtr==srcEnd?(int)toFillLen:-1;
345: /*
346: xmlCharEncodingInputFunc
347: Returns :
348: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
349: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
350: of ocetes consumed.
351: */
352: return 0;
1.1 paf 353: }
1.26 paf 354: /// @todo digital entites only when xml/html output [at output in html/xml mode, in html part of a letter]
1.35 paf 355: static int transcodeFromUTF8(const XMLByte* srcData, size_t& srcLen,
356: XMLByte* toFill, size_t& toFillLen,
357: const Charset::Tables& tables) {
1.11 paf 358: const XMLByte* srcPtr=srcData;
359: const XMLByte* srcEnd=srcData+srcLen;
360: XMLByte* outPtr=toFill;
361: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 362:
1.35 paf 363: // We now loop until we either run out of input data, or room to store
364: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
365: // Get the next leading byte out
366: const XMLByte firstByte =* srcPtr;
367:
368: // Special-case ASCII, which is a leading byte value of<= 127
1.60 misha 369: if(firstByte<=127) {
1.35 paf 370: *outPtr++= firstByte;
371: srcPtr++;
372: continue;
373: }
374:
375: // See how many trailing src bytes this sequence is going to require
376: const unsigned int trailingBytes = gUTFBytes[firstByte];
377:
378: // If there are not enough source bytes to do this one, then we
379: // are done. Note that we done>= here because we are implicitly
380: // counting the 1 byte we get no matter what.
381: if(srcPtr+trailingBytes>= srcEnd)
382: break;
383:
384: // Looks ok, so lets build up the value
385: uint tmpVal=0;
386: switch(trailingBytes) {
387: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
388: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
389: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
390: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
391: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
392: case 0: tmpVal+=*srcPtr++;
393: break;
394:
395: default:
396: throw Exception(0,
397: 0,
1.49 paf 398: "transcodeFromUTF8 error: wrong trailingBytes value(%d)", trailingBytes); // never
1.35 paf 399: }
400: tmpVal-=gUTFOffsets[trailingBytes];
401:
402: // If it will fit into a single char, then put it in. Otherwise
403: // fail [*encode it as a surrogate pair. If its not valid, use the
404: // replacement char.*]
405: if(!(tmpVal & 0xFFFF0000)) {
1.25 paf 406: if(XMLByte xlat=xlatOneTo(tmpVal, tables, 0))
407: *outPtr++=xlat;
1.49 paf 408: else {
1.50 paf 409: outPtr+=sprintf((char *)outPtr, "&#%u;", tmpVal); // &#decimal;
1.49 paf 410: }
411: } else {
412: const XMLByte* recoverPtr=srcPtr-trailingBytes-1;
413: for(uint i=0; i<=trailingBytes; i++)
414: outPtr+=sprintf((char*)outPtr, "%%%02X", *recoverPtr++);
415: }
1.1 paf 416: }
1.35 paf 417:
418: // Update the bytes eaten
419: srcLen = srcPtr - srcData;
420:
421: // Return the characters read
422: toFillLen = outPtr - toFill;
1.11 paf 423:
1.29 paf 424: //return srcPtr==srcEnd?(int)toFillLen:-1;
425: /*
426: xmlCharEncodingOutputFunc
427: Returns :
428: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
429: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
430: of ocetes consumed.
431: */
432: return 0;
1.10 paf 433: }
434:
/// Tells whether a byte has to be written as "%XX" when escaping.
/// Left as is: ASCII letters, digits and the characters *@-_+./
/// Everything else is escaped, including NUL and all bytes above 127.
///
/// The byte is examined as unsigned, so the answer does not depend on the
/// signedness of char. NUL is tested explicitly, because strchr treats the
/// terminating zero as part of the string and would report it as "found".
static bool is_escaped(char c){
	const unsigned char uc=(unsigned char)c;

	if(uc==0 || uc>127)
		return true;

	if((uc>='0' && uc<='9')
		|| (uc>='A' && uc<='Z')
		|| (uc>='a' && uc<='z'))
		return false;

	return strchr("*@-_+./", uc)==0;
}
445:
446: // read one utf8 character, return number of bytes needed for store it
1.61 ! misha 447: static unsigned int readChar(const XMLByte*& srcPtr, const XMLByte* srcEnd, XMLByte& firstByte, XMLCh& UTF8Char){
1.60 misha 448: if(!srcPtr || !*srcPtr || srcPtr>=srcEnd)
449: return 0;
450:
451: firstByte=*srcPtr;
452:
453: if(firstByte<=127){
454: UTF8Char=firstByte;
455: srcPtr++;
456: return 1;
457: }
458:
459: unsigned int trailingBytes=gUTFBytes[firstByte];
460:
461: if(srcPtr+trailingBytes>=srcEnd){
462: return 0; // not enough bytes in source string for reading
463: }
464:
465: uint tmpVal=0;
466: switch(trailingBytes){
467: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
468: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
469: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
470: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
471: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
472: case 0: tmpVal+=*srcPtr++;
473: }
474:
475: tmpVal-=gUTFOffsets[trailingBytes];
476: UTF8Char=tmpVal;
477:
478: return trailingBytes+1;
479: }
480:
1.61 ! misha 481: static unsigned int readChar(const XMLByte*& srcPtr, const XMLByte* srcEnd){
! 482: XMLByte firstByte;
! 483: XMLCh UTF8Char;
! 484: return readChar(srcPtr, srcEnd, firstByte, UTF8Char);
! 485: }
! 486:
1.60 misha 487: // read char, return number of bytes needed for store it as UTF8
1.61 ! misha 488: static unsigned int readChar(const XMLByte*& srcPtr, const XMLByte* srcEnd, XMLByte& firstByte, XMLCh& UTF8Char, const Charset::Tables& tables){
1.60 misha 489: if(!srcPtr || !*srcPtr || srcPtr>=srcEnd)
490: return 0;
491:
492: firstByte=*srcPtr++;
493: UTF8Char=tables.fromTable[firstByte];
494:
495: if(UTF8Char<0x80)
496: return 1;
497: else if(UTF8Char<0x800)
498: return 2;
499: else if(UTF8Char<0x10000)
500: return 3;
501: else if(UTF8Char<0x200000)
502: return 4;
503: else if(UTF8Char<0x4000000)
504: return 5;
505: else if(UTF8Char<= 0x7FFFFFFF)
506: return 6;
507:
508: // will use the replacement character '?'
509: firstByte=0;
510: return 1;
511: }
512:
513: static int escape(const XMLByte* srcData, size_t& srcLen,
514: XMLByte* toFill, size_t& toFillLen) {
515: const XMLByte* srcPtr=srcData;
516: const XMLByte* srcEnd=srcData+srcLen;
517: XMLByte* outPtr=toFill;
518: XMLByte* outEnd=toFill+toFillLen;
519: XMLByte firstByte;
520: XMLCh UTF8Char;
521: uint charSize;
522:
523: // loop until we either run out of input data, or room to store
524: while((outPtr < outEnd) && (charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char))){
525: if(charSize==1){
526: if(is_escaped(firstByte)) // %XX
527: outPtr+=sprintf((char*)outPtr, "%%%02X", firstByte);
528: else
529: *outPtr++=firstByte;
530: } else
531: outPtr+=sprintf((char*)outPtr, "%%u%04X", UTF8Char); // %uXXXX
532: }
533:
534: // Update the bytes eaten
535: srcLen=srcPtr-srcData;
536:
537: // Return the characters read
538: toFillLen=outPtr-toFill;
539:
540: return 0;
541: }
542:
543: static int escape(const XMLByte* srcData, size_t& srcLen,
544: XMLByte *toFill, size_t& toFillLen,
545: const Charset::Tables& tables) {
546: const XMLByte* srcPtr=srcData;
547: const XMLByte* srcEnd=srcData+srcLen;
548: XMLByte* outPtr=toFill;
549: //XMLByte* outEnd=toFill+toFillLen;
550: XMLByte firstByte;
551: XMLCh UTF8Char;
552: uint charSize;
553:
554: while(charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char, tables)){
555: if(charSize==1){
556: if(firstByte){
557: if(is_escaped(firstByte)) // %XX
558: outPtr+=sprintf((char*)outPtr, "%%%02X", firstByte);
559: else
560: *outPtr++=firstByte;
561: } else // add replacement char '?'
562: *outPtr++='?';
563: } else
564: outPtr+=sprintf((char*)outPtr, "%%u%04X", UTF8Char); // %uXXXX
565: }
566:
567: // Update the bytes eaten
568: srcLen = srcPtr - srcData;
569:
570: // Return the characters read
571: toFillLen = outPtr - toFill;
572:
573: return 0;
574: }
575:
576:
577: String::C Charset::escape(const String::C src, const Charset& source_charset){
578: size_t src_length=src.length;
579: if(!src_length)
580: return String::C("", 0);
581:
582: #ifdef PRECALCULATE_DEST_LENGTH
583: size_t dest_length=0;
584: const XMLByte* srcPtr=(XMLByte*)src.str;
585: const XMLByte* srcEnd=srcPtr+src_length;
586: XMLByte firstByte;
587: XMLCh UTF8Char;
588:
589: if(source_charset.isUTF8()){
590: while(uint charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char)){
591: if(charSize==1)
592: dest_length+=!is_escaped(firstByte)?1:3/*%XX*/;
593: else
594: dest_length+=6; // '%uXXXX'
595: }
596: } else {
597: while(uint charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char, source_charset.tables)){
598: if(charSize==1)
599: dest_length+=(!firstByte/*replacement char '?'*/ || !is_escaped(firstByte))?1:3/*'%XX'*/;
600: else
601: dest_length+=6; // '%uXXXX'
602: }
603: }
604: #else
605: size_t dest_length=src_length*6; // enough for %uXXXX but too memory-hungry
606: #endif
607:
608: //throw Exception(0,0,"%u",dest_length);
609:
610: #ifndef NDEBUG
611: size_t saved_dest_length=dest_length;
612: #endif
613: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
614:
615: int status;
616: if(source_charset.isUTF8()){
617: status=::escape((XMLByte *)src.str, src_length, dest_body, dest_length);
618: } else {
619: status=::escape((XMLByte *)src.str, src_length, dest_body, dest_length, source_charset.tables);
620: }
621:
622: if(status<0)
623: throw Exception(0,
624: 0,
625: "Charset::escapeString buffer overflow");
626:
627: assert(dest_length<=saved_dest_length);
628: dest_body[dest_length]=0; // terminator
629: return String::C((char*)dest_body, dest_length);
630: }
631:
632:
1.35 paf 633: const String::C Charset::transcodeToUTF8(const String::C src) const {
634: size_t src_length=src.length;
1.60 misha 635:
636: #ifdef PRECALCULATE_DEST_LENGTH
637: size_t dest_length=0;
638: const XMLByte* srcPtr=(XMLByte*)src.str;
639: const XMLByte* srcEnd=srcPtr+src_length;
640: XMLByte firstByte;
641: XMLCh UTF8Char;
642: while(uint charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char, tables))
643: dest_length+=charSize;
644: #else
645: size_t dest_length=src_length*6; // so that surly enough (max utf8 seq len=6) but too memory-hyngry
646: #endif
647:
648: //throw Exception(0,0,"%u",dest_length);
649:
1.35 paf 650: #ifndef NDEBUG
651: size_t saved_dest_length=dest_length;
652: #endif
653: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
1.11 paf 654:
655: if(::transcodeToUTF8(
1.35 paf 656: (XMLByte *)src.str, src_length,
657: dest_body, dest_length,
1.11 paf 658: tables)<0)
1.43 paf 659: throw Exception(0,
1.10 paf 660: 0,
1.11 paf 661: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 662:
1.60 misha 663: assert(dest_length<=saved_dest_length);
664: dest_body[dest_length]=0; // terminator
1.35 paf 665: return String::C((char*)dest_body, dest_length);
1.10 paf 666: }
1.38 paf 667:
668: static XMLCh change_case_UTF8(const XMLCh src, const Charset::UTF8CaseTable& table) {
1.39 paf 669: int lo = 0;
670: int hi = table.size - 1;
671: while(lo<=hi) {
1.38 paf 672: // Calc the mid point of the low and high offset.
1.39 paf 673: const unsigned int i = (lo + hi) / 2;
674:
675: XMLCh cur=table.records[i].from;
676: if(src==cur)
677: return table.records[i].to;
678: if(src>cur)
679: lo = i+1;
1.38 paf 680: else
1.39 paf 681: hi = i-1;
682: }
683:
684: // not found
1.38 paf 685: return src;
686: }
687:
1.58 misha 688: static void store_UTF8(XMLCh src, XMLByte*& outPtr){
1.38 paf 689: if(!src) {
690: // use the replacement character
691: *outPtr++= '?';
692: return;
693: }
694:
695: // Figure out how many bytes we need
696: unsigned int encodedBytes;
697: if(src<0x80)
698: encodedBytes = 1;
699: else if(src<0x800)
700: encodedBytes = 2;
701: else if(src<0x10000)
702: encodedBytes = 3;
703: else if(src<0x200000)
704: encodedBytes = 4;
705: else if(src<0x4000000)
706: encodedBytes = 5;
707: else if(src<= 0x7FFFFFFF)
708: encodedBytes = 6;
709: else {
710: // use the replacement character
711: *outPtr++= '?';
712: return;
713: }
714:
715: // And spit out the bytes. We spit them out in reverse order
716: // here, so bump up the output pointer and work down as we go.
717: outPtr+= encodedBytes;
718: switch(encodedBytes) {
719: case 6: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
720: src>>= 6;
721: case 5: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
722: src>>= 6;
723: case 4: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
724: src>>= 6;
725: case 3: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
726: src>>= 6;
727: case 2: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
728: src>>= 6;
729: case 1: *--outPtr = XMLByte(src | gFirstByteMark[encodedBytes]);
730: }
731:
732: // Add the encoded bytes back in again to indicate we've eaten them
733: outPtr+= encodedBytes;
734: }
735:
736: static void change_case_UTF8(XMLCh src, XMLByte*& outPtr,
737: const Charset::UTF8CaseTable& table) {
738: store_UTF8(change_case_UTF8(src, table), outPtr);
739: };
1.44 paf 740: void change_case_UTF8(const XMLByte* srcData, size_t srcLen,
741: XMLByte* toFill, size_t toFillLen,
742: const Charset::UTF8CaseTable& table) {
1.38 paf 743: const XMLByte* srcPtr=srcData;
1.44 paf 744: const XMLByte* srcEnd=srcData+srcLen;
1.38 paf 745: XMLByte* outPtr=toFill;
1.44 paf 746: XMLByte* outEnd=toFill+toFillLen;
747:
748: // We now loop until we either run out of input data, or room to store
749: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
750: // Get the next leading byte out
751: const XMLByte firstByte =* srcPtr;
1.38 paf 752:
1.60 misha 753: if(firstByte<=127) {
1.38 paf 754: change_case_UTF8(firstByte, outPtr, table);
755: srcPtr++;
756: continue;
757: }
758:
759: // See how many trailing src bytes this sequence is going to require
760: const unsigned int trailingBytes = gUTFBytes[firstByte];
761:
762: // Looks ok, so lets build up the value
763: uint tmpVal=0;
764: switch(trailingBytes) {
765: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
766: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
767: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
768: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
769: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
770: case 0: tmpVal+=*srcPtr++;
771: break;
772:
773: default:
774: throw Exception(0,
775: 0,
776: "change_case_UTF8 error: wrong trailingBytes value(%d)", trailingBytes);
777: }
778: tmpVal-=gUTFOffsets[trailingBytes];
779:
780: // If it will fit into a single char, then put it in. Otherwise
781: // fail [*encode it as a surrogate pair. If its not valid, use the
782: // replacement char.*]
783: if(!(tmpVal & 0xFFFF0000))
784: change_case_UTF8(tmpVal, outPtr, table);
785: else
786: throw Exception(0,
787: 0,
788: "change_case_UTF8 error: too big tmpVal(0x%08X)", tmpVal);
789: }
790:
791: if(srcPtr!=outPtr)
792: throw Exception(0,
793: 0,
794: "change_case_UTF8 error: end pointers do not match");
795: }
796:
1.60 misha 797: static size_t getDecNumLength(XMLCh UTF8Char){
798: return
799: (UTF8Char < 100)
800: ?2
801: :(UTF8Char < 1000)
802: ?3
803: :(UTF8Char < 10000)
804: ?4
805: :5;
806: }
1.38 paf 807:
1.35 paf 808: const String::C Charset::transcodeFromUTF8(const String::C src) const {
809: size_t src_length=src.length;
1.60 misha 810:
811: #ifdef PRECALCULATE_DEST_LENGTH
812: size_t dest_length=0;
813: const XMLByte* srcPtr=(XMLByte*)src.str;
814: const XMLByte* srcEnd=srcPtr+src_length;
815: XMLByte firstByte;
816: XMLCh UTF8Char;
817: while(uint charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char)){
818: if(charSize==1)
819: dest_length++;
820: else
821: dest_length+=(UTF8Char & 0xFFFF0000)
822: ?charSize*3 // '%XX' for each byte
823: :(xlatOneTo(UTF8Char, tables, 0)!=0)
824: ?1 // can convert it to single char
825: :getDecNumLength(UTF8Char)+3; // &#XX; - &#XXXXX;
826: }
827: #else
828: // so that surly enough, "&#XXX;" has max ratio (huh? 8 bytes needed for '&#XXXXX;')
829: size_t dest_length=src_length*6;
830: #endif
831:
832: //throw Exception(0,0,"%u",dest_length);
833:
1.35 paf 834: #ifndef NDEBUG
835: size_t saved_dest_length=dest_length;
836: #endif
837: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
1.11 paf 838:
839: if(::transcodeFromUTF8(
1.35 paf 840: (XMLByte *)src.str, src_length,
841: dest_body, dest_length,
1.11 paf 842: tables)<0)
1.43 paf 843: throw Exception(0,
1.10 paf 844: 0,
1.35 paf 845: "Charset::transcodeFromUTF8 buffer overflow");
1.10 paf 846:
1.60 misha 847: assert(dest_length<=saved_dest_length);
848: dest_body[dest_length]=0; // terminator
1.35 paf 849: return String::C((char*)dest_body, dest_length);
1.1 paf 850: }
851:
852: /// transcode using both charsets
1.35 paf 853: const String::C Charset::transcodeToCharset(const String::C src,
854: const Charset& dest_charset) const {
855: if(&dest_charset==this)
856: return src;
857: else {
858: size_t dest_length=src.length;
859: XMLByte* dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
860:
861: XMLByte* output=dest_body;
862: const XMLByte* input=(XMLByte *)src.str;
863: while(XMLCh c=*input++) {
864: XMLCh curVal = tables.fromTable[c];
865: *output++=curVal?
866: xlatOneTo(curVal, dest_charset.tables, '?') // OK
867: :'?'; // use the replacement character
1.6 paf 868: }
1.1 paf 869:
1.35 paf 870: dest_body[dest_length]=0; // terminator
871: return String::C((char*)dest_body, dest_length);
1.6 paf 872: }
1.1 paf 873: }
874:
1.58 misha 875: void Charset::store_Char(XMLByte*& outPtr, XMLCh src, XMLByte not_found){
1.59 misha 876: if(isUTF8())
1.58 misha 877: store_UTF8(src, outPtr);
1.59 misha 878: else if(char ch=xlatOneTo(src, tables, not_found))
1.58 misha 879: *outPtr++=ch;
1.57 misha 880: }
881:
1.1 paf 882: #ifdef XML
1.10 paf 883:
1.35 paf 884: static const Charset::Tables* tables[MAX_CHARSETS];
885:
1.46 paf 886: #ifdef PA_PATCHED_LIBXML_BACKWARD
887:
888: #define declareXml256ioFuncs(i) \
889: static int xml256CharEncodingInputFunc##i( \
890: unsigned char *out, int *outlen, \
891: const unsigned char *in, int *inlen, void*) { \
892: return transcodeToUTF8( \
893: in, *(size_t*)inlen, \
894: out, *(size_t*)outlen, \
895: *tables[i]); \
896: } \
897: static int xml256CharEncodingOutputFunc##i( \
898: unsigned char *out, int *outlen, \
899: const unsigned char *in, int *inlen, void*) { \
900: return transcodeFromUTF8( \
901: in, *(size_t*)inlen, \
902: out, *(size_t*)outlen, \
903: *tables[i]); \
904: }
905:
906: #else
907:
1.35 paf 908: #define declareXml256ioFuncs(i) \
909: static int xml256CharEncodingInputFunc##i( \
910: unsigned char *out, int *outlen, \
911: const unsigned char *in, int *inlen) { \
912: return transcodeToUTF8( \
913: in, *(size_t*)inlen, \
914: out, *(size_t*)outlen, \
915: *tables[i]); \
916: } \
917: static int xml256CharEncodingOutputFunc##i( \
918: unsigned char *out, int *outlen, \
919: const unsigned char *in, int *inlen) { \
920: return transcodeFromUTF8( \
921: in, *(size_t*)inlen, \
922: out, *(size_t*)outlen, \
923: *tables[i]); \
924: }
925:
1.46 paf 926: #endif
927:
928:
// Instantiate ten pairs of callbacks, one pair per slot 0..9.
1.35 paf 929: declareXml256ioFuncs(0) declareXml256ioFuncs(1)
930: declareXml256ioFuncs(2) declareXml256ioFuncs(3)
931: declareXml256ioFuncs(4) declareXml256ioFuncs(5)
932: declareXml256ioFuncs(6) declareXml256ioFuncs(7)
933: declareXml256ioFuncs(8) declareXml256ioFuncs(9)
934:
// Input callbacks by slot; only ten initializers are listed.
// NOTE(review): presumably MAX_CHARSETS is 10 -- confirm, a larger value
// would leave the remaining entries null.
935: static xmlCharEncodingInputFunc inputFuncs[MAX_CHARSETS]={
936: xml256CharEncodingInputFunc0, xml256CharEncodingInputFunc1,
937: xml256CharEncodingInputFunc2, xml256CharEncodingInputFunc3,
938: xml256CharEncodingInputFunc4, xml256CharEncodingInputFunc5,
939: xml256CharEncodingInputFunc6, xml256CharEncodingInputFunc7,
940: xml256CharEncodingInputFunc8, xml256CharEncodingInputFunc9
941: };
// Output callbacks by slot, parallel to inputFuncs.
942: static xmlCharEncodingOutputFunc outputFuncs[MAX_CHARSETS]={
943: xml256CharEncodingOutputFunc0, xml256CharEncodingOutputFunc1,
944: xml256CharEncodingOutputFunc2, xml256CharEncodingOutputFunc3,
945: xml256CharEncodingOutputFunc4, xml256CharEncodingOutputFunc5,
946: xml256CharEncodingOutputFunc6, xml256CharEncodingOutputFunc7,
947: xml256CharEncodingOutputFunc8, xml256CharEncodingOutputFunc9
948: };
// Number of slots handed out so far by Charset::addEncoding.
949: static size_t handlers_count=0;
1.10 paf 950:
951: void Charset::addEncoding(char *name_cstr) {
1.35 paf 952: if(handlers_count==MAX_CHARSETS)
953: throw Exception(0,
954: 0,
955: "already allocated %d handlers, no space for new encoding '%s'",
956: MAX_CHARSETS, name_cstr);
957:
1.45 paf 958: xmlCharEncodingHandler* handler=new(UseGC) xmlCharEncodingHandler;
1.35 paf 959: {
960: handler->name=name_cstr;
961: handler->input=inputFuncs[handlers_count];
962: handler->output=outputFuncs[handlers_count];
963: ::tables[handlers_count]=&tables;
964: handlers_count++;
965: }
1.10 paf 966:
967: xmlRegisterCharEncodingHandler(handler);
1.35 paf 968:
1.10 paf 969: }
970:
1.37 paf 971: void Charset::initTranscoder(const String::Body NAME, const char* name_cstr) {
1.15 paf 972: ftranscoder=xmlFindCharEncodingHandler(name_cstr);
1.35 paf 973: transcoder(NAME); // check right way
1.15 paf 974: }
975:
1.37 paf 976: xmlCharEncodingHandler& Charset::transcoder(const String::Body NAME) {
1.15 paf 977: if(!ftranscoder)
1.56 misha 978: throw Exception(PARSER_RUNTIME,
1.35 paf 979: new String(NAME, String::L_TAINTED),
1.10 paf 980: "unsupported encoding");
1.35 paf 981: return *ftranscoder;
1.10 paf 982: }
983:
1.54 paf 984: String::C Charset::transcode_cstr(const xmlChar* s) {
1.13 paf 985: if(!s)
1.35 paf 986: return String::C("", 0);
1.8 paf 987:
1.35 paf 988: int inlen=strlen((const char*)s);
1.51 paf 989: int outlen=inlen*6/*strlen("ÿ")*/; // max
1.35 paf 990: #ifndef NDEBUG
991: int saved_outlen=outlen;
992: #endif
993: char *out=new(PointerFreeGC) char[outlen+1];
1.8 paf 994:
1.30 paf 995: int error;
1.35 paf 996: if(xmlCharEncodingOutputFunc output=transcoder(FNAME).output) {
1.30 paf 997: error=output(
1.17 paf 998: (unsigned char*)out, &outlen,
1.46 paf 999: (const unsigned char*)s, &inlen
1000: #ifdef PA_PATCHED_LIBXML_BACKWARD
1001: ,0
1002: #endif
1003: );
1.30 paf 1004: } else {
1005: memcpy(out, s, outlen=inlen);
1006: error=0;
1007: }
1008: if(error<0)
1.23 paf 1009: throw Exception(0,
1.8 paf 1010: 0,
1.30 paf 1011: "transcode_cstr failed (%d)", error);
1.8 paf 1012:
1.35 paf 1013: assert(outlen<=saved_outlen); out[outlen]=0;
1014: return String::C(out, outlen);
1.14 paf 1015: }
1.54 paf 1016: const String& Charset::transcode(const xmlChar* s) {
1.35 paf 1017: String::C cstr=transcode_cstr(s);
1018: return *new String(cstr.str, cstr.length, true);
1.1 paf 1019: }
1020:
1.8 paf 1021: /// @test less memory using -maybe- xmlParserInputBufferCreateMem
1.35 paf 1022: xmlChar* Charset::transcode_buf2xchar(const char* buf, size_t buf_size) {
1023: xmlChar* out;
1.30 paf 1024: int outlen;
1025: int error;
1.35 paf 1026: #ifndef NDEBUG
1027: int saved_outlen;
1028: #endif
1029: if(xmlCharEncodingInputFunc input=transcoder(FNAME).input) {
1.51 paf 1030: outlen=buf_size*6/*max UTF8 bytes per char*/;
1.35 paf 1031: #ifndef NDEBUG
1032: saved_outlen=outlen;
1033: #endif
1.47 paf 1034: out=(xmlChar*)xmlMalloc(outlen+1);
1.30 paf 1035: error=input(
1.17 paf 1036: out, &outlen,
1.46 paf 1037: (const unsigned char*)buf, (int*)&buf_size
1038: #ifdef PA_PATCHED_LIBXML_BACKWARD
1039: ,0
1040: #endif
1041: );
1.30 paf 1042: } else {
1043: outlen=buf_size;
1.35 paf 1044: #ifndef NDEBUG
1045: saved_outlen=outlen;
1046: #endif
1047: out=(xmlChar*)xmlMalloc(outlen+1);
1.30 paf 1048: memcpy(out, buf, outlen);
1049: error=0;
1050: }
1.17 paf 1051:
1.30 paf 1052: if(error<0)
1.23 paf 1053: throw Exception(0,
1.8 paf 1054: 0,
1.30 paf 1055: "transcode_buf failed (%d)", error);
1.8 paf 1056:
1.35 paf 1057: assert(outlen<=saved_outlen); out[outlen]=0;
1058: return out;
1.24 paf 1059: }
1.54 paf 1060: xmlChar* Charset::transcode(const String& s) {
1.35 paf 1061: const char* cstr=s.cstr(String::L_UNSPECIFIED);
1.1 paf 1062:
1.54 paf 1063: return transcode_buf2xchar(cstr, strlen(cstr));
1.1 paf 1064: }
1.54 paf 1065: xmlChar* Charset::transcode(const String::Body s) {
1.35 paf 1066: const char* cstr=s.cstr();
1067:
1.54 paf 1068: return transcode_buf2xchar(cstr, s.length());
1.35 paf 1069: }
1.36 paf 1070: #endif
1.34 paf 1071:
1.37 paf 1072: String::Body Charset::transcode(const String::Body src,
1.34 paf 1073: const Charset& source_transcoder,
1.35 paf 1074: const Charset& dest_transcoder) {
1.34 paf 1075:
1.35 paf 1076: const char *src_ptr=src.cstr();
1.34 paf 1077: size_t src_size=strlen(src_ptr);
1078:
1.35 paf 1079: String::C dest=Charset::transcode(String::C(src_ptr, src_size),
1080: source_transcoder,
1081: dest_transcoder);
1.34 paf 1082:
1.37 paf 1083: return String::Body(dest.str, dest.length);
1.35 paf 1084: }
1085:
1086: String& Charset::transcode(const String& src,
1087: const Charset& source_transcoder,
1088: const Charset& dest_transcoder) {
1089: if(!src.length())
1090: return *new String("", 0, false);
1.34 paf 1091:
1.37 paf 1092: return *new String(transcode((String::Body)src, source_transcoder, dest_transcoder), String::L_CLEAN);
1.34 paf 1093: }
1094:
1.35 paf 1095: void Charset::transcode(ArrayString& src,
1.34 paf 1096: const Charset& source_transcoder,
1.35 paf 1097: const Charset& dest_transcoder) {
1098: for(size_t i=0; i<src.count(); i++)
1099: src.put(i, &transcode(*src[i], source_transcoder, dest_transcoder));
1.34 paf 1100: }
1101:
1102: #ifndef DOXYGEN
1103: struct Transcode_pair_info {
1104: const Charset* source_transcoder;
1105: const Charset* dest_transcoder;
1106: };
1107: #endif
1.40 paf 1108: static void transcode_pair(const String::Body /*akey*/,
1.37 paf 1109: String::Body& avalue,
1.35 paf 1110: Transcode_pair_info* info) {
1111: avalue=Charset::transcode(avalue,
1112: *info->source_transcoder,
1113: *info->dest_transcoder);
1.34 paf 1114: }
1.61 ! misha 1115:
1.35 paf 1116: void Charset::transcode(HashStringString& src,
1.34 paf 1117: const Charset& source_transcoder,
1.35 paf 1118: const Charset& dest_transcoder) {
1119: Transcode_pair_info info={&source_transcoder, &dest_transcoder};
1.55 paf 1120: src.for_each_ref<Transcode_pair_info*>(transcode_pair, &info);
1.34 paf 1121: }
1.61 ! misha 1122:
! 1123: size_t getUTF8BytePos(const XMLByte* srcBegin, const XMLByte* srcEnd, size_t charPos){
! 1124: const XMLByte* ptr=srcBegin;
! 1125: while(charPos-- && readChar(ptr, srcEnd));
! 1126:
! 1127: return ptr-srcBegin;
! 1128: }
! 1129:
! 1130: size_t getUTF8CharPos(const XMLByte* srcBegin, const XMLByte* srcEnd, size_t bytePos){
! 1131: size_t charPos=0;
! 1132: const XMLByte* ptr=srcBegin;
! 1133: const XMLByte* ptrEnd=srcBegin+bytePos;
! 1134: while(readChar(ptr, srcEnd)){
! 1135: if(ptr>ptrEnd)
! 1136: return charPos;
! 1137: charPos++;
! 1138: }
! 1139:
! 1140: // scan till end but position in bytes still too low
! 1141: throw Exception(0,
! 1142: 0,
! 1143: "Error convertion byte pos to char pos");
! 1144: }
! 1145:
! 1146: size_t lengthUTF8(const XMLByte* srcBegin, const XMLByte* srcEnd){
! 1147: size_t size=0;
! 1148: while(readChar(srcBegin, srcEnd))
! 1149: size++;
! 1150:
! 1151: return size;
! 1152: }
E-mail: