Annotation of parser3/src/main/pa_charset.C, revision 1.63
1.1 paf 1: /** @file
2: Parser: Charset connection implementation.
3:
1.52 paf 4: Copyright(c) 2001-2005 ArtLebedev Group (http://www.artlebedev.com)
1.4 paf 5: Author: Alexander Petrosyan<paf@design.ru>(http://paf.design.ru)
1.27 paf 6: */
1.1 paf 7:
1.63 ! misha 8: static const char * const IDENT_CHARSET_C="$Date: 2008-07-16 17:23:28 $";
1.1 paf 9:
10: #include "pa_charset.h"
1.35 paf 11: #include "pa_charsets.h"
1.1 paf 12:
13: #ifdef XML
1.8 paf 14: #include "libxml/encoding.h"
1.1 paf 15: #endif
16:
1.46 paf 17: //#define PA_PATCHED_LIBXML_BACKWARD
1.60 misha 18: #define PRECALCULATE_DEST_LENGTH
1.46 paf 19:
1.38 paf 20: // globals
21:
22: Charset::UTF8CaseTable::Rec UTF8CaseToUpperRecords[]={
23: #include "utf8-to-upper.inc"
24: };
25: Charset::UTF8CaseTable UTF8CaseToUpper={
26: sizeof(UTF8CaseToUpperRecords)/sizeof(Charset::UTF8CaseTable::Rec),
27: UTF8CaseToUpperRecords};
28:
29: Charset::UTF8CaseTable::Rec UTF8CaseToLowerRecords[]={
30: #include "utf8-to-lower.inc"
31: };
32: Charset::UTF8CaseTable UTF8CaseToLower={
33: sizeof(UTF8CaseToLowerRecords)/sizeof(Charset::UTF8CaseTable::Rec),
34: UTF8CaseToLowerRecords};
35:
1.1 paf 36: // helpers
37:
38: inline void prepare_case_tables(unsigned char *tables) {
39: unsigned char *lcc_table=tables+lcc_offset;
40: unsigned char *fcc_table=tables+fcc_offset;
41: for(int i=0; i<0x100; i++)
1.53 paf 42: lcc_table[i]=fcc_table[i]=(unsigned char)i;
1.1 paf 43: }
44: inline void cstr2ctypes(unsigned char *tables, const unsigned char *cstr,
45: unsigned char bit) {
46: unsigned char *ctypes_table=tables+ctypes_offset;
47: ctypes_table[0]=bit;
48: for(; *cstr; cstr++) {
49: unsigned char c=*cstr;
50: ctypes_table[c]|=bit;
51: }
52: }
/// Converts a table cell to a character code.
/// - null or empty cell: 0
/// - cell of exactly one character: the code of that character
/// - anything longer: parsed as a number by strtol with base 0
///   [so decimal, 0x-hex and 0-octal forms are accepted]
inline unsigned int to_wchar_code(const char* cstr) {
	if(!cstr || cstr[0]=='\0')
		return 0;

	const bool single_char=cstr[1]=='\0';
	if(single_char)
		return (unsigned int)(unsigned char)cstr[0];

	// position where parsing stopped is of no interest here
	return (unsigned int)strtol(cstr, 0, 0);
}
/// A cell is 'true' when it is present and not empty [its content is not examined].
inline bool to_bool(const char* cstr) {
	if(!cstr)
		return false;
	return cstr[0]!='\0';
}
65: static void element2ctypes(unsigned char c, bool belongs,
66: unsigned char *tables, unsigned char bit, int group_offset=-1) {
67: if(!belongs)
68: return;
69:
70: unsigned char *ctypes_table=tables+ctypes_offset;
71:
72: ctypes_table[c]|=bit;
73: if(group_offset>=0)
1.4 paf 74: tables[cbits_offset+group_offset+c/8] |= 1<<(c%8);
1.1 paf 75: }
76: static void element2case(unsigned char from, unsigned char to,
77: unsigned char *tables) {
78: if(!to)
79: return;
80:
81: unsigned char *lcc_table=tables+lcc_offset;
82: unsigned char *fcc_table=tables+fcc_offset;
83: lcc_table[from]=to;
84: fcc_table[from]=to; fcc_table[to]=from;
85: }
86:
87: // methods
88:
89: extern "C" unsigned char pcre_default_tables[]; // pcre/chartables.c
1.37 paf 90: Charset::Charset(Request_charsets* charsets, const String::Body ANAME, const String* afile_spec):
1.35 paf 91: FNAME(ANAME),
92: FNAME_CSTR(ANAME.cstrm()) {
1.7 paf 93:
1.35 paf 94: if(afile_spec) {
1.1 paf 95: fisUTF8=false;
1.35 paf 96: load_definition(*charsets, *afile_spec);
1.1 paf 97: #ifdef XML
1.35 paf 98: addEncoding(FNAME_CSTR);
1.1 paf 99: #endif
100: } else {
101: fisUTF8=true;
1.4 paf 102: // grab default onces [for UTF-8 so to be able to make a-z =>A-Z
1.1 paf 103: memcpy(pcre_tables, pcre_default_tables, sizeof(pcre_tables));
104: }
105:
106: #ifdef XML
1.35 paf 107: initTranscoder(FNAME, FNAME_CSTR);
1.1 paf 108: #endif
109: }
110:
1.35 paf 111: void Charset::load_definition(Request_charsets& charsets, const String& afile_spec) {
1.1 paf 112: // pcre_tables
113: // lowcase, flipcase, bits digit+word+whitespace, masks
114:
115: // must not move this inside of prepare_case_tables
116: // don't know the size there
117: memset(pcre_tables, 0, sizeof(pcre_tables));
118: prepare_case_tables(pcre_tables);
1.4 paf 119: cstr2ctypes(pcre_tables,(const unsigned char *)"*+?{^.$|()[", ctype_meta);
1.1 paf 120:
121: // charset
1.35 paf 122: memset(&tables, 0, sizeof(tables));
1.1 paf 123:
124: // loading text
1.35 paf 125: char *data=file_read_text(charsets, afile_spec);
1.1 paf 126:
127: // ignore header
128: getrow(&data);
129:
130: // parse cells
131: char *row;
1.42 paf 132: while((row=getrow(&data))) {
1.1 paf 133: // remove empty&comment lines
134: if(!*row || *row=='#')
135: continue;
136:
137: // char white-space digit hex-digit letter word lowercase unicode1 unicode2
1.53 paf 138: unsigned char c=0;
1.1 paf 139: char *cell;
1.42 paf 140: for(int column=0; (cell=lsplit(&row, '\t')); column++) {
1.1 paf 141: switch(column) {
1.53 paf 142: case 0: c=(unsigned char)to_wchar_code(cell); break;
1.1 paf 143: // pcre_tables
144: case 1: element2ctypes(c, to_bool(cell), pcre_tables, ctype_space, cbit_space); break;
145: case 2: element2ctypes(c, to_bool(cell), pcre_tables, ctype_digit, cbit_digit); break;
146: case 3: element2ctypes(c, to_bool(cell), pcre_tables, ctype_xdigit); break;
147: case 4: element2ctypes(c, to_bool(cell), pcre_tables, ctype_letter); break;
148: case 5: element2ctypes(c, to_bool(cell), pcre_tables, ctype_word, cbit_word); break;
1.53 paf 149: case 6: element2case(c, (unsigned char)to_wchar_code(cell), pcre_tables); break;
1.1 paf 150: case 7:
151: case 8:
152: // charset
1.10 paf 153: if(tables.toTableSize>MAX_CHARSET_UNI_CODES)
1.56 misha 154: throw Exception(PARSER_RUNTIME,
1.35 paf 155: &afile_spec,
1.1 paf 156: "charset must contain not more then %d unicode values", MAX_CHARSET_UNI_CODES);
157:
158: XMLCh unicode=(XMLCh)to_wchar_code(cell);
159: if(!unicode && column==7/*unicode1 column*/)
160: unicode=(XMLCh)c;
161: if(unicode) {
1.10 paf 162: if(!tables.fromTable[c])
163: tables.fromTable[c]=unicode;
164: tables.toTable[tables.toTableSize].intCh=unicode;
165: tables.toTable[tables.toTableSize].extCh=(XMLByte)c;
166: tables.toTableSize++;
1.1 paf 167: }
168: break;
169: }
170: }
171: };
172:
173: // sort by the Unicode code point
174: sort_ToTable();
175: }
176:
177: static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
178: return
1.38 paf 179: static_cast<const Charset::Tables::Rec *>(a)->intCh-
180: static_cast<const Charset::Tables::Rec *>(b)->intCh;
1.1 paf 181: }
182:
183: void Charset::sort_ToTable() {
1.10 paf 184: _qsort(tables.toTable, tables.toTableSize, sizeof(*tables.toTable),
1.1 paf 185: sort_cmp_Trans_rec_intCh);
186: //FILE *f=fopen("c:\\temp\\a", "wb");
1.10 paf 187: //fwrite(tables.toTable, tables.toTableSize, sizeof(*tables.toTable), f);
1.1 paf 188: //fclose(f);
189: }
190:
1.60 misha 191: // @todo: precache for spedup searching
1.10 paf 192: static XMLByte xlatOneTo(const XMLCh toXlat,
1.35 paf 193: const Charset::Tables& tables,
194: XMLByte not_found) {
1.39 paf 195: int lo = 0;
196: int hi = tables.toTableSize - 1;
197: while(lo<=hi) {
1.35 paf 198: // Calc the mid point of the low and high offset.
1.39 paf 199: const unsigned int i = (lo + hi) / 2;
200:
201: XMLCh cur=tables.toTable[i].intCh;
202: if(toXlat==cur)
203: return tables.toTable[i].extCh;
204: if(toXlat>cur)
205: lo = i+1;
1.1 paf 206: else
1.39 paf 207: hi = i-1;
208: }
1.35 paf 209:
210: return not_found;
1.1 paf 211: }
212:
1.35 paf 213: String::C Charset::transcode(const String::C src,
214: const Charset& source_charset,
215: const Charset& dest_charset) {
216: if(!src.length)
217: return String::C("", 0);
1.4 paf 218:
1.1 paf 219: switch((source_charset.isUTF8()?0x10:0x00)|(dest_charset.isUTF8()?0x01:0x00)) {
220: default: // 0x00
1.35 paf 221: return source_charset.transcodeToCharset(src, dest_charset);
1.1 paf 222: case 0x01:
1.35 paf 223: return source_charset.transcodeToUTF8(src);
1.1 paf 224: case 0x10:
1.35 paf 225: return dest_charset.transcodeFromUTF8(src);
1.1 paf 226: case 0x11:
1.35 paf 227: return src;
1.1 paf 228: }
229: }
230:
231: // ---------------------------------------------------------------------------
232: // Local static data
233: //
234: // gUTFBytes
235: // A list of counts of trailing bytes for each initial byte in the input.
236: //
237: // gUTFOffsets
238: // A list of values to offset each result char type, according to how
239: // many source bytes went into making it.
240: //
241: // gFirstByteMark
242: // A list of values to mask onto the first byte of an encoded sequence,
243: // indexed by the number of bytes used to create the sequence.
244: // ---------------------------------------------------------------------------
245: static const XMLByte gUTFBytes[0x100] = {
246: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
247: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
248: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
249: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
250: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
251: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
252: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
253: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
254: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
255: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
256: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
257: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
258: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
259: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
260: , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
261: , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
262: };
263:
264: static const uint gUTFOffsets[6] = {
265: 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
266: };
267:
268: static const XMLByte gFirstByteMark[7] = {
269: 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
270: };
271:
1.35 paf 272: static int transcodeToUTF8(const XMLByte* srcData, size_t& srcLen,
273: XMLByte *toFill, size_t& toFillLen,
274: const Charset::Tables& tables) {
1.11 paf 275: const XMLByte* srcPtr=srcData;
276: const XMLByte* srcEnd=srcData+srcLen;
277: XMLByte* outPtr=toFill;
278: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 279:
1.35 paf 280: while(srcPtr<srcEnd) {
281: uint curVal = tables.fromTable[*srcPtr];
1.1 paf 282: if(!curVal) {
1.35 paf 283: // use the replacement character
284: *outPtr++= '?';
285: srcPtr++;
286: continue;
287: }
1.1 paf 288:
1.35 paf 289: // Figure out how many bytes we need
290: unsigned int encodedBytes;
291: if(curVal<0x80)
292: encodedBytes = 1;
293: else if(curVal<0x800)
294: encodedBytes = 2;
295: else if(curVal<0x10000)
296: encodedBytes = 3;
297: else if(curVal<0x200000)
298: encodedBytes = 4;
299: else if(curVal<0x4000000)
300: encodedBytes = 5;
301: else if(curVal<= 0x7FFFFFFF)
302: encodedBytes = 6;
303: else {
304: // use the replacement character
305: *outPtr++= '?';
306: srcPtr++;
307: continue;
308: }
1.11 paf 309:
1.35 paf 310: // If we cannot fully get this char into the output buffer
311: if (outPtr + encodedBytes > outEnd)
312: break;
313:
314: // We can do it, so update the source index
315: srcPtr++;
316:
317: // And spit out the bytes. We spit them out in reverse order
318: // here, so bump up the output pointer and work down as we go.
319: outPtr+= encodedBytes;
320: switch(encodedBytes) {
1.60 misha 321: case 6: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
322: curVal>>= 6;
323: case 5: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
324: curVal>>= 6;
325: case 4: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
326: curVal>>= 6;
327: case 3: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
328: curVal>>= 6;
329: case 2: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
330: curVal>>= 6;
331: case 1: *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]);
1.35 paf 332: }
333:
334: // Add the encoded bytes back in again to indicate we've eaten them
335: outPtr+= encodedBytes;
336: }
337:
338: // Update the bytes eaten
339: srcLen = srcPtr - srcData;
340:
341: // Return the characters read
342: toFillLen = outPtr - toFill;
343:
1.29 paf 344: //return srcPtr==srcEnd?(int)toFillLen:-1;
345: /*
346: xmlCharEncodingInputFunc
347: Returns :
348: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
349: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
350: of ocetes consumed.
351: */
352: return 0;
1.1 paf 353: }
1.26 paf 354: /// @todo digital entites only when xml/html output [at output in html/xml mode, in html part of a letter]
1.35 paf 355: static int transcodeFromUTF8(const XMLByte* srcData, size_t& srcLen,
356: XMLByte* toFill, size_t& toFillLen,
357: const Charset::Tables& tables) {
1.11 paf 358: const XMLByte* srcPtr=srcData;
359: const XMLByte* srcEnd=srcData+srcLen;
360: XMLByte* outPtr=toFill;
361: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 362:
1.35 paf 363: // We now loop until we either run out of input data, or room to store
364: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
365: // Get the next leading byte out
366: const XMLByte firstByte =* srcPtr;
367:
368: // Special-case ASCII, which is a leading byte value of<= 127
1.60 misha 369: if(firstByte<=127) {
1.35 paf 370: *outPtr++= firstByte;
371: srcPtr++;
372: continue;
373: }
374:
375: // See how many trailing src bytes this sequence is going to require
376: const unsigned int trailingBytes = gUTFBytes[firstByte];
377:
378: // If there are not enough source bytes to do this one, then we
379: // are done. Note that we done>= here because we are implicitly
380: // counting the 1 byte we get no matter what.
381: if(srcPtr+trailingBytes>= srcEnd)
382: break;
383:
384: // Looks ok, so lets build up the value
385: uint tmpVal=0;
386: switch(trailingBytes) {
387: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
388: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
389: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
390: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
391: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
392: case 0: tmpVal+=*srcPtr++;
393: break;
394:
395: default:
396: throw Exception(0,
397: 0,
1.49 paf 398: "transcodeFromUTF8 error: wrong trailingBytes value(%d)", trailingBytes); // never
1.35 paf 399: }
400: tmpVal-=gUTFOffsets[trailingBytes];
401:
402: // If it will fit into a single char, then put it in. Otherwise
403: // fail [*encode it as a surrogate pair. If its not valid, use the
404: // replacement char.*]
405: if(!(tmpVal & 0xFFFF0000)) {
1.25 paf 406: if(XMLByte xlat=xlatOneTo(tmpVal, tables, 0))
407: *outPtr++=xlat;
1.49 paf 408: else {
1.50 paf 409: outPtr+=sprintf((char *)outPtr, "&#%u;", tmpVal); // &#decimal;
1.49 paf 410: }
411: } else {
412: const XMLByte* recoverPtr=srcPtr-trailingBytes-1;
413: for(uint i=0; i<=trailingBytes; i++)
414: outPtr+=sprintf((char*)outPtr, "%%%02X", *recoverPtr++);
415: }
1.1 paf 416: }
1.35 paf 417:
418: // Update the bytes eaten
419: srcLen = srcPtr - srcData;
420:
421: // Return the characters read
422: toFillLen = outPtr - toFill;
1.11 paf 423:
1.29 paf 424: //return srcPtr==srcEnd?(int)toFillLen:-1;
425: /*
426: xmlCharEncodingOutputFunc
427: Returns :
428: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
429: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
430: of ocetes consumed.
431: */
432: return 0;
1.10 paf 433: }
434:
/// Tells whether byte @a c must be written in escaped form.
/// Written as is: ASCII letters, digits and the characters "*@-_+./".
/// Everything else is escaped, and that includes the zero byte:
/// strchr would otherwise find it as the terminator of the character list.
static bool is_escaped(char c){
	const unsigned char code=(unsigned char)c;
	if(code==0 || code>127)
		return true;

	if((c>='0' && c<='9') || (c>='A' && c<='Z') || (c>='a' && c<='z'))
		return false;

	return strchr("*@-_+./", c)==0;
}
445:
446: // read one utf8 character, return number of bytes needed for store it
1.61 misha 447: static unsigned int readChar(const XMLByte*& srcPtr, const XMLByte* srcEnd, XMLByte& firstByte, XMLCh& UTF8Char){
1.60 misha 448: if(!srcPtr || !*srcPtr || srcPtr>=srcEnd)
449: return 0;
450:
451: firstByte=*srcPtr;
452:
453: if(firstByte<=127){
454: UTF8Char=firstByte;
455: srcPtr++;
456: return 1;
457: }
458:
459: unsigned int trailingBytes=gUTFBytes[firstByte];
460:
461: if(srcPtr+trailingBytes>=srcEnd){
462: return 0; // not enough bytes in source string for reading
463: }
464:
465: uint tmpVal=0;
466: switch(trailingBytes){
467: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
468: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
469: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
470: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
471: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
472: case 0: tmpVal+=*srcPtr++;
473: }
474:
475: tmpVal-=gUTFOffsets[trailingBytes];
476: UTF8Char=tmpVal;
477:
478: return trailingBytes+1;
479: }
480:
1.62 misha 481: static unsigned int skipChar(const XMLByte*& srcPtr, const XMLByte* srcEnd){
482: if(!srcPtr || !*srcPtr || srcPtr>=srcEnd)
483: return 0;
484:
1.63 ! misha 485: unsigned int trailingBytes=gUTFBytes[*srcPtr]+1;
! 486: srcPtr+=trailingBytes;
1.62 misha 487:
488: return trailingBytes;
1.61 misha 489: }
490:
1.60 misha 491: // read char, return number of bytes needed for store it as UTF8
1.61 misha 492: static unsigned int readChar(const XMLByte*& srcPtr, const XMLByte* srcEnd, XMLByte& firstByte, XMLCh& UTF8Char, const Charset::Tables& tables){
1.60 misha 493: if(!srcPtr || !*srcPtr || srcPtr>=srcEnd)
494: return 0;
495:
496: firstByte=*srcPtr++;
497: UTF8Char=tables.fromTable[firstByte];
498:
499: if(UTF8Char<0x80)
500: return 1;
501: else if(UTF8Char<0x800)
502: return 2;
503: else if(UTF8Char<0x10000)
504: return 3;
505: else if(UTF8Char<0x200000)
506: return 4;
507: else if(UTF8Char<0x4000000)
508: return 5;
509: else if(UTF8Char<= 0x7FFFFFFF)
510: return 6;
511:
512: // will use the replacement character '?'
513: firstByte=0;
514: return 1;
515: }
516:
517: static int escape(const XMLByte* srcData, size_t& srcLen,
518: XMLByte* toFill, size_t& toFillLen) {
519: const XMLByte* srcPtr=srcData;
520: const XMLByte* srcEnd=srcData+srcLen;
521: XMLByte* outPtr=toFill;
522: XMLByte* outEnd=toFill+toFillLen;
523: XMLByte firstByte;
524: XMLCh UTF8Char;
525: uint charSize;
526:
527: // loop until we either run out of input data, or room to store
528: while((outPtr < outEnd) && (charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char))){
529: if(charSize==1){
530: if(is_escaped(firstByte)) // %XX
531: outPtr+=sprintf((char*)outPtr, "%%%02X", firstByte);
532: else
533: *outPtr++=firstByte;
534: } else
535: outPtr+=sprintf((char*)outPtr, "%%u%04X", UTF8Char); // %uXXXX
536: }
537:
538: // Update the bytes eaten
539: srcLen=srcPtr-srcData;
540:
541: // Return the characters read
542: toFillLen=outPtr-toFill;
543:
544: return 0;
545: }
546:
547: static int escape(const XMLByte* srcData, size_t& srcLen,
548: XMLByte *toFill, size_t& toFillLen,
549: const Charset::Tables& tables) {
550: const XMLByte* srcPtr=srcData;
551: const XMLByte* srcEnd=srcData+srcLen;
552: XMLByte* outPtr=toFill;
553: //XMLByte* outEnd=toFill+toFillLen;
554: XMLByte firstByte;
555: XMLCh UTF8Char;
556: uint charSize;
557:
558: while(charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char, tables)){
559: if(charSize==1){
560: if(firstByte){
561: if(is_escaped(firstByte)) // %XX
562: outPtr+=sprintf((char*)outPtr, "%%%02X", firstByte);
563: else
564: *outPtr++=firstByte;
565: } else // add replacement char '?'
566: *outPtr++='?';
567: } else
568: outPtr+=sprintf((char*)outPtr, "%%u%04X", UTF8Char); // %uXXXX
569: }
570:
571: // Update the bytes eaten
572: srcLen = srcPtr - srcData;
573:
574: // Return the characters read
575: toFillLen = outPtr - toFill;
576:
577: return 0;
578: }
579:
580:
581: String::C Charset::escape(const String::C src, const Charset& source_charset){
582: size_t src_length=src.length;
583: if(!src_length)
584: return String::C("", 0);
585:
586: #ifdef PRECALCULATE_DEST_LENGTH
587: size_t dest_length=0;
588: const XMLByte* srcPtr=(XMLByte*)src.str;
589: const XMLByte* srcEnd=srcPtr+src_length;
590: XMLByte firstByte;
591: XMLCh UTF8Char;
592:
593: if(source_charset.isUTF8()){
594: while(uint charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char)){
595: if(charSize==1)
596: dest_length+=!is_escaped(firstByte)?1:3/*%XX*/;
597: else
598: dest_length+=6; // '%uXXXX'
599: }
600: } else {
601: while(uint charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char, source_charset.tables)){
602: if(charSize==1)
603: dest_length+=(!firstByte/*replacement char '?'*/ || !is_escaped(firstByte))?1:3/*'%XX'*/;
604: else
605: dest_length+=6; // '%uXXXX'
606: }
607: }
608: #else
609: size_t dest_length=src_length*6; // enough for %uXXXX but too memory-hungry
610: #endif
611:
612: //throw Exception(0,0,"%u",dest_length);
613:
614: #ifndef NDEBUG
615: size_t saved_dest_length=dest_length;
616: #endif
617: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
618:
619: int status;
620: if(source_charset.isUTF8()){
621: status=::escape((XMLByte *)src.str, src_length, dest_body, dest_length);
622: } else {
623: status=::escape((XMLByte *)src.str, src_length, dest_body, dest_length, source_charset.tables);
624: }
625:
626: if(status<0)
627: throw Exception(0,
628: 0,
629: "Charset::escapeString buffer overflow");
630:
631: assert(dest_length<=saved_dest_length);
632: dest_body[dest_length]=0; // terminator
633: return String::C((char*)dest_body, dest_length);
634: }
635:
636:
1.35 paf 637: const String::C Charset::transcodeToUTF8(const String::C src) const {
638: size_t src_length=src.length;
1.60 misha 639:
640: #ifdef PRECALCULATE_DEST_LENGTH
641: size_t dest_length=0;
642: const XMLByte* srcPtr=(XMLByte*)src.str;
643: const XMLByte* srcEnd=srcPtr+src_length;
644: XMLByte firstByte;
645: XMLCh UTF8Char;
646: while(uint charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char, tables))
647: dest_length+=charSize;
648: #else
649: size_t dest_length=src_length*6; // so that surly enough (max utf8 seq len=6) but too memory-hyngry
650: #endif
651:
652: //throw Exception(0,0,"%u",dest_length);
653:
1.35 paf 654: #ifndef NDEBUG
655: size_t saved_dest_length=dest_length;
656: #endif
657: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
1.11 paf 658:
659: if(::transcodeToUTF8(
1.35 paf 660: (XMLByte *)src.str, src_length,
661: dest_body, dest_length,
1.11 paf 662: tables)<0)
1.43 paf 663: throw Exception(0,
1.10 paf 664: 0,
1.11 paf 665: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 666:
1.60 misha 667: assert(dest_length<=saved_dest_length);
668: dest_body[dest_length]=0; // terminator
1.35 paf 669: return String::C((char*)dest_body, dest_length);
1.10 paf 670: }
1.38 paf 671:
672: static XMLCh change_case_UTF8(const XMLCh src, const Charset::UTF8CaseTable& table) {
1.39 paf 673: int lo = 0;
674: int hi = table.size - 1;
675: while(lo<=hi) {
1.38 paf 676: // Calc the mid point of the low and high offset.
1.39 paf 677: const unsigned int i = (lo + hi) / 2;
678:
679: XMLCh cur=table.records[i].from;
680: if(src==cur)
681: return table.records[i].to;
682: if(src>cur)
683: lo = i+1;
1.38 paf 684: else
1.39 paf 685: hi = i-1;
686: }
687:
688: // not found
1.38 paf 689: return src;
690: }
691:
1.58 misha 692: static void store_UTF8(XMLCh src, XMLByte*& outPtr){
1.38 paf 693: if(!src) {
694: // use the replacement character
695: *outPtr++= '?';
696: return;
697: }
698:
699: // Figure out how many bytes we need
700: unsigned int encodedBytes;
701: if(src<0x80)
702: encodedBytes = 1;
703: else if(src<0x800)
704: encodedBytes = 2;
705: else if(src<0x10000)
706: encodedBytes = 3;
707: else if(src<0x200000)
708: encodedBytes = 4;
709: else if(src<0x4000000)
710: encodedBytes = 5;
711: else if(src<= 0x7FFFFFFF)
712: encodedBytes = 6;
713: else {
714: // use the replacement character
715: *outPtr++= '?';
716: return;
717: }
718:
719: // And spit out the bytes. We spit them out in reverse order
720: // here, so bump up the output pointer and work down as we go.
721: outPtr+= encodedBytes;
722: switch(encodedBytes) {
723: case 6: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
724: src>>= 6;
725: case 5: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
726: src>>= 6;
727: case 4: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
728: src>>= 6;
729: case 3: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
730: src>>= 6;
731: case 2: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
732: src>>= 6;
733: case 1: *--outPtr = XMLByte(src | gFirstByteMark[encodedBytes]);
734: }
735:
736: // Add the encoded bytes back in again to indicate we've eaten them
737: outPtr+= encodedBytes;
738: }
739:
740: static void change_case_UTF8(XMLCh src, XMLByte*& outPtr,
741: const Charset::UTF8CaseTable& table) {
742: store_UTF8(change_case_UTF8(src, table), outPtr);
743: };
1.44 paf 744: void change_case_UTF8(const XMLByte* srcData, size_t srcLen,
745: XMLByte* toFill, size_t toFillLen,
746: const Charset::UTF8CaseTable& table) {
1.38 paf 747: const XMLByte* srcPtr=srcData;
1.44 paf 748: const XMLByte* srcEnd=srcData+srcLen;
1.38 paf 749: XMLByte* outPtr=toFill;
1.44 paf 750: XMLByte* outEnd=toFill+toFillLen;
751:
752: // We now loop until we either run out of input data, or room to store
753: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
754: // Get the next leading byte out
755: const XMLByte firstByte =* srcPtr;
1.38 paf 756:
1.60 misha 757: if(firstByte<=127) {
1.38 paf 758: change_case_UTF8(firstByte, outPtr, table);
759: srcPtr++;
760: continue;
761: }
762:
763: // See how many trailing src bytes this sequence is going to require
764: const unsigned int trailingBytes = gUTFBytes[firstByte];
765:
766: // Looks ok, so lets build up the value
767: uint tmpVal=0;
768: switch(trailingBytes) {
769: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
770: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
771: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
772: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
773: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
774: case 0: tmpVal+=*srcPtr++;
775: break;
776:
777: default:
778: throw Exception(0,
779: 0,
780: "change_case_UTF8 error: wrong trailingBytes value(%d)", trailingBytes);
781: }
782: tmpVal-=gUTFOffsets[trailingBytes];
783:
784: // If it will fit into a single char, then put it in. Otherwise
785: // fail [*encode it as a surrogate pair. If its not valid, use the
786: // replacement char.*]
787: if(!(tmpVal & 0xFFFF0000))
788: change_case_UTF8(tmpVal, outPtr, table);
789: else
790: throw Exception(0,
791: 0,
792: "change_case_UTF8 error: too big tmpVal(0x%08X)", tmpVal);
793: }
794:
795: if(srcPtr!=outPtr)
796: throw Exception(0,
797: 0,
798: "change_case_UTF8 error: end pointers do not match");
799: }
800:
1.60 misha 801: static size_t getDecNumLength(XMLCh UTF8Char){
802: return
803: (UTF8Char < 100)
804: ?2
805: :(UTF8Char < 1000)
806: ?3
807: :(UTF8Char < 10000)
808: ?4
809: :5;
810: }
1.38 paf 811:
1.35 paf 812: const String::C Charset::transcodeFromUTF8(const String::C src) const {
813: size_t src_length=src.length;
1.60 misha 814:
815: #ifdef PRECALCULATE_DEST_LENGTH
816: size_t dest_length=0;
817: const XMLByte* srcPtr=(XMLByte*)src.str;
818: const XMLByte* srcEnd=srcPtr+src_length;
819: XMLByte firstByte;
820: XMLCh UTF8Char;
821: while(uint charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char)){
822: if(charSize==1)
823: dest_length++;
824: else
825: dest_length+=(UTF8Char & 0xFFFF0000)
826: ?charSize*3 // '%XX' for each byte
827: :(xlatOneTo(UTF8Char, tables, 0)!=0)
828: ?1 // can convert it to single char
829: :getDecNumLength(UTF8Char)+3; // &#XX; - &#XXXXX;
830: }
831: #else
832: // so that surly enough, "&#XXX;" has max ratio (huh? 8 bytes needed for '&#XXXXX;')
833: size_t dest_length=src_length*6;
834: #endif
835:
836: //throw Exception(0,0,"%u",dest_length);
837:
1.35 paf 838: #ifndef NDEBUG
839: size_t saved_dest_length=dest_length;
840: #endif
841: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
1.11 paf 842:
843: if(::transcodeFromUTF8(
1.35 paf 844: (XMLByte *)src.str, src_length,
845: dest_body, dest_length,
1.11 paf 846: tables)<0)
1.43 paf 847: throw Exception(0,
1.10 paf 848: 0,
1.35 paf 849: "Charset::transcodeFromUTF8 buffer overflow");
1.10 paf 850:
1.60 misha 851: assert(dest_length<=saved_dest_length);
852: dest_body[dest_length]=0; // terminator
1.35 paf 853: return String::C((char*)dest_body, dest_length);
1.1 paf 854: }
855:
856: /// transcode using both charsets
1.35 paf 857: const String::C Charset::transcodeToCharset(const String::C src,
858: const Charset& dest_charset) const {
859: if(&dest_charset==this)
860: return src;
861: else {
862: size_t dest_length=src.length;
863: XMLByte* dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
864:
865: XMLByte* output=dest_body;
866: const XMLByte* input=(XMLByte *)src.str;
867: while(XMLCh c=*input++) {
868: XMLCh curVal = tables.fromTable[c];
869: *output++=curVal?
870: xlatOneTo(curVal, dest_charset.tables, '?') // OK
871: :'?'; // use the replacement character
1.6 paf 872: }
1.1 paf 873:
1.35 paf 874: dest_body[dest_length]=0; // terminator
875: return String::C((char*)dest_body, dest_length);
1.6 paf 876: }
1.1 paf 877: }
878:
1.58 misha 879: void Charset::store_Char(XMLByte*& outPtr, XMLCh src, XMLByte not_found){
1.59 misha 880: if(isUTF8())
1.58 misha 881: store_UTF8(src, outPtr);
1.59 misha 882: else if(char ch=xlatOneTo(src, tables, not_found))
1.58 misha 883: *outPtr++=ch;
1.57 misha 884: }
885:
1.1 paf 886: #ifdef XML
1.10 paf 887:
1.35 paf 888: static const Charset::Tables* tables[MAX_CHARSETS];
889:
1.46 paf 890: #ifdef PA_PATCHED_LIBXML_BACKWARD
891:
892: #define declareXml256ioFuncs(i) \
893: static int xml256CharEncodingInputFunc##i( \
894: unsigned char *out, int *outlen, \
895: const unsigned char *in, int *inlen, void*) { \
896: return transcodeToUTF8( \
897: in, *(size_t*)inlen, \
898: out, *(size_t*)outlen, \
899: *tables[i]); \
900: } \
901: static int xml256CharEncodingOutputFunc##i( \
902: unsigned char *out, int *outlen, \
903: const unsigned char *in, int *inlen, void*) { \
904: return transcodeFromUTF8( \
905: in, *(size_t*)inlen, \
906: out, *(size_t*)outlen, \
907: *tables[i]); \
908: }
909:
910: #else
911:
1.35 paf 912: #define declareXml256ioFuncs(i) \
913: static int xml256CharEncodingInputFunc##i( \
914: unsigned char *out, int *outlen, \
915: const unsigned char *in, int *inlen) { \
916: return transcodeToUTF8( \
917: in, *(size_t*)inlen, \
918: out, *(size_t*)outlen, \
919: *tables[i]); \
920: } \
921: static int xml256CharEncodingOutputFunc##i( \
922: unsigned char *out, int *outlen, \
923: const unsigned char *in, int *inlen) { \
924: return transcodeFromUTF8( \
925: in, *(size_t*)inlen, \
926: out, *(size_t*)outlen, \
927: *tables[i]); \
928: }
929:
1.46 paf 930: #endif
931:
932:
1.35 paf 933: declareXml256ioFuncs(0) declareXml256ioFuncs(1)
934: declareXml256ioFuncs(2) declareXml256ioFuncs(3)
935: declareXml256ioFuncs(4) declareXml256ioFuncs(5)
936: declareXml256ioFuncs(6) declareXml256ioFuncs(7)
937: declareXml256ioFuncs(8) declareXml256ioFuncs(9)
938:
939: static xmlCharEncodingInputFunc inputFuncs[MAX_CHARSETS]={
940: xml256CharEncodingInputFunc0, xml256CharEncodingInputFunc1,
941: xml256CharEncodingInputFunc2, xml256CharEncodingInputFunc3,
942: xml256CharEncodingInputFunc4, xml256CharEncodingInputFunc5,
943: xml256CharEncodingInputFunc6, xml256CharEncodingInputFunc7,
944: xml256CharEncodingInputFunc8, xml256CharEncodingInputFunc9
945: };
946: static xmlCharEncodingOutputFunc outputFuncs[MAX_CHARSETS]={
947: xml256CharEncodingOutputFunc0, xml256CharEncodingOutputFunc1,
948: xml256CharEncodingOutputFunc2, xml256CharEncodingOutputFunc3,
949: xml256CharEncodingOutputFunc4, xml256CharEncodingOutputFunc5,
950: xml256CharEncodingOutputFunc6, xml256CharEncodingOutputFunc7,
951: xml256CharEncodingOutputFunc8, xml256CharEncodingOutputFunc9
952: };
953: static size_t handlers_count=0;
1.10 paf 954:
955: void Charset::addEncoding(char *name_cstr) {
1.35 paf 956: if(handlers_count==MAX_CHARSETS)
957: throw Exception(0,
958: 0,
959: "already allocated %d handlers, no space for new encoding '%s'",
960: MAX_CHARSETS, name_cstr);
961:
1.45 paf 962: xmlCharEncodingHandler* handler=new(UseGC) xmlCharEncodingHandler;
1.35 paf 963: {
964: handler->name=name_cstr;
965: handler->input=inputFuncs[handlers_count];
966: handler->output=outputFuncs[handlers_count];
967: ::tables[handlers_count]=&tables;
968: handlers_count++;
969: }
1.10 paf 970:
971: xmlRegisterCharEncodingHandler(handler);
1.35 paf 972:
1.10 paf 973: }
974:
1.37 paf 975: void Charset::initTranscoder(const String::Body NAME, const char* name_cstr) {
1.15 paf 976: ftranscoder=xmlFindCharEncodingHandler(name_cstr);
1.35 paf 977: transcoder(NAME); // check right way
1.15 paf 978: }
979:
1.37 paf 980: xmlCharEncodingHandler& Charset::transcoder(const String::Body NAME) {
1.15 paf 981: if(!ftranscoder)
1.56 misha 982: throw Exception(PARSER_RUNTIME,
1.35 paf 983: new String(NAME, String::L_TAINTED),
1.10 paf 984: "unsupported encoding");
1.35 paf 985: return *ftranscoder;
1.10 paf 986: }
987:
1.54 paf 988: String::C Charset::transcode_cstr(const xmlChar* s) {
1.13 paf 989: if(!s)
1.35 paf 990: return String::C("", 0);
1.8 paf 991:
1.35 paf 992: int inlen=strlen((const char*)s);
1.51 paf 993: int outlen=inlen*6/*strlen("ÿ")*/; // max
1.35 paf 994: #ifndef NDEBUG
995: int saved_outlen=outlen;
996: #endif
997: char *out=new(PointerFreeGC) char[outlen+1];
1.8 paf 998:
1.30 paf 999: int error;
1.35 paf 1000: if(xmlCharEncodingOutputFunc output=transcoder(FNAME).output) {
1.30 paf 1001: error=output(
1.17 paf 1002: (unsigned char*)out, &outlen,
1.46 paf 1003: (const unsigned char*)s, &inlen
1004: #ifdef PA_PATCHED_LIBXML_BACKWARD
1005: ,0
1006: #endif
1007: );
1.30 paf 1008: } else {
1009: memcpy(out, s, outlen=inlen);
1010: error=0;
1011: }
1012: if(error<0)
1.23 paf 1013: throw Exception(0,
1.8 paf 1014: 0,
1.30 paf 1015: "transcode_cstr failed (%d)", error);
1.8 paf 1016:
1.35 paf 1017: assert(outlen<=saved_outlen); out[outlen]=0;
1018: return String::C(out, outlen);
1.14 paf 1019: }
1.54 paf 1020: const String& Charset::transcode(const xmlChar* s) {
1.35 paf 1021: String::C cstr=transcode_cstr(s);
1022: return *new String(cstr.str, cstr.length, true);
1.1 paf 1023: }
1024:
1.8 paf 1025: /// @test less memory using -maybe- xmlParserInputBufferCreateMem
1.35 paf 1026: xmlChar* Charset::transcode_buf2xchar(const char* buf, size_t buf_size) {
1027: xmlChar* out;
1.30 paf 1028: int outlen;
1029: int error;
1.35 paf 1030: #ifndef NDEBUG
1031: int saved_outlen;
1032: #endif
1033: if(xmlCharEncodingInputFunc input=transcoder(FNAME).input) {
1.51 paf 1034: outlen=buf_size*6/*max UTF8 bytes per char*/;
1.35 paf 1035: #ifndef NDEBUG
1036: saved_outlen=outlen;
1037: #endif
1.47 paf 1038: out=(xmlChar*)xmlMalloc(outlen+1);
1.30 paf 1039: error=input(
1.17 paf 1040: out, &outlen,
1.46 paf 1041: (const unsigned char*)buf, (int*)&buf_size
1042: #ifdef PA_PATCHED_LIBXML_BACKWARD
1043: ,0
1044: #endif
1045: );
1.30 paf 1046: } else {
1047: outlen=buf_size;
1.35 paf 1048: #ifndef NDEBUG
1049: saved_outlen=outlen;
1050: #endif
1051: out=(xmlChar*)xmlMalloc(outlen+1);
1.30 paf 1052: memcpy(out, buf, outlen);
1053: error=0;
1054: }
1.17 paf 1055:
1.30 paf 1056: if(error<0)
1.23 paf 1057: throw Exception(0,
1.8 paf 1058: 0,
1.30 paf 1059: "transcode_buf failed (%d)", error);
1.8 paf 1060:
1.35 paf 1061: assert(outlen<=saved_outlen); out[outlen]=0;
1062: return out;
1.24 paf 1063: }
1.54 paf 1064: xmlChar* Charset::transcode(const String& s) {
1.35 paf 1065: const char* cstr=s.cstr(String::L_UNSPECIFIED);
1.1 paf 1066:
1.54 paf 1067: return transcode_buf2xchar(cstr, strlen(cstr));
1.1 paf 1068: }
1.54 paf 1069: xmlChar* Charset::transcode(const String::Body s) {
1.35 paf 1070: const char* cstr=s.cstr();
1071:
1.54 paf 1072: return transcode_buf2xchar(cstr, s.length());
1.35 paf 1073: }
1.36 paf 1074: #endif
1.34 paf 1075:
1.37 paf 1076: String::Body Charset::transcode(const String::Body src,
1.34 paf 1077: const Charset& source_transcoder,
1.35 paf 1078: const Charset& dest_transcoder) {
1.34 paf 1079:
1.35 paf 1080: const char *src_ptr=src.cstr();
1.34 paf 1081: size_t src_size=strlen(src_ptr);
1082:
1.35 paf 1083: String::C dest=Charset::transcode(String::C(src_ptr, src_size),
1084: source_transcoder,
1085: dest_transcoder);
1.34 paf 1086:
1.37 paf 1087: return String::Body(dest.str, dest.length);
1.35 paf 1088: }
1089:
1090: String& Charset::transcode(const String& src,
1091: const Charset& source_transcoder,
1092: const Charset& dest_transcoder) {
1093: if(!src.length())
1094: return *new String("", 0, false);
1.34 paf 1095:
1.37 paf 1096: return *new String(transcode((String::Body)src, source_transcoder, dest_transcoder), String::L_CLEAN);
1.34 paf 1097: }
1098:
1.35 paf 1099: void Charset::transcode(ArrayString& src,
1.34 paf 1100: const Charset& source_transcoder,
1.35 paf 1101: const Charset& dest_transcoder) {
1102: for(size_t i=0; i<src.count(); i++)
1103: src.put(i, &transcode(*src[i], source_transcoder, dest_transcoder));
1.34 paf 1104: }
1105:
1106: #ifndef DOXYGEN
1107: struct Transcode_pair_info {
1108: const Charset* source_transcoder;
1109: const Charset* dest_transcoder;
1110: };
1111: #endif
1.40 paf 1112: static void transcode_pair(const String::Body /*akey*/,
1.37 paf 1113: String::Body& avalue,
1.35 paf 1114: Transcode_pair_info* info) {
1115: avalue=Charset::transcode(avalue,
1116: *info->source_transcoder,
1117: *info->dest_transcoder);
1.34 paf 1118: }
1.61 misha 1119:
1.35 paf 1120: void Charset::transcode(HashStringString& src,
1.34 paf 1121: const Charset& source_transcoder,
1.35 paf 1122: const Charset& dest_transcoder) {
1123: Transcode_pair_info info={&source_transcoder, &dest_transcoder};
1.55 paf 1124: src.for_each_ref<Transcode_pair_info*>(transcode_pair, &info);
1.34 paf 1125: }
1.61 misha 1126:
1127: size_t getUTF8BytePos(const XMLByte* srcBegin, const XMLByte* srcEnd, size_t charPos){
1128: const XMLByte* ptr=srcBegin;
1.62 misha 1129: while(charPos-- && skipChar(ptr, srcEnd));
1.61 misha 1130:
1131: return ptr-srcBegin;
1132: }
1133:
1134: size_t getUTF8CharPos(const XMLByte* srcBegin, const XMLByte* srcEnd, size_t bytePos){
1135: size_t charPos=0;
1136: const XMLByte* ptr=srcBegin;
1137: const XMLByte* ptrEnd=srcBegin+bytePos;
1.62 misha 1138: while(skipChar(ptr, srcEnd)){
1.61 misha 1139: if(ptr>ptrEnd)
1140: return charPos;
1141: charPos++;
1142: }
1143:
1144: // scan till end but position in bytes still too low
1145: throw Exception(0,
1146: 0,
1147: "Error convertion byte pos to char pos");
1148: }
1149:
1150: size_t lengthUTF8(const XMLByte* srcBegin, const XMLByte* srcEnd){
1151: size_t size=0;
1.62 misha 1152: while(skipChar(srcBegin, srcEnd))
1.61 misha 1153: size++;
1154:
1155: return size;
1156: }
E-mail: