Annotation of parser3/src/main/pa_charset.C, revision 1.62
1.1 paf 1: /** @file
2: Parser: Charset connection implementation.
3:
1.52 paf 4: Copyright(c) 2001-2005 ArtLebedev Group (http://www.artlebedev.com)
1.4 paf 5: Author: Alexander Petrosyan<paf@design.ru>(http://paf.design.ru)
1.27 paf 6: */
1.1 paf 7:
1.62 ! misha 8: static const char * const IDENT_CHARSET_C="$Date: 2008-07-16 17:07:16 $";
1.1 paf 9:
10: #include "pa_charset.h"
1.35 paf 11: #include "pa_charsets.h"
1.1 paf 12:
13: #ifdef XML
1.8 paf 14: #include "libxml/encoding.h"
1.1 paf 15: #endif
16:
1.46 paf 17: //#define PA_PATCHED_LIBXML_BACKWARD
1.60 misha 18: #define PRECALCULATE_DEST_LENGTH
1.46 paf 19:
1.38 paf 20: // globals
21:
22: Charset::UTF8CaseTable::Rec UTF8CaseToUpperRecords[]={
23: #include "utf8-to-upper.inc"
24: };
25: Charset::UTF8CaseTable UTF8CaseToUpper={
26: sizeof(UTF8CaseToUpperRecords)/sizeof(Charset::UTF8CaseTable::Rec),
27: UTF8CaseToUpperRecords};
28:
29: Charset::UTF8CaseTable::Rec UTF8CaseToLowerRecords[]={
30: #include "utf8-to-lower.inc"
31: };
32: Charset::UTF8CaseTable UTF8CaseToLower={
33: sizeof(UTF8CaseToLowerRecords)/sizeof(Charset::UTF8CaseTable::Rec),
34: UTF8CaseToLowerRecords};
35:
1.1 paf 36: // helpers
37:
38: inline void prepare_case_tables(unsigned char *tables) {
39: unsigned char *lcc_table=tables+lcc_offset;
40: unsigned char *fcc_table=tables+fcc_offset;
41: for(int i=0; i<0x100; i++)
1.53 paf 42: lcc_table[i]=fcc_table[i]=(unsigned char)i;
1.1 paf 43: }
44: inline void cstr2ctypes(unsigned char *tables, const unsigned char *cstr,
45: unsigned char bit) {
46: unsigned char *ctypes_table=tables+ctypes_offset;
47: ctypes_table[0]=bit;
48: for(; *cstr; cstr++) {
49: unsigned char c=*cstr;
50: ctypes_table[c]|=bit;
51: }
52: }
/// Converts a table cell to a character code.
/// - null or empty cell: 0
/// - exactly one character: the code of that character (as unsigned char)
/// - anything longer: parsed by strtol with base 0 (decimal, 0x-hex, 0-octal)
inline unsigned int to_wchar_code(const char* cstr) {
	if(cstr==0 || cstr[0]=='\0')
		return 0;

	if(cstr[1]=='\0')
		return static_cast<unsigned int>(static_cast<unsigned char>(cstr[0]));

	// the end-of-number position is of no interest here
	const long code=strtol(cstr, 0, 0);
	return static_cast<unsigned int>(code);
}
/// A cell counts as "true" when it is present and not empty.
inline bool to_bool(const char* cstr) {
	if(!cstr)
		return false;
	return cstr[0]!='\0';
}
65: static void element2ctypes(unsigned char c, bool belongs,
66: unsigned char *tables, unsigned char bit, int group_offset=-1) {
67: if(!belongs)
68: return;
69:
70: unsigned char *ctypes_table=tables+ctypes_offset;
71:
72: ctypes_table[c]|=bit;
73: if(group_offset>=0)
1.4 paf 74: tables[cbits_offset+group_offset+c/8] |= 1<<(c%8);
1.1 paf 75: }
76: static void element2case(unsigned char from, unsigned char to,
77: unsigned char *tables) {
78: if(!to)
79: return;
80:
81: unsigned char *lcc_table=tables+lcc_offset;
82: unsigned char *fcc_table=tables+fcc_offset;
83: lcc_table[from]=to;
84: fcc_table[from]=to; fcc_table[to]=from;
85: }
86:
87: // methods
88:
89: extern "C" unsigned char pcre_default_tables[]; // pcre/chartables.c
1.37 paf 90: Charset::Charset(Request_charsets* charsets, const String::Body ANAME, const String* afile_spec):
1.35 paf 91: FNAME(ANAME),
92: FNAME_CSTR(ANAME.cstrm()) {
1.7 paf 93:
1.35 paf 94: if(afile_spec) {
1.1 paf 95: fisUTF8=false;
1.35 paf 96: load_definition(*charsets, *afile_spec);
1.1 paf 97: #ifdef XML
1.35 paf 98: addEncoding(FNAME_CSTR);
1.1 paf 99: #endif
100: } else {
101: fisUTF8=true;
1.4 paf 102: // grab default onces [for UTF-8 so to be able to make a-z =>A-Z
1.1 paf 103: memcpy(pcre_tables, pcre_default_tables, sizeof(pcre_tables));
104: }
105:
106: #ifdef XML
1.35 paf 107: initTranscoder(FNAME, FNAME_CSTR);
1.1 paf 108: #endif
109: }
110:
1.35 paf 111: void Charset::load_definition(Request_charsets& charsets, const String& afile_spec) {
1.1 paf 112: // pcre_tables
113: // lowcase, flipcase, bits digit+word+whitespace, masks
114:
115: // must not move this inside of prepare_case_tables
116: // don't know the size there
117: memset(pcre_tables, 0, sizeof(pcre_tables));
118: prepare_case_tables(pcre_tables);
1.4 paf 119: cstr2ctypes(pcre_tables,(const unsigned char *)"*+?{^.$|()[", ctype_meta);
1.1 paf 120:
121: // charset
1.35 paf 122: memset(&tables, 0, sizeof(tables));
1.1 paf 123:
124: // loading text
1.35 paf 125: char *data=file_read_text(charsets, afile_spec);
1.1 paf 126:
127: // ignore header
128: getrow(&data);
129:
130: // parse cells
131: char *row;
1.42 paf 132: while((row=getrow(&data))) {
1.1 paf 133: // remove empty&comment lines
134: if(!*row || *row=='#')
135: continue;
136:
137: // char white-space digit hex-digit letter word lowercase unicode1 unicode2
1.53 paf 138: unsigned char c=0;
1.1 paf 139: char *cell;
1.42 paf 140: for(int column=0; (cell=lsplit(&row, '\t')); column++) {
1.1 paf 141: switch(column) {
1.53 paf 142: case 0: c=(unsigned char)to_wchar_code(cell); break;
1.1 paf 143: // pcre_tables
144: case 1: element2ctypes(c, to_bool(cell), pcre_tables, ctype_space, cbit_space); break;
145: case 2: element2ctypes(c, to_bool(cell), pcre_tables, ctype_digit, cbit_digit); break;
146: case 3: element2ctypes(c, to_bool(cell), pcre_tables, ctype_xdigit); break;
147: case 4: element2ctypes(c, to_bool(cell), pcre_tables, ctype_letter); break;
148: case 5: element2ctypes(c, to_bool(cell), pcre_tables, ctype_word, cbit_word); break;
1.53 paf 149: case 6: element2case(c, (unsigned char)to_wchar_code(cell), pcre_tables); break;
1.1 paf 150: case 7:
151: case 8:
152: // charset
1.10 paf 153: if(tables.toTableSize>MAX_CHARSET_UNI_CODES)
1.56 misha 154: throw Exception(PARSER_RUNTIME,
1.35 paf 155: &afile_spec,
1.1 paf 156: "charset must contain not more then %d unicode values", MAX_CHARSET_UNI_CODES);
157:
158: XMLCh unicode=(XMLCh)to_wchar_code(cell);
159: if(!unicode && column==7/*unicode1 column*/)
160: unicode=(XMLCh)c;
161: if(unicode) {
1.10 paf 162: if(!tables.fromTable[c])
163: tables.fromTable[c]=unicode;
164: tables.toTable[tables.toTableSize].intCh=unicode;
165: tables.toTable[tables.toTableSize].extCh=(XMLByte)c;
166: tables.toTableSize++;
1.1 paf 167: }
168: break;
169: }
170: }
171: };
172:
173: // sort by the Unicode code point
174: sort_ToTable();
175: }
176:
177: static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
178: return
1.38 paf 179: static_cast<const Charset::Tables::Rec *>(a)->intCh-
180: static_cast<const Charset::Tables::Rec *>(b)->intCh;
1.1 paf 181: }
182:
183: void Charset::sort_ToTable() {
1.10 paf 184: _qsort(tables.toTable, tables.toTableSize, sizeof(*tables.toTable),
1.1 paf 185: sort_cmp_Trans_rec_intCh);
186: //FILE *f=fopen("c:\\temp\\a", "wb");
1.10 paf 187: //fwrite(tables.toTable, tables.toTableSize, sizeof(*tables.toTable), f);
1.1 paf 188: //fclose(f);
189: }
190:
1.60 misha 191: // @todo: precache for spedup searching
1.10 paf 192: static XMLByte xlatOneTo(const XMLCh toXlat,
1.35 paf 193: const Charset::Tables& tables,
194: XMLByte not_found) {
1.39 paf 195: int lo = 0;
196: int hi = tables.toTableSize - 1;
197: while(lo<=hi) {
1.35 paf 198: // Calc the mid point of the low and high offset.
1.39 paf 199: const unsigned int i = (lo + hi) / 2;
200:
201: XMLCh cur=tables.toTable[i].intCh;
202: if(toXlat==cur)
203: return tables.toTable[i].extCh;
204: if(toXlat>cur)
205: lo = i+1;
1.1 paf 206: else
1.39 paf 207: hi = i-1;
208: }
1.35 paf 209:
210: return not_found;
1.1 paf 211: }
212:
1.35 paf 213: String::C Charset::transcode(const String::C src,
214: const Charset& source_charset,
215: const Charset& dest_charset) {
216: if(!src.length)
217: return String::C("", 0);
1.4 paf 218:
1.1 paf 219: switch((source_charset.isUTF8()?0x10:0x00)|(dest_charset.isUTF8()?0x01:0x00)) {
220: default: // 0x00
1.35 paf 221: return source_charset.transcodeToCharset(src, dest_charset);
1.1 paf 222: case 0x01:
1.35 paf 223: return source_charset.transcodeToUTF8(src);
1.1 paf 224: case 0x10:
1.35 paf 225: return dest_charset.transcodeFromUTF8(src);
1.1 paf 226: case 0x11:
1.35 paf 227: return src;
1.1 paf 228: }
229: }
230:
231: // ---------------------------------------------------------------------------
232: // Local static data
233: //
234: // gUTFBytes
235: // A list of counts of trailing bytes for each initial byte in the input.
236: //
237: // gUTFOffsets
238: // A list of values to offset each result char type, according to how
239: // many source bytes went into making it.
240: //
241: // gFirstByteMark
242: // A list of values to mask onto the first byte of an encoded sequence,
243: // indexed by the number of bytes used to create the sequence.
244: // ---------------------------------------------------------------------------
// Indexed by the first byte of a sequence; gives the number of trailing bytes.
// 0x00-0xBF: 0, 0xC0-0xDF: 1, 0xE0-0xEF: 2, 0xF0-0xF7: 3, 0xF8-0xFB: 4, 0xFC-0xFF: 5
245: static const XMLByte gUTFBytes[0x100] = {
246: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
247: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
248: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
249: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
250: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
251: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
252: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
253: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
254: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
255: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
256: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
257: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
258: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
259: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
260: , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
261: , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
262: };
263:
// Indexed by the number of trailing bytes; the decoders subtract this value
// from the shifted sum of all bytes of a sequence to get the code point.
264: static const uint gUTFOffsets[6] = {
265: 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
266: };
267:
// Indexed by the total number of bytes in an encoded sequence; the encoders
// OR this value into the first byte of the sequence.
268: static const XMLByte gFirstByteMark[7] = {
269: 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
270: };
271:
1.35 paf 272: static int transcodeToUTF8(const XMLByte* srcData, size_t& srcLen,
273: XMLByte *toFill, size_t& toFillLen,
274: const Charset::Tables& tables) {
1.11 paf 275: const XMLByte* srcPtr=srcData;
276: const XMLByte* srcEnd=srcData+srcLen;
277: XMLByte* outPtr=toFill;
278: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 279:
1.35 paf 280: while(srcPtr<srcEnd) {
281: uint curVal = tables.fromTable[*srcPtr];
1.1 paf 282: if(!curVal) {
1.35 paf 283: // use the replacement character
284: *outPtr++= '?';
285: srcPtr++;
286: continue;
287: }
1.1 paf 288:
1.35 paf 289: // Figure out how many bytes we need
290: unsigned int encodedBytes;
291: if(curVal<0x80)
292: encodedBytes = 1;
293: else if(curVal<0x800)
294: encodedBytes = 2;
295: else if(curVal<0x10000)
296: encodedBytes = 3;
297: else if(curVal<0x200000)
298: encodedBytes = 4;
299: else if(curVal<0x4000000)
300: encodedBytes = 5;
301: else if(curVal<= 0x7FFFFFFF)
302: encodedBytes = 6;
303: else {
304: // use the replacement character
305: *outPtr++= '?';
306: srcPtr++;
307: continue;
308: }
1.11 paf 309:
1.35 paf 310: // If we cannot fully get this char into the output buffer
311: if (outPtr + encodedBytes > outEnd)
312: break;
313:
314: // We can do it, so update the source index
315: srcPtr++;
316:
317: // And spit out the bytes. We spit them out in reverse order
318: // here, so bump up the output pointer and work down as we go.
319: outPtr+= encodedBytes;
320: switch(encodedBytes) {
1.60 misha 321: case 6: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
322: curVal>>= 6;
323: case 5: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
324: curVal>>= 6;
325: case 4: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
326: curVal>>= 6;
327: case 3: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
328: curVal>>= 6;
329: case 2: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
330: curVal>>= 6;
331: case 1: *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]);
1.35 paf 332: }
333:
334: // Add the encoded bytes back in again to indicate we've eaten them
335: outPtr+= encodedBytes;
336: }
337:
338: // Update the bytes eaten
339: srcLen = srcPtr - srcData;
340:
341: // Return the characters read
342: toFillLen = outPtr - toFill;
343:
1.29 paf 344: //return srcPtr==srcEnd?(int)toFillLen:-1;
345: /*
346: xmlCharEncodingInputFunc
347: Returns :
348: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
349: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
350: of ocetes consumed.
351: */
352: return 0;
1.1 paf 353: }
1.26 paf 354: /// @todo digital entites only when xml/html output [at output in html/xml mode, in html part of a letter]
1.35 paf 355: static int transcodeFromUTF8(const XMLByte* srcData, size_t& srcLen,
356: XMLByte* toFill, size_t& toFillLen,
357: const Charset::Tables& tables) {
1.11 paf 358: const XMLByte* srcPtr=srcData;
359: const XMLByte* srcEnd=srcData+srcLen;
360: XMLByte* outPtr=toFill;
361: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 362:
1.35 paf 363: // We now loop until we either run out of input data, or room to store
364: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
365: // Get the next leading byte out
366: const XMLByte firstByte =* srcPtr;
367:
368: // Special-case ASCII, which is a leading byte value of<= 127
1.60 misha 369: if(firstByte<=127) {
1.35 paf 370: *outPtr++= firstByte;
371: srcPtr++;
372: continue;
373: }
374:
375: // See how many trailing src bytes this sequence is going to require
376: const unsigned int trailingBytes = gUTFBytes[firstByte];
377:
378: // If there are not enough source bytes to do this one, then we
379: // are done. Note that we done>= here because we are implicitly
380: // counting the 1 byte we get no matter what.
381: if(srcPtr+trailingBytes>= srcEnd)
382: break;
383:
384: // Looks ok, so lets build up the value
385: uint tmpVal=0;
386: switch(trailingBytes) {
387: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
388: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
389: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
390: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
391: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
392: case 0: tmpVal+=*srcPtr++;
393: break;
394:
395: default:
396: throw Exception(0,
397: 0,
1.49 paf 398: "transcodeFromUTF8 error: wrong trailingBytes value(%d)", trailingBytes); // never
1.35 paf 399: }
400: tmpVal-=gUTFOffsets[trailingBytes];
401:
402: // If it will fit into a single char, then put it in. Otherwise
403: // fail [*encode it as a surrogate pair. If its not valid, use the
404: // replacement char.*]
405: if(!(tmpVal & 0xFFFF0000)) {
1.25 paf 406: if(XMLByte xlat=xlatOneTo(tmpVal, tables, 0))
407: *outPtr++=xlat;
1.49 paf 408: else {
1.50 paf 409: outPtr+=sprintf((char *)outPtr, "&#%u;", tmpVal); // &#decimal;
1.49 paf 410: }
411: } else {
412: const XMLByte* recoverPtr=srcPtr-trailingBytes-1;
413: for(uint i=0; i<=trailingBytes; i++)
414: outPtr+=sprintf((char*)outPtr, "%%%02X", *recoverPtr++);
415: }
1.1 paf 416: }
1.35 paf 417:
418: // Update the bytes eaten
419: srcLen = srcPtr - srcData;
420:
421: // Return the characters read
422: toFillLen = outPtr - toFill;
1.11 paf 423:
1.29 paf 424: //return srcPtr==srcEnd?(int)toFillLen:-1;
425: /*
426: xmlCharEncodingOutputFunc
427: Returns :
428: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
429: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
430: of ocetes consumed.
431: */
432: return 0;
1.10 paf 433: }
434:
/// Tells whether a single-byte character has to be written as %XX.
/// Left as is: ASCII letters, digits and the characters *@-_+./
/// Escaped: everything else, including bytes above 127 and the NUL byte
/// (strchr also matches the terminator of its string, so NUL has to be
/// ruled out before the lookup).
static bool is_escaped(char c){
	const unsigned char code=(unsigned char)c;
	if(code==0 || code>127)
		return true;

	const bool alphanumeric=
		((c>='0') && (c<='9'))
		|| ((c>='A') && (c<='Z'))
		|| ((c>='a') && (c<='z'));

	return !(alphanumeric || strchr("*@-_+./", c)!=0);
}
445:
446: // read one utf8 character, return number of bytes needed for store it
1.61 misha 447: static unsigned int readChar(const XMLByte*& srcPtr, const XMLByte* srcEnd, XMLByte& firstByte, XMLCh& UTF8Char){
1.60 misha 448: if(!srcPtr || !*srcPtr || srcPtr>=srcEnd)
449: return 0;
450:
451: firstByte=*srcPtr;
452:
453: if(firstByte<=127){
454: UTF8Char=firstByte;
455: srcPtr++;
456: return 1;
457: }
458:
459: unsigned int trailingBytes=gUTFBytes[firstByte];
460:
461: if(srcPtr+trailingBytes>=srcEnd){
462: return 0; // not enough bytes in source string for reading
463: }
464:
465: uint tmpVal=0;
466: switch(trailingBytes){
467: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
468: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
469: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
470: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
471: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
472: case 0: tmpVal+=*srcPtr++;
473: }
474:
475: tmpVal-=gUTFOffsets[trailingBytes];
476: UTF8Char=tmpVal;
477:
478: return trailingBytes+1;
479: }
480:
1.62 ! misha 481: static unsigned int skipChar(const XMLByte*& srcPtr, const XMLByte* srcEnd){
! 482: if(!srcPtr || !*srcPtr || srcPtr>=srcEnd)
! 483: return 0;
! 484:
! 485: XMLByte firstByte=*srcPtr;
! 486:
! 487: if(firstByte<=127){
! 488: srcPtr++;
! 489: return 1;
! 490: }
! 491:
! 492: unsigned int trailingBytes=gUTFBytes[firstByte]+1;
! 493: srcPtr+=trailingBytes;
! 494: return trailingBytes;
1.61 misha 495: }
496:
1.60 misha 497: // read char, return number of bytes needed for store it as UTF8
1.61 misha 498: static unsigned int readChar(const XMLByte*& srcPtr, const XMLByte* srcEnd, XMLByte& firstByte, XMLCh& UTF8Char, const Charset::Tables& tables){
1.60 misha 499: if(!srcPtr || !*srcPtr || srcPtr>=srcEnd)
500: return 0;
501:
502: firstByte=*srcPtr++;
503: UTF8Char=tables.fromTable[firstByte];
504:
505: if(UTF8Char<0x80)
506: return 1;
507: else if(UTF8Char<0x800)
508: return 2;
509: else if(UTF8Char<0x10000)
510: return 3;
511: else if(UTF8Char<0x200000)
512: return 4;
513: else if(UTF8Char<0x4000000)
514: return 5;
515: else if(UTF8Char<= 0x7FFFFFFF)
516: return 6;
517:
518: // will use the replacement character '?'
519: firstByte=0;
520: return 1;
521: }
522:
523: static int escape(const XMLByte* srcData, size_t& srcLen,
524: XMLByte* toFill, size_t& toFillLen) {
525: const XMLByte* srcPtr=srcData;
526: const XMLByte* srcEnd=srcData+srcLen;
527: XMLByte* outPtr=toFill;
528: XMLByte* outEnd=toFill+toFillLen;
529: XMLByte firstByte;
530: XMLCh UTF8Char;
531: uint charSize;
532:
533: // loop until we either run out of input data, or room to store
534: while((outPtr < outEnd) && (charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char))){
535: if(charSize==1){
536: if(is_escaped(firstByte)) // %XX
537: outPtr+=sprintf((char*)outPtr, "%%%02X", firstByte);
538: else
539: *outPtr++=firstByte;
540: } else
541: outPtr+=sprintf((char*)outPtr, "%%u%04X", UTF8Char); // %uXXXX
542: }
543:
544: // Update the bytes eaten
545: srcLen=srcPtr-srcData;
546:
547: // Return the characters read
548: toFillLen=outPtr-toFill;
549:
550: return 0;
551: }
552:
553: static int escape(const XMLByte* srcData, size_t& srcLen,
554: XMLByte *toFill, size_t& toFillLen,
555: const Charset::Tables& tables) {
556: const XMLByte* srcPtr=srcData;
557: const XMLByte* srcEnd=srcData+srcLen;
558: XMLByte* outPtr=toFill;
559: //XMLByte* outEnd=toFill+toFillLen;
560: XMLByte firstByte;
561: XMLCh UTF8Char;
562: uint charSize;
563:
564: while(charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char, tables)){
565: if(charSize==1){
566: if(firstByte){
567: if(is_escaped(firstByte)) // %XX
568: outPtr+=sprintf((char*)outPtr, "%%%02X", firstByte);
569: else
570: *outPtr++=firstByte;
571: } else // add replacement char '?'
572: *outPtr++='?';
573: } else
574: outPtr+=sprintf((char*)outPtr, "%%u%04X", UTF8Char); // %uXXXX
575: }
576:
577: // Update the bytes eaten
578: srcLen = srcPtr - srcData;
579:
580: // Return the characters read
581: toFillLen = outPtr - toFill;
582:
583: return 0;
584: }
585:
586:
587: String::C Charset::escape(const String::C src, const Charset& source_charset){
588: size_t src_length=src.length;
589: if(!src_length)
590: return String::C("", 0);
591:
592: #ifdef PRECALCULATE_DEST_LENGTH
593: size_t dest_length=0;
594: const XMLByte* srcPtr=(XMLByte*)src.str;
595: const XMLByte* srcEnd=srcPtr+src_length;
596: XMLByte firstByte;
597: XMLCh UTF8Char;
598:
599: if(source_charset.isUTF8()){
600: while(uint charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char)){
601: if(charSize==1)
602: dest_length+=!is_escaped(firstByte)?1:3/*%XX*/;
603: else
604: dest_length+=6; // '%uXXXX'
605: }
606: } else {
607: while(uint charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char, source_charset.tables)){
608: if(charSize==1)
609: dest_length+=(!firstByte/*replacement char '?'*/ || !is_escaped(firstByte))?1:3/*'%XX'*/;
610: else
611: dest_length+=6; // '%uXXXX'
612: }
613: }
614: #else
615: size_t dest_length=src_length*6; // enough for %uXXXX but too memory-hungry
616: #endif
617:
618: //throw Exception(0,0,"%u",dest_length);
619:
620: #ifndef NDEBUG
621: size_t saved_dest_length=dest_length;
622: #endif
623: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
624:
625: int status;
626: if(source_charset.isUTF8()){
627: status=::escape((XMLByte *)src.str, src_length, dest_body, dest_length);
628: } else {
629: status=::escape((XMLByte *)src.str, src_length, dest_body, dest_length, source_charset.tables);
630: }
631:
632: if(status<0)
633: throw Exception(0,
634: 0,
635: "Charset::escapeString buffer overflow");
636:
637: assert(dest_length<=saved_dest_length);
638: dest_body[dest_length]=0; // terminator
639: return String::C((char*)dest_body, dest_length);
640: }
641:
642:
1.35 paf 643: const String::C Charset::transcodeToUTF8(const String::C src) const {
644: size_t src_length=src.length;
1.60 misha 645:
646: #ifdef PRECALCULATE_DEST_LENGTH
647: size_t dest_length=0;
648: const XMLByte* srcPtr=(XMLByte*)src.str;
649: const XMLByte* srcEnd=srcPtr+src_length;
650: XMLByte firstByte;
651: XMLCh UTF8Char;
652: while(uint charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char, tables))
653: dest_length+=charSize;
654: #else
655: size_t dest_length=src_length*6; // so that surly enough (max utf8 seq len=6) but too memory-hyngry
656: #endif
657:
658: //throw Exception(0,0,"%u",dest_length);
659:
1.35 paf 660: #ifndef NDEBUG
661: size_t saved_dest_length=dest_length;
662: #endif
663: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
1.11 paf 664:
665: if(::transcodeToUTF8(
1.35 paf 666: (XMLByte *)src.str, src_length,
667: dest_body, dest_length,
1.11 paf 668: tables)<0)
1.43 paf 669: throw Exception(0,
1.10 paf 670: 0,
1.11 paf 671: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 672:
1.60 misha 673: assert(dest_length<=saved_dest_length);
674: dest_body[dest_length]=0; // terminator
1.35 paf 675: return String::C((char*)dest_body, dest_length);
1.10 paf 676: }
1.38 paf 677:
678: static XMLCh change_case_UTF8(const XMLCh src, const Charset::UTF8CaseTable& table) {
1.39 paf 679: int lo = 0;
680: int hi = table.size - 1;
681: while(lo<=hi) {
1.38 paf 682: // Calc the mid point of the low and high offset.
1.39 paf 683: const unsigned int i = (lo + hi) / 2;
684:
685: XMLCh cur=table.records[i].from;
686: if(src==cur)
687: return table.records[i].to;
688: if(src>cur)
689: lo = i+1;
1.38 paf 690: else
1.39 paf 691: hi = i-1;
692: }
693:
694: // not found
1.38 paf 695: return src;
696: }
697:
1.58 misha 698: static void store_UTF8(XMLCh src, XMLByte*& outPtr){
1.38 paf 699: if(!src) {
700: // use the replacement character
701: *outPtr++= '?';
702: return;
703: }
704:
705: // Figure out how many bytes we need
706: unsigned int encodedBytes;
707: if(src<0x80)
708: encodedBytes = 1;
709: else if(src<0x800)
710: encodedBytes = 2;
711: else if(src<0x10000)
712: encodedBytes = 3;
713: else if(src<0x200000)
714: encodedBytes = 4;
715: else if(src<0x4000000)
716: encodedBytes = 5;
717: else if(src<= 0x7FFFFFFF)
718: encodedBytes = 6;
719: else {
720: // use the replacement character
721: *outPtr++= '?';
722: return;
723: }
724:
725: // And spit out the bytes. We spit them out in reverse order
726: // here, so bump up the output pointer and work down as we go.
727: outPtr+= encodedBytes;
728: switch(encodedBytes) {
729: case 6: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
730: src>>= 6;
731: case 5: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
732: src>>= 6;
733: case 4: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
734: src>>= 6;
735: case 3: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
736: src>>= 6;
737: case 2: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
738: src>>= 6;
739: case 1: *--outPtr = XMLByte(src | gFirstByteMark[encodedBytes]);
740: }
741:
742: // Add the encoded bytes back in again to indicate we've eaten them
743: outPtr+= encodedBytes;
744: }
745:
746: static void change_case_UTF8(XMLCh src, XMLByte*& outPtr,
747: const Charset::UTF8CaseTable& table) {
748: store_UTF8(change_case_UTF8(src, table), outPtr);
749: };
1.44 paf 750: void change_case_UTF8(const XMLByte* srcData, size_t srcLen,
751: XMLByte* toFill, size_t toFillLen,
752: const Charset::UTF8CaseTable& table) {
1.38 paf 753: const XMLByte* srcPtr=srcData;
1.44 paf 754: const XMLByte* srcEnd=srcData+srcLen;
1.38 paf 755: XMLByte* outPtr=toFill;
1.44 paf 756: XMLByte* outEnd=toFill+toFillLen;
757:
758: // We now loop until we either run out of input data, or room to store
759: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
760: // Get the next leading byte out
761: const XMLByte firstByte =* srcPtr;
1.38 paf 762:
1.60 misha 763: if(firstByte<=127) {
1.38 paf 764: change_case_UTF8(firstByte, outPtr, table);
765: srcPtr++;
766: continue;
767: }
768:
769: // See how many trailing src bytes this sequence is going to require
770: const unsigned int trailingBytes = gUTFBytes[firstByte];
771:
772: // Looks ok, so lets build up the value
773: uint tmpVal=0;
774: switch(trailingBytes) {
775: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
776: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
777: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
778: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
779: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
780: case 0: tmpVal+=*srcPtr++;
781: break;
782:
783: default:
784: throw Exception(0,
785: 0,
786: "change_case_UTF8 error: wrong trailingBytes value(%d)", trailingBytes);
787: }
788: tmpVal-=gUTFOffsets[trailingBytes];
789:
790: // If it will fit into a single char, then put it in. Otherwise
791: // fail [*encode it as a surrogate pair. If its not valid, use the
792: // replacement char.*]
793: if(!(tmpVal & 0xFFFF0000))
794: change_case_UTF8(tmpVal, outPtr, table);
795: else
796: throw Exception(0,
797: 0,
798: "change_case_UTF8 error: too big tmpVal(0x%08X)", tmpVal);
799: }
800:
801: if(srcPtr!=outPtr)
802: throw Exception(0,
803: 0,
804: "change_case_UTF8 error: end pointers do not match");
805: }
806:
1.60 misha 807: static size_t getDecNumLength(XMLCh UTF8Char){
808: return
809: (UTF8Char < 100)
810: ?2
811: :(UTF8Char < 1000)
812: ?3
813: :(UTF8Char < 10000)
814: ?4
815: :5;
816: }
1.38 paf 817:
1.35 paf 818: const String::C Charset::transcodeFromUTF8(const String::C src) const {
819: size_t src_length=src.length;
1.60 misha 820:
821: #ifdef PRECALCULATE_DEST_LENGTH
822: size_t dest_length=0;
823: const XMLByte* srcPtr=(XMLByte*)src.str;
824: const XMLByte* srcEnd=srcPtr+src_length;
825: XMLByte firstByte;
826: XMLCh UTF8Char;
827: while(uint charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char)){
828: if(charSize==1)
829: dest_length++;
830: else
831: dest_length+=(UTF8Char & 0xFFFF0000)
832: ?charSize*3 // '%XX' for each byte
833: :(xlatOneTo(UTF8Char, tables, 0)!=0)
834: ?1 // can convert it to single char
835: :getDecNumLength(UTF8Char)+3; // &#XX; - &#XXXXX;
836: }
837: #else
838: // so that surly enough, "&#XXX;" has max ratio (huh? 8 bytes needed for '&#XXXXX;')
839: size_t dest_length=src_length*6;
840: #endif
841:
842: //throw Exception(0,0,"%u",dest_length);
843:
1.35 paf 844: #ifndef NDEBUG
845: size_t saved_dest_length=dest_length;
846: #endif
847: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
1.11 paf 848:
849: if(::transcodeFromUTF8(
1.35 paf 850: (XMLByte *)src.str, src_length,
851: dest_body, dest_length,
1.11 paf 852: tables)<0)
1.43 paf 853: throw Exception(0,
1.10 paf 854: 0,
1.35 paf 855: "Charset::transcodeFromUTF8 buffer overflow");
1.10 paf 856:
1.60 misha 857: assert(dest_length<=saved_dest_length);
858: dest_body[dest_length]=0; // terminator
1.35 paf 859: return String::C((char*)dest_body, dest_length);
1.1 paf 860: }
861:
862: /// transcode using both charsets
1.35 paf 863: const String::C Charset::transcodeToCharset(const String::C src,
864: const Charset& dest_charset) const {
865: if(&dest_charset==this)
866: return src;
867: else {
868: size_t dest_length=src.length;
869: XMLByte* dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
870:
871: XMLByte* output=dest_body;
872: const XMLByte* input=(XMLByte *)src.str;
873: while(XMLCh c=*input++) {
874: XMLCh curVal = tables.fromTable[c];
875: *output++=curVal?
876: xlatOneTo(curVal, dest_charset.tables, '?') // OK
877: :'?'; // use the replacement character
1.6 paf 878: }
1.1 paf 879:
1.35 paf 880: dest_body[dest_length]=0; // terminator
881: return String::C((char*)dest_body, dest_length);
1.6 paf 882: }
1.1 paf 883: }
884:
1.58 misha 885: void Charset::store_Char(XMLByte*& outPtr, XMLCh src, XMLByte not_found){
1.59 misha 886: if(isUTF8())
1.58 misha 887: store_UTF8(src, outPtr);
1.59 misha 888: else if(char ch=xlatOneTo(src, tables, not_found))
1.58 misha 889: *outPtr++=ch;
1.57 misha 890: }
891:
1.1 paf 892: #ifdef XML
1.10 paf 893:
// Per-slot charset tables read by the generated libxml callbacks:
// callback pair number i uses tables[i]; slots are filled by Charset::addEncoding.
1.35 paf 894: static const Charset::Tables* tables[MAX_CHARSETS];
895:
// The patched libxml passes one extra (unused) pointer to the callbacks
#ifdef PA_PATCHED_LIBXML_BACKWARD
#define PA_XML256_EXTRA_PARAM , void*
#else
#define PA_XML256_EXTRA_PARAM
#endif

// Declares the libxml input/output callback pair number i; both work with
// the charset tables stored in tables[i].
// libxml hands the lengths over as int*, the transcoders take size_t&:
// the values are copied through size_t locals and written back, an int
// must not be accessed through a size_t* (different size on LP64).
#define declareXml256ioFuncs(i) \
	static int xml256CharEncodingInputFunc##i( \
		unsigned char *out, int *outlen, \
		const unsigned char *in, int *inlen PA_XML256_EXTRA_PARAM) { \
		size_t in_size=(size_t)*inlen; \
		size_t out_size=(size_t)*outlen; \
		int result=transcodeToUTF8( \
			in, in_size, \
			out, out_size, \
			*tables[i]); \
		*inlen=(int)in_size; \
		*outlen=(int)out_size; \
		return result; \
	} \
	static int xml256CharEncodingOutputFunc##i( \
		unsigned char *out, int *outlen, \
		const unsigned char *in, int *inlen PA_XML256_EXTRA_PARAM) { \
		size_t in_size=(size_t)*inlen; \
		size_t out_size=(size_t)*outlen; \
		int result=transcodeFromUTF8( \
			in, in_size, \
			out, out_size, \
			*tables[i]); \
		*inlen=(int)in_size; \
		*outlen=(int)out_size; \
		return result; \
	}
937:
938:
// Callback pairs for slots 0..9
1.35 paf 939: declareXml256ioFuncs(0) declareXml256ioFuncs(1)
940: declareXml256ioFuncs(2) declareXml256ioFuncs(3)
941: declareXml256ioFuncs(4) declareXml256ioFuncs(5)
942: declareXml256ioFuncs(6) declareXml256ioFuncs(7)
943: declareXml256ioFuncs(8) declareXml256ioFuncs(9)
944:
// Input callbacks by slot number.
// NOTE(review): ten initializers for MAX_CHARSETS elements - presumably MAX_CHARSETS is 10; confirm
945: static xmlCharEncodingInputFunc inputFuncs[MAX_CHARSETS]={
946: xml256CharEncodingInputFunc0, xml256CharEncodingInputFunc1,
947: xml256CharEncodingInputFunc2, xml256CharEncodingInputFunc3,
948: xml256CharEncodingInputFunc4, xml256CharEncodingInputFunc5,
949: xml256CharEncodingInputFunc6, xml256CharEncodingInputFunc7,
950: xml256CharEncodingInputFunc8, xml256CharEncodingInputFunc9
951: };
// Output callbacks by slot number
952: static xmlCharEncodingOutputFunc outputFuncs[MAX_CHARSETS]={
953: xml256CharEncodingOutputFunc0, xml256CharEncodingOutputFunc1,
954: xml256CharEncodingOutputFunc2, xml256CharEncodingOutputFunc3,
955: xml256CharEncodingOutputFunc4, xml256CharEncodingOutputFunc5,
956: xml256CharEncodingOutputFunc6, xml256CharEncodingOutputFunc7,
957: xml256CharEncodingOutputFunc8, xml256CharEncodingOutputFunc9
958: };
// Number of slots taken so far; incremented by Charset::addEncoding
959: static size_t handlers_count=0;
1.10 paf 960:
961: void Charset::addEncoding(char *name_cstr) {
1.35 paf 962: if(handlers_count==MAX_CHARSETS)
963: throw Exception(0,
964: 0,
965: "already allocated %d handlers, no space for new encoding '%s'",
966: MAX_CHARSETS, name_cstr);
967:
1.45 paf 968: xmlCharEncodingHandler* handler=new(UseGC) xmlCharEncodingHandler;
1.35 paf 969: {
970: handler->name=name_cstr;
971: handler->input=inputFuncs[handlers_count];
972: handler->output=outputFuncs[handlers_count];
973: ::tables[handlers_count]=&tables;
974: handlers_count++;
975: }
1.10 paf 976:
977: xmlRegisterCharEncodingHandler(handler);
1.35 paf 978:
1.10 paf 979: }
980:
1.37 paf 981: void Charset::initTranscoder(const String::Body NAME, const char* name_cstr) {
1.15 paf 982: ftranscoder=xmlFindCharEncodingHandler(name_cstr);
1.35 paf 983: transcoder(NAME); // check right way
1.15 paf 984: }
985:
1.37 paf 986: xmlCharEncodingHandler& Charset::transcoder(const String::Body NAME) {
1.15 paf 987: if(!ftranscoder)
1.56 misha 988: throw Exception(PARSER_RUNTIME,
1.35 paf 989: new String(NAME, String::L_TAINTED),
1.10 paf 990: "unsupported encoding");
1.35 paf 991: return *ftranscoder;
1.10 paf 992: }
993:
1.54 paf 994: String::C Charset::transcode_cstr(const xmlChar* s) {
1.13 paf 995: if(!s)
1.35 paf 996: return String::C("", 0);
1.8 paf 997:
1.35 paf 998: int inlen=strlen((const char*)s);
1.51 paf 999: int outlen=inlen*6/*strlen("ÿ")*/; // max
1.35 paf 1000: #ifndef NDEBUG
1001: int saved_outlen=outlen;
1002: #endif
1003: char *out=new(PointerFreeGC) char[outlen+1];
1.8 paf 1004:
1.30 paf 1005: int error;
1.35 paf 1006: if(xmlCharEncodingOutputFunc output=transcoder(FNAME).output) {
1.30 paf 1007: error=output(
1.17 paf 1008: (unsigned char*)out, &outlen,
1.46 paf 1009: (const unsigned char*)s, &inlen
1010: #ifdef PA_PATCHED_LIBXML_BACKWARD
1011: ,0
1012: #endif
1013: );
1.30 paf 1014: } else {
1015: memcpy(out, s, outlen=inlen);
1016: error=0;
1017: }
1018: if(error<0)
1.23 paf 1019: throw Exception(0,
1.8 paf 1020: 0,
1.30 paf 1021: "transcode_cstr failed (%d)", error);
1.8 paf 1022:
1.35 paf 1023: assert(outlen<=saved_outlen); out[outlen]=0;
1024: return String::C(out, outlen);
1.14 paf 1025: }
1.54 paf 1026: const String& Charset::transcode(const xmlChar* s) {
1.35 paf 1027: String::C cstr=transcode_cstr(s);
1028: return *new String(cstr.str, cstr.length, true);
1.1 paf 1029: }
1030:
1.8 paf 1031: /// @test less memory using -maybe- xmlParserInputBufferCreateMem
1.35 paf 1032: xmlChar* Charset::transcode_buf2xchar(const char* buf, size_t buf_size) {
1033: xmlChar* out;
1.30 paf 1034: int outlen;
1035: int error;
1.35 paf 1036: #ifndef NDEBUG
1037: int saved_outlen;
1038: #endif
1039: if(xmlCharEncodingInputFunc input=transcoder(FNAME).input) {
1.51 paf 1040: outlen=buf_size*6/*max UTF8 bytes per char*/;
1.35 paf 1041: #ifndef NDEBUG
1042: saved_outlen=outlen;
1043: #endif
1.47 paf 1044: out=(xmlChar*)xmlMalloc(outlen+1);
1.30 paf 1045: error=input(
1.17 paf 1046: out, &outlen,
1.46 paf 1047: (const unsigned char*)buf, (int*)&buf_size
1048: #ifdef PA_PATCHED_LIBXML_BACKWARD
1049: ,0
1050: #endif
1051: );
1.30 paf 1052: } else {
1053: outlen=buf_size;
1.35 paf 1054: #ifndef NDEBUG
1055: saved_outlen=outlen;
1056: #endif
1057: out=(xmlChar*)xmlMalloc(outlen+1);
1.30 paf 1058: memcpy(out, buf, outlen);
1059: error=0;
1060: }
1.17 paf 1061:
1.30 paf 1062: if(error<0)
1.23 paf 1063: throw Exception(0,
1.8 paf 1064: 0,
1.30 paf 1065: "transcode_buf failed (%d)", error);
1.8 paf 1066:
1.35 paf 1067: assert(outlen<=saved_outlen); out[outlen]=0;
1068: return out;
1.24 paf 1069: }
1.54 paf 1070: xmlChar* Charset::transcode(const String& s) {
1.35 paf 1071: const char* cstr=s.cstr(String::L_UNSPECIFIED);
1.1 paf 1072:
1.54 paf 1073: return transcode_buf2xchar(cstr, strlen(cstr));
1.1 paf 1074: }
1.54 paf 1075: xmlChar* Charset::transcode(const String::Body s) {
1.35 paf 1076: const char* cstr=s.cstr();
1077:
1.54 paf 1078: return transcode_buf2xchar(cstr, s.length());
1.35 paf 1079: }
1.36 paf 1080: #endif
1.34 paf 1081:
1.37 paf 1082: String::Body Charset::transcode(const String::Body src,
1.34 paf 1083: const Charset& source_transcoder,
1.35 paf 1084: const Charset& dest_transcoder) {
1.34 paf 1085:
1.35 paf 1086: const char *src_ptr=src.cstr();
1.34 paf 1087: size_t src_size=strlen(src_ptr);
1088:
1.35 paf 1089: String::C dest=Charset::transcode(String::C(src_ptr, src_size),
1090: source_transcoder,
1091: dest_transcoder);
1.34 paf 1092:
1.37 paf 1093: return String::Body(dest.str, dest.length);
1.35 paf 1094: }
1095:
1096: String& Charset::transcode(const String& src,
1097: const Charset& source_transcoder,
1098: const Charset& dest_transcoder) {
1099: if(!src.length())
1100: return *new String("", 0, false);
1.34 paf 1101:
1.37 paf 1102: return *new String(transcode((String::Body)src, source_transcoder, dest_transcoder), String::L_CLEAN);
1.34 paf 1103: }
1104:
1.35 paf 1105: void Charset::transcode(ArrayString& src,
1.34 paf 1106: const Charset& source_transcoder,
1.35 paf 1107: const Charset& dest_transcoder) {
1108: for(size_t i=0; i<src.count(); i++)
1109: src.put(i, &transcode(*src[i], source_transcoder, dest_transcoder));
1.34 paf 1110: }
1111:
1112: #ifndef DOXYGEN
1113: struct Transcode_pair_info {
1114: const Charset* source_transcoder;
1115: const Charset* dest_transcoder;
1116: };
1117: #endif
1.40 paf 1118: static void transcode_pair(const String::Body /*akey*/,
1.37 paf 1119: String::Body& avalue,
1.35 paf 1120: Transcode_pair_info* info) {
1121: avalue=Charset::transcode(avalue,
1122: *info->source_transcoder,
1123: *info->dest_transcoder);
1.34 paf 1124: }
1.61 misha 1125:
1.35 paf 1126: void Charset::transcode(HashStringString& src,
1.34 paf 1127: const Charset& source_transcoder,
1.35 paf 1128: const Charset& dest_transcoder) {
1129: Transcode_pair_info info={&source_transcoder, &dest_transcoder};
1.55 paf 1130: src.for_each_ref<Transcode_pair_info*>(transcode_pair, &info);
1.34 paf 1131: }
1.61 misha 1132:
1133: size_t getUTF8BytePos(const XMLByte* srcBegin, const XMLByte* srcEnd, size_t charPos){
1134: const XMLByte* ptr=srcBegin;
1.62 ! misha 1135: while(charPos-- && skipChar(ptr, srcEnd));
1.61 misha 1136:
1137: return ptr-srcBegin;
1138: }
1139:
1140: size_t getUTF8CharPos(const XMLByte* srcBegin, const XMLByte* srcEnd, size_t bytePos){
1141: size_t charPos=0;
1142: const XMLByte* ptr=srcBegin;
1143: const XMLByte* ptrEnd=srcBegin+bytePos;
1.62 ! misha 1144: while(skipChar(ptr, srcEnd)){
1.61 misha 1145: if(ptr>ptrEnd)
1146: return charPos;
1147: charPos++;
1148: }
1149:
1150: // scan till end but position in bytes still too low
1151: throw Exception(0,
1152: 0,
1153: "Error convertion byte pos to char pos");
1154: }
1155:
1156: size_t lengthUTF8(const XMLByte* srcBegin, const XMLByte* srcEnd){
1157: size_t size=0;
1.62 ! misha 1158: while(skipChar(srcBegin, srcEnd))
1.61 misha 1159: size++;
1160:
1161: return size;
1162: }
E-mail: