Annotation of parser3/src/main/pa_charset.C, revision 1.64
1.1 paf 1: /** @file
2: Parser: Charset connection implementation.
3:
1.52 paf 4: Copyright(c) 2001-2005 ArtLebedev Group (http://www.artlebedev.com)
1.4 paf 5: Author: Alexander Petrosyan<paf@design.ru>(http://paf.design.ru)
1.27 paf 6: */
1.1 paf 7:
1.64 ! misha 8: static const char * const IDENT_CHARSET_C="$Date: 2008-07-17 09:10:49 $";
1.1 paf 9:
10: #include "pa_charset.h"
1.35 paf 11: #include "pa_charsets.h"
1.1 paf 12:
13: #ifdef XML
1.8 paf 14: #include "libxml/encoding.h"
1.1 paf 15: #endif
16:
1.46 paf 17: //#define PA_PATCHED_LIBXML_BACKWARD
1.60 misha 18: #define PRECALCULATE_DEST_LENGTH
1.46 paf 19:
1.38 paf 20: // globals
21:
// Upper-casing data: the records are pulled in verbatim from utf8-to-upper.inc.
// NOTE(review): change_case_UTF8() in this file binary-searches records by `from`,
// so the .inc data is presumably sorted ascending by that field -- confirm.
22: Charset::UTF8CaseTable::Rec UTF8CaseToUpperRecords[]={
23: #include "utf8-to-upper.inc"
24: };
// Table descriptor: record count (derived from the array size) followed by the records.
25: Charset::UTF8CaseTable UTF8CaseToUpper={
26: sizeof(UTF8CaseToUpperRecords)/sizeof(Charset::UTF8CaseTable::Rec),
27: UTF8CaseToUpperRecords};
28:
// Lower-casing data: same layout, records come from utf8-to-lower.inc.
29: Charset::UTF8CaseTable::Rec UTF8CaseToLowerRecords[]={
30: #include "utf8-to-lower.inc"
31: };
// Table descriptor: record count (derived from the array size) followed by the records.
32: Charset::UTF8CaseTable UTF8CaseToLower={
33: sizeof(UTF8CaseToLowerRecords)/sizeof(Charset::UTF8CaseTable::Rec),
34: UTF8CaseToLowerRecords};
35:
1.1 paf 36: // helpers
37:
38: inline void prepare_case_tables(unsigned char *tables) {
39: unsigned char *lcc_table=tables+lcc_offset;
40: unsigned char *fcc_table=tables+fcc_offset;
41: for(int i=0; i<0x100; i++)
1.53 paf 42: lcc_table[i]=fcc_table[i]=(unsigned char)i;
1.1 paf 43: }
44: inline void cstr2ctypes(unsigned char *tables, const unsigned char *cstr,
45: unsigned char bit) {
46: unsigned char *ctypes_table=tables+ctypes_offset;
47: ctypes_table[0]=bit;
48: for(; *cstr; cstr++) {
49: unsigned char c=*cstr;
50: ctypes_table[c]|=bit;
51: }
52: }
/// Converts a table cell to a character code.
/// - null or empty cell: 0
/// - exactly one character: the byte value of that character (so "5" yields 53)
/// - anything longer: parsed by strtol with base auto-detection (decimal, 0x hex, 0 octal)
inline unsigned int to_wchar_code(const char* cstr) {
	if(cstr==0 || cstr[0]=='\0')
		return 0;

	const bool single_char=(cstr[1]=='\0');
	if(single_char)
		return static_cast<unsigned int>(static_cast<unsigned char>(cstr[0]));

	// the end-of-number position is not needed, so no endptr is requested
	return static_cast<unsigned int>(strtol(cstr, 0, 0));
}
/// A cell counts as "true" when it is present and non-empty.
inline bool to_bool(const char* cstr) {
	if(!cstr)
		return false;
	return cstr[0]!='\0';
}
65: static void element2ctypes(unsigned char c, bool belongs,
66: unsigned char *tables, unsigned char bit, int group_offset=-1) {
67: if(!belongs)
68: return;
69:
70: unsigned char *ctypes_table=tables+ctypes_offset;
71:
72: ctypes_table[c]|=bit;
73: if(group_offset>=0)
1.4 paf 74: tables[cbits_offset+group_offset+c/8] |= 1<<(c%8);
1.1 paf 75: }
76: static void element2case(unsigned char from, unsigned char to,
77: unsigned char *tables) {
78: if(!to)
79: return;
80:
81: unsigned char *lcc_table=tables+lcc_offset;
82: unsigned char *fcc_table=tables+fcc_offset;
83: lcc_table[from]=to;
84: fcc_table[from]=to; fcc_table[to]=from;
85: }
86:
87: // methods
88:
89: extern "C" unsigned char pcre_default_tables[]; // pcre/chartables.c
1.37 paf 90: Charset::Charset(Request_charsets* charsets, const String::Body ANAME, const String* afile_spec):
1.35 paf 91: FNAME(ANAME),
92: FNAME_CSTR(ANAME.cstrm()) {
1.7 paf 93:
1.35 paf 94: if(afile_spec) {
1.1 paf 95: fisUTF8=false;
1.35 paf 96: load_definition(*charsets, *afile_spec);
1.1 paf 97: #ifdef XML
1.35 paf 98: addEncoding(FNAME_CSTR);
1.1 paf 99: #endif
100: } else {
101: fisUTF8=true;
1.4 paf 102: // grab default onces [for UTF-8 so to be able to make a-z =>A-Z
1.1 paf 103: memcpy(pcre_tables, pcre_default_tables, sizeof(pcre_tables));
104: }
105:
106: #ifdef XML
1.35 paf 107: initTranscoder(FNAME, FNAME_CSTR);
1.1 paf 108: #endif
109: }
110:
1.35 paf 111: void Charset::load_definition(Request_charsets& charsets, const String& afile_spec) {
1.1 paf 112: // pcre_tables
113: // lowcase, flipcase, bits digit+word+whitespace, masks
114:
115: // must not move this inside of prepare_case_tables
116: // don't know the size there
117: memset(pcre_tables, 0, sizeof(pcre_tables));
118: prepare_case_tables(pcre_tables);
1.4 paf 119: cstr2ctypes(pcre_tables,(const unsigned char *)"*+?{^.$|()[", ctype_meta);
1.1 paf 120:
121: // charset
1.35 paf 122: memset(&tables, 0, sizeof(tables));
1.1 paf 123:
124: // loading text
1.35 paf 125: char *data=file_read_text(charsets, afile_spec);
1.1 paf 126:
127: // ignore header
128: getrow(&data);
129:
130: // parse cells
131: char *row;
1.42 paf 132: while((row=getrow(&data))) {
1.1 paf 133: // remove empty&comment lines
134: if(!*row || *row=='#')
135: continue;
136:
137: // char white-space digit hex-digit letter word lowercase unicode1 unicode2
1.53 paf 138: unsigned char c=0;
1.1 paf 139: char *cell;
1.42 paf 140: for(int column=0; (cell=lsplit(&row, '\t')); column++) {
1.1 paf 141: switch(column) {
1.53 paf 142: case 0: c=(unsigned char)to_wchar_code(cell); break;
1.1 paf 143: // pcre_tables
144: case 1: element2ctypes(c, to_bool(cell), pcre_tables, ctype_space, cbit_space); break;
145: case 2: element2ctypes(c, to_bool(cell), pcre_tables, ctype_digit, cbit_digit); break;
146: case 3: element2ctypes(c, to_bool(cell), pcre_tables, ctype_xdigit); break;
147: case 4: element2ctypes(c, to_bool(cell), pcre_tables, ctype_letter); break;
148: case 5: element2ctypes(c, to_bool(cell), pcre_tables, ctype_word, cbit_word); break;
1.53 paf 149: case 6: element2case(c, (unsigned char)to_wchar_code(cell), pcre_tables); break;
1.1 paf 150: case 7:
151: case 8:
152: // charset
1.10 paf 153: if(tables.toTableSize>MAX_CHARSET_UNI_CODES)
1.56 misha 154: throw Exception(PARSER_RUNTIME,
1.35 paf 155: &afile_spec,
1.1 paf 156: "charset must contain not more then %d unicode values", MAX_CHARSET_UNI_CODES);
157:
158: XMLCh unicode=(XMLCh)to_wchar_code(cell);
159: if(!unicode && column==7/*unicode1 column*/)
160: unicode=(XMLCh)c;
161: if(unicode) {
1.10 paf 162: if(!tables.fromTable[c])
163: tables.fromTable[c]=unicode;
164: tables.toTable[tables.toTableSize].intCh=unicode;
165: tables.toTable[tables.toTableSize].extCh=(XMLByte)c;
166: tables.toTableSize++;
1.1 paf 167: }
168: break;
169: }
170: }
171: };
172:
173: // sort by the Unicode code point
174: sort_ToTable();
175: }
176:
177: static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
178: return
1.38 paf 179: static_cast<const Charset::Tables::Rec *>(a)->intCh-
180: static_cast<const Charset::Tables::Rec *>(b)->intCh;
1.1 paf 181: }
182:
183: void Charset::sort_ToTable() {
1.10 paf 184: _qsort(tables.toTable, tables.toTableSize, sizeof(*tables.toTable),
1.1 paf 185: sort_cmp_Trans_rec_intCh);
186: //FILE *f=fopen("c:\\temp\\a", "wb");
1.10 paf 187: //fwrite(tables.toTable, tables.toTableSize, sizeof(*tables.toTable), f);
1.1 paf 188: //fclose(f);
189: }
190:
1.60 misha 191: // @todo: precache for spedup searching
1.10 paf 192: static XMLByte xlatOneTo(const XMLCh toXlat,
1.35 paf 193: const Charset::Tables& tables,
194: XMLByte not_found) {
1.39 paf 195: int lo = 0;
196: int hi = tables.toTableSize - 1;
197: while(lo<=hi) {
1.35 paf 198: // Calc the mid point of the low and high offset.
1.39 paf 199: const unsigned int i = (lo + hi) / 2;
200:
201: XMLCh cur=tables.toTable[i].intCh;
202: if(toXlat==cur)
203: return tables.toTable[i].extCh;
204: if(toXlat>cur)
205: lo = i+1;
1.1 paf 206: else
1.39 paf 207: hi = i-1;
208: }
1.35 paf 209:
210: return not_found;
1.1 paf 211: }
212:
1.35 paf 213: String::C Charset::transcode(const String::C src,
214: const Charset& source_charset,
215: const Charset& dest_charset) {
216: if(!src.length)
217: return String::C("", 0);
1.4 paf 218:
1.1 paf 219: switch((source_charset.isUTF8()?0x10:0x00)|(dest_charset.isUTF8()?0x01:0x00)) {
220: default: // 0x00
1.35 paf 221: return source_charset.transcodeToCharset(src, dest_charset);
1.1 paf 222: case 0x01:
1.35 paf 223: return source_charset.transcodeToUTF8(src);
1.1 paf 224: case 0x10:
1.35 paf 225: return dest_charset.transcodeFromUTF8(src);
1.1 paf 226: case 0x11:
1.35 paf 227: return src;
1.1 paf 228: }
229: }
230:
231: // ---------------------------------------------------------------------------
232: // Local static data
233: //
234: // gUTFBytes
235: // A list of counts of trailing bytes for each initial byte in the input.
236: //
237: // gUTFOffsets
238: // A list of values to offset each result char type, according to how
239: // many source bytes went into making it.
240: //
241: // gFirstByteMark
242: // A list of values to mask onto the first byte of an encoded sequence,
243: // indexed by the number of bytes used to create the sequence.
244: // ---------------------------------------------------------------------------
245: static const XMLByte gUTFBytes[0x100] = {
246: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
247: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
248: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
249: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
250: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
251: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
252: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
253: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
254: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
255: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
256: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
257: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
258: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
259: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
260: , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
261: , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
262: };
263:
264: static const uint gUTFOffsets[6] = {
265: 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
266: };
267:
268: static const XMLByte gFirstByteMark[7] = {
269: 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
270: };
271:
1.35 paf 272: static int transcodeToUTF8(const XMLByte* srcData, size_t& srcLen,
273: XMLByte *toFill, size_t& toFillLen,
274: const Charset::Tables& tables) {
1.11 paf 275: const XMLByte* srcPtr=srcData;
276: const XMLByte* srcEnd=srcData+srcLen;
277: XMLByte* outPtr=toFill;
278: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 279:
1.35 paf 280: while(srcPtr<srcEnd) {
281: uint curVal = tables.fromTable[*srcPtr];
1.1 paf 282: if(!curVal) {
1.35 paf 283: // use the replacement character
284: *outPtr++= '?';
285: srcPtr++;
286: continue;
287: }
1.1 paf 288:
1.35 paf 289: // Figure out how many bytes we need
290: unsigned int encodedBytes;
291: if(curVal<0x80)
292: encodedBytes = 1;
293: else if(curVal<0x800)
294: encodedBytes = 2;
295: else if(curVal<0x10000)
296: encodedBytes = 3;
297: else if(curVal<0x200000)
298: encodedBytes = 4;
299: else if(curVal<0x4000000)
300: encodedBytes = 5;
301: else if(curVal<= 0x7FFFFFFF)
302: encodedBytes = 6;
303: else {
304: // use the replacement character
305: *outPtr++= '?';
306: srcPtr++;
307: continue;
308: }
1.11 paf 309:
1.35 paf 310: // If we cannot fully get this char into the output buffer
311: if (outPtr + encodedBytes > outEnd)
312: break;
313:
314: // We can do it, so update the source index
315: srcPtr++;
316:
317: // And spit out the bytes. We spit them out in reverse order
318: // here, so bump up the output pointer and work down as we go.
319: outPtr+= encodedBytes;
320: switch(encodedBytes) {
1.60 misha 321: case 6: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
322: curVal>>= 6;
323: case 5: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
324: curVal>>= 6;
325: case 4: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
326: curVal>>= 6;
327: case 3: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
328: curVal>>= 6;
329: case 2: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
330: curVal>>= 6;
331: case 1: *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]);
1.35 paf 332: }
333:
334: // Add the encoded bytes back in again to indicate we've eaten them
335: outPtr+= encodedBytes;
336: }
337:
338: // Update the bytes eaten
339: srcLen = srcPtr - srcData;
340:
341: // Return the characters read
342: toFillLen = outPtr - toFill;
343:
1.29 paf 344: //return srcPtr==srcEnd?(int)toFillLen:-1;
345: /*
346: xmlCharEncodingInputFunc
347: Returns :
348: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
349: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
350: of ocetes consumed.
351: */
352: return 0;
1.1 paf 353: }
1.26 paf 354: /// @todo digital entites only when xml/html output [at output in html/xml mode, in html part of a letter]
1.35 paf 355: static int transcodeFromUTF8(const XMLByte* srcData, size_t& srcLen,
356: XMLByte* toFill, size_t& toFillLen,
357: const Charset::Tables& tables) {
1.11 paf 358: const XMLByte* srcPtr=srcData;
359: const XMLByte* srcEnd=srcData+srcLen;
360: XMLByte* outPtr=toFill;
361: XMLByte* outEnd=toFill+toFillLen;
1.1 paf 362:
1.35 paf 363: // We now loop until we either run out of input data, or room to store
364: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
365: // Get the next leading byte out
366: const XMLByte firstByte =* srcPtr;
367:
368: // Special-case ASCII, which is a leading byte value of<= 127
1.60 misha 369: if(firstByte<=127) {
1.35 paf 370: *outPtr++= firstByte;
371: srcPtr++;
372: continue;
373: }
374:
375: // See how many trailing src bytes this sequence is going to require
376: const unsigned int trailingBytes = gUTFBytes[firstByte];
377:
378: // If there are not enough source bytes to do this one, then we
379: // are done. Note that we done>= here because we are implicitly
380: // counting the 1 byte we get no matter what.
381: if(srcPtr+trailingBytes>= srcEnd)
382: break;
383:
384: // Looks ok, so lets build up the value
385: uint tmpVal=0;
386: switch(trailingBytes) {
387: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
388: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
389: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
390: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
391: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
392: case 0: tmpVal+=*srcPtr++;
393: break;
394:
395: default:
396: throw Exception(0,
397: 0,
1.49 paf 398: "transcodeFromUTF8 error: wrong trailingBytes value(%d)", trailingBytes); // never
1.35 paf 399: }
400: tmpVal-=gUTFOffsets[trailingBytes];
401:
402: // If it will fit into a single char, then put it in. Otherwise
403: // fail [*encode it as a surrogate pair. If its not valid, use the
404: // replacement char.*]
405: if(!(tmpVal & 0xFFFF0000)) {
1.25 paf 406: if(XMLByte xlat=xlatOneTo(tmpVal, tables, 0))
407: *outPtr++=xlat;
1.49 paf 408: else {
1.50 paf 409: outPtr+=sprintf((char *)outPtr, "&#%u;", tmpVal); // &#decimal;
1.49 paf 410: }
411: } else {
412: const XMLByte* recoverPtr=srcPtr-trailingBytes-1;
413: for(uint i=0; i<=trailingBytes; i++)
414: outPtr+=sprintf((char*)outPtr, "%%%02X", *recoverPtr++);
415: }
1.1 paf 416: }
1.35 paf 417:
418: // Update the bytes eaten
419: srcLen = srcPtr - srcData;
420:
421: // Return the characters read
422: toFillLen = outPtr - toFill;
1.11 paf 423:
1.29 paf 424: //return srcPtr==srcEnd?(int)toFillLen:-1;
425: /*
426: xmlCharEncodingOutputFunc
427: Returns :
428: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
429: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
430: of ocetes consumed.
431: */
432: return 0;
1.10 paf 433: }
434:
/// Tells whether byte @a c has to be written as a %XX escape.
/// Left unescaped: ASCII letters, ASCII digits and the characters * @ - _ + . /
/// Everything else is escaped, including NUL and all bytes above 127.
static bool is_escaped(char c){
	// work on the unsigned value: plain char may be signed, which would make
	// a "<=127" range test meaningless for bytes 0x80..0xFF
	const unsigned char uc=static_cast<unsigned char>(c);

	// NUL is handled explicitly: strchr() would match the terminator of the
	// list below and wrongly report NUL as an unescaped character
	if(uc==0 || uc>127)
		return true;

	if((uc>='0' && uc<='9')
		|| (uc>='A' && uc<='Z')
		|| (uc>='a' && uc<='z'))
		return false;

	return strchr("*@-_+./", uc)==0;
}
445:
446: // read one utf8 character, return number of bytes needed for store it
1.61 misha 447: static unsigned int readChar(const XMLByte*& srcPtr, const XMLByte* srcEnd, XMLByte& firstByte, XMLCh& UTF8Char){
1.60 misha 448: if(!srcPtr || !*srcPtr || srcPtr>=srcEnd)
449: return 0;
450:
451: firstByte=*srcPtr;
452:
453: if(firstByte<=127){
454: UTF8Char=firstByte;
455: srcPtr++;
456: return 1;
457: }
458:
459: unsigned int trailingBytes=gUTFBytes[firstByte];
460:
461: if(srcPtr+trailingBytes>=srcEnd){
462: return 0; // not enough bytes in source string for reading
463: }
464:
465: uint tmpVal=0;
466: switch(trailingBytes){
467: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
468: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
469: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
470: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
471: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
472: case 0: tmpVal+=*srcPtr++;
473: }
474:
475: tmpVal-=gUTFOffsets[trailingBytes];
476: UTF8Char=tmpVal;
477:
478: return trailingBytes+1;
479: }
480:
1.62 misha 481: static unsigned int skipChar(const XMLByte*& srcPtr, const XMLByte* srcEnd){
482: if(!srcPtr || !*srcPtr || srcPtr>=srcEnd)
483: return 0;
484:
1.63 misha 485: unsigned int trailingBytes=gUTFBytes[*srcPtr]+1;
486: srcPtr+=trailingBytes;
1.62 misha 487:
488: return trailingBytes;
1.61 misha 489: }
490:
1.60 misha 491: // read char, return number of bytes needed for store it as UTF8
1.61 misha 492: static unsigned int readChar(const XMLByte*& srcPtr, const XMLByte* srcEnd, XMLByte& firstByte, XMLCh& UTF8Char, const Charset::Tables& tables){
1.60 misha 493: if(!srcPtr || !*srcPtr || srcPtr>=srcEnd)
494: return 0;
495:
496: firstByte=*srcPtr++;
497: UTF8Char=tables.fromTable[firstByte];
498:
499: if(UTF8Char<0x80)
500: return 1;
501: else if(UTF8Char<0x800)
502: return 2;
503: else if(UTF8Char<0x10000)
504: return 3;
505: else if(UTF8Char<0x200000)
506: return 4;
507: else if(UTF8Char<0x4000000)
508: return 5;
509: else if(UTF8Char<= 0x7FFFFFFF)
510: return 6;
511:
512: // will use the replacement character '?'
513: firstByte=0;
514: return 1;
515: }
516:
517: static int escape(const XMLByte* srcData, size_t& srcLen,
518: XMLByte* toFill, size_t& toFillLen) {
519: const XMLByte* srcPtr=srcData;
520: const XMLByte* srcEnd=srcData+srcLen;
521: XMLByte* outPtr=toFill;
522: XMLByte* outEnd=toFill+toFillLen;
523: XMLByte firstByte;
524: XMLCh UTF8Char;
525: uint charSize;
526:
527: // loop until we either run out of input data, or room to store
528: while((outPtr < outEnd) && (charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char))){
529: if(charSize==1){
530: if(is_escaped(firstByte)) // %XX
531: outPtr+=sprintf((char*)outPtr, "%%%02X", firstByte);
532: else
533: *outPtr++=firstByte;
534: } else
535: outPtr+=sprintf((char*)outPtr, "%%u%04X", UTF8Char); // %uXXXX
536: }
537:
538: // Update the bytes eaten
539: srcLen=srcPtr-srcData;
540:
541: // Return the characters read
542: toFillLen=outPtr-toFill;
543:
544: return 0;
545: }
546:
547: static int escape(const XMLByte* srcData, size_t& srcLen,
548: XMLByte *toFill, size_t& toFillLen,
549: const Charset::Tables& tables) {
550: const XMLByte* srcPtr=srcData;
551: const XMLByte* srcEnd=srcData+srcLen;
552: XMLByte* outPtr=toFill;
553: //XMLByte* outEnd=toFill+toFillLen;
554: XMLByte firstByte;
555: XMLCh UTF8Char;
556: uint charSize;
557:
558: while(charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char, tables)){
559: if(charSize==1){
560: if(firstByte){
561: if(is_escaped(firstByte)) // %XX
562: outPtr+=sprintf((char*)outPtr, "%%%02X", firstByte);
563: else
564: *outPtr++=firstByte;
565: } else // add replacement char '?'
566: *outPtr++='?';
567: } else
568: outPtr+=sprintf((char*)outPtr, "%%u%04X", UTF8Char); // %uXXXX
569: }
570:
571: // Update the bytes eaten
572: srcLen = srcPtr - srcData;
573:
574: // Return the characters read
575: toFillLen = outPtr - toFill;
576:
577: return 0;
578: }
579:
580:
581: String::C Charset::escape(const String::C src, const Charset& source_charset){
582: size_t src_length=src.length;
583: if(!src_length)
584: return String::C("", 0);
585:
586: #ifdef PRECALCULATE_DEST_LENGTH
587: size_t dest_length=0;
588: const XMLByte* srcPtr=(XMLByte*)src.str;
589: const XMLByte* srcEnd=srcPtr+src_length;
590: XMLByte firstByte;
591: XMLCh UTF8Char;
592:
593: if(source_charset.isUTF8()){
594: while(uint charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char)){
595: if(charSize==1)
596: dest_length+=!is_escaped(firstByte)?1:3/*%XX*/;
597: else
598: dest_length+=6; // '%uXXXX'
599: }
600: } else {
601: while(uint charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char, source_charset.tables)){
602: if(charSize==1)
603: dest_length+=(!firstByte/*replacement char '?'*/ || !is_escaped(firstByte))?1:3/*'%XX'*/;
604: else
605: dest_length+=6; // '%uXXXX'
606: }
607: }
608: #else
609: size_t dest_length=src_length*6; // enough for %uXXXX but too memory-hungry
610: #endif
611:
612: //throw Exception(0,0,"%u",dest_length);
613:
614: #ifndef NDEBUG
615: size_t saved_dest_length=dest_length;
616: #endif
617: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
618:
619: int status;
620: if(source_charset.isUTF8()){
621: status=::escape((XMLByte *)src.str, src_length, dest_body, dest_length);
622: } else {
623: status=::escape((XMLByte *)src.str, src_length, dest_body, dest_length, source_charset.tables);
624: }
625:
626: if(status<0)
627: throw Exception(0,
628: 0,
629: "Charset::escapeString buffer overflow");
630:
631: assert(dest_length<=saved_dest_length);
632: dest_body[dest_length]=0; // terminator
633: return String::C((char*)dest_body, dest_length);
634: }
635:
1.64 ! misha 636: String::Body Charset::escape(const String::Body src, const Charset& source_charset) {
! 637: const char *src_ptr=src.cstr();
! 638: size_t src_size=strlen(src_ptr);
! 639:
! 640: String::C dest=Charset::escape(String::C(src_ptr, src_size),
! 641: source_charset);
! 642:
! 643: return String::Body(dest.str, dest.length);
! 644: }
! 645:
! 646: String& Charset::escape(const String& src, const Charset& source_charset) {
! 647: if(!src.length())
! 648: return *new String("", 0, false);
! 649:
! 650: return *new String(escape((String::Body)src, source_charset), String::L_CLEAN);
! 651: }
1.60 misha 652:
1.35 paf 653: const String::C Charset::transcodeToUTF8(const String::C src) const {
654: size_t src_length=src.length;
1.60 misha 655:
656: #ifdef PRECALCULATE_DEST_LENGTH
657: size_t dest_length=0;
658: const XMLByte* srcPtr=(XMLByte*)src.str;
659: const XMLByte* srcEnd=srcPtr+src_length;
660: XMLByte firstByte;
661: XMLCh UTF8Char;
662: while(uint charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char, tables))
663: dest_length+=charSize;
664: #else
665: size_t dest_length=src_length*6; // so that surly enough (max utf8 seq len=6) but too memory-hyngry
666: #endif
667:
668: //throw Exception(0,0,"%u",dest_length);
669:
1.35 paf 670: #ifndef NDEBUG
671: size_t saved_dest_length=dest_length;
672: #endif
673: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
1.11 paf 674:
675: if(::transcodeToUTF8(
1.35 paf 676: (XMLByte *)src.str, src_length,
677: dest_body, dest_length,
1.11 paf 678: tables)<0)
1.43 paf 679: throw Exception(0,
1.10 paf 680: 0,
1.11 paf 681: "Charset::transcodeToUTF8 buffer overflow");
1.10 paf 682:
1.60 misha 683: assert(dest_length<=saved_dest_length);
684: dest_body[dest_length]=0; // terminator
1.35 paf 685: return String::C((char*)dest_body, dest_length);
1.10 paf 686: }
1.38 paf 687:
688: static XMLCh change_case_UTF8(const XMLCh src, const Charset::UTF8CaseTable& table) {
1.39 paf 689: int lo = 0;
690: int hi = table.size - 1;
691: while(lo<=hi) {
1.38 paf 692: // Calc the mid point of the low and high offset.
1.39 paf 693: const unsigned int i = (lo + hi) / 2;
694:
695: XMLCh cur=table.records[i].from;
696: if(src==cur)
697: return table.records[i].to;
698: if(src>cur)
699: lo = i+1;
1.38 paf 700: else
1.39 paf 701: hi = i-1;
702: }
703:
704: // not found
1.38 paf 705: return src;
706: }
707:
1.58 misha 708: static void store_UTF8(XMLCh src, XMLByte*& outPtr){
1.38 paf 709: if(!src) {
710: // use the replacement character
711: *outPtr++= '?';
712: return;
713: }
714:
715: // Figure out how many bytes we need
716: unsigned int encodedBytes;
717: if(src<0x80)
718: encodedBytes = 1;
719: else if(src<0x800)
720: encodedBytes = 2;
721: else if(src<0x10000)
722: encodedBytes = 3;
723: else if(src<0x200000)
724: encodedBytes = 4;
725: else if(src<0x4000000)
726: encodedBytes = 5;
727: else if(src<= 0x7FFFFFFF)
728: encodedBytes = 6;
729: else {
730: // use the replacement character
731: *outPtr++= '?';
732: return;
733: }
734:
735: // And spit out the bytes. We spit them out in reverse order
736: // here, so bump up the output pointer and work down as we go.
737: outPtr+= encodedBytes;
738: switch(encodedBytes) {
739: case 6: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
740: src>>= 6;
741: case 5: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
742: src>>= 6;
743: case 4: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
744: src>>= 6;
745: case 3: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
746: src>>= 6;
747: case 2: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
748: src>>= 6;
749: case 1: *--outPtr = XMLByte(src | gFirstByteMark[encodedBytes]);
750: }
751:
752: // Add the encoded bytes back in again to indicate we've eaten them
753: outPtr+= encodedBytes;
754: }
755:
756: static void change_case_UTF8(XMLCh src, XMLByte*& outPtr,
757: const Charset::UTF8CaseTable& table) {
758: store_UTF8(change_case_UTF8(src, table), outPtr);
759: };
1.44 paf 760: void change_case_UTF8(const XMLByte* srcData, size_t srcLen,
761: XMLByte* toFill, size_t toFillLen,
762: const Charset::UTF8CaseTable& table) {
1.38 paf 763: const XMLByte* srcPtr=srcData;
1.44 paf 764: const XMLByte* srcEnd=srcData+srcLen;
1.38 paf 765: XMLByte* outPtr=toFill;
1.44 paf 766: XMLByte* outEnd=toFill+toFillLen;
767:
768: // We now loop until we either run out of input data, or room to store
769: while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
770: // Get the next leading byte out
771: const XMLByte firstByte =* srcPtr;
1.38 paf 772:
1.60 misha 773: if(firstByte<=127) {
1.38 paf 774: change_case_UTF8(firstByte, outPtr, table);
775: srcPtr++;
776: continue;
777: }
778:
779: // See how many trailing src bytes this sequence is going to require
780: const unsigned int trailingBytes = gUTFBytes[firstByte];
781:
782: // Looks ok, so lets build up the value
783: uint tmpVal=0;
784: switch(trailingBytes) {
785: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
786: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
787: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
788: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
789: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
790: case 0: tmpVal+=*srcPtr++;
791: break;
792:
793: default:
794: throw Exception(0,
795: 0,
796: "change_case_UTF8 error: wrong trailingBytes value(%d)", trailingBytes);
797: }
798: tmpVal-=gUTFOffsets[trailingBytes];
799:
800: // If it will fit into a single char, then put it in. Otherwise
801: // fail [*encode it as a surrogate pair. If its not valid, use the
802: // replacement char.*]
803: if(!(tmpVal & 0xFFFF0000))
804: change_case_UTF8(tmpVal, outPtr, table);
805: else
806: throw Exception(0,
807: 0,
808: "change_case_UTF8 error: too big tmpVal(0x%08X)", tmpVal);
809: }
810:
811: if(srcPtr!=outPtr)
812: throw Exception(0,
813: 0,
814: "change_case_UTF8 error: end pointers do not match");
815: }
816:
1.60 misha 817: static size_t getDecNumLength(XMLCh UTF8Char){
818: return
819: (UTF8Char < 100)
820: ?2
821: :(UTF8Char < 1000)
822: ?3
823: :(UTF8Char < 10000)
824: ?4
825: :5;
826: }
1.38 paf 827:
1.35 paf 828: const String::C Charset::transcodeFromUTF8(const String::C src) const {
829: size_t src_length=src.length;
1.60 misha 830:
831: #ifdef PRECALCULATE_DEST_LENGTH
832: size_t dest_length=0;
833: const XMLByte* srcPtr=(XMLByte*)src.str;
834: const XMLByte* srcEnd=srcPtr+src_length;
835: XMLByte firstByte;
836: XMLCh UTF8Char;
837: while(uint charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char)){
838: if(charSize==1)
839: dest_length++;
840: else
841: dest_length+=(UTF8Char & 0xFFFF0000)
842: ?charSize*3 // '%XX' for each byte
843: :(xlatOneTo(UTF8Char, tables, 0)!=0)
844: ?1 // can convert it to single char
845: :getDecNumLength(UTF8Char)+3; // &#XX; - &#XXXXX;
846: }
847: #else
848: // so that surly enough, "&#XXX;" has max ratio (huh? 8 bytes needed for '&#XXXXX;')
849: size_t dest_length=src_length*6;
850: #endif
851:
852: //throw Exception(0,0,"%u",dest_length);
853:
1.35 paf 854: #ifndef NDEBUG
855: size_t saved_dest_length=dest_length;
856: #endif
857: XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
1.11 paf 858:
859: if(::transcodeFromUTF8(
1.35 paf 860: (XMLByte *)src.str, src_length,
861: dest_body, dest_length,
1.11 paf 862: tables)<0)
1.43 paf 863: throw Exception(0,
1.10 paf 864: 0,
1.35 paf 865: "Charset::transcodeFromUTF8 buffer overflow");
1.10 paf 866:
1.60 misha 867: assert(dest_length<=saved_dest_length);
868: dest_body[dest_length]=0; // terminator
1.35 paf 869: return String::C((char*)dest_body, dest_length);
1.1 paf 870: }
871:
872: /// transcode using both charsets
1.35 paf 873: const String::C Charset::transcodeToCharset(const String::C src,
874: const Charset& dest_charset) const {
875: if(&dest_charset==this)
876: return src;
877: else {
878: size_t dest_length=src.length;
879: XMLByte* dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
880:
881: XMLByte* output=dest_body;
882: const XMLByte* input=(XMLByte *)src.str;
883: while(XMLCh c=*input++) {
884: XMLCh curVal = tables.fromTable[c];
885: *output++=curVal?
886: xlatOneTo(curVal, dest_charset.tables, '?') // OK
887: :'?'; // use the replacement character
1.6 paf 888: }
1.1 paf 889:
1.35 paf 890: dest_body[dest_length]=0; // terminator
891: return String::C((char*)dest_body, dest_length);
1.6 paf 892: }
1.1 paf 893: }
894:
1.58 misha 895: void Charset::store_Char(XMLByte*& outPtr, XMLCh src, XMLByte not_found){
1.59 misha 896: if(isUTF8())
1.58 misha 897: store_UTF8(src, outPtr);
1.59 misha 898: else if(char ch=xlatOneTo(src, tables, not_found))
1.58 misha 899: *outPtr++=ch;
1.57 misha 900: }
901:
1.1 paf 902: #ifdef XML
1.10 paf 903:
1.35 paf 904: static const Charset::Tables* tables[MAX_CHARSETS];
905:
1.46 paf 906: #ifdef PA_PATCHED_LIBXML_BACKWARD
907:
908: #define declareXml256ioFuncs(i) \
909: static int xml256CharEncodingInputFunc##i( \
910: unsigned char *out, int *outlen, \
911: const unsigned char *in, int *inlen, void*) { \
912: return transcodeToUTF8( \
913: in, *(size_t*)inlen, \
914: out, *(size_t*)outlen, \
915: *tables[i]); \
916: } \
917: static int xml256CharEncodingOutputFunc##i( \
918: unsigned char *out, int *outlen, \
919: const unsigned char *in, int *inlen, void*) { \
920: return transcodeFromUTF8( \
921: in, *(size_t*)inlen, \
922: out, *(size_t*)outlen, \
923: *tables[i]); \
924: }
925:
926: #else
927:
1.35 paf 928: #define declareXml256ioFuncs(i) \
929: static int xml256CharEncodingInputFunc##i( \
930: unsigned char *out, int *outlen, \
931: const unsigned char *in, int *inlen) { \
932: return transcodeToUTF8( \
933: in, *(size_t*)inlen, \
934: out, *(size_t*)outlen, \
935: *tables[i]); \
936: } \
937: static int xml256CharEncodingOutputFunc##i( \
938: unsigned char *out, int *outlen, \
939: const unsigned char *in, int *inlen) { \
940: return transcodeFromUTF8( \
941: in, *(size_t*)inlen, \
942: out, *(size_t*)outlen, \
943: *tables[i]); \
944: }
945:
1.46 paf 946: #endif
947:
948:
1.35 paf 949: declareXml256ioFuncs(0) declareXml256ioFuncs(1)
950: declareXml256ioFuncs(2) declareXml256ioFuncs(3)
951: declareXml256ioFuncs(4) declareXml256ioFuncs(5)
952: declareXml256ioFuncs(6) declareXml256ioFuncs(7)
953: declareXml256ioFuncs(8) declareXml256ioFuncs(9)
954:
955: static xmlCharEncodingInputFunc inputFuncs[MAX_CHARSETS]={
956: xml256CharEncodingInputFunc0, xml256CharEncodingInputFunc1,
957: xml256CharEncodingInputFunc2, xml256CharEncodingInputFunc3,
958: xml256CharEncodingInputFunc4, xml256CharEncodingInputFunc5,
959: xml256CharEncodingInputFunc6, xml256CharEncodingInputFunc7,
960: xml256CharEncodingInputFunc8, xml256CharEncodingInputFunc9
961: };
962: static xmlCharEncodingOutputFunc outputFuncs[MAX_CHARSETS]={
963: xml256CharEncodingOutputFunc0, xml256CharEncodingOutputFunc1,
964: xml256CharEncodingOutputFunc2, xml256CharEncodingOutputFunc3,
965: xml256CharEncodingOutputFunc4, xml256CharEncodingOutputFunc5,
966: xml256CharEncodingOutputFunc6, xml256CharEncodingOutputFunc7,
967: xml256CharEncodingOutputFunc8, xml256CharEncodingOutputFunc9
968: };
969: static size_t handlers_count=0;
1.10 paf 970:
971: void Charset::addEncoding(char *name_cstr) {
1.35 paf 972: if(handlers_count==MAX_CHARSETS)
973: throw Exception(0,
974: 0,
975: "already allocated %d handlers, no space for new encoding '%s'",
976: MAX_CHARSETS, name_cstr);
977:
1.45 paf 978: xmlCharEncodingHandler* handler=new(UseGC) xmlCharEncodingHandler;
1.35 paf 979: {
980: handler->name=name_cstr;
981: handler->input=inputFuncs[handlers_count];
982: handler->output=outputFuncs[handlers_count];
983: ::tables[handlers_count]=&tables;
984: handlers_count++;
985: }
1.10 paf 986:
987: xmlRegisterCharEncodingHandler(handler);
1.35 paf 988:
1.10 paf 989: }
990:
1.37 paf 991: void Charset::initTranscoder(const String::Body NAME, const char* name_cstr) {
1.15 paf 992: ftranscoder=xmlFindCharEncodingHandler(name_cstr);
1.35 paf 993: transcoder(NAME); // check right way
1.15 paf 994: }
995:
1.37 paf 996: xmlCharEncodingHandler& Charset::transcoder(const String::Body NAME) {
1.15 paf 997: if(!ftranscoder)
1.56 misha 998: throw Exception(PARSER_RUNTIME,
1.35 paf 999: new String(NAME, String::L_TAINTED),
1.10 paf 1000: "unsupported encoding");
1.35 paf 1001: return *ftranscoder;
1.10 paf 1002: }
1003:
1.54 paf 1004: String::C Charset::transcode_cstr(const xmlChar* s) {
1.13 paf 1005: if(!s)
1.35 paf 1006: return String::C("", 0);
1.8 paf 1007:
1.35 paf 1008: int inlen=strlen((const char*)s);
1.51 paf 1009: int outlen=inlen*6/*strlen("ÿ")*/; // max
1.35 paf 1010: #ifndef NDEBUG
1011: int saved_outlen=outlen;
1012: #endif
1013: char *out=new(PointerFreeGC) char[outlen+1];
1.8 paf 1014:
1.30 paf 1015: int error;
1.35 paf 1016: if(xmlCharEncodingOutputFunc output=transcoder(FNAME).output) {
1.30 paf 1017: error=output(
1.17 paf 1018: (unsigned char*)out, &outlen,
1.46 paf 1019: (const unsigned char*)s, &inlen
1020: #ifdef PA_PATCHED_LIBXML_BACKWARD
1021: ,0
1022: #endif
1023: );
1.30 paf 1024: } else {
1025: memcpy(out, s, outlen=inlen);
1026: error=0;
1027: }
1028: if(error<0)
1.23 paf 1029: throw Exception(0,
1.8 paf 1030: 0,
1.30 paf 1031: "transcode_cstr failed (%d)", error);
1.8 paf 1032:
1.35 paf 1033: assert(outlen<=saved_outlen); out[outlen]=0;
1034: return String::C(out, outlen);
1.14 paf 1035: }
1.54 paf 1036: const String& Charset::transcode(const xmlChar* s) {
1.35 paf 1037: String::C cstr=transcode_cstr(s);
1038: return *new String(cstr.str, cstr.length, true);
1.1 paf 1039: }
1040:
1.8 paf 1041: /// @test less memory using -maybe- xmlParserInputBufferCreateMem
1.35 paf 1042: xmlChar* Charset::transcode_buf2xchar(const char* buf, size_t buf_size) {
1043: xmlChar* out;
1.30 paf 1044: int outlen;
1045: int error;
1.35 paf 1046: #ifndef NDEBUG
1047: int saved_outlen;
1048: #endif
1049: if(xmlCharEncodingInputFunc input=transcoder(FNAME).input) {
1.51 paf 1050: outlen=buf_size*6/*max UTF8 bytes per char*/;
1.35 paf 1051: #ifndef NDEBUG
1052: saved_outlen=outlen;
1053: #endif
1.47 paf 1054: out=(xmlChar*)xmlMalloc(outlen+1);
1.30 paf 1055: error=input(
1.17 paf 1056: out, &outlen,
1.46 paf 1057: (const unsigned char*)buf, (int*)&buf_size
1058: #ifdef PA_PATCHED_LIBXML_BACKWARD
1059: ,0
1060: #endif
1061: );
1.30 paf 1062: } else {
1063: outlen=buf_size;
1.35 paf 1064: #ifndef NDEBUG
1065: saved_outlen=outlen;
1066: #endif
1067: out=(xmlChar*)xmlMalloc(outlen+1);
1.30 paf 1068: memcpy(out, buf, outlen);
1069: error=0;
1070: }
1.17 paf 1071:
1.30 paf 1072: if(error<0)
1.23 paf 1073: throw Exception(0,
1.8 paf 1074: 0,
1.30 paf 1075: "transcode_buf failed (%d)", error);
1.8 paf 1076:
1.35 paf 1077: assert(outlen<=saved_outlen); out[outlen]=0;
1078: return out;
1.24 paf 1079: }
1.54 paf 1080: xmlChar* Charset::transcode(const String& s) {
1.35 paf 1081: const char* cstr=s.cstr(String::L_UNSPECIFIED);
1.1 paf 1082:
1.54 paf 1083: return transcode_buf2xchar(cstr, strlen(cstr));
1.1 paf 1084: }
1.54 paf 1085: xmlChar* Charset::transcode(const String::Body s) {
1.35 paf 1086: const char* cstr=s.cstr();
1087:
1.54 paf 1088: return transcode_buf2xchar(cstr, s.length());
1.35 paf 1089: }
1.36 paf 1090: #endif
1.34 paf 1091:
1.37 paf 1092: String::Body Charset::transcode(const String::Body src,
1.34 paf 1093: const Charset& source_transcoder,
1.35 paf 1094: const Charset& dest_transcoder) {
1.34 paf 1095:
1.35 paf 1096: const char *src_ptr=src.cstr();
1.34 paf 1097: size_t src_size=strlen(src_ptr);
1098:
1.35 paf 1099: String::C dest=Charset::transcode(String::C(src_ptr, src_size),
1100: source_transcoder,
1101: dest_transcoder);
1.34 paf 1102:
1.37 paf 1103: return String::Body(dest.str, dest.length);
1.35 paf 1104: }
1105:
1106: String& Charset::transcode(const String& src,
1107: const Charset& source_transcoder,
1108: const Charset& dest_transcoder) {
1109: if(!src.length())
1110: return *new String("", 0, false);
1.34 paf 1111:
1.37 paf 1112: return *new String(transcode((String::Body)src, source_transcoder, dest_transcoder), String::L_CLEAN);
1.34 paf 1113: }
1114:
1.35 paf 1115: void Charset::transcode(ArrayString& src,
1.34 paf 1116: const Charset& source_transcoder,
1.35 paf 1117: const Charset& dest_transcoder) {
1118: for(size_t i=0; i<src.count(); i++)
1119: src.put(i, &transcode(*src[i], source_transcoder, dest_transcoder));
1.34 paf 1120: }
1121:
1122: #ifndef DOXYGEN
1123: struct Transcode_pair_info {
1124: const Charset* source_transcoder;
1125: const Charset* dest_transcoder;
1126: };
1127: #endif
1.40 paf 1128: static void transcode_pair(const String::Body /*akey*/,
1.37 paf 1129: String::Body& avalue,
1.35 paf 1130: Transcode_pair_info* info) {
1131: avalue=Charset::transcode(avalue,
1132: *info->source_transcoder,
1133: *info->dest_transcoder);
1.34 paf 1134: }
1.61 misha 1135:
1.35 paf 1136: void Charset::transcode(HashStringString& src,
1.34 paf 1137: const Charset& source_transcoder,
1.35 paf 1138: const Charset& dest_transcoder) {
1139: Transcode_pair_info info={&source_transcoder, &dest_transcoder};
1.55 paf 1140: src.for_each_ref<Transcode_pair_info*>(transcode_pair, &info);
1.34 paf 1141: }
1.61 misha 1142:
1143: size_t getUTF8BytePos(const XMLByte* srcBegin, const XMLByte* srcEnd, size_t charPos){
1144: const XMLByte* ptr=srcBegin;
1.62 misha 1145: while(charPos-- && skipChar(ptr, srcEnd));
1.61 misha 1146:
1147: return ptr-srcBegin;
1148: }
1149:
1150: size_t getUTF8CharPos(const XMLByte* srcBegin, const XMLByte* srcEnd, size_t bytePos){
1151: size_t charPos=0;
1152: const XMLByte* ptr=srcBegin;
1153: const XMLByte* ptrEnd=srcBegin+bytePos;
1.62 misha 1154: while(skipChar(ptr, srcEnd)){
1.61 misha 1155: if(ptr>ptrEnd)
1156: return charPos;
1157: charPos++;
1158: }
1159:
1160: // scan till end but position in bytes still too low
1161: throw Exception(0,
1162: 0,
1163: "Error convertion byte pos to char pos");
1164: }
1165:
1166: size_t lengthUTF8(const XMLByte* srcBegin, const XMLByte* srcEnd){
1167: size_t size=0;
1.62 misha 1168: while(skipChar(srcBegin, srcEnd))
1.61 misha 1169: size++;
1170:
1171: return size;
1172: }
E-mail: