parser3/src/main/pa_charset.C - annotate

Return to pa_charset.C CVS log
Up to [parser3project] / parser3 / src / main
Annotation of parser3/src/main/pa_charset.C, revision 1.112

1.1       paf         1: /** @file
                      2:        Parser: Charset connection implementation.
                      3: 
1.112   ! moko        4:        Copyright (c) 2001-2023 Art. Lebedev Studio (http://www.artlebedev.com)
        !             5:        Authors: Konstantin Morshnev <moko@design.ru>, Alexandr Petrosian <paf@design.ru>
1.27      paf         6: */
1.1       paf         7: 
                      8: #include "pa_charset.h"
1.35      paf         9: #include "pa_charsets.h"
1.1       paf        10: 
1.96      moko       11: // we are using some pcre_internal.h stuff as well
                     12: #include "../lib/pcre/pa_pcre_internal.h"
                     13: 
1.112   ! moko       14: volatile const char * IDENT_PA_CHARSET_C="$Id: pa_charset.C,v 1.111 2022/03/21 13:22:02 moko Exp $" IDENT_PA_CHARSET_H;
1.90      moko       15: 
1.1       paf        16: #ifdef XML
1.8       paf        17: #include "libxml/encoding.h"
1.1       paf        18: #endif
                     19: 
1.46      paf        20: //#define PA_PATCHED_LIBXML_BACKWARD
1.67      misha      21: 
                     22: // reduce memory usage by pre-calculation utf-8 string length
1.60      misha      23: #define PRECALCULATE_DEST_LENGTH
1.46      paf        24: 
1.38      paf        25: // globals
                     26: 
                     27: Charset::UTF8CaseTable::Rec UTF8CaseToUpperRecords[]={
                     28: #include "utf8-to-upper.inc"
                     29: };
                     30: Charset::UTF8CaseTable UTF8CaseToUpper={
                     31:        sizeof(UTF8CaseToUpperRecords)/sizeof(Charset::UTF8CaseTable::Rec),
                     32:        UTF8CaseToUpperRecords};
                     33: 
                     34: Charset::UTF8CaseTable::Rec UTF8CaseToLowerRecords[]={
                     35: #include "utf8-to-lower.inc"
                     36: };
                     37: Charset::UTF8CaseTable UTF8CaseToLower={
                     38:        sizeof(UTF8CaseToLowerRecords)/sizeof(Charset::UTF8CaseTable::Rec),
                     39:        UTF8CaseToLowerRecords};
                     40: 
1.1       paf        41: // helpers
                     42: 
                     43: inline void prepare_case_tables(unsigned char *tables) {
                     44:        unsigned char *lcc_table=tables+lcc_offset;
                     45:        unsigned char *fcc_table=tables+fcc_offset;
                     46:        for(int i=0; i<0x100; i++)
1.53      paf        47:                lcc_table[i]=fcc_table[i]=(unsigned char)i;
1.1       paf        48: }
1.99      moko       49: inline void cstr2ctypes(unsigned char *tables, const unsigned char *cstr, unsigned char bit) {
1.1       paf        50:        unsigned char *ctypes_table=tables+ctypes_offset;
                     51:        ctypes_table[0]=bit;
                     52:        for(; *cstr; cstr++) {
                     53:                unsigned char c=*cstr;
                     54:                ctypes_table[c]|=bit;
                     55:        }
                     56: }
1.35      paf        57: inline unsigned int to_wchar_code(const char* cstr) {
1.1       paf        58:        if(!cstr || !*cstr)
                     59:                return 0;
                     60:        if(cstr[1]==0)
1.4       paf        61:                return(unsigned int)(unsigned char)cstr[0];
1.1       paf        62: 
1.91      moko       63:        return pa_atoui(cstr,0);
1.1       paf        64: }
/// A cell counts as "true" when it is a non-null, non-empty string.
inline bool to_bool(const char* cstr) {
	if(!cstr)
		return false;
	return cstr[0]!=0;
}
1.99      moko       68: static void element2ctypes(unsigned char c, bool belongs, unsigned char *tables, unsigned char bit, int group_offset=-1) {
1.1       paf        69:        if(!belongs)
                     70:                return;
                     71: 
                     72:        unsigned char *ctypes_table=tables+ctypes_offset;
                     73: 
                     74:        ctypes_table[c]|=bit;
                     75:        if(group_offset>=0)
1.4       paf        76:                tables[cbits_offset+group_offset+c/8] |= 1<<(c%8);
1.1       paf        77: }
1.99      moko       78: static void element2case(unsigned char from, unsigned char to, unsigned char *tables) {
1.1       paf        79:        if(!to) 
                     80:                return;
                     81: 
                     82:        unsigned char *lcc_table=tables+lcc_offset;
                     83:        unsigned char *fcc_table=tables+fcc_offset;
                     84:        lcc_table[from]=to;
                     85:        fcc_table[from]=to; fcc_table[to]=from;
                     86: }
                     87: 
1.95      moko       88: inline XMLByte *append_hex_8(XMLByte *dest, unsigned char c, const char* prefix=0) {
1.93      moko       89:     if(prefix) {
1.95      moko       90:         strcpy((char *)dest, prefix);
1.93      moko       91:         dest+=strlen(prefix);
                     92:     }
                     93:     *dest++=hex_digits[c >> 4];
                     94:     *dest++=hex_digits[c & 0x0F];
1.95      moko       95:     return dest;
1.93      moko       96: }
                     97: 
1.95      moko       98: inline XMLByte *append_hex_16(XMLByte *dest, unsigned int c, const char* prefix=0) {
1.93      moko       99:     if(prefix) {
1.95      moko      100:         strcpy((char *)dest, prefix);
1.93      moko      101:         dest+=strlen(prefix);
                    102:     }
                    103:     *dest++=hex_digits[(c >> 12) & 0x0F];
                    104:     *dest++=hex_digits[(c >> 8) & 0x0F];
                    105:     *dest++=hex_digits[(c >> 4) & 0x0F];
                    106:     *dest++=hex_digits[(c) & 0x0F];
1.95      moko      107:     return dest;
1.93      moko      108: }
                    109: 
1.1       paf       110: // methods
                    111: 
1.103     moko      112: Charset::Charset(Request_charsets* acharsets, const String::Body ANAME, const String* afile_spec): 
1.35      paf       113:        FNAME(ANAME),
                    114:        FNAME_CSTR(ANAME.cstrm()) {
1.7       paf       115: 
1.35      paf       116:        if(afile_spec) {
1.1       paf       117:                fisUTF8=false;
1.103     moko      118:                load_definition(*acharsets, *afile_spec);
1.1       paf       119: #ifdef XML
1.35      paf       120:                addEncoding(FNAME_CSTR);
1.1       paf       121: #endif
                    122:        } else {
                    123:                fisUTF8=true;
1.4       paf       124:                // grab default onces [for UTF-8 so to be able to make a-z =>A-Z
1.96      moko      125:                memcpy(pcre_tables, pa_pcre_default_tables, sizeof(pcre_tables));
1.1       paf       126:        }
                    127: 
                    128: #ifdef XML
1.35      paf       129:        initTranscoder(FNAME, FNAME_CSTR);
1.1       paf       130: #endif
                    131: }
                    132: 
1.104     moko      133: void Charset::load_definition(Request_charsets& acharsets, const String& afile_spec) {
1.1       paf       134:        // pcre_tables
                    135:        // lowcase, flipcase, bits digit+word+whitespace, masks
                    136: 
                    137:        // must not move this inside of prepare_case_tables
                    138:        // don't know the size there
                    139:        memset(pcre_tables, 0, sizeof(pcre_tables)); 
                    140:        prepare_case_tables(pcre_tables);
1.4       paf       141:        cstr2ctypes(pcre_tables,(const unsigned char *)"*+?{^.$|()[", ctype_meta);
1.1       paf       142: 
                    143:        // charset
1.35      paf       144:        memset(&tables, 0, sizeof(tables));
1.1       paf       145: 
                    146:        // loading text
1.104     moko      147:        char *data=file_read_text(acharsets, afile_spec);
1.1       paf       148: 
                    149:        // ignore header
                    150:        getrow(&data);
                    151: 
                    152:        // parse cells
                    153:        char *row;
1.42      paf       154:        while((row=getrow(&data))) {
1.1       paf       155:                // remove empty&comment lines
                    156:                if(!*row || *row=='#')
                    157:                        continue;
                    158: 
                    159:                // char white-space     digit   hex-digit       letter  word    lowercase       unicode1        unicode2        
1.53      paf       160:                unsigned char c=0;
1.1       paf       161:                char *cell;
1.42      paf       162:                for(int column=0; (cell=lsplit(&row, '\t')); column++) {
1.1       paf       163:                        switch(column) {
1.53      paf       164:                        case 0: c=(unsigned char)to_wchar_code(cell); break;
1.1       paf       165:                        // pcre_tables
                    166:                        case 1: element2ctypes(c, to_bool(cell), pcre_tables, ctype_space, cbit_space); break;
                    167:                        case 2: element2ctypes(c, to_bool(cell), pcre_tables, ctype_digit, cbit_digit); break;
                    168:                        case 3: element2ctypes(c, to_bool(cell), pcre_tables, ctype_xdigit); break;
                    169:                        case 4: element2ctypes(c, to_bool(cell), pcre_tables, ctype_letter); break;
                    170:                        case 5: element2ctypes(c, to_bool(cell), pcre_tables, ctype_word, cbit_word); break;
1.53      paf       171:                        case 6: element2case(c, (unsigned char)to_wchar_code(cell), pcre_tables); break;
1.1       paf       172:                        case 7:
                    173:                        case 8:
                    174:                                // charset
1.10      paf       175:                                if(tables.toTableSize>MAX_CHARSET_UNI_CODES)
1.100     moko      176:                                        throw Exception(PARSER_RUNTIME, &afile_spec, "charset must contain not more then %d unicode values", MAX_CHARSET_UNI_CODES);
1.1       paf       177: 
                    178:                                XMLCh unicode=(XMLCh)to_wchar_code(cell);
                    179:                                if(!unicode && column==7/*unicode1 column*/)
                    180:                                        unicode=(XMLCh)c;
                    181:                                if(unicode) {
1.10      paf       182:                                        if(!tables.fromTable[c])
                    183:                                                tables.fromTable[c]=unicode;
                    184:                                        tables.toTable[tables.toTableSize].intCh=unicode;
                    185:                                        tables.toTable[tables.toTableSize].extCh=(XMLByte)c;
                    186:                                        tables.toTableSize++;
1.1       paf       187:                                }
                    188:                                break;
                    189:                        }
                    190:                }
                    191:        };
                    192: 
1.87      moko      193:        // parser charset tables declare only white-space before 0x20, thus adding the missing chars
                    194:        for(uint i=0; i<0x20; i++)
                    195:                if(!tables.fromTable[i]){
                    196:                        tables.fromTable[i]=i;
                    197:                        tables.toTable[tables.toTableSize].intCh=i;
                    198:                        tables.toTable[tables.toTableSize].extCh=(XMLByte)i;
                    199:                        tables.toTableSize++;
                    200:                }
                    201: 
1.1       paf       202:        // sort by the Unicode code point
                    203:        sort_ToTable();
                    204: }
                    205: 
                    206: static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
                    207:        return 
1.38      paf       208:                static_cast<const Charset::Tables::Rec *>(a)->intCh-
                    209:                static_cast<const Charset::Tables::Rec *>(b)->intCh;
1.1       paf       210: }
                    211: 
                    212: void Charset::sort_ToTable() {
1.92      moko      213:        qsort(tables.toTable, tables.toTableSize, sizeof(*tables.toTable), sort_cmp_Trans_rec_intCh);
1.1       paf       214: }
                    215: 
1.60      misha     216: // @todo: precache for spedup searching
1.99      moko      217: static XMLByte xlatOneTo(const XMLCh toXlat, const Charset::Tables& tables, XMLByte not_found) {
1.80      misha     218:        int lo = 0;
                    219:        int hi = tables.toTableSize - 1;
1.39      paf       220:        while(lo<=hi) {
1.35      paf       221:                // Calc the mid point of the low and high offset.
1.39      paf       222:                const unsigned int i = (lo + hi) / 2;
                    223: 
                    224:                XMLCh cur=tables.toTable[i].intCh;
                    225:                if(toXlat==cur)
                    226:                        return tables.toTable[i].extCh;
                    227:                if(toXlat>cur)
                    228:                        lo = i+1;
1.1       paf       229:                else
1.39      paf       230:                        hi = i-1;
                    231:        }
1.35      paf       232:        
                    233:        return not_found;
1.1       paf       234: }
                    235: 
1.99      moko      236: String::C Charset::transcode(const String::C src, const Charset& source_charset, const Charset& dest_charset) {
1.35      paf       237:        if(!src.length)
                    238:                return String::C("", 0);
1.4       paf       239: 
1.1       paf       240:        switch((source_charset.isUTF8()?0x10:0x00)|(dest_charset.isUTF8()?0x01:0x00)) {
                    241:                default: // 0x00
1.35      paf       242:                        return source_charset.transcodeToCharset(src, dest_charset);
1.1       paf       243:                case 0x01:
1.35      paf       244:                        return source_charset.transcodeToUTF8(src);
1.1       paf       245:                case 0x10:
1.35      paf       246:                        return dest_charset.transcodeFromUTF8(src);
1.1       paf       247:                case 0x11:
1.35      paf       248:                        return src;
1.1       paf       249:        }
                    250: }
                    251: 
                    252: // ---------------------------------------------------------------------------
                    253: //  Local static data
                    254: //
                    255: //  gUTFBytes
                    256: //      A list of counts of trailing bytes for each initial byte in the input.
                    257: //
                    258: //  gUTFOffsets
                    259: //      A list of values to offset each result char type, according to how
                    260: //      many source bytes when into making it.
                    261: //
                    262: //  gFirstByteMark
                    263: //      A list of values to mask onto the first byte of an encoded sequence,
                    264: //      indexed by the number of bytes used to create the sequence.
                    265: // ---------------------------------------------------------------------------
                    266: static const XMLByte gUTFBytes[0x100] = {
                    267:         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
                    268:     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
                    269:     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
                    270:     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
                    271:     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
                    272:     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
                    273:     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
                    274:     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
                    275:     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
                    276:     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
                    277:     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
                    278:     ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
                    279:     ,   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
                    280:     ,   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
                    281:     ,   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
                    282:     ,   3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
                    283: };
                    284: 
                    285: static const uint gUTFOffsets[6] = {
1.80      misha     286:        0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
1.1       paf       287: };
                    288: 
                    289: static const XMLByte gFirstByteMark[7] = {
1.80      misha     290:        0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
1.1       paf       291: };
                    292: 
1.99      moko      293: static int transcodeToUTF8(const XMLByte* srcData, int& srcLen, XMLByte *toFill, int& toFillLen, const Charset::Tables& tables) {
1.11      paf       294:        const XMLByte* srcPtr=srcData;
                    295:        const XMLByte* srcEnd=srcData+srcLen;
                    296:        XMLByte* outPtr=toFill;
                    297:        XMLByte* outEnd=toFill+toFillLen;
1.1       paf       298: 
1.35      paf       299:        while(srcPtr<srcEnd) {
                    300:                uint curVal = tables.fromTable[*srcPtr];
1.1       paf       301:                if(!curVal) {
1.35      paf       302:                        // use the replacement character
                    303:                        *outPtr++= '?';
                    304:                        srcPtr++;
                    305:                        continue;
                    306:                }
1.1       paf       307: 
1.35      paf       308:                // Figure out how many bytes we need
                    309:                unsigned int encodedBytes;
                    310:                if(curVal<0x80)
                    311:                        encodedBytes = 1;
                    312:                else if(curVal<0x800)
                    313:                        encodedBytes = 2;
                    314:                else if(curVal<0x10000)
                    315:                        encodedBytes = 3;
                    316:                else if(curVal<0x200000)
                    317:                        encodedBytes = 4;
                    318:                else if(curVal<0x4000000)
                    319:                        encodedBytes = 5;
                    320:                else if(curVal<= 0x7FFFFFFF)
                    321:                        encodedBytes = 6;
                    322:                else {
                    323:                        // use the replacement character
                    324:                        *outPtr++= '?';
                    325:                        srcPtr++;
                    326:                        continue;
                    327:                }
1.11      paf       328: 
1.35      paf       329:                //  If we cannot fully get this char into the output buffer
                    330:                if (outPtr + encodedBytes > outEnd)
                    331:                        break;
                    332:                
                    333:                // We can do it, so update the source index
                    334:                srcPtr++;
                    335:                
                    336:                //  And spit out the bytes. We spit them out in reverse order
                    337:                //  here, so bump up the output pointer and work down as we go.
                    338:                outPtr+= encodedBytes;
                    339:                switch(encodedBytes) {
1.60      misha     340:                        case 6: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
                    341:                                curVal>>= 6;
                    342:                        case 5: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
                    343:                                curVal>>= 6;
                    344:                        case 4: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
                    345:                                curVal>>= 6;
                    346:                        case 3: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
                    347:                                curVal>>= 6;
                    348:                        case 2: *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
                    349:                                curVal>>= 6;
                    350:                        case 1: *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]);
1.35      paf       351:                }
                    352:                
                    353:                // Add the encoded bytes back in again to indicate we've eaten them
                    354:                outPtr+= encodedBytes;
                    355:        }
                    356:        
                    357:        // Update the bytes eaten
                    358:        srcLen = srcPtr - srcData;
                    359:        
                    360:        // Return the characters read
                    361:        toFillLen = outPtr - toFill;
                    362:        
1.29      paf       363:        //return srcPtr==srcEnd?(int)toFillLen:-1;
                    364: /*
                    365: xmlCharEncodingInputFunc
                    366: Returns :
                    367: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
                    368: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
                    369: of ocetes consumed.
                    370: */
                    371:        return 0;
1.1       paf       372: }
1.26      paf       373: /// @todo digital entites only when xml/html output [at output in html/xml mode, in html part of a letter]
1.99      moko      374: static int transcodeFromUTF8(const XMLByte* srcData, int& srcLen, XMLByte* toFill, int& toFillLen, const Charset::Tables& tables) {
1.11      paf       375:        const XMLByte* srcPtr=srcData;
                    376:        const XMLByte* srcEnd=srcData+srcLen;
                    377:        XMLByte* outPtr=toFill;
                    378:        XMLByte* outEnd=toFill+toFillLen;
1.1       paf       379: 
1.35      paf       380:        //  We now loop until we either run out of input data, or room to store
                    381:        while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
                    382:                // Get the next leading byte out
                    383:                const XMLByte firstByte =* srcPtr;
                    384:                
                    385:                // Special-case ASCII, which is a leading byte value of<= 127
1.60      misha     386:                if(firstByte<=127) {
1.35      paf       387:                        *outPtr++= firstByte;
                    388:                        srcPtr++;
                    389:                        continue;
                    390:                }
                    391:                
                    392:                // See how many trailing src bytes this sequence is going to require
                    393:                const unsigned int trailingBytes = gUTFBytes[firstByte];
                    394:                
                    395:                //  If there are not enough source bytes to do this one, then we
                    396:                //  are done. Note that we done>= here because we are implicitly
                    397:                //  counting the 1 byte we get no matter what.
                    398:                if(srcPtr+trailingBytes>= srcEnd)
                    399:                        break;
                    400:                
                    401:                // Looks ok, so lets build up the value
                    402:                uint tmpVal=0;
                    403:                switch(trailingBytes) {
                    404:                case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
                    405:                case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
                    406:                case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
                    407:                case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
                    408:                case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
                    409:                case 0: tmpVal+=*srcPtr++;
                    410:                        break;
                    411:                        
                    412:                default:
1.100     moko      413:                        throw Exception(0, 0, "transcodeFromUTF8 error: wrong trailingBytes value(%d)", trailingBytes); // never
1.35      paf       414:                }
                    415:                tmpVal-=gUTFOffsets[trailingBytes];
                    416:                
                    417:                //  If it will fit into a single char, then put it in. Otherwise
                    418:                //  fail [*encode it as a surrogate pair. If its not valid, use the
                    419:                //  replacement char.*]
                    420:                if(!(tmpVal & 0xFFFF0000)) {
1.25      paf       421:                        if(XMLByte xlat=xlatOneTo(tmpVal, tables, 0))
                    422:                                *outPtr++=xlat;
1.49      paf       423:                        else {
1.50      paf       424:                                outPtr+=sprintf((char *)outPtr, "&#%u;", tmpVal); // &#decimal;
1.49      paf       425:                        }
                    426:                } else {
                    427:                        const XMLByte* recoverPtr=srcPtr-trailingBytes-1;
                    428:                        for(uint i=0; i<=trailingBytes; i++)
                    429:                                outPtr+=sprintf((char*)outPtr, "%%%02X", *recoverPtr++);
                    430:                }
1.1       paf       431:        }
1.35      paf       432:        
                    433:        // Update the bytes eaten
                    434:        srcLen = srcPtr - srcData;
                    435:        
                    436:        // Return the characters read
                    437:        toFillLen = outPtr - toFill;
1.11      paf       438: 
1.29      paf       439:        //return srcPtr==srcEnd?(int)toFillLen:-1;
                    440: /*
                    441: xmlCharEncodingOutputFunc
                    442: Returns :
                    443: the number of byte written, or -1 by lack of space, or -2 if the transcoding failed. The value of inlen after return is the
                    444: number of octets consumed as the return value is positive, else unpredictiable. The value of outlen after return is the number
                    445: of ocetes consumed.
                    446: */
                    447:        return 0;
1.10      paf       448: }
                    449: 
1.85      misha     450: static bool need_escape(XMLByte c){
1.60      misha     451:        return
1.66      misha     452:                !(
                    453:                        (c<=127)
                    454:                        && (
1.89      misha     455:                                pa_isalnum((unsigned char)c)
1.66      misha     456:                                || strchr("*@-_+./", c)!=0
                    457:                        )
                    458:                );
1.60      misha     459: }
                    460: 
1.70      misha     461: // read one UTF8 char and return length of this char (in bytes)
                    462: static unsigned int readUTF8Char(const XMLByte*& srcPtr, const XMLByte* srcEnd, XMLByte& firstByte, XMLCh& UTF8Char){
1.60      misha     463:        if(!srcPtr || !*srcPtr || srcPtr>=srcEnd)
                    464:                return 0;
                    465: 
                    466:        firstByte=*srcPtr;
                    467: 
                    468:        if(firstByte<=127){
                    469:                UTF8Char=firstByte;
                    470:                srcPtr++;
                    471:                return 1;
                    472:        }
                    473: 
                    474:        unsigned int trailingBytes=gUTFBytes[firstByte];
                    475: 
                    476:        if(srcPtr+trailingBytes>=srcEnd){
                    477:                return 0; // not enough bytes in source string for reading
                    478:        }
                    479: 
                    480:        uint tmpVal=0;
                    481:        switch(trailingBytes){
                    482:                case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
                    483:                case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
                    484:                case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
                    485:                case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
                    486:                case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
                    487:                case 0: tmpVal+=*srcPtr++;
                    488:        }
                    489: 
                    490:        tmpVal-=gUTFOffsets[trailingBytes];
                    491:        UTF8Char=tmpVal;
                    492: 
                    493:        return trailingBytes+1;
                    494: }
                    495: 
1.70      misha     496: // skip UTF8 char and return length of this char (in bytes)
                    497: static unsigned int skipUTF8Char(const XMLByte*& srcPtr, const XMLByte* srcEnd){
1.62      misha     498:        if(!srcPtr || !*srcPtr || srcPtr>=srcEnd)
                    499:                return 0;
                    500: 
1.63      misha     501:        unsigned int trailingBytes=gUTFBytes[*srcPtr]+1;
                    502:        srcPtr+=trailingBytes;
1.62      misha     503: 
                    504:        return trailingBytes;
1.61      misha     505: }
                    506: 
1.85      misha     507: // read non-UTF8 char, and return number of bytes needed for storing this char in UTF8
1.61      misha     508: static unsigned int readChar(const XMLByte*& srcPtr, const XMLByte* srcEnd, XMLByte& firstByte, XMLCh& UTF8Char, const Charset::Tables& tables){
1.60      misha     509:        if(!srcPtr || !*srcPtr || srcPtr>=srcEnd)
                    510:                return 0;
                    511: 
                    512:        firstByte=*srcPtr++;
                    513:        UTF8Char=tables.fromTable[firstByte];
                    514: 
                    515:        if(UTF8Char<0x80)
                    516:                return 1;
                    517:        else if(UTF8Char<0x800)
                    518:                return 2;
                    519:        else if(UTF8Char<0x10000)
                    520:                return 3;
                    521:        else if(UTF8Char<0x200000)
                    522:                return 4;
                    523:        else if(UTF8Char<0x4000000)
                    524:                return 5;
                    525:        else if(UTF8Char<= 0x7FFFFFFF)
                    526:                return 6;
                    527: 
                    528:        // will use the replacement character '?'
                    529:        firstByte=0;
                    530:        return 1;
                    531: }
                    532: 
1.85      misha     533: size_t Charset::calc_escaped_length_UTF8(XMLByte* src, size_t src_length){
                    534:        size_t dest_length=0;
                    535: 
                    536:        for(UTF8_string_iterator i(src, src_length); i.has_next(); ){
                    537:                if(i.getCharSize()==1)
                    538:                        dest_length+=!need_escape(i.getFirstByte())?1/*as-is*/:3/*%XX*/;
                    539:                else
                    540:                        dest_length+=6; // %uXXXX
1.60      misha     541:        }
                    542: 
1.85      misha     543:        return dest_length;
1.60      misha     544: }
                    545: 
1.86      moko      546: size_t Charset::calc_escaped_length(const XMLByte* src, size_t src_length, const Charset::Tables& tables){
                    547:        const XMLByte* src_end=src+src_length;
                    548:        XMLByte first_byte;
                    549:        XMLCh UTF8_char;
1.85      misha     550:        size_t dest_length=0;
                    551: 
1.86      moko      552:        while(uint char_size=readChar(src, src_end, first_byte, UTF8_char, tables)){
1.85      misha     553:                if(char_size==1)
                    554:                        dest_length+=(!first_byte/*replacement char '?'*/ || !need_escape(first_byte))?1:3/*'%XX'*/;
                    555:                else
                    556:                        dest_length+=6; // %uXXXX
1.60      misha     557:        }
                    558: 
1.85      misha     559:        return dest_length;
                    560: }
                    561: 
                    562: size_t Charset::calc_escaped_length(const String::C src, const Charset& source_charset){
1.86      moko      563:        if(!src.length)
1.85      misha     564:                return 0;
                    565: 
                    566: #ifdef PRECALCULATE_DEST_LENGTH
                    567:        if(source_charset.isUTF8())
1.86      moko      568:                return calc_escaped_length_UTF8((XMLByte *)src.str, src.length);
1.85      misha     569:        else
1.86      moko      570:                return calc_escaped_length((XMLByte *)src.str, src.length, source_charset.tables);
1.85      misha     571: #else
                    572:        return src_length*6; // enough for %uXXXX but too memory-hungry
                    573: #endif
                    574: }
                    575: 
                    576: #define escape_char(dest_ptr, char_size, first_byte, UTF8_char) \
                    577:        if(char_size==1) \
                    578:                if(first_byte){ \
                    579:                        if(need_escape(first_byte)) \
1.95      moko      580:                                dest_ptr=append_hex_8(dest_ptr, first_byte, "%");  /* %XX */ \
1.85      misha     581:                        else \
                    582:                                *dest_ptr++=first_byte; /*as is*/ \
                    583:                } else \
                    584:                        *dest_ptr++='?'; /* replacement char '?' */ \
                    585:        else \
1.95      moko      586:                dest_ptr=append_hex_16(dest_ptr, UTF8_char, "%u"); /* %uXXXX */
1.85      misha     587: 
                    588: 
                    589: size_t Charset::escape_UTF8(const XMLByte* src, size_t src_length, XMLByte* dest) {
                    590:        XMLByte* dest_ptr=dest;
                    591: 
                    592:        // loop until we either run out of input data
                    593:        for(UTF8_string_iterator i((XMLByte *)src, src_length); i.has_next(); )
                    594:                escape_char(dest_ptr, i.getCharSize(), i.getFirstByte(), i.next())
1.60      misha     595:        
1.85      misha     596:        return dest_ptr - dest;
1.60      misha     597: }
                    598: 
1.85      misha     599: size_t Charset::escape(const XMLByte* src, size_t src_length, XMLByte* dest, const Charset::Tables& tables) {
                    600:        const XMLByte* src_end=src+src_length;
                    601:        XMLByte* dest_ptr=dest;
                    602: 
                    603:        XMLByte first_byte;
                    604:        XMLCh UTF8_char;
                    605:        uint char_size;
                    606: 
1.86      moko      607:        while(char_size=readChar(src, src_end, first_byte, UTF8_char, tables))
1.85      misha     608:                escape_char(dest_ptr, char_size, first_byte, UTF8_char)
                    609: 
                    610:        return dest_ptr - dest;
                    611: }
1.60      misha     612: 
                    613: String::C Charset::escape(const String::C src, const Charset& source_charset){
1.86      moko      614:        if(!src.length)
1.60      misha     615:                return String::C("", 0);
                    616: 
1.85      misha     617:        size_t dest_calculated_length=calc_escaped_length(src, source_charset);
                    618:        XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_calculated_length+1/*terminator*/];
                    619: 
                    620:        size_t dest_length;
                    621:        if(source_charset.isUTF8())
1.86      moko      622:                dest_length=escape_UTF8((XMLByte *)src.str, src.length, dest_body);
1.85      misha     623:        else
1.86      moko      624:                dest_length=escape((XMLByte *)src.str, src.length, dest_body, source_charset.tables);
1.85      misha     625: 
                    626:        if(dest_length>dest_calculated_length)
                    627:                throw Exception(0, 0, "Charset::escape buffer overflow");
                    628: 
                    629:        dest_body[dest_length]=0; // terminator
                    630:        return String::C((char*)dest_body, dest_length);
                    631: }
                    632: 
                    633: String::Body Charset::escape(const String::Body src, const Charset& source_charset) {
1.86      moko      634:        String::C dest=Charset::escape(String::C(src.cstr(), src.length()), source_charset);
1.85      misha     635:        return String::Body(dest.length ? dest.str:0);
                    636: }
                    637: 
                    638: String& Charset::escape(const String& src, const Charset& source_charset) {
                    639:        if(src.is_empty())
                    640:                return *new String();
                    641: 
                    642:        return *new String(escape((String::Body)src, source_charset), String::L_CLEAN);
                    643: }
                    644: 
                    645: inline bool need_json_escape(unsigned char c){
                    646:        return strchr("\n\"\\/\t\r\b\f", c)!=0;
                    647: }
                    648: 
                    649: size_t Charset::calc_JSON_escaped_length_UTF8(XMLByte* src, size_t src_length){
                    650:        size_t dest_length=0;
                    651: 
                    652:        for(UTF8_string_iterator i(src, src_length); i.has_next(); ){
1.93      moko      653:                if(i.getCharSize()==1){
                    654:                        XMLByte first_byte=i.getFirstByte();
                    655:                        dest_length+=need_json_escape(first_byte) ? 2 : (first_byte < 0x20 && first_byte /* 0 replacement char is '?' */) ? 6 : 1;
                    656:                } else
1.85      misha     657:                        dest_length+=6; // \uXXXX
                    658:        }
                    659: 
                    660:        return dest_length;
                    661: }
                    662: 
1.86      moko      663: size_t Charset::calc_JSON_escaped_length(const XMLByte* src, size_t src_length, const Charset::Tables& tables){
                    664:        const XMLByte* src_end=src+src_length;
1.85      misha     665:        XMLByte first_byte;
                    666:        XMLCh UTF8_char;
1.60      misha     667:        size_t dest_length=0;
                    668: 
1.86      moko      669:        while(uint char_size=readChar(src, src_end, first_byte, UTF8_char, tables)){
1.85      misha     670:                if(char_size==1)
1.93      moko      671:                        dest_length+=need_json_escape(first_byte) ? 2 : (first_byte < 0x20 && first_byte /* 0 replacement char is '?' */) ? 6 : 1;
1.85      misha     672:                else
                    673:                        dest_length+=6; // \uXXXX
1.60      misha     674:        }
1.85      misha     675: 
                    676:        return dest_length;
                    677: }
                    678: 
                    679: size_t Charset::calc_JSON_escaped_length(const String::C src, const Charset& source_charset){
1.86      moko      680:        if(!src.length)
1.85      misha     681:                return 0;
                    682: 
                    683: #ifdef PRECALCULATE_DEST_LENGTH
                    684:        if(source_charset.isUTF8())
1.86      moko      685:                return calc_JSON_escaped_length_UTF8((XMLByte *)src.str, src.length);
1.85      misha     686:        else
1.86      moko      687:                return calc_JSON_escaped_length((XMLByte *)src.str, src.length, source_charset.tables);
1.60      misha     688: #else
1.85      misha     689:        return src_length*6; // enough for \uXXXX but too memory-hungry
1.60      misha     690: #endif
1.85      misha     691: }
                    692: 
                    693: #define escape_char_JSON(dest_ptr, char_size, first_byte, UTF8_char) \
                    694:        if(char_size==1) \
                    695:                switch(first_byte){ \
                    696:                        case '\n': *dest_ptr++='\\'; *dest_ptr++='n';  break; \
                    697:                        case '"' : *dest_ptr++='\\'; *dest_ptr++='"';  break; \
                    698:                        case '\\': *dest_ptr++='\\'; *dest_ptr++='\\'; break; \
                    699:                        case '/' : *dest_ptr++='\\'; *dest_ptr++='/';  break; \
                    700:                        case '\t': *dest_ptr++='\\'; *dest_ptr++='t';  break; \
                    701:                        case '\r': *dest_ptr++='\\'; *dest_ptr++='r';  break; \
                    702:                        case '\b': *dest_ptr++='\\'; *dest_ptr++='b';  break; \
                    703:                        case '\f': *dest_ptr++='\\'; *dest_ptr++='f';  break; \
                    704:                        case   0 : *dest_ptr++='?'; break; /*replacement char*/ \
1.95      moko      705:                        default  : if(first_byte < 0x20) dest_ptr=append_hex_16(dest_ptr, UTF8_char, "\\u"); \
1.93      moko      706:                                                else *dest_ptr++=first_byte; \
1.85      misha     707:                } \
                    708:        else \
1.95      moko      709:                dest_ptr=append_hex_16(dest_ptr, UTF8_char, "\\u"); // \uXXXX
1.85      misha     710: 
                    711: 
                    712: size_t Charset::escape_JSON_UTF8(const XMLByte* src, size_t src_length, XMLByte* dest) {
                    713:        XMLByte* dest_ptr=dest;
                    714: 
                    715:        // loop until we either run out of input data
                    716:        for(UTF8_string_iterator i((XMLByte *)src, src_length); i.has_next(); )
                    717:                escape_char_JSON(dest_ptr, i.getCharSize(), i.getFirstByte(), i.next())
                    718: 
                    719:        return dest_ptr - dest;
                    720: }
                    721: 
                    722: size_t Charset::escape_JSON(const XMLByte* src, size_t src_length, XMLByte* dest, const Charset::Tables& tables) {
                    723:        const XMLByte* src_end=src+src_length;
                    724:        XMLByte* dest_ptr=dest;
                    725: 
                    726:        XMLByte first_byte;
                    727:        XMLCh UTF8_char;
                    728:        uint char_size;
                    729: 
1.86      moko      730:        while(char_size=readChar(src, src_end, first_byte, UTF8_char, tables))
1.85      misha     731:                escape_char_JSON(dest_ptr, char_size, first_byte, UTF8_char)
                    732: 
                    733:        return dest_ptr - dest;
                    734: }
1.60      misha     735: 
1.85      misha     736: String::C Charset::escape_JSON(const String::C src, const Charset& source_charset){
1.86      moko      737:        if(!src.length)
1.85      misha     738:                return String::C("", 0);
1.60      misha     739: 
1.85      misha     740:        size_t dest_calculated_length=calc_JSON_escaped_length(src, source_charset);
                    741:        XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_calculated_length+1/*terminator*/];
                    742: 
                    743:        size_t dest_length;
                    744:        if(source_charset.isUTF8())
1.86      moko      745:                dest_length=escape_JSON_UTF8((XMLByte *)src.str, src.length, dest_body);
1.85      misha     746:        else
1.86      moko      747:                dest_length=escape_JSON((XMLByte *)src.str, src.length, dest_body, source_charset.tables);
1.60      misha     748: 
1.85      misha     749:        if(dest_length>dest_calculated_length)
                    750:                throw Exception(0, 0, "Charset::escape_JSON buffer overflow");
1.60      misha     751: 
                    752:        dest_body[dest_length]=0; // terminator
                    753:        return String::C((char*)dest_body, dest_length);
                    754: }
1.85      misha     755: 
                    756: String::Body Charset::escape_JSON(const String::Body src, const Charset& source_charset) {
1.86      moko      757:        String::C dest=Charset::escape_JSON(String::C(src.cstr(), src.length()), source_charset);
1.77      misha     758:        return String::Body(dest.length ? dest.str:0);
1.64      misha     759: }
                    760: 
1.85      misha     761: String& Charset::escape_JSON(const String& src, const Charset& source_charset) {
1.72      misha     762:        if(src.is_empty())
1.73      misha     763:                return *new String();
1.64      misha     764: 
1.85      misha     765:        return *new String(escape_JSON((String::Body)src, source_charset), String::L_CLEAN);
1.64      misha     766: }
1.60      misha     767: 
1.35      paf       768: const String::C Charset::transcodeToUTF8(const String::C src) const {
1.71      misha     769:        int src_length=src.length;
1.60      misha     770: 
                    771: #ifdef PRECALCULATE_DEST_LENGTH
1.71      misha     772:        int dest_length=0;
1.60      misha     773:        const XMLByte* srcPtr=(XMLByte*)src.str;
                    774:        const XMLByte* srcEnd=srcPtr+src_length;
1.69      misha     775:        XMLByte firstByte;
                    776:        XMLCh UTF8Char;
                    777:        while(uint charSize=readChar(srcPtr, srcEnd, firstByte, UTF8Char, tables))
1.60      misha     778:                dest_length+=charSize;
                    779: #else
1.85      misha     780:        int dest_length=src_length*6; // so that surly enough (max utf8 seq len=6) but too memory-hungry
1.60      misha     781: #endif
                    782: 
1.35      paf       783: #ifndef NDEBUG
1.71      misha     784:        int saved_dest_length=dest_length;
1.35      paf       785: #endif
                    786:        XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
1.11      paf       787: 
                    788:        if(::transcodeToUTF8(
1.35      paf       789:                (XMLByte *)src.str, src_length,
                    790:                dest_body, dest_length,
1.11      paf       791:                tables)<0)
1.100     moko      792:                throw Exception(0, 0, "Charset::transcodeToUTF8 buffer overflow");
1.10      paf       793: 
1.60      misha     794:        assert(dest_length<=saved_dest_length);
                    795:        dest_body[dest_length]=0; // terminator
1.35      paf       796:        return String::C((char*)dest_body, dest_length);
1.10      paf       797: }
1.38      paf       798: 
                    799: static XMLCh change_case_UTF8(const XMLCh src, const Charset::UTF8CaseTable& table) {
1.80      misha     800:        int lo = 0;
                    801:        int hi = table.size - 1;
1.39      paf       802:        while(lo<=hi) {
1.38      paf       803:                // Calc the mid point of the low and high offset.
1.39      paf       804:                const unsigned int i = (lo + hi) / 2;
                    805: 
                    806:                XMLCh cur=table.records[i].from;
                    807:                if(src==cur)
                    808:                        return table.records[i].to;
                    809:                if(src>cur)
                    810:                        lo = i+1;
1.38      paf       811:                else
1.39      paf       812:                        hi = i-1;
                    813:        }
                    814: 
                    815:        // not found
1.38      paf       816:        return src;
                    817: }
                    818: 
1.58      misha     819: static void store_UTF8(XMLCh src, XMLByte*& outPtr){
1.38      paf       820:        if(!src) {
                    821:                // use the replacement character
                    822:                *outPtr++= '?';
                    823:                return;
                    824:        }
                    825: 
                    826:        // Figure out how many bytes we need
                    827:        unsigned int encodedBytes;
                    828:        if(src<0x80)
                    829:                encodedBytes = 1;
                    830:        else if(src<0x800)
                    831:                encodedBytes = 2;
                    832:        else if(src<0x10000)
                    833:                encodedBytes = 3;
                    834:        else if(src<0x200000)
                    835:                encodedBytes = 4;
                    836:        else if(src<0x4000000)
                    837:                encodedBytes = 5;
                    838:        else if(src<= 0x7FFFFFFF)
                    839:                encodedBytes = 6;
                    840:        else {
                    841:                // use the replacement character
                    842:                *outPtr++= '?';
                    843:                return;
                    844:        }
                    845: 
                    846:        //  And spit out the bytes. We spit them out in reverse order
                    847:        //  here, so bump up the output pointer and work down as we go.
                    848:        outPtr+= encodedBytes;
                    849:        switch(encodedBytes) {
                    850:        case 6: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
                    851:                src>>= 6;
                    852:        case 5: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
                    853:                src>>= 6;
                    854:        case 4: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
                    855:                src>>= 6;
                    856:        case 3: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
                    857:                src>>= 6;
                    858:        case 2: *--outPtr = XMLByte((src | 0x80UL) & 0xBFUL);
                    859:                src>>= 6;
                    860:        case 1: *--outPtr = XMLByte(src | gFirstByteMark[encodedBytes]);
                    861:        }
                    862:        
                    863:        // Add the encoded bytes back in again to indicate we've eaten them
                    864:        outPtr+= encodedBytes;
                    865: }
                    866: 
1.99      moko      867: static void change_case_UTF8(XMLCh src, XMLByte*& outPtr, const Charset::UTF8CaseTable& table) {
1.38      paf       868:        store_UTF8(change_case_UTF8(src, table), outPtr);
1.98      moko      869: }
                    870: 
1.99      moko      871: void change_case_UTF8(const XMLByte* srcData, size_t srcLen, XMLByte* toFill, size_t toFillLen, const Charset::UTF8CaseTable& table) {
1.38      paf       872:        const XMLByte* srcPtr=srcData;
1.44      paf       873:        const XMLByte* srcEnd=srcData+srcLen;
1.38      paf       874:        XMLByte* outPtr=toFill;
1.44      paf       875:        XMLByte* outEnd=toFill+toFillLen;
                    876: 
                    877:        //  We now loop until we either run out of input data, or room to store
                    878:        while ((srcPtr < srcEnd) && (outPtr < outEnd)) {
                    879:                // Get the next leading byte out
                    880:                const XMLByte firstByte =* srcPtr;
1.38      paf       881: 
1.60      misha     882:                if(firstByte<=127) {
1.38      paf       883:                        change_case_UTF8(firstByte, outPtr, table);
                    884:                        srcPtr++;
                    885:                        continue;
                    886:                }
                    887:                
                    888:                // See how many trailing src bytes this sequence is going to require
                    889:                const unsigned int trailingBytes = gUTFBytes[firstByte];
                    890:                
                    891:                // Looks ok, so lets build up the value
                    892:                uint tmpVal=0;
                    893:                switch(trailingBytes) {
                    894:                case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
                    895:                case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
                    896:                case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
                    897:                case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
                    898:                case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
                    899:                case 0: tmpVal+=*srcPtr++;
                    900:                        break;
                    901:                        
                    902:                default:
1.100     moko      903:                        throw Exception(0, 0, "change_case_UTF8 error: wrong trailingBytes value(%d)", trailingBytes);
1.38      paf       904:                }
                    905:                tmpVal-=gUTFOffsets[trailingBytes];
                    906:                
                    907:                //  If it will fit into a single char, then put it in. Otherwise
                    908:                //  fail [*encode it as a surrogate pair. If its not valid, use the
                    909:                //  replacement char.*]
                    910:                if(!(tmpVal & 0xFFFF0000))
                    911:                        change_case_UTF8(tmpVal, outPtr, table);
                    912:                else
1.100     moko      913:                        throw Exception(0, 0, "change_case_UTF8 error: too big tmpVal(0x%08X)", tmpVal);
1.38      paf       914:        }
                    915:        
                    916:        if(srcPtr!=outPtr)
1.100     moko      917:                throw Exception(0, 0, "change_case_UTF8 error: end pointers do not match");
1.38      paf       918: }
                    919: 
1.60      misha     920: static size_t getDecNumLength(XMLCh UTF8Char){
                    921:        return
                    922:                (UTF8Char < 100)
                    923:                        ?2
                    924:                        :(UTF8Char < 1000)
                    925:                                ?3
                    926:                                :(UTF8Char < 10000)
                    927:                                        ?4
                    928:                                        :5;
                    929: }
1.38      paf       930: 
1.35      paf       931: const String::C Charset::transcodeFromUTF8(const String::C src) const {
1.82      misha     932:        int src_length=src.length;
1.60      misha     933: #ifdef PRECALCULATE_DEST_LENGTH
1.71      misha     934:        int dest_length=0;
1.82      misha     935:        for(UTF8_string_iterator i((XMLByte *)src.str, src_length); i.has_next(); ){
1.88      misha     936:                dest_length += ( i.next() & 0xFFFF0000 )
                    937:                                                ? 3*i.getCharSize()                                             // %XX for each byte
                    938:                                                : ( xlatOneTo(i.next(), tables, 0) != 0 )
                    939:                                                        ? 1                                                                     // can convert it to a single char
                    940:                                                        : 3+getDecNumLength( i.next() );        // print char as &#XX;, &#XXX;, &#XXXX; or &#XXXXX;
1.60      misha     941:        }
                    942: #else
                    943:        // so that surly enough, "&#XXX;" has max ratio (huh? 8 bytes needed for '&#XXXXX;')
1.82      misha     944:        int dest_length=src_length*6;
1.60      misha     945: #endif
                    946: 
1.35      paf       947: #ifndef NDEBUG
1.71      misha     948:        int saved_dest_length=dest_length;
1.35      paf       949: #endif
                    950:        XMLByte *dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
1.11      paf       951: 
                    952:        if(::transcodeFromUTF8(
1.82      misha     953:                (XMLByte *)src.str, src_length,
1.35      paf       954:                dest_body, dest_length,
1.11      paf       955:                tables)<0)
1.100     moko      956:                throw Exception(0, 0, "Charset::transcodeFromUTF8 buffer overflow");
1.10      paf       957: 
1.60      misha     958:        assert(dest_length<=saved_dest_length);
                    959:        dest_body[dest_length]=0; // terminator
1.35      paf       960:        return String::C((char*)dest_body, dest_length);
1.1       paf       961: }
                    962: 
                    963: /// transcode using both charsets
1.99      moko      964: const String::C Charset::transcodeToCharset(const String::C src, const Charset& dest_charset) const {
1.35      paf       965:        if(&dest_charset==this) 
                    966:                return src;
                    967:        else {
                    968:                size_t dest_length=src.length;
                    969:                XMLByte* dest_body=new(PointerFreeGC) XMLByte[dest_length+1/*for terminator*/];
                    970: 
                    971:                XMLByte* output=dest_body;
                    972:                const XMLByte* input=(XMLByte *)src.str;
                    973:                while(XMLCh c=*input++) {
                    974:                        XMLCh curVal = tables.fromTable[c];
                    975:                        *output++=curVal?
                    976:                                xlatOneTo(curVal, dest_charset.tables, '?') // OK
                    977:                                :'?'; // use the replacement character
1.6       paf       978:                }
1.1       paf       979: 
1.35      paf       980:                dest_body[dest_length]=0; // terminator
                    981:                return String::C((char*)dest_body, dest_length);
1.6       paf       982:        }
1.1       paf       983: }                      
                    984: 
1.58      misha     985: void Charset::store_Char(XMLByte*& outPtr, XMLCh src, XMLByte not_found){
1.59      misha     986:        if(isUTF8())
1.58      misha     987:                store_UTF8(src, outPtr);
1.59      misha     988:        else if(char ch=xlatOneTo(src, tables, not_found))
1.58      misha     989:                        *outPtr++=ch;
1.57      misha     990: }
                    991: 
1.1       paf       992: #ifdef XML
1.10      paf       993: 
// Per-slot charset tables; slot i is dereferenced by the i-th pair of
// xml256CharEncoding*Func functions generated below.
static const Charset::Tables* tables[MAX_CHARSETS];
// Per-slot libxml encoding handler records (filled in elsewhere in this file).
static xmlCharEncodingHandler xml_encoding_handlers[MAX_CHARSETS];
1.35      paf       996: 
// declareXml256ioFuncs(i) defines a pair of static callbacks for charset slot i:
//   xml256CharEncodingInputFunc<i>  forwards to transcodeToUTF8(...,   *tables[i])
//   xml256CharEncodingOutputFunc<i> forwards to transcodeFromUTF8(..., *tables[i])
// Both pass the in/out buffers and their lengths through and return the
// transcoder's result unchanged.
// Two variants exist: with PA_PATCHED_LIBXML_BACKWARD the callbacks take an
// extra, unused void* argument; the #define for that symbol is commented out
// near the top of this file.
#ifdef PA_PATCHED_LIBXML_BACKWARD

// variant with the trailing unused void* parameter
#define declareXml256ioFuncs(i) \
       static int xml256CharEncodingInputFunc##i( \
               unsigned char *out, int *outlen, \
               const unsigned char *in, int *inlen, void*) { \
               return transcodeToUTF8( \
                       in, *inlen, \
                       out, *outlen, \
                       *tables[i]); \
       } \
       static int xml256CharEncodingOutputFunc##i( \
               unsigned char *out, int *outlen, \
               const unsigned char *in, int *inlen, void*) { \
               return transcodeFromUTF8( \
                       in, *inlen, \
                       out, *outlen, \
                       *tables[i]); \
       }

#else

// variant with the four-argument callback signature
#define declareXml256ioFuncs(i) \
       static int xml256CharEncodingInputFunc##i( \
               unsigned char *out, int *outlen, \
               const unsigned char *in, int *inlen) { \
               return transcodeToUTF8( \
                       in, *inlen, \
                       out, *outlen, \
                       *tables[i]); \
       } \
       static int xml256CharEncodingOutputFunc##i( \
               unsigned char *out, int *outlen, \
               const unsigned char *in, int *inlen) { \
               return transcodeFromUTF8( \
                       in, *inlen, \
                       out, *outlen, \
                       *tables[i]); \
       }

#endif
                   1038: 
                   1039: 
1.35      paf      1040: declareXml256ioFuncs(0)        declareXml256ioFuncs(1)
                   1041: declareXml256ioFuncs(2)        declareXml256ioFuncs(3)
                   1042: declareXml256ioFuncs(4)        declareXml256ioFuncs(5)
                   1043: declareXml256ioFuncs(6)        declareXml256ioFuncs(7)
                   1044: declareXml256ioFuncs(8)        declareXml256ioFuncs(9)
                   1045: 
                   1046: static xmlCharEncodingInputFunc inputFuncs[MAX_CHARSETS]={
                   1047:        xml256CharEncodingInputFunc0,   xml256CharEncodingInputFunc1,
                   1048:        xml256CharEncodingInputFunc2,   xml256CharEncodingInputFunc3,
                   1049:        xml256CharEncodingInputFunc4,   xml256CharEncodingInputFunc5,
                   1050:        xml256CharEncodingInputFunc6,   xml256CharEncodingInputFunc7,
                   1051:        xml256CharEncodingInputFunc8,   xml256CharEncodingInputFunc9
                   1052: };
                   1053: static xmlCharEncodingOutputFunc outputFuncs[MAX_CHARSETS]={
                   1054:        xml256CharEncodingOutputFunc0,  xml256CharEncodingOutputFunc1,
                   1055:        xml256CharEncodingOutputFunc2,  xml256CharEncodingOutputFunc3,
                   1056:        xml256CharEncodingOutputFunc4,  xml256CharEncodingOutputFunc5,
                   1057:        xml256CharEncodingOutputFunc6,  xml256CharEncodingOutputFunc7,
                   1058:        xml256CharEncodingOutputFunc8,  xml256CharEncodingOutputFunc9
                   1059: };
                   1060: static size_t handlers_count=0;
1.10      paf      1061: 
                   1062: void Charset::addEncoding(char *name_cstr) {
1.35      paf      1063:        if(handlers_count==MAX_CHARSETS)
1.100     moko     1064:                throw Exception(0, 0, "already allocated %d handlers, no space for new encoding '%s'", MAX_CHARSETS, name_cstr);
1.35      paf      1065: 
1.111     moko     1066:        xmlCharEncodingHandler* handler=&xml_encoding_handlers[handlers_count];
1.35      paf      1067:        {
                   1068:                handler->name=name_cstr;
                   1069:                handler->input=inputFuncs[handlers_count]; 
                   1070:                handler->output=outputFuncs[handlers_count]; 
                   1071:                ::tables[handlers_count]=&tables;
                   1072:                handlers_count++;
                   1073:        }
1.10      paf      1074:        
                   1075:        xmlRegisterCharEncodingHandler(handler);
1.35      paf      1076: 
1.10      paf      1077: }
                   1078: 
1.37      paf      1079: void Charset::initTranscoder(const String::Body NAME, const char* name_cstr) {
1.15      paf      1080:        ftranscoder=xmlFindCharEncodingHandler(name_cstr);
1.35      paf      1081:        transcoder(NAME); // check right way
1.15      paf      1082: }
                   1083: 
1.37      paf      1084: xmlCharEncodingHandler& Charset::transcoder(const String::Body NAME) {
1.15      paf      1085:        if(!ftranscoder)
1.100     moko     1086:                throw Exception(PARSER_RUNTIME, new String(NAME, String::L_TAINTED), "unsupported encoding");
1.35      paf      1087:        return *ftranscoder;
1.10      paf      1088: }
                   1089: 
1.54      paf      1090: String::C Charset::transcode_cstr(const xmlChar* s) {
1.13      paf      1091:        if(!s)
1.35      paf      1092:                return String::C("", 0);
1.8       paf      1093: 
1.35      paf      1094:        int inlen=strlen((const char*)s);
1.51      paf      1095:        int outlen=inlen*6/*strlen("&#255;")*/; // max
1.35      paf      1096: #ifndef NDEBUG
                   1097:        int saved_outlen=outlen;
                   1098: #endif
                   1099:        char *out=new(PointerFreeGC) char[outlen+1];
1.8       paf      1100:        
1.30      paf      1101:        int error;
1.35      paf      1102:        if(xmlCharEncodingOutputFunc output=transcoder(FNAME).output) {
1.30      paf      1103:                error=output(
1.17      paf      1104:                        (unsigned char*)out, &outlen,
1.46      paf      1105:                        (const unsigned char*)s, &inlen
                   1106: #ifdef PA_PATCHED_LIBXML_BACKWARD
                   1107:                        ,0
                   1108: #endif
                   1109:                        );
1.30      paf      1110:        } else {
                   1111:                memcpy(out, s, outlen=inlen);
                   1112:                error=0;
                   1113:        }
                   1114:        if(error<0)
1.100     moko     1115:                throw Exception(0, 0, "transcode_cstr failed (%d)", error);
1.8       paf      1116: 
1.35      paf      1117:        assert(outlen<=saved_outlen); out[outlen]=0;
                   1118:        return String::C(out, outlen);
1.14      paf      1119: }
1.54      paf      1120: const String& Charset::transcode(const xmlChar* s) { 
1.35      paf      1121:        String::C cstr=transcode_cstr(s);
1.75      misha    1122:        return *new String(cstr.str, String::L_TAINTED);
1.1       paf      1123: }
                   1124: 
1.8       paf      1125: /// @test less memory using -maybe- xmlParserInputBufferCreateMem
1.35      paf      1126: xmlChar* Charset::transcode_buf2xchar(const char* buf, size_t buf_size) {
                   1127:        xmlChar* out;
1.30      paf      1128:        int outlen;
                   1129:        int error;
1.35      paf      1130: #ifndef NDEBUG
                   1131:        int saved_outlen;
                   1132: #endif
                   1133:        if(xmlCharEncodingInputFunc input=transcoder(FNAME).input) {
1.51      paf      1134:                outlen=buf_size*6/*max UTF8 bytes per char*/;
1.35      paf      1135: #ifndef NDEBUG
                   1136:                saved_outlen=outlen;
                   1137: #endif
1.47      paf      1138:                out=(xmlChar*)xmlMalloc(outlen+1);
1.30      paf      1139:                error=input(
1.17      paf      1140:                        out, &outlen,
1.46      paf      1141:                        (const unsigned char*)buf, (int*)&buf_size
                   1142: #ifdef PA_PATCHED_LIBXML_BACKWARD
                   1143:                        ,0
                   1144: #endif
                   1145:                        );
1.30      paf      1146:        } else {
                   1147:                outlen=buf_size;
1.35      paf      1148: #ifndef NDEBUG
                   1149:                saved_outlen=outlen;
                   1150: #endif
                   1151:                out=(xmlChar*)xmlMalloc(outlen+1);
1.30      paf      1152:                memcpy(out, buf, outlen);
                   1153:                error=0;
                   1154:        }
1.17      paf      1155:        
1.30      paf      1156:        if(error<0)
1.100     moko     1157:                throw Exception(0, 0, "transcode_buf failed (%d)", error);
1.8       paf      1158: 
1.35      paf      1159:        assert(outlen<=saved_outlen); out[outlen]=0;
                   1160:        return out;
1.24      paf      1161: }
1.1       paf      1162: 
1.79      misha    1163: xmlChar* Charset::transcode(const String& s) {
                   1164:        String::Body sbody=s.cstr_to_string_body_untaint(String::L_AS_IS);
                   1165:        return transcode_buf2xchar(sbody.cstr(), sbody.length()); 
1.1       paf      1166: }
1.35      paf      1167: 
1.79      misha    1168: xmlChar* Charset::transcode(const String::Body s) {
                   1169:        return transcode_buf2xchar(s.cstr(), s.length()); 
1.35      paf      1170: }
1.36      paf      1171: #endif
1.34      paf      1172: 
1.99      moko     1173: String::Body Charset::transcode(const String::Body src, const Charset& source_transcoder, const Charset& dest_transcoder) {
1.101     moko     1174:        return String::Body(Charset::transcode(String::C(src.cstr(), src.length()), source_transcoder, dest_transcoder));
1.35      paf      1175: }
                   1176: 
1.99      moko     1177: String& Charset::transcode(const String& src, const Charset& source_transcoder, const Charset& dest_transcoder) {
1.72      misha    1178:        if(src.is_empty())
1.73      misha    1179:                return *new String();
1.34      paf      1180: 
1.37      paf      1181:        return *new String(transcode((String::Body)src, source_transcoder, dest_transcoder), String::L_CLEAN);
1.34      paf      1182: }
                   1183: 
1.99      moko     1184: void Charset::transcode(ArrayString& src, const Charset& source_transcoder, const Charset& dest_transcoder) {
1.35      paf      1185:        for(size_t i=0; i<src.count(); i++)
                   1186:                src.put(i, &transcode(*src[i], source_transcoder, dest_transcoder));
1.34      paf      1187: }
                   1188: 
                   1189: #ifndef DOXYGEN
                   1190: struct Transcode_pair_info {
                   1191:        const Charset* source_transcoder;
                   1192:        const Charset* dest_transcoder;
                   1193: };
                   1194: #endif
1.99      moko     1195: static void transcode_pair(HashStringValue::key_type /*akey*/, String::Body& avalue, Transcode_pair_info* info) {
                   1196:        avalue=Charset::transcode(avalue, *info->source_transcoder, *info->dest_transcoder);
1.34      paf      1197: }
1.61      misha    1198: 
1.99      moko     1199: void Charset::transcode(HashStringString& src, const Charset& source_transcoder, const Charset& dest_transcoder) {
1.35      paf      1200:        Transcode_pair_info info={&source_transcoder, &dest_transcoder};
1.55      paf      1201:        src.for_each_ref<Transcode_pair_info*>(transcode_pair, &info);
1.34      paf      1202: }
1.61      misha    1203: 
                   1204: size_t getUTF8BytePos(const XMLByte* srcBegin, const XMLByte* srcEnd, size_t charPos){
                   1205:        const XMLByte* ptr=srcBegin;
1.70      misha    1206:        while(charPos-- && skipUTF8Char(ptr, srcEnd));
1.61      misha    1207: 
                   1208:        return ptr-srcBegin;
                   1209: }
                   1210: 
                   1211: size_t getUTF8CharPos(const XMLByte* srcBegin, const XMLByte* srcEnd, size_t bytePos){
                   1212:        size_t charPos=0;
                   1213:        const XMLByte* ptr=srcBegin;
                   1214:        const XMLByte* ptrEnd=srcBegin+bytePos;
1.70      misha    1215:        while(skipUTF8Char(ptr, srcEnd)){
1.61      misha    1216:                if(ptr>ptrEnd)
                   1217:                        return charPos;
                   1218:                charPos++;
                   1219:        }
                   1220: 
                   1221:        // scan till end but position in bytes still too low
1.107     moko     1222:        throw Exception(0, 0, "Error conversion byte pos to char pos");
1.61      misha    1223: }
                   1224: 
                   1225: size_t lengthUTF8(const XMLByte* srcBegin, const XMLByte* srcEnd){
                   1226:        size_t size=0;
1.70      misha    1227:        while(skipUTF8Char(srcBegin, srcEnd))
1.61      misha    1228:                size++;
                   1229: 
                   1230:        return size;
                   1231: }
1.80      misha    1232: 
1.84      misha    1233: unsigned int lengthUTF8Char(const XMLByte c){
                   1234:        return gUTFBytes[c]+1;
                   1235: }
                   1236: 
1.94      moko     1237: const char *fixUTF8(const char *src){
                   1238:        if(src && *src){
                   1239:                size_t length=strlen(src);
                   1240: 
                   1241:                int error_offset;
1.96      moko     1242:                if(pa_pcre_valid_utf((unsigned char *)src, length, &error_offset)){
1.94      moko     1243: 
                   1244:                        char *result=(char *)pa_malloc_atomic(length+1);
                   1245:                        char *dst=result;
                   1246: 
                   1247:                        do {
                   1248: 
                   1249:                                if(error_offset){
1.110     moko     1250:                                        memcpy(dst, src, error_offset);
1.94      moko     1251:                                        dst+=error_offset;
                   1252: 
                   1253:                                        src+=error_offset;
                   1254:                                        length-=error_offset;
                   1255: 
                   1256:                                }
                   1257: 
                   1258:                                *dst++='?';
                   1259:                                src++;
                   1260:                                length--;
                   1261: 
1.96      moko     1262:                        } while (length && pa_pcre_valid_utf((unsigned char *)src, length, &error_offset));
1.94      moko     1263: 
                   1264:                        if(length){
                   1265:                                strcpy(dst, src);
                   1266:                        } else {
                   1267:                                *dst='\0';
                   1268:                        }
                   1269: 
                   1270:                        return result;
                   1271:                }
                   1272:        }
                   1273:        return src;
                   1274: }
                   1275: 
1.80      misha    1276: bool UTF8_string_iterator::has_next(){
                   1277:        fcharSize=readUTF8Char(fsrcPtr, fsrcEnd, ffirstByte, fUTF8Char);
                   1278:        return fcharSize!=0;
                   1279: }
E-mail: