Annotation of parser3/src/include/pa_charset.h, revision 1.61

1.1       paf         1: /** @file
                      2:        Parser: Charset connection decl.
                      3: 
1.59      moko        4:        Copyright (c) 2001-2024 Art. Lebedev Studio (http://www.artlebedev.com)
1.57      moko        5:        Authors: Konstantin Morshnev <moko@design.ru>, Alexandr Petrosian <paf@design.ru>
1.1       paf         6: */
                      7: 
                      8: #ifndef PA_CHARSET_H
                      9: #define PA_CHARSET_H
1.17      paf        10: 
1.61    ! moko       11: #define IDENT_PA_CHARSET_H "$Id: pa_charset.h,v 1.60 2024/12/11 21:57:35 moko Exp $"
1.23      paf        12: 
1.1       paf        13: 
                     14: #include "pa_exception.h"
                     15: #include "pa_common.h"
1.23      paf        16: #include "pa_hash.h"
                     17: #include "pa_array.h"
1.1       paf        18: 
1.58      moko       19: #ifdef HAVE_PCRE2
                     20: #include <pcre2.h>
                     21: #else
                     22: #include <pcre.h>
                     23: #endif
1.51      moko       24: // we are using some pcre_internal.h stuff as well
                     25: #include "../lib/pcre/pa_pcre_internal.h"
1.1       paf        26: 
1.3       paf        27: #ifdef XML
1.60      moko       28: #include "libxml/xmlstring.h"
1.3       paf        29: #include "libxml/encoding.h"
                     30: #endif
                     31: 
1.1       paf        32: // defines
                     33: 
                         // Limit on the number of charsets. Its use is not visible in this header
                         // (NOTE(review): presumably the capacity of a charset registry -- confirm).
1.23      paf        34: #define MAX_CHARSETS 10
                     35: 
                         // Capacity (number of Rec entries) of Charset::Tables::toTable.
                     36: #define MAX_CHARSET_UNI_CODES 500
                     37: 
                         // Fallback character types: XMLCh is the character-code type stored in the
                         // transcode and case tables, XMLByte is a single byte of encoded text.
                         // NOTE: #ifndef tests preprocessor macros only, so each guard skips its typedef
                         // only when the name has been #define'd as a macro; an already existing
                         // typedef with the same name is not detected by it.
1.41      misha      38: #ifndef XMLCh 
                     39:        typedef unsigned int XMLCh;
                     40: #endif
                     41: #ifndef XMLByte
                     42:        typedef unsigned char XMLByte;
                     43: #endif
1.3       paf        44: 
1.23      paf        45: // helpers
                     46: 
                         // HashString instantiated with String::Body values; this is the container type
                         // accepted by Charset::transcode(HashStringString&, ...).
1.42      misha      47: typedef HashString<String::Body> HashStringString;
1.23      paf        48: 
1.1       paf        49: /**    Charset: holds a charset's name and transcode tables and
1.23      paf        50:        registers libxml transcoders.
1.1       paf        51: */
1.23      paf        52: class Charset: public PA_Object {
1.1       paf        53: public:
                     54: 
                                /// Constructs the charset named ANAME.
                                /// NOTE(review): afile_spec is passed by pointer, so it is presumably optional
                                /// (a definition file handed to load_definition() when present) -- confirm
                                /// against the implementation.
1.24      paf        55:        Charset(Request_charsets* charsets, const String::Body ANAME, const String* afile_spec);
1.1       paf        56:        
                                /// Charset name as a String::Body (returns FNAME).
1.24      paf        57:        const String::Body NAME() const { return FNAME; }
                                /// Charset name as a C string (returns FNAME_CSTR).
1.30      paf        58:        const char* NAME_CSTR() const { return FNAME_CSTR; }
1.1       paf        59: 
                                /// Returns the fisUTF8 flag.
                     60:        bool isUTF8() const { return fisUTF8; }
                     61: 
                                // transcode(): static overloads that take the text plus a source and a
                                // destination charset. The value-returning overloads hand back the result;
                                // the ArrayString and HashStringString overloads return void and take a
                                // non-const reference (NOTE(review): presumably they convert the contained
                                // strings in place -- confirm).
1.53      moko       62:        static String::C transcode(const String::C src, const Charset& source_charset, const Charset& dest_charset);
                     63:        static String::Body transcode(const String::Body src, const Charset& source_transcoder, const Charset& dest_transcoder);
                                /// Convenience overload: wraps src in a String::Body and forwards the call.
1.61    ! moko       64:        static String::Body transcode(const char* src,const Charset& source_transcoder, const Charset& dest_transcoder){ return transcode(String::Body(src), source_transcoder, dest_transcoder); }
1.53      moko       65:        static String& transcode(const String& src, const Charset& source_transcoder, const Charset& dest_transcoder);
                     66:        static void transcode(ArrayString& src, const Charset& source_transcoder, const Charset& dest_transcoder);
                     67:        static void transcode(HashStringString& src, const Charset& source_transcoder, const Charset& dest_transcoder);
                     68: 
                                // escape(): static overloads for the three string representations; src is
                                // passed together with its charset. The escape format itself is defined in
                                // the implementation, not in this header.
                     69:        static String::C escape(const String::C src, const Charset& source_charset);
                     70:        static String::Body escape(const String::Body src, const Charset& source_charset);
                     71:        static String& escape(const String& src, const Charset& source_charset);
                     72: 
                                // escape_JSON(): same set of overloads as escape(), for the JSON variant.
                     73:        static String::C escape_JSON(const String::C src, const Charset& source_charset);
                     74:        static String::Body escape_JSON(const String::Body src, const Charset& source_charset);
                     75:        static String& escape_JSON(const String& src, const Charset& source_charset);
1.38      misha      76: 
                                /// NOTE(review): presumably writes the byte for character src at outPtr and
                                /// advances outPtr (it is taken by reference), using not_found when this
                                /// charset has no mapping for src -- confirm in the implementation.
1.35      misha      77:        void store_Char(XMLByte*& outPtr, XMLCh src, XMLByte not_found);
1.33      misha      78: 
1.9       paf        79: #ifdef XML
                                /// Returns the libxml encoding handler; declared only when XML is defined.
1.24      paf        80:        xmlCharEncodingHandler& transcoder(const String::Body NAME);
1.9       paf        81: #endif
                     82: 
1.1       paf        83: public:
                     84: 
                                /// Public buffer of tables_length bytes; tables_length is not declared in this
                                /// file (NOTE(review): presumably it comes from pa_pcre_internal.h and the
                                /// buffer holds PCRE character tables for this charset -- confirm).
                     85:        unsigned char pcre_tables[tables_length];
                     86: 
                     87: private:
                     88: 
                                // Set-up helpers. NOTE(review): judging by the names, load_definition() reads
                                // the charset definition file and sort_ToTable() orders tables.toTable --
                                // confirm in the implementation.
1.23      paf        89:        void load_definition(Request_charsets& charsets, const String& afile_spec);
1.1       paf        90:        void sort_ToTable();
                     91: 
                                // Per-instance conversion helpers; presumably the building blocks of the
                                // static transcode() overloads above (confirm).
1.23      paf        92:        const String::C transcodeToUTF8(const String::C src) const;
                     93:        const String::C transcodeFromUTF8(const String::C src) const;
                     94:        
                     95:        const String::C transcodeToCharset(const String::C src,
                     96:                const Charset& dest_transcoder) const;
1.1       paf        97: 
1.4       paf        98: public:
                     99: 
                                /// Transcode tables of one charset.
                    100:        struct Tables {
                                        /// One toTable entry: an XMLCh code paired with a byte.
1.25      paf       101:                struct Rec {
                    102:                        XMLCh intCh;
                    103:                        XMLByte extCh;
                    104:                };
                    105: 
                                        // fromTable has 0x100 entries, i.e. one XMLCh per possible byte value.
1.4       paf       106:                XMLCh fromTable[0x100];
                                        // toTable holds at most MAX_CHARSET_UNI_CODES records; toTableSize is
                                        // presumably the number of records actually in use (confirm).
1.25      paf       107:                Rec toTable[MAX_CHARSET_UNI_CODES];
1.4       paf       108:                uint toTableSize;
                    109:        };
                    110: 
                                /// Case-mapping table: `size` and a pointer to Rec{from, to} pairs. This is the
                                /// type of the UTF8CaseToUpper / UTF8CaseToLower externs declared after the class.
1.25      paf       111:        struct UTF8CaseTable {
                    112:                struct Rec {
                    113:                        XMLCh from, to;
                    114:                };
                    115: 
                    116:                uint size;
                    117:                Rec* records;
                    118:        };
                    119: 
1.1       paf       120: private:
                    121: 
                                // Data behind NAME(), NAME_CSTR() and isUTF8(), plus this charset's tables.
1.24      paf       122:        const String::Body FNAME;
1.23      paf       123:        char* FNAME_CSTR;
1.1       paf       124:        bool fisUTF8;
1.4       paf       125:        Tables tables;
                    126: 
                                // Static helpers behind escape(). NOTE(review): the calc_* functions return a
                                // size_t, presumably the dest size the matching escape* function needs --
                                // confirm. calc_escaped_length_UTF8 takes a non-const XMLByte*, unlike its
                                // siblings, which all take const XMLByte*.
1.45      misha     127:        static size_t calc_escaped_length_UTF8(XMLByte* src, size_t src_length);
1.46      moko      128:        static size_t calc_escaped_length(const XMLByte* src, size_t src_length, const Charset::Tables& tables);
1.45      misha     129:        static size_t calc_escaped_length(const String::C src, const Charset& source_charset);
                    130:        static size_t escape_UTF8(const XMLByte* src, size_t src_length, XMLByte* dest);
                    131:        static size_t escape(const XMLByte* src, size_t src_length, XMLByte* dest, const Charset::Tables& tables);
                    132: 
                                // The same helper set for escape_JSON(); calc_JSON_escaped_length_UTF8 also
                                // takes a non-const XMLByte*.
                    133:        static size_t calc_JSON_escaped_length_UTF8(XMLByte* src, size_t src_length);
1.46      moko      134:        static size_t calc_JSON_escaped_length(const XMLByte* src, size_t src_length, const Charset::Tables& tables);
1.45      misha     135:        static size_t calc_JSON_escaped_length(const String::C src, const Charset& source_charset);
                    136:        static size_t escape_JSON_UTF8(const XMLByte* src, size_t src_length, XMLByte* dest);
                    137:        static size_t escape_JSON(const XMLByte* src, size_t src_length, XMLByte* dest, const Charset::Tables& tables);
                    138: 
                                // Everything from here to the matching #endif exists only when XML is defined.
1.47      misha     139: #ifdef XML
                    140: 
                    141: private:
                                // NOTE(review): presumably these register the charset with libxml and set up
                                // ftranscoder -- confirm in the implementation.
                    142:        void addEncoding(char* name_cstr);
                    143:        void initTranscoder(const String::Body name, const char* name_cstr);
                    144: 
1.1       paf       145: public:
1.23      paf       146:        /// converts xmlChar* null-terminated string to char* 
1.32      paf       147:        String::C transcode_cstr(const xmlChar* s);
1.8       paf       148:        /// converts xmlChar* null-terminated string to parser String
1.32      paf       149:        const String& transcode(const xmlChar* s);
1.23      paf       150: 
                    151:        /** converts sized char*  to xmlChar*
                    152:                @returns xmlChar*  WHICH CALLER SHOULD FREE
1.16      paf       153:        */
1.23      paf       154:        xmlChar* transcode_buf2xchar(const char* buf, size_t buf_size);
1.32      paf       155:        /// converts parser String to xmlChar*
                    156:        xmlChar* transcode(const String& s);
                    157:        /// converts parser String::Body to xmlChar*
                    158:        xmlChar* transcode(const String::Body s);
1.1       paf       159: 
                    160: private:
                    161: 
                                /// libxml encoding handler of this charset (see transcoder()).
1.23      paf       162:        xmlCharEncodingHandler* ftranscoder;
1.1       paf       163: 
                    164: #endif
                    165: 
1.5       paf       166: };
1.25      paf       167: 
                    168: 
                    169: // externs
                    170: 
                         // Case tables defined elsewhere (NOTE(review): presumably the values passed as
                         // `table` to change_case_UTF8 -- confirm).
                    171: extern Charset::UTF8CaseTable UTF8CaseToUpper;
                    172: extern Charset::UTF8CaseTable UTF8CaseToLower;
                         // Takes srcLen bytes at srcData, an output buffer toFill of toFillLen bytes and
                         // the case table to apply; returns nothing.
1.28      paf       173: void change_case_UTF8(const XMLByte* srcData, size_t srcLen,
1.43      misha     174:                                        XMLByte* toFill, size_t toFillLen,
                    175:                                        const Charset::UTF8CaseTable& table);
                         // Position conversions within the byte range [srcBegin, srcEnd): the first maps
                         // a position in characters to a size_t result, the second a position in bytes
                         // (NOTE(review): presumably to the byte offset and the character index
                         // respectively, as the names say -- confirm).
1.37      misha     176: size_t getUTF8BytePos(const XMLByte* srcBegin, const XMLByte* srcEnd, size_t charPos/*position in characters*/);
                    177: size_t getUTF8CharPos(const XMLByte* srcBegin, const XMLByte* srcEnd, size_t bytePos/*position in bytes*/);
                         // NOTE(review): presumably the number of characters in [srcBegin, srcEnd) -- confirm.
                    178: size_t lengthUTF8(const XMLByte* srcBegin, const XMLByte* srcEnd);
                         // NOTE(review): presumably the byte length of the UTF-8 sequence whose first
                         // byte is c -- confirm.
1.44      misha     179: unsigned int lengthUTF8Char(const XMLByte c);
1.25      paf       180: 
                         // NOTE(review): presumably returns a valid-UTF-8 version of src; ownership of
                         // the returned pointer is not visible here -- confirm.
1.52      moko      181: const char *fixUTF8(const char *src);
1.43      misha     182: 
                    183: class UTF8_string_iterator {
                    184:        public:
                    185:                UTF8_string_iterator(const String& astring): fsrcPtr((XMLByte*)astring.cstr()), fsrcEnd(fsrcPtr + astring.length()) {}
                    186:                UTF8_string_iterator(XMLByte* asrcPtr, size_t length): fsrcPtr(asrcPtr), fsrcEnd(fsrcPtr + length) {}
                    187: 
                    188:                bool has_next();
                    189:                XMLCh next() { return fUTF8Char; }
                    190:                XMLByte getFirstByte(){ return ffirstByte; }
                    191:                size_t getCharSize(){ return fcharSize; }
                    192:        private:
                    193:                const XMLByte* fsrcPtr;
                    194:                const XMLByte* fsrcEnd;
                    195:                size_t fcharSize;
                    196:                XMLByte ffirstByte;
                    197:                XMLCh fUTF8Char;
                    198: };
                    199: 
1.1       paf       200: #endif

E-mail: