Annotation of parser3/src/main/pa_charset.C, revision 1.2
1.1 paf 1: /** @file
2: Parser: Charset connection implementation.
3:
4: Copyright (c) 2001 ArtLebedev Group (http://www.artlebedev.com)
5: Author: Alexander Petrosyan <paf@design.ru> (http://paf.design.ru)
6:
1.2 ! paf 7: $Id: pa_charset.C,v 1.1 2001/12/15 21:28:21 paf Exp $
1.1 paf 8: */
9:
10: #include "pa_charset.h"
11: //#include "pa_exception.h"
12: //#include "pa_common.h"
13: //#include "pa_threads.h"
14:
15: #ifdef XML
16: # include <util/TransENameMap.hpp>
17: # include <util/XML256TableTranscoder.hpp>
18: # include <util/PlatformUtils.hpp>
19: # include <PlatformSupport/XalanTranscodingServices.hpp>
20: #endif
21:
22: // globals
23:
24:
25: // consts
26:
27: #define MAX_CHARSET_UNI_CODES 500
28:
29: // helpers
30:
31: inline void prepare_case_tables(unsigned char *tables) {
32: unsigned char *lcc_table=tables+lcc_offset;
33: unsigned char *fcc_table=tables+fcc_offset;
34: for(int i=0; i<0x100; i++)
35: lcc_table[i]=fcc_table[i]=i;
36: }
37: inline void cstr2ctypes(unsigned char *tables, const unsigned char *cstr,
38: unsigned char bit) {
39: unsigned char *ctypes_table=tables+ctypes_offset;
40: ctypes_table[0]=bit;
41: for(; *cstr; cstr++) {
42: unsigned char c=*cstr;
43: ctypes_table[c]|=bit;
44: }
45: }
/// Converts a table cell into a character code.
///  - null or empty cell: 0
///  - cell of exactly one character: the code of that character
///    (so "5" yields the code of the digit '5', not the number 5)
///  - anything longer: the number parsed by strtol with automatic base
///    detection (decimal, 0x-hex, 0-octal)
inline unsigned int to_wchar_code(const char *cstr) {
	const bool is_empty=!cstr || !*cstr;
	if(is_empty)
		return 0;

	const bool is_single_char=cstr[1]==0;
	if(is_single_char) {
		const unsigned char literal=(unsigned char)cstr[0];
		return (unsigned int)literal;
	}

	// the end position of the parse is of no interest here
	const long parsed=strtol(cstr, 0, 0);
	return (unsigned int)parsed;
}
/// A cell counts as "true" when it is present and not empty;
/// its actual text does not matter.
inline bool to_bool(const char *cstr) {
	if(!cstr)
		return false;
	return cstr[0]!='\0';
}
58: static void element2ctypes(unsigned char c, bool belongs,
59: unsigned char *tables, unsigned char bit, int group_offset=-1) {
60: if(!belongs)
61: return;
62:
63: unsigned char *ctypes_table=tables+ctypes_offset;
64:
65: ctypes_table[c]|=bit;
66: if(group_offset>=0)
67: tables[cbits_offset+group_offset+c/8] |= 1 << (c%8);
68: }
69: static void element2case(unsigned char from, unsigned char to,
70: unsigned char *tables) {
71: if(!to)
72: return;
73:
74: unsigned char *lcc_table=tables+lcc_offset;
75: unsigned char *fcc_table=tables+fcc_offset;
76: lcc_table[from]=to;
77: fcc_table[from]=to; fcc_table[to]=from;
78: }
79:
80: #ifdef XML
81: template <class TType> class ENameMapFor2 : public ENameMap {
82: public :
83: // -----------------------------------------------------------------------
84: // Constructors and Destructor
85: // -----------------------------------------------------------------------
86: ENameMapFor2(
87: const XMLCh* const encodingName
88: , const XMLCh* const fromTable
89: , const XMLTransService::TransRec* const toTable
90: , const unsigned int toTableSize
91: ) : ENameMap(encodingName),
92: ffromTable(fromTable),
93: ftoTable(toTable),
94: ftoTableSize(toTableSize) {}
95:
96: // -----------------------------------------------------------------------
97: // Implementation of virtual factory method
98: // -----------------------------------------------------------------------
99: virtual XMLTranscoder* makeNew(const unsigned int blockSize) const {
100: return new TType(
101: getKey(),
102: blockSize,
103: ffromTable,
104: ftoTable, ftoTableSize);
105: }
106: private:
107: const XMLCh* const ffromTable;
108: const XMLTransService::TransRec* const ftoTable;
109: const unsigned int ftoTableSize;
110:
111: private :
112: // -----------------------------------------------------------------------
113: // Unimplemented constructors and operators
114: // -----------------------------------------------------------------------
115: ENameMapFor2();
116: ENameMapFor2(const ENameMapFor2<TType>&);
117: void operator=(const ENameMapFor2<TType>&);
118: };
119:
120: class XML256TableTranscoder2 : public XML256TableTranscoder {
121: public :
122: XML256TableTranscoder2(
123: const XMLCh* const encodingName
124: , const unsigned int blockSize
125: , const XMLCh* const fromTable
126: , const XMLTransService::TransRec* const toTable
127: , const unsigned int toTableSize
128: ) : XML256TableTranscoder(encodingName, blockSize, fromTable, toTable, toTableSize) {}
129:
130: private :
131: XML256TableTranscoder2();
132: XML256TableTranscoder2(const XML256TableTranscoder2&);
133: void operator=(const XML256TableTranscoder2&);
134: };
135: #endif
136:
137: // methods
138:
139: extern "C" unsigned char pcre_default_tables[]; // pcre/chartables.c
140: Charset::Charset(Pool& apool, const String& aname, const String *file_spec) : Pooled(apool),
141: fname(apool) {
142: // fname
1.2 ! paf 143: char *name_cstr=(char *)malloc(aname.size()+1);
! 144: memcpy(name_cstr, aname.cstr(String::UL_AS_IS), aname.size()+1);
1.1 paf 145: fname.APPEND_CLEAN(name_cstr, aname.size(), 0, 0);
146:
147: if(file_spec) {
148: fisUTF8=false;
149: loadDefinition(*file_spec);
150: #ifdef XML
151: addEncoding(name_cstr);
152: #endif
153: } else {
154: fisUTF8=true;
155: // grab default onces [for UTF-8 so to be able to make a-z => A-Z
156: memcpy(pcre_tables, pcre_default_tables, sizeof(pcre_tables));
157: }
158:
159: #ifdef XML
160: initTranscoder(&aname, name_cstr);
161: #endif
162: }
163:
164: Charset::~Charset() {
165: #ifdef XML
166: delete transcoder;
167: #endif
168: }
169:
170: void Charset::loadDefinition(const String& file_spec) {
171: // pcre_tables
172: // lowcase, flipcase, bits digit+word+whitespace, masks
173:
174: // must not move this inside of prepare_case_tables
175: // don't know the size there
176: memset(pcre_tables, 0, sizeof(pcre_tables));
177: prepare_case_tables(pcre_tables);
178: cstr2ctypes(pcre_tables, (const unsigned char *)"*+?{^.$|()[", ctype_meta);
179:
180: // charset
181: memset(fromTable, 0, sizeof(fromTable));
182: toTable=(XMLTransService::TransRec *)calloc(
183: sizeof(XMLTransService::TransRec)*MAX_CHARSET_UNI_CODES);
184: toTableSize=0;
185: // strangly vital
186: toTable[toTableSize].intCh=0;
187: toTable[toTableSize].extCh=(XMLByte)0;
188: toTableSize++;
189:
190: // loading text
191: char *data=file_read_text(pool(), file_spec);
192:
193: // ignore header
194: getrow(&data);
195:
196: // parse cells
197: char *row;
198: while(row=getrow(&data)) {
199: // remove empty&comment lines
200: if(!*row || *row=='#')
201: continue;
202:
203: // char white-space digit hex-digit letter word lowercase unicode1 unicode2
204: unsigned int c=0;
205: char *cell;
206: for(int column=0; cell=lsplit(&row, '\t'); column++) {
207: switch(column) {
208: case 0: c=to_wchar_code(cell); break;
209: // pcre_tables
210: case 1: element2ctypes(c, to_bool(cell), pcre_tables, ctype_space, cbit_space); break;
211: case 2: element2ctypes(c, to_bool(cell), pcre_tables, ctype_digit, cbit_digit); break;
212: case 3: element2ctypes(c, to_bool(cell), pcre_tables, ctype_xdigit); break;
213: case 4: element2ctypes(c, to_bool(cell), pcre_tables, ctype_letter); break;
214: case 5: element2ctypes(c, to_bool(cell), pcre_tables, ctype_word, cbit_word); break;
215: case 6: element2case(c, to_wchar_code(cell), pcre_tables); break;
216: case 7:
217: case 8:
218: // charset
219: if(toTableSize>MAX_CHARSET_UNI_CODES)
220: throw Exception(0, 0,
221: &file_spec,
222: "charset must contain not more then %d unicode values", MAX_CHARSET_UNI_CODES);
223:
224: XMLCh unicode=(XMLCh)to_wchar_code(cell);
225: if(!unicode && column==7/*unicode1 column*/)
226: unicode=(XMLCh)c;
227: if(unicode) {
228: if(!fromTable[c])
229: fromTable[c]=unicode;
230: toTable[toTableSize].intCh=unicode;
231: toTable[toTableSize].extCh=(XMLByte)c;
232: toTableSize++;
233: }
234: break;
235: }
236: }
237: };
238:
239: // sort by the Unicode code point
240: sort_ToTable();
241: }
242:
243: #ifdef XML
244: void Charset::addEncoding(const char *name_cstr) {
245: // addEncoding
246: XalanDOMString sencoding(name_cstr);
247: const XMLCh* const auto_encoding_cstr=sencoding.c_str();
248: int size=sizeof(XMLCh)*(sencoding.size()+1);
249: XMLCh* pool_encoding_cstr=(XMLCh*)malloc(size);
250: memcpy(pool_encoding_cstr, auto_encoding_cstr, size);
251: XMLString::upperCase(pool_encoding_cstr);
252:
253: XMLPlatformUtils::fgTransService->addEncoding(
254: pool_encoding_cstr,
255: new ENameMapFor2<XML256TableTranscoder2>(
256: pool_encoding_cstr
257: , fromTable
258: , toTable
259: , toTableSize
260: ));
261: }
262:
263: void Charset::initTranscoder(const String *source, const char *name_cstr) {
264: XMLTransService::Codes resValue;
265: transcoder=XMLPlatformUtils::fgTransService->makeNewTranscoderFor(name_cstr, resValue, 60);
266: if(!transcoder)
267: throw Exception(0, 0,
268: source,
269: "unsupported encoding");
270: }
271: #endif
272:
273: static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
274: return
275: static_cast<const Charset_TransRec *>(a)->intCh-
276: static_cast<const Charset_TransRec *>(b)->intCh;
277: }
278:
279: void Charset::sort_ToTable() {
280: _qsort(toTable, toTableSize, sizeof(*toTable),
281: sort_cmp_Trans_rec_intCh);
282: //FILE *f=fopen("c:\\temp\\a", "wb");
283: //fwrite(toTable, toTableSize, sizeof(*toTable), f);
284: //fclose(f);
285: }
286:
287: XMLByte Charset::xlatOneTo(const XMLCh toXlat) const {
288: unsigned int lowOfs = 0;
289: unsigned int hiOfs = toTableSize - 1;
290: XMLByte curByte = 0;
291: do {
292: // Calc the mid point of the low and high offset.
293: const unsigned int midOfs = ((hiOfs - lowOfs) / 2) + lowOfs;
294:
295: // If our test char is greater than the mid point char, then
296: // we move up to the upper half. Else we move to the lower
297: // half. If its equal, then its our guy.
298: if (toXlat > toTable[midOfs].intCh)
299: lowOfs = midOfs;
300: else if (toXlat < toTable[midOfs].intCh)
301: hiOfs = midOfs;
302: else
303: return toTable[midOfs].extCh;
304: } while(lowOfs + 1 < hiOfs);
305:
306: return '?';
307: }
308:
309: void Charset::transcode(Pool& pool,
310: const Charset& source_charset, const void *source_body, size_t source_content_length,
311: const Charset& dest_charset, const void *& dest_body, size_t& dest_content_length
312: ) {
313: switch((source_charset.isUTF8()?0x10:0x00)|(dest_charset.isUTF8()?0x01:0x00)) {
314: default: // 0x00
315: source_charset.transcodeToCharset(pool, dest_charset,
316: source_body, source_content_length,
317: dest_body, dest_content_length);
318: break;
319: case 0x01:
320: source_charset.transcodeToUTF8(pool,
321: source_body, source_content_length,
322: dest_body, dest_content_length);
323: break;
324: case 0x10:
325: dest_charset.transcodeFromUTF8(pool,
326: source_body, source_content_length,
327: dest_body, dest_content_length);
328: break;
329: case 0x11:
330: dest_body=source_body;
331: dest_content_length=source_content_length;
332: break;
333: }
334: }
335:
336: // ---------------------------------------------------------------------------
337: // Local static data
338: //
339: // gUTFBytes
340: // A list of counts of trailing bytes for each initial byte in the input.
341: //
342: // gUTFOffsets
343: // A list of values to offset each result char type, according to how
344: // many source bytes when into making it.
345: //
346: // gFirstByteMark
347: // A list of values to mask onto the first byte of an encoded sequence,
348: // indexed by the number of bytes used to create the sequence.
349: // ---------------------------------------------------------------------------
350: static const XMLByte gUTFBytes[0x100] = {
351: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
352: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
353: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
354: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
355: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
356: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
357: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
358: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
359: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
360: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
361: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
362: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
363: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
364: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
365: , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
366: , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
367: };
368:
369: static const uint gUTFOffsets[6] = {
370: 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
371: };
372:
373: static const XMLByte gFirstByteMark[7] = {
374: 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
375: };
376:
377: /// @todo not so memory-hungry with prescan
378: void Charset::transcodeToUTF8(Pool& pool,
379: const void *source_body, size_t source_content_length,
380: const void *& adest_body, size_t& adest_content_length) const {
381:
382: size_t dest_content_length=0;
383: XMLByte *dest_body=(XMLByte*)pool.malloc(source_content_length*6/*so that surly enough*/);
384:
385: const XMLByte* srcPtr=(const XMLByte*)source_body;
386: const XMLByte* srcEnd=(const XMLByte*)source_body+source_content_length;
387: XMLByte* outPtr=dest_body;
388:
389: while (srcPtr < srcEnd) {
390: uint curVal = fromTable[*srcPtr];
391: if(!curVal) {
392: // use the replacement character
393: *outPtr++ = '?';
394: srcPtr ++;
395: continue;
396: }
397:
398: // Figure out how many bytes we need
399: unsigned int encodedBytes;
400: if (curVal < 0x80)
401: encodedBytes = 1;
402: else if (curVal < 0x800)
403: encodedBytes = 2;
404: else if (curVal < 0x10000)
405: encodedBytes = 3;
406: else if (curVal < 0x200000)
407: encodedBytes = 4;
408: else if (curVal < 0x4000000)
409: encodedBytes = 5;
410: else if (curVal <= 0x7FFFFFFF)
411: encodedBytes = 6;
412: else {
413: // use the replacement character
414: *outPtr++ = '?';
415: srcPtr ++;
416: continue;
417: }
418:
419: // If we cannot fully get this char into the output buffer,
420: // never
421:
422: // We can do it, so update the source index
423: srcPtr++;
424:
425: // And spit out the bytes. We spit them out in reverse order
426: // here, so bump up the output pointer and work down as we go.
427: outPtr += encodedBytes;
428: switch(encodedBytes) {
429: case 6 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
430: curVal >>= 6;
431: case 5 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
432: curVal >>= 6;
433: case 4 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
434: curVal >>= 6;
435: case 3 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
436: curVal >>= 6;
437: case 2 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
438: curVal >>= 6;
439: case 1 : *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]);
440: }
441:
442: // Add the encoded bytes back in again to indicate we've eaten them
443: outPtr += encodedBytes;
444: }
445:
446: // return
447: adest_body=dest_body;
448: adest_content_length=outPtr-dest_body;
449: }
450: void Charset::transcodeFromUTF8(Pool& pool,
451: const void *source_body, size_t source_content_length,
452: const void *& adest_body, size_t& adest_content_length) const {
453: size_t dest_content_length=0;
454: XMLByte *dest_body=(XMLByte*)pool.malloc(source_content_length/*surly enough*/);
455:
456: const XMLByte* srcPtr=(const XMLByte*)source_body;
457: const XMLByte* srcEnd=(const XMLByte*)source_body+source_content_length;
458: XMLByte* outPtr=dest_body;
459:
460: // We now loop until we either run out of input data
461: while (srcPtr < srcEnd) {
462: // Get the next leading byte out
463: const XMLByte firstByte = *srcPtr;
464:
465: // Special-case ASCII, which is a leading byte value of <= 127
466: if (firstByte <= 127) {
467: *outPtr++ = firstByte;
468: srcPtr++;
469: continue;
470: }
471:
472: // See how many trailing src bytes this sequence is going to require
473: const unsigned int trailingBytes = gUTFBytes[firstByte];
474:
475: // If there are not enough source bytes to do this one, then we
476: // are done. Note that we done >= here because we are implicitly
477: // counting the 1 byte we get no matter what.
478: if (srcPtr + trailingBytes >= srcEnd)
479: break;
480:
481: // Looks ok, so lets build up the value
482: uint tmpVal=0;
483: switch(trailingBytes) {
484: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
485: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
486: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
487: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
488: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
489: case 0: tmpVal+=*srcPtr++;
490: break;
491:
492: default:
493: throw Exception(0, 0,
494: 0,
495: "transcodeFromUTF8 error: wrong trailingBytes value (%d)", trailingBytes);
496: }
497: tmpVal-=gUTFOffsets[trailingBytes];
498:
499: // If it will fit into a single char, then put it in. Otherwise
500: // fail [*encode it as a surrogate pair. If its not valid, use the
501: // replacement char.*]
502: if (!(tmpVal & 0xFFFF0000))
503: *outPtr++ = xlatOneTo(tmpVal);
504: else
505: throw Exception(0, 0,
506: 0,
507: "transcodeFromUTF8 error: too big tmpVal (0x%08X)", tmpVal);
508: }
509:
510: // return
511: adest_body=dest_body;
512: adest_content_length=outPtr-dest_body;
513: }
514:
515: /// transcode using both charsets
516: void Charset::transcodeToCharset(Pool& pool,
517: const Charset& dest_charset,
518: const void *source_body, size_t source_content_length,
519: const void *& adest_body, size_t& dest_content_length) const {
520: throw Exception(0, 0,
521: 0,
522: "transcodeToCharset not supported(yet)");
523: /*
524: void *dest_body;
525:
526: dest_body=pool.malloc(dest_content_length=source_content_length);
527: // dummy
528: memset(dest_body, '?', dest_content_length);
529:
530: adest_body=dest_body;*/
531: }
532:
533: #ifdef XML
534: const char *Charset::transcode_cstr(const XalanDOMString& s) {
535: const unsigned int len=s.size()*2;
536: XMLByte* dest=(XMLByte *)malloc((len+1)*sizeof(XMLByte));
537: bool error=true;
538: try {
539: if(transcoder) {
540: unsigned int charsEaten;
541: unsigned int size=transcoder->transcodeTo(
542: s.c_str(), s.length(),
543: dest, len,
544: charsEaten,
545: XMLTranscoder::UnRep_RepChar //UnRep_Throw
546: );
547: dest[size]=0;
548: error=false;
549: }
550: } catch(XMLException& e) {
551: Exception::provide_source(pool(), 0, e);
552: }
553: return (const char *)dest;
554: }
555: String& Charset::transcode(const XalanDOMString& s) {
556: return *NEW String(pool(), transcode_cstr(s));
557: }
558:
559: std::auto_ptr<XalanDOMString> Charset::transcode_buf(const char *buf, size_t buf_size) {
560: unsigned int dest_size=0;
561: XMLCh* dest=(XMLCh *)malloc((buf_size+1)*sizeof(XMLCh));
562: unsigned char *charSizes=(unsigned char *)malloc(buf_size*sizeof(unsigned char));
563: XalanDOMString *result;
564: try {
565: if(transcoder) {
566: unsigned int bytesEaten;
567: unsigned int dest_size=transcoder->transcodeFrom(
568: (unsigned char *)buf,
569: (const unsigned int)buf_size,
570: dest, (const unsigned int)buf_size,
571: bytesEaten,
572: charSizes
573: );
574: result=new XalanDOMString(dest, dest_size);
575: }
576: } catch(XMLException& e) {
577: Exception::provide_source(pool(), 0, e);
578: result=0; //calm, compiler
579: }
580:
581: return std::auto_ptr<XalanDOMString>(result);
582: }
583: std::auto_ptr<XalanDOMString> Charset::transcode(const String& s) {
584: const char *cstr=s.cstr(String::UL_UNSPECIFIED);
585:
586: return transcode_buf(cstr, strlen(cstr));
587: }
588: #endif
E-mail: