Annotation of parser3/src/main/pa_transcoder.C, revision 1.2
1.1 paf 1: /** @file
2: Parser: Transcoder impl.
3:
4: Copyright (c) 2001 ArtLebedev Group (http://www.artlebedev.com)
5: Author: Alexander Petrosyan <paf@design.ru> (http://paf.design.ru)
6:
1.2 ! paf 7: $Id: pa_transcoder.C,v 1.1 2001/12/14 12:55:36 paf Exp $
1.1 paf 8: */
9:
10: #include "pa_common.h"
11: #include "pa_transcoder.h"
12: #include "pa_exception.h"
13:
14: static int sort_cmp_Trans_rec_intCh(const void *a, const void *b) {
15: return
16: static_cast<const Transcoder_TransRec *>(a)->intCh-
17: static_cast<const Transcoder_TransRec *>(b)->intCh;
18: }
19:
20: void Transcoder::sort_ToTable() {
21: _qsort(toTable, toTableSize, sizeof(*toTable),
22: sort_cmp_Trans_rec_intCh);
23: //FILE *f=fopen("c:\\temp\\a", "wb");
24: //fwrite(toTable, toTableSize, sizeof(*toTable), f);
25: //fclose(f);
26: }
27:
28: XMLByte Transcoder::xlatOneTo(const XMLCh toXlat) const {
29: unsigned int lowOfs = 0;
30: unsigned int hiOfs = toTableSize - 1;
31: XMLByte curByte = 0;
32: do {
33: // Calc the mid point of the low and high offset.
34: const unsigned int midOfs = ((hiOfs - lowOfs) / 2) + lowOfs;
35:
36: // If our test char is greater than the mid point char, then
37: // we move up to the upper half. Else we move to the lower
38: // half. If its equal, then its our guy.
39: if (toXlat > toTable[midOfs].intCh)
40: lowOfs = midOfs;
41: else if (toXlat < toTable[midOfs].intCh)
42: hiOfs = midOfs;
43: else
44: return toTable[midOfs].extCh;
45: } while(lowOfs + 1 < hiOfs);
46:
47: return 0;
48: }
49:
50: void transcoder_transcode(Pool& pool,
51: const Transcoder *source_transcoder, const void *source_body, size_t source_content_length,
52: const Transcoder *dest_transcoder, const void *& dest_body, size_t& dest_content_length
53: ) {
54: switch((source_transcoder?0x10:0x00)|(dest_transcoder?0x01:0x00)) {
55: case 0x00:
56: dest_body=source_body;
57: dest_content_length=source_content_length;
58: break;
59: case 0x10:
60: source_transcoder->transcodeToUTF8(pool,
61: source_body, source_content_length,
62: dest_body, dest_content_length);
63: break;
64: case 0x01:
65: dest_transcoder->transcodeFromUTF8(pool,
66: source_body, source_content_length,
67: dest_body, dest_content_length);
68: break;
69: default: // 0x11
70: source_transcoder->transcodeToTranscoder(pool, *dest_transcoder,
71: source_body, source_content_length,
72: dest_body, dest_content_length);
73: break;
74: }
75: }
76:
77: // ---------------------------------------------------------------------------
78: // Local static data
79: //
80: // gUTFBytes
81: // A list of counts of trailing bytes for each initial byte in the input.
82: //
83: // gUTFOffsets
84: // A list of values to offset each result char type, according to how
85: // many source bytes went into making it.
86: //
87: // gFirstByteMark
88: // A list of values to mask onto the first byte of an encoded sequence,
89: // indexed by the number of bytes used to create the sequence.
90: // ---------------------------------------------------------------------------
91: static const XMLByte gUTFBytes[0x100] = {
92: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
93: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
94: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
95: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
96: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
97: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
98: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
99: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
100: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
101: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
102: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
103: , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
104: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
105: , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
106: , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
107: , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
108: };
109:
110: static const uint gUTFOffsets[6] = {
111: 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080
112: };
113:
114: static const XMLByte gFirstByteMark[7] = {
115: 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
116: };
1.2 ! paf 117:
! 118: /// @todo not so memory-hungry with prescan
! 119: void Transcoder::transcodeToUTF8(Pool& pool,
! 120: const void *source_body, size_t source_content_length,
! 121: const void *& adest_body, size_t& adest_content_length) const {
! 122:
! 123: size_t dest_content_length=0;
! 124: XMLByte *dest_body=(XMLByte*)pool.malloc(source_content_length*6/*so that surly enough*/);
! 125:
! 126: const XMLByte* srcPtr=(const XMLByte*)source_body;
! 127: const XMLByte* srcEnd=(const XMLByte*)source_body+source_content_length;
! 128: XMLByte* outPtr=dest_body;
! 129:
! 130: while (srcPtr < srcEnd) {
! 131: uint curVal = fromTable[*srcPtr];
! 132:
! 133: // Figure out how many bytes we need
! 134: unsigned int encodedBytes;
! 135: if (curVal < 0x80)
! 136: encodedBytes = 1;
! 137: else if (curVal < 0x800)
! 138: encodedBytes = 2;
! 139: else if (curVal < 0x10000)
! 140: encodedBytes = 3;
! 141: else if (curVal < 0x200000)
! 142: encodedBytes = 4;
! 143: else if (curVal < 0x4000000)
! 144: encodedBytes = 5;
! 145: else if (curVal <= 0x7FFFFFFF)
! 146: encodedBytes = 6;
! 147: else {
! 148: // use the replacement character
! 149: *outPtr++ = '?';
! 150: srcPtr ++;
! 151: continue;
! 152: }
! 153:
! 154: // If we cannot fully get this char into the output buffer,
! 155: // never
! 156:
! 157: // We can do it, so update the source index
! 158: srcPtr++;
! 159:
! 160: // And spit out the bytes. We spit them out in reverse order
! 161: // here, so bump up the output pointer and work down as we go.
! 162: outPtr += encodedBytes;
! 163: switch(encodedBytes) {
! 164: case 6 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
! 165: curVal >>= 6;
! 166: case 5 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
! 167: curVal >>= 6;
! 168: case 4 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
! 169: curVal >>= 6;
! 170: case 3 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
! 171: curVal >>= 6;
! 172: case 2 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);
! 173: curVal >>= 6;
! 174: case 1 : *--outPtr = XMLByte(curVal | gFirstByteMark[encodedBytes]);
! 175: }
! 176:
! 177: // Add the encoded bytes back in again to indicate we've eaten them
! 178: outPtr += encodedBytes;
! 179: }
! 180:
! 181: // return
! 182: adest_body=dest_body;
! 183: adest_content_length=outPtr-dest_body;
! 184: }
1.1 paf 185: void Transcoder::transcodeFromUTF8(Pool& pool,
186: const void *source_body, size_t source_content_length,
187: const void *& adest_body, size_t& adest_content_length) const {
188: size_t dest_content_length=0;
189: XMLByte *dest_body=(XMLByte*)pool.malloc(source_content_length/*surly enough*/);
190:
191: const XMLByte* srcPtr=(const XMLByte*)source_body;
192: const XMLByte* srcEnd=(const XMLByte*)source_body+source_content_length;
193: XMLByte* outPtr=dest_body;
194:
195: // We now loop until we either run out of input data
196: while (srcPtr < srcEnd) {
197: // Get the next leading byte out
198: const XMLByte firstByte = *srcPtr;
199:
200: // Special-case ASCII, which is a leading byte value of <= 127
201: if (firstByte <= 127) {
202: *outPtr++ = firstByte;
203: srcPtr++;
204: continue;
205: }
206:
207: // See how many trailing src bytes this sequence is going to require
208: const unsigned int trailingBytes = gUTFBytes[firstByte];
209:
210: // If there are not enough source bytes to do this one, then we
211: // are done. Note that we done >= here because we are implicitly
212: // counting the 1 byte we get no matter what.
213: if (srcPtr + trailingBytes >= srcEnd)
214: break;
215:
216: // Looks ok, so lets build up the value
217: uint tmpVal=0;
218: switch(trailingBytes) {
219: case 5: tmpVal+=*srcPtr++; tmpVal<<=6;
220: case 4: tmpVal+=*srcPtr++; tmpVal<<=6;
221: case 3: tmpVal+=*srcPtr++; tmpVal<<=6;
222: case 2: tmpVal+=*srcPtr++; tmpVal<<=6;
223: case 1: tmpVal+=*srcPtr++; tmpVal<<=6;
224: case 0: tmpVal+=*srcPtr++;
225: break;
226:
227: default:
228: throw Exception(0, 0,
229: 0,
230: "transcodeFromUTF8 error: wrong trailingBytes value (%d)", trailingBytes);
231: }
232: tmpVal-=gUTFOffsets[trailingBytes];
233:
234: // If it will fit into a single char, then put it in. Otherwise
235: // fail [*encode it as a surrogate pair. If its not valid, use the
236: // replacement char.*]
237: if (!(tmpVal & 0xFFFF0000))
238: *outPtr++ = xlatOneTo(tmpVal);
239: else
240: throw Exception(0, 0,
241: 0,
242: "transcodeFromUTF8 error: too big tmpVal (0x%08X)", tmpVal);
243: }
244:
245: // return
246: adest_body=dest_body;
247: adest_content_length=outPtr-dest_body;
248: }
249:
250: /// transcode using both transcoders
251: void Transcoder::transcodeToTranscoder(Pool& pool,
252: const Transcoder& dest_transcoder,
253: const void *source_body, size_t source_content_length,
254: const void *& adest_body, size_t& dest_content_length) const {
255: throw Exception(0, 0,
256: 0,
257: "transcodeToTranscoder not supported(yet)");
258: /*
259: void *dest_body;
260:
261: dest_body=pool.malloc(dest_content_length=source_content_length);
262: // dummy
263: memset(dest_body, '?', dest_content_length);
264:
265: adest_body=dest_body;*/
266: }
E-mail: