Annotation of parser3/src/lib/punycode/pa_convert_utf.c, revision 1.2
1.1 moko 1: /*===--- pa_convert_utf.c - Universal Character Names conversions ---------------===
2: *
3: * The LLVM Compiler Infrastructure
4: *
5: * This file is distributed under the University of Illinois Open Source
6: * License. See LICENSE.TXT for details.
7: *
8: *===------------------------------------------------------------------------=*/
9: /*
10: * Copyright 2001-2004 Unicode, Inc.
11: *
12: * Disclaimer
13: *
14: * This source code is provided as is by Unicode, Inc. No claims are
15: * made as to fitness for any particular purpose. No warranties of any
16: * kind are expressed or implied. The recipient agrees to determine
17: * applicability of information provided. If this file has been
18: * purchased on magnetic or optical media from Unicode, Inc., the
19: * sole remedy for any claim will be exchange of defective media
20: * within 90 days of receipt.
21: *
22: * Limitations on Rights to Redistribute This Code
23: *
24: * Unicode, Inc. hereby grants the right to freely use the information
25: * supplied in this file in the creation of products supporting the
26: * Unicode Standard, and to make copies of this file in any form
27: * for internal or external distribution as long as this notice
28: * remains attached.
29: */
30:
31: /* ---------------------------------------------------------------------
32:
33: Conversions between UTF32, UTF-16, and UTF-8. Source code file.
34: Author: Mark E. Davis, 1994.
35: Rev History: Rick McGowan, fixes & updates May 2001.
36: Sept 2001: fixed const & error conditions per
37: mods suggested by S. Parent & A. Lillich.
38: June 2002: Tim Dodd added detection and handling of incomplete
39: source sequences, enhanced error detection, added casts
40: to eliminate compiler warnings.
41: July 2003: slight mods to back out aggressive FFFE detection.
42: Jan 2004: updated switches in from-UTF8 conversions.
43: Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
44:
45: See the header file "ConvertUTF.h" for complete documentation.
46:
47: ------------------------------------------------------------------------ */
48:
49:
50: #include "pa_convert_utf.h"
51:
52: static const int halfShift = 10; /* used for shifting by 10 bits */
53:
54: static const UTF32 halfBase = 0x0010000UL;
55: static const UTF32 halfMask = 0x3FFUL;
56:
57: #define UNI_SUR_HIGH_START (UTF32)0xD800
58: #define UNI_SUR_HIGH_END (UTF32)0xDBFF
59: #define UNI_SUR_LOW_START (UTF32)0xDC00
60: #define UNI_SUR_LOW_END (UTF32)0xDFFF
61: #define false 0
62: #define true 1
63:
64: /* --------------------------------------------------------------------- */
65:
/*
 * Lookup keyed by the first byte of a UTF-8 sequence; the entry is the number
 * of trailing bytes that are supposed to follow that byte.
 * Entries of 4 and 5 describe the old 5- and 6-byte forms, which are not
 * *legal* UTF-8. They are kept for anyone who wants to convert that older
 * encoding, which earlier algorithms allowed.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00-0x0F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10-0x1F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20-0x2F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30-0x3F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40-0x4F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50-0x5F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60-0x6F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70-0x7F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80-0x8F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90-0x9F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0-0xAF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0-0xBF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0-0xCF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0-0xDF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0-0xEF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0-0xFF */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
83:
84: /*
85: * Magic values subtracted from a buffer value during UTF8 conversion.
86: * This table contains as many values as there might be trailing bytes
87: * in a UTF-8 sequence.
88: */
89: static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
90: 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
91:
92: /*
93: * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
94: * into the first byte, depending on how many bytes follow. There are
95: * as many entries in this table as there are UTF-8 sequence types.
96: * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
97: * for *legal* UTF-8 will be 4 or fewer bytes total.
98: */
99: static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
100:
101: /* --------------------------------------------------------------------- */
102:
103: /* The interface converts a whole buffer to avoid function-call overhead.
104: * Constants have been gathered. Loops & conditionals have been removed as
105: * much as possible for efficiency, in favor of drop-through switches.
106: * (See "Note A" at the bottom of the file for equivalent code.)
107: * If your compiler supports it, the "isLegalUTF8" call can be turned
108: * into an inline function.
109: */
110:
111:
112: /* --------------------------------------------------------------------- */
113:
114: ConversionResult pa_convertUTF32toUTF16 (
115: const UTF32** sourceStart, const UTF32* sourceEnd,
116: UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
117: ConversionResult result = conversionOK;
118: const UTF32* source = *sourceStart;
119: UTF16* target = *targetStart;
120: while (source < sourceEnd) {
121: UTF32 ch;
122: if (target >= targetEnd) {
123: result = targetExhausted; break;
124: }
125: ch = *source++;
126: if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
127: /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
128: if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
129: if (flags == strictConversion) {
130: --source; /* return to the illegal value itself */
131: result = sourceIllegal;
132: break;
133: } else {
134: *target++ = UNI_REPLACEMENT_CHAR;
135: }
136: } else {
137: *target++ = (UTF16)ch; /* normal case */
138: }
139: } else if (ch > UNI_MAX_LEGAL_UTF32) {
140: if (flags == strictConversion) {
141: result = sourceIllegal;
142: } else {
143: *target++ = UNI_REPLACEMENT_CHAR;
144: }
145: } else {
146: /* target is a character in range 0xFFFF - 0x10FFFF. */
147: if (target + 1 >= targetEnd) {
148: --source; /* Back up source pointer! */
149: result = targetExhausted; break;
150: }
151: ch -= halfBase;
152: *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
153: *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
154: }
155: }
156: *sourceStart = source;
157: *targetStart = target;
158: return result;
159: }
160:
161: /* --------------------------------------------------------------------- */
162:
163: ConversionResult pa_convertUTF16toUTF32 (
164: const UTF16** sourceStart, const UTF16* sourceEnd,
165: UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
166: ConversionResult result = conversionOK;
167: const UTF16* source = *sourceStart;
168: UTF32* target = *targetStart;
169: UTF32 ch, ch2;
170: while (source < sourceEnd) {
171: const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
172: ch = *source++;
173: /* If we have a surrogate pair, convert to UTF32 first. */
174: if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
175: /* If the 16 bits following the high surrogate are in the source buffer... */
176: if (source < sourceEnd) {
177: ch2 = *source;
178: /* If it's a low surrogate, convert to UTF32. */
179: if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
180: ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
181: + (ch2 - UNI_SUR_LOW_START) + halfBase;
182: ++source;
183: } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
184: --source; /* return to the illegal value itself */
185: result = sourceIllegal;
186: break;
187: }
188: } else { /* We don't have the 16 bits following the high surrogate. */
189: --source; /* return to the high surrogate */
190: result = sourceExhausted;
191: break;
192: }
193: } else if (flags == strictConversion) {
194: /* UTF-16 surrogate values are illegal in UTF-32 */
195: if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
196: --source; /* return to the illegal value itself */
197: result = sourceIllegal;
198: break;
199: }
200: }
201: if (target >= targetEnd) {
202: source = oldSource; /* Back up source pointer! */
203: result = targetExhausted; break;
204: }
205: *target++ = ch;
206: }
207: *sourceStart = source;
208: *targetStart = target;
209: return result;
210: }
211:
212: ConversionResult pa_convertUTF16toUTF8 (
213: const UTF16** sourceStart, const UTF16* sourceEnd,
214: UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
215: ConversionResult result = conversionOK;
216: const UTF16* source = *sourceStart;
217: UTF8* target = *targetStart;
218: while (source < sourceEnd) {
219: UTF32 ch;
220: unsigned short bytesToWrite = 0;
221: const UTF32 byteMask = 0xBF;
222: const UTF32 byteMark = 0x80;
223: const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
224: ch = *source++;
225: /* If we have a surrogate pair, convert to UTF32 first. */
226: if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
227: /* If the 16 bits following the high surrogate are in the source buffer... */
228: if (source < sourceEnd) {
229: UTF32 ch2 = *source;
230: /* If it's a low surrogate, convert to UTF32. */
231: if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
232: ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
233: + (ch2 - UNI_SUR_LOW_START) + halfBase;
234: ++source;
235: } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
236: --source; /* return to the illegal value itself */
237: result = sourceIllegal;
238: break;
239: }
240: } else { /* We don't have the 16 bits following the high surrogate. */
241: --source; /* return to the high surrogate */
242: result = sourceExhausted;
243: break;
244: }
245: } else if (flags == strictConversion) {
246: /* UTF-16 surrogate values are illegal in UTF-32 */
247: if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
248: --source; /* return to the illegal value itself */
249: result = sourceIllegal;
250: break;
251: }
252: }
253: /* Figure out how many bytes the result will require */
254: if (ch < (UTF32)0x80) { bytesToWrite = 1;
255: } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
256: } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
257: } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
258: } else { bytesToWrite = 3;
259: ch = UNI_REPLACEMENT_CHAR;
260: }
261:
262: target += bytesToWrite;
263: if (target > targetEnd) {
264: source = oldSource; /* Back up source pointer! */
265: target -= bytesToWrite; result = targetExhausted; break;
266: }
267: switch (bytesToWrite) { /* note: everything falls through. */
268: case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
269: case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
270: case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
271: case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
272: }
273: target += bytesToWrite;
274: }
275: *sourceStart = source;
276: *targetStart = target;
277: return result;
278: }
279:
280: /* --------------------------------------------------------------------- */
281:
282: ConversionResult pa_convertUTF32toUTF8 (
283: const UTF32** sourceStart, const UTF32* sourceEnd,
284: UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
285: ConversionResult result = conversionOK;
286: const UTF32* source = *sourceStart;
287: UTF8* target = *targetStart;
288: while (source < sourceEnd) {
289: UTF32 ch;
290: unsigned short bytesToWrite = 0;
291: const UTF32 byteMask = 0xBF;
292: const UTF32 byteMark = 0x80;
293: ch = *source++;
294: if (flags == strictConversion ) {
295: /* UTF-16 surrogate values are illegal in UTF-32 */
296: if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
297: --source; /* return to the illegal value itself */
298: result = sourceIllegal;
299: break;
300: }
301: }
302: /*
303: * Figure out how many bytes the result will require. Turn any
304: * illegally large UTF32 things (> Plane 17) into replacement chars.
305: */
306: if (ch < (UTF32)0x80) { bytesToWrite = 1;
307: } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
308: } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
309: } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
310: } else { bytesToWrite = 3;
311: ch = UNI_REPLACEMENT_CHAR;
312: result = sourceIllegal;
313: }
314:
315: target += bytesToWrite;
316: if (target > targetEnd) {
317: --source; /* Back up source pointer! */
318: target -= bytesToWrite; result = targetExhausted; break;
319: }
320: switch (bytesToWrite) { /* note: everything falls through. */
321: case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
322: case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
323: case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
324: case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
325: }
326: target += bytesToWrite;
327: }
328: *sourceStart = source;
329: *targetStart = target;
330: return result;
331: }
332:
333: /* --------------------------------------------------------------------- */
334:
335: /*
336: * Utility routine to tell whether a sequence of bytes is legal UTF-8.
337: * This must be called with the length pre-determined by the first byte.
338: * If not calling this from ConvertUTF8to*, then the length can be set by:
339: * length = trailingBytesForUTF8[*source]+1;
340: * and the sequence is illegal right away if there aren't that many bytes
341: * available.
342: * If presented with a length > 4, this returns false. The Unicode
343: * definition of UTF-8 goes up to 4-byte sequences.
344: */
345:
346: static Boolean isLegalUTF8(const UTF8 *source, int length) {
347: UTF8 a;
348: const UTF8 *srcptr = source+length;
349: switch (length) {
350: default: return false;
351: /* Everything else falls through when "true"... */
352: case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
353: case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
354: case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
355:
356: switch (*source) {
357: /* no fall-through in this inner switch */
358: case 0xE0: if (a < 0xA0) return false; break;
359: case 0xED: if (a > 0x9F) return false; break;
360: case 0xF0: if (a < 0x90) return false; break;
361: case 0xF4: if (a > 0x8F) return false; break;
362: default: if (a < 0x80) return false;
363: }
364:
365: case 1: if (*source >= 0x80 && *source < 0xC2) return false;
366: }
367: if (*source > 0xF4) return false;
368: return true;
369: }
370:
371: /* --------------------------------------------------------------------- */
372:
373: /*
374: * Exported function to return whether a UTF-8 sequence is legal or not.
375: * This is not used here; it's just exported.
376: */
377: Boolean pa_isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
378: int length = trailingBytesForUTF8[*source]+1;
379: if (length > sourceEnd - source) {
380: return false;
381: }
382: return isLegalUTF8(source, length);
383: }
384:
385: /* --------------------------------------------------------------------- */
386:
387: static unsigned
388: findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
389: const UTF8 *sourceEnd) {
390: UTF8 b1, b2, b3;
391:
1.2 ! moko 392: assert(!pa_isLegalUTF8Sequence(source, sourceEnd));
1.1 moko 393:
394: /*
395: * Unicode 6.3.0, D93b:
396: *
397: * Maximal subpart of an ill-formed subsequence: The longest code unit
398: * subsequence starting at an unconvertible offset that is either:
399: * a. the initial subsequence of a well-formed code unit sequence, or
400: * b. a subsequence of length one.
401: */
402:
403: if (source == sourceEnd)
404: return 0;
405:
406: /*
407: * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
408: * Byte Sequences.
409: */
410:
411: b1 = *source;
412: ++source;
413: if (b1 >= 0xC2 && b1 <= 0xDF) {
414: /*
415: * First byte is valid, but we know that this code unit sequence is
416: * invalid, so the maximal subpart has to end after the first byte.
417: */
418: return 1;
419: }
420:
421: if (source == sourceEnd)
422: return 1;
423:
424: b2 = *source;
425: ++source;
426:
427: if (b1 == 0xE0) {
428: return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
429: }
430: if (b1 >= 0xE1 && b1 <= 0xEC) {
431: return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
432: }
433: if (b1 == 0xED) {
434: return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
435: }
436: if (b1 >= 0xEE && b1 <= 0xEF) {
437: return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
438: }
439: if (b1 == 0xF0) {
440: if (b2 >= 0x90 && b2 <= 0xBF) {
441: if (source == sourceEnd)
442: return 2;
443:
444: b3 = *source;
445: return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
446: }
447: return 1;
448: }
449: if (b1 >= 0xF1 && b1 <= 0xF3) {
450: if (b2 >= 0x80 && b2 <= 0xBF) {
451: if (source == sourceEnd)
452: return 2;
453:
454: b3 = *source;
455: return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
456: }
457: return 1;
458: }
459: if (b1 == 0xF4) {
460: if (b2 >= 0x80 && b2 <= 0x8F) {
461: if (source == sourceEnd)
462: return 2;
463:
464: b3 = *source;
465: return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
466: }
467: return 1;
468: }
469:
470: assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
471: /*
472: * There are no valid sequences that start with these bytes. Maximal subpart
473: * is defined to have length 1 in these cases.
474: */
475: return 1;
476: }
477:
478: /* --------------------------------------------------------------------- */
479:
480: /*
481: * Exported function to return the total number of bytes in a codepoint
482: * represented in UTF-8, given the value of the first byte.
483: */
484: unsigned pa_getNumBytesForUTF8(UTF8 first) {
485: return trailingBytesForUTF8[first] + 1;
486: }
487:
488: /* --------------------------------------------------------------------- */
489:
490: /*
491: * Exported function to return whether a UTF-8 string is legal or not.
492: * This is not used here; it's just exported.
493: */
494: Boolean pa_isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
495: while (*source != sourceEnd) {
496: int length = trailingBytesForUTF8[**source] + 1;
497: if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
498: return false;
499: *source += length;
500: }
501: return true;
502: }
503:
504: /* --------------------------------------------------------------------- */
505:
506: ConversionResult pa_convertUTF8toUTF16 (
507: const UTF8** sourceStart, const UTF8* sourceEnd,
508: UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
509: ConversionResult result = conversionOK;
510: const UTF8* source = *sourceStart;
511: UTF16* target = *targetStart;
512: while (source < sourceEnd) {
513: UTF32 ch = 0;
514: unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
515: if (extraBytesToRead >= sourceEnd - source) {
516: result = sourceExhausted; break;
517: }
518: /* Do this check whether lenient or strict */
519: if (!isLegalUTF8(source, extraBytesToRead+1)) {
520: result = sourceIllegal;
521: break;
522: }
523: /*
524: * The cases all fall through. See "Note A" below.
525: */
526: switch (extraBytesToRead) {
527: case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
528: case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
529: case 3: ch += *source++; ch <<= 6;
530: case 2: ch += *source++; ch <<= 6;
531: case 1: ch += *source++; ch <<= 6;
532: case 0: ch += *source++;
533: }
534: ch -= offsetsFromUTF8[extraBytesToRead];
535:
536: if (target >= targetEnd) {
537: source -= (extraBytesToRead+1); /* Back up source pointer! */
538: result = targetExhausted; break;
539: }
540: if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
541: /* UTF-16 surrogate values are illegal in UTF-32 */
542: if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
543: if (flags == strictConversion) {
544: source -= (extraBytesToRead+1); /* return to the illegal value itself */
545: result = sourceIllegal;
546: break;
547: } else {
548: *target++ = UNI_REPLACEMENT_CHAR;
549: }
550: } else {
551: *target++ = (UTF16)ch; /* normal case */
552: }
553: } else if (ch > UNI_MAX_UTF16) {
554: if (flags == strictConversion) {
555: result = sourceIllegal;
556: source -= (extraBytesToRead+1); /* return to the start */
557: break; /* Bail out; shouldn't continue */
558: } else {
559: *target++ = UNI_REPLACEMENT_CHAR;
560: }
561: } else {
562: /* target is a character in range 0xFFFF - 0x10FFFF. */
563: if (target + 1 >= targetEnd) {
564: source -= (extraBytesToRead+1); /* Back up source pointer! */
565: result = targetExhausted; break;
566: }
567: ch -= halfBase;
568: *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
569: *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
570: }
571: }
572: *sourceStart = source;
573: *targetStart = target;
574: return result;
575: }
576:
577: /* --------------------------------------------------------------------- */
578:
579: static ConversionResult ConvertUTF8toUTF32Impl(
580: const UTF8** sourceStart, const UTF8* sourceEnd,
581: UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
582: Boolean InputIsPartial) {
583: ConversionResult result = conversionOK;
584: const UTF8* source = *sourceStart;
585: UTF32* target = *targetStart;
586: while (source < sourceEnd) {
587: UTF32 ch = 0;
588: unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
589: if (extraBytesToRead >= sourceEnd - source) {
590: if (flags == strictConversion || InputIsPartial) {
591: result = sourceExhausted;
592: break;
593: } else {
594: result = sourceIllegal;
595:
596: /*
597: * Replace the maximal subpart of ill-formed sequence with
598: * replacement character.
599: */
600: source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
601: sourceEnd);
602: *target++ = UNI_REPLACEMENT_CHAR;
603: continue;
604: }
605: }
606: if (target >= targetEnd) {
607: result = targetExhausted; break;
608: }
609:
610: /* Do this check whether lenient or strict */
611: if (!isLegalUTF8(source, extraBytesToRead+1)) {
612: result = sourceIllegal;
613: if (flags == strictConversion) {
614: /* Abort conversion. */
615: break;
616: } else {
617: /*
618: * Replace the maximal subpart of ill-formed sequence with
619: * replacement character.
620: */
621: source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
622: sourceEnd);
623: *target++ = UNI_REPLACEMENT_CHAR;
624: continue;
625: }
626: }
627: /*
628: * The cases all fall through. See "Note A" below.
629: */
630: switch (extraBytesToRead) {
631: case 5: ch += *source++; ch <<= 6;
632: case 4: ch += *source++; ch <<= 6;
633: case 3: ch += *source++; ch <<= 6;
634: case 2: ch += *source++; ch <<= 6;
635: case 1: ch += *source++; ch <<= 6;
636: case 0: ch += *source++;
637: }
638: ch -= offsetsFromUTF8[extraBytesToRead];
639:
640: if (ch <= UNI_MAX_LEGAL_UTF32) {
641: /*
642: * UTF-16 surrogate values are illegal in UTF-32, and anything
643: * over Plane 17 (> 0x10FFFF) is illegal.
644: */
645: if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
646: if (flags == strictConversion) {
647: source -= (extraBytesToRead+1); /* return to the illegal value itself */
648: result = sourceIllegal;
649: break;
650: } else {
651: *target++ = UNI_REPLACEMENT_CHAR;
652: }
653: } else {
654: *target++ = ch;
655: }
656: } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
657: result = sourceIllegal;
658: *target++ = UNI_REPLACEMENT_CHAR;
659: }
660: }
661: *sourceStart = source;
662: *targetStart = target;
663: return result;
664: }
665:
666: ConversionResult pa_convertUTF8toUTF32Partial(const UTF8 **sourceStart,
667: const UTF8 *sourceEnd,
668: UTF32 **targetStart,
669: UTF32 *targetEnd,
670: ConversionFlags flags) {
671: return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
672: flags, /*InputIsPartial=*/true);
673: }
674:
675: ConversionResult pa_convertUTF8toUTF32(const UTF8 **sourceStart,
676: const UTF8 *sourceEnd, UTF32 **targetStart,
677: UTF32 *targetEnd, ConversionFlags flags) {
678: return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
679: flags, /*InputIsPartial=*/false);
680: }
681:
682: /* ---------------------------------------------------------------------
683:
684: Note A.
685: The fall-through switches in UTF-8 reading code save a
686: temp variable, some decrements & conditionals. The switches
687: are equivalent to the following loop:
688: {
689: int tmpBytesToRead = extraBytesToRead+1;
690: do {
691: ch += *source++;
692: --tmpBytesToRead;
693: if (tmpBytesToRead) ch <<= 6;
694: } while (tmpBytesToRead > 0);
695: }
696: In UTF-8 writing code, the switches on "bytesToWrite" are
697: similarly unrolled loops.
698:
699: --------------------------------------------------------------------- */
E-mail: