Annotation of parser3/src/lib/punycode/pa_convert_utf.h, revision 1.1
1.1 ! moko 1: /*===--- pa_convert_utf.h - Universal Character Names conversions ---------------===
! 2: *
! 3: * The LLVM Compiler Infrastructure
! 4: *
! 5: * This file is distributed under the University of Illinois Open Source
! 6: * License. See LICENSE.TXT for details.
! 7: *
! 8: *==------------------------------------------------------------------------==*/
! 9: /*
! 10: * Copyright 2001-2004 Unicode, Inc.
! 11: *
! 12: * Disclaimer
! 13: *
! 14: * This source code is provided as is by Unicode, Inc. No claims are
! 15: * made as to fitness for any particular purpose. No warranties of any
! 16: * kind are expressed or implied. The recipient agrees to determine
! 17: * applicability of information provided. If this file has been
! 18: * purchased on magnetic or optical media from Unicode, Inc., the
! 19: * sole remedy for any claim will be exchange of defective media
! 20: * within 90 days of receipt.
! 21: *
! 22: * Limitations on Rights to Redistribute This Code
! 23: *
! 24: * Unicode, Inc. hereby grants the right to freely use the information
! 25: * supplied in this file in the creation of products supporting the
! 26: * Unicode Standard, and to make copies of this file in any form
! 27: * for internal or external distribution as long as this notice
! 28: * remains attached.
! 29: */
! 30:
! 31: /* ---------------------------------------------------------------------
! 32:
! 33: Conversions between UTF-32, UTF-16, and UTF-8. Header file.
! 34:
! 35: Several functions are included here, forming a complete set of
! 36: conversions between the three formats. UTF-7 is not included
! 37: here, but is handled in a separate source file.
! 38:
! 39: Each of these routines takes pointers to input buffers and output
! 40: buffers. The input buffers are const.
! 41:
! 42: Each routine converts the text between *sourceStart and sourceEnd,
! 43: putting the result into the buffer between *targetStart and
! 44: targetEnd. Note: the end pointers are *after* the last item: e.g.
! 45: *(sourceEnd - 1) is the last item.
! 46:
! 47: The return result indicates whether the conversion was successful,
! 48: and if not, whether the problem was in the source or target buffers.
! 49: (Only the first encountered problem is indicated.)
! 50:
! 51: After the conversion, *sourceStart and *targetStart are both
! 52: updated to point to the end of last text successfully converted in
! 53: the respective buffers.
! 54:
! 55: Input parameters:
! 56: sourceStart - pointer to a pointer to the source buffer.
! 57: The contents of this are modified on return so that
! 58: it points at the next thing to be converted.
! 59: targetStart - similarly, pointer to pointer to the target buffer.
! 60: sourceEnd, targetEnd - respectively pointers to the ends of the
! 61: two buffers, for overflow checking only.
! 62:
! 63: These conversion functions take a ConversionFlags argument. When this
! 64: flag is set to strict, both irregular sequences and isolated surrogates
! 65: will cause an error. When the flag is set to lenient, both irregular
! 66: sequences and isolated surrogates are converted.
! 67:
! 68: Whether the flag is strict or lenient, all illegal sequences will cause
! 69: an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
! 70: or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
! 71: must check for illegal sequences.
! 72:
! 73: When the flag is set to lenient, characters over 0x10FFFF are converted
! 74: to the replacement character; otherwise (when the flag is set to strict)
! 75: they constitute an error.
! 76:
! 77: Output parameters:
! 78: The value "sourceIllegal" is returned from some routines if the input
! 79: sequence is malformed. When "sourceIllegal" is returned, the source
! 80: value will point to the illegal value that caused the problem. E.g.,
! 81: in UTF-8 when a sequence is malformed, it points to the start of the
! 82: malformed sequence.
! 83:
! 84: Author: Mark E. Davis, 1994.
! 85: Rev History: Rick McGowan, fixes & updates May 2001.
! 86: Fixes & updates, Sept 2001.
! 87:
! 88: ------------------------------------------------------------------------ */
! 89:
! 90: #ifndef PA_CONVERT_UTF_H
! 91: #define PA_CONVERT_UTF_H
! 92:
! 93: #include "pa_config_includes.h"
! 94:
! 95: /* ---------------------------------------------------------------------
! 96: The following 4 definitions are compiler-specific.
! 97: The C standard does not guarantee that wchar_t has at least
! 98: 16 bits, so wchar_t is no less portable than unsigned short!
! 99: All should be unsigned values to avoid sign extension during
! 100: bit mask & shift operations.
! 101: ------------------------------------------------------------------------ */
! 102:
/* Code-unit types for the three encodings.
 * All are unsigned so that bit masks and shifts never sign-extend.
 * uint32_t / uint16_t are exact-width types, so the widths below are exact. */
typedef uint32_t UTF32;        /* one UTF-32 code unit: exactly 32 bits */
typedef uint16_t UTF16;        /* one UTF-16 code unit: exactly 16 bits */
typedef unsigned char UTF8;    /* one UTF-8 code unit: typically 8 bits */
typedef unsigned char Boolean; /* 0 or 1 */

/* Some fundamental constants.
 * Every cast expression is wrapped in an outer pair of parentheses so the
 * macro expands to a single primary expression; without them a use such as
 * `sizeof UNI_MAX_BMP` would be parsed as `sizeof(UTF32)` followed by a
 * stray constant. */
#define UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD) /* U+FFFD */
#define UNI_MAX_BMP ((UTF32)0x0000FFFF)
#define UNI_MAX_UTF16 ((UTF32)0x0010FFFF)
#define UNI_MAX_UTF32 ((UTF32)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32 ((UTF32)0x0010FFFF) /* values above this are illegal in UTF-32 */

#define UNI_MAX_UTF8_BYTES_PER_CODE_POINT 4

/* U+FEFF as read in native byte order, and the same mark byte-swapped. */
#define UNI_UTF16_BYTE_ORDER_MARK_NATIVE 0xFEFF
#define UNI_UTF16_BYTE_ORDER_MARK_SWAPPED 0xFFFE
! 119:
/* Outcome reported by the conversion routines.
 * Only the first problem encountered is reported. The numeric values are
 * spelled out explicitly; they are the same ones the implicit numbering
 * would assign. */
typedef enum {
  conversionOK = 0,    /* conversion successful */
  sourceExhausted = 1, /* partial character in source, but hit end */
  targetExhausted = 2, /* insuff. room in target for conversion */
  sourceIllegal = 3    /* source sequence is illegal/malformed */
} ConversionResult;
! 126:
/* Strictness selector passed to the conversion routines.
 * Per this header's introductory comment: strict treats irregular sequences
 * and isolated surrogates as errors; lenient converts them. Illegal
 * sequences are an error in either mode. */
typedef enum {
  strictConversion = 0, /* irregular sequences / isolated surrogates are errors */
  lenientConversion = 1 /* irregular sequences / isolated surrogates are converted */
} ConversionFlags;
! 131:
! 132: /* This is for C++ and does no harm in C */
! 133: #ifdef __cplusplus
! 134: extern "C" {
! 135: #endif
! 136:
! 137: ConversionResult pa_convertUTF8toUTF16 (
! 138: const UTF8** sourceStart, const UTF8* sourceEnd,
! 139: UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
! 140:
! 141: /**
! 142: * Convert a partial UTF8 sequence to UTF32. If the sequence ends in an
! 143: * incomplete code unit sequence, returns \c sourceExhausted.
! 144: */
! 145: ConversionResult pa_convertUTF8toUTF32Partial(
! 146: const UTF8** sourceStart, const UTF8* sourceEnd,
! 147: UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
! 148:
! 149: /**
! 150: * Convert a partial UTF8 sequence to UTF32. If the sequence ends in an
! 151: * incomplete code unit sequence, returns \c sourceIllegal.
! 152: */
! 153: ConversionResult pa_convertUTF8toUTF32(
! 154: const UTF8** sourceStart, const UTF8* sourceEnd,
! 155: UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
! 156:
! 157: ConversionResult pa_convertUTF16toUTF8 (
! 158: const UTF16** sourceStart, const UTF16* sourceEnd,
! 159: UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
! 160:
! 161: ConversionResult pa_convertUTF32toUTF8 (
! 162: const UTF32** sourceStart, const UTF32* sourceEnd,
! 163: UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
! 164:
! 165: ConversionResult pa_convertUTF16toUTF32 (
! 166: const UTF16** sourceStart, const UTF16* sourceEnd,
! 167: UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
! 168:
! 169: ConversionResult pa_convertUTF32toUTF16 (
! 170: const UTF32** sourceStart, const UTF32* sourceEnd,
! 171: UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
! 172:
! 173: Boolean pa_isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
! 174:
! 175: Boolean pa_isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
! 176:
! 177: unsigned pa_getNumBytesForUTF8(UTF8 firstByte);
! 178:
! 179: #ifdef __cplusplus
! 180: }
! 181: #endif
! 182:
! 183: #endif
E-mail: