parser3/src/lib/punycode/pa_convert_utf.h - annotate

Return to pa_convert_utf.h CVS log
Up to [parser3project] / parser3 / src / lib / punycode
Annotation of parser3/src/lib/punycode/pa_convert_utf.h, revision 1.1

1.1     ! moko        1: /*===--- pa_convert_utf.h - Universal Character Names conversions ---------------===
        !             2:  *
        !             3:  *                     The LLVM Compiler Infrastructure
        !             4:  *
        !             5:  * This file is distributed under the University of Illinois Open Source
        !             6:  * License. See LICENSE.TXT for details.
        !             7:  *
        !             8:  *==------------------------------------------------------------------------==*/
        !             9: /*
        !            10:  * Copyright 2001-2004 Unicode, Inc.
        !            11:  *
        !            12:  * Disclaimer
        !            13:  *
        !            14:  * This source code is provided as is by Unicode, Inc. No claims are
        !            15:  * made as to fitness for any particular purpose. No warranties of any
        !            16:  * kind are expressed or implied. The recipient agrees to determine
        !            17:  * applicability of information provided. If this file has been
        !            18:  * purchased on magnetic or optical media from Unicode, Inc., the
        !            19:  * sole remedy for any claim will be exchange of defective media
        !            20:  * within 90 days of receipt.
        !            21:  *
        !            22:  * Limitations on Rights to Redistribute This Code
        !            23:  *
        !            24:  * Unicode, Inc. hereby grants the right to freely use the information
        !            25:  * supplied in this file in the creation of products supporting the
        !            26:  * Unicode Standard, and to make copies of this file in any form
        !            27:  * for internal or external distribution as long as this notice
        !            28:  * remains attached.
        !            29:  */
        !            30: 
        !            31: /* ---------------------------------------------------------------------
        !            32: 
        !            33:     Conversions between UTF-32, UTF-16, and UTF-8.  Header file.
        !            34: 
        !            35:     Several functions are included here, forming a complete set of
        !            36:     conversions between the three formats.  UTF-7 is not included
        !            37:     here, but is handled in a separate source file.
        !            38: 
        !            39:     Each of these routines takes pointers to input buffers and output
        !            40:     buffers.  The input buffers are const.
        !            41: 
        !            42:     Each routine converts the text between *sourceStart and sourceEnd,
        !            43:     putting the result into the buffer between *targetStart and
        !            44:     targetEnd. Note: the end pointers are *after* the last item: e.g.
        !            45:     *(sourceEnd - 1) is the last item.
        !            46: 
        !            47:     The return result indicates whether the conversion was successful,
        !            48:     and if not, whether the problem was in the source or target buffers.
        !            49:     (Only the first encountered problem is indicated.)
        !            50: 
        !            51:     After the conversion, *sourceStart and *targetStart are both
        !            52:     updated to point to the end of the last text successfully converted in
        !            53:     the respective buffers.
        !            54: 
        !            55:     Input parameters:
        !            56:         sourceStart - pointer to a pointer to the source buffer.
        !            57:                 The contents of this are modified on return so that
        !            58:                 it points at the next thing to be converted.
        !            59:         targetStart - similarly, pointer to pointer to the target buffer.
        !            60:         sourceEnd, targetEnd - respectively pointers to the ends of the
        !            61:                 two buffers, for overflow checking only.
        !            62: 
        !            63:     These conversion functions take a ConversionFlags argument. When this
        !            64:     flag is set to strict, both irregular sequences and isolated surrogates
        !            65:     will cause an error.  When the flag is set to lenient, both irregular
        !            66:     sequences and isolated surrogates are converted.
        !            67: 
        !            68:     Whether the flag is strict or lenient, all illegal sequences will cause
        !            69:     an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
        !            70:     or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
        !            71:     must check for illegal sequences.
        !            72: 
        !            73:     When the flag is set to lenient, characters over 0x10FFFF are converted
        !            74:     to the replacement character; otherwise (when the flag is set to strict)
        !            75:     they constitute an error.
        !            76: 
        !            77:     Output parameters:
        !            78:         The value "sourceIllegal" is returned from some routines if the input
        !            79:         sequence is malformed.  When "sourceIllegal" is returned, the source
        !            80:         value will point to the illegal value that caused the problem. E.g.,
        !            81:         in UTF-8 when a sequence is malformed, it points to the start of the
        !            82:         malformed sequence.
        !            83: 
        !            84:     Author: Mark E. Davis, 1994.
        !            85:     Rev History: Rick McGowan, fixes & updates May 2001.
        !            86:          Fixes & updates, Sept 2001.
        !            87: 
        !            88: ------------------------------------------------------------------------ */
        !            89: 
        !            90: #ifndef PA_CONVERT_UTF_H
        !            91: #define PA_CONVERT_UTF_H
        !            92: 
        !            93: #include "pa_config_includes.h"
        !            94: 
        !            95: /* ---------------------------------------------------------------------
        !            96:     The following 4 definitions are compiler-specific.
        !            97:     The C standard does not guarantee that wchar_t has at least
        !            98:     16 bits, so wchar_t is no more portable than unsigned short!
        !            99:     All should be unsigned values to avoid sign extension during
        !           100:     bit mask & shift operations.
        !           101: ------------------------------------------------------------------------ */
        !           102: 
        !           103: typedef uint32_t       UTF32;  /* one UTF-32 code unit; uint32_t is exactly 32 bits wide */
        !           104: typedef uint16_t       UTF16;  /* one UTF-16 code unit; uint16_t is exactly 16 bits wide */
        !           105: typedef unsigned char  UTF8;   /* one UTF-8 code unit (a byte); typically 8 bits */
        !           106: typedef unsigned char  Boolean; /* truth value, 0 or 1; returned by the pa_isLegalUTF8* checks */
        !           107: 
        !           108: /* Some fundamental constants.  NOTE(review): each macro expands to an unparenthesized cast expression. */
        !           109: #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD /* U+FFFD, the replacement character */
        !           110: #define UNI_MAX_BMP (UTF32)0x0000FFFF /* last code point of the Basic Multilingual Plane */
        !           111: #define UNI_MAX_UTF16 (UTF32)0x0010FFFF /* highest code point UTF-16 can encode */
        !           112: #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF /* largest 31-bit value; the legal range ends at UNI_MAX_LEGAL_UTF32 */
        !           113: #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF /* values above this are illegal in UTF-32 */
        !           114: 
        !           115: #define UNI_MAX_UTF8_BYTES_PER_CODE_POINT 4 /* longest UTF-8 encoding of a code point up to 0x10FFFF */
        !           116: 
        !           117: #define UNI_UTF16_BYTE_ORDER_MARK_NATIVE  0xFEFF /* U+FEFF as it reads in the reader's own byte order */
        !           118: #define UNI_UTF16_BYTE_ORDER_MARK_SWAPPED 0xFFFE /* the same mark with its two bytes exchanged */
        !           119: 
        !           120: typedef enum { /* status returned by the pa_convert* routines; only the first problem is reported */
        !           121:   conversionOK,           /* conversion successful */
        !           122:   sourceExhausted,        /* source ended in the middle of a character */
        !           123:   targetExhausted,        /* insufficient room in target for conversion */
        !           124:   sourceIllegal           /* source sequence is illegal/malformed; the source pointer is left at it */
        !           125: } ConversionResult;
        !           126: 
        !           127: typedef enum { /* how irregular sequences and isolated surrogates are treated */
        !           128:   strictConversion = 0, /* irregular sequences and isolated surrogates cause an error */
        !           129:   lenientConversion /* irregular sequences and isolated surrogates are converted */
        !           130: } ConversionFlags;
        !           131: 
        !           132: /* This is for C++ and does no harm in C */
        !           133: #ifdef __cplusplus
        !           134: extern "C" {
        !           135: #endif
        !           136: 
        !           137: ConversionResult pa_convertUTF8toUTF16 ( /* UTF-8 -> UTF-16; advances *sourceStart and *targetStart past what was converted */
        !           138:   const UTF8** sourceStart, const UTF8* sourceEnd, /* sourceEnd is one past the last input unit */
        !           139:   UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
        !           140: 
        !           141: /**
        !           142:  * Convert UTF8 input that may stop mid-character to UTF32.  If the input ends
        !           143:  * in an incomplete code unit sequence, returns \c sourceExhausted.
        !           144:  */
        !           145: ConversionResult pa_convertUTF8toUTF32Partial(
        !           146:   const UTF8** sourceStart, const UTF8* sourceEnd,
        !           147:   UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
        !           148: 
        !           149: /**
        !           150:  * Convert a UTF8 sequence to UTF32.  Unlike pa_convertUTF8toUTF32Partial, a
        !           151:  * sequence ending in an incomplete code unit sequence returns \c sourceIllegal.
        !           152:  */
        !           153: ConversionResult pa_convertUTF8toUTF32(
        !           154:   const UTF8** sourceStart, const UTF8* sourceEnd,
        !           155:   UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
        !           156: 
        !           157: ConversionResult pa_convertUTF16toUTF8 ( /* UTF-16 -> UTF-8; advances *sourceStart and *targetStart past what was converted */
        !           158:   const UTF16** sourceStart, const UTF16* sourceEnd,
        !           159:   UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
        !           160: 
        !           161: ConversionResult pa_convertUTF32toUTF8 ( /* UTF-32 -> UTF-8; advances *sourceStart and *targetStart past what was converted */
        !           162:   const UTF32** sourceStart, const UTF32* sourceEnd,
        !           163:   UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
        !           164: 
        !           165: ConversionResult pa_convertUTF16toUTF32 ( /* UTF-16 -> UTF-32; advances *sourceStart and *targetStart past what was converted */
        !           166:   const UTF16** sourceStart, const UTF16* sourceEnd,
        !           167:   UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
        !           168: 
        !           169: ConversionResult pa_convertUTF32toUTF16 ( /* UTF-32 -> UTF-16; advances *sourceStart and *targetStart past what was converted */
        !           170:   const UTF32** sourceStart, const UTF32* sourceEnd,
        !           171:   UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
        !           172: 
        !           173: Boolean pa_isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd); /* NOTE(review): presumably nonzero when the sequence starting at source is legal UTF-8 -- confirm in the implementation */
        !           174: 
        !           175: Boolean pa_isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd); /* NOTE(review): source is passed by pointer-to-pointer, so it presumably reports where checking stopped -- confirm in the implementation */
        !           176: 
        !           177: unsigned pa_getNumBytesForUTF8(UTF8 firstByte); /* NOTE(review): presumably the sequence length implied by the lead byte -- confirm in the implementation */
        !           178: 
        !           179: #ifdef __cplusplus
        !           180: }
        !           181: #endif
        !           182: 
        !           183: #endif
E-mail: