Annotation of parser3/src/lib/punycode/pa_idna.c, revision 1.1

1.1     ! moko        1: /** @file
        !             2:    Parser: IDNA support, modified Libidn Version 1.28.
        !             3:    Copyright (c) 2001-2015 Art. Lebedev Studio (http://www.artlebedev.com)
        !             4: */
        !             5: 
        !             6: /* idna.c --- Prototypes for Internationalized Domain Name library.
        !             7:    Copyright (C) 2002-2013 Simon Josefsson
        !             8: 
        !             9:    This file is part of GNU Libidn.
        !            10: 
        !            11:    GNU Libidn is free software: you can redistribute it and/or
        !            12:    modify it under the terms of either:
        !            13: 
        !            14:      * the GNU Lesser General Public License as published by the Free
        !            15:        Software Foundation; either version 3 of the License, or (at
        !            16:        your option) any later version.
        !            17: 
        !            18:    or
        !            19: 
        !            20:      * the GNU General Public License as published by the Free
        !            21:        Software Foundation; either version 2 of the License, or (at
        !            22:        your option) any later version.
        !            23: 
        !            24:    or both in parallel, as here.
        !            25: 
        !            26:    GNU Libidn is distributed in the hope that it will be useful,
        !            27:    but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            28:    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            29:    General Public License for more details.
        !            30: 
        !            31:    You should have received copies of the GNU General Public License and
        !            32:    the GNU Lesser General Public License along with this program.  If
        !            33:    not, see <http://www.gnu.org/licenses/>. */
        !            34: 
        !            35: #include "pa_punycode.h"
        !            36: #include "pa_idna.h"
        !            37: 
        !            38: volatile const char * IDENT_PA_IDNA_C="$Id: pa_idna.C,v 1.1 2015/04/14 21:09:57 moko Exp $";
        !            39: 
        !            40: #define DOTP(c) ((c) == 0x002E || (c) == 0x3002 || (c) == 0xFF0E || (c) == 0xFF61)
        !            41: 
        !            42: /* Core functions */
        !            43: 
        !            44: /**
        !            45:  * idna_to_ascii_4z_internal:
        !            46:  * @src: input array with unicode code points.
        !            47:  * @len: length of input array with unicode code points.
        !            48:  * @out: output zero terminated string that must have room for at
        !            49:  *       least 63 characters plus the terminating zero.
        !            50:  * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
        !            51:  *   %IDNA_USE_STD3_ASCII_RULES.
        !            52:  *
        !            53:  * The ToASCII operation takes a sequence of Unicode code points that
        !            54:  * make up one domain label and transforms it into a sequence of code
        !            55:  * points in the ASCII range (0..7F). If ToASCII succeeds, the
        !            56:  * original sequence and the resulting sequence are equivalent labels.
        !            57:  *
        !            58:  * It is important to note that the ToASCII operation can fail. ToASCII
        !            59:  * fails if any step of it fails. If any step of the ToASCII operation
        !            60:  * fails on any label in a domain name, that domain name MUST NOT be used
        !            61:  * as an internationalized domain name. The method for deadling with this
        !            62:  * failure is application-specific.
        !            63:  *
        !            64:  * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
        !            65:  * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
        !            66:  * sequence of ASCII code points or a failure condition.
        !            67:  *
        !            68:  * ToASCII never alters a sequence of code points that are all in the ASCII
        !            69:  * range to begin with (although it could fail). Applying the ToASCII
        !            70:  * operation multiple times has exactly the same effect as applying it just
        !            71:  * once.
        !            72:  *
        !            73:  * Return value: Returns 0 on success, or an #Idna_rc error code.
        !            74:  */
        !            75: 
        !            76: static int idna_to_ascii_4i_internal (const uint32_t *src, size_t len, char *out, int flags) {
        !            77:        int rc;
        !            78:        size_t out_len;
        !            79: 
        !            80:        /*
        !            81:         * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
        !            82:         *
        !            83:         * (a) Verify the absence of non-LDH ASCII code points; that is,
        !            84:         * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
        !            85:         *
        !            86:         * (b) Verify the absence of leading and trailing hyphen-minus;
        !            87:         * that is, the absence of U+002D at the beginning and end of
        !            88:         * the sequence.
        !            89:         */
        !            90: 
        !            91:        if (flags & IDNA_USE_STD3_ASCII_RULES) {
        !            92:                size_t i;
        !            93: 
        !            94:                for (i = 0; i < len; i++)
        !            95:                        if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
        !            96:                                (src[i] >= 0x3A && src[i] <= 0x40) ||
        !            97:                                (src[i] >= 0x5B && src[i] <= 0x60) ||
        !            98:                                (src[i] >= 0x7B && src[i] <= 0x7F))
        !            99:                                return IDNA_CONTAINS_NON_LDH;
        !           100: 
        !           101:                if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
        !           102:                        return IDNA_CONTAINS_MINUS;
        !           103:        }
        !           104: 
        !           105:        /*
        !           106:         * 4. If all code points in the sequence are in the ASCII range
        !           107:         * (0..7F), then skip to step 8.
        !           108:         */
        !           109: 
        !           110:        {
        !           111:                size_t i;
        !           112: 
        !           113:                for (i = 0; i < len; i++) {
        !           114:                        if (src[i] > 0x7F)
        !           115:                                goto step5;
        !           116:                }
        !           117: 
        !           118:                if (len > 63)
        !           119:                        return PUNYCODE_BIG_OUTPUT;
        !           120: 
        !           121:                /* copy string to output buffer if we are about to skip to step8 */
        !           122:                for (i = 0; i < len; i++)
        !           123:                        out[i]=src[i];
        !           124:                out[len] = '\0';
        !           125:                goto step8;
        !           126:        }
        !           127: 
        !           128:        /*
        !           129:         * 5. Verify that the sequence does NOT begin with the ACE prefix.
        !           130:         */
        !           131: 
        !           132: step5:
        !           133:        if (len >= strlen (IDNA_ACE_PREFIX)) {
        !           134:                size_t i;
        !           135: 
        !           136:                for (i = 0; i < strlen (IDNA_ACE_PREFIX); i++)
        !           137:                        if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
        !           138:                                goto step6;
        !           139:                 return IDNA_CONTAINS_ACE_PREFIX;
        !           140:        }
        !           141: 
        !           142:        /*
        !           143:         * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
        !           144:         * and fail if there is an error.
        !           145:         */
        !           146: 
        !           147: step6:
        !           148:        out_len = 63 - strlen (IDNA_ACE_PREFIX);
        !           149:        rc = punycode_encode (len, src, NULL, &out_len, out + strlen(IDNA_ACE_PREFIX));
        !           150: 
        !           151:        if (rc != PUNYCODE_SUCCESS)
        !           152:                return rc;
        !           153: 
        !           154:        out[strlen (IDNA_ACE_PREFIX) + out_len] = '\0';
        !           155: 
        !           156:        /*
        !           157:         * 7. Prepend the ACE prefix.
        !           158:         */
        !           159: 
        !           160:        memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
        !           161: 
        !           162: step8:
        !           163:        return IDNA_SUCCESS;
        !           164: }
        !           165: 
        !           166: /**
        !           167:  * idna_to_ascii_4z:
        !           168:  * @in: zero terminated input Unicode string.
        !           169:  * @out: pointer to output string.
        !           170:  * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
        !           171:  *   %IDNA_USE_STD3_ASCII_RULES.
        !           172:  *
        !           173:  * Convert UCS-4 domain name to ASCII string.  The domain name may
        !           174:  * contain several labels, separated by dots.
        !           175:  *
        !           176:  * Return value: Returns %IDNA_SUCCESS on success, or error code.
        !           177:  **/
        !           178: int pa_idna_to_ascii_4z (const uint32_t *in, char *out, size_t out_len, int flags) {
        !           179:        int rc;
        !           180:        const uint32_t *start = in;
        !           181:        const uint32_t *end;
        !           182:        size_t add_len;
        !           183:        char buf[64];
        !           184: 
        !           185:        /* 1) Whenever dots are used as label separators, the following
        !           186:           characters MUST be recognized as dots: U+002E (full stop),
        !           187:           U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
        !           188:           U+FF61 (halfwidth ideographic full stop). */
        !           189: 
        !           190:        do {
        !           191:                for (end = start; *end && !DOTP (*end); end++);
        !           192: 
        !           193:                if (end > start) {
        !           194:                        rc = idna_to_ascii_4i_internal (start, (size_t) (end - start), buf, flags);
        !           195:                        if (rc != IDNA_SUCCESS)
        !           196:                                return rc;
        !           197: 
        !           198:                        add_len = strlen (buf);
        !           199:                        if (add_len >= out_len)
        !           200:                                return PUNYCODE_BIG_OUTPUT;
        !           201: 
        !           202:                        memcpy (out, buf, add_len);
        !           203:                        out += add_len;
        !           204:                        out_len -= add_len;
        !           205:                }
        !           206: 
        !           207:                if (*end) {
        !           208:                        if (!out_len)
        !           209:                                return PUNYCODE_BIG_OUTPUT;
        !           210:                        *(out++)='.';
        !           211:                        out_len--;
        !           212:                }
        !           213: 
        !           214:                start = end + 1;
        !           215:        } while (*end);
        !           216: 
        !           217:        if (!out_len)
        !           218:                return PUNYCODE_BIG_OUTPUT;
        !           219: 
        !           220:        *out='\0';
        !           221: 
        !           222:        return IDNA_SUCCESS;
        !           223: }
        !           224: 
        !           225: /* ToUnicode(). */
        !           226: static int idna_to_unicode_internal (const char *in, size_t in_len, uint32_t *out, size_t *out_len, int flags) {
        !           227:        int rc;
        !           228:        char tmpout[64];
        !           229: 
        !           230:        /* 3. Verify that the sequence begins with the ACE prefix
        !           231:         * ... The ToASCII and ToUnicode operations MUST recognize the ACE
        !           232:         prefix in a case-insensitive manner.
        !           233:         */
        !           234: 
        !           235:        if ( (in_len < strlen (IDNA_ACE_PREFIX)) || (strncasecmp (in, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX)) != 0) ){
        !           236:                size_t i;
        !           237:                if (in_len >= *out_len)
        !           238:                        return PUNYCODE_BIG_OUTPUT;
        !           239: 
        !           240:                for(i=0; i<in_len; i++){
        !           241:                        if ((unsigned char)in[i] > 0x7F)
        !           242:                                return PUNYCODE_BAD_INPUT;
        !           243:                        out[i]=in[i];
        !           244:                }
        !           245:                *out_len=in_len;
        !           246: 
        !           247:                return IDNA_SUCCESS;
        !           248:        } else {
        !           249:                /* 4. Remove the ACE prefix.
        !           250:                 */
        !           251: 
        !           252:                in += strlen (IDNA_ACE_PREFIX);
        !           253:                in_len-=strlen (IDNA_ACE_PREFIX);
        !           254: 
        !           255:                /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
        !           256:                 * and fail if there is an error. Save a copy of the result of
        !           257:                 * this step.
        !           258:                 */
        !           259: 
        !           260:                rc = punycode_decode (in_len, in, out_len, out, NULL);
        !           261:                if (rc != PUNYCODE_SUCCESS)
        !           262:                        return rc;
        !           263: 
        !           264:                /* 6. Apply ToASCII.
        !           265:                 */
        !           266: 
        !           267:                rc = idna_to_ascii_4i_internal (out, *out_len, tmpout, flags);
        !           268:                if (rc != IDNA_SUCCESS)
        !           269:                        return rc;
        !           270: 
        !           271:                /* 7. Verify that the result of step 6 matches the saved copy from
        !           272:                 * step 3, using a case-insensitive ASCII comparison.
        !           273:                 */
        !           274: 
        !           275:                if (strncasecmp (in, tmpout + strlen (IDNA_ACE_PREFIX), in_len) != 0)
        !           276:                        return IDNA_ROUNDTRIP_VERIFY_ERROR;
        !           277: 
        !           278:                /* 8. Return the saved copy from step 5.
        !           279:                 */
        !           280: 
        !           281:                return IDNA_SUCCESS;
        !           282:        }
        !           283: }
        !           284: 
        !           285: /**
        !           286:  * idna_to_unicode_4z:
        !           287:  * @in: zero-terminated string.
        !           288:  * @output: pointer to output Unicode string.
        !           289:  * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
        !           290:  *   %IDNA_USE_STD3_ASCII_RULES.
        !           291:  *
        !           292:  * Convert possibly ACE encoded domain name into a
        !           293:  * UCS-4 string.  The domain name may contain several labels,
        !           294:  * separated by dots.  The output buffer must be deallocated by the
        !           295:  * caller.
        !           296:  *
        !           297:  * Return value: Returns %IDNA_SUCCESS on success, or error code.
        !           298:  **/
        !           299: int pa_idna_to_unicode_4z (const char *in, uint32_t *out, size_t out_len, int flags) {
        !           300:        int rc;
        !           301:        const char *start = in;
        !           302:        const char *end;
        !           303:        size_t add_len;
        !           304: 
        !           305:        do {
        !           306:                for (end = start; *end && !DOTP (*end); end++);
        !           307: 
        !           308:                if (end > start) {
        !           309:                        add_len=out_len;
        !           310:                        rc = idna_to_unicode_internal (start, (size_t) (end - start), out, &add_len, flags);
        !           311:                        if (rc != IDNA_SUCCESS)
        !           312:                                return rc;
        !           313: 
        !           314:                        if (add_len >= out_len)
        !           315:                                return PUNYCODE_BIG_OUTPUT;
        !           316: 
        !           317:                        out+=add_len;
        !           318:                        out_len-=add_len;
        !           319:                }
        !           320: 
        !           321:                if (*end) {
        !           322:                        if (!out_len)
        !           323:                                return PUNYCODE_BIG_OUTPUT;
        !           324:                        *(out++) = 0x002E;      /* '.' (full stop) */
        !           325:                        out_len--;
        !           326:                }
        !           327: 
        !           328:                start = end + 1;
        !           329:        } while (*end);
        !           330: 
        !           331:        if (!out_len)
        !           332:                return PUNYCODE_BIG_OUTPUT;
        !           333: 
        !           334:        *out=0;
        !           335:        return IDNA_SUCCESS;
        !           336: }
        !           337: 
        !           338: /**
        !           339:  * IDNA_ACE_PREFIX
        !           340:  *
        !           341:  * The IANA allocated prefix to use for IDNA. "xn--"
        !           342:  */
        !           343: 
        !           344: /**
        !           345:  * Idna_rc:
        !           346:  * @IDNA_SUCCESS: Successful operation.  This value is guaranteed to
        !           347:  *   always be zero, the remaining ones are only guaranteed to hold
        !           348:  *   non-zero values, for logical comparison purposes.
        !           349:  * @IDNA_CONTAINS_NON_LDH: For IDNA_USE_STD3_ASCII_RULES, indicate that
        !           350:  *   the string contains non-LDH ASCII characters.
        !           351:  * @IDNA_CONTAINS_MINUS: For IDNA_USE_STD3_ASCII_RULES, indicate that
        !           352:  *   the string contains a leading or trailing hyphen-minus (U+002D).
        !           353:  * @IDNA_ROUNDTRIP_VERIFY_ERROR: The ToASCII operation on output
        !           354:  *   string does not equal the input.
        !           355:  * @IDNA_CONTAINS_ACE_PREFIX: The input contains the ACE prefix (for
        !           356:  *   ToASCII).
        !           357:  *
        !           358:  * Enumerated return codes of pa_idna_to_ascii_4z() and
        !           359:  * pa_idna_to_unicode_4z(), which may also return PUNYCODE_* error
        !           360:  * codes (see pa_idna_strerror()).  The value 0 is guaranteed to
        !           361:  * always correspond to success.
        !           362:  */
        !           363: 
        !           364: const char *pa_idna_strerror(int rc) {
        !           365:        switch (rc) {
        !           366:                case IDNA_SUCCESS:
        !           367:                        return "Success";
        !           368:                case PUNYCODE_BAD_INPUT:
        !           369:                        return "Input is invalid";
        !           370:                case PUNYCODE_BIG_OUTPUT:
        !           371:                        return "String is too long";
        !           372:                case PUNYCODE_OVERFLOW:
        !           373:                        return "Wider integers needed to process input";
        !           374:                case IDNA_CONTAINS_NON_LDH:
        !           375:                        return "Non-digit/letter/hyphen in input";
        !           376:                case IDNA_CONTAINS_MINUS:
        !           377:                        return "Forbidden leading or trailing minus sign (`-')";
        !           378:                case IDNA_ROUNDTRIP_VERIFY_ERROR:
        !           379:                        return "String not idempotent under ToASCII";
        !           380:                case IDNA_CONTAINS_ACE_PREFIX:
        !           381:                        return "Input already contain ACE prefix (`xn--')";
        !           382:                default:
        !           383:                        return "Unknown error";
        !           384:        }
        !           385: }
        !           386: 
        !           387: /**
        !           388:  * Idna_flags:
        !           389:  * @IDNA_ALLOW_UNASSIGNED: Don't reject strings containing unassigned
        !           390:  *   Unicode code points.
        !           391:  * @IDNA_USE_STD3_ASCII_RULES: Validate strings according to STD3
        !           392:  *   rules (i.e., normal host name rules).
        !           393:  *
        !           394:  * Flags to pass to pa_idna_to_ascii_4z(), pa_idna_to_unicode_4z() etc.
        !           395:  */

E-mail: