Annotation of parser3/src/lib/punycode/pa_idna.c, revision 1.2

1.1       moko        1: /** @file
                      2:    Parser: IDNA support, modified Libidn Version 1.28.
                      3:    Copyright (c) 2001-2015 Art. Lebedev Studio (http://www.artlebedev.com)
                      4: */
                      5: 
                      6: /* idna.c --- Prototypes for Internationalized Domain Name library.
                      7:    Copyright (C) 2002-2013 Simon Josefsson
                      8: 
                      9:    This file is part of GNU Libidn.
                     10: 
                     11:    GNU Libidn is free software: you can redistribute it and/or
                     12:    modify it under the terms of either:
                     13: 
                     14:      * the GNU Lesser General Public License as published by the Free
                     15:        Software Foundation; either version 3 of the License, or (at
                     16:        your option) any later version.
                     17: 
                     18:    or
                     19: 
                     20:      * the GNU General Public License as published by the Free
                     21:        Software Foundation; either version 2 of the License, or (at
                     22:        your option) any later version.
                     23: 
                     24:    or both in parallel, as here.
                     25: 
                     26:    GNU Libidn is distributed in the hope that it will be useful,
                     27:    but WITHOUT ANY WARRANTY; without even the implied warranty of
                     28:    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     29:    General Public License for more details.
                     30: 
                     31:    You should have received copies of the GNU General Public License and
                     32:    the GNU Lesser General Public License along with this program.  If
                     33:    not, see <http://www.gnu.org/licenses/>. */
                     34: 
                     35: #include "pa_punycode.h"
                     36: #include "pa_idna.h"
                     37: 
/* Version-control ($Id$) identification string for this source file. */
volatile const char * IDENT_PA_IDNA_C="$Id: pa_idna.c,v 1.1 2015/04/14 21:42:52 moko Exp $";

/*
 * DOTP(c): non-zero when code point c is one of the label separators
 * recognized by IDNA: U+002E (full stop), U+3002 (ideographic full stop),
 * U+FF0E (fullwidth full stop) or U+FF61 (halfwidth ideographic full stop).
 *
 * The argument is evaluated up to four times, so it must not have side
 * effects.
 */
#define DOTP(c) ( \
	(c) == 0x002E || \
	(c) == 0x3002 || \
	(c) == 0xFF0E || \
	(c) == 0xFF61)
                     41: 
                     42: /* Core functions */
                     43: 
                     44: /**
                     45:  * idna_to_ascii_4z_internal:
                     46:  * @src: input array with unicode code points.
                     47:  * @len: length of input array with unicode code points.
                     48:  * @out: output zero terminated string that must have room for at
                     49:  *       least 63 characters plus the terminating zero.
                     50:  * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
                     51:  *   %IDNA_USE_STD3_ASCII_RULES.
                     52:  *
                     53:  * The ToASCII operation takes a sequence of Unicode code points that
                     54:  * make up one domain label and transforms it into a sequence of code
                     55:  * points in the ASCII range (0..7F). If ToASCII succeeds, the
                     56:  * original sequence and the resulting sequence are equivalent labels.
                     57:  *
                     58:  * It is important to note that the ToASCII operation can fail. ToASCII
                     59:  * fails if any step of it fails. If any step of the ToASCII operation
                     60:  * fails on any label in a domain name, that domain name MUST NOT be used
                     61:  * as an internationalized domain name. The method for deadling with this
                     62:  * failure is application-specific.
                     63:  *
                     64:  * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
                     65:  * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
                     66:  * sequence of ASCII code points or a failure condition.
                     67:  *
                     68:  * ToASCII never alters a sequence of code points that are all in the ASCII
                     69:  * range to begin with (although it could fail). Applying the ToASCII
                     70:  * operation multiple times has exactly the same effect as applying it just
                     71:  * once.
                     72:  *
                     73:  * Return value: Returns 0 on success, or an #Idna_rc error code.
                     74:  */
                     75: 
                     76: static int idna_to_ascii_4i_internal (const uint32_t *src, size_t len, char *out, int flags) {
                     77:        int rc;
                     78:        size_t out_len;
                     79: 
                     80:        /*
                     81:         * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
                     82:         *
                     83:         * (a) Verify the absence of non-LDH ASCII code points; that is,
                     84:         * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
                     85:         *
                     86:         * (b) Verify the absence of leading and trailing hyphen-minus;
                     87:         * that is, the absence of U+002D at the beginning and end of
                     88:         * the sequence.
                     89:         */
                     90: 
                     91:        if (flags & IDNA_USE_STD3_ASCII_RULES) {
                     92:                size_t i;
                     93: 
                     94:                for (i = 0; i < len; i++)
                     95:                        if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
                     96:                                (src[i] >= 0x3A && src[i] <= 0x40) ||
                     97:                                (src[i] >= 0x5B && src[i] <= 0x60) ||
                     98:                                (src[i] >= 0x7B && src[i] <= 0x7F))
                     99:                                return IDNA_CONTAINS_NON_LDH;
                    100: 
                    101:                if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
                    102:                        return IDNA_CONTAINS_MINUS;
                    103:        }
                    104: 
                    105:        /*
                    106:         * 4. If all code points in the sequence are in the ASCII range
                    107:         * (0..7F), then skip to step 8.
                    108:         */
                    109: 
                    110:        {
                    111:                size_t i;
                    112: 
                    113:                for (i = 0; i < len; i++) {
                    114:                        if (src[i] > 0x7F)
                    115:                                goto step5;
                    116:                }
                    117: 
                    118:                if (len > 63)
                    119:                        return PUNYCODE_BIG_OUTPUT;
                    120: 
                    121:                /* copy string to output buffer if we are about to skip to step8 */
                    122:                for (i = 0; i < len; i++)
1.2     ! moko      123:                        out[i]=(char)src[i];
1.1       moko      124:                out[len] = '\0';
                    125:                goto step8;
                    126:        }
                    127: 
                    128:        /*
                    129:         * 5. Verify that the sequence does NOT begin with the ACE prefix.
                    130:         */
                    131: 
                    132: step5:
                    133:        if (len >= strlen (IDNA_ACE_PREFIX)) {
                    134:                size_t i;
                    135: 
                    136:                for (i = 0; i < strlen (IDNA_ACE_PREFIX); i++)
                    137:                        if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
                    138:                                goto step6;
                    139:                 return IDNA_CONTAINS_ACE_PREFIX;
                    140:        }
                    141: 
                    142:        /*
                    143:         * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
                    144:         * and fail if there is an error.
                    145:         */
                    146: 
                    147: step6:
                    148:        out_len = 63 - strlen (IDNA_ACE_PREFIX);
                    149:        rc = punycode_encode (len, src, NULL, &out_len, out + strlen(IDNA_ACE_PREFIX));
                    150: 
                    151:        if (rc != PUNYCODE_SUCCESS)
                    152:                return rc;
                    153: 
                    154:        out[strlen (IDNA_ACE_PREFIX) + out_len] = '\0';
                    155: 
                    156:        /*
                    157:         * 7. Prepend the ACE prefix.
                    158:         */
                    159: 
                    160:        memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
                    161: 
                    162: step8:
                    163:        return IDNA_SUCCESS;
                    164: }
                    165: 
                    166: /**
                    167:  * idna_to_ascii_4z:
                    168:  * @in: zero terminated input Unicode string.
                    169:  * @out: pointer to output string.
                    170:  * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
                    171:  *   %IDNA_USE_STD3_ASCII_RULES.
                    172:  *
                    173:  * Convert UCS-4 domain name to ASCII string.  The domain name may
                    174:  * contain several labels, separated by dots.
                    175:  *
                    176:  * Return value: Returns %IDNA_SUCCESS on success, or error code.
                    177:  **/
                    178: int pa_idna_to_ascii_4z (const uint32_t *in, char *out, size_t out_len, int flags) {
                    179:        int rc;
                    180:        const uint32_t *start = in;
                    181:        const uint32_t *end;
                    182:        size_t add_len;
                    183:        char buf[64];
                    184: 
                    185:        /* 1) Whenever dots are used as label separators, the following
                    186:           characters MUST be recognized as dots: U+002E (full stop),
                    187:           U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
                    188:           U+FF61 (halfwidth ideographic full stop). */
                    189: 
                    190:        do {
                    191:                for (end = start; *end && !DOTP (*end); end++);
                    192: 
                    193:                if (end > start) {
                    194:                        rc = idna_to_ascii_4i_internal (start, (size_t) (end - start), buf, flags);
                    195:                        if (rc != IDNA_SUCCESS)
                    196:                                return rc;
                    197: 
                    198:                        add_len = strlen (buf);
                    199:                        if (add_len >= out_len)
                    200:                                return PUNYCODE_BIG_OUTPUT;
                    201: 
                    202:                        memcpy (out, buf, add_len);
                    203:                        out += add_len;
                    204:                        out_len -= add_len;
                    205:                }
                    206: 
                    207:                if (*end) {
                    208:                        if (!out_len)
                    209:                                return PUNYCODE_BIG_OUTPUT;
                    210:                        *(out++)='.';
                    211:                        out_len--;
                    212:                }
                    213: 
                    214:                start = end + 1;
                    215:        } while (*end);
                    216: 
                    217:        if (!out_len)
                    218:                return PUNYCODE_BIG_OUTPUT;
                    219: 
                    220:        *out='\0';
                    221: 
                    222:        return IDNA_SUCCESS;
                    223: }
                    224: 
                    225: /* ToUnicode(). */
                    226: static int idna_to_unicode_internal (const char *in, size_t in_len, uint32_t *out, size_t *out_len, int flags) {
                    227:        int rc;
                    228:        char tmpout[64];
                    229: 
                    230:        /* 3. Verify that the sequence begins with the ACE prefix
                    231:         * ... The ToASCII and ToUnicode operations MUST recognize the ACE
                    232:         prefix in a case-insensitive manner.
                    233:         */
                    234: 
                    235:        if ( (in_len < strlen (IDNA_ACE_PREFIX)) || (strncasecmp (in, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX)) != 0) ){
                    236:                size_t i;
                    237:                if (in_len >= *out_len)
                    238:                        return PUNYCODE_BIG_OUTPUT;
                    239: 
                    240:                for(i=0; i<in_len; i++){
                    241:                        if ((unsigned char)in[i] > 0x7F)
                    242:                                return PUNYCODE_BAD_INPUT;
                    243:                        out[i]=in[i];
                    244:                }
                    245:                *out_len=in_len;
                    246: 
                    247:                return IDNA_SUCCESS;
                    248:        } else {
                    249:                /* 4. Remove the ACE prefix.
                    250:                 */
                    251: 
                    252:                in += strlen (IDNA_ACE_PREFIX);
                    253:                in_len-=strlen (IDNA_ACE_PREFIX);
                    254: 
                    255:                /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
                    256:                 * and fail if there is an error. Save a copy of the result of
                    257:                 * this step.
                    258:                 */
                    259: 
                    260:                rc = punycode_decode (in_len, in, out_len, out, NULL);
                    261:                if (rc != PUNYCODE_SUCCESS)
                    262:                        return rc;
                    263: 
                    264:                /* 6. Apply ToASCII.
                    265:                 */
                    266: 
                    267:                rc = idna_to_ascii_4i_internal (out, *out_len, tmpout, flags);
                    268:                if (rc != IDNA_SUCCESS)
                    269:                        return rc;
                    270: 
                    271:                /* 7. Verify that the result of step 6 matches the saved copy from
                    272:                 * step 3, using a case-insensitive ASCII comparison.
                    273:                 */
                    274: 
                    275:                if (strncasecmp (in, tmpout + strlen (IDNA_ACE_PREFIX), in_len) != 0)
                    276:                        return IDNA_ROUNDTRIP_VERIFY_ERROR;
                    277: 
                    278:                /* 8. Return the saved copy from step 5.
                    279:                 */
                    280: 
                    281:                return IDNA_SUCCESS;
                    282:        }
                    283: }
                    284: 
                    285: /**
                    286:  * idna_to_unicode_4z:
                    287:  * @in: zero-terminated string.
                    288:  * @output: pointer to output Unicode string.
                    289:  * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
                    290:  *   %IDNA_USE_STD3_ASCII_RULES.
                    291:  *
                    292:  * Convert possibly ACE encoded domain name into a
                    293:  * UCS-4 string.  The domain name may contain several labels,
                    294:  * separated by dots.  The output buffer must be deallocated by the
                    295:  * caller.
                    296:  *
                    297:  * Return value: Returns %IDNA_SUCCESS on success, or error code.
                    298:  **/
                    299: int pa_idna_to_unicode_4z (const char *in, uint32_t *out, size_t out_len, int flags) {
                    300:        int rc;
                    301:        const char *start = in;
                    302:        const char *end;
                    303:        size_t add_len;
                    304: 
                    305:        do {
                    306:                for (end = start; *end && !DOTP (*end); end++);
                    307: 
                    308:                if (end > start) {
                    309:                        add_len=out_len;
                    310:                        rc = idna_to_unicode_internal (start, (size_t) (end - start), out, &add_len, flags);
                    311:                        if (rc != IDNA_SUCCESS)
                    312:                                return rc;
                    313: 
                    314:                        if (add_len >= out_len)
                    315:                                return PUNYCODE_BIG_OUTPUT;
                    316: 
                    317:                        out+=add_len;
                    318:                        out_len-=add_len;
                    319:                }
                    320: 
                    321:                if (*end) {
                    322:                        if (!out_len)
                    323:                                return PUNYCODE_BIG_OUTPUT;
                    324:                        *(out++) = 0x002E;      /* '.' (full stop) */
                    325:                        out_len--;
                    326:                }
                    327: 
                    328:                start = end + 1;
                    329:        } while (*end);
                    330: 
                    331:        if (!out_len)
                    332:                return PUNYCODE_BIG_OUTPUT;
                    333: 
                    334:        *out=0;
                    335:        return IDNA_SUCCESS;
                    336: }
                    337: 
                    338: /**
                    339:  * IDNA_ACE_PREFIX
                    340:  *
                    341:  * The IANA allocated prefix to use for IDNA. "xn--"
                    342:  */
                    343: 
                    344: /**
                    345:  * Idna_rc:
                    346:  * @IDNA_SUCCESS: Successful operation.  This value is guaranteed to
                    347:  *   always be zero, the remaining ones are only guaranteed to hold
                    348:  *   non-zero values, for logical comparison purposes.
                    349:  * @IDNA_CONTAINS_NON_LDH: For IDNA_USE_STD3_ASCII_RULES, indicate that
                    350:  *   the string contains non-LDH ASCII characters.
                    351:  * @IDNA_CONTAINS_MINUS: For IDNA_USE_STD3_ASCII_RULES, indicate that
                    352:  *   the string contains a leading or trailing hyphen-minus (U+002D).
                    353:  * @IDNA_ROUNDTRIP_VERIFY_ERROR: The ToASCII operation on output
                    354:  *   string does not equal the input.
                    355:  * @IDNA_CONTAINS_ACE_PREFIX: The input contains the ACE prefix (for
                    356:  *   ToASCII).
                    357:  *
                    358:  * Enumerated return codes of idna_to_ascii_4i(),
                    359:  * idna_to_unicode_44i() functions (and functions derived from those
                    360:  * functions).  The value 0 is guaranteed to always correspond to
                    361:  * success.
                    362:  */
                    363: 
                    364: const char *pa_idna_strerror(int rc) {
                    365:        switch (rc) {
                    366:                case IDNA_SUCCESS:
                    367:                        return "Success";
                    368:                case PUNYCODE_BAD_INPUT:
                    369:                        return "Input is invalid";
                    370:                case PUNYCODE_BIG_OUTPUT:
                    371:                        return "String is too long";
                    372:                case PUNYCODE_OVERFLOW:
                    373:                        return "Wider integers needed to process input";
                    374:                case IDNA_CONTAINS_NON_LDH:
                    375:                        return "Non-digit/letter/hyphen in input";
                    376:                case IDNA_CONTAINS_MINUS:
                    377:                        return "Forbidden leading or trailing minus sign (`-')";
                    378:                case IDNA_ROUNDTRIP_VERIFY_ERROR:
                    379:                        return "String not idempotent under ToASCII";
                    380:                case IDNA_CONTAINS_ACE_PREFIX:
                    381:                        return "Input already contain ACE prefix (`xn--')";
                    382:                default:
                    383:                        return "Unknown error";
                    384:        }
                    385: }
                    386: 
                    387: /**
                    388:  * Idna_flags:
                    389:  * @IDNA_ALLOW_UNASSIGNED: Don't reject strings containing unassigned
                    390:  *   Unicode code points.
                    391:  * @IDNA_USE_STD3_ASCII_RULES: Validate strings according to STD3
                    392:  *   rules (i.e., normal host name rules).
                    393:  *
                    394:  * Flags to pass to idna_to_ascii_4i(), idna_to_unicode_44i() etc.
                    395:  */

E-mail: