Annotation of parser3/src/lib/punycode/pa_idna.c, revision 1.8

1.1       moko        1: /** @file
                      2:    Parser: IDNA support, modified Libidn Version 1.28.
1.8     ! moko        3:    Copyright (c) 2001-2023 Art. Lebedev Studio (http://www.artlebedev.com)
1.1       moko        4: */
                      5: 
                      6: /* idna.c --- Prototypes for Internationalized Domain Name library.
                      7:    Copyright (C) 2002-2013 Simon Josefsson
                      8: 
                      9:    This file is part of GNU Libidn.
                     10: 
                     11:    GNU Libidn is free software: you can redistribute it and/or
                     12:    modify it under the terms of either:
                     13: 
                     14:      * the GNU Lesser General Public License as published by the Free
                     15:        Software Foundation; either version 3 of the License, or (at
                     16:        your option) any later version.
                     17: 
                     18:    or
                     19: 
                     20:      * the GNU General Public License as published by the Free
                     21:        Software Foundation; either version 2 of the License, or (at
                     22:        your option) any later version.
                     23: 
                     24:    or both in parallel, as here.
                     25: 
                     26:    GNU Libidn is distributed in the hope that it will be useful,
                     27:    but WITHOUT ANY WARRANTY; without even the implied warranty of
                     28:    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     29:    General Public License for more details.
                     30: 
                     31:    You should have received copies of the GNU General Public License and
                     32:    the GNU Lesser General Public License along with this program.  If
                     33:    not, see <http://www.gnu.org/licenses/>. */
                     34: 
                     35: #include "pa_punycode.h"
                     36: #include "pa_idna.h"
                     37: 
/* Revision identification string for this file (CVS $Id$ keyword text).
   NOTE(review): the volatile qualifier is presumably there to keep the
   string in the compiled object -- confirm. */
volatile const char * IDENT_PA_IDNA_C="$Id: pa_idna.c,v 1.7 2020/12/15 17:10:33 moko Exp $";

/* Non-zero when code point c is one of the four label separators:
   U+002E (full stop), U+3002 (ideographic full stop), U+FF0E (fullwidth
   full stop) or U+FF61 (halfwidth ideographic full stop).
   The argument is evaluated up to four times, so it must not have side
   effects. */
#define DOTP(c) ((c) == 0x002E || (c) == 0x3002 || (c) == 0xFF0E || (c) == 0xFF61)

/* Under MSVC, map strncasecmp onto _strnicmp. */
#ifdef _MSC_VER
#      define strncasecmp _strnicmp
#endif
                     45: 
1.1       moko       46: /* Core functions */
                     47: 
                     48: /**
                     49:  * idna_to_ascii_4z_internal:
                     50:  * @src: input array with unicode code points.
                     51:  * @len: length of input array with unicode code points.
                     52:  * @out: output zero terminated string that must have room for at
                     53:  *       least 63 characters plus the terminating zero.
                     54:  * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
                     55:  *   %IDNA_USE_STD3_ASCII_RULES.
                     56:  *
                     57:  * The ToASCII operation takes a sequence of Unicode code points that
                     58:  * make up one domain label and transforms it into a sequence of code
                     59:  * points in the ASCII range (0..7F). If ToASCII succeeds, the
                     60:  * original sequence and the resulting sequence are equivalent labels.
                     61:  *
                     62:  * It is important to note that the ToASCII operation can fail. ToASCII
                     63:  * fails if any step of it fails. If any step of the ToASCII operation
                     64:  * fails on any label in a domain name, that domain name MUST NOT be used
                     65:  * as an internationalized domain name. The method for deadling with this
                     66:  * failure is application-specific.
                     67:  *
                     68:  * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
                     69:  * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
                     70:  * sequence of ASCII code points or a failure condition.
                     71:  *
                     72:  * ToASCII never alters a sequence of code points that are all in the ASCII
                     73:  * range to begin with (although it could fail). Applying the ToASCII
                     74:  * operation multiple times has exactly the same effect as applying it just
                     75:  * once.
                     76:  *
                     77:  * Return value: Returns 0 on success, or an #Idna_rc error code.
                     78:  */
                     79: 
                     80: static int idna_to_ascii_4i_internal (const uint32_t *src, size_t len, char *out, int flags) {
                     81:        int rc;
                     82:        size_t out_len;
                     83: 
                     84:        /*
                     85:         * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
                     86:         *
                     87:         * (a) Verify the absence of non-LDH ASCII code points; that is,
                     88:         * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
                     89:         *
                     90:         * (b) Verify the absence of leading and trailing hyphen-minus;
                     91:         * that is, the absence of U+002D at the beginning and end of
                     92:         * the sequence.
                     93:         */
                     94: 
                     95:        if (flags & IDNA_USE_STD3_ASCII_RULES) {
                     96:                size_t i;
                     97: 
                     98:                for (i = 0; i < len; i++)
                     99:                        if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
                    100:                                (src[i] >= 0x3A && src[i] <= 0x40) ||
                    101:                                (src[i] >= 0x5B && src[i] <= 0x60) ||
                    102:                                (src[i] >= 0x7B && src[i] <= 0x7F))
                    103:                                return IDNA_CONTAINS_NON_LDH;
                    104: 
                    105:                if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
                    106:                        return IDNA_CONTAINS_MINUS;
                    107:        }
                    108: 
                    109:        /*
                    110:         * 4. If all code points in the sequence are in the ASCII range
                    111:         * (0..7F), then skip to step 8.
                    112:         */
                    113: 
                    114:        {
                    115:                size_t i;
                    116: 
                    117:                for (i = 0; i < len; i++) {
                    118:                        if (src[i] > 0x7F)
                    119:                                goto step5;
                    120:                }
                    121: 
                    122:                if (len > 63)
                    123:                        return PUNYCODE_BIG_OUTPUT;
                    124: 
                    125:                /* copy string to output buffer if we are about to skip to step8 */
                    126:                for (i = 0; i < len; i++)
1.2       moko      127:                        out[i]=(char)src[i];
1.1       moko      128:                out[len] = '\0';
                    129:                goto step8;
                    130:        }
                    131: 
                    132:        /*
                    133:         * 5. Verify that the sequence does NOT begin with the ACE prefix.
                    134:         */
                    135: 
                    136: step5:
                    137:        if (len >= strlen (IDNA_ACE_PREFIX)) {
                    138:                size_t i;
                    139: 
                    140:                for (i = 0; i < strlen (IDNA_ACE_PREFIX); i++)
                    141:                        if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
                    142:                                goto step6;
                    143:                 return IDNA_CONTAINS_ACE_PREFIX;
                    144:        }
                    145: 
                    146:        /*
                    147:         * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
                    148:         * and fail if there is an error.
                    149:         */
                    150: 
                    151: step6:
                    152:        out_len = 63 - strlen (IDNA_ACE_PREFIX);
                    153:        rc = punycode_encode (len, src, NULL, &out_len, out + strlen(IDNA_ACE_PREFIX));
                    154: 
                    155:        if (rc != PUNYCODE_SUCCESS)
                    156:                return rc;
                    157: 
                    158:        out[strlen (IDNA_ACE_PREFIX) + out_len] = '\0';
                    159: 
                    160:        /*
                    161:         * 7. Prepend the ACE prefix.
                    162:         */
                    163: 
                    164:        memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
                    165: 
                    166: step8:
                    167:        return IDNA_SUCCESS;
                    168: }
                    169: 
                    170: /**
                    171:  * idna_to_ascii_4z:
                    172:  * @in: zero terminated input Unicode string.
                    173:  * @out: pointer to output string.
                    174:  * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
                    175:  *   %IDNA_USE_STD3_ASCII_RULES.
                    176:  *
                    177:  * Convert UCS-4 domain name to ASCII string.  The domain name may
                    178:  * contain several labels, separated by dots.
                    179:  *
                    180:  * Return value: Returns %IDNA_SUCCESS on success, or error code.
                    181:  **/
                    182: int pa_idna_to_ascii_4z (const uint32_t *in, char *out, size_t out_len, int flags) {
                    183:        int rc;
                    184:        const uint32_t *start = in;
                    185:        const uint32_t *end;
                    186:        size_t add_len;
                    187:        char buf[64];
                    188: 
                    189:        /* 1) Whenever dots are used as label separators, the following
                    190:           characters MUST be recognized as dots: U+002E (full stop),
                    191:           U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
                    192:           U+FF61 (halfwidth ideographic full stop). */
                    193: 
                    194:        do {
                    195:                for (end = start; *end && !DOTP (*end); end++);
                    196: 
                    197:                if (end > start) {
                    198:                        rc = idna_to_ascii_4i_internal (start, (size_t) (end - start), buf, flags);
                    199:                        if (rc != IDNA_SUCCESS)
                    200:                                return rc;
                    201: 
                    202:                        add_len = strlen (buf);
                    203:                        if (add_len >= out_len)
                    204:                                return PUNYCODE_BIG_OUTPUT;
                    205: 
                    206:                        memcpy (out, buf, add_len);
                    207:                        out += add_len;
                    208:                        out_len -= add_len;
                    209:                }
                    210: 
                    211:                if (*end) {
                    212:                        if (!out_len)
                    213:                                return PUNYCODE_BIG_OUTPUT;
                    214:                        *(out++)='.';
                    215:                        out_len--;
                    216:                }
                    217: 
                    218:                start = end + 1;
                    219:        } while (*end);
                    220: 
                    221:        if (!out_len)
                    222:                return PUNYCODE_BIG_OUTPUT;
                    223: 
                    224:        *out='\0';
                    225: 
                    226:        return IDNA_SUCCESS;
                    227: }
                    228: 
                    229: /* ToUnicode(). */
                    230: static int idna_to_unicode_internal (const char *in, size_t in_len, uint32_t *out, size_t *out_len, int flags) {
                    231:        int rc;
                    232:        char tmpout[64];
                    233: 
                    234:        /* 3. Verify that the sequence begins with the ACE prefix
                    235:         * ... The ToASCII and ToUnicode operations MUST recognize the ACE
                    236:         prefix in a case-insensitive manner.
                    237:         */
                    238: 
                    239:        if ( (in_len < strlen (IDNA_ACE_PREFIX)) || (strncasecmp (in, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX)) != 0) ){
                    240:                size_t i;
                    241:                if (in_len >= *out_len)
                    242:                        return PUNYCODE_BIG_OUTPUT;
                    243: 
                    244:                for(i=0; i<in_len; i++){
                    245:                        if ((unsigned char)in[i] > 0x7F)
                    246:                                return PUNYCODE_BAD_INPUT;
                    247:                        out[i]=in[i];
                    248:                }
                    249:                *out_len=in_len;
                    250: 
                    251:                return IDNA_SUCCESS;
                    252:        } else {
                    253:                /* 4. Remove the ACE prefix.
                    254:                 */
                    255: 
                    256:                in += strlen (IDNA_ACE_PREFIX);
                    257:                in_len-=strlen (IDNA_ACE_PREFIX);
                    258: 
                    259:                /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
                    260:                 * and fail if there is an error. Save a copy of the result of
                    261:                 * this step.
                    262:                 */
                    263: 
                    264:                rc = punycode_decode (in_len, in, out_len, out, NULL);
                    265:                if (rc != PUNYCODE_SUCCESS)
                    266:                        return rc;
                    267: 
                    268:                /* 6. Apply ToASCII.
                    269:                 */
                    270: 
                    271:                rc = idna_to_ascii_4i_internal (out, *out_len, tmpout, flags);
                    272:                if (rc != IDNA_SUCCESS)
                    273:                        return rc;
                    274: 
                    275:                /* 7. Verify that the result of step 6 matches the saved copy from
                    276:                 * step 3, using a case-insensitive ASCII comparison.
                    277:                 */
                    278: 
                    279:                if (strncasecmp (in, tmpout + strlen (IDNA_ACE_PREFIX), in_len) != 0)
                    280:                        return IDNA_ROUNDTRIP_VERIFY_ERROR;
                    281: 
                    282:                /* 8. Return the saved copy from step 5.
                    283:                 */
                    284: 
                    285:                return IDNA_SUCCESS;
                    286:        }
                    287: }
                    288: 
                    289: /**
                    290:  * idna_to_unicode_4z:
                    291:  * @in: zero-terminated string.
                    292:  * @output: pointer to output Unicode string.
                    293:  * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
                    294:  *   %IDNA_USE_STD3_ASCII_RULES.
                    295:  *
                    296:  * Convert possibly ACE encoded domain name into a
                    297:  * UCS-4 string.  The domain name may contain several labels,
                    298:  * separated by dots.  The output buffer must be deallocated by the
                    299:  * caller.
                    300:  *
                    301:  * Return value: Returns %IDNA_SUCCESS on success, or error code.
                    302:  **/
                    303: int pa_idna_to_unicode_4z (const char *in, uint32_t *out, size_t out_len, int flags) {
                    304:        int rc;
                    305:        const char *start = in;
                    306:        const char *end;
                    307:        size_t add_len;
                    308: 
                    309:        do {
1.5       moko      310:                for (end = start; *end && (*end != '.'); end++);
1.1       moko      311: 
                    312:                if (end > start) {
                    313:                        add_len=out_len;
                    314:                        rc = idna_to_unicode_internal (start, (size_t) (end - start), out, &add_len, flags);
                    315:                        if (rc != IDNA_SUCCESS)
                    316:                                return rc;
                    317: 
                    318:                        if (add_len >= out_len)
                    319:                                return PUNYCODE_BIG_OUTPUT;
                    320: 
                    321:                        out+=add_len;
                    322:                        out_len-=add_len;
                    323:                }
                    324: 
                    325:                if (*end) {
                    326:                        if (!out_len)
                    327:                                return PUNYCODE_BIG_OUTPUT;
                    328:                        *(out++) = 0x002E;      /* '.' (full stop) */
                    329:                        out_len--;
                    330:                }
                    331: 
                    332:                start = end + 1;
                    333:        } while (*end);
                    334: 
                    335:        if (!out_len)
                    336:                return PUNYCODE_BIG_OUTPUT;
                    337: 
                    338:        *out=0;
                    339:        return IDNA_SUCCESS;
                    340: }
                    341: 
                    342: /**
                    343:  * IDNA_ACE_PREFIX
                    344:  *
                    345:  * The IANA allocated prefix to use for IDNA. "xn--"
                    346:  */
                    347: 
                    348: /**
                    349:  * Idna_rc:
                    350:  * @IDNA_SUCCESS: Successful operation.  This value is guaranteed to
                    351:  *   always be zero, the remaining ones are only guaranteed to hold
                    352:  *   non-zero values, for logical comparison purposes.
                    353:  * @IDNA_CONTAINS_NON_LDH: For IDNA_USE_STD3_ASCII_RULES, indicate that
                    354:  *   the string contains non-LDH ASCII characters.
                    355:  * @IDNA_CONTAINS_MINUS: For IDNA_USE_STD3_ASCII_RULES, indicate that
                    356:  *   the string contains a leading or trailing hyphen-minus (U+002D).
                    357:  * @IDNA_ROUNDTRIP_VERIFY_ERROR: The ToASCII operation on output
                    358:  *   string does not equal the input.
                    359:  * @IDNA_CONTAINS_ACE_PREFIX: The input contains the ACE prefix (for
                    360:  *   ToASCII).
                    361:  *
                    362:  * Enumerated return codes of idna_to_ascii_4i(),
                    363:  * idna_to_unicode_44i() functions (and functions derived from those
                    364:  * functions).  The value 0 is guaranteed to always correspond to
                    365:  * success.
                    366:  */
                    367: 
                    368: const char *pa_idna_strerror(int rc) {
                    369:        switch (rc) {
                    370:                case IDNA_SUCCESS:
                    371:                        return "Success";
                    372:                case PUNYCODE_BAD_INPUT:
                    373:                        return "Input is invalid";
                    374:                case PUNYCODE_BIG_OUTPUT:
                    375:                        return "String is too long";
                    376:                case PUNYCODE_OVERFLOW:
                    377:                        return "Wider integers needed to process input";
                    378:                case IDNA_CONTAINS_NON_LDH:
                    379:                        return "Non-digit/letter/hyphen in input";
                    380:                case IDNA_CONTAINS_MINUS:
                    381:                        return "Forbidden leading or trailing minus sign (`-')";
                    382:                case IDNA_ROUNDTRIP_VERIFY_ERROR:
                    383:                        return "String not idempotent under ToASCII";
                    384:                case IDNA_CONTAINS_ACE_PREFIX:
                    385:                        return "Input already contain ACE prefix (`xn--')";
                    386:                default:
                    387:                        return "Unknown error";
                    388:        }
                    389: }
                    390: 
                    391: /**
                    392:  * Idna_flags:
                    393:  * @IDNA_ALLOW_UNASSIGNED: Don't reject strings containing unassigned
                    394:  *   Unicode code points.
                    395:  * @IDNA_USE_STD3_ASCII_RULES: Validate strings according to STD3
                    396:  *   rules (i.e., normal host name rules).
                    397:  *
                    398:  * Flags to pass to idna_to_ascii_4i(), idna_to_unicode_44i() etc.
                    399:  */

E-mail: