Annotation of parser3/src/lib/punycode/pa_idna.c, revision 1.1
1.1 ! moko 1: /** @file
! 2: Parser: IDNA support, modified Libidn Version 1.28.
! 3: Copyright (c) 2001-2015 Art. Lebedev Studio (http://www.artlebedev.com)
! 4: */
! 5:
! 6: /* idna.c --- Prototypes for Internationalized Domain Name library.
! 7: Copyright (C) 2002-2013 Simon Josefsson
! 8:
! 9: This file is part of GNU Libidn.
! 10:
! 11: GNU Libidn is free software: you can redistribute it and/or
! 12: modify it under the terms of either:
! 13:
! 14: * the GNU Lesser General Public License as published by the Free
! 15: Software Foundation; either version 3 of the License, or (at
! 16: your option) any later version.
! 17:
! 18: or
! 19:
! 20: * the GNU General Public License as published by the Free
! 21: Software Foundation; either version 2 of the License, or (at
! 22: your option) any later version.
! 23:
! 24: or both in parallel, as here.
! 25:
! 26: GNU Libidn is distributed in the hope that it will be useful,
! 27: but WITHOUT ANY WARRANTY; without even the implied warranty of
! 28: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 29: General Public License for more details.
! 30:
! 31: You should have received copies of the GNU General Public License and
! 32: the GNU Lesser General Public License along with this program. If
! 33: not, see <http://www.gnu.org/licenses/>. */
! 34:
! 35: #include "pa_punycode.h"
! 36: #include "pa_idna.h"
! 37:
! 38: volatile const char * IDENT_PA_IDNA_C="$Id: pa_idna.C,v 1.1 2015/04/14 21:09:57 moko Exp $";
! 39:
/* Non-zero when code point c is one of the four characters that are
   treated as label separators (dots).  The argument is evaluated more
   than once, so it must be free of side effects. */
#define DOTP(c) \
	((c) == 0x002E /* full stop */ \
	 || (c) == 0x3002 /* ideographic full stop */ \
	 || (c) == 0xFF0E /* fullwidth full stop */ \
	 || (c) == 0xFF61 /* halfwidth ideographic full stop */)
! 41:
! 42: /* Core functions */
! 43:
! 44: /**
! 45: * idna_to_ascii_4z_internal:
! 46: * @src: input array with unicode code points.
! 47: * @len: length of input array with unicode code points.
! 48: * @out: output zero terminated string that must have room for at
! 49: * least 63 characters plus the terminating zero.
! 50: * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
! 51: * %IDNA_USE_STD3_ASCII_RULES.
! 52: *
! 53: * The ToASCII operation takes a sequence of Unicode code points that
! 54: * make up one domain label and transforms it into a sequence of code
! 55: * points in the ASCII range (0..7F). If ToASCII succeeds, the
! 56: * original sequence and the resulting sequence are equivalent labels.
! 57: *
! 58: * It is important to note that the ToASCII operation can fail. ToASCII
! 59: * fails if any step of it fails. If any step of the ToASCII operation
! 60: * fails on any label in a domain name, that domain name MUST NOT be used
! 61: * as an internationalized domain name. The method for deadling with this
! 62: * failure is application-specific.
! 63: *
! 64: * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
! 65: * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
! 66: * sequence of ASCII code points or a failure condition.
! 67: *
! 68: * ToASCII never alters a sequence of code points that are all in the ASCII
! 69: * range to begin with (although it could fail). Applying the ToASCII
! 70: * operation multiple times has exactly the same effect as applying it just
! 71: * once.
! 72: *
! 73: * Return value: Returns 0 on success, or an #Idna_rc error code.
! 74: */
! 75:
! 76: static int idna_to_ascii_4i_internal (const uint32_t *src, size_t len, char *out, int flags) {
! 77: int rc;
! 78: size_t out_len;
! 79:
! 80: /*
! 81: * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
! 82: *
! 83: * (a) Verify the absence of non-LDH ASCII code points; that is,
! 84: * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
! 85: *
! 86: * (b) Verify the absence of leading and trailing hyphen-minus;
! 87: * that is, the absence of U+002D at the beginning and end of
! 88: * the sequence.
! 89: */
! 90:
! 91: if (flags & IDNA_USE_STD3_ASCII_RULES) {
! 92: size_t i;
! 93:
! 94: for (i = 0; i < len; i++)
! 95: if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
! 96: (src[i] >= 0x3A && src[i] <= 0x40) ||
! 97: (src[i] >= 0x5B && src[i] <= 0x60) ||
! 98: (src[i] >= 0x7B && src[i] <= 0x7F))
! 99: return IDNA_CONTAINS_NON_LDH;
! 100:
! 101: if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
! 102: return IDNA_CONTAINS_MINUS;
! 103: }
! 104:
! 105: /*
! 106: * 4. If all code points in the sequence are in the ASCII range
! 107: * (0..7F), then skip to step 8.
! 108: */
! 109:
! 110: {
! 111: size_t i;
! 112:
! 113: for (i = 0; i < len; i++) {
! 114: if (src[i] > 0x7F)
! 115: goto step5;
! 116: }
! 117:
! 118: if (len > 63)
! 119: return PUNYCODE_BIG_OUTPUT;
! 120:
! 121: /* copy string to output buffer if we are about to skip to step8 */
! 122: for (i = 0; i < len; i++)
! 123: out[i]=src[i];
! 124: out[len] = '\0';
! 125: goto step8;
! 126: }
! 127:
! 128: /*
! 129: * 5. Verify that the sequence does NOT begin with the ACE prefix.
! 130: */
! 131:
! 132: step5:
! 133: if (len >= strlen (IDNA_ACE_PREFIX)) {
! 134: size_t i;
! 135:
! 136: for (i = 0; i < strlen (IDNA_ACE_PREFIX); i++)
! 137: if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
! 138: goto step6;
! 139: return IDNA_CONTAINS_ACE_PREFIX;
! 140: }
! 141:
! 142: /*
! 143: * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
! 144: * and fail if there is an error.
! 145: */
! 146:
! 147: step6:
! 148: out_len = 63 - strlen (IDNA_ACE_PREFIX);
! 149: rc = punycode_encode (len, src, NULL, &out_len, out + strlen(IDNA_ACE_PREFIX));
! 150:
! 151: if (rc != PUNYCODE_SUCCESS)
! 152: return rc;
! 153:
! 154: out[strlen (IDNA_ACE_PREFIX) + out_len] = '\0';
! 155:
! 156: /*
! 157: * 7. Prepend the ACE prefix.
! 158: */
! 159:
! 160: memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
! 161:
! 162: step8:
! 163: return IDNA_SUCCESS;
! 164: }
! 165:
! 166: /**
! 167: * idna_to_ascii_4z:
! 168: * @in: zero terminated input Unicode string.
! 169: * @out: pointer to output string.
! 170: * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
! 171: * %IDNA_USE_STD3_ASCII_RULES.
! 172: *
! 173: * Convert UCS-4 domain name to ASCII string. The domain name may
! 174: * contain several labels, separated by dots.
! 175: *
! 176: * Return value: Returns %IDNA_SUCCESS on success, or error code.
! 177: **/
! 178: int pa_idna_to_ascii_4z (const uint32_t *in, char *out, size_t out_len, int flags) {
! 179: int rc;
! 180: const uint32_t *start = in;
! 181: const uint32_t *end;
! 182: size_t add_len;
! 183: char buf[64];
! 184:
! 185: /* 1) Whenever dots are used as label separators, the following
! 186: characters MUST be recognized as dots: U+002E (full stop),
! 187: U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
! 188: U+FF61 (halfwidth ideographic full stop). */
! 189:
! 190: do {
! 191: for (end = start; *end && !DOTP (*end); end++);
! 192:
! 193: if (end > start) {
! 194: rc = idna_to_ascii_4i_internal (start, (size_t) (end - start), buf, flags);
! 195: if (rc != IDNA_SUCCESS)
! 196: return rc;
! 197:
! 198: add_len = strlen (buf);
! 199: if (add_len >= out_len)
! 200: return PUNYCODE_BIG_OUTPUT;
! 201:
! 202: memcpy (out, buf, add_len);
! 203: out += add_len;
! 204: out_len -= add_len;
! 205: }
! 206:
! 207: if (*end) {
! 208: if (!out_len)
! 209: return PUNYCODE_BIG_OUTPUT;
! 210: *(out++)='.';
! 211: out_len--;
! 212: }
! 213:
! 214: start = end + 1;
! 215: } while (*end);
! 216:
! 217: if (!out_len)
! 218: return PUNYCODE_BIG_OUTPUT;
! 219:
! 220: *out='\0';
! 221:
! 222: return IDNA_SUCCESS;
! 223: }
! 224:
! 225: /* ToUnicode(). */
! 226: static int idna_to_unicode_internal (const char *in, size_t in_len, uint32_t *out, size_t *out_len, int flags) {
! 227: int rc;
! 228: char tmpout[64];
! 229:
! 230: /* 3. Verify that the sequence begins with the ACE prefix
! 231: * ... The ToASCII and ToUnicode operations MUST recognize the ACE
! 232: prefix in a case-insensitive manner.
! 233: */
! 234:
! 235: if ( (in_len < strlen (IDNA_ACE_PREFIX)) || (strncasecmp (in, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX)) != 0) ){
! 236: size_t i;
! 237: if (in_len >= *out_len)
! 238: return PUNYCODE_BIG_OUTPUT;
! 239:
! 240: for(i=0; i<in_len; i++){
! 241: if ((unsigned char)in[i] > 0x7F)
! 242: return PUNYCODE_BAD_INPUT;
! 243: out[i]=in[i];
! 244: }
! 245: *out_len=in_len;
! 246:
! 247: return IDNA_SUCCESS;
! 248: } else {
! 249: /* 4. Remove the ACE prefix.
! 250: */
! 251:
! 252: in += strlen (IDNA_ACE_PREFIX);
! 253: in_len-=strlen (IDNA_ACE_PREFIX);
! 254:
! 255: /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
! 256: * and fail if there is an error. Save a copy of the result of
! 257: * this step.
! 258: */
! 259:
! 260: rc = punycode_decode (in_len, in, out_len, out, NULL);
! 261: if (rc != PUNYCODE_SUCCESS)
! 262: return rc;
! 263:
! 264: /* 6. Apply ToASCII.
! 265: */
! 266:
! 267: rc = idna_to_ascii_4i_internal (out, *out_len, tmpout, flags);
! 268: if (rc != IDNA_SUCCESS)
! 269: return rc;
! 270:
! 271: /* 7. Verify that the result of step 6 matches the saved copy from
! 272: * step 3, using a case-insensitive ASCII comparison.
! 273: */
! 274:
! 275: if (strncasecmp (in, tmpout + strlen (IDNA_ACE_PREFIX), in_len) != 0)
! 276: return IDNA_ROUNDTRIP_VERIFY_ERROR;
! 277:
! 278: /* 8. Return the saved copy from step 5.
! 279: */
! 280:
! 281: return IDNA_SUCCESS;
! 282: }
! 283: }
! 284:
! 285: /**
! 286: * idna_to_unicode_4z:
! 287: * @in: zero-terminated string.
! 288: * @output: pointer to output Unicode string.
! 289: * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
! 290: * %IDNA_USE_STD3_ASCII_RULES.
! 291: *
! 292: * Convert possibly ACE encoded domain name into a
! 293: * UCS-4 string. The domain name may contain several labels,
! 294: * separated by dots. The output buffer must be deallocated by the
! 295: * caller.
! 296: *
! 297: * Return value: Returns %IDNA_SUCCESS on success, or error code.
! 298: **/
! 299: int pa_idna_to_unicode_4z (const char *in, uint32_t *out, size_t out_len, int flags) {
! 300: int rc;
! 301: const char *start = in;
! 302: const char *end;
! 303: size_t add_len;
! 304:
! 305: do {
! 306: for (end = start; *end && !DOTP (*end); end++);
! 307:
! 308: if (end > start) {
! 309: add_len=out_len;
! 310: rc = idna_to_unicode_internal (start, (size_t) (end - start), out, &add_len, flags);
! 311: if (rc != IDNA_SUCCESS)
! 312: return rc;
! 313:
! 314: if (add_len >= out_len)
! 315: return PUNYCODE_BIG_OUTPUT;
! 316:
! 317: out+=add_len;
! 318: out_len-=add_len;
! 319: }
! 320:
! 321: if (*end) {
! 322: if (!out_len)
! 323: return PUNYCODE_BIG_OUTPUT;
! 324: *(out++) = 0x002E; /* '.' (full stop) */
! 325: out_len--;
! 326: }
! 327:
! 328: start = end + 1;
! 329: } while (*end);
! 330:
! 331: if (!out_len)
! 332: return PUNYCODE_BIG_OUTPUT;
! 333:
! 334: *out=0;
! 335: return IDNA_SUCCESS;
! 336: }
! 337:
! 338: /**
! 339: * IDNA_ACE_PREFIX
! 340: *
! 341: * The IANA allocated prefix to use for IDNA. "xn--"
! 342: */
! 343:
! 344: /**
! 345: * Idna_rc:
! 346: * @IDNA_SUCCESS: Successful operation. This value is guaranteed to
! 347: * always be zero, the remaining ones are only guaranteed to hold
! 348: * non-zero values, for logical comparison purposes.
! 349: * @IDNA_CONTAINS_NON_LDH: For IDNA_USE_STD3_ASCII_RULES, indicate that
! 350: * the string contains non-LDH ASCII characters.
! 351: * @IDNA_CONTAINS_MINUS: For IDNA_USE_STD3_ASCII_RULES, indicate that
! 352: * the string contains a leading or trailing hyphen-minus (U+002D).
! 353: * @IDNA_ROUNDTRIP_VERIFY_ERROR: The ToASCII operation on output
! 354: * string does not equal the input.
! 355: * @IDNA_CONTAINS_ACE_PREFIX: The input contains the ACE prefix (for
! 356: * ToASCII).
! 357: *
! 358: * Enumerated return codes of idna_to_ascii_4i(),
! 359: * idna_to_unicode_44i() functions (and functions derived from those
! 360: * functions). The value 0 is guaranteed to always correspond to
! 361: * success.
! 362: */
! 363:
! 364: const char *pa_idna_strerror(int rc) {
! 365: switch (rc) {
! 366: case IDNA_SUCCESS:
! 367: return "Success";
! 368: case PUNYCODE_BAD_INPUT:
! 369: return "Input is invalid";
! 370: case PUNYCODE_BIG_OUTPUT:
! 371: return "String is too long";
! 372: case PUNYCODE_OVERFLOW:
! 373: return "Wider integers needed to process input";
! 374: case IDNA_CONTAINS_NON_LDH:
! 375: return "Non-digit/letter/hyphen in input";
! 376: case IDNA_CONTAINS_MINUS:
! 377: return "Forbidden leading or trailing minus sign (`-')";
! 378: case IDNA_ROUNDTRIP_VERIFY_ERROR:
! 379: return "String not idempotent under ToASCII";
! 380: case IDNA_CONTAINS_ACE_PREFIX:
! 381: return "Input already contain ACE prefix (`xn--')";
! 382: default:
! 383: return "Unknown error";
! 384: }
! 385: }
! 386:
! 387: /**
! 388: * Idna_flags:
! 389: * @IDNA_ALLOW_UNASSIGNED: Don't reject strings containing unassigned
! 390: * Unicode code points.
! 391: * @IDNA_USE_STD3_ASCII_RULES: Validate strings according to STD3
! 392: * rules (i.e., normal host name rules).
! 393: *
! 394: * Flags to pass to idna_to_ascii_4i(), idna_to_unicode_44i() etc.
! 395: */
E-mail: