Annotation of parser3/src/lib/punycode/pa_punycode.c, revision 1.1
1.1 ! moko 1: /* punycode.c --- Implementation of punycode used to ASCII encode IDN's.
! 2: Copyright (C) 2002-2013 Simon Josefsson
! 3:
! 4: This file is part of GNU Libidn.
! 5:
! 6: GNU Libidn is free software: you can redistribute it and/or
! 7: modify it under the terms of either:
! 8:
! 9: * the GNU Lesser General Public License as published by the Free
! 10: Software Foundation; either version 3 of the License, or (at
! 11: your option) any later version.
! 12:
! 13: or
! 14:
! 15: * the GNU General Public License as published by the Free
! 16: Software Foundation; either version 2 of the License, or (at
! 17: your option) any later version.
! 18:
! 19: or both in parallel, as here.
! 20:
! 21: GNU Libidn is distributed in the hope that it will be useful,
! 22: but WITHOUT ANY WARRANTY; without even the implied warranty of
! 23: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 24: General Public License for more details.
! 25:
! 26: You should have received copies of the GNU General Public License and
! 27: the GNU Lesser General Public License along with this program. If
! 28: not, see <http://www.gnu.org/licenses/>. */
! 29:
! 30: /*
! 31: * This file is derived from RFC 3492bis written by Adam M. Costello.
! 32: *
! 33: * Disclaimer and license: Regarding this entire document or any
! 34: * portion of it (including the pseudocode and C code), the author
! 35: * makes no guarantees and is not responsible for any damage resulting
! 36: * from its use. The author grants irrevocable permission to anyone
! 37: * to use, modify, and distribute it in any way that does not diminish
! 38: * the rights of anyone else to use, modify, and distribute it,
! 39: * provided that redistributed derivative works do not contain
! 40: * misleading author or version information. Derivative works need
! 41: * not be licensed under similar terms.
! 42: *
! 43: * Copyright (C) The Internet Society (2003). All Rights Reserved.
! 44: *
! 45: * This document and translations of it may be copied and furnished to
! 46: * others, and derivative works that comment on or otherwise explain it
! 47: * or assist in its implementation may be prepared, copied, published
! 48: * and distributed, in whole or in part, without restriction of any
! 49: * kind, provided that the above copyright notice and this paragraph are
! 50: * included on all such copies and derivative works. However, this
! 51: * document itself may not be modified in any way, such as by removing
! 52: * the copyright notice or references to the Internet Society or other
! 53: * Internet organizations, except as needed for the purpose of
! 54: * developing Internet standards in which case the procedures for
! 55: * copyrights defined in the Internet Standards process must be
! 56: * followed, or as required to translate it into languages other than
! 57: * English.
! 58: *
! 59: * The limited permissions granted above are perpetual and will not be
! 60: * revoked by the Internet Society or its successors or assigns.
! 61: *
! 62: * This document and the information contained herein is provided on an
! 63: * "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
! 64: * TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
! 65: * BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
! 66: * HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
! 67: * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
! 68: */
! 69:
! 70: #include "pa_punycode.h"
! 71:
! 72: /*** Bootstring parameters for Punycode ***/
! 73:
! 74: enum
! 75: { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
! 76: initial_bias = 72, initial_n = 0x80, delimiter = 0x2D
! 77: };
! 78:
! 79: /* basic(cp) tests whether cp is a basic code point: */
! 80: #define basic(cp) ((punycode_uint)(cp) < 0x80)
! 81:
! 82: /* delim(cp) tests whether cp is a delimiter: */
! 83: #define delim(cp) ((cp) == delimiter)
! 84:
! 85: /* decode_digit(cp) returns the numeric value of a basic code */
! 86: /* point (for use in representing integers) in the range 0 to */
! 87: /* base-1, or base if cp does not represent a value. */
! 88:
! 89: static punycode_uint
! 90: decode_digit (punycode_uint cp)
! 91: {
! 92: return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 :
! 93: cp - 97 < 26 ? cp - 97 : base;
! 94: }
! 95:
! 96: /* encode_digit(d,flag) returns the basic code point whose value */
! 97: /* (when used for representing integers) is d, which needs to be in */
! 98: /* the range 0 to base-1. The lowercase form is used unless flag is */
! 99: /* nonzero, in which case the uppercase form is used. The behavior */
! 100: /* is undefined if flag is nonzero and digit d has no uppercase form. */
! 101:
! 102: static char
! 103: encode_digit (punycode_uint d, int flag)
! 104: {
! 105: return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
! 106: /* 0..25 map to ASCII a..z or A..Z */
! 107: /* 26..35 map to ASCII 0..9 */
! 108: }
! 109:
! 110: /* flagged(bcp) tests whether a basic code point is flagged */
! 111: /* (uppercase). The behavior is undefined if bcp is not a */
! 112: /* basic code point. */
! 113:
! 114: #define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
! 115:
! 116: /* encode_basic(bcp,flag) forces a basic code point to lowercase */
! 117: /* if flag is zero, uppercase if flag is nonzero, and returns */
! 118: /* the resulting code point. The code point is unchanged if it */
! 119: /* is caseless. The behavior is undefined if bcp is not a basic */
! 120: /* code point. */
! 121:
! 122: static char
! 123: encode_basic (punycode_uint bcp, int flag)
! 124: {
! 125: bcp -= (bcp - 97 < 26) << 5;
! 126: return bcp + ((!flag && (bcp - 65 < 26)) << 5);
! 127: }
! 128:
! 129: /*** Platform-specific constants ***/
! 130:
! 131: /* maxint is the maximum value of a punycode_uint variable: */
! 132: static const punycode_uint maxint = -1;
! 133: /* Because maxint is unsigned, -1 becomes the maximum value. */
! 134:
! 135: /*** Bias adaptation function ***/
! 136:
! 137: static punycode_uint
! 138: adapt (punycode_uint delta, punycode_uint numpoints, int firsttime)
! 139: {
! 140: punycode_uint k;
! 141:
! 142: delta = firsttime ? delta / damp : delta >> 1;
! 143: /* delta >> 1 is a faster way of doing delta / 2 */
! 144: delta += delta / numpoints;
! 145:
! 146: for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base)
! 147: {
! 148: delta /= base - tmin;
! 149: }
! 150:
! 151: return k + (base - tmin + 1) * delta / (delta + skew);
! 152: }
! 153:
! 154: /*** Main encode function ***/
! 155:
! 156: /**
! 157: * punycode_encode:
! 158: * @input_length: The number of code points in the @input array and
! 159: * the number of flags in the @case_flags array.
! 160: * @input: An array of code points. They are presumed to be Unicode
! 161: * code points, but that is not strictly REQUIRED. The array
! 162: * contains code points, not code units. UTF-16 uses code units
! 163: * D800 through DFFF to refer to code points 10000..10FFFF. The
! 164: * code points D800..DFFF do not occur in any valid Unicode string.
! 165: * The code points that can occur in Unicode strings (0..D7FF and
! 166: * E000..10FFFF) are also called Unicode scalar values.
! 167: * @case_flags: A %NULL pointer or an array of boolean values parallel
! 168: * to the @input array. Nonzero (true, flagged) suggests that the
! 169: * corresponding Unicode character be forced to uppercase after
! 170: * being decoded (if possible), and zero (false, unflagged) suggests
! 171: * that it be forced to lowercase (if possible). ASCII code points
! 172: * (0..7F) are encoded literally, except that ASCII letters are
! 173: * forced to uppercase or lowercase according to the corresponding
! 174: * case flags. If @case_flags is a %NULL pointer then ASCII letters
! 175: * are left as they are, and other code points are treated as
! 176: * unflagged.
! 177: * @output_length: The caller passes in the maximum number of ASCII
! 178: * code points that it can receive. On successful return it will
! 179: * contain the number of ASCII code points actually output.
! 180: * @output: An array of ASCII code points. It is *not*
! 181: * null-terminated; it will contain zeros if and only if the @input
! 182: * contains zeros. (Of course the caller can leave room for a
! 183: * terminator and add one if needed.)
! 184: *
! 185: * Converts a sequence of code points (presumed to be Unicode code
! 186: * points) to Punycode.
! 187: *
! 188: * Return value: The return value can be any of the #Punycode_status
! 189: * values defined above except %PUNYCODE_BAD_INPUT. If not
! 190: * %PUNYCODE_SUCCESS, then @output_size and @output might contain
! 191: * garbage.
! 192: **/
! 193: int
! 194: punycode_encode (size_t input_length,
! 195: const punycode_uint input[],
! 196: const unsigned char case_flags[],
! 197: size_t * output_length, char output[])
! 198: {
! 199: punycode_uint input_len, n, delta, h, b, bias, j, m, q, k, t;
! 200: size_t out, max_out;
! 201:
! 202: /* The Punycode spec assumes that the input length is the same type */
! 203: /* of integer as a code point, so we need to convert the size_t to */
! 204: /* a punycode_uint, which could overflow. */
! 205:
! 206: if (input_length > maxint)
! 207: return punycode_overflow;
! 208: input_len = (punycode_uint) input_length;
! 209:
! 210: /* Initialize the state: */
! 211:
! 212: n = initial_n;
! 213: delta = 0;
! 214: out = 0;
! 215: max_out = *output_length;
! 216: bias = initial_bias;
! 217:
! 218: /* Handle the basic code points: */
! 219:
! 220: for (j = 0; j < input_len; ++j)
! 221: {
! 222: if (basic (input[j]))
! 223: {
! 224: if (max_out - out < 2)
! 225: return punycode_big_output;
! 226: output[out++] = case_flags ?
! 227: encode_basic (input[j], case_flags[j]) : (char) input[j];
! 228: }
! 229: /* else if (input[j] < n) return punycode_bad_input; */
! 230: /* (not needed for Punycode with unsigned code points) */
! 231: }
! 232:
! 233: h = b = (punycode_uint) out;
! 234: /* cannot overflow because out <= input_len <= maxint */
! 235:
! 236: /* h is the number of code points that have been handled, b is the */
! 237: /* number of basic code points, and out is the number of ASCII code */
! 238: /* points that have been output. */
! 239:
! 240: if (b > 0)
! 241: output[out++] = delimiter;
! 242:
! 243: /* Main encoding loop: */
! 244:
! 245: while (h < input_len)
! 246: {
! 247: /* All non-basic code points < n have been */
! 248: /* handled already. Find the next larger one: */
! 249:
! 250: for (m = maxint, j = 0; j < input_len; ++j)
! 251: {
! 252: /* if (basic(input[j])) continue; */
! 253: /* (not needed for Punycode) */
! 254: if (input[j] >= n && input[j] < m)
! 255: m = input[j];
! 256: }
! 257:
! 258: /* Increase delta enough to advance the decoder's */
! 259: /* <n,i> state to <m,0>, but guard against overflow: */
! 260:
! 261: if (m - n > (maxint - delta) / (h + 1))
! 262: return punycode_overflow;
! 263: delta += (m - n) * (h + 1);
! 264: n = m;
! 265:
! 266: for (j = 0; j < input_len; ++j)
! 267: {
! 268: /* Punycode does not need to check whether input[j] is basic: */
! 269: if (input[j] < n /* || basic(input[j]) */ )
! 270: {
! 271: if (++delta == 0)
! 272: return punycode_overflow;
! 273: }
! 274:
! 275: if (input[j] == n)
! 276: {
! 277: /* Represent delta as a generalized variable-length integer: */
! 278:
! 279: for (q = delta, k = base;; k += base)
! 280: {
! 281: if (out >= max_out)
! 282: return punycode_big_output;
! 283: t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
! 284: k >= bias + tmax ? tmax : k - bias;
! 285: if (q < t)
! 286: break;
! 287: output[out++] = encode_digit (t + (q - t) % (base - t), 0);
! 288: q = (q - t) / (base - t);
! 289: }
! 290:
! 291: output[out++] = encode_digit (q, case_flags && case_flags[j]);
! 292: bias = adapt (delta, h + 1, h == b);
! 293: delta = 0;
! 294: ++h;
! 295: }
! 296: }
! 297:
! 298: ++delta, ++n;
! 299: }
! 300:
! 301: *output_length = out;
! 302: return punycode_success;
! 303: }
! 304:
! 305: /*** Main decode function ***/
! 306:
! 307: /**
! 308: * punycode_decode:
! 309: * @input_length: The number of ASCII code points in the @input array.
! 310: * @input: An array of ASCII code points (0..7F).
! 311: * @output_length: The caller passes in the maximum number of code
! 312: * points that it can receive into the @output array (which is also
! 313: * the maximum number of flags that it can receive into the
! 314: * @case_flags array, if @case_flags is not a %NULL pointer). On
! 315: * successful return it will contain the number of code points
! 316: * actually output (which is also the number of flags actually
! 317: * output, if case_flags is not a null pointer). The decoder will
! 318: * never need to output more code points than the number of ASCII
! 319: * code points in the input, because of the way the encoding is
! 320: * defined. The number of code points output cannot exceed the
! 321: * maximum possible value of a punycode_uint, even if the supplied
! 322: * @output_length is greater than that.
! 323: * @output: An array of code points like the input argument of
! 324: * punycode_encode() (see above).
! 325: * @case_flags: A %NULL pointer (if the flags are not needed by the
! 326: * caller) or an array of boolean values parallel to the @output
! 327: * array. Nonzero (true, flagged) suggests that the corresponding
! 328: * Unicode character be forced to uppercase by the caller (if
! 329: * possible), and zero (false, unflagged) suggests that it be forced
! 330: * to lowercase (if possible). ASCII code points (0..7F) are output
! 331: * already in the proper case, but their flags will be set
! 332: * appropriately so that applying the flags would be harmless.
! 333: *
! 334: * Converts Punycode to a sequence of code points (presumed to be
! 335: * Unicode code points).
! 336: *
! 337: * Return value: The return value can be any of the #Punycode_status
! 338: * values defined above. If not %PUNYCODE_SUCCESS, then
! 339: * @output_length, @output, and @case_flags might contain garbage.
! 340: *
! 341: **/
! 342: int
! 343: punycode_decode (size_t input_length,
! 344: const char input[],
! 345: size_t * output_length,
! 346: punycode_uint output[], unsigned char case_flags[])
! 347: {
! 348: punycode_uint n, out, i, max_out, bias, oldi, w, k, digit, t;
! 349: size_t b, j, in;
! 350:
! 351: /* Initialize the state: */
! 352:
! 353: n = initial_n;
! 354: out = i = 0;
! 355: max_out = *output_length > maxint ? maxint
! 356: : (punycode_uint) * output_length;
! 357: bias = initial_bias;
! 358:
! 359: /* Handle the basic code points: Let b be the number of input code */
! 360: /* points before the last delimiter, or 0 if there is none, then */
! 361: /* copy the first b code points to the output. */
! 362:
! 363: for (b = j = 0; j < input_length; ++j)
! 364: if (delim (input[j]))
! 365: b = j;
! 366: if (b > max_out)
! 367: return punycode_big_output;
! 368:
! 369: for (j = 0; j < b; ++j)
! 370: {
! 371: if (case_flags)
! 372: case_flags[out] = flagged (input[j]);
! 373: if (!basic (input[j]))
! 374: return punycode_bad_input;
! 375: output[out++] = input[j];
! 376: }
! 377:
! 378: /* Main decoding loop: Start just after the last delimiter if any */
! 379: /* basic code points were copied; start at the beginning otherwise. */
! 380:
! 381: for (in = b > 0 ? b + 1 : 0; in < input_length; ++out)
! 382: {
! 383:
! 384: /* in is the index of the next ASCII code point to be consumed, */
! 385: /* and out is the number of code points in the output array. */
! 386:
! 387: /* Decode a generalized variable-length integer into delta, */
! 388: /* which gets added to i. The overflow checking is easier */
! 389: /* if we increase i as we go, then subtract off its starting */
! 390: /* value at the end to obtain delta. */
! 391:
! 392: for (oldi = i, w = 1, k = base;; k += base)
! 393: {
! 394: if (in >= input_length)
! 395: return punycode_bad_input;
! 396: digit = decode_digit (input[in++]);
! 397: if (digit >= base)
! 398: return punycode_bad_input;
! 399: if (digit > (maxint - i) / w)
! 400: return punycode_overflow;
! 401: i += digit * w;
! 402: t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
! 403: k >= bias + tmax ? tmax : k - bias;
! 404: if (digit < t)
! 405: break;
! 406: if (w > maxint / (base - t))
! 407: return punycode_overflow;
! 408: w *= (base - t);
! 409: }
! 410:
! 411: bias = adapt (i - oldi, out + 1, oldi == 0);
! 412:
! 413: /* i was supposed to wrap around from out+1 to 0, */
! 414: /* incrementing n each time, so we'll fix that now: */
! 415:
! 416: if (i / (out + 1) > maxint - n)
! 417: return punycode_overflow;
! 418: n += i / (out + 1);
! 419: i %= (out + 1);
! 420:
! 421: /* Insert n at position i of the output: */
! 422:
! 423: /* not needed for Punycode: */
! 424: /* if (basic(n)) return punycode_invalid_input; */
! 425: if (out >= max_out)
! 426: return punycode_big_output;
! 427:
! 428: if (case_flags)
! 429: {
! 430: memmove (case_flags + i + 1, case_flags + i, out - i);
! 431: /* Case of last ASCII code point determines case flag: */
! 432: case_flags[i] = flagged (input[in - 1]);
! 433: }
! 434:
! 435: memmove (output + i + 1, output + i, (out - i) * sizeof *output);
! 436: output[i++] = n;
! 437: }
! 438:
! 439: *output_length = (size_t) out;
! 440: /* cannot overflow because out <= old value of *output_length */
! 441: return punycode_success;
! 442: }
! 443:
! 444: /**
! 445: * punycode_uint
! 446: *
! 447: * Unicode code point data type, this is always a 32 bit unsigned
! 448: * integer.
! 449: */
! 450:
! 451: /**
! 452: * Punycode_status
! 453: * @PUNYCODE_SUCCESS: Successful operation. This value is guaranteed
! 454: * to always be zero, the remaining ones are only guaranteed to hold
! 455: * non-zero values, for logical comparison purposes.
! 456: * @PUNYCODE_BAD_INPUT: Input is invalid.
! 457: * @PUNYCODE_BIG_OUTPUT: Output would exceed the space provided.
! 458: * @PUNYCODE_OVERFLOW: Input needs wider integers to process.
! 459: *
! 460: * Enumerated return codes of punycode_encode() and punycode_decode().
! 461: * The value 0 is guaranteed to always correspond to success.
! 462: */
E-mail: