Annotation of parser3/src/lib/punycode/pa_idna.c, revision 1.2
1.1 moko 1: /** @file
2: Parser: IDNA support, modified Libidn Version 1.28.
3: Copyright (c) 2001-2015 Art. Lebedev Studio (http://www.artlebedev.com)
4: */
5:
6: /* idna.c --- Prototypes for Internationalized Domain Name library.
7: Copyright (C) 2002-2013 Simon Josefsson
8:
9: This file is part of GNU Libidn.
10:
11: GNU Libidn is free software: you can redistribute it and/or
12: modify it under the terms of either:
13:
14: * the GNU Lesser General Public License as published by the Free
15: Software Foundation; either version 3 of the License, or (at
16: your option) any later version.
17:
18: or
19:
20: * the GNU General Public License as published by the Free
21: Software Foundation; either version 2 of the License, or (at
22: your option) any later version.
23:
24: or both in parallel, as here.
25:
26: GNU Libidn is distributed in the hope that it will be useful,
27: but WITHOUT ANY WARRANTY; without even the implied warranty of
28: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
29: General Public License for more details.
30:
31: You should have received copies of the GNU General Public License and
32: the GNU Lesser General Public License along with this program. If
33: not, see <http://www.gnu.org/licenses/>. */
34:
35: #include "pa_punycode.h"
36: #include "pa_idna.h"
37:
/* Identification string for this source file (a "$Id$"-style keyword value). */
volatile const char * IDENT_PA_IDNA_C="$Id: pa_idna.c,v 1.1 2015/04/14 21:42:52 moko Exp $";

/* Non-zero when code point c is one of U+002E, U+3002, U+FF0E or U+FF61,
   the characters this file treats as label separators.
   The argument may be evaluated up to four times, so it must not have
   side effects. */
#define DOTP(c) ((c) == 0x002E || (c) == 0x3002 || (c) == 0xFF0E || (c) == 0xFF61)
41:
42: /* Core functions */
43:
44: /**
45: * idna_to_ascii_4z_internal:
46: * @src: input array with unicode code points.
47: * @len: length of input array with unicode code points.
48: * @out: output zero terminated string that must have room for at
49: * least 63 characters plus the terminating zero.
50: * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
51: * %IDNA_USE_STD3_ASCII_RULES.
52: *
53: * The ToASCII operation takes a sequence of Unicode code points that
54: * make up one domain label and transforms it into a sequence of code
55: * points in the ASCII range (0..7F). If ToASCII succeeds, the
56: * original sequence and the resulting sequence are equivalent labels.
57: *
58: * It is important to note that the ToASCII operation can fail. ToASCII
59: * fails if any step of it fails. If any step of the ToASCII operation
60: * fails on any label in a domain name, that domain name MUST NOT be used
61: * as an internationalized domain name. The method for deadling with this
62: * failure is application-specific.
63: *
64: * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
65: * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
66: * sequence of ASCII code points or a failure condition.
67: *
68: * ToASCII never alters a sequence of code points that are all in the ASCII
69: * range to begin with (although it could fail). Applying the ToASCII
70: * operation multiple times has exactly the same effect as applying it just
71: * once.
72: *
73: * Return value: Returns 0 on success, or an #Idna_rc error code.
74: */
75:
76: static int idna_to_ascii_4i_internal (const uint32_t *src, size_t len, char *out, int flags) {
77: int rc;
78: size_t out_len;
79:
80: /*
81: * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
82: *
83: * (a) Verify the absence of non-LDH ASCII code points; that is,
84: * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
85: *
86: * (b) Verify the absence of leading and trailing hyphen-minus;
87: * that is, the absence of U+002D at the beginning and end of
88: * the sequence.
89: */
90:
91: if (flags & IDNA_USE_STD3_ASCII_RULES) {
92: size_t i;
93:
94: for (i = 0; i < len; i++)
95: if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
96: (src[i] >= 0x3A && src[i] <= 0x40) ||
97: (src[i] >= 0x5B && src[i] <= 0x60) ||
98: (src[i] >= 0x7B && src[i] <= 0x7F))
99: return IDNA_CONTAINS_NON_LDH;
100:
101: if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
102: return IDNA_CONTAINS_MINUS;
103: }
104:
105: /*
106: * 4. If all code points in the sequence are in the ASCII range
107: * (0..7F), then skip to step 8.
108: */
109:
110: {
111: size_t i;
112:
113: for (i = 0; i < len; i++) {
114: if (src[i] > 0x7F)
115: goto step5;
116: }
117:
118: if (len > 63)
119: return PUNYCODE_BIG_OUTPUT;
120:
121: /* copy string to output buffer if we are about to skip to step8 */
122: for (i = 0; i < len; i++)
1.2 ! moko 123: out[i]=(char)src[i];
1.1 moko 124: out[len] = '\0';
125: goto step8;
126: }
127:
128: /*
129: * 5. Verify that the sequence does NOT begin with the ACE prefix.
130: */
131:
132: step5:
133: if (len >= strlen (IDNA_ACE_PREFIX)) {
134: size_t i;
135:
136: for (i = 0; i < strlen (IDNA_ACE_PREFIX); i++)
137: if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
138: goto step6;
139: return IDNA_CONTAINS_ACE_PREFIX;
140: }
141:
142: /*
143: * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
144: * and fail if there is an error.
145: */
146:
147: step6:
148: out_len = 63 - strlen (IDNA_ACE_PREFIX);
149: rc = punycode_encode (len, src, NULL, &out_len, out + strlen(IDNA_ACE_PREFIX));
150:
151: if (rc != PUNYCODE_SUCCESS)
152: return rc;
153:
154: out[strlen (IDNA_ACE_PREFIX) + out_len] = '\0';
155:
156: /*
157: * 7. Prepend the ACE prefix.
158: */
159:
160: memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
161:
162: step8:
163: return IDNA_SUCCESS;
164: }
165:
166: /**
167: * idna_to_ascii_4z:
168: * @in: zero terminated input Unicode string.
169: * @out: pointer to output string.
170: * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
171: * %IDNA_USE_STD3_ASCII_RULES.
172: *
173: * Convert UCS-4 domain name to ASCII string. The domain name may
174: * contain several labels, separated by dots.
175: *
176: * Return value: Returns %IDNA_SUCCESS on success, or error code.
177: **/
178: int pa_idna_to_ascii_4z (const uint32_t *in, char *out, size_t out_len, int flags) {
179: int rc;
180: const uint32_t *start = in;
181: const uint32_t *end;
182: size_t add_len;
183: char buf[64];
184:
185: /* 1) Whenever dots are used as label separators, the following
186: characters MUST be recognized as dots: U+002E (full stop),
187: U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
188: U+FF61 (halfwidth ideographic full stop). */
189:
190: do {
191: for (end = start; *end && !DOTP (*end); end++);
192:
193: if (end > start) {
194: rc = idna_to_ascii_4i_internal (start, (size_t) (end - start), buf, flags);
195: if (rc != IDNA_SUCCESS)
196: return rc;
197:
198: add_len = strlen (buf);
199: if (add_len >= out_len)
200: return PUNYCODE_BIG_OUTPUT;
201:
202: memcpy (out, buf, add_len);
203: out += add_len;
204: out_len -= add_len;
205: }
206:
207: if (*end) {
208: if (!out_len)
209: return PUNYCODE_BIG_OUTPUT;
210: *(out++)='.';
211: out_len--;
212: }
213:
214: start = end + 1;
215: } while (*end);
216:
217: if (!out_len)
218: return PUNYCODE_BIG_OUTPUT;
219:
220: *out='\0';
221:
222: return IDNA_SUCCESS;
223: }
224:
225: /* ToUnicode(). */
226: static int idna_to_unicode_internal (const char *in, size_t in_len, uint32_t *out, size_t *out_len, int flags) {
227: int rc;
228: char tmpout[64];
229:
230: /* 3. Verify that the sequence begins with the ACE prefix
231: * ... The ToASCII and ToUnicode operations MUST recognize the ACE
232: prefix in a case-insensitive manner.
233: */
234:
235: if ( (in_len < strlen (IDNA_ACE_PREFIX)) || (strncasecmp (in, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX)) != 0) ){
236: size_t i;
237: if (in_len >= *out_len)
238: return PUNYCODE_BIG_OUTPUT;
239:
240: for(i=0; i<in_len; i++){
241: if ((unsigned char)in[i] > 0x7F)
242: return PUNYCODE_BAD_INPUT;
243: out[i]=in[i];
244: }
245: *out_len=in_len;
246:
247: return IDNA_SUCCESS;
248: } else {
249: /* 4. Remove the ACE prefix.
250: */
251:
252: in += strlen (IDNA_ACE_PREFIX);
253: in_len-=strlen (IDNA_ACE_PREFIX);
254:
255: /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
256: * and fail if there is an error. Save a copy of the result of
257: * this step.
258: */
259:
260: rc = punycode_decode (in_len, in, out_len, out, NULL);
261: if (rc != PUNYCODE_SUCCESS)
262: return rc;
263:
264: /* 6. Apply ToASCII.
265: */
266:
267: rc = idna_to_ascii_4i_internal (out, *out_len, tmpout, flags);
268: if (rc != IDNA_SUCCESS)
269: return rc;
270:
271: /* 7. Verify that the result of step 6 matches the saved copy from
272: * step 3, using a case-insensitive ASCII comparison.
273: */
274:
275: if (strncasecmp (in, tmpout + strlen (IDNA_ACE_PREFIX), in_len) != 0)
276: return IDNA_ROUNDTRIP_VERIFY_ERROR;
277:
278: /* 8. Return the saved copy from step 5.
279: */
280:
281: return IDNA_SUCCESS;
282: }
283: }
284:
285: /**
286: * idna_to_unicode_4z:
287: * @in: zero-terminated string.
288: * @output: pointer to output Unicode string.
289: * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
290: * %IDNA_USE_STD3_ASCII_RULES.
291: *
292: * Convert possibly ACE encoded domain name into a
293: * UCS-4 string. The domain name may contain several labels,
294: * separated by dots. The output buffer must be deallocated by the
295: * caller.
296: *
297: * Return value: Returns %IDNA_SUCCESS on success, or error code.
298: **/
299: int pa_idna_to_unicode_4z (const char *in, uint32_t *out, size_t out_len, int flags) {
300: int rc;
301: const char *start = in;
302: const char *end;
303: size_t add_len;
304:
305: do {
306: for (end = start; *end && !DOTP (*end); end++);
307:
308: if (end > start) {
309: add_len=out_len;
310: rc = idna_to_unicode_internal (start, (size_t) (end - start), out, &add_len, flags);
311: if (rc != IDNA_SUCCESS)
312: return rc;
313:
314: if (add_len >= out_len)
315: return PUNYCODE_BIG_OUTPUT;
316:
317: out+=add_len;
318: out_len-=add_len;
319: }
320:
321: if (*end) {
322: if (!out_len)
323: return PUNYCODE_BIG_OUTPUT;
324: *(out++) = 0x002E; /* '.' (full stop) */
325: out_len--;
326: }
327:
328: start = end + 1;
329: } while (*end);
330:
331: if (!out_len)
332: return PUNYCODE_BIG_OUTPUT;
333:
334: *out=0;
335: return IDNA_SUCCESS;
336: }
337:
338: /**
339: * IDNA_ACE_PREFIX
340: *
341: * The IANA allocated prefix to use for IDNA. "xn--"
342: */
343:
344: /**
345: * Idna_rc:
346: * @IDNA_SUCCESS: Successful operation. This value is guaranteed to
347: * always be zero, the remaining ones are only guaranteed to hold
348: * non-zero values, for logical comparison purposes.
349: * @IDNA_CONTAINS_NON_LDH: For IDNA_USE_STD3_ASCII_RULES, indicate that
350: * the string contains non-LDH ASCII characters.
351: * @IDNA_CONTAINS_MINUS: For IDNA_USE_STD3_ASCII_RULES, indicate that
352: * the string contains a leading or trailing hyphen-minus (U+002D).
353: * @IDNA_ROUNDTRIP_VERIFY_ERROR: The ToASCII operation on output
354: * string does not equal the input.
355: * @IDNA_CONTAINS_ACE_PREFIX: The input contains the ACE prefix (for
356: * ToASCII).
357: *
358: * Enumerated return codes of idna_to_ascii_4i(),
359: * idna_to_unicode_44i() functions (and functions derived from those
360: * functions). The value 0 is guaranteed to always correspond to
361: * success.
362: */
363:
364: const char *pa_idna_strerror(int rc) {
365: switch (rc) {
366: case IDNA_SUCCESS:
367: return "Success";
368: case PUNYCODE_BAD_INPUT:
369: return "Input is invalid";
370: case PUNYCODE_BIG_OUTPUT:
371: return "String is too long";
372: case PUNYCODE_OVERFLOW:
373: return "Wider integers needed to process input";
374: case IDNA_CONTAINS_NON_LDH:
375: return "Non-digit/letter/hyphen in input";
376: case IDNA_CONTAINS_MINUS:
377: return "Forbidden leading or trailing minus sign (`-')";
378: case IDNA_ROUNDTRIP_VERIFY_ERROR:
379: return "String not idempotent under ToASCII";
380: case IDNA_CONTAINS_ACE_PREFIX:
381: return "Input already contain ACE prefix (`xn--')";
382: default:
383: return "Unknown error";
384: }
385: }
386:
387: /**
388: * Idna_flags:
389: * @IDNA_ALLOW_UNASSIGNED: Don't reject strings containing unassigned
390: * Unicode code points.
391: * @IDNA_USE_STD3_ASCII_RULES: Validate strings according to STD3
392: * rules (i.e., normal host name rules).
393: *
394: * Flags to pass to idna_to_ascii_4i(), idna_to_unicode_44i() etc.
395: */
E-mail: