Annotation of parser3/src/lib/punycode/pa_idna.c, revision 1.7
1.1 moko 1: /** @file
2: Parser: IDNA support, modified Libidn Version 1.28.
1.7 ! moko 3: Copyright (c) 2001-2020 Art. Lebedev Studio (http://www.artlebedev.com)
1.1 moko 4: */
5:
6: /* idna.c --- Prototypes for Internationalized Domain Name library.
7: Copyright (C) 2002-2013 Simon Josefsson
8:
9: This file is part of GNU Libidn.
10:
11: GNU Libidn is free software: you can redistribute it and/or
12: modify it under the terms of either:
13:
14: * the GNU Lesser General Public License as published by the Free
15: Software Foundation; either version 3 of the License, or (at
16: your option) any later version.
17:
18: or
19:
20: * the GNU General Public License as published by the Free
21: Software Foundation; either version 2 of the License, or (at
22: your option) any later version.
23:
24: or both in parallel, as here.
25:
26: GNU Libidn is distributed in the hope that it will be useful,
27: but WITHOUT ANY WARRANTY; without even the implied warranty of
28: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
29: General Public License for more details.
30:
31: You should have received copies of the GNU General Public License and
32: the GNU Lesser General Public License along with this program. If
33: not, see <http://www.gnu.org/licenses/>. */
34:
35: #include "pa_punycode.h"
36: #include "pa_idna.h"
37:
/* Revision-control identification string for this translation unit
   (an expanded `$Id$` keyword). NOTE(review): `volatile` is presumably
   there to keep the otherwise unused string in the object file -- confirm. */
1.7 ! moko 38: volatile const char * IDENT_PA_IDNA_C="$Id: pa_idna.c,v 1.6 2017/02/07 22:00:38 moko Exp $";
1.1 moko 39:
/* DOTP(c): non-zero when code point c is one of the four label separators
   accepted by pa_idna_to_ascii_4z: U+002E (full stop), U+3002 (ideographic
   full stop), U+FF0E (fullwidth full stop), U+FF61 (halfwidth ideographic
   full stop). The argument is evaluated up to four times, so it must be
   free of side effects. */
40: #define DOTP(c) ((c) == 0x002E || (c) == 0x3002 || (c) == 0xFF0E || (c) == 0xFF61)
41:
/* When building with Microsoft's compiler (_MSC_VER defined), map
   strncasecmp onto _strnicmp. */
1.4 moko 42: #ifdef _MSC_VER
1.3 moko 43: # define strncasecmp _strnicmp
44: #endif
45:
1.1 moko 46: /* Core functions */
47:
48: /**
49: * idna_to_ascii_4z_internal:
50: * @src: input array with unicode code points.
51: * @len: length of input array with unicode code points.
52: * @out: output zero terminated string that must have room for at
53: * least 63 characters plus the terminating zero.
54: * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
55: * %IDNA_USE_STD3_ASCII_RULES.
56: *
57: * The ToASCII operation takes a sequence of Unicode code points that
58: * make up one domain label and transforms it into a sequence of code
59: * points in the ASCII range (0..7F). If ToASCII succeeds, the
60: * original sequence and the resulting sequence are equivalent labels.
61: *
62: * It is important to note that the ToASCII operation can fail. ToASCII
63: * fails if any step of it fails. If any step of the ToASCII operation
64: * fails on any label in a domain name, that domain name MUST NOT be used
65: * as an internationalized domain name. The method for deadling with this
66: * failure is application-specific.
67: *
68: * The inputs to ToASCII are a sequence of code points, the AllowUnassigned
69: * flag, and the UseSTD3ASCIIRules flag. The output of ToASCII is either a
70: * sequence of ASCII code points or a failure condition.
71: *
72: * ToASCII never alters a sequence of code points that are all in the ASCII
73: * range to begin with (although it could fail). Applying the ToASCII
74: * operation multiple times has exactly the same effect as applying it just
75: * once.
76: *
77: * Return value: Returns 0 on success, or an #Idna_rc error code.
78: */
79:
80: static int idna_to_ascii_4i_internal (const uint32_t *src, size_t len, char *out, int flags) {
81: int rc;
82: size_t out_len;
83:
84: /*
85: * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
86: *
87: * (a) Verify the absence of non-LDH ASCII code points; that is,
88: * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
89: *
90: * (b) Verify the absence of leading and trailing hyphen-minus;
91: * that is, the absence of U+002D at the beginning and end of
92: * the sequence.
93: */
94:
95: if (flags & IDNA_USE_STD3_ASCII_RULES) {
96: size_t i;
97:
98: for (i = 0; i < len; i++)
99: if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
100: (src[i] >= 0x3A && src[i] <= 0x40) ||
101: (src[i] >= 0x5B && src[i] <= 0x60) ||
102: (src[i] >= 0x7B && src[i] <= 0x7F))
103: return IDNA_CONTAINS_NON_LDH;
104:
105: if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
106: return IDNA_CONTAINS_MINUS;
107: }
108:
109: /*
110: * 4. If all code points in the sequence are in the ASCII range
111: * (0..7F), then skip to step 8.
112: */
113:
114: {
115: size_t i;
116:
117: for (i = 0; i < len; i++) {
118: if (src[i] > 0x7F)
119: goto step5;
120: }
121:
122: if (len > 63)
123: return PUNYCODE_BIG_OUTPUT;
124:
125: /* copy string to output buffer if we are about to skip to step8 */
126: for (i = 0; i < len; i++)
1.2 moko 127: out[i]=(char)src[i];
1.1 moko 128: out[len] = '\0';
129: goto step8;
130: }
131:
132: /*
133: * 5. Verify that the sequence does NOT begin with the ACE prefix.
134: */
135:
136: step5:
137: if (len >= strlen (IDNA_ACE_PREFIX)) {
138: size_t i;
139:
140: for (i = 0; i < strlen (IDNA_ACE_PREFIX); i++)
141: if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
142: goto step6;
143: return IDNA_CONTAINS_ACE_PREFIX;
144: }
145:
146: /*
147: * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
148: * and fail if there is an error.
149: */
150:
151: step6:
152: out_len = 63 - strlen (IDNA_ACE_PREFIX);
153: rc = punycode_encode (len, src, NULL, &out_len, out + strlen(IDNA_ACE_PREFIX));
154:
155: if (rc != PUNYCODE_SUCCESS)
156: return rc;
157:
158: out[strlen (IDNA_ACE_PREFIX) + out_len] = '\0';
159:
160: /*
161: * 7. Prepend the ACE prefix.
162: */
163:
164: memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
165:
166: step8:
167: return IDNA_SUCCESS;
168: }
169:
170: /**
171: * idna_to_ascii_4z:
172: * @in: zero terminated input Unicode string.
173: * @out: pointer to output string.
174: * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
175: * %IDNA_USE_STD3_ASCII_RULES.
176: *
177: * Convert UCS-4 domain name to ASCII string. The domain name may
178: * contain several labels, separated by dots.
179: *
180: * Return value: Returns %IDNA_SUCCESS on success, or error code.
181: **/
182: int pa_idna_to_ascii_4z (const uint32_t *in, char *out, size_t out_len, int flags) {
183: int rc;
184: const uint32_t *start = in;
185: const uint32_t *end;
186: size_t add_len;
187: char buf[64];
188:
189: /* 1) Whenever dots are used as label separators, the following
190: characters MUST be recognized as dots: U+002E (full stop),
191: U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
192: U+FF61 (halfwidth ideographic full stop). */
193:
194: do {
195: for (end = start; *end && !DOTP (*end); end++);
196:
197: if (end > start) {
198: rc = idna_to_ascii_4i_internal (start, (size_t) (end - start), buf, flags);
199: if (rc != IDNA_SUCCESS)
200: return rc;
201:
202: add_len = strlen (buf);
203: if (add_len >= out_len)
204: return PUNYCODE_BIG_OUTPUT;
205:
206: memcpy (out, buf, add_len);
207: out += add_len;
208: out_len -= add_len;
209: }
210:
211: if (*end) {
212: if (!out_len)
213: return PUNYCODE_BIG_OUTPUT;
214: *(out++)='.';
215: out_len--;
216: }
217:
218: start = end + 1;
219: } while (*end);
220:
221: if (!out_len)
222: return PUNYCODE_BIG_OUTPUT;
223:
224: *out='\0';
225:
226: return IDNA_SUCCESS;
227: }
228:
229: /* ToUnicode(). */
230: static int idna_to_unicode_internal (const char *in, size_t in_len, uint32_t *out, size_t *out_len, int flags) {
231: int rc;
232: char tmpout[64];
233:
234: /* 3. Verify that the sequence begins with the ACE prefix
235: * ... The ToASCII and ToUnicode operations MUST recognize the ACE
236: prefix in a case-insensitive manner.
237: */
238:
239: if ( (in_len < strlen (IDNA_ACE_PREFIX)) || (strncasecmp (in, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX)) != 0) ){
240: size_t i;
241: if (in_len >= *out_len)
242: return PUNYCODE_BIG_OUTPUT;
243:
244: for(i=0; i<in_len; i++){
245: if ((unsigned char)in[i] > 0x7F)
246: return PUNYCODE_BAD_INPUT;
247: out[i]=in[i];
248: }
249: *out_len=in_len;
250:
251: return IDNA_SUCCESS;
252: } else {
253: /* 4. Remove the ACE prefix.
254: */
255:
256: in += strlen (IDNA_ACE_PREFIX);
257: in_len-=strlen (IDNA_ACE_PREFIX);
258:
259: /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
260: * and fail if there is an error. Save a copy of the result of
261: * this step.
262: */
263:
264: rc = punycode_decode (in_len, in, out_len, out, NULL);
265: if (rc != PUNYCODE_SUCCESS)
266: return rc;
267:
268: /* 6. Apply ToASCII.
269: */
270:
271: rc = idna_to_ascii_4i_internal (out, *out_len, tmpout, flags);
272: if (rc != IDNA_SUCCESS)
273: return rc;
274:
275: /* 7. Verify that the result of step 6 matches the saved copy from
276: * step 3, using a case-insensitive ASCII comparison.
277: */
278:
279: if (strncasecmp (in, tmpout + strlen (IDNA_ACE_PREFIX), in_len) != 0)
280: return IDNA_ROUNDTRIP_VERIFY_ERROR;
281:
282: /* 8. Return the saved copy from step 5.
283: */
284:
285: return IDNA_SUCCESS;
286: }
287: }
288:
289: /**
290: * idna_to_unicode_4z:
291: * @in: zero-terminated string.
292: * @output: pointer to output Unicode string.
293: * @flags: an #Idna_flags value, e.g., %IDNA_ALLOW_UNASSIGNED or
294: * %IDNA_USE_STD3_ASCII_RULES.
295: *
296: * Convert possibly ACE encoded domain name into a
297: * UCS-4 string. The domain name may contain several labels,
298: * separated by dots. The output buffer must be deallocated by the
299: * caller.
300: *
301: * Return value: Returns %IDNA_SUCCESS on success, or error code.
302: **/
303: int pa_idna_to_unicode_4z (const char *in, uint32_t *out, size_t out_len, int flags) {
304: int rc;
305: const char *start = in;
306: const char *end;
307: size_t add_len;
308:
309: do {
1.5 moko 310: for (end = start; *end && (*end != '.'); end++);
1.1 moko 311:
312: if (end > start) {
313: add_len=out_len;
314: rc = idna_to_unicode_internal (start, (size_t) (end - start), out, &add_len, flags);
315: if (rc != IDNA_SUCCESS)
316: return rc;
317:
318: if (add_len >= out_len)
319: return PUNYCODE_BIG_OUTPUT;
320:
321: out+=add_len;
322: out_len-=add_len;
323: }
324:
325: if (*end) {
326: if (!out_len)
327: return PUNYCODE_BIG_OUTPUT;
328: *(out++) = 0x002E; /* '.' (full stop) */
329: out_len--;
330: }
331:
332: start = end + 1;
333: } while (*end);
334:
335: if (!out_len)
336: return PUNYCODE_BIG_OUTPUT;
337:
338: *out=0;
339: return IDNA_SUCCESS;
340: }
341:
342: /**
343: * IDNA_ACE_PREFIX
344: *
345: * The IANA allocated prefix to use for IDNA. "xn--"
346: */
347:
348: /**
349: * Idna_rc:
350: * @IDNA_SUCCESS: Successful operation. This value is guaranteed to
351: * always be zero, the remaining ones are only guaranteed to hold
352: * non-zero values, for logical comparison purposes.
353: * @IDNA_CONTAINS_NON_LDH: For IDNA_USE_STD3_ASCII_RULES, indicate that
354: * the string contains non-LDH ASCII characters.
355: * @IDNA_CONTAINS_MINUS: For IDNA_USE_STD3_ASCII_RULES, indicate that
356: * the string contains a leading or trailing hyphen-minus (U+002D).
357: * @IDNA_ROUNDTRIP_VERIFY_ERROR: The ToASCII operation on output
358: * string does not equal the input.
359: * @IDNA_CONTAINS_ACE_PREFIX: The input contains the ACE prefix (for
360: * ToASCII).
361: *
362: * Enumerated return codes of idna_to_ascii_4i(),
363: * idna_to_unicode_44i() functions (and functions derived from those
364: * functions). The value 0 is guaranteed to always correspond to
365: * success.
366: */
367:
368: const char *pa_idna_strerror(int rc) {
369: switch (rc) {
370: case IDNA_SUCCESS:
371: return "Success";
372: case PUNYCODE_BAD_INPUT:
373: return "Input is invalid";
374: case PUNYCODE_BIG_OUTPUT:
375: return "String is too long";
376: case PUNYCODE_OVERFLOW:
377: return "Wider integers needed to process input";
378: case IDNA_CONTAINS_NON_LDH:
379: return "Non-digit/letter/hyphen in input";
380: case IDNA_CONTAINS_MINUS:
381: return "Forbidden leading or trailing minus sign (`-')";
382: case IDNA_ROUNDTRIP_VERIFY_ERROR:
383: return "String not idempotent under ToASCII";
384: case IDNA_CONTAINS_ACE_PREFIX:
385: return "Input already contain ACE prefix (`xn--')";
386: default:
387: return "Unknown error";
388: }
389: }
390:
391: /**
392: * Idna_flags:
393: * @IDNA_ALLOW_UNASSIGNED: Don't reject strings containing unassigned
394: * Unicode code points.
395: * @IDNA_USE_STD3_ASCII_RULES: Validate strings according to STD3
396: * rules (i.e., normal host name rules).
397: *
398: * Flags to pass to idna_to_ascii_4i(), idna_to_unicode_44i() etc.
399: */
E-mail: