libidn  1.28
idna.c
Go to the documentation of this file.
1 /* idna.c --- Prototypes for Internationalized Domain Name library.
2  Copyright (C) 2002-2013 Simon Josefsson
3 
4  This file is part of GNU Libidn.
5 
6  GNU Libidn is free software: you can redistribute it and/or
7  modify it under the terms of either:
8 
9  * the GNU Lesser General Public License as published by the Free
10  Software Foundation; either version 3 of the License, or (at
11  your option) any later version.
12 
13  or
14 
15  * the GNU General Public License as published by the Free
16  Software Foundation; either version 2 of the License, or (at
17  your option) any later version.
18 
19  or both in parallel, as here.
20 
21  GNU Libidn is distributed in the hope that it will be useful,
22  but WITHOUT ANY WARRANTY; without even the implied warranty of
23  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24  General Public License for more details.
25 
26  You should have received copies of the GNU General Public License and
27  the GNU Lesser General Public License along with this program. If
28  not, see <http://www.gnu.org/licenses/>. */
29 
30 #ifdef HAVE_CONFIG_H
31 # include "config.h"
32 #endif
33 
34 #include <stdlib.h>
35 #include <string.h>
36 #include <stringprep.h>
37 #include <punycode.h>
38 
39 #include "idna.h"
40 
41 /* Get c_strcasecmp. */
42 #include <c-strcase.h>
43 
/* Non-zero when code point C is one of the characters treated as a
   label separator: U+002E (full stop), U+3002 (ideographic full
   stop), U+FF0E (fullwidth full stop) or U+FF61 (halfwidth
   ideographic full stop).  Note that C is evaluated more than once. */
#define DOTP(c)                                                         \
  ((c) == 0x002E || (c) == 0x3002 || (c) == 0xFF0E || (c) == 0xFF61)
46 
47 /* Core functions */
48 
80 int
81 idna_to_ascii_4i (const uint32_t * in, size_t inlen, char *out, int flags)
82 {
83  size_t len, outlen;
84  uint32_t *src; /* XXX don't need to copy data? */
85  int rc;
86 
87  /*
88  * ToASCII consists of the following steps:
89  *
90  * 1. If all code points in the sequence are in the ASCII range (0..7F)
91  * then skip to step 3.
92  */
93 
94  {
95  size_t i;
96  int inasciirange;
97 
98  inasciirange = 1;
99  for (i = 0; i < inlen; i++)
100  if (in[i] > 0x7F)
101  inasciirange = 0;
102  if (inasciirange)
103  {
104  src = malloc (sizeof (in[0]) * (inlen + 1));
105  if (src == NULL)
106  return IDNA_MALLOC_ERROR;
107 
108  memcpy (src, in, sizeof (in[0]) * inlen);
109  src[inlen] = 0;
110 
111  goto step3;
112  }
113  }
114 
115  /*
116  * 2. Perform the steps specified in [NAMEPREP] and fail if there is
117  * an error. The AllowUnassigned flag is used in [NAMEPREP].
118  */
119 
120  {
121  char *p;
122 
123  p = stringprep_ucs4_to_utf8 (in, (ssize_t) inlen, NULL, NULL);
124  if (p == NULL)
125  return IDNA_MALLOC_ERROR;
126 
127  len = strlen (p);
128  do
129  {
130  char *newp;
131 
132  len = 2 * len + 10; /* XXX better guess? */
133  newp = realloc (p, len);
134  if (newp == NULL)
135  {
136  free (p);
137  return IDNA_MALLOC_ERROR;
138  }
139  p = newp;
140 
141  if (flags & IDNA_ALLOW_UNASSIGNED)
142  rc = stringprep_nameprep (p, len);
143  else
144  rc = stringprep_nameprep_no_unassigned (p, len);
145  }
146  while (rc == STRINGPREP_TOO_SMALL_BUFFER);
147 
148  if (rc != STRINGPREP_OK)
149  {
150  free (p);
151  return IDNA_STRINGPREP_ERROR;
152  }
153 
154  src = stringprep_utf8_to_ucs4 (p, -1, NULL);
155 
156  free (p);
157 
158  if (!src)
159  return IDNA_MALLOC_ERROR;
160  }
161 
162 step3:
163  /*
164  * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
165  *
166  * (a) Verify the absence of non-LDH ASCII code points; that is,
167  * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
168  *
169  * (b) Verify the absence of leading and trailing hyphen-minus;
170  * that is, the absence of U+002D at the beginning and end of
171  * the sequence.
172  */
173 
174  if (flags & IDNA_USE_STD3_ASCII_RULES)
175  {
176  size_t i;
177 
178  for (i = 0; src[i]; i++)
179  if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
180  (src[i] >= 0x3A && src[i] <= 0x40) ||
181  (src[i] >= 0x5B && src[i] <= 0x60) ||
182  (src[i] >= 0x7B && src[i] <= 0x7F))
183  {
184  free (src);
185  return IDNA_CONTAINS_NON_LDH;
186  }
187 
188  if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
189  {
190  free (src);
191  return IDNA_CONTAINS_MINUS;
192  }
193  }
194 
195  /*
196  * 4. If all code points in the sequence are in the ASCII range
197  * (0..7F), then skip to step 8.
198  */
199 
200  {
201  size_t i;
202  int inasciirange;
203 
204  inasciirange = 1;
205  for (i = 0; src[i]; i++)
206  {
207  if (src[i] > 0x7F)
208  inasciirange = 0;
209  /* copy string to output buffer if we are about to skip to step8 */
210  if (i < 64)
211  out[i] = src[i];
212  }
213  if (i < 64)
214  out[i] = '\0';
215  if (inasciirange)
216  goto step8;
217  }
218 
219  /*
220  * 5. Verify that the sequence does NOT begin with the ACE prefix.
221  *
222  */
223 
224  {
225  size_t i;
226  int match;
227 
228  match = 1;
229  for (i = 0; match && i < strlen (IDNA_ACE_PREFIX); i++)
230  if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
231  match = 0;
232  if (match)
233  {
234  free (src);
236  }
237  }
238 
239  /*
240  * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
241  * and fail if there is an error.
242  */
243  for (len = 0; src[len]; len++)
244  ;
245  src[len] = '\0';
246  outlen = 63 - strlen (IDNA_ACE_PREFIX);
247  rc = punycode_encode (len, src, NULL,
248  &outlen, &out[strlen (IDNA_ACE_PREFIX)]);
249  if (rc != PUNYCODE_SUCCESS)
250  {
251  free (src);
252  return IDNA_PUNYCODE_ERROR;
253  }
254  out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0';
255 
256  /*
257  * 7. Prepend the ACE prefix.
258  */
259 
260  memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
261 
262  /*
263  * 8. Verify that the number of code points is in the range 1 to 63
264  * inclusive (0 is excluded).
265  */
266 
267 step8:
268  free (src);
269  if (strlen (out) < 1 || strlen (out) > 63)
270  return IDNA_INVALID_LENGTH;
271 
272  return IDNA_SUCCESS;
273 }
274 
275 /* ToUnicode(). May realloc() utf8in. Will free utf8in unconditionally. */
276 static int
277 idna_to_unicode_internal (char *utf8in,
278  uint32_t * out, size_t * outlen, int flags)
279 {
280  int rc;
281  char tmpout[64];
282  size_t utf8len = strlen (utf8in) + 1;
283  size_t addlen = 0;
284 
285  /*
286  * ToUnicode consists of the following steps:
287  *
288  * 1. If the sequence contains any code points outside the ASCII range
289  * (0..7F) then proceed to step 2, otherwise skip to step 3.
290  */
291 
292  {
293  size_t i;
294  int inasciirange;
295 
296  inasciirange = 1;
297  for (i = 0; utf8in[i]; i++)
298  if (utf8in[i] & ~0x7F)
299  inasciirange = 0;
300  if (inasciirange)
301  goto step3;
302  }
303 
304  /*
305  * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
306  * error. (If step 3 of ToASCII is also performed here, it will not
307  * affect the overall behavior of ToUnicode, but it is not
308  * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
309  */
310  do
311  {
312  char *newp = realloc (utf8in, utf8len + addlen);
313  if (newp == NULL)
314  {
315  free (utf8in);
316  return IDNA_MALLOC_ERROR;
317  }
318  utf8in = newp;
319  if (flags & IDNA_ALLOW_UNASSIGNED)
320  rc = stringprep_nameprep (utf8in, utf8len + addlen);
321  else
322  rc = stringprep_nameprep_no_unassigned (utf8in, utf8len + addlen);
323  addlen += 1;
324  }
325  while (rc == STRINGPREP_TOO_SMALL_BUFFER);
326 
327  if (rc != STRINGPREP_OK)
328  {
329  free (utf8in);
330  return IDNA_STRINGPREP_ERROR;
331  }
332 
333  /* 3. Verify that the sequence begins with the ACE prefix, and save a
334  * copy of the sequence.
335  * ... The ToASCII and ToUnicode operations MUST recognize the ACE
336  prefix in a case-insensitive manner.
337  */
338 
339 step3:
340  if (c_strncasecmp (utf8in, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX)) != 0)
341  {
342  free (utf8in);
343  return IDNA_NO_ACE_PREFIX;
344  }
345 
346  /* 4. Remove the ACE prefix.
347  */
348 
349  memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)],
350  strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1);
351 
352  /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
353  * and fail if there is an error. Save a copy of the result of
354  * this step.
355  */
356 
357  (*outlen)--; /* reserve one for the zero */
358 
359  rc = punycode_decode (strlen (utf8in), utf8in, outlen, out, NULL);
360  if (rc != PUNYCODE_SUCCESS)
361  {
362  free (utf8in);
363  return IDNA_PUNYCODE_ERROR;
364  }
365 
366  out[*outlen] = 0; /* add zero */
367 
368  /* 6. Apply ToASCII.
369  */
370 
371  rc = idna_to_ascii_4i (out, *outlen, tmpout, flags);
372  if (rc != IDNA_SUCCESS)
373  {
374  free (utf8in);
375  return rc;
376  }
377 
378  /* 7. Verify that the result of step 6 matches the saved copy from
379  * step 3, using a case-insensitive ASCII comparison.
380  */
381 
382  if (c_strcasecmp (utf8in, tmpout + strlen (IDNA_ACE_PREFIX)) != 0)
383  {
384  free (utf8in);
386  }
387 
388  /* 8. Return the saved copy from step 5.
389  */
390 
391  free (utf8in);
392  return IDNA_SUCCESS;
393 }
394 
430 int
431 idna_to_unicode_44i (const uint32_t * in, size_t inlen,
432  uint32_t * out, size_t * outlen, int flags)
433 {
434  int rc;
435  size_t outlensave = *outlen;
436  char *p;
437 
438  p = stringprep_ucs4_to_utf8 (in, (ssize_t) inlen, NULL, NULL);
439  if (p == NULL)
440  return IDNA_MALLOC_ERROR;
441 
442  rc = idna_to_unicode_internal (p, out, outlen, flags);
443  if (rc != IDNA_SUCCESS)
444  {
445  memcpy (out, in, sizeof (in[0]) * (inlen < outlensave ?
446  inlen : outlensave));
447  *outlen = inlen;
448  }
449 
450  /* p is freed in idna_to_unicode_internal. */
451 
452  return rc;
453 }
454 
455 /* Wrappers that handle several labels */
456 
470 int
471 idna_to_ascii_4z (const uint32_t * input, char **output, int flags)
472 {
473  const uint32_t *start = input;
474  const uint32_t *end;
475  char buf[64];
476  char *out = NULL;
477  int rc;
478 
479  /* 1) Whenever dots are used as label separators, the following
480  characters MUST be recognized as dots: U+002E (full stop),
481  U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
482  U+FF61 (halfwidth ideographic full stop). */
483 
484  if (input[0] == 0)
485  {
486  /* Handle implicit zero-length root label. */
487  *output = malloc (1);
488  if (!*output)
489  return IDNA_MALLOC_ERROR;
490  strcpy (*output, "");
491  return IDNA_SUCCESS;
492  }
493 
494  if (DOTP (input[0]) && input[1] == 0)
495  {
496  /* Handle explicit zero-length root label. */
497  *output = malloc (2);
498  if (!*output)
499  return IDNA_MALLOC_ERROR;
500  strcpy (*output, ".");
501  return IDNA_SUCCESS;
502  }
503 
504  *output = NULL;
505  do
506  {
507  end = start;
508 
509  for (; *end && !DOTP (*end); end++)
510  ;
511 
512  if (*end == '\0' && start == end)
513  {
514  /* Handle explicit zero-length root label. */
515  buf[0] = '\0';
516  }
517  else
518  {
519  rc = idna_to_ascii_4i (start, (size_t) (end - start), buf, flags);
520  if (rc != IDNA_SUCCESS)
521  {
522  free (out);
523  return rc;
524  }
525  }
526 
527  if (out)
528  {
529  size_t l = strlen (out) + 1 + strlen (buf) + 1;
530  char *newp = realloc (out, l);
531  if (!newp)
532  {
533  free (out);
534  return IDNA_MALLOC_ERROR;
535  }
536  out = newp;
537  strcat (out, ".");
538  strcat (out, buf);
539  }
540  else
541  {
542  size_t l = strlen (buf) + 1;
543  out = (char *) malloc (l);
544  if (!out)
545  return IDNA_MALLOC_ERROR;
546  strcpy (out, buf);
547  }
548 
549  start = end + 1;
550  }
551  while (*end);
552 
553  *output = out;
554 
555  return IDNA_SUCCESS;
556 }
557 
571 int
572 idna_to_ascii_8z (const char *input, char **output, int flags)
573 {
574  uint32_t *ucs4;
575  size_t ucs4len;
576  int rc;
577 
578  ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
579  if (!ucs4)
580  return IDNA_ICONV_ERROR;
581 
582  rc = idna_to_ascii_4z (ucs4, output, flags);
583 
584  free (ucs4);
585 
586  return rc;
587 
588 }
589 
604 int
605 idna_to_ascii_lz (const char *input, char **output, int flags)
606 {
607  char *utf8;
608  int rc;
609 
610  utf8 = stringprep_locale_to_utf8 (input);
611  if (!utf8)
612  return IDNA_ICONV_ERROR;
613 
614  rc = idna_to_ascii_8z (utf8, output, flags);
615 
616  free (utf8);
617 
618  return rc;
619 }
620 
635 int
636 idna_to_unicode_4z4z (const uint32_t * input, uint32_t ** output, int flags)
637 {
638  const uint32_t *start = input;
639  const uint32_t *end;
640  uint32_t *buf;
641  size_t buflen;
642  uint32_t *out = NULL;
643  size_t outlen = 0;
644 
645  *output = NULL;
646 
647  do
648  {
649  end = start;
650 
651  for (; *end && !DOTP (*end); end++)
652  ;
653 
654  buflen = (size_t) (end - start);
655  buf = malloc (sizeof (buf[0]) * (buflen + 1));
656  if (!buf)
657  return IDNA_MALLOC_ERROR;
658 
659  /* don't check return code as per specification! */
660  idna_to_unicode_44i (start, (size_t) (end - start),
661  buf, &buflen, flags);
662 
663  if (out)
664  {
665  uint32_t *newp = realloc (out,
666  sizeof (out[0])
667  * (outlen + 1 + buflen + 1));
668  if (!newp)
669  {
670  free (buf);
671  free (out);
672  return IDNA_MALLOC_ERROR;
673  }
674  out = newp;
675  out[outlen++] = 0x002E; /* '.' (full stop) */
676  memcpy (out + outlen, buf, sizeof (buf[0]) * buflen);
677  outlen += buflen;
678  out[outlen] = 0x0;
679  free (buf);
680  }
681  else
682  {
683  out = buf;
684  outlen = buflen;
685  out[outlen] = 0x0;
686  }
687 
688  start = end + 1;
689  }
690  while (*end);
691 
692  *output = out;
693 
694  return IDNA_SUCCESS;
695 }
696 
711 int
712 idna_to_unicode_8z4z (const char *input, uint32_t ** output, int flags)
713 {
714  uint32_t *ucs4;
715  size_t ucs4len;
716  int rc;
717 
718  ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
719  if (!ucs4)
720  return IDNA_ICONV_ERROR;
721 
722  rc = idna_to_unicode_4z4z (ucs4, output, flags);
723  free (ucs4);
724 
725  return rc;
726 }
727 
742 int
743 idna_to_unicode_8z8z (const char *input, char **output, int flags)
744 {
745  uint32_t *ucs4;
746  int rc;
747 
748  rc = idna_to_unicode_8z4z (input, &ucs4, flags);
749  *output = stringprep_ucs4_to_utf8 (ucs4, -1, NULL, NULL);
750  free (ucs4);
751 
752  if (!*output)
753  return IDNA_ICONV_ERROR;
754 
755  return rc;
756 }
757 
773 int
774 idna_to_unicode_8zlz (const char *input, char **output, int flags)
775 {
776  char *utf8;
777  int rc;
778 
779  rc = idna_to_unicode_8z8z (input, &utf8, flags);
780  *output = stringprep_utf8_to_locale (utf8);
781  free (utf8);
782 
783  if (!*output)
784  return IDNA_ICONV_ERROR;
785 
786  return rc;
787 }
788 
805 int
806 idna_to_unicode_lzlz (const char *input, char **output, int flags)
807 {
808  char *utf8;
809  int rc;
810 
811  utf8 = stringprep_locale_to_utf8 (input);
812  if (!utf8)
813  return IDNA_ICONV_ERROR;
814 
815  rc = idna_to_unicode_8zlz (utf8, output, flags);
816  free (utf8);
817 
818  return rc;
819 }
820