libidn  1.32
idna.c
Go to the documentation of this file.
1 /* idna.c --- Implementation of the Internationalized Domain Name library.
2  Copyright (C) 2002-2015 Simon Josefsson
3 
4  This file is part of GNU Libidn.
5 
6  GNU Libidn is free software: you can redistribute it and/or
7  modify it under the terms of either:
8 
9  * the GNU Lesser General Public License as published by the Free
10  Software Foundation; either version 3 of the License, or (at
11  your option) any later version.
12 
13  or
14 
15  * the GNU General Public License as published by the Free
16  Software Foundation; either version 2 of the License, or (at
17  your option) any later version.
18 
19  or both in parallel, as here.
20 
21  GNU Libidn is distributed in the hope that it will be useful,
22  but WITHOUT ANY WARRANTY; without even the implied warranty of
23  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24  General Public License for more details.
25 
26  You should have received copies of the GNU General Public License and
27  the GNU Lesser General Public License along with this program. If
28  not, see <http://www.gnu.org/licenses/>. */
29 
30 #ifdef HAVE_CONFIG_H
31 # include "config.h"
32 #endif
33 
34 #include <stdlib.h>
35 #include <string.h>
36 #include <stringprep.h>
37 #include <punycode.h>
38 
39 #include "idna.h"
40 
41 /* Get c_strcasecmp. */
42 #include <c-strcase.h>
43 
/* DOTP(c): true when code point c equals one of the four values compared
   below (U+002E, U+3002, U+FF0E, U+FF61), which the wrappers in this file
   treat as label separators.  The argument is evaluated up to four times,
   so it must not have side effects. */
44 #define DOTP(c) ((c) == 0x002E || (c) == 0x3002 || \
45  (c) == 0xFF0E || (c) == 0xFF61)
46 
47 /* Core functions */
48 
80 int
81 idna_to_ascii_4i (const uint32_t * in, size_t inlen, char *out, int flags)
82 {
83  size_t len, outlen;
84  uint32_t *src; /* XXX don't need to copy data? */
85  int rc;
86 
87  /*
88  * ToASCII consists of the following steps:
89  *
90  * 1. If all code points in the sequence are in the ASCII range (0..7F)
91  * then skip to step 3.
92  */
93 
94  {
95  size_t i;
96  int inasciirange;
97 
98  inasciirange = 1;
99  for (i = 0; i < inlen; i++)
100  if (in[i] > 0x7F)
101  inasciirange = 0;
102  if (inasciirange)
103  {
104  src = malloc (sizeof (in[0]) * (inlen + 1));
105  if (src == NULL)
106  return IDNA_MALLOC_ERROR;
107 
108  memcpy (src, in, sizeof (in[0]) * inlen);
109  src[inlen] = 0;
110 
111  goto step3;
112  }
113  }
114 
115  /*
116  * 2. Perform the steps specified in [NAMEPREP] and fail if there is
117  * an error. The AllowUnassigned flag is used in [NAMEPREP].
118  */
119 
120  {
121  char *p;
122 
123  p = stringprep_ucs4_to_utf8 (in, (ssize_t) inlen, NULL, NULL);
124  if (p == NULL)
125  return IDNA_MALLOC_ERROR;
126 
127  len = strlen (p);
128  do
129  {
130  char *newp;
131 
132  len = 2 * len + 10; /* XXX better guess? */
133  newp = realloc (p, len);
134  if (newp == NULL)
135  {
136  free (p);
137  return IDNA_MALLOC_ERROR;
138  }
139  p = newp;
140 
141  if (flags & IDNA_ALLOW_UNASSIGNED)
142  rc = stringprep_nameprep (p, len);
143  else
144  rc = stringprep_nameprep_no_unassigned (p, len);
145  }
146  while (rc == STRINGPREP_TOO_SMALL_BUFFER);
147 
148  if (rc != STRINGPREP_OK)
149  {
150  free (p);
151  return IDNA_STRINGPREP_ERROR;
152  }
153 
154  src = stringprep_utf8_to_ucs4 (p, -1, NULL);
155 
156  free (p);
157 
158  if (!src)
159  return IDNA_MALLOC_ERROR;
160  }
161 
162 step3:
163  /*
164  * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
165  *
166  * (a) Verify the absence of non-LDH ASCII code points; that is,
167  * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
168  *
169  * (b) Verify the absence of leading and trailing hyphen-minus;
170  * that is, the absence of U+002D at the beginning and end of
171  * the sequence.
172  */
173 
174  if (flags & IDNA_USE_STD3_ASCII_RULES)
175  {
176  size_t i;
177 
178  for (i = 0; src[i]; i++)
179  if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
180  (src[i] >= 0x3A && src[i] <= 0x40) ||
181  (src[i] >= 0x5B && src[i] <= 0x60) ||
182  (src[i] >= 0x7B && src[i] <= 0x7F))
183  {
184  free (src);
185  return IDNA_CONTAINS_NON_LDH;
186  }
187 
188  if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
189  {
190  free (src);
191  return IDNA_CONTAINS_MINUS;
192  }
193  }
194 
195  /*
196  * 4. If all code points in the sequence are in the ASCII range
197  * (0..7F), then skip to step 8.
198  */
199 
200  {
201  size_t i;
202  int inasciirange;
203 
204  inasciirange = 1;
205  for (i = 0; src[i]; i++)
206  {
207  if (src[i] > 0x7F)
208  inasciirange = 0;
209  /* copy string to output buffer if we are about to skip to step8 */
210  if (i < 64)
211  out[i] = src[i];
212  }
213  if (i < 64)
214  out[i] = '\0';
215  if (inasciirange)
216  goto step8;
217  }
218 
219  /*
220  * 5. Verify that the sequence does NOT begin with the ACE prefix.
221  *
222  */
223 
224  {
225  size_t i;
226  int match;
227 
228  match = 1;
229  for (i = 0; match && i < strlen (IDNA_ACE_PREFIX); i++)
230  if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
231  match = 0;
232  if (match)
233  {
234  free (src);
236  }
237  }
238 
239  /*
240  * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
241  * and fail if there is an error.
242  */
243  for (len = 0; src[len]; len++)
244  ;
245  src[len] = '\0';
246  outlen = 63 - strlen (IDNA_ACE_PREFIX);
247  rc = punycode_encode (len, src, NULL,
248  &outlen, &out[strlen (IDNA_ACE_PREFIX)]);
249  if (rc != PUNYCODE_SUCCESS)
250  {
251  free (src);
252  return IDNA_PUNYCODE_ERROR;
253  }
254  out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0';
255 
256  /*
257  * 7. Prepend the ACE prefix.
258  */
259 
260  memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
261 
262  /*
263  * 8. Verify that the number of code points is in the range 1 to 63
264  * inclusive (0 is excluded).
265  */
266 
267 step8:
268  free (src);
269  if (strlen (out) < 1 || strlen (out) > 63)
270  return IDNA_INVALID_LENGTH;
271 
272  return IDNA_SUCCESS;
273 }
274 
275 /* ToUnicode(). May realloc() utf8in. Will free utf8in unconditionally. */
276 static int
277 idna_to_unicode_internal (char *utf8in,
278  uint32_t * out, size_t * outlen, int flags)
279 {
280  int rc;
281  char tmpout[64];
282  size_t utf8len = strlen (utf8in) + 1;
283  size_t addlen = 0;
284 
285  /*
286  * ToUnicode consists of the following steps:
287  *
288  * 1. If the sequence contains any code points outside the ASCII range
289  * (0..7F) then proceed to step 2, otherwise skip to step 3.
290  */
291 
292  {
293  size_t i;
294  int inasciirange;
295 
296  inasciirange = 1;
297  for (i = 0; utf8in[i]; i++)
298  if (utf8in[i] & ~0x7F)
299  inasciirange = 0;
300  if (inasciirange)
301  goto step3;
302  }
303 
304  /*
305  * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
306  * error. (If step 3 of ToASCII is also performed here, it will not
307  * affect the overall behavior of ToUnicode, but it is not
308  * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
309  */
310  do
311  {
312  char *newp = realloc (utf8in, utf8len + addlen);
313  if (newp == NULL)
314  {
315  free (utf8in);
316  return IDNA_MALLOC_ERROR;
317  }
318  utf8in = newp;
319  if (flags & IDNA_ALLOW_UNASSIGNED)
320  rc = stringprep_nameprep (utf8in, utf8len + addlen);
321  else
322  rc = stringprep_nameprep_no_unassigned (utf8in, utf8len + addlen);
323  addlen += 1;
324  }
325  while (rc == STRINGPREP_TOO_SMALL_BUFFER);
326 
327  if (rc != STRINGPREP_OK)
328  {
329  free (utf8in);
330  return IDNA_STRINGPREP_ERROR;
331  }
332 
333  /* 3. Verify that the sequence begins with the ACE prefix, and save a
334  * copy of the sequence.
335  * ... The ToASCII and ToUnicode operations MUST recognize the ACE
336  prefix in a case-insensitive manner.
337  */
338 
339 step3:
340  if (c_strncasecmp (utf8in, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX)) != 0)
341  {
342  free (utf8in);
343  return IDNA_NO_ACE_PREFIX;
344  }
345 
346  /* 4. Remove the ACE prefix.
347  */
348 
349  memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)],
350  strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1);
351 
352  /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
353  * and fail if there is an error. Save a copy of the result of
354  * this step.
355  */
356 
357  (*outlen)--; /* reserve one for the zero */
358 
359  rc = punycode_decode (strlen (utf8in), utf8in, outlen, out, NULL);
360  if (rc != PUNYCODE_SUCCESS)
361  {
362  free (utf8in);
363  return IDNA_PUNYCODE_ERROR;
364  }
365 
366  out[*outlen] = 0; /* add zero */
367 
368  /* 6. Apply ToASCII.
369  */
370 
371  rc = idna_to_ascii_4i (out, *outlen, tmpout, flags);
372  if (rc != IDNA_SUCCESS)
373  {
374  free (utf8in);
375  return rc;
376  }
377 
378  /* 7. Verify that the result of step 6 matches the saved copy from
379  * step 3, using a case-insensitive ASCII comparison.
380  */
381 
382  if (c_strcasecmp (utf8in, tmpout + strlen (IDNA_ACE_PREFIX)) != 0)
383  {
384  free (utf8in);
386  }
387 
388  /* 8. Return the saved copy from step 5.
389  */
390 
391  free (utf8in);
392  return IDNA_SUCCESS;
393 }
394 
430 int
431 idna_to_unicode_44i (const uint32_t * in, size_t inlen,
432  uint32_t * out, size_t * outlen, int flags)
433 {
434  int rc;
435  size_t outlensave = *outlen;
436  char *p;
437 
438  p = stringprep_ucs4_to_utf8 (in, (ssize_t) inlen, NULL, NULL);
439  if (p == NULL)
440  return IDNA_MALLOC_ERROR;
441 
442  rc = idna_to_unicode_internal (p, out, outlen, flags);
443  if (rc != IDNA_SUCCESS)
444  {
445  memcpy (out, in, sizeof (in[0]) * (inlen < outlensave ?
446  inlen : outlensave));
447  *outlen = inlen;
448  }
449 
450  /* p is freed in idna_to_unicode_internal. */
451 
452  return rc;
453 }
454 
455 /* Wrappers that handle several labels */
456 
470 int
471 idna_to_ascii_4z (const uint32_t * input, char **output, int flags)
472 {
473  const uint32_t *start = input;
474  const uint32_t *end;
475  char buf[64];
476  char *out = NULL;
477  int rc;
478 
479  /* 1) Whenever dots are used as label separators, the following
480  characters MUST be recognized as dots: U+002E (full stop),
481  U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
482  U+FF61 (halfwidth ideographic full stop). */
483 
484  if (input[0] == 0)
485  {
486  /* Handle implicit zero-length root label. */
487  *output = malloc (1);
488  if (!*output)
489  return IDNA_MALLOC_ERROR;
490  strcpy (*output, "");
491  return IDNA_SUCCESS;
492  }
493 
494  if (DOTP (input[0]) && input[1] == 0)
495  {
496  /* Handle explicit zero-length root label. */
497  *output = malloc (2);
498  if (!*output)
499  return IDNA_MALLOC_ERROR;
500  strcpy (*output, ".");
501  return IDNA_SUCCESS;
502  }
503 
504  *output = NULL;
505  do
506  {
507  end = start;
508 
509  for (; *end && !DOTP (*end); end++)
510  ;
511 
512  if (*end == '\0' && start == end)
513  {
514  /* Handle explicit zero-length root label. */
515  buf[0] = '\0';
516  }
517  else
518  {
519  rc = idna_to_ascii_4i (start, (size_t) (end - start), buf, flags);
520  if (rc != IDNA_SUCCESS)
521  {
522  free (out);
523  return rc;
524  }
525  }
526 
527  if (out)
528  {
529  size_t l = strlen (out) + 1 + strlen (buf) + 1;
530  char *newp = realloc (out, l);
531  if (!newp)
532  {
533  free (out);
534  return IDNA_MALLOC_ERROR;
535  }
536  out = newp;
537  strcat (out, ".");
538  strcat (out, buf);
539  }
540  else
541  {
542  out = strdup (buf);
543  if (!out)
544  return IDNA_MALLOC_ERROR;
545  }
546 
547  start = end + 1;
548  }
549  while (*end);
550 
551  *output = out;
552 
553  return IDNA_SUCCESS;
554 }
555 
569 int
570 idna_to_ascii_8z (const char *input, char **output, int flags)
571 {
572  uint32_t *ucs4;
573  size_t ucs4len;
574  int rc;
575 
576  ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
577  if (!ucs4)
578  return IDNA_ICONV_ERROR;
579 
580  rc = idna_to_ascii_4z (ucs4, output, flags);
581 
582  free (ucs4);
583 
584  return rc;
585 
586 }
587 
602 int
603 idna_to_ascii_lz (const char *input, char **output, int flags)
604 {
605  char *utf8;
606  int rc;
607 
608  utf8 = stringprep_locale_to_utf8 (input);
609  if (!utf8)
610  return IDNA_ICONV_ERROR;
611 
612  rc = idna_to_ascii_8z (utf8, output, flags);
613 
614  free (utf8);
615 
616  return rc;
617 }
618 
633 int
634 idna_to_unicode_4z4z (const uint32_t * input, uint32_t ** output, int flags)
635 {
636  const uint32_t *start = input;
637  const uint32_t *end;
638  uint32_t *buf;
639  size_t buflen;
640  uint32_t *out = NULL;
641  size_t outlen = 0;
642 
643  *output = NULL;
644 
645  do
646  {
647  end = start;
648 
649  for (; *end && !DOTP (*end); end++)
650  ;
651 
652  buflen = (size_t) (end - start);
653  buf = malloc (sizeof (buf[0]) * (buflen + 1));
654  if (!buf)
655  return IDNA_MALLOC_ERROR;
656 
657  /* don't check return code as per specification! */
658  idna_to_unicode_44i (start, (size_t) (end - start),
659  buf, &buflen, flags);
660 
661  if (out)
662  {
663  uint32_t *newp = realloc (out,
664  sizeof (out[0])
665  * (outlen + 1 + buflen + 1));
666  if (!newp)
667  {
668  free (buf);
669  free (out);
670  return IDNA_MALLOC_ERROR;
671  }
672  out = newp;
673  out[outlen++] = 0x002E; /* '.' (full stop) */
674  memcpy (out + outlen, buf, sizeof (buf[0]) * buflen);
675  outlen += buflen;
676  out[outlen] = 0x0;
677  free (buf);
678  }
679  else
680  {
681  out = buf;
682  outlen = buflen;
683  out[outlen] = 0x0;
684  }
685 
686  start = end + 1;
687  }
688  while (*end);
689 
690  *output = out;
691 
692  return IDNA_SUCCESS;
693 }
694 
709 int
710 idna_to_unicode_8z4z (const char *input, uint32_t ** output, int flags)
711 {
712  uint32_t *ucs4;
713  size_t ucs4len;
714  int rc;
715 
716  ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
717  if (!ucs4)
718  return IDNA_ICONV_ERROR;
719 
720  rc = idna_to_unicode_4z4z (ucs4, output, flags);
721  free (ucs4);
722 
723  return rc;
724 }
725 
740 int
741 idna_to_unicode_8z8z (const char *input, char **output, int flags)
742 {
743  uint32_t *ucs4;
744  int rc;
745 
746  rc = idna_to_unicode_8z4z (input, &ucs4, flags);
747  if (rc != IDNA_SUCCESS)
748  return rc;
749 
750  *output = stringprep_ucs4_to_utf8 (ucs4, -1, NULL, NULL);
751  free (ucs4);
752 
753  if (!*output)
754  return IDNA_ICONV_ERROR;
755 
756  return IDNA_SUCCESS;
757 }
758 
774 int
775 idna_to_unicode_8zlz (const char *input, char **output, int flags)
776 {
777  char *utf8;
778  int rc;
779 
780  rc = idna_to_unicode_8z8z (input, &utf8, flags);
781  if (rc != IDNA_SUCCESS)
782  return rc;
783 
784  *output = stringprep_utf8_to_locale (utf8);
785  free (utf8);
786 
787  if (!*output)
788  return IDNA_ICONV_ERROR;
789 
790  return IDNA_SUCCESS;
791 }
792 
809 int
810 idna_to_unicode_lzlz (const char *input, char **output, int flags)
811 {
812  char *utf8;
813  int rc;
814 
815  utf8 = stringprep_locale_to_utf8 (input);
816  if (!utf8)
817  return IDNA_ICONV_ERROR;
818 
819  rc = idna_to_unicode_8zlz (utf8, output, flags);
820  free (utf8);
821 
822  return rc;
823 }
824 
int idna_to_unicode_lzlz(const char *input, char **output, int flags)
Definition: idna.c:810
#define IDNA_ACE_PREFIX
Definition: idna.h:81
int idna_to_unicode_44i(const uint32_t *in, size_t inlen, uint32_t *out, size_t *outlen, int flags)
Definition: idna.c:431
int idna_to_ascii_4z(const uint32_t *input, char **output, int flags)
Definition: idna.c:471
int idna_to_unicode_4z4z(const uint32_t *input, uint32_t **output, int flags)
Definition: idna.c:634
int idna_to_unicode_8zlz(const char *input, char **output, int flags)
Definition: idna.c:775
IDNAPI char * stringprep_locale_to_utf8(const char *str)
Definition: toutf8.c:143
#define stringprep_nameprep_no_unassigned(in, maxlen)
Definition: stringprep.h:151
int idna_to_unicode_8z4z(const char *input, uint32_t **output, int flags)
Definition: idna.c:710
int punycode_encode(size_t input_length, const punycode_uint input[], const unsigned char case_flags[], size_t *output_length, char output[])
Definition: punycode.c:196
IDNAPI char * stringprep_utf8_to_locale(const char *str)
Definition: toutf8.c:159
int punycode_decode(size_t input_length, const char input[], size_t *output_length, punycode_uint output[], unsigned char case_flags[])
Definition: punycode.c:345
int idna_to_unicode_8z8z(const char *input, char **output, int flags)
Definition: idna.c:741
uint32_t * stringprep_utf8_to_ucs4(const char *str, ssize_t len, size_t *items_written)
Definition: nfkc.c:1024
#define DOTP(c)
Definition: idna.c:44
char * stringprep_ucs4_to_utf8(const uint32_t *str, ssize_t len, size_t *items_read, size_t *items_written)
Definition: nfkc.c:1057
int idna_to_ascii_lz(const char *input, char **output, int flags)
Definition: idna.c:603
#define stringprep_nameprep(in, maxlen)
Definition: stringprep.h:148
int idna_to_ascii_4i(const uint32_t *in, size_t inlen, char *out, int flags)
Definition: idna.c:81
int idna_to_ascii_8z(const char *input, char **output, int flags)
Definition: idna.c:570