libidn  1.33
idna.c
Go to the documentation of this file.
1 /* idna.c --- Prototypes for Internationalized Domain Name library.
2  Copyright (C) 2002-2016 Simon Josefsson
3 
4  This file is part of GNU Libidn.
5 
6  GNU Libidn is free software: you can redistribute it and/or
7  modify it under the terms of either:
8 
9  * the GNU Lesser General Public License as published by the Free
10  Software Foundation; either version 3 of the License, or (at
11  your option) any later version.
12 
13  or
14 
15  * the GNU General Public License as published by the Free
16  Software Foundation; either version 2 of the License, or (at
17  your option) any later version.
18 
19  or both in parallel, as here.
20 
21  GNU Libidn is distributed in the hope that it will be useful,
22  but WITHOUT ANY WARRANTY; without even the implied warranty of
23  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24  General Public License for more details.
25 
26  You should have received copies of the GNU General Public License and
27  the GNU Lesser General Public License along with this program. If
28  not, see <http://www.gnu.org/licenses/>. */
29 
30 #ifdef HAVE_CONFIG_H
31 # include "config.h"
32 #endif
33 
34 #include <stdlib.h>
35 #include <string.h>
36 #include <stringprep.h>
37 #include <punycode.h>
38 
39 #include "idna.h"
40 
41 /* Get c_strcasecmp. */
42 #include <c-strcase.h>
43 
/* Nonzero when code point C is one of the four characters that are
   treated as a label-separating dot.  C is expanded several times, so
   the argument must be free of side effects. */
#define DOTP(c)                                                 \
  ((c) == 0x002E        /* full stop */                         \
   || (c) == 0x3002     /* ideographic full stop */             \
   || (c) == 0xFF0E     /* fullwidth full stop */               \
   || (c) == 0xFF61     /* halfwidth ideographic full stop */)
46 
47 /* Core functions */
48 
80 int
81 idna_to_ascii_4i (const uint32_t * in, size_t inlen, char *out, int flags)
82 {
83  size_t len, outlen;
84  uint32_t *src; /* XXX don't need to copy data? */
85  int rc;
86 
87  /*
88  * ToASCII consists of the following steps:
89  *
90  * 1. If all code points in the sequence are in the ASCII range (0..7F)
91  * then skip to step 3.
92  */
93 
94  {
95  size_t i;
96  int inasciirange;
97 
98  inasciirange = 1;
99  for (i = 0; i < inlen; i++)
100  if (in[i] > 0x7F)
101  inasciirange = 0;
102  if (inasciirange)
103  {
104  src = malloc (sizeof (in[0]) * (inlen + 1));
105  if (src == NULL)
106  return IDNA_MALLOC_ERROR;
107 
108  memcpy (src, in, sizeof (in[0]) * inlen);
109  src[inlen] = 0;
110 
111  goto step3;
112  }
113  }
114 
115  /*
116  * 2. Perform the steps specified in [NAMEPREP] and fail if there is
117  * an error. The AllowUnassigned flag is used in [NAMEPREP].
118  */
119 
120  {
121  char *p;
122 
123  p = stringprep_ucs4_to_utf8 (in, (ssize_t) inlen, NULL, NULL);
124  if (p == NULL)
125  return IDNA_MALLOC_ERROR;
126 
127  len = strlen (p);
128  do
129  {
130  char *newp;
131 
132  len = 2 * len + 10; /* XXX better guess? */
133  newp = realloc (p, len);
134  if (newp == NULL)
135  {
136  free (p);
137  return IDNA_MALLOC_ERROR;
138  }
139  p = newp;
140 
141  if (flags & IDNA_ALLOW_UNASSIGNED)
142  rc = stringprep_nameprep (p, len);
143  else
144  rc = stringprep_nameprep_no_unassigned (p, len);
145  }
146  while (rc == STRINGPREP_TOO_SMALL_BUFFER);
147 
148  if (rc != STRINGPREP_OK)
149  {
150  free (p);
151  return IDNA_STRINGPREP_ERROR;
152  }
153 
154  src = stringprep_utf8_to_ucs4 (p, -1, NULL);
155 
156  free (p);
157 
158  if (!src)
159  return IDNA_MALLOC_ERROR;
160  }
161 
162 step3:
163  /*
164  * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
165  *
166  * (a) Verify the absence of non-LDH ASCII code points; that is,
167  * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
168  *
169  * (b) Verify the absence of leading and trailing hyphen-minus;
170  * that is, the absence of U+002D at the beginning and end of
171  * the sequence.
172  */
173 
174  if (flags & IDNA_USE_STD3_ASCII_RULES)
175  {
176  size_t i;
177 
178  for (i = 0; src[i]; i++)
179  if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
180  (src[i] >= 0x3A && src[i] <= 0x40) ||
181  (src[i] >= 0x5B && src[i] <= 0x60) ||
182  (src[i] >= 0x7B && src[i] <= 0x7F))
183  {
184  free (src);
185  return IDNA_CONTAINS_NON_LDH;
186  }
187 
188  if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
189  {
190  free (src);
191  return IDNA_CONTAINS_MINUS;
192  }
193  }
194 
195  /*
196  * 4. If all code points in the sequence are in the ASCII range
197  * (0..7F), then skip to step 8.
198  */
199 
200  {
201  size_t i;
202  int inasciirange;
203 
204  inasciirange = 1;
205  for (i = 0; src[i]; i++)
206  {
207  if (src[i] > 0x7F)
208  inasciirange = 0;
209  /* copy string to output buffer if we are about to skip to step8 */
210  if (i < 64)
211  out[i] = src[i];
212  }
213  if (i < 64)
214  out[i] = '\0';
215  else
216  {
217  free (src);
218  return IDNA_INVALID_LENGTH;
219  }
220  if (inasciirange)
221  goto step8;
222  }
223 
224  /*
225  * 5. Verify that the sequence does NOT begin with the ACE prefix.
226  *
227  */
228 
229  {
230  size_t i;
231  int match;
232 
233  match = 1;
234  for (i = 0; match && i < strlen (IDNA_ACE_PREFIX); i++)
235  if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
236  match = 0;
237  if (match)
238  {
239  free (src);
241  }
242  }
243 
244  /*
245  * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
246  * and fail if there is an error.
247  */
248  for (len = 0; src[len]; len++)
249  ;
250  src[len] = '\0';
251  outlen = 63 - strlen (IDNA_ACE_PREFIX);
252  rc = punycode_encode (len, src, NULL,
253  &outlen, &out[strlen (IDNA_ACE_PREFIX)]);
254  if (rc != PUNYCODE_SUCCESS)
255  {
256  free (src);
257  return IDNA_PUNYCODE_ERROR;
258  }
259  out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0';
260 
261  /*
262  * 7. Prepend the ACE prefix.
263  */
264 
265  memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
266 
267  /*
268  * 8. Verify that the number of code points is in the range 1 to 63
269  * inclusive (0 is excluded).
270  */
271 
272 step8:
273  free (src);
274  if (strlen (out) < 1)
275  return IDNA_INVALID_LENGTH;
276 
277  return IDNA_SUCCESS;
278 }
279 
280 /* ToUnicode(). May realloc() utf8in. Will free utf8in unconditionally. */
281 static int
282 idna_to_unicode_internal (char *utf8in,
283  uint32_t * out, size_t * outlen, int flags)
284 {
285  int rc;
286  char tmpout[64];
287  size_t utf8len = strlen (utf8in) + 1;
288  size_t addlen = 0;
289 
290  /*
291  * ToUnicode consists of the following steps:
292  *
293  * 1. If the sequence contains any code points outside the ASCII range
294  * (0..7F) then proceed to step 2, otherwise skip to step 3.
295  */
296 
297  {
298  size_t i;
299  int inasciirange;
300 
301  inasciirange = 1;
302  for (i = 0; utf8in[i]; i++)
303  if (utf8in[i] & ~0x7F)
304  inasciirange = 0;
305  if (inasciirange)
306  goto step3;
307  }
308 
309  /*
310  * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
311  * error. (If step 3 of ToASCII is also performed here, it will not
312  * affect the overall behavior of ToUnicode, but it is not
313  * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
314  */
315  do
316  {
317  char *newp = realloc (utf8in, utf8len + addlen);
318  if (newp == NULL)
319  {
320  free (utf8in);
321  return IDNA_MALLOC_ERROR;
322  }
323  utf8in = newp;
324  if (flags & IDNA_ALLOW_UNASSIGNED)
325  rc = stringprep_nameprep (utf8in, utf8len + addlen);
326  else
327  rc = stringprep_nameprep_no_unassigned (utf8in, utf8len + addlen);
328  addlen += 1;
329  }
330  while (rc == STRINGPREP_TOO_SMALL_BUFFER);
331 
332  if (rc != STRINGPREP_OK)
333  {
334  free (utf8in);
335  return IDNA_STRINGPREP_ERROR;
336  }
337 
338  /* 3. Verify that the sequence begins with the ACE prefix, and save a
339  * copy of the sequence.
340  * ... The ToASCII and ToUnicode operations MUST recognize the ACE
341  prefix in a case-insensitive manner.
342  */
343 
344 step3:
345  if (c_strncasecmp (utf8in, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX)) != 0)
346  {
347  free (utf8in);
348  return IDNA_NO_ACE_PREFIX;
349  }
350 
351  /* 4. Remove the ACE prefix.
352  */
353 
354  memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)],
355  strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1);
356 
357  /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
358  * and fail if there is an error. Save a copy of the result of
359  * this step.
360  */
361 
362  (*outlen)--; /* reserve one for the zero */
363 
364  rc = punycode_decode (strlen (utf8in), utf8in, outlen, out, NULL);
365  if (rc != PUNYCODE_SUCCESS)
366  {
367  free (utf8in);
368  return IDNA_PUNYCODE_ERROR;
369  }
370 
371  out[*outlen] = 0; /* add zero */
372 
373  /* 6. Apply ToASCII.
374  */
375 
376  rc = idna_to_ascii_4i (out, *outlen, tmpout, flags);
377  if (rc != IDNA_SUCCESS)
378  {
379  free (utf8in);
380  return rc;
381  }
382 
383  /* 7. Verify that the result of step 6 matches the saved copy from
384  * step 3, using a case-insensitive ASCII comparison.
385  */
386 
387  if (c_strcasecmp (utf8in, tmpout + strlen (IDNA_ACE_PREFIX)) != 0)
388  {
389  free (utf8in);
391  }
392 
393  /* 8. Return the saved copy from step 5.
394  */
395 
396  free (utf8in);
397  return IDNA_SUCCESS;
398 }
399 
435 int
436 idna_to_unicode_44i (const uint32_t * in, size_t inlen,
437  uint32_t * out, size_t * outlen, int flags)
438 {
439  int rc;
440  size_t outlensave = *outlen;
441  char *p;
442 
443  p = stringprep_ucs4_to_utf8 (in, (ssize_t) inlen, NULL, NULL);
444  if (p == NULL)
445  return IDNA_MALLOC_ERROR;
446 
447  rc = idna_to_unicode_internal (p, out, outlen, flags);
448  if (rc != IDNA_SUCCESS)
449  {
450  memcpy (out, in, sizeof (in[0]) * (inlen < outlensave ?
451  inlen : outlensave));
452  *outlen = inlen;
453  }
454 
455  /* p is freed in idna_to_unicode_internal. */
456 
457  return rc;
458 }
459 
460 /* Wrappers that handle several labels */
461 
475 int
476 idna_to_ascii_4z (const uint32_t * input, char **output, int flags)
477 {
478  const uint32_t *start = input;
479  const uint32_t *end;
480  char buf[64];
481  char *out = NULL;
482  int rc;
483 
484  /* 1) Whenever dots are used as label separators, the following
485  characters MUST be recognized as dots: U+002E (full stop),
486  U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
487  U+FF61 (halfwidth ideographic full stop). */
488 
489  if (input[0] == 0)
490  {
491  /* Handle implicit zero-length root label. */
492  *output = malloc (1);
493  if (!*output)
494  return IDNA_MALLOC_ERROR;
495  strcpy (*output, "");
496  return IDNA_SUCCESS;
497  }
498 
499  if (DOTP (input[0]) && input[1] == 0)
500  {
501  /* Handle explicit zero-length root label. */
502  *output = malloc (2);
503  if (!*output)
504  return IDNA_MALLOC_ERROR;
505  strcpy (*output, ".");
506  return IDNA_SUCCESS;
507  }
508 
509  *output = NULL;
510  do
511  {
512  end = start;
513 
514  for (; *end && !DOTP (*end); end++)
515  ;
516 
517  if (*end == '\0' && start == end)
518  {
519  /* Handle explicit zero-length root label. */
520  buf[0] = '\0';
521  }
522  else
523  {
524  rc = idna_to_ascii_4i (start, (size_t) (end - start), buf, flags);
525  if (rc != IDNA_SUCCESS)
526  {
527  free (out);
528  return rc;
529  }
530  }
531 
532  if (out)
533  {
534  size_t l = strlen (out) + 1 + strlen (buf) + 1;
535  char *newp = realloc (out, l);
536  if (!newp)
537  {
538  free (out);
539  return IDNA_MALLOC_ERROR;
540  }
541  out = newp;
542  strcat (out, ".");
543  strcat (out, buf);
544  }
545  else
546  {
547  out = strdup (buf);
548  if (!out)
549  return IDNA_MALLOC_ERROR;
550  }
551 
552  start = end + 1;
553  }
554  while (*end);
555 
556  *output = out;
557 
558  return IDNA_SUCCESS;
559 }
560 
574 int
575 idna_to_ascii_8z (const char *input, char **output, int flags)
576 {
577  uint32_t *ucs4;
578  size_t ucs4len;
579  int rc;
580 
581  ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
582  if (!ucs4)
583  return IDNA_ICONV_ERROR;
584 
585  rc = idna_to_ascii_4z (ucs4, output, flags);
586 
587  free (ucs4);
588 
589  return rc;
590 
591 }
592 
607 int
608 idna_to_ascii_lz (const char *input, char **output, int flags)
609 {
610  char *utf8;
611  int rc;
612 
613  utf8 = stringprep_locale_to_utf8 (input);
614  if (!utf8)
615  return IDNA_ICONV_ERROR;
616 
617  rc = idna_to_ascii_8z (utf8, output, flags);
618 
619  free (utf8);
620 
621  return rc;
622 }
623 
638 int
639 idna_to_unicode_4z4z (const uint32_t * input, uint32_t ** output, int flags)
640 {
641  const uint32_t *start = input;
642  const uint32_t *end;
643  uint32_t *buf;
644  size_t buflen;
645  uint32_t *out = NULL;
646  size_t outlen = 0;
647 
648  *output = NULL;
649 
650  do
651  {
652  end = start;
653 
654  for (; *end && !DOTP (*end); end++)
655  ;
656 
657  buflen = (size_t) (end - start);
658  buf = malloc (sizeof (buf[0]) * (buflen + 1));
659  if (!buf)
660  return IDNA_MALLOC_ERROR;
661 
662  /* don't check return code as per specification! */
663  idna_to_unicode_44i (start, (size_t) (end - start),
664  buf, &buflen, flags);
665 
666  if (out)
667  {
668  uint32_t *newp = realloc (out,
669  sizeof (out[0])
670  * (outlen + 1 + buflen + 1));
671  if (!newp)
672  {
673  free (buf);
674  free (out);
675  return IDNA_MALLOC_ERROR;
676  }
677  out = newp;
678  out[outlen++] = 0x002E; /* '.' (full stop) */
679  memcpy (out + outlen, buf, sizeof (buf[0]) * buflen);
680  outlen += buflen;
681  out[outlen] = 0x0;
682  free (buf);
683  }
684  else
685  {
686  out = buf;
687  outlen = buflen;
688  out[outlen] = 0x0;
689  }
690 
691  start = end + 1;
692  }
693  while (*end);
694 
695  *output = out;
696 
697  return IDNA_SUCCESS;
698 }
699 
714 int
715 idna_to_unicode_8z4z (const char *input, uint32_t ** output, int flags)
716 {
717  uint32_t *ucs4;
718  size_t ucs4len;
719  int rc;
720 
721  ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
722  if (!ucs4)
723  return IDNA_ICONV_ERROR;
724 
725  rc = idna_to_unicode_4z4z (ucs4, output, flags);
726  free (ucs4);
727 
728  return rc;
729 }
730 
745 int
746 idna_to_unicode_8z8z (const char *input, char **output, int flags)
747 {
748  uint32_t *ucs4;
749  int rc;
750 
751  rc = idna_to_unicode_8z4z (input, &ucs4, flags);
752  if (rc != IDNA_SUCCESS)
753  return rc;
754 
755  *output = stringprep_ucs4_to_utf8 (ucs4, -1, NULL, NULL);
756  free (ucs4);
757 
758  if (!*output)
759  return IDNA_ICONV_ERROR;
760 
761  return IDNA_SUCCESS;
762 }
763 
779 int
780 idna_to_unicode_8zlz (const char *input, char **output, int flags)
781 {
782  char *utf8;
783  int rc;
784 
785  rc = idna_to_unicode_8z8z (input, &utf8, flags);
786  if (rc != IDNA_SUCCESS)
787  return rc;
788 
789  *output = stringprep_utf8_to_locale (utf8);
790  free (utf8);
791 
792  if (!*output)
793  return IDNA_ICONV_ERROR;
794 
795  return IDNA_SUCCESS;
796 }
797 
814 int
815 idna_to_unicode_lzlz (const char *input, char **output, int flags)
816 {
817  char *utf8;
818  int rc;
819 
820  utf8 = stringprep_locale_to_utf8 (input);
821  if (!utf8)
822  return IDNA_ICONV_ERROR;
823 
824  rc = idna_to_unicode_8zlz (utf8, output, flags);
825  free (utf8);
826 
827  return rc;
828 }
829 
int idna_to_unicode_lzlz(const char *input, char **output, int flags)
Definition: idna.c:815
#define IDNA_ACE_PREFIX
Definition: idna.h:81
int idna_to_unicode_44i(const uint32_t *in, size_t inlen, uint32_t *out, size_t *outlen, int flags)
Definition: idna.c:436
int idna_to_ascii_4z(const uint32_t *input, char **output, int flags)
Definition: idna.c:476
int idna_to_unicode_4z4z(const uint32_t *input, uint32_t **output, int flags)
Definition: idna.c:639
int idna_to_unicode_8zlz(const char *input, char **output, int flags)
Definition: idna.c:780
IDNAPI char * stringprep_locale_to_utf8(const char *str)
Definition: toutf8.c:143
#define stringprep_nameprep_no_unassigned(in, maxlen)
Definition: stringprep.h:151
int idna_to_unicode_8z4z(const char *input, uint32_t **output, int flags)
Definition: idna.c:715
int punycode_encode(size_t input_length, const punycode_uint input[], const unsigned char case_flags[], size_t *output_length, char output[])
Definition: punycode.c:196
IDNAPI char * stringprep_utf8_to_locale(const char *str)
Definition: toutf8.c:159
int punycode_decode(size_t input_length, const char input[], size_t *output_length, punycode_uint output[], unsigned char case_flags[])
Definition: punycode.c:345
int idna_to_unicode_8z8z(const char *input, char **output, int flags)
Definition: idna.c:746
uint32_t * stringprep_utf8_to_ucs4(const char *str, ssize_t len, size_t *items_written)
Definition: nfkc.c:1024
#define DOTP(c)
Definition: idna.c:44
char * stringprep_ucs4_to_utf8(const uint32_t *str, ssize_t len, size_t *items_read, size_t *items_written)
Definition: nfkc.c:1057
int idna_to_ascii_lz(const char *input, char **output, int flags)
Definition: idna.c:608
#define stringprep_nameprep(in, maxlen)
Definition: stringprep.h:148
int idna_to_ascii_4i(const uint32_t *in, size_t inlen, char *out, int flags)
Definition: idna.c:81
int idna_to_ascii_8z(const char *input, char **output, int flags)
Definition: idna.c:575