libidn  1.42
idna.c
Go to the documentation of this file.
1 /* idna.c --- Implementation of the Internationalized Domain Name library.
2  Copyright (C) 2002-2024 Simon Josefsson
3 
4  This file is part of GNU Libidn.
5 
6  GNU Libidn is free software: you can redistribute it and/or
7  modify it under the terms of either:
8 
9  * the GNU Lesser General Public License as published by the Free
10  Software Foundation; either version 3 of the License, or (at
11  your option) any later version.
12 
13  or
14 
15  * the GNU General Public License as published by the Free
16  Software Foundation; either version 2 of the License, or (at
17  your option) any later version.
18 
19  or both in parallel, as here.
20 
21  GNU Libidn is distributed in the hope that it will be useful,
22  but WITHOUT ANY WARRANTY; without even the implied warranty of
23  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24  General Public License for more details.
25 
26  You should have received copies of the GNU General Public License and
27  the GNU Lesser General Public License along with this program. If
28  not, see <https://www.gnu.org/licenses/>. */
29 
30 #ifdef HAVE_CONFIG_H
31 # include "config.h"
32 #endif
33 
34 #include <stdlib.h>
35 #include <string.h>
36 #include <stringprep.h>
37 #include <punycode.h>
38 
39 #include "idna.h"
40 
41 /* Get c_strcasecmp. */
42 #include <c-strcase.h>
43 
/* Non-zero when code point C is one of the characters that must be
   recognized as a label separator: U+002E (full stop), U+3002
   (ideographic full stop), U+FF0E (fullwidth full stop) or U+FF61
   (halfwidth ideographic full stop).  C is evaluated more than once. */
#define DOTP(c)         \
  ((c) == 0x002E        \
   || (c) == 0x3002     \
   || (c) == 0xFF0E     \
   || (c) == 0xFF61)
46 
47 /* Core functions */
48 
80 int
81 idna_to_ascii_4i (const uint32_t *in, size_t inlen, char *out, int flags)
82 {
83  size_t len, outlen;
84  uint32_t *src; /* XXX don't need to copy data? */
85  int rc;
86 
87  /*
88  * ToASCII consists of the following steps:
89  *
90  * 1. If all code points in the sequence are in the ASCII range (0..7F)
91  * then skip to step 3.
92  */
93 
94  {
95  size_t i;
96  int inasciirange;
97 
98  inasciirange = 1;
99  for (i = 0; i < inlen; i++)
100  if (in[i] > 0x7F)
101  inasciirange = 0;
102  if (inasciirange)
103  {
104  src = malloc (sizeof (in[0]) * (inlen + 1));
105  if (src == NULL)
106  return IDNA_MALLOC_ERROR;
107 
108  memcpy (src, in, sizeof (in[0]) * inlen);
109  src[inlen] = 0;
110 
111  goto step3;
112  }
113  }
114 
115  /*
116  * 2. Perform the steps specified in [NAMEPREP] and fail if there is
117  * an error. The AllowUnassigned flag is used in [NAMEPREP].
118  */
119 
120  {
121  char *p;
122 
123  p = stringprep_ucs4_to_utf8 (in, (ssize_t) inlen, NULL, NULL);
124  if (p == NULL)
125  return IDNA_MALLOC_ERROR;
126 
127  len = strlen (p);
128  do
129  {
130  char *newp;
131 
132  len = 2 * len + 10; /* XXX better guess? */
133  newp = realloc (p, len);
134  if (newp == NULL)
135  {
136  free (p);
137  return IDNA_MALLOC_ERROR;
138  }
139  p = newp;
140 
141  if (flags & IDNA_ALLOW_UNASSIGNED)
142  rc = stringprep_nameprep (p, len);
143  else
144  rc = stringprep_nameprep_no_unassigned (p, len);
145  }
146  while (rc == STRINGPREP_TOO_SMALL_BUFFER);
147 
148  if (rc != STRINGPREP_OK)
149  {
150  free (p);
151  return IDNA_STRINGPREP_ERROR;
152  }
153 
154  src = stringprep_utf8_to_ucs4 (p, -1, NULL);
155 
156  free (p);
157 
158  if (!src)
159  return IDNA_MALLOC_ERROR;
160  }
161 
162 step3:
163  /*
164  * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
165  *
166  * (a) Verify the absence of non-LDH ASCII code points; that is,
167  * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
168  *
169  * (b) Verify the absence of leading and trailing hyphen-minus;
170  * that is, the absence of U+002D at the beginning and end of
171  * the sequence.
172  */
173 
174  if (flags & IDNA_USE_STD3_ASCII_RULES)
175  {
176  size_t i;
177 
178  for (i = 0; src[i]; i++)
179  if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
180  (src[i] >= 0x3A && src[i] <= 0x40) ||
181  (src[i] >= 0x5B && src[i] <= 0x60) ||
182  (src[i] >= 0x7B && src[i] <= 0x7F))
183  {
184  free (src);
185  return IDNA_CONTAINS_NON_LDH;
186  }
187 
188  if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
189  {
190  free (src);
191  return IDNA_CONTAINS_MINUS;
192  }
193  }
194 
195  /*
196  * 4. If all code points in the sequence are in the ASCII range
197  * (0..7F), then skip to step 8.
198  */
199 
200  {
201  size_t i;
202  int inasciirange;
203 
204  inasciirange = 1;
205  for (i = 0; src[i]; i++)
206  {
207  if (src[i] > 0x7F)
208  inasciirange = 0;
209  /* copy string to output buffer if we are about to skip to step8 */
210  if (i < 64)
211  out[i] = src[i];
212  }
213  if (i < 64)
214  out[i] = '\0';
215  else
216  {
217  free (src);
218  return IDNA_INVALID_LENGTH;
219  }
220  if (inasciirange)
221  goto step8;
222  }
223 
224  /*
225  * 5. Verify that the sequence does NOT begin with the ACE prefix.
226  *
227  */
228 
229  {
230  size_t i;
231  int match;
232 
233  match = 1;
234  for (i = 0; match && i < strlen (IDNA_ACE_PREFIX); i++)
235  if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
236  match = 0;
237  if (match)
238  {
239  free (src);
241  }
242  }
243 
244  /*
245  * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
246  * and fail if there is an error.
247  */
248  for (len = 0; src[len]; len++)
249  ;
250  src[len] = '\0';
251  outlen = 63 - strlen (IDNA_ACE_PREFIX);
252  rc = punycode_encode (len, src, NULL,
253  &outlen, &out[strlen (IDNA_ACE_PREFIX)]);
254  if (rc != PUNYCODE_SUCCESS)
255  {
256  free (src);
257  return IDNA_PUNYCODE_ERROR;
258  }
259  out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0';
260 
261  /*
262  * 7. Prepend the ACE prefix.
263  */
264 
265  memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
266 
267  /*
268  * 8. Verify that the number of code points is in the range 1 to 63
269  * inclusive (0 is excluded).
270  */
271 
272 step8:
273  free (src);
274  if (strlen (out) < 1)
275  return IDNA_INVALID_LENGTH;
276 
277  return IDNA_SUCCESS;
278 }
279 
280 /* ToUnicode(). May realloc() utf8in. Will free utf8in unconditionally. */
281 static int
282 idna_to_unicode_internal (char *utf8in,
283  uint32_t *out, size_t *outlen, int flags)
284 {
285  int rc;
286  char tmpout[64];
287  size_t utf8len = strlen (utf8in) + 1;
288  size_t addlen = 0, addinc = utf8len / 10 + 1;
289 
290  /*
291  * ToUnicode consists of the following steps:
292  *
293  * 1. If the sequence contains any code points outside the ASCII range
294  * (0..7F) then proceed to step 2, otherwise skip to step 3.
295  */
296 
297  {
298  size_t i;
299  int inasciirange;
300 
301  inasciirange = 1;
302  for (i = 0; utf8in[i]; i++)
303  if (utf8in[i] & ~0x7F)
304  inasciirange = 0;
305  if (inasciirange)
306  goto step3;
307  }
308 
309  /*
310  * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
311  * error. (If step 3 of ToASCII is also performed here, it will not
312  * affect the overall behavior of ToUnicode, but it is not
313  * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
314  */
315  do
316  {
317  char *newp = realloc (utf8in, utf8len + addlen);
318  if (newp == NULL)
319  {
320  free (utf8in);
321  return IDNA_MALLOC_ERROR;
322  }
323  utf8in = newp;
324  if (flags & IDNA_ALLOW_UNASSIGNED)
325  rc = stringprep_nameprep (utf8in, utf8len + addlen);
326  else
327  rc = stringprep_nameprep_no_unassigned (utf8in, utf8len + addlen);
328  addlen += addinc;
329  addinc *= 2;
330  }
331  while (rc == STRINGPREP_TOO_SMALL_BUFFER);
332 
333  if (rc != STRINGPREP_OK)
334  {
335  free (utf8in);
336  return IDNA_STRINGPREP_ERROR;
337  }
338 
339  /* 3. Verify that the sequence begins with the ACE prefix, and save a
340  * copy of the sequence.
341  * ... The ToASCII and ToUnicode operations MUST recognize the ACE
342  prefix in a case-insensitive manner.
343  */
344 
345 step3:
346  if (c_strncasecmp (utf8in, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX)) != 0)
347  {
348  free (utf8in);
349  return IDNA_NO_ACE_PREFIX;
350  }
351 
352  /* 4. Remove the ACE prefix.
353  */
354 
355  memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)],
356  strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1);
357 
358  /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
359  * and fail if there is an error. Save a copy of the result of
360  * this step.
361  */
362 
363  (*outlen)--; /* reserve one for the zero */
364 
365  rc = punycode_decode (strlen (utf8in), utf8in, outlen, out, NULL);
366  if (rc != PUNYCODE_SUCCESS)
367  {
368  free (utf8in);
369  return IDNA_PUNYCODE_ERROR;
370  }
371 
372  out[*outlen] = 0; /* add zero */
373 
374  /* 6. Apply ToASCII.
375  */
376 
377  rc = idna_to_ascii_4i (out, *outlen, tmpout, flags);
378  if (rc != IDNA_SUCCESS)
379  {
380  free (utf8in);
381  return rc;
382  }
383 
384  /* 7. Verify that the result of step 6 matches the saved copy from
385  * step 3, using a case-insensitive ASCII comparison.
386  */
387 
388  if (c_strcasecmp (utf8in, tmpout + strlen (IDNA_ACE_PREFIX)) != 0)
389  {
390  free (utf8in);
392  }
393 
394  /* 8. Return the saved copy from step 5.
395  */
396 
397  free (utf8in);
398  return IDNA_SUCCESS;
399 }
400 
436 int
437 idna_to_unicode_44i (const uint32_t *in, size_t inlen,
438  uint32_t *out, size_t *outlen, int flags)
439 {
440  int rc;
441  size_t outlensave = *outlen;
442  char *p;
443 
444  p = stringprep_ucs4_to_utf8 (in, (ssize_t) inlen, NULL, NULL);
445  if (p == NULL)
446  return IDNA_MALLOC_ERROR;
447 
448  rc = idna_to_unicode_internal (p, out, outlen, flags);
449  if (rc != IDNA_SUCCESS)
450  {
451  memcpy (out, in, sizeof (in[0]) * (inlen < outlensave ?
452  inlen : outlensave));
453  *outlen = inlen;
454  }
455 
456  /* p is freed in idna_to_unicode_internal. */
457 
458  return rc;
459 }
460 
461 /* Wrappers that handle several labels */
462 
476 int
477 idna_to_ascii_4z (const uint32_t *input, char **output, int flags)
478 {
479  const uint32_t *start = input;
480  const uint32_t *end;
481  char buf[64];
482  char *out = NULL;
483  int rc;
484 
485  /* 1) Whenever dots are used as label separators, the following
486  characters MUST be recognized as dots: U+002E (full stop),
487  U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
488  U+FF61 (halfwidth ideographic full stop). */
489 
490  if (input[0] == 0)
491  {
492  /* Handle implicit zero-length root label. */
493  *output = malloc (1);
494  if (!*output)
495  return IDNA_MALLOC_ERROR;
496  strcpy (*output, "");
497  return IDNA_SUCCESS;
498  }
499 
500  if (DOTP (input[0]) && input[1] == 0)
501  {
502  /* Handle explicit zero-length root label. */
503  *output = malloc (2);
504  if (!*output)
505  return IDNA_MALLOC_ERROR;
506  strcpy (*output, ".");
507  return IDNA_SUCCESS;
508  }
509 
510  *output = NULL;
511  do
512  {
513  end = start;
514 
515  for (; *end && !DOTP (*end); end++)
516  ;
517 
518  if (*end == '\0' && start == end)
519  {
520  /* Handle explicit zero-length root label. */
521  buf[0] = '\0';
522  }
523  else
524  {
525  rc = idna_to_ascii_4i (start, (size_t) (end - start), buf, flags);
526  if (rc != IDNA_SUCCESS)
527  {
528  free (out);
529  return rc;
530  }
531  }
532 
533  if (out)
534  {
535  size_t l = strlen (out) + 1 + strlen (buf) + 1;
536  char *newp = realloc (out, l);
537  if (!newp)
538  {
539  free (out);
540  return IDNA_MALLOC_ERROR;
541  }
542  out = newp;
543  strcat (out, ".");
544  strcat (out, buf);
545  }
546  else
547  {
548  out = strdup (buf);
549  if (!out)
550  return IDNA_MALLOC_ERROR;
551  }
552 
553  start = end + 1;
554  }
555  while (*end);
556 
557  *output = out;
558 
559  return IDNA_SUCCESS;
560 }
561 
575 int
576 idna_to_ascii_8z (const char *input, char **output, int flags)
577 {
578  uint32_t *ucs4;
579  size_t ucs4len;
580  int rc;
581 
582  ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
583  if (!ucs4)
584  return IDNA_ICONV_ERROR;
585 
586  rc = idna_to_ascii_4z (ucs4, output, flags);
587 
588  free (ucs4);
589 
590  return rc;
591 
592 }
593 
608 int
609 idna_to_ascii_lz (const char *input, char **output, int flags)
610 {
611  char *utf8;
612  int rc;
613 
614  utf8 = stringprep_locale_to_utf8 (input);
615  if (!utf8)
616  return IDNA_ICONV_ERROR;
617 
618  rc = idna_to_ascii_8z (utf8, output, flags);
619 
620  free (utf8);
621 
622  return rc;
623 }
624 
639 int
640 idna_to_unicode_4z4z (const uint32_t *input, uint32_t **output, int flags)
641 {
642  const uint32_t *start = input;
643  const uint32_t *end;
644  uint32_t *buf;
645  size_t buflen;
646  uint32_t *out = NULL;
647  size_t outlen = 0;
648 
649  *output = NULL;
650 
651  do
652  {
653  end = start;
654 
655  for (; *end && !DOTP (*end); end++)
656  ;
657 
658  buflen = (size_t) (end - start);
659  buf = malloc (sizeof (buf[0]) * (buflen + 1));
660  if (!buf)
661  {
662  free (out);
663  return IDNA_MALLOC_ERROR;
664  }
665 
666  /* don't check return code as per specification! */
667  idna_to_unicode_44i (start, (size_t) (end - start),
668  buf, &buflen, flags);
669 
670  if (out)
671  {
672  uint32_t *newp = realloc (out,
673  sizeof (out[0])
674  * (outlen + 1 + buflen + 1));
675  if (!newp)
676  {
677  free (buf);
678  free (out);
679  return IDNA_MALLOC_ERROR;
680  }
681  out = newp;
682  out[outlen++] = 0x002E; /* '.' (full stop) */
683  memcpy (out + outlen, buf, sizeof (buf[0]) * buflen);
684  outlen += buflen;
685  out[outlen] = 0x0;
686  free (buf);
687  }
688  else
689  {
690  out = buf;
691  outlen = buflen;
692  out[outlen] = 0x0;
693  }
694 
695  start = end + 1;
696  }
697  while (*end);
698 
699  *output = out;
700 
701  return IDNA_SUCCESS;
702 }
703 
718 int
719 idna_to_unicode_8z4z (const char *input, uint32_t **output, int flags)
720 {
721  uint32_t *ucs4;
722  size_t ucs4len;
723  int rc;
724 
725  ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
726  if (!ucs4)
727  return IDNA_ICONV_ERROR;
728 
729  rc = idna_to_unicode_4z4z (ucs4, output, flags);
730  free (ucs4);
731 
732  return rc;
733 }
734 
749 int
750 idna_to_unicode_8z8z (const char *input, char **output, int flags)
751 {
752  uint32_t *ucs4;
753  int rc;
754 
755  rc = idna_to_unicode_8z4z (input, &ucs4, flags);
756  if (rc != IDNA_SUCCESS)
757  return rc;
758 
759  *output = stringprep_ucs4_to_utf8 (ucs4, -1, NULL, NULL);
760  free (ucs4);
761 
762  if (!*output)
763  return IDNA_ICONV_ERROR;
764 
765  return IDNA_SUCCESS;
766 }
767 
783 int
784 idna_to_unicode_8zlz (const char *input, char **output, int flags)
785 {
786  char *utf8;
787  int rc;
788 
789  rc = idna_to_unicode_8z8z (input, &utf8, flags);
790  if (rc != IDNA_SUCCESS)
791  return rc;
792 
793  *output = stringprep_utf8_to_locale (utf8);
794  free (utf8);
795 
796  if (!*output)
797  return IDNA_ICONV_ERROR;
798 
799  return IDNA_SUCCESS;
800 }
801 
818 int
819 idna_to_unicode_lzlz (const char *input, char **output, int flags)
820 {
821  char *utf8;
822  int rc;
823 
824  utf8 = stringprep_locale_to_utf8 (input);
825  if (!utf8)
826  return IDNA_ICONV_ERROR;
827 
828  rc = idna_to_unicode_8zlz (utf8, output, flags);
829  free (utf8);
830 
831  return rc;
832 }
833 
int idna_to_unicode_8zlz(const char *input, char **output, int flags)
Definition: idna.c:784
#define DOTP(c)
Definition: idna.c:44
int idna_to_unicode_4z4z(const uint32_t *input, uint32_t **output, int flags)
Definition: idna.c:640
int idna_to_ascii_8z(const char *input, char **output, int flags)
Definition: idna.c:576
int idna_to_ascii_4z(const uint32_t *input, char **output, int flags)
Definition: idna.c:477
int idna_to_unicode_lzlz(const char *input, char **output, int flags)
Definition: idna.c:819
int idna_to_unicode_8z4z(const char *input, uint32_t **output, int flags)
Definition: idna.c:719
int idna_to_unicode_44i(const uint32_t *in, size_t inlen, uint32_t *out, size_t *outlen, int flags)
Definition: idna.c:437
int idna_to_unicode_8z8z(const char *input, char **output, int flags)
Definition: idna.c:750
int idna_to_ascii_4i(const uint32_t *in, size_t inlen, char *out, int flags)
Definition: idna.c:81
int idna_to_ascii_lz(const char *input, char **output, int flags)
Definition: idna.c:609
@ IDNA_ROUNDTRIP_VERIFY_ERROR
Definition: idna.h:83
@ IDNA_PUNYCODE_ERROR
Definition: idna.h:76
@ IDNA_SUCCESS
Definition: idna.h:74
@ IDNA_NO_ACE_PREFIX
Definition: idna.h:82
@ IDNA_CONTAINS_MINUS
Definition: idna.h:80
@ IDNA_ICONV_ERROR
Definition: idna.h:85
@ IDNA_STRINGPREP_ERROR
Definition: idna.h:75
@ IDNA_CONTAINS_ACE_PREFIX
Definition: idna.h:84
@ IDNA_CONTAINS_NON_LDH
Definition: idna.h:77
@ IDNA_INVALID_LENGTH
Definition: idna.h:81
@ IDNA_MALLOC_ERROR
Definition: idna.h:87
@ IDNA_USE_STD3_ASCII_RULES
Definition: idna.h:95
@ IDNA_ALLOW_UNASSIGNED
Definition: idna.h:94
#define IDNA_ACE_PREFIX
Definition: idna.h:99
char * stringprep_ucs4_to_utf8(const uint32_t *str, ssize_t len, size_t *items_read, size_t *items_written)
Definition: nfkc.c:1039
uint32_t * stringprep_utf8_to_ucs4(const char *str, ssize_t len, size_t *items_written)
Definition: nfkc.c:1006
int punycode_decode(size_t input_length, const char input[], size_t *output_length, punycode_uint output[], unsigned char case_flags[])
Definition: punycode.c:348
int punycode_encode(size_t input_length, const punycode_uint input[], const unsigned char case_flags[], size_t *output_length, char output[])
Definition: punycode.c:196
@ PUNYCODE_SUCCESS
Definition: punycode.h:110
IDNAPI char * stringprep_locale_to_utf8(const char *str)
Definition: toutf8.c:145
@ STRINGPREP_TOO_SMALL_BUFFER
Definition: stringprep.h:75
@ STRINGPREP_OK
Definition: stringprep.h:67
#define stringprep_nameprep(in, maxlen)
Definition: stringprep.h:202
IDNAPI char * stringprep_utf8_to_locale(const char *str)
Definition: toutf8.c:161
#define stringprep_nameprep_no_unassigned(in, maxlen)
Definition: stringprep.h:205