libidn  1.42
punycode.c
Go to the documentation of this file.
1 /* punycode.c --- Implementation of punycode used to ASCII encode IDN's.
2  Copyright (C) 2002-2024 Simon Josefsson
3 
4  This file is part of GNU Libidn.
5 
6  GNU Libidn is free software: you can redistribute it and/or
7  modify it under the terms of either:
8 
9  * the GNU Lesser General Public License as published by the Free
10  Software Foundation; either version 3 of the License, or (at
11  your option) any later version.
12 
13  or
14 
15  * the GNU General Public License as published by the Free
16  Software Foundation; either version 2 of the License, or (at
17  your option) any later version.
18 
19  or both in parallel, as here.
20 
21  GNU Libidn is distributed in the hope that it will be useful,
22  but WITHOUT ANY WARRANTY; without even the implied warranty of
23  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24  General Public License for more details.
25 
26  You should have received copies of the GNU General Public License and
27  the GNU Lesser General Public License along with this program. If
28  not, see <https://www.gnu.org/licenses/>. */
29 
30 /*
31  * This file is derived from RFC 3492bis written by Adam M. Costello,
32  * downloaded from http://www.nicemice.net/idn/punycode-spec.gz on
33  * 2015-03-02 with SHA1 a966a8017f6be579d74a50a226accc7607c40133, a
34  * copy of which is stored in the GNU Libidn version controlled
35  * repository under doc/specification/punycode-spec.gz.
36  *
37  * The changes compared to Adam's file include: re-indentation, adding
38  * the license boilerplate and this comment, #include of config.h and
39  * punycode.h, adding GTK-DOC comments, changing the return code of
40  * punycode_encode and punycode_decode from enum to int, renaming the
41  * input_length_orig function input variable to input_length (and
42  * renaming the internal input_length variable to input_len) in
43  * punycode_encode.
44  *
45  * Adam's file contains the following:
46  *
47  * punycode-sample.c 2.0.0 (2004-Mar-21-Sun)
48  * http://www.nicemice.net/idn/
49  * Adam M. Costello
50  * http://www.nicemice.net/amc/
51  *
52  * This is ANSI C code (C89) implementing Punycode 1.0.x.
53  *
54  * Disclaimer and license: Regarding this entire document or any
55  * portion of it (including the pseudocode and C code), the author
56  * makes no guarantees and is not responsible for any damage resulting
57  * from its use. The author grants irrevocable permission to anyone
58  * to use, modify, and distribute it in any way that does not diminish
59  * the rights of anyone else to use, modify, and distribute it,
60  * provided that redistributed derivative works do not contain
61  * misleading author or version information. Derivative works need
62  * not be licensed under similar terms.
63  */
64 
65 #include <config.h>
66 
67 /**********************************************************/
68 /* Implementation (would normally go in its own .c file): */
69 
70 #include <string.h>
71 
72 #include "punycode.h"
73 
74 /*** Bootstring parameters for Punycode ***/
75 
76 enum
77 { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
78  initial_bias = 72, initial_n = 0x80, delimiter = 0x2D
79 };
80 
81 /* basic(cp) tests whether cp is a basic code point: */
82 #define basic(cp) ((punycode_uint)(cp) < 0x80)
83 
84 /* delim(cp) tests whether cp is a delimiter: */
85 #define delim(cp) ((cp) == delimiter)
86 
87 /* decode_digit(cp) returns the numeric value of a basic code */
88 /* point (for use in representing integers) in the range 0 to */
89 /* base-1, or base if cp does not represent a value. */
90 
91 static unsigned
92 decode_digit (int cp)
93 {
94  return (unsigned) (cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 :
95  cp - 97 < 26 ? cp - 97 : base);
96 }
97 
98 /* encode_digit(d,flag) returns the basic code point whose value */
99 /* (when used for representing integers) is d, which needs to be in */
100 /* the range 0 to base-1. The lowercase form is used unless flag is */
101 /* nonzero, in which case the uppercase form is used. The behavior */
102 /* is undefined if flag is nonzero and digit d has no uppercase form. */
103 
104 static char
105 encode_digit (punycode_uint d, int flag)
106 {
107  return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
108  /* 0..25 map to ASCII a..z or A..Z */
109  /* 26..35 map to ASCII 0..9 */
110 }
111 
112 /* flagged(bcp) tests whether a basic code point is flagged */
113 /* (uppercase). The behavior is undefined if bcp is not a */
114 /* basic code point. */
115 
116 #define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
117 
118 /* encode_basic(bcp,flag) forces a basic code point to lowercase */
119 /* if flag is zero, uppercase if flag is nonzero, and returns */
120 /* the resulting code point. The code point is unchanged if it */
121 /* is caseless. The behavior is undefined if bcp is not a basic */
122 /* code point. */
123 
124 static char
125 encode_basic (punycode_uint bcp, int flag)
126 {
127  bcp -= (bcp - 97 < 26) << 5;
128  return bcp + ((!flag && (bcp - 65 < 26)) << 5);
129 }
130 
131 /*** Platform-specific constants ***/
132 
133 /* maxint is the maximum value of a punycode_uint variable: */
134 static const punycode_uint maxint = -1;
135 /* Because maxint is unsigned, -1 becomes the maximum value. */
136 
137 /*** Bias adaptation function ***/
138 
139 static punycode_uint
140 adapt (punycode_uint delta, punycode_uint numpoints, int firsttime)
141 {
142  punycode_uint k;
143 
144  delta = firsttime ? delta / damp : delta >> 1;
145  /* delta >> 1 is a faster way of doing delta / 2 */
146  delta += delta / numpoints;
147 
148  for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base)
149  {
150  delta /= base - tmin;
151  }
152 
153  return k + (base - tmin + 1) * delta / (delta + skew);
154 }
155 
156 /*** Main encode function ***/
157 
195 int
196 punycode_encode (size_t input_length,
197  const punycode_uint input[],
198  const unsigned char case_flags[],
199  size_t *output_length, char output[])
200 {
201  punycode_uint input_len, n, delta, h, b, bias, j, m, q, k, t;
202  size_t out, max_out;
203 
204  /* The Punycode spec assumes that the input length is the same type */
205  /* of integer as a code point, so we need to convert the size_t to */
206  /* a punycode_uint, which could overflow. */
207 
208  if (input_length > maxint)
209  return punycode_overflow;
210  input_len = (punycode_uint) input_length;
211 
212  /* Initialize the state: */
213 
214  n = initial_n;
215  delta = 0;
216  out = 0;
217  max_out = *output_length;
218  bias = initial_bias;
219 
220  /* Handle the basic code points: */
221 
222  for (j = 0; j < input_len; ++j)
223  {
224  if (basic (input[j]))
225  {
226  if (max_out - out < 2)
227  return punycode_big_output;
228  output[out++] = case_flags ?
229  encode_basic (input[j], case_flags[j]) : (char) input[j];
230  }
231  else if (input[j] > 0x10FFFF
232  || (input[j] >= 0xD800 && input[j] <= 0xDBFF))
233  return punycode_bad_input;
234  /* else if (input[j] < n) return punycode_bad_input; */
235  /* (not needed for Punycode with unsigned code points) */
236  }
237 
238  h = b = (punycode_uint) out;
239  /* cannot overflow because out <= input_len <= maxint */
240 
241  /* h is the number of code points that have been handled, b is the */
242  /* number of basic code points, and out is the number of ASCII code */
243  /* points that have been output. */
244 
245  if (b > 0)
246  output[out++] = delimiter;
247 
248  /* Main encoding loop: */
249 
250  while (h < input_len)
251  {
252  /* All non-basic code points < n have been */
253  /* handled already. Find the next larger one: */
254 
255  for (m = maxint, j = 0; j < input_len; ++j)
256  {
257  /* if (basic(input[j])) continue; */
258  /* (not needed for Punycode) */
259  if (input[j] >= n && input[j] < m)
260  m = input[j];
261  }
262 
263  /* Increase delta enough to advance the decoder's */
264  /* <n,i> state to <m,0>, but guard against overflow: */
265 
266  if (m - n > (maxint - delta) / (h + 1))
267  return punycode_overflow;
268  delta += (m - n) * (h + 1);
269  n = m;
270 
271  for (j = 0; j < input_len; ++j)
272  {
273  /* Punycode does not need to check whether input[j] is basic: */
274  if (input[j] < n /* || basic(input[j]) */ )
275  {
276  if (++delta == 0)
277  return punycode_overflow;
278  }
279 
280  if (input[j] == n)
281  {
282  /* Represent delta as a generalized variable-length integer: */
283 
284  for (q = delta, k = base;; k += base)
285  {
286  if (out >= max_out)
287  return punycode_big_output;
288  t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
289  k >= bias + tmax ? tmax : k - bias;
290  if (q < t)
291  break;
292  output[out++] = encode_digit (t + (q - t) % (base - t), 0);
293  q = (q - t) / (base - t);
294  }
295 
296  output[out++] = encode_digit (q, case_flags && case_flags[j]);
297  bias = adapt (delta, h + 1, h == b);
298  delta = 0;
299  ++h;
300  }
301  }
302 
303  ++delta, ++n;
304  }
305 
306  *output_length = out;
307  return punycode_success;
308 }
309 
310 /*** Main decode function ***/
311 
347 int
348 punycode_decode (size_t input_length,
349  const char input[],
350  size_t *output_length,
351  punycode_uint output[], unsigned char case_flags[])
352 {
353  punycode_uint n, out, i, max_out, bias, oldi, w, k, digit, t;
354  size_t b, j, in;
355 
356  /* Initialize the state: */
357 
358  n = initial_n;
359  out = i = 0;
360  max_out = *output_length > maxint ? maxint
361  : (punycode_uint) * output_length;
362  bias = initial_bias;
363 
364  /* Handle the basic code points: Let b be the number of input code */
365  /* points before the last delimiter, or 0 if there is none, then */
366  /* copy the first b code points to the output. */
367 
368  for (b = j = 0; j < input_length; ++j)
369  if (delim (input[j]))
370  b = j;
371  if (b > max_out)
372  return punycode_big_output;
373 
374  for (j = 0; j < b; ++j)
375  {
376  if (case_flags)
377  case_flags[out] = flagged (input[j]);
378  if (!basic (input[j]))
379  return punycode_bad_input;
380  output[out++] = input[j];
381  }
382  for (j = b + (b > 0); j < input_length; ++j)
383  if (!basic (input[j]))
384  return punycode_bad_input;
385 
386  /* Main decoding loop: Start just after the last delimiter if any */
387  /* basic code points were copied; start at the beginning otherwise. */
388 
389  for (in = b > 0 ? b + 1 : 0; in < input_length; ++out)
390  {
391 
392  /* in is the index of the next ASCII code point to be consumed, */
393  /* and out is the number of code points in the output array. */
394 
395  /* Decode a generalized variable-length integer into delta, */
396  /* which gets added to i. The overflow checking is easier */
397  /* if we increase i as we go, then subtract off its starting */
398  /* value at the end to obtain delta. */
399 
400  for (oldi = i, w = 1, k = base;; k += base)
401  {
402  if (in >= input_length)
403  return punycode_bad_input;
404  digit = decode_digit (input[in++]);
405  if (digit >= base)
406  return punycode_bad_input;
407  if (digit > (maxint - i) / w)
408  return punycode_overflow;
409  i += digit * w;
410  t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
411  k >= bias + tmax ? tmax : k - bias;
412  if (digit < t)
413  break;
414  if (w > maxint / (base - t))
415  return punycode_overflow;
416  w *= (base - t);
417  }
418 
419  bias = adapt (i - oldi, out + 1, oldi == 0);
420 
421  /* i was supposed to wrap around from out+1 to 0, */
422  /* incrementing n each time, so we'll fix that now: */
423 
424  if (i / (out + 1) > maxint - n)
425  return punycode_overflow;
426  n += i / (out + 1);
427  if (n > 0x10FFFF || (n >= 0xD800 && n <= 0xDBFF))
428  return punycode_bad_input;
429  i %= (out + 1);
430 
431  /* Insert n at position i of the output: */
432 
433  /* not needed for Punycode: */
434  /* if (basic(n)) return punycode_bad_input; */
435  if (out >= max_out)
436  return punycode_big_output;
437 
438  if (case_flags)
439  {
440  memmove (case_flags + i + 1, case_flags + i, out - i);
441  /* Case of last ASCII code point determines case flag: */
442  case_flags[i] = flagged (input[in - 1]);
443  }
444 
445  memmove (output + i + 1, output + i, (out - i) * sizeof *output);
446  output[i++] = n;
447  }
448 
449  *output_length = (size_t) out;
450  /* cannot overflow because out <= old value of *output_length */
451  return punycode_success;
452 }
453 
@ base
Definition: punycode.c:77
@ tmax
Definition: punycode.c:77
@ tmin
Definition: punycode.c:77
@ initial_bias
Definition: punycode.c:78
@ damp
Definition: punycode.c:77
@ skew
Definition: punycode.c:77
@ delimiter
Definition: punycode.c:78
@ initial_n
Definition: punycode.c:78
#define flagged(bcp)
Definition: punycode.c:116
#define basic(cp)
Definition: punycode.c:82
int punycode_decode(size_t input_length, const char input[], size_t *output_length, punycode_uint output[], unsigned char case_flags[])
Definition: punycode.c:348
int punycode_encode(size_t input_length, const punycode_uint input[], const unsigned char case_flags[], size_t *output_length, char output[])
Definition: punycode.c:196
#define delim(cp)
Definition: punycode.c:85
@ punycode_bad_input
Definition: punycode.h:103
@ punycode_success
Definition: punycode.h:102
@ punycode_overflow
Definition: punycode.h:105
@ punycode_big_output
Definition: punycode.h:104
uint32_t punycode_uint
Definition: punycode.h:123