libidn  1.30
punycode.c
Go to the documentation of this file.
1 /* punycode.c --- Implementation of punycode used to ASCII encode IDN's.
2  Copyright (C) 2002-2015 Simon Josefsson
3 
4  This file is part of GNU Libidn.
5 
6  GNU Libidn is free software: you can redistribute it and/or
7  modify it under the terms of either:
8 
9  * the GNU Lesser General Public License as published by the Free
10  Software Foundation; either version 3 of the License, or (at
11  your option) any later version.
12 
13  or
14 
15  * the GNU General Public License as published by the Free
16  Software Foundation; either version 2 of the License, or (at
17  your option) any later version.
18 
19  or both in parallel, as here.
20 
21  GNU Libidn is distributed in the hope that it will be useful,
22  but WITHOUT ANY WARRANTY; without even the implied warranty of
23  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24  General Public License for more details.
25 
26  You should have received copies of the GNU General Public License and
27  the GNU Lesser General Public License along with this program. If
28  not, see <http://www.gnu.org/licenses/>. */
29 
30 /*
31  * This file is derived from RFC 3492bis written by Adam M. Costello,
32  * downloaded from http://www.nicemice.net/idn/punycode-spec.gz on
33  * 2015-03-02 with SHA1 a966a8017f6be579d74a50a226accc7607c40133, a
34  * copy of which is stored in the GNU Libidn version controlled
35  * repository under doc/specification/punycode-spec.gz.
36  *
37  * The changes compared to Adam's file include: re-indentation, adding
38  * the license boilerplate and this comment, #include of config.h and
39  * punycode.h, adding GTK-DOC comments, changing the return code of
40  * punycode_encode and punycode_decode from enum to int, renaming the
41  * input_length_orig function input variable to input_length (and
42  * renaming the internal input_length variable to input_len) in
43  * punycode_encode.
44  *
45  * Adam's file contains the following:
46  *
47  * punycode-sample.c 2.0.0 (2004-Mar-21-Sun)
48  * http://www.nicemice.net/idn/
49  * Adam M. Costello
50  * http://www.nicemice.net/amc/
51  *
52  * This is ANSI C code (C89) implementing Punycode 1.0.x.
53  *
54  * Disclaimer and license: Regarding this entire document or any
55  * portion of it (including the pseudocode and C code), the author
56  * makes no guarantees and is not responsible for any damage resulting
57  * from its use. The author grants irrevocable permission to anyone
58  * to use, modify, and distribute it in any way that does not diminish
59  * the rights of anyone else to use, modify, and distribute it,
60  * provided that redistributed derivative works do not contain
61  * misleading author or version information. Derivative works need
62  * not be licensed under similar terms.
63  */
64 
65 #include <config.h>
66 
67 /**********************************************************/
68 /* Implementation (would normally go in its own .c file): */
69 
70 #include <string.h>
71 
72 #include "punycode.h"
73 
74 /*** Bootstring parameters for Punycode ***/
75 
76 enum
77 { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
78  initial_bias = 72, initial_n = 0x80, delimiter = 0x2D
79 };
80 
81 /* basic(cp) tests whether cp is a basic code point: */
82 #define basic(cp) ((punycode_uint)(cp) < 0x80)
83 
84 /* delim(cp) tests whether cp is a delimiter: */
85 #define delim(cp) ((cp) == delimiter)
86 
87 /* decode_digit(cp) returns the numeric value of a basic code */
88 /* point (for use in representing integers) in the range 0 to */
89 /* base-1, or base if cp does not represent a value. */
90 
91 static punycode_uint
92 decode_digit (punycode_uint cp)
93 {
94  return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 :
95  cp - 97 < 26 ? cp - 97 : base;
96 }
97 
98 /* encode_digit(d,flag) returns the basic code point whose value */
99 /* (when used for representing integers) is d, which needs to be in */
100 /* the range 0 to base-1. The lowercase form is used unless flag is */
101 /* nonzero, in which case the uppercase form is used. The behavior */
102 /* is undefined if flag is nonzero and digit d has no uppercase form. */
103 
104 static char
105 encode_digit (punycode_uint d, int flag)
106 {
107  return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
108  /* 0..25 map to ASCII a..z or A..Z */
109  /* 26..35 map to ASCII 0..9 */
110 }
111 
112 /* flagged(bcp) tests whether a basic code point is flagged */
113 /* (uppercase). The behavior is undefined if bcp is not a */
114 /* basic code point. */
115 
116 #define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
117 
118 /* encode_basic(bcp,flag) forces a basic code point to lowercase */
119 /* if flag is zero, uppercase if flag is nonzero, and returns */
120 /* the resulting code point. The code point is unchanged if it */
121 /* is caseless. The behavior is undefined if bcp is not a basic */
122 /* code point. */
123 
124 static char
125 encode_basic (punycode_uint bcp, int flag)
126 {
127  bcp -= (bcp - 97 < 26) << 5;
128  return bcp + ((!flag && (bcp - 65 < 26)) << 5);
129 }
130 
131 /*** Platform-specific constants ***/
132 
133 /* maxint is the maximum value of a punycode_uint variable: */
134 static const punycode_uint maxint = -1;
135 /* Because maxint is unsigned, -1 becomes the maximum value. */
136 
137 /*** Bias adaptation function ***/
138 
139 static punycode_uint
140 adapt (punycode_uint delta, punycode_uint numpoints, int firsttime)
141 {
142  punycode_uint k;
143 
144  delta = firsttime ? delta / damp : delta >> 1;
145  /* delta >> 1 is a faster way of doing delta / 2 */
146  delta += delta / numpoints;
147 
148  for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base)
149  {
150  delta /= base - tmin;
151  }
152 
153  return k + (base - tmin + 1) * delta / (delta + skew);
154 }
155 
156 /*** Main encode function ***/
157 
195 int
196 punycode_encode (size_t input_length,
197  const punycode_uint input[],
198  const unsigned char case_flags[],
199  size_t * output_length, char output[])
200 {
201  punycode_uint input_len, n, delta, h, b, bias, j, m, q, k, t;
202  size_t out, max_out;
203 
204  /* The Punycode spec assumes that the input length is the same type */
205  /* of integer as a code point, so we need to convert the size_t to */
206  /* a punycode_uint, which could overflow. */
207 
208  if (input_length > maxint)
209  return punycode_overflow;
210  input_len = (punycode_uint) input_length;
211 
212  /* Initialize the state: */
213 
214  n = initial_n;
215  delta = 0;
216  out = 0;
217  max_out = *output_length;
218  bias = initial_bias;
219 
220  /* Handle the basic code points: */
221 
222  for (j = 0; j < input_len; ++j)
223  {
224  if (basic (input[j]))
225  {
226  if (max_out - out < 2)
227  return punycode_big_output;
228  output[out++] = case_flags ?
229  encode_basic (input[j], case_flags[j]) : (char) input[j];
230  }
231  /* else if (input[j] < n) return punycode_bad_input; */
232  /* (not needed for Punycode with unsigned code points) */
233  }
234 
235  h = b = (punycode_uint) out;
236  /* cannot overflow because out <= input_len <= maxint */
237 
238  /* h is the number of code points that have been handled, b is the */
239  /* number of basic code points, and out is the number of ASCII code */
240  /* points that have been output. */
241 
242  if (b > 0)
243  output[out++] = delimiter;
244 
245  /* Main encoding loop: */
246 
247  while (h < input_len)
248  {
249  /* All non-basic code points < n have been */
250  /* handled already. Find the next larger one: */
251 
252  for (m = maxint, j = 0; j < input_len; ++j)
253  {
254  /* if (basic(input[j])) continue; */
255  /* (not needed for Punycode) */
256  if (input[j] >= n && input[j] < m)
257  m = input[j];
258  }
259 
260  /* Increase delta enough to advance the decoder's */
261  /* <n,i> state to <m,0>, but guard against overflow: */
262 
263  if (m - n > (maxint - delta) / (h + 1))
264  return punycode_overflow;
265  delta += (m - n) * (h + 1);
266  n = m;
267 
268  for (j = 0; j < input_len; ++j)
269  {
270  /* Punycode does not need to check whether input[j] is basic: */
271  if (input[j] < n /* || basic(input[j]) */ )
272  {
273  if (++delta == 0)
274  return punycode_overflow;
275  }
276 
277  if (input[j] == n)
278  {
279  /* Represent delta as a generalized variable-length integer: */
280 
281  for (q = delta, k = base;; k += base)
282  {
283  if (out >= max_out)
284  return punycode_big_output;
285  t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
286  k >= bias + tmax ? tmax : k - bias;
287  if (q < t)
288  break;
289  output[out++] = encode_digit (t + (q - t) % (base - t), 0);
290  q = (q - t) / (base - t);
291  }
292 
293  output[out++] = encode_digit (q, case_flags && case_flags[j]);
294  bias = adapt (delta, h + 1, h == b);
295  delta = 0;
296  ++h;
297  }
298  }
299 
300  ++delta, ++n;
301  }
302 
303  *output_length = out;
304  return punycode_success;
305 }
306 
307 /*** Main decode function ***/
308 
344 int
345 punycode_decode (size_t input_length,
346  const char input[],
347  size_t * output_length,
348  punycode_uint output[], unsigned char case_flags[])
349 {
350  punycode_uint n, out, i, max_out, bias, oldi, w, k, digit, t;
351  size_t b, j, in;
352 
353  /* Initialize the state: */
354 
355  n = initial_n;
356  out = i = 0;
357  max_out = *output_length > maxint ? maxint
358  : (punycode_uint) * output_length;
359  bias = initial_bias;
360 
361  /* Handle the basic code points: Let b be the number of input code */
362  /* points before the last delimiter, or 0 if there is none, then */
363  /* copy the first b code points to the output. */
364 
365  for (b = j = 0; j < input_length; ++j)
366  if (delim (input[j]))
367  b = j;
368  if (b > max_out)
369  return punycode_big_output;
370 
371  for (j = 0; j < b; ++j)
372  {
373  if (case_flags)
374  case_flags[out] = flagged (input[j]);
375  if (!basic (input[j]))
376  return punycode_bad_input;
377  output[out++] = input[j];
378  }
379 
380  /* Main decoding loop: Start just after the last delimiter if any */
381  /* basic code points were copied; start at the beginning otherwise. */
382 
383  for (in = b > 0 ? b + 1 : 0; in < input_length; ++out)
384  {
385 
386  /* in is the index of the next ASCII code point to be consumed, */
387  /* and out is the number of code points in the output array. */
388 
389  /* Decode a generalized variable-length integer into delta, */
390  /* which gets added to i. The overflow checking is easier */
391  /* if we increase i as we go, then subtract off its starting */
392  /* value at the end to obtain delta. */
393 
394  for (oldi = i, w = 1, k = base;; k += base)
395  {
396  if (in >= input_length)
397  return punycode_bad_input;
398  digit = decode_digit (input[in++]);
399  if (digit >= base)
400  return punycode_bad_input;
401  if (digit > (maxint - i) / w)
402  return punycode_overflow;
403  i += digit * w;
404  t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
405  k >= bias + tmax ? tmax : k - bias;
406  if (digit < t)
407  break;
408  if (w > maxint / (base - t))
409  return punycode_overflow;
410  w *= (base - t);
411  }
412 
413  bias = adapt (i - oldi, out + 1, oldi == 0);
414 
415  /* i was supposed to wrap around from out+1 to 0, */
416  /* incrementing n each time, so we'll fix that now: */
417 
418  if (i / (out + 1) > maxint - n)
419  return punycode_overflow;
420  n += i / (out + 1);
421  i %= (out + 1);
422 
423  /* Insert n at position i of the output: */
424 
425  /* not needed for Punycode: */
426  /* if (basic(n)) return punycode_bad_input; */
427  if (out >= max_out)
428  return punycode_big_output;
429 
430  if (case_flags)
431  {
432  memmove (case_flags + i + 1, case_flags + i, out - i);
433  /* Case of last ASCII code point determines case flag: */
434  case_flags[i] = flagged (input[in - 1]);
435  }
436 
437  memmove (output + i + 1, output + i, (out - i) * sizeof *output);
438  output[i++] = n;
439  }
440 
441  *output_length = (size_t) out;
442  /* cannot overflow because out <= old value of *output_length */
443  return punycode_success;
444 }
445 
#define flagged(bcp)
Definition: punycode.c:116
#define delim(cp)
Definition: punycode.c:85
Definition: punycode.c:77
Definition: punycode.c:77
int punycode_encode(size_t input_length, const punycode_uint input[], const unsigned char case_flags[], size_t *output_length, char output[])
Definition: punycode.c:196
int punycode_decode(size_t input_length, const char input[], size_t *output_length, punycode_uint output[], unsigned char case_flags[])
Definition: punycode.c:345
Definition: punycode.c:77
#define basic(cp)
Definition: punycode.c:82
Definition: punycode.c:77
Definition: punycode.c:77
uint32_t punycode_uint
Definition: punycode.h:115