libidn  1.28
punycode.c
Go to the documentation of this file.
1 /* punycode.c --- Implementation of punycode used to ASCII encode IDN's.
2  Copyright (C) 2002-2013 Simon Josefsson
3 
4  This file is part of GNU Libidn.
5 
6  GNU Libidn is free software: you can redistribute it and/or
7  modify it under the terms of either:
8 
9  * the GNU Lesser General Public License as published by the Free
10  Software Foundation; either version 3 of the License, or (at
11  your option) any later version.
12 
13  or
14 
15  * the GNU General Public License as published by the Free
16  Software Foundation; either version 2 of the License, or (at
17  your option) any later version.
18 
19  or both in parallel, as here.
20 
21  GNU Libidn is distributed in the hope that it will be useful,
22  but WITHOUT ANY WARRANTY; without even the implied warranty of
23  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24  General Public License for more details.
25 
26  You should have received copies of the GNU General Public License and
27  the GNU Lesser General Public License along with this program. If
28  not, see <http://www.gnu.org/licenses/>. */
29 
30 /*
31  * This file is derived from RFC 3492bis written by Adam M. Costello.
32  *
33  * Disclaimer and license: Regarding this entire document or any
34  * portion of it (including the pseudocode and C code), the author
35  * makes no guarantees and is not responsible for any damage resulting
36  * from its use. The author grants irrevocable permission to anyone
37  * to use, modify, and distribute it in any way that does not diminish
38  * the rights of anyone else to use, modify, and distribute it,
39  * provided that redistributed derivative works do not contain
40  * misleading author or version information. Derivative works need
41  * not be licensed under similar terms.
42  *
43  * Copyright (C) The Internet Society (2003). All Rights Reserved.
44  *
45  * This document and translations of it may be copied and furnished to
46  * others, and derivative works that comment on or otherwise explain it
47  * or assist in its implementation may be prepared, copied, published
48  * and distributed, in whole or in part, without restriction of any
49  * kind, provided that the above copyright notice and this paragraph are
50  * included on all such copies and derivative works. However, this
51  * document itself may not be modified in any way, such as by removing
52  * the copyright notice or references to the Internet Society or other
53  * Internet organizations, except as needed for the purpose of
54  * developing Internet standards in which case the procedures for
55  * copyrights defined in the Internet Standards process must be
56  * followed, or as required to translate it into languages other than
57  * English.
58  *
59  * The limited permissions granted above are perpetual and will not be
60  * revoked by the Internet Society or its successors or assigns.
61  *
62  * This document and the information contained herein is provided on an
63  * "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
64  * TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
65  * BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
66  * HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
67  * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
68  */
69 
70 #include <config.h>
71 #include <string.h>
72 
73 #include "punycode.h"
74 
/*** Bootstring parameters for Punycode ***/

enum
{
  base = 36,            /* number of digit values; digits run 0..base-1 */
  tmin = 1,             /* smallest threshold a digit position can have */
  tmax = 26,            /* largest threshold a digit position can have */
  skew = 38,            /* offset added to the divisor inside adapt() */
  damp = 700,           /* divisor applied to the very first delta */
  initial_bias = 72,    /* bias in effect before any delta is handled */
  initial_n = 0x80,     /* starting value of n, the first non-basic point */
  delimiter = 0x2D      /* ASCII hyphen-minus, ends the basic prefix */
};

/* basic(cp) is nonzero exactly when cp, viewed as a punycode_uint, */
/* is below 0x80, i.e. when cp is a basic code point. */
#define basic(cp) ((punycode_uint)(cp) < 0x80)

/* delim(cp) is nonzero exactly when cp is the delimiter code point. */
#define delim(cp) ((cp) == delimiter)
87 
88 /* decode_digit(cp) returns the numeric value of a basic code */
89 /* point (for use in representing integers) in the range 0 to */
90 /* base-1, or base if cp does not represent a value. */
91 
92 static punycode_uint
93 decode_digit (punycode_uint cp)
94 {
95  return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 :
96  cp - 97 < 26 ? cp - 97 : base;
97 }
98 
99 /* encode_digit(d,flag) returns the basic code point whose value */
100 /* (when used for representing integers) is d, which needs to be in */
101 /* the range 0 to base-1. The lowercase form is used unless flag is */
102 /* nonzero, in which case the uppercase form is used. The behavior */
103 /* is undefined if flag is nonzero and digit d has no uppercase form. */
104 
105 static char
106 encode_digit (punycode_uint d, int flag)
107 {
108  return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
109  /* 0..25 map to ASCII a..z or A..Z */
110  /* 26..35 map to ASCII 0..9 */
111 }
112 
113 /* flagged(bcp) tests whether a basic code point is flagged */
114 /* (uppercase). The behavior is undefined if bcp is not a */
115 /* basic code point. */
116 
117 #define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
118 
119 /* encode_basic(bcp,flag) forces a basic code point to lowercase */
120 /* if flag is zero, uppercase if flag is nonzero, and returns */
121 /* the resulting code point. The code point is unchanged if it */
122 /* is caseless. The behavior is undefined if bcp is not a basic */
123 /* code point. */
124 
125 static char
126 encode_basic (punycode_uint bcp, int flag)
127 {
128  bcp -= (bcp - 97 < 26) << 5;
129  return bcp + ((!flag && (bcp - 65 < 26)) << 5);
130 }
131 
132 /*** Platform-specific constants ***/
133 
134 /* maxint is the maximum value of a punycode_uint variable: */
135 static const punycode_uint maxint = -1;
136 /* Because maxint is unsigned, -1 becomes the maximum value. */
137 
138 /*** Bias adaptation function ***/
139 
140 static punycode_uint
141 adapt (punycode_uint delta, punycode_uint numpoints, int firsttime)
142 {
143  punycode_uint k;
144 
145  delta = firsttime ? delta / damp : delta >> 1;
146  /* delta >> 1 is a faster way of doing delta / 2 */
147  delta += delta / numpoints;
148 
149  for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base)
150  {
151  delta /= base - tmin;
152  }
153 
154  return k + (base - tmin + 1) * delta / (delta + skew);
155 }
156 
157 /*** Main encode function ***/
158 
196 int
197 punycode_encode (size_t input_length,
198  const punycode_uint input[],
199  const unsigned char case_flags[],
200  size_t * output_length, char output[])
201 {
202  punycode_uint input_len, n, delta, h, b, bias, j, m, q, k, t;
203  size_t out, max_out;
204 
205  /* The Punycode spec assumes that the input length is the same type */
206  /* of integer as a code point, so we need to convert the size_t to */
207  /* a punycode_uint, which could overflow. */
208 
209  if (input_length > maxint)
210  return punycode_overflow;
211  input_len = (punycode_uint) input_length;
212 
213  /* Initialize the state: */
214 
215  n = initial_n;
216  delta = 0;
217  out = 0;
218  max_out = *output_length;
219  bias = initial_bias;
220 
221  /* Handle the basic code points: */
222 
223  for (j = 0; j < input_len; ++j)
224  {
225  if (basic (input[j]))
226  {
227  if (max_out - out < 2)
228  return punycode_big_output;
229  output[out++] = case_flags ?
230  encode_basic (input[j], case_flags[j]) : (char) input[j];
231  }
232  /* else if (input[j] < n) return punycode_bad_input; */
233  /* (not needed for Punycode with unsigned code points) */
234  }
235 
236  h = b = (punycode_uint) out;
237  /* cannot overflow because out <= input_len <= maxint */
238 
239  /* h is the number of code points that have been handled, b is the */
240  /* number of basic code points, and out is the number of ASCII code */
241  /* points that have been output. */
242 
243  if (b > 0)
244  output[out++] = delimiter;
245 
246  /* Main encoding loop: */
247 
248  while (h < input_len)
249  {
250  /* All non-basic code points < n have been */
251  /* handled already. Find the next larger one: */
252 
253  for (m = maxint, j = 0; j < input_len; ++j)
254  {
255  /* if (basic(input[j])) continue; */
256  /* (not needed for Punycode) */
257  if (input[j] >= n && input[j] < m)
258  m = input[j];
259  }
260 
261  /* Increase delta enough to advance the decoder's */
262  /* <n,i> state to <m,0>, but guard against overflow: */
263 
264  if (m - n > (maxint - delta) / (h + 1))
265  return punycode_overflow;
266  delta += (m - n) * (h + 1);
267  n = m;
268 
269  for (j = 0; j < input_len; ++j)
270  {
271  /* Punycode does not need to check whether input[j] is basic: */
272  if (input[j] < n /* || basic(input[j]) */ )
273  {
274  if (++delta == 0)
275  return punycode_overflow;
276  }
277 
278  if (input[j] == n)
279  {
280  /* Represent delta as a generalized variable-length integer: */
281 
282  for (q = delta, k = base;; k += base)
283  {
284  if (out >= max_out)
285  return punycode_big_output;
286  t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
287  k >= bias + tmax ? tmax : k - bias;
288  if (q < t)
289  break;
290  output[out++] = encode_digit (t + (q - t) % (base - t), 0);
291  q = (q - t) / (base - t);
292  }
293 
294  output[out++] = encode_digit (q, case_flags && case_flags[j]);
295  bias = adapt (delta, h + 1, h == b);
296  delta = 0;
297  ++h;
298  }
299  }
300 
301  ++delta, ++n;
302  }
303 
304  *output_length = out;
305  return punycode_success;
306 }
307 
308 /*** Main decode function ***/
309 
345 int
346 punycode_decode (size_t input_length,
347  const char input[],
348  size_t * output_length,
349  punycode_uint output[], unsigned char case_flags[])
350 {
351  punycode_uint n, out, i, max_out, bias, oldi, w, k, digit, t;
352  size_t b, j, in;
353 
354  /* Initialize the state: */
355 
356  n = initial_n;
357  out = i = 0;
358  max_out = *output_length > maxint ? maxint
359  : (punycode_uint) * output_length;
360  bias = initial_bias;
361 
362  /* Handle the basic code points: Let b be the number of input code */
363  /* points before the last delimiter, or 0 if there is none, then */
364  /* copy the first b code points to the output. */
365 
366  for (b = j = 0; j < input_length; ++j)
367  if (delim (input[j]))
368  b = j;
369  if (b > max_out)
370  return punycode_big_output;
371 
372  for (j = 0; j < b; ++j)
373  {
374  if (case_flags)
375  case_flags[out] = flagged (input[j]);
376  if (!basic (input[j]))
377  return punycode_bad_input;
378  output[out++] = input[j];
379  }
380 
381  /* Main decoding loop: Start just after the last delimiter if any */
382  /* basic code points were copied; start at the beginning otherwise. */
383 
384  for (in = b > 0 ? b + 1 : 0; in < input_length; ++out)
385  {
386 
387  /* in is the index of the next ASCII code point to be consumed, */
388  /* and out is the number of code points in the output array. */
389 
390  /* Decode a generalized variable-length integer into delta, */
391  /* which gets added to i. The overflow checking is easier */
392  /* if we increase i as we go, then subtract off its starting */
393  /* value at the end to obtain delta. */
394 
395  for (oldi = i, w = 1, k = base;; k += base)
396  {
397  if (in >= input_length)
398  return punycode_bad_input;
399  digit = decode_digit (input[in++]);
400  if (digit >= base)
401  return punycode_bad_input;
402  if (digit > (maxint - i) / w)
403  return punycode_overflow;
404  i += digit * w;
405  t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
406  k >= bias + tmax ? tmax : k - bias;
407  if (digit < t)
408  break;
409  if (w > maxint / (base - t))
410  return punycode_overflow;
411  w *= (base - t);
412  }
413 
414  bias = adapt (i - oldi, out + 1, oldi == 0);
415 
416  /* i was supposed to wrap around from out+1 to 0, */
417  /* incrementing n each time, so we'll fix that now: */
418 
419  if (i / (out + 1) > maxint - n)
420  return punycode_overflow;
421  n += i / (out + 1);
422  i %= (out + 1);
423 
424  /* Insert n at position i of the output: */
425 
426  /* not needed for Punycode: */
427  /* if (basic(n)) return punycode_invalid_input; */
428  if (out >= max_out)
429  return punycode_big_output;
430 
431  if (case_flags)
432  {
433  memmove (case_flags + i + 1, case_flags + i, out - i);
434  /* Case of last ASCII code point determines case flag: */
435  case_flags[i] = flagged (input[in - 1]);
436  }
437 
438  memmove (output + i + 1, output + i, (out - i) * sizeof *output);
439  output[i++] = n;
440  }
441 
442  *output_length = (size_t) out;
443  /* cannot overflow because out <= old value of *output_length */
444  return punycode_success;
445 }
446