libidn  1.25
punycode.c
Go to the documentation of this file.
00001 /* punycode.c --- Implementation of punycode used to ASCII encode IDN's.
00002    Copyright (C) 2002-2012 Simon Josefsson
00003 
00004    This file is part of GNU Libidn.
00005 
00006    GNU Libidn is free software: you can redistribute it and/or
00007    modify it under the terms of either:
00008 
00009      * the GNU Lesser General Public License as published by the Free
00010        Software Foundation; either version 3 of the License, or (at
00011        your option) any later version.
00012 
00013    or
00014 
00015      * the GNU General Public License as published by the Free
00016        Software Foundation; either version 2 of the License, or (at
00017        your option) any later version.
00018 
00019    or both in parallel, as here.
00020 
00021    GNU Libidn is distributed in the hope that it will be useful,
00022    but WITHOUT ANY WARRANTY; without even the implied warranty of
00023    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00024    General Public License for more details.
00025 
00026    You should have received copies of the GNU General Public License and
00027    the GNU Lesser General Public License along with this program.  If
00028    not, see <http://www.gnu.org/licenses/>. */
00029 
00030 /*
00031  * This file is derived from RFC 3492bis written by Adam M. Costello.
00032  *
00033  * Disclaimer and license: Regarding this entire document or any
00034  * portion of it (including the pseudocode and C code), the author
00035  * makes no guarantees and is not responsible for any damage resulting
00036  * from its use.  The author grants irrevocable permission to anyone
00037  * to use, modify, and distribute it in any way that does not diminish
00038  * the rights of anyone else to use, modify, and distribute it,
00039  * provided that redistributed derivative works do not contain
00040  * misleading author or version information.  Derivative works need
00041  * not be licensed under similar terms.
00042  *
00043  * Copyright (C) The Internet Society (2003).  All Rights Reserved.
00044  *
00045  * This document and translations of it may be copied and furnished to
00046  * others, and derivative works that comment on or otherwise explain it
00047  * or assist in its implementation may be prepared, copied, published
00048  * and distributed, in whole or in part, without restriction of any
00049  * kind, provided that the above copyright notice and this paragraph are
00050  * included on all such copies and derivative works.  However, this
00051  * document itself may not be modified in any way, such as by removing
00052  * the copyright notice or references to the Internet Society or other
00053  * Internet organizations, except as needed for the purpose of
00054  * developing Internet standards in which case the procedures for
00055  * copyrights defined in the Internet Standards process must be
00056  * followed, or as required to translate it into languages other than
00057  * English.
00058  *
00059  * The limited permissions granted above are perpetual and will not be
00060  * revoked by the Internet Society or its successors or assigns.
00061  *
00062  * This document and the information contained herein is provided on an
00063  * "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
00064  * TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
00065  * BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
00066  * HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
00067  * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
00068  */
00069 
00070 #include <config.h>
00071 #include <string.h>
00072 
00073 #include "punycode.h"
00074 
00075 /*** Bootstring parameters for Punycode ***/
00076 
00077 enum
00078 { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
00079   initial_bias = 72, initial_n = 0x80, delimiter = 0x2D
00080 };
00081 
00082 /* basic(cp) tests whether cp is a basic code point: */
00083 #define basic(cp) ((punycode_uint)(cp) < 0x80)
00084 
00085 /* delim(cp) tests whether cp is a delimiter: */
00086 #define delim(cp) ((cp) == delimiter)
00087 
00088 /* decode_digit(cp) returns the numeric value of a basic code */
00089 /* point (for use in representing integers) in the range 0 to */
00090 /* base-1, or base if cp does not represent a value.          */
00091 
00092 static punycode_uint
00093 decode_digit (punycode_uint cp)
00094 {
00095   return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 :
00096     cp - 97 < 26 ? cp - 97 : base;
00097 }
00098 
00099 /* encode_digit(d,flag) returns the basic code point whose value      */
00100 /* (when used for representing integers) is d, which needs to be in   */
00101 /* the range 0 to base-1.  The lowercase form is used unless flag is  */
00102 /* nonzero, in which case the uppercase form is used.  The behavior   */
00103 /* is undefined if flag is nonzero and digit d has no uppercase form. */
00104 
00105 static char
00106 encode_digit (punycode_uint d, int flag)
00107 {
00108   return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
00109   /*  0..25 map to ASCII a..z or A..Z */
00110   /* 26..35 map to ASCII 0..9         */
00111 }
00112 
00113 /* flagged(bcp) tests whether a basic code point is flagged */
00114 /* (uppercase).  The behavior is undefined if bcp is not a  */
00115 /* basic code point.                                        */
00116 
00117 #define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
00118 
00119 /* encode_basic(bcp,flag) forces a basic code point to lowercase */
00120 /* if flag is zero, uppercase if flag is nonzero, and returns    */
00121 /* the resulting code point.  The code point is unchanged if it  */
00122 /* is caseless.  The behavior is undefined if bcp is not a basic */
00123 /* code point.                                                   */
00124 
00125 static char
00126 encode_basic (punycode_uint bcp, int flag)
00127 {
00128   bcp -= (bcp - 97 < 26) << 5;
00129   return bcp + ((!flag && (bcp - 65 < 26)) << 5);
00130 }
00131 
00132 /*** Platform-specific constants ***/
00133 
00134 /* maxint is the maximum value of a punycode_uint variable: */
00135 static const punycode_uint maxint = -1;
00136 /* Because maxint is unsigned, -1 becomes the maximum value. */
00137 
00138 /*** Bias adaptation function ***/
00139 
00140 static punycode_uint
00141 adapt (punycode_uint delta, punycode_uint numpoints, int firsttime)
00142 {
00143   punycode_uint k;
00144 
00145   delta = firsttime ? delta / damp : delta >> 1;
00146   /* delta >> 1 is a faster way of doing delta / 2 */
00147   delta += delta / numpoints;
00148 
00149   for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base)
00150     {
00151       delta /= base - tmin;
00152     }
00153 
00154   return k + (base - tmin + 1) * delta / (delta + skew);
00155 }
00156 
00157 /*** Main encode function ***/
00158 
00196 int
00197 punycode_encode (size_t input_length,
00198                  const punycode_uint input[],
00199                  const unsigned char case_flags[],
00200                  size_t * output_length, char output[])
00201 {
00202   punycode_uint input_len, n, delta, h, b, bias, j, m, q, k, t;
00203   size_t out, max_out;
00204 
00205   /* The Punycode spec assumes that the input length is the same type */
00206   /* of integer as a code point, so we need to convert the size_t to  */
00207   /* a punycode_uint, which could overflow.                           */
00208 
00209   if (input_length > maxint)
00210     return punycode_overflow;
00211   input_len = (punycode_uint) input_length;
00212 
00213   /* Initialize the state: */
00214 
00215   n = initial_n;
00216   delta = 0;
00217   out = 0;
00218   max_out = *output_length;
00219   bias = initial_bias;
00220 
00221   /* Handle the basic code points: */
00222 
00223   for (j = 0; j < input_len; ++j)
00224     {
00225       if (basic (input[j]))
00226         {
00227           if (max_out - out < 2)
00228             return punycode_big_output;
00229           output[out++] = case_flags ?
00230             encode_basic (input[j], case_flags[j]) : (char) input[j];
00231         }
00232       /* else if (input[j] < n) return punycode_bad_input; */
00233       /* (not needed for Punycode with unsigned code points) */
00234     }
00235 
00236   h = b = (punycode_uint) out;
00237   /* cannot overflow because out <= input_len <= maxint */
00238 
00239   /* h is the number of code points that have been handled, b is the  */
00240   /* number of basic code points, and out is the number of ASCII code */
00241   /* points that have been output.                                    */
00242 
00243   if (b > 0)
00244     output[out++] = delimiter;
00245 
00246   /* Main encoding loop: */
00247 
00248   while (h < input_len)
00249     {
00250       /* All non-basic code points < n have been     */
00251       /* handled already.  Find the next larger one: */
00252 
00253       for (m = maxint, j = 0; j < input_len; ++j)
00254         {
00255           /* if (basic(input[j])) continue; */
00256           /* (not needed for Punycode) */
00257           if (input[j] >= n && input[j] < m)
00258             m = input[j];
00259         }
00260 
00261       /* Increase delta enough to advance the decoder's    */
00262       /* <n,i> state to <m,0>, but guard against overflow: */
00263 
00264       if (m - n > (maxint - delta) / (h + 1))
00265         return punycode_overflow;
00266       delta += (m - n) * (h + 1);
00267       n = m;
00268 
00269       for (j = 0; j < input_len; ++j)
00270         {
00271           /* Punycode does not need to check whether input[j] is basic: */
00272           if (input[j] < n /* || basic(input[j]) */ )
00273             {
00274               if (++delta == 0)
00275                 return punycode_overflow;
00276             }
00277 
00278           if (input[j] == n)
00279             {
00280               /* Represent delta as a generalized variable-length integer: */
00281 
00282               for (q = delta, k = base;; k += base)
00283                 {
00284                   if (out >= max_out)
00285                     return punycode_big_output;
00286                   t = k <= bias /* + tmin */ ? tmin :   /* +tmin not needed */
00287                     k >= bias + tmax ? tmax : k - bias;
00288                   if (q < t)
00289                     break;
00290                   output[out++] = encode_digit (t + (q - t) % (base - t), 0);
00291                   q = (q - t) / (base - t);
00292                 }
00293 
00294               output[out++] = encode_digit (q, case_flags && case_flags[j]);
00295               bias = adapt (delta, h + 1, h == b);
00296               delta = 0;
00297               ++h;
00298             }
00299         }
00300 
00301       ++delta, ++n;
00302     }
00303 
00304   *output_length = out;
00305   return punycode_success;
00306 }
00307 
00308 /*** Main decode function ***/
00309 
00345 int
00346 punycode_decode (size_t input_length,
00347                  const char input[],
00348                  size_t * output_length,
00349                  punycode_uint output[], unsigned char case_flags[])
00350 {
00351   punycode_uint n, out, i, max_out, bias, oldi, w, k, digit, t;
00352   size_t b, j, in;
00353 
00354   /* Initialize the state: */
00355 
00356   n = initial_n;
00357   out = i = 0;
00358   max_out = *output_length > maxint ? maxint
00359     : (punycode_uint) * output_length;
00360   bias = initial_bias;
00361 
00362   /* Handle the basic code points:  Let b be the number of input code */
00363   /* points before the last delimiter, or 0 if there is none, then    */
00364   /* copy the first b code points to the output.                      */
00365 
00366   for (b = j = 0; j < input_length; ++j)
00367     if (delim (input[j]))
00368       b = j;
00369   if (b > max_out)
00370     return punycode_big_output;
00371 
00372   for (j = 0; j < b; ++j)
00373     {
00374       if (case_flags)
00375         case_flags[out] = flagged (input[j]);
00376       if (!basic (input[j]))
00377         return punycode_bad_input;
00378       output[out++] = input[j];
00379     }
00380 
00381   /* Main decoding loop:  Start just after the last delimiter if any  */
00382   /* basic code points were copied; start at the beginning otherwise. */
00383 
00384   for (in = b > 0 ? b + 1 : 0; in < input_length; ++out)
00385     {
00386 
00387       /* in is the index of the next ASCII code point to be consumed, */
00388       /* and out is the number of code points in the output array.    */
00389 
00390       /* Decode a generalized variable-length integer into delta,  */
00391       /* which gets added to i.  The overflow checking is easier   */
00392       /* if we increase i as we go, then subtract off its starting */
00393       /* value at the end to obtain delta.                         */
00394 
00395       for (oldi = i, w = 1, k = base;; k += base)
00396         {
00397           if (in >= input_length)
00398             return punycode_bad_input;
00399           digit = decode_digit (input[in++]);
00400           if (digit >= base)
00401             return punycode_bad_input;
00402           if (digit > (maxint - i) / w)
00403             return punycode_overflow;
00404           i += digit * w;
00405           t = k <= bias /* + tmin */ ? tmin :   /* +tmin not needed */
00406             k >= bias + tmax ? tmax : k - bias;
00407           if (digit < t)
00408             break;
00409           if (w > maxint / (base - t))
00410             return punycode_overflow;
00411           w *= (base - t);
00412         }
00413 
00414       bias = adapt (i - oldi, out + 1, oldi == 0);
00415 
00416       /* i was supposed to wrap around from out+1 to 0,   */
00417       /* incrementing n each time, so we'll fix that now: */
00418 
00419       if (i / (out + 1) > maxint - n)
00420         return punycode_overflow;
00421       n += i / (out + 1);
00422       i %= (out + 1);
00423 
00424       /* Insert n at position i of the output: */
00425 
00426       /* not needed for Punycode: */
00427       /* if (basic(n)) return punycode_invalid_input; */
00428       if (out >= max_out)
00429         return punycode_big_output;
00430 
00431       if (case_flags)
00432         {
00433           memmove (case_flags + i + 1, case_flags + i, out - i);
00434           /* Case of last ASCII code point determines case flag: */
00435           case_flags[i] = flagged (input[in - 1]);
00436         }
00437 
00438       memmove (output + i + 1, output + i, (out - i) * sizeof *output);
00439       output[i++] = n;
00440     }
00441 
00442   *output_length = (size_t) out;
00443   /* cannot overflow because out <= old value of *output_length */
00444   return punycode_success;
00445 }
00446