|
libidn
1.25
|
00001 /* punycode.c --- Implementation of punycode used to ASCII encode IDN's. 00002 Copyright (C) 2002-2012 Simon Josefsson 00003 00004 This file is part of GNU Libidn. 00005 00006 GNU Libidn is free software: you can redistribute it and/or 00007 modify it under the terms of either: 00008 00009 * the GNU Lesser General Public License as published by the Free 00010 Software Foundation; either version 3 of the License, or (at 00011 your option) any later version. 00012 00013 or 00014 00015 * the GNU General Public License as published by the Free 00016 Software Foundation; either version 2 of the License, or (at 00017 your option) any later version. 00018 00019 or both in parallel, as here. 00020 00021 GNU Libidn is distributed in the hope that it will be useful, 00022 but WITHOUT ANY WARRANTY; without even the implied warranty of 00023 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00024 General Public License for more details. 00025 00026 You should have received copies of the GNU General Public License and 00027 the GNU Lesser General Public License along with this program. If 00028 not, see <http://www.gnu.org/licenses/>. */ 00029 00030 /* 00031 * This file is derived from RFC 3492bis written by Adam M. Costello. 00032 * 00033 * Disclaimer and license: Regarding this entire document or any 00034 * portion of it (including the pseudocode and C code), the author 00035 * makes no guarantees and is not responsible for any damage resulting 00036 * from its use. The author grants irrevocable permission to anyone 00037 * to use, modify, and distribute it in any way that does not diminish 00038 * the rights of anyone else to use, modify, and distribute it, 00039 * provided that redistributed derivative works do not contain 00040 * misleading author or version information. Derivative works need 00041 * not be licensed under similar terms. 00042 * 00043 * Copyright (C) The Internet Society (2003). All Rights Reserved. 00044 * 00045 * This document and translations of it may be copied and furnished to 00046 * others, and derivative works that comment on or otherwise explain it 00047 * or assist in its implementation may be prepared, copied, published 00048 * and distributed, in whole or in part, without restriction of any 00049 * kind, provided that the above copyright notice and this paragraph are 00050 * included on all such copies and derivative works. However, this 00051 * document itself may not be modified in any way, such as by removing 00052 * the copyright notice or references to the Internet Society or other 00053 * Internet organizations, except as needed for the purpose of 00054 * developing Internet standards in which case the procedures for 00055 * copyrights defined in the Internet Standards process must be 00056 * followed, or as required to translate it into languages other than 00057 * English. 00058 * 00059 * The limited permissions granted above are perpetual and will not be 00060 * revoked by the Internet Society or its successors or assigns. 00061 * 00062 * This document and the information contained herein is provided on an 00063 * "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING 00064 * TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING 00065 * BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION 00066 * HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF 00067 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. 00068 */ 00069 00070 #include <config.h> 00071 #include <string.h> 00072 00073 #include "punycode.h" 00074 00075 /*** Bootstring parameters for Punycode ***/ 00076 00077 enum 00078 { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700, 00079 initial_bias = 72, initial_n = 0x80, delimiter = 0x2D 00080 }; 00081 00082 /* basic(cp) tests whether cp is a basic code point: */ 00083 #define basic(cp) ((punycode_uint)(cp) < 0x80) 00084 00085 /* delim(cp) tests whether cp is a delimiter: */ 00086 #define delim(cp) ((cp) == delimiter) 00087 00088 /* decode_digit(cp) returns the numeric value of a basic code */ 00089 /* point (for use in representing integers) in the range 0 to */ 00090 /* base-1, or base if cp does not represent a value. */ 00091 00092 static punycode_uint 00093 decode_digit (punycode_uint cp) 00094 { 00095 return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 : 00096 cp - 97 < 26 ? cp - 97 : base; 00097 } 00098 00099 /* encode_digit(d,flag) returns the basic code point whose value */ 00100 /* (when used for representing integers) is d, which needs to be in */ 00101 /* the range 0 to base-1. The lowercase form is used unless flag is */ 00102 /* nonzero, in which case the uppercase form is used. The behavior */ 00103 /* is undefined if flag is nonzero and digit d has no uppercase form. */ 00104 00105 static char 00106 encode_digit (punycode_uint d, int flag) 00107 { 00108 return d + 22 + 75 * (d < 26) - ((flag != 0) << 5); 00109 /* 0..25 map to ASCII a..z or A..Z */ 00110 /* 26..35 map to ASCII 0..9 */ 00111 } 00112 00113 /* flagged(bcp) tests whether a basic code point is flagged */ 00114 /* (uppercase). The behavior is undefined if bcp is not a */ 00115 /* basic code point. */ 00116 00117 #define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26) 00118 00119 /* encode_basic(bcp,flag) forces a basic code point to lowercase */ 00120 /* if flag is zero, uppercase if flag is nonzero, and returns */ 00121 /* the resulting code point. The code point is unchanged if it */ 00122 /* is caseless. The behavior is undefined if bcp is not a basic */ 00123 /* code point. */ 00124 00125 static char 00126 encode_basic (punycode_uint bcp, int flag) 00127 { 00128 bcp -= (bcp - 97 < 26) << 5; 00129 return bcp + ((!flag && (bcp - 65 < 26)) << 5); 00130 } 00131 00132 /*** Platform-specific constants ***/ 00133 00134 /* maxint is the maximum value of a punycode_uint variable: */ 00135 static const punycode_uint maxint = -1; 00136 /* Because maxint is unsigned, -1 becomes the maximum value. */ 00137 00138 /*** Bias adaptation function ***/ 00139 00140 static punycode_uint 00141 adapt (punycode_uint delta, punycode_uint numpoints, int firsttime) 00142 { 00143 punycode_uint k; 00144 00145 delta = firsttime ? delta / damp : delta >> 1; 00146 /* delta >> 1 is a faster way of doing delta / 2 */ 00147 delta += delta / numpoints; 00148 00149 for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base) 00150 { 00151 delta /= base - tmin; 00152 } 00153 00154 return k + (base - tmin + 1) * delta / (delta + skew); 00155 } 00156 00157 /*** Main encode function ***/ 00158 00196 int 00197 punycode_encode (size_t input_length, 00198 const punycode_uint input[], 00199 const unsigned char case_flags[], 00200 size_t * output_length, char output[]) 00201 { 00202 punycode_uint input_len, n, delta, h, b, bias, j, m, q, k, t; 00203 size_t out, max_out; 00204 00205 /* The Punycode spec assumes that the input length is the same type */ 00206 /* of integer as a code point, so we need to convert the size_t to */ 00207 /* a punycode_uint, which could overflow. */ 00208 00209 if (input_length > maxint) 00210 return punycode_overflow; 00211 input_len = (punycode_uint) input_length; 00212 00213 /* Initialize the state: */ 00214 00215 n = initial_n; 00216 delta = 0; 00217 out = 0; 00218 max_out = *output_length; 00219 bias = initial_bias; 00220 00221 /* Handle the basic code points: */ 00222 00223 for (j = 0; j < input_len; ++j) 00224 { 00225 if (basic (input[j])) 00226 { 00227 if (max_out - out < 2) 00228 return punycode_big_output; 00229 output[out++] = case_flags ? 00230 encode_basic (input[j], case_flags[j]) : (char) input[j]; 00231 } 00232 /* else if (input[j] < n) return punycode_bad_input; */ 00233 /* (not needed for Punycode with unsigned code points) */ 00234 } 00235 00236 h = b = (punycode_uint) out; 00237 /* cannot overflow because out <= input_len <= maxint */ 00238 00239 /* h is the number of code points that have been handled, b is the */ 00240 /* number of basic code points, and out is the number of ASCII code */ 00241 /* points that have been output. */ 00242 00243 if (b > 0) 00244 output[out++] = delimiter; 00245 00246 /* Main encoding loop: */ 00247 00248 while (h < input_len) 00249 { 00250 /* All non-basic code points < n have been */ 00251 /* handled already. Find the next larger one: */ 00252 00253 for (m = maxint, j = 0; j < input_len; ++j) 00254 { 00255 /* if (basic(input[j])) continue; */ 00256 /* (not needed for Punycode) */ 00257 if (input[j] >= n && input[j] < m) 00258 m = input[j]; 00259 } 00260 00261 /* Increase delta enough to advance the decoder's */ 00262 /* <n,i> state to <m,0>, but guard against overflow: */ 00263 00264 if (m - n > (maxint - delta) / (h + 1)) 00265 return punycode_overflow; 00266 delta += (m - n) * (h + 1); 00267 n = m; 00268 00269 for (j = 0; j < input_len; ++j) 00270 { 00271 /* Punycode does not need to check whether input[j] is basic: */ 00272 if (input[j] < n /* || basic(input[j]) */ ) 00273 { 00274 if (++delta == 0) 00275 return punycode_overflow; 00276 } 00277 00278 if (input[j] == n) 00279 { 00280 /* Represent delta as a generalized variable-length integer: */ 00281 00282 for (q = delta, k = base;; k += base) 00283 { 00284 if (out >= max_out) 00285 return punycode_big_output; 00286 t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */ 00287 k >= bias + tmax ? tmax : k - bias; 00288 if (q < t) 00289 break; 00290 output[out++] = encode_digit (t + (q - t) % (base - t), 0); 00291 q = (q - t) / (base - t); 00292 } 00293 00294 output[out++] = encode_digit (q, case_flags && case_flags[j]); 00295 bias = adapt (delta, h + 1, h == b); 00296 delta = 0; 00297 ++h; 00298 } 00299 } 00300 00301 ++delta, ++n; 00302 } 00303 00304 *output_length = out; 00305 return punycode_success; 00306 } 00307 00308 /*** Main decode function ***/ 00309 00345 int 00346 punycode_decode (size_t input_length, 00347 const char input[], 00348 size_t * output_length, 00349 punycode_uint output[], unsigned char case_flags[]) 00350 { 00351 punycode_uint n, out, i, max_out, bias, oldi, w, k, digit, t; 00352 size_t b, j, in; 00353 00354 /* Initialize the state: */ 00355 00356 n = initial_n; 00357 out = i = 0; 00358 max_out = *output_length > maxint ? maxint 00359 : (punycode_uint) * output_length; 00360 bias = initial_bias; 00361 00362 /* Handle the basic code points: Let b be the number of input code */ 00363 /* points before the last delimiter, or 0 if there is none, then */ 00364 /* copy the first b code points to the output. */ 00365 00366 for (b = j = 0; j < input_length; ++j) 00367 if (delim (input[j])) 00368 b = j; 00369 if (b > max_out) 00370 return punycode_big_output; 00371 00372 for (j = 0; j < b; ++j) 00373 { 00374 if (case_flags) 00375 case_flags[out] = flagged (input[j]); 00376 if (!basic (input[j])) 00377 return punycode_bad_input; 00378 output[out++] = input[j]; 00379 } 00380 00381 /* Main decoding loop: Start just after the last delimiter if any */ 00382 /* basic code points were copied; start at the beginning otherwise. */ 00383 00384 for (in = b > 0 ? b + 1 : 0; in < input_length; ++out) 00385 { 00386 00387 /* in is the index of the next ASCII code point to be consumed, */ 00388 /* and out is the number of code points in the output array. */ 00389 00390 /* Decode a generalized variable-length integer into delta, */ 00391 /* which gets added to i. The overflow checking is easier */ 00392 /* if we increase i as we go, then subtract off its starting */ 00393 /* value at the end to obtain delta. */ 00394 00395 for (oldi = i, w = 1, k = base;; k += base) 00396 { 00397 if (in >= input_length) 00398 return punycode_bad_input; 00399 digit = decode_digit (input[in++]); 00400 if (digit >= base) 00401 return punycode_bad_input; 00402 if (digit > (maxint - i) / w) 00403 return punycode_overflow; 00404 i += digit * w; 00405 t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */ 00406 k >= bias + tmax ? tmax : k - bias; 00407 if (digit < t) 00408 break; 00409 if (w > maxint / (base - t)) 00410 return punycode_overflow; 00411 w *= (base - t); 00412 } 00413 00414 bias = adapt (i - oldi, out + 1, oldi == 0); 00415 00416 /* i was supposed to wrap around from out+1 to 0, */ 00417 /* incrementing n each time, so we'll fix that now: */ 00418 00419 if (i / (out + 1) > maxint - n) 00420 return punycode_overflow; 00421 n += i / (out + 1); 00422 i %= (out + 1); 00423 00424 /* Insert n at position i of the output: */ 00425 00426 /* not needed for Punycode: */ 00427 /* if (basic(n)) return punycode_invalid_input; */ 00428 if (out >= max_out) 00429 return punycode_big_output; 00430 00431 if (case_flags) 00432 { 00433 memmove (case_flags + i + 1, case_flags + i, out - i); 00434 /* Case of last ASCII code point determines case flag: */ 00435 case_flags[i] = flagged (input[in - 1]); 00436 } 00437 00438 memmove (output + i + 1, output + i, (out - i) * sizeof *output); 00439 output[i++] = n; 00440 } 00441 00442 *output_length = (size_t) out; 00443 /* cannot overflow because out <= old value of *output_length */ 00444 return punycode_success; 00445 } 00446
1.7.6.1