|
libidn
1.25
|
00001 /* idna.c --- Prototypes for Internationalized Domain Name library. 00002 Copyright (C) 2002-2012 Simon Josefsson 00003 00004 This file is part of GNU Libidn. 00005 00006 GNU Libidn is free software: you can redistribute it and/or 00007 modify it under the terms of either: 00008 00009 * the GNU Lesser General Public License as published by the Free 00010 Software Foundation; either version 3 of the License, or (at 00011 your option) any later version. 00012 00013 or 00014 00015 * the GNU General Public License as published by the Free 00016 Software Foundation; either version 2 of the License, or (at 00017 your option) any later version. 00018 00019 or both in parallel, as here. 00020 00021 GNU Libidn is distributed in the hope that it will be useful, 00022 but WITHOUT ANY WARRANTY; without even the implied warranty of 00023 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00024 General Public License for more details. 00025 00026 You should have received copies of the GNU General Public License and 00027 the GNU Lesser General Public License along with this program. If 00028 not, see <http://www.gnu.org/licenses/>. */ 00029 00030 #ifdef HAVE_CONFIG_H 00031 # include "config.h" 00032 #endif 00033 00034 #include <stdlib.h> 00035 #include <string.h> 00036 #include <stringprep.h> 00037 #include <punycode.h> 00038 00039 #include "idna.h" 00040 00041 /* Get c_strcasecmp. */ 00042 #include <c-strcase.h> 00043 00044 #define DOTP(c) ((c) == 0x002E || (c) == 0x3002 || \ 00045 (c) == 0xFF0E || (c) == 0xFF61) 00046 00047 /* Core functions */ 00048 00080 int 00081 idna_to_ascii_4i (const uint32_t * in, size_t inlen, char *out, int flags) 00082 { 00083 size_t len, outlen; 00084 uint32_t *src; /* XXX don't need to copy data? */ 00085 int rc; 00086 00087 /* 00088 * ToASCII consists of the following steps: 00089 * 00090 * 1. If all code points in the sequence are in the ASCII range (0..7F) 00091 * then skip to step 3. 00092 */ 00093 00094 { 00095 size_t i; 00096 int inasciirange; 00097 00098 inasciirange = 1; 00099 for (i = 0; i < inlen; i++) 00100 if (in[i] > 0x7F) 00101 inasciirange = 0; 00102 if (inasciirange) 00103 { 00104 src = malloc (sizeof (in[0]) * (inlen + 1)); 00105 if (src == NULL) 00106 return IDNA_MALLOC_ERROR; 00107 00108 memcpy (src, in, sizeof (in[0]) * inlen); 00109 src[inlen] = 0; 00110 00111 goto step3; 00112 } 00113 } 00114 00115 /* 00116 * 2. Perform the steps specified in [NAMEPREP] and fail if there is 00117 * an error. The AllowUnassigned flag is used in [NAMEPREP]. 00118 */ 00119 00120 { 00121 char *p; 00122 00123 p = stringprep_ucs4_to_utf8 (in, (ssize_t) inlen, NULL, NULL); 00124 if (p == NULL) 00125 return IDNA_MALLOC_ERROR; 00126 00127 len = strlen (p); 00128 do 00129 { 00130 char *newp; 00131 00132 len = 2 * len + 10; /* XXX better guess? */ 00133 newp = realloc (p, len); 00134 if (newp == NULL) 00135 { 00136 free (p); 00137 return IDNA_MALLOC_ERROR; 00138 } 00139 p = newp; 00140 00141 if (flags & IDNA_ALLOW_UNASSIGNED) 00142 rc = stringprep_nameprep (p, len); 00143 else 00144 rc = stringprep_nameprep_no_unassigned (p, len); 00145 } 00146 while (rc == STRINGPREP_TOO_SMALL_BUFFER); 00147 00148 if (rc != STRINGPREP_OK) 00149 { 00150 free (p); 00151 return IDNA_STRINGPREP_ERROR; 00152 } 00153 00154 src = stringprep_utf8_to_ucs4 (p, -1, NULL); 00155 00156 free (p); 00157 } 00158 00159 step3: 00160 /* 00161 * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks: 00162 * 00163 * (a) Verify the absence of non-LDH ASCII code points; that is, 00164 * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F. 00165 * 00166 * (b) Verify the absence of leading and trailing hyphen-minus; 00167 * that is, the absence of U+002D at the beginning and end of 00168 * the sequence. 00169 */ 00170 00171 if (flags & IDNA_USE_STD3_ASCII_RULES) 00172 { 00173 size_t i; 00174 00175 for (i = 0; src[i]; i++) 00176 if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F || 00177 (src[i] >= 0x3A && src[i] <= 0x40) || 00178 (src[i] >= 0x5B && src[i] <= 0x60) || 00179 (src[i] >= 0x7B && src[i] <= 0x7F)) 00180 { 00181 free (src); 00182 return IDNA_CONTAINS_NON_LDH; 00183 } 00184 00185 if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D)) 00186 { 00187 free (src); 00188 return IDNA_CONTAINS_MINUS; 00189 } 00190 } 00191 00192 /* 00193 * 4. If all code points in the sequence are in the ASCII range 00194 * (0..7F), then skip to step 8. 00195 */ 00196 00197 { 00198 size_t i; 00199 int inasciirange; 00200 00201 inasciirange = 1; 00202 for (i = 0; src[i]; i++) 00203 { 00204 if (src[i] > 0x7F) 00205 inasciirange = 0; 00206 /* copy string to output buffer if we are about to skip to step8 */ 00207 if (i < 64) 00208 out[i] = src[i]; 00209 } 00210 if (i < 64) 00211 out[i] = '\0'; 00212 if (inasciirange) 00213 goto step8; 00214 } 00215 00216 /* 00217 * 5. Verify that the sequence does NOT begin with the ACE prefix. 00218 * 00219 */ 00220 00221 { 00222 size_t i; 00223 int match; 00224 00225 match = 1; 00226 for (i = 0; match && i < strlen (IDNA_ACE_PREFIX); i++) 00227 if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i]) 00228 match = 0; 00229 if (match) 00230 { 00231 free (src); 00232 return IDNA_CONTAINS_ACE_PREFIX; 00233 } 00234 } 00235 00236 /* 00237 * 6. Encode the sequence using the encoding algorithm in [PUNYCODE] 00238 * and fail if there is an error. 00239 */ 00240 for (len = 0; src[len]; len++) 00241 ; 00242 src[len] = '\0'; 00243 outlen = 63 - strlen (IDNA_ACE_PREFIX); 00244 rc = punycode_encode (len, src, NULL, 00245 &outlen, &out[strlen (IDNA_ACE_PREFIX)]); 00246 if (rc != PUNYCODE_SUCCESS) 00247 { 00248 free (src); 00249 return IDNA_PUNYCODE_ERROR; 00250 } 00251 out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0'; 00252 00253 /* 00254 * 7. Prepend the ACE prefix. 00255 */ 00256 00257 memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX)); 00258 00259 /* 00260 * 8. Verify that the number of code points is in the range 1 to 63 00261 * inclusive (0 is excluded). 00262 */ 00263 00264 step8: 00265 free (src); 00266 if (strlen (out) < 1 || strlen (out) > 63) 00267 return IDNA_INVALID_LENGTH; 00268 00269 return IDNA_SUCCESS; 00270 } 00271 00272 /* ToUnicode(). May realloc() utf8in. Will free utf8in unconditionally. */ 00273 static int 00274 idna_to_unicode_internal (char *utf8in, 00275 uint32_t * out, size_t * outlen, int flags) 00276 { 00277 int rc; 00278 char tmpout[64]; 00279 size_t utf8len = strlen (utf8in) + 1; 00280 size_t addlen = 0; 00281 00282 /* 00283 * ToUnicode consists of the following steps: 00284 * 00285 * 1. If the sequence contains any code points outside the ASCII range 00286 * (0..7F) then proceed to step 2, otherwise skip to step 3. 00287 */ 00288 00289 { 00290 size_t i; 00291 int inasciirange; 00292 00293 inasciirange = 1; 00294 for (i = 0; utf8in[i]; i++) 00295 if (utf8in[i] & ~0x7F) 00296 inasciirange = 0; 00297 if (inasciirange) 00298 goto step3; 00299 } 00300 00301 /* 00302 * 2. Perform the steps specified in [NAMEPREP] and fail if there is an 00303 * error. (If step 3 of ToASCII is also performed here, it will not 00304 * affect the overall behavior of ToUnicode, but it is not 00305 * necessary.) The AllowUnassigned flag is used in [NAMEPREP]. 00306 */ 00307 do 00308 { 00309 char *newp = realloc (utf8in, utf8len + addlen); 00310 if (newp == NULL) 00311 { 00312 free (utf8in); 00313 return IDNA_MALLOC_ERROR; 00314 } 00315 utf8in = newp; 00316 if (flags & IDNA_ALLOW_UNASSIGNED) 00317 rc = stringprep_nameprep (utf8in, utf8len + addlen); 00318 else 00319 rc = stringprep_nameprep_no_unassigned (utf8in, utf8len + addlen); 00320 addlen += 1; 00321 } 00322 while (rc == STRINGPREP_TOO_SMALL_BUFFER); 00323 00324 if (rc != STRINGPREP_OK) 00325 { 00326 free (utf8in); 00327 return IDNA_STRINGPREP_ERROR; 00328 } 00329 00330 /* 3. Verify that the sequence begins with the ACE prefix, and save a 00331 * copy of the sequence. 00332 * ... The ToASCII and ToUnicode operations MUST recognize the ACE 00333 prefix in a case-insensitive manner. 00334 */ 00335 00336 step3: 00337 if (c_strncasecmp (utf8in, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX)) != 0) 00338 { 00339 free (utf8in); 00340 return IDNA_NO_ACE_PREFIX; 00341 } 00342 00343 /* 4. Remove the ACE prefix. 00344 */ 00345 00346 memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)], 00347 strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1); 00348 00349 /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE] 00350 * and fail if there is an error. Save a copy of the result of 00351 * this step. 00352 */ 00353 00354 (*outlen)--; /* reserve one for the zero */ 00355 00356 rc = punycode_decode (strlen (utf8in), utf8in, outlen, out, NULL); 00357 if (rc != PUNYCODE_SUCCESS) 00358 { 00359 free (utf8in); 00360 return IDNA_PUNYCODE_ERROR; 00361 } 00362 00363 out[*outlen] = 0; /* add zero */ 00364 00365 /* 6. Apply ToASCII. 00366 */ 00367 00368 rc = idna_to_ascii_4i (out, *outlen, tmpout, flags); 00369 if (rc != IDNA_SUCCESS) 00370 { 00371 free (utf8in); 00372 return rc; 00373 } 00374 00375 /* 7. Verify that the result of step 6 matches the saved copy from 00376 * step 3, using a case-insensitive ASCII comparison. 00377 */ 00378 00379 if (c_strcasecmp (utf8in, tmpout + strlen (IDNA_ACE_PREFIX)) != 0) 00380 { 00381 free (utf8in); 00382 return IDNA_ROUNDTRIP_VERIFY_ERROR; 00383 } 00384 00385 /* 8. Return the saved copy from step 5. 00386 */ 00387 00388 free (utf8in); 00389 return IDNA_SUCCESS; 00390 } 00391 00427 int 00428 idna_to_unicode_44i (const uint32_t * in, size_t inlen, 00429 uint32_t * out, size_t * outlen, int flags) 00430 { 00431 int rc; 00432 size_t outlensave = *outlen; 00433 char *p; 00434 00435 p = stringprep_ucs4_to_utf8 (in, (ssize_t) inlen, NULL, NULL); 00436 if (p == NULL) 00437 return IDNA_MALLOC_ERROR; 00438 00439 rc = idna_to_unicode_internal (p, out, outlen, flags); 00440 if (rc != IDNA_SUCCESS) 00441 { 00442 memcpy (out, in, sizeof (in[0]) * (inlen < outlensave ? 00443 inlen : outlensave)); 00444 *outlen = inlen; 00445 } 00446 00447 /* p is freed in idna_to_unicode_internal. */ 00448 00449 return rc; 00450 } 00451 00452 /* Wrappers that handle several labels */ 00453 00467 int 00468 idna_to_ascii_4z (const uint32_t * input, char **output, int flags) 00469 { 00470 const uint32_t *start = input; 00471 const uint32_t *end; 00472 char buf[64]; 00473 char *out = NULL; 00474 int rc; 00475 00476 /* 1) Whenever dots are used as label separators, the following 00477 characters MUST be recognized as dots: U+002E (full stop), 00478 U+3002 (ideographic full stop), U+FF0E (fullwidth full stop), 00479 U+FF61 (halfwidth ideographic full stop). */ 00480 00481 if (input[0] == 0) 00482 { 00483 /* Handle implicit zero-length root label. */ 00484 *output = malloc (1); 00485 if (!*output) 00486 return IDNA_MALLOC_ERROR; 00487 strcpy (*output, ""); 00488 return IDNA_SUCCESS; 00489 } 00490 00491 if (DOTP (input[0]) && input[1] == 0) 00492 { 00493 /* Handle explicit zero-length root label. */ 00494 *output = malloc (2); 00495 if (!*output) 00496 return IDNA_MALLOC_ERROR; 00497 strcpy (*output, "."); 00498 return IDNA_SUCCESS; 00499 } 00500 00501 *output = NULL; 00502 do 00503 { 00504 end = start; 00505 00506 for (; *end && !DOTP (*end); end++) 00507 ; 00508 00509 if (*end == '\0' && start == end) 00510 { 00511 /* Handle explicit zero-length root label. */ 00512 buf[0] = '\0'; 00513 } 00514 else 00515 { 00516 rc = idna_to_ascii_4i (start, (size_t) (end - start), buf, flags); 00517 if (rc != IDNA_SUCCESS) 00518 { 00519 free (out); 00520 return rc; 00521 } 00522 } 00523 00524 if (out) 00525 { 00526 size_t l = strlen (out) + 1 + strlen (buf) + 1; 00527 char *newp = realloc (out, l); 00528 if (!newp) 00529 { 00530 free (out); 00531 return IDNA_MALLOC_ERROR; 00532 } 00533 out = newp; 00534 strcat (out, "."); 00535 strcat (out, buf); 00536 } 00537 else 00538 { 00539 size_t l = strlen (buf) + 1; 00540 out = (char *) malloc (l); 00541 if (!out) 00542 return IDNA_MALLOC_ERROR; 00543 strcpy (out, buf); 00544 } 00545 00546 start = end + 1; 00547 } 00548 while (*end); 00549 00550 *output = out; 00551 00552 return IDNA_SUCCESS; 00553 } 00554 00568 int 00569 idna_to_ascii_8z (const char *input, char **output, int flags) 00570 { 00571 uint32_t *ucs4; 00572 size_t ucs4len; 00573 int rc; 00574 00575 ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len); 00576 if (!ucs4) 00577 return IDNA_ICONV_ERROR; 00578 00579 rc = idna_to_ascii_4z (ucs4, output, flags); 00580 00581 free (ucs4); 00582 00583 return rc; 00584 00585 } 00586 00601 int 00602 idna_to_ascii_lz (const char *input, char **output, int flags) 00603 { 00604 char *utf8; 00605 int rc; 00606 00607 utf8 = stringprep_locale_to_utf8 (input); 00608 if (!utf8) 00609 return IDNA_ICONV_ERROR; 00610 00611 rc = idna_to_ascii_8z (utf8, output, flags); 00612 00613 free (utf8); 00614 00615 return rc; 00616 } 00617 00632 int 00633 idna_to_unicode_4z4z (const uint32_t * input, uint32_t ** output, int flags) 00634 { 00635 const uint32_t *start = input; 00636 const uint32_t *end; 00637 uint32_t *buf; 00638 size_t buflen; 00639 uint32_t *out = NULL; 00640 size_t outlen = 0; 00641 00642 *output = NULL; 00643 00644 do 00645 { 00646 end = start; 00647 00648 for (; *end && !DOTP (*end); end++) 00649 ; 00650 00651 buflen = (size_t) (end - start); 00652 buf = malloc (sizeof (buf[0]) * (buflen + 1)); 00653 if (!buf) 00654 return IDNA_MALLOC_ERROR; 00655 00656 /* don't check return code as per specification! */ 00657 idna_to_unicode_44i (start, (size_t) (end - start), 00658 buf, &buflen, flags); 00659 00660 if (out) 00661 { 00662 uint32_t *newp = realloc (out, 00663 sizeof (out[0]) 00664 * (outlen + 1 + buflen + 1)); 00665 if (!newp) 00666 { 00667 free (buf); 00668 free (out); 00669 return IDNA_MALLOC_ERROR; 00670 } 00671 out = newp; 00672 out[outlen++] = 0x002E; /* '.' (full stop) */ 00673 memcpy (out + outlen, buf, sizeof (buf[0]) * buflen); 00674 outlen += buflen; 00675 out[outlen] = 0x0; 00676 free (buf); 00677 } 00678 else 00679 { 00680 out = buf; 00681 outlen = buflen; 00682 out[outlen] = 0x0; 00683 } 00684 00685 start = end + 1; 00686 } 00687 while (*end); 00688 00689 *output = out; 00690 00691 return IDNA_SUCCESS; 00692 } 00693 00708 int 00709 idna_to_unicode_8z4z (const char *input, uint32_t ** output, int flags) 00710 { 00711 uint32_t *ucs4; 00712 size_t ucs4len; 00713 int rc; 00714 00715 ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len); 00716 if (!ucs4) 00717 return IDNA_ICONV_ERROR; 00718 00719 rc = idna_to_unicode_4z4z (ucs4, output, flags); 00720 free (ucs4); 00721 00722 return rc; 00723 } 00724 00739 int 00740 idna_to_unicode_8z8z (const char *input, char **output, int flags) 00741 { 00742 uint32_t *ucs4; 00743 int rc; 00744 00745 rc = idna_to_unicode_8z4z (input, &ucs4, flags); 00746 *output = stringprep_ucs4_to_utf8 (ucs4, -1, NULL, NULL); 00747 free (ucs4); 00748 00749 if (!*output) 00750 return IDNA_ICONV_ERROR; 00751 00752 return rc; 00753 } 00754 00770 int 00771 idna_to_unicode_8zlz (const char *input, char **output, int flags) 00772 { 00773 char *utf8; 00774 int rc; 00775 00776 rc = idna_to_unicode_8z8z (input, &utf8, flags); 00777 *output = stringprep_utf8_to_locale (utf8); 00778 free (utf8); 00779 00780 if (!*output) 00781 return IDNA_ICONV_ERROR; 00782 00783 return rc; 00784 } 00785 00802 int 00803 idna_to_unicode_lzlz (const char *input, char **output, int flags) 00804 { 00805 char *utf8; 00806 int rc; 00807 00808 utf8 = stringprep_locale_to_utf8 (input); 00809 if (!utf8) 00810 return IDNA_ICONV_ERROR; 00811 00812 rc = idna_to_unicode_8zlz (utf8, output, flags); 00813 free (utf8); 00814 00815 return rc; 00816 } 00817
1.7.6.1