libidn  1.25
idna.c
Go to the documentation of this file.
00001 /* idna.c --- Prototypes for Internationalized Domain Name library.
00002    Copyright (C) 2002-2012 Simon Josefsson
00003 
00004    This file is part of GNU Libidn.
00005 
00006    GNU Libidn is free software: you can redistribute it and/or
00007    modify it under the terms of either:
00008 
00009      * the GNU Lesser General Public License as published by the Free
00010        Software Foundation; either version 3 of the License, or (at
00011        your option) any later version.
00012 
00013    or
00014 
00015      * the GNU General Public License as published by the Free
00016        Software Foundation; either version 2 of the License, or (at
00017        your option) any later version.
00018 
00019    or both in parallel, as here.
00020 
00021    GNU Libidn is distributed in the hope that it will be useful,
00022    but WITHOUT ANY WARRANTY; without even the implied warranty of
00023    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00024    General Public License for more details.
00025 
00026    You should have received copies of the GNU General Public License and
00027    the GNU Lesser General Public License along with this program.  If
00028    not, see <http://www.gnu.org/licenses/>. */
00029 
00030 #ifdef HAVE_CONFIG_H
00031 # include "config.h"
00032 #endif
00033 
00034 #include <stdlib.h>
00035 #include <string.h>
00036 #include <stringprep.h>
00037 #include <punycode.h>
00038 
00039 #include "idna.h"
00040 
00041 /* Get c_strcasecmp. */
00042 #include <c-strcase.h>
00043 
/* Non-zero when code point C is one of the four label separators:
   U+002E (full stop), U+3002 (ideographic full stop), U+FF0E
   (fullwidth full stop) or U+FF61 (halfwidth ideographic full stop).
   The argument is evaluated more than once, so it must be free of
   side effects. */
#define DOTP(c)                 \
  ((c) == 0x002E                \
   || (c) == 0x3002             \
   || (c) == 0xFF0E             \
   || (c) == 0xFF61)
00046 
00047 /* Core functions */
00048 
00080 int
00081 idna_to_ascii_4i (const uint32_t * in, size_t inlen, char *out, int flags)
00082 {
00083   size_t len, outlen;
00084   uint32_t *src;                /* XXX don't need to copy data? */
00085   int rc;
00086 
00087   /*
00088    * ToASCII consists of the following steps:
00089    *
00090    * 1. If all code points in the sequence are in the ASCII range (0..7F)
00091    * then skip to step 3.
00092    */
00093 
00094   {
00095     size_t i;
00096     int inasciirange;
00097 
00098     inasciirange = 1;
00099     for (i = 0; i < inlen; i++)
00100       if (in[i] > 0x7F)
00101         inasciirange = 0;
00102     if (inasciirange)
00103       {
00104         src = malloc (sizeof (in[0]) * (inlen + 1));
00105         if (src == NULL)
00106           return IDNA_MALLOC_ERROR;
00107 
00108         memcpy (src, in, sizeof (in[0]) * inlen);
00109         src[inlen] = 0;
00110 
00111         goto step3;
00112       }
00113   }
00114 
00115   /*
00116    * 2. Perform the steps specified in [NAMEPREP] and fail if there is
00117    * an error. The AllowUnassigned flag is used in [NAMEPREP].
00118    */
00119 
00120   {
00121     char *p;
00122 
00123     p = stringprep_ucs4_to_utf8 (in, (ssize_t) inlen, NULL, NULL);
00124     if (p == NULL)
00125       return IDNA_MALLOC_ERROR;
00126 
00127     len = strlen (p);
00128     do
00129       {
00130         char *newp;
00131 
00132         len = 2 * len + 10;     /* XXX better guess? */
00133         newp = realloc (p, len);
00134         if (newp == NULL)
00135           {
00136             free (p);
00137             return IDNA_MALLOC_ERROR;
00138           }
00139         p = newp;
00140 
00141         if (flags & IDNA_ALLOW_UNASSIGNED)
00142           rc = stringprep_nameprep (p, len);
00143         else
00144           rc = stringprep_nameprep_no_unassigned (p, len);
00145       }
00146     while (rc == STRINGPREP_TOO_SMALL_BUFFER);
00147 
00148     if (rc != STRINGPREP_OK)
00149       {
00150         free (p);
00151         return IDNA_STRINGPREP_ERROR;
00152       }
00153 
00154     src = stringprep_utf8_to_ucs4 (p, -1, NULL);
00155 
00156     free (p);
00157   }
00158 
00159 step3:
00160   /*
00161    * 3. If the UseSTD3ASCIIRules flag is set, then perform these checks:
00162    *
00163    * (a) Verify the absence of non-LDH ASCII code points; that is,
00164    * the absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
00165    *
00166    * (b) Verify the absence of leading and trailing hyphen-minus;
00167    * that is, the absence of U+002D at the beginning and end of
00168    * the sequence.
00169    */
00170 
00171   if (flags & IDNA_USE_STD3_ASCII_RULES)
00172     {
00173       size_t i;
00174 
00175       for (i = 0; src[i]; i++)
00176         if (src[i] <= 0x2C || src[i] == 0x2E || src[i] == 0x2F ||
00177             (src[i] >= 0x3A && src[i] <= 0x40) ||
00178             (src[i] >= 0x5B && src[i] <= 0x60) ||
00179             (src[i] >= 0x7B && src[i] <= 0x7F))
00180           {
00181             free (src);
00182             return IDNA_CONTAINS_NON_LDH;
00183           }
00184 
00185       if (src[0] == 0x002D || (i > 0 && src[i - 1] == 0x002D))
00186         {
00187           free (src);
00188           return IDNA_CONTAINS_MINUS;
00189         }
00190     }
00191 
00192   /*
00193    * 4. If all code points in the sequence are in the ASCII range
00194    * (0..7F), then skip to step 8.
00195    */
00196 
00197   {
00198     size_t i;
00199     int inasciirange;
00200 
00201     inasciirange = 1;
00202     for (i = 0; src[i]; i++)
00203       {
00204         if (src[i] > 0x7F)
00205           inasciirange = 0;
00206         /* copy string to output buffer if we are about to skip to step8 */
00207         if (i < 64)
00208           out[i] = src[i];
00209       }
00210     if (i < 64)
00211       out[i] = '\0';
00212     if (inasciirange)
00213       goto step8;
00214   }
00215 
00216   /*
00217    * 5. Verify that the sequence does NOT begin with the ACE prefix.
00218    *
00219    */
00220 
00221   {
00222     size_t i;
00223     int match;
00224 
00225     match = 1;
00226     for (i = 0; match && i < strlen (IDNA_ACE_PREFIX); i++)
00227       if (((uint32_t) IDNA_ACE_PREFIX[i] & 0xFF) != src[i])
00228         match = 0;
00229     if (match)
00230       {
00231         free (src);
00232         return IDNA_CONTAINS_ACE_PREFIX;
00233       }
00234   }
00235 
00236   /*
00237    * 6. Encode the sequence using the encoding algorithm in [PUNYCODE]
00238    * and fail if there is an error.
00239    */
00240   for (len = 0; src[len]; len++)
00241     ;
00242   src[len] = '\0';
00243   outlen = 63 - strlen (IDNA_ACE_PREFIX);
00244   rc = punycode_encode (len, src, NULL,
00245                         &outlen, &out[strlen (IDNA_ACE_PREFIX)]);
00246   if (rc != PUNYCODE_SUCCESS)
00247     {
00248       free (src);
00249       return IDNA_PUNYCODE_ERROR;
00250     }
00251   out[strlen (IDNA_ACE_PREFIX) + outlen] = '\0';
00252 
00253   /*
00254    * 7. Prepend the ACE prefix.
00255    */
00256 
00257   memcpy (out, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX));
00258 
00259   /*
00260    * 8. Verify that the number of code points is in the range 1 to 63
00261    * inclusive (0 is excluded).
00262    */
00263 
00264 step8:
00265   free (src);
00266   if (strlen (out) < 1 || strlen (out) > 63)
00267     return IDNA_INVALID_LENGTH;
00268 
00269   return IDNA_SUCCESS;
00270 }
00271 
00272 /* ToUnicode().  May realloc() utf8in.  Will free utf8in unconditionally. */
00273 static int
00274 idna_to_unicode_internal (char *utf8in,
00275                           uint32_t * out, size_t * outlen, int flags)
00276 {
00277   int rc;
00278   char tmpout[64];
00279   size_t utf8len = strlen (utf8in) + 1;
00280   size_t addlen = 0;
00281 
00282   /*
00283    * ToUnicode consists of the following steps:
00284    *
00285    * 1. If the sequence contains any code points outside the ASCII range
00286    * (0..7F) then proceed to step 2, otherwise skip to step 3.
00287    */
00288 
00289   {
00290     size_t i;
00291     int inasciirange;
00292 
00293     inasciirange = 1;
00294     for (i = 0; utf8in[i]; i++)
00295       if (utf8in[i] & ~0x7F)
00296         inasciirange = 0;
00297     if (inasciirange)
00298       goto step3;
00299   }
00300 
00301   /*
00302    * 2. Perform the steps specified in [NAMEPREP] and fail if there is an
00303    * error. (If step 3 of ToASCII is also performed here, it will not
00304    * affect the overall behavior of ToUnicode, but it is not
00305    * necessary.) The AllowUnassigned flag is used in [NAMEPREP].
00306    */
00307   do
00308     {
00309       char *newp = realloc (utf8in, utf8len + addlen);
00310       if (newp == NULL)
00311         {
00312           free (utf8in);
00313           return IDNA_MALLOC_ERROR;
00314         }
00315       utf8in = newp;
00316       if (flags & IDNA_ALLOW_UNASSIGNED)
00317         rc = stringprep_nameprep (utf8in, utf8len + addlen);
00318       else
00319         rc = stringprep_nameprep_no_unassigned (utf8in, utf8len + addlen);
00320       addlen += 1;
00321     }
00322   while (rc == STRINGPREP_TOO_SMALL_BUFFER);
00323 
00324   if (rc != STRINGPREP_OK)
00325     {
00326       free (utf8in);
00327       return IDNA_STRINGPREP_ERROR;
00328     }
00329 
00330   /* 3. Verify that the sequence begins with the ACE prefix, and save a
00331    * copy of the sequence.
00332    * ... The ToASCII and ToUnicode operations MUST recognize the ACE
00333    prefix in a case-insensitive manner.
00334    */
00335 
00336 step3:
00337   if (c_strncasecmp (utf8in, IDNA_ACE_PREFIX, strlen (IDNA_ACE_PREFIX)) != 0)
00338     {
00339       free (utf8in);
00340       return IDNA_NO_ACE_PREFIX;
00341     }
00342 
00343   /* 4. Remove the ACE prefix.
00344    */
00345 
00346   memmove (utf8in, &utf8in[strlen (IDNA_ACE_PREFIX)],
00347            strlen (utf8in) - strlen (IDNA_ACE_PREFIX) + 1);
00348 
00349   /* 5. Decode the sequence using the decoding algorithm in [PUNYCODE]
00350    * and fail if there is an error. Save a copy of the result of
00351    * this step.
00352    */
00353 
00354   (*outlen)--;                  /* reserve one for the zero */
00355 
00356   rc = punycode_decode (strlen (utf8in), utf8in, outlen, out, NULL);
00357   if (rc != PUNYCODE_SUCCESS)
00358     {
00359       free (utf8in);
00360       return IDNA_PUNYCODE_ERROR;
00361     }
00362 
00363   out[*outlen] = 0;             /* add zero */
00364 
00365   /* 6. Apply ToASCII.
00366    */
00367 
00368   rc = idna_to_ascii_4i (out, *outlen, tmpout, flags);
00369   if (rc != IDNA_SUCCESS)
00370     {
00371       free (utf8in);
00372       return rc;
00373     }
00374 
00375   /* 7. Verify that the result of step 6 matches the saved copy from
00376    * step 3, using a case-insensitive ASCII comparison.
00377    */
00378 
00379   if (c_strcasecmp (utf8in, tmpout + strlen (IDNA_ACE_PREFIX)) != 0)
00380     {
00381       free (utf8in);
00382       return IDNA_ROUNDTRIP_VERIFY_ERROR;
00383     }
00384 
00385   /* 8. Return the saved copy from step 5.
00386    */
00387 
00388   free (utf8in);
00389   return IDNA_SUCCESS;
00390 }
00391 
00427 int
00428 idna_to_unicode_44i (const uint32_t * in, size_t inlen,
00429                      uint32_t * out, size_t * outlen, int flags)
00430 {
00431   int rc;
00432   size_t outlensave = *outlen;
00433   char *p;
00434 
00435   p = stringprep_ucs4_to_utf8 (in, (ssize_t) inlen, NULL, NULL);
00436   if (p == NULL)
00437     return IDNA_MALLOC_ERROR;
00438 
00439   rc = idna_to_unicode_internal (p, out, outlen, flags);
00440   if (rc != IDNA_SUCCESS)
00441     {
00442       memcpy (out, in, sizeof (in[0]) * (inlen < outlensave ?
00443                                          inlen : outlensave));
00444       *outlen = inlen;
00445     }
00446 
00447   /* p is freed in idna_to_unicode_internal.  */
00448 
00449   return rc;
00450 }
00451 
00452 /* Wrappers that handle several labels */
00453 
00467 int
00468 idna_to_ascii_4z (const uint32_t * input, char **output, int flags)
00469 {
00470   const uint32_t *start = input;
00471   const uint32_t *end;
00472   char buf[64];
00473   char *out = NULL;
00474   int rc;
00475 
00476   /* 1) Whenever dots are used as label separators, the following
00477      characters MUST be recognized as dots: U+002E (full stop),
00478      U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
00479      U+FF61 (halfwidth ideographic full stop). */
00480 
00481   if (input[0] == 0)
00482     {
00483       /* Handle implicit zero-length root label. */
00484       *output = malloc (1);
00485       if (!*output)
00486         return IDNA_MALLOC_ERROR;
00487       strcpy (*output, "");
00488       return IDNA_SUCCESS;
00489     }
00490 
00491   if (DOTP (input[0]) && input[1] == 0)
00492     {
00493       /* Handle explicit zero-length root label. */
00494       *output = malloc (2);
00495       if (!*output)
00496         return IDNA_MALLOC_ERROR;
00497       strcpy (*output, ".");
00498       return IDNA_SUCCESS;
00499     }
00500 
00501   *output = NULL;
00502   do
00503     {
00504       end = start;
00505 
00506       for (; *end && !DOTP (*end); end++)
00507         ;
00508 
00509       if (*end == '\0' && start == end)
00510         {
00511           /* Handle explicit zero-length root label. */
00512           buf[0] = '\0';
00513         }
00514       else
00515         {
00516           rc = idna_to_ascii_4i (start, (size_t) (end - start), buf, flags);
00517           if (rc != IDNA_SUCCESS)
00518             {
00519               free (out);
00520               return rc;
00521             }
00522         }
00523 
00524       if (out)
00525         {
00526           size_t l = strlen (out) + 1 + strlen (buf) + 1;
00527           char *newp = realloc (out, l);
00528           if (!newp)
00529             {
00530               free (out);
00531               return IDNA_MALLOC_ERROR;
00532             }
00533           out = newp;
00534           strcat (out, ".");
00535           strcat (out, buf);
00536         }
00537       else
00538         {
00539           size_t l = strlen (buf) + 1;
00540           out = (char *) malloc (l);
00541           if (!out)
00542             return IDNA_MALLOC_ERROR;
00543           strcpy (out, buf);
00544         }
00545 
00546       start = end + 1;
00547     }
00548   while (*end);
00549 
00550   *output = out;
00551 
00552   return IDNA_SUCCESS;
00553 }
00554 
00568 int
00569 idna_to_ascii_8z (const char *input, char **output, int flags)
00570 {
00571   uint32_t *ucs4;
00572   size_t ucs4len;
00573   int rc;
00574 
00575   ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
00576   if (!ucs4)
00577     return IDNA_ICONV_ERROR;
00578 
00579   rc = idna_to_ascii_4z (ucs4, output, flags);
00580 
00581   free (ucs4);
00582 
00583   return rc;
00584 
00585 }
00586 
00601 int
00602 idna_to_ascii_lz (const char *input, char **output, int flags)
00603 {
00604   char *utf8;
00605   int rc;
00606 
00607   utf8 = stringprep_locale_to_utf8 (input);
00608   if (!utf8)
00609     return IDNA_ICONV_ERROR;
00610 
00611   rc = idna_to_ascii_8z (utf8, output, flags);
00612 
00613   free (utf8);
00614 
00615   return rc;
00616 }
00617 
00632 int
00633 idna_to_unicode_4z4z (const uint32_t * input, uint32_t ** output, int flags)
00634 {
00635   const uint32_t *start = input;
00636   const uint32_t *end;
00637   uint32_t *buf;
00638   size_t buflen;
00639   uint32_t *out = NULL;
00640   size_t outlen = 0;
00641 
00642   *output = NULL;
00643 
00644   do
00645     {
00646       end = start;
00647 
00648       for (; *end && !DOTP (*end); end++)
00649         ;
00650 
00651       buflen = (size_t) (end - start);
00652       buf = malloc (sizeof (buf[0]) * (buflen + 1));
00653       if (!buf)
00654         return IDNA_MALLOC_ERROR;
00655 
00656       /* don't check return code as per specification! */
00657       idna_to_unicode_44i (start, (size_t) (end - start),
00658                            buf, &buflen, flags);
00659 
00660       if (out)
00661         {
00662           uint32_t *newp = realloc (out,
00663                                     sizeof (out[0])
00664                                     * (outlen + 1 + buflen + 1));
00665           if (!newp)
00666             {
00667               free (buf);
00668               free (out);
00669               return IDNA_MALLOC_ERROR;
00670             }
00671           out = newp;
00672           out[outlen++] = 0x002E;       /* '.' (full stop) */
00673           memcpy (out + outlen, buf, sizeof (buf[0]) * buflen);
00674           outlen += buflen;
00675           out[outlen] = 0x0;
00676           free (buf);
00677         }
00678       else
00679         {
00680           out = buf;
00681           outlen = buflen;
00682           out[outlen] = 0x0;
00683         }
00684 
00685       start = end + 1;
00686     }
00687   while (*end);
00688 
00689   *output = out;
00690 
00691   return IDNA_SUCCESS;
00692 }
00693 
00708 int
00709 idna_to_unicode_8z4z (const char *input, uint32_t ** output, int flags)
00710 {
00711   uint32_t *ucs4;
00712   size_t ucs4len;
00713   int rc;
00714 
00715   ucs4 = stringprep_utf8_to_ucs4 (input, -1, &ucs4len);
00716   if (!ucs4)
00717     return IDNA_ICONV_ERROR;
00718 
00719   rc = idna_to_unicode_4z4z (ucs4, output, flags);
00720   free (ucs4);
00721 
00722   return rc;
00723 }
00724 
00739 int
00740 idna_to_unicode_8z8z (const char *input, char **output, int flags)
00741 {
00742   uint32_t *ucs4;
00743   int rc;
00744 
00745   rc = idna_to_unicode_8z4z (input, &ucs4, flags);
00746   *output = stringprep_ucs4_to_utf8 (ucs4, -1, NULL, NULL);
00747   free (ucs4);
00748 
00749   if (!*output)
00750     return IDNA_ICONV_ERROR;
00751 
00752   return rc;
00753 }
00754 
00770 int
00771 idna_to_unicode_8zlz (const char *input, char **output, int flags)
00772 {
00773   char *utf8;
00774   int rc;
00775 
00776   rc = idna_to_unicode_8z8z (input, &utf8, flags);
00777   *output = stringprep_utf8_to_locale (utf8);
00778   free (utf8);
00779 
00780   if (!*output)
00781     return IDNA_ICONV_ERROR;
00782 
00783   return rc;
00784 }
00785 
00802 int
00803 idna_to_unicode_lzlz (const char *input, char **output, int flags)
00804 {
00805   char *utf8;
00806   int rc;
00807 
00808   utf8 = stringprep_locale_to_utf8 (input);
00809   if (!utf8)
00810     return IDNA_ICONV_ERROR;
00811 
00812   rc = idna_to_unicode_8zlz (utf8, output, flags);
00813   free (utf8);
00814 
00815   return rc;
00816 }
00817