LCOV - code coverage report
Current view: top level - lib - tld.c (source / functions) Hit Total Coverage
Test: GNU Libidn Lines: 121 126 96.0 %
Date: 2020-07-22 17:53:13 Functions: 12 12 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* tld.c --- Declarations for TLD restriction checking.
       2             :    Copyright (C) 2004-2020 Simon Josefsson.
       3             :    Copyright (C) 2003-2020 Free Software Foundation, Inc.
       4             : 
       5             :    Author: Thomas Jacob, Internet24.de
       6             : 
       7             :    This file is part of GNU Libidn.
       8             : 
       9             :    GNU Libidn is free software: you can redistribute it and/or
      10             :    modify it under the terms of either:
      11             : 
      12             :      * the GNU Lesser General Public License as published by the Free
      13             :        Software Foundation; either version 3 of the License, or (at
      14             :        your option) any later version.
      15             : 
      16             :    or
      17             : 
      18             :      * the GNU General Public License as published by the Free
      19             :        Software Foundation; either version 2 of the License, or (at
      20             :        your option) any later version.
      21             : 
      22             :    or both in parallel, as here.
      23             : 
      24             :    GNU Libidn is distributed in the hope that it will be useful,
      25             :    but WITHOUT ANY WARRANTY; without even the implied warranty of
      26             :    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      27             :    General Public License for more details.
      28             : 
      29             :    You should have received copies of the GNU General Public License and
      30             :    the GNU Lesser General Public License along with this program.  If
      31             :    not, see <http://www.gnu.org/licenses/>. */
      32             : 
      33             : #include <config.h>
      34             : 
      35             : /* Get stringprep_utf8_to_ucs4, stringprep_locale_to_utf8. */
      36             : #include <stringprep.h>
      37             : 
      38             : /* Get strcmp(). */
      39             : #include <string.h>
      40             : 
      41             : /* Get specifications. */
      42             : #include <tld.h>
      43             : 
      44             : /* Array of built-in domain restriction structures.  See tlds.c.  */
      45             : extern const Tld_table *_tld_tables[];
      46             : 
      47             : /**
      48             :  * tld_get_table:
      49             :  * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string.
      50             :  * @tables: Zero terminated array of #Tld_table info-structures for
      51             :  *   TLDs.
      52             :  *
      53             :  * Get the TLD table for a named TLD by searching through the given
      54             :  * TLD table array.
      55             :  *
      56             :  * Return value: Return structure corresponding to TLD @tld by going
      57             :  *   thru @tables, or return %NULL if no such structure is found.
      58             :  */
      59             : const Tld_table *
      60         431 : tld_get_table (const char *tld, const Tld_table ** tables)
      61             : {
      62         431 :   const Tld_table **tldtable = NULL;
      63             : 
      64         431 :   if (!tld || !tables)
      65           2 :     return NULL;
      66             : 
      67         516 :   for (tldtable = tables; *tldtable; tldtable++)
      68         488 :     if (!strcmp ((*tldtable)->name, tld))
      69         401 :       return *tldtable;
      70             : 
      71          28 :   return NULL;
      72             : }
      73             : 
      74             : /**
      75             :  * tld_default_table:
      76             :  * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string.
      77             :  * @overrides: Additional zero terminated array of #Tld_table
      78             :  *   info-structures for TLDs, or %NULL to only use library deault
      79             :  *   tables.
      80             :  *
      81             :  * Get the TLD table for a named TLD, using the internal defaults,
      82             :  * possibly overrided by the (optional) supplied tables.
      83             :  *
      84             :  * Return value: Return structure corresponding to TLD @tld_str, first
      85             :  *   looking through @overrides then thru built-in list, or %NULL if
      86             :  *   no such structure found.
      87             :  */
      88             : const Tld_table *
      89         431 : tld_default_table (const char *tld, const Tld_table ** overrides)
      90             : {
      91         431 :   const Tld_table *tldtable = NULL;
      92             : 
      93         431 :   if (!tld)
      94           2 :     return NULL;
      95             : 
      96         429 :   if (overrides)
      97           1 :     tldtable = tld_get_table (tld, overrides);
      98             : 
      99         429 :   if (!tldtable)
     100         428 :     tldtable = tld_get_table (tld, _tld_tables);
     101             : 
     102         429 :   return tldtable;
     103             : }
     104             : 
     105             : #define DOTP(c) ((c) == 0x002E || (c) == 0x3002 ||      \
     106             :                  (c) == 0xFF0E || (c) == 0xFF61)
     107             : 
     108             : /**
     109             :  * tld_get_4:
     110             :  * @in: Array of unicode code points to process. Does not need to be
     111             :  *   zero terminated.
     112             :  * @inlen: Number of unicode code points.
     113             :  * @out: Zero terminated ascii result string pointer.
     114             :  *
     115             :  * Isolate the top-level domain of @in and return it as an ASCII
     116             :  * string in @out.
     117             :  *
     118             :  * Return value: Return %TLD_SUCCESS on success, or the corresponding
     119             :  *   #Tld_rc error code otherwise.
     120             :  */
     121             : int
     122        1187 : tld_get_4 (const uint32_t * in, size_t inlen, char **out)
     123             : {
     124             :   const uint32_t *ipos;
     125             :   size_t olen;
     126             : 
     127        1187 :   *out = NULL;
     128        1187 :   if (!in || inlen == 0)
     129          54 :     return TLD_NODATA;
     130             : 
     131        1133 :   ipos = &in[inlen - 1];
     132        1133 :   olen = 0;
     133             :   /* Scan backwards for non(latin)letters. */
     134        2961 :   while (ipos >= in && ((*ipos >= 0x41 && *ipos <= 0x5A) ||
     135        1473 :                         (*ipos >= 0x61 && *ipos <= 0x7A)))
     136        1828 :     ipos--, olen++;
     137             : 
     138        1133 :   if (olen > 0 && ipos >= in && DOTP (*ipos))
     139             :     {
     140             :       /* Found something that appears a TLD. */
     141         104 :       char *out_s = malloc (sizeof (char) * (olen + 1));
     142         104 :       char *opos = out_s;
     143             : 
     144         104 :       if (!opos)
     145           0 :         return TLD_MALLOC_ERROR;
     146             : 
     147         104 :       ipos++;
     148             :       /* Transcribe to lowercase ascii string. */
     149         519 :       for (; ipos < &in[inlen]; ipos++, opos++)
     150         415 :         *opos = *ipos > 0x5A ? *ipos : *ipos + 0x20;
     151         104 :       *opos = 0;
     152         104 :       *out = out_s;
     153         104 :       return TLD_SUCCESS;
     154             :     }
     155             : 
     156        1029 :   return TLD_NO_TLD;
     157             : }
     158             : 
     159             : /**
     160             :  * tld_get_4z:
     161             :  * @in: Zero terminated array of unicode code points to process.
     162             :  * @out: Zero terminated ascii result string pointer.
     163             :  *
     164             :  * Isolate the top-level domain of @in and return it as an ASCII
     165             :  * string in @out.
     166             :  *
     167             :  * Return value: Return %TLD_SUCCESS on success, or the corresponding
     168             :  *   #Tld_rc error code otherwise.
     169             :  */
     170             : int
     171         221 : tld_get_4z (const uint32_t * in, char **out)
     172             : {
     173         221 :   const uint32_t *ipos = in;
     174             : 
     175         221 :   if (!in)
     176           0 :     return TLD_NODATA;
     177             : 
     178        5570 :   while (*ipos)
     179        5349 :     ipos++;
     180             : 
     181         221 :   return tld_get_4 (in, ipos - in, out);
     182             : }
     183             : 
     184             : /**
     185             :  * tld_get_z:
     186             :  * @in: Zero terminated character array to process.
     187             :  * @out: Zero terminated ascii result string pointer.
     188             :  *
     189             :  * Isolate the top-level domain of @in and return it as an ASCII
     190             :  * string in @out.  The input string @in may be UTF-8, ISO-8859-1 or
     191             :  * any ASCII compatible character encoding.
     192             :  *
     193             :  * Return value: Return %TLD_SUCCESS on success, or the corresponding
     194             :  *   #Tld_rc error code otherwise.
     195             :  */
     196             : int
     197         368 : tld_get_z (const char *in, char **out)
     198             : {
     199             :   uint32_t *iucs;
     200             :   size_t i, ilen;
     201             :   int rc;
     202             : 
     203         368 :   ilen = strlen (in);
     204         368 :   iucs = calloc (ilen, sizeof (*iucs));
     205             : 
     206         368 :   if (!iucs)
     207           0 :     return TLD_MALLOC_ERROR;
     208             : 
     209       26326 :   for (i = 0; i < ilen; i++)
     210       25958 :     iucs[i] = in[i];
     211             : 
     212         368 :   rc = tld_get_4 (iucs, ilen, out);
     213             : 
     214         368 :   free (iucs);
     215             : 
     216         368 :   return rc;
     217             : }
     218             : 
     219             : /*
     220             :  * tld_checkchar - verify that character is permitted
     221             :  * @ch: 32 bit unicode character to check.
     222             :  * @tld: A #Tld_table data structure to check @ch against.
     223             :  *
     224             :  * Verify if @ch is either in [a-z0-9-.] or mentioned as a valid
     225             :  * character in @tld.
     226             :  *
     227             :  * Return value: Return the #Tld_rc value %TLD_SUCCESS if @ch is a
     228             :  *   valid character for the TLD @tld or if @tld is %NULL,
     229             :  *   %TLD_INVALID if @ch is invalid as defined by @tld.
     230             :  */
     231             : static int
     232        2611 : _tld_checkchar (uint32_t ch, const Tld_table * tld)
     233             : {
     234             :   const Tld_table_element *s, *e, *m;
     235             : 
     236        2611 :   if (!tld)
     237           0 :     return TLD_SUCCESS;
     238             : 
     239             :   /* Check for [-a-z0-9.]. */
     240        2611 :   if ((ch >= 0x61 && ch <= 0x7A) ||
     241        2096 :       (ch >= 0x30 && ch <= 0x39) || ch == 0x2D || DOTP (ch))
     242        1924 :     return TLD_SUCCESS;
     243             : 
     244         687 :   s = tld->valid;
     245         687 :   e = s + tld->nvalid;
     246        2632 :   while (s < e)
     247             :     {
     248        2423 :       m = s + ((e - s) >> 1);
     249        2423 :       if (ch < m->start)
     250         525 :         e = m;
     251        1898 :       else if (ch > m->end)
     252        1420 :         s = m + 1;
     253             :       else
     254         478 :         return TLD_SUCCESS;
     255             :     }
     256             : 
     257         209 :   return TLD_INVALID;
     258             : }
     259             : 
     260             : /**
     261             :  * tld_check_4t:
     262             :  * @in: Array of unicode code points to process. Does not need to be
     263             :  *   zero terminated.
     264             :  * @inlen: Number of unicode code points.
     265             :  * @errpos: Position of offending character is returned here.
     266             :  * @tld: A #Tld_table data structure representing the restrictions for
     267             :  *   which the input should be tested.
     268             :  *
     269             :  * Test each of the code points in @in for whether or not
     270             :  * they are allowed by the data structure in @tld, return
     271             :  * the position of the first character for which this is not
     272             :  * the case in @errpos.
     273             :  *
     274             :  * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
     275             :  *   points are valid or when @tld is null, %TLD_INVALID if a
     276             :  *   character is not allowed, or additional error codes on general
     277             :  *   failure conditions.
     278             :  */
     279             : int
     280         284 : tld_check_4t (const uint32_t * in, size_t inlen, size_t *errpos,
     281             :               const Tld_table * tld)
     282             : {
     283             :   const uint32_t *ipos;
     284             :   int rc;
     285             : 
     286         284 :   if (!tld)                     /* No data for TLD so everything is valid. */
     287          29 :     return TLD_SUCCESS;
     288             : 
     289         255 :   ipos = in;
     290        2657 :   while (ipos < &in[inlen])
     291             :     {
     292        2611 :       rc = _tld_checkchar (*ipos, tld);
     293        2611 :       if (rc != TLD_SUCCESS)
     294             :         {
     295         209 :           if (errpos)
     296         209 :             *errpos = ipos - in;
     297         209 :           return rc;
     298             :         }
     299        2402 :       ipos++;
     300             :     }
     301          46 :   return TLD_SUCCESS;
     302             : }
     303             : 
     304             : /**
     305             :  * tld_check_4tz:
     306             :  * @in: Zero terminated array of unicode code points to process.
     307             :  * @errpos: Position of offending character is returned here.
     308             :  * @tld: A #Tld_table data structure representing the restrictions for
     309             :  *   which the input should be tested.
     310             :  *
     311             :  * Test each of the code points in @in for whether or not
     312             :  * they are allowed by the data structure in @tld, return
     313             :  * the position of the first character for which this is not
     314             :  * the case in @errpos.
     315             :  *
     316             :  * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
     317             :  *   points are valid or when @tld is null, %TLD_INVALID if a
     318             :  *   character is not allowed, or additional error codes on general
     319             :  *   failure conditions.
     320             :  */
     321             : int
     322         220 : tld_check_4tz (const uint32_t * in, size_t *errpos, const Tld_table * tld)
     323             : {
     324         220 :   const uint32_t *ipos = in;
     325             : 
     326         220 :   if (!ipos)
     327           0 :     return TLD_NODATA;
     328             : 
     329        5563 :   while (*ipos)
     330        5343 :     ipos++;
     331             : 
     332         220 :   return tld_check_4t (in, ipos - in, errpos, tld);
     333             : }
     334             : 
     335             : /**
     336             :  * tld_check_4:
     337             :  * @in: Array of unicode code points to process. Does not need to be
     338             :  *   zero terminated.
     339             :  * @inlen: Number of unicode code points.
     340             :  * @errpos: Position of offending character is returned here.
     341             :  * @overrides: A #Tld_table array of additional domain restriction
     342             :  *  structures that complement and supersede the built-in information.
     343             :  *
     344             :  * Test each of the code points in @in for whether or not they are
     345             :  * allowed by the information in @overrides or by the built-in TLD
     346             :  * restriction data. When data for the same TLD is available both
     347             :  * internally and in @overrides, the information in @overrides takes
     348             :  * precedence. If several entries for a specific TLD are found, the
     349             :  * first one is used.  If @overrides is %NULL, only the built-in
     350             :  * information is used.  The position of the first offending character
     351             :  * is returned in @errpos.
     352             :  *
     353             :  * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
     354             :  *   points are valid or when @tld is null, %TLD_INVALID if a
     355             :  *   character is not allowed, or additional error codes on general
     356             :  *   failure conditions.
     357             :  */
     358             : int
     359         595 : tld_check_4 (const uint32_t * in, size_t inlen, size_t *errpos,
     360             :              const Tld_table ** overrides)
     361             : {
     362             :   const Tld_table *tld;
     363             :   char *domain;
     364             :   int rc;
     365             : 
     366         595 :   if (errpos)
     367         593 :     *errpos = 0;
     368             : 
     369             :   /* Get TLD name. */
     370         595 :   rc = tld_get_4 (in, inlen, &domain);
     371             : 
     372         595 :   if (rc != TLD_SUCCESS)
     373             :     {
     374         534 :       if (rc == TLD_NO_TLD)     /* No TLD, say OK */
     375         500 :         return TLD_SUCCESS;
     376             :       else
     377          34 :         return rc;
     378             :     }
     379             : 
     380             :   /* Retrieve appropriate data structure. */
     381          61 :   tld = tld_default_table (domain, overrides);
     382          61 :   free (domain);
     383             : 
     384          61 :   return tld_check_4t (in, inlen, errpos, tld);
     385             : }
     386             : 
     387             : /**
     388             :  * tld_check_4z:
     389             :  * @in: Zero-terminated array of unicode code points to process.
     390             :  * @errpos: Position of offending character is returned here.
     391             :  * @overrides: A #Tld_table array of additional domain restriction
     392             :  *   structures that complement and supersede the built-in information.
     393             :  *
     394             :  * Test each of the code points in @in for whether or not they are
     395             :  * allowed by the information in @overrides or by the built-in TLD
     396             :  * restriction data. When data for the same TLD is available both
     397             :  * internally and in @overrides, the information in @overrides takes
     398             :  * precedence. If several entries for a specific TLD are found, the
     399             :  * first one is used.  If @overrides is %NULL, only the built-in
     400             :  * information is used.  The position of the first offending character
     401             :  * is returned in @errpos.
     402             :  *
     403             :  * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
     404             :  *   points are valid or when @tld is null, %TLD_INVALID if a
     405             :  *   character is not allowed, or additional error codes on general
     406             :  *   failure conditions.
     407             :  */
     408             : int
     409         222 : tld_check_4z (const uint32_t * in, size_t *errpos,
     410             :               const Tld_table ** overrides)
     411             : {
     412         222 :   const uint32_t *ipos = in;
     413             : 
     414         222 :   if (!ipos)
     415           1 :     return TLD_NODATA;
     416             : 
     417        5567 :   while (*ipos)
     418        5346 :     ipos++;
     419             : 
     420         221 :   return tld_check_4 (in, ipos - in, errpos, overrides);
     421             : }
     422             : 
     423             : /**
     424             :  * tld_check_8z:
     425             :  * @in: Zero-terminated UTF8 string to process.
     426             :  * @errpos: Position of offending character is returned here.
     427             :  * @overrides: A #Tld_table array of additional domain restriction
     428             :  *   structures that complement and supersede the built-in information.
     429             :  *
     430             :  * Test each of the characters in @in for whether or not they are
     431             :  * allowed by the information in @overrides or by the built-in TLD
     432             :  * restriction data. When data for the same TLD is available both
     433             :  * internally and in @overrides, the information in @overrides takes
     434             :  * precedence. If several entries for a specific TLD are found, the
     435             :  * first one is used.  If @overrides is %NULL, only the built-in
     436             :  * information is used.  The position of the first offending character
     437             :  * is returned in @errpos.  Note that the error position refers to the
     438             :  * decoded character offset rather than the byte position in the
     439             :  * string.
     440             :  *
     441             :  * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all
     442             :  *   characters are valid or when @tld is null, %TLD_INVALID if a
     443             :  *   character is not allowed, or additional error codes on general
     444             :  *   failure conditions.
     445             :  */
     446             : int
     447         465 : tld_check_8z (const char *in, size_t *errpos, const Tld_table ** overrides)
     448             : {
     449             :   uint32_t *iucs;
     450             :   size_t ilen;
     451             :   int rc;
     452             : 
     453         465 :   if (!in)
     454           1 :     return TLD_NODATA;
     455             : 
     456         464 :   iucs = stringprep_utf8_to_ucs4 (in, -1, &ilen);
     457             : 
     458         464 :   if (!iucs)
     459          90 :     return TLD_MALLOC_ERROR;
     460             : 
     461         374 :   rc = tld_check_4 (iucs, ilen, errpos, overrides);
     462             : 
     463         374 :   free (iucs);
     464             : 
     465         374 :   return rc;
     466             : }
     467             : 
     468             : /**
     469             :  * tld_check_lz:
     470             :  * @in: Zero-terminated string in the current locales encoding to process.
     471             :  * @errpos: Position of offending character is returned here.
     472             :  * @overrides: A #Tld_table array of additional domain restriction
     473             :  *   structures that complement and supersede the built-in information.
     474             :  *
     475             :  * Test each of the characters in @in for whether or not they are
     476             :  * allowed by the information in @overrides or by the built-in TLD
     477             :  * restriction data. When data for the same TLD is available both
     478             :  * internally and in @overrides, the information in @overrides takes
     479             :  * precedence. If several entries for a specific TLD are found, the
     480             :  * first one is used.  If @overrides is %NULL, only the built-in
     481             :  * information is used.  The position of the first offending character
     482             :  * is returned in @errpos.  Note that the error position refers to the
     483             :  * decoded character offset rather than the byte position in the
     484             :  * string.
     485             :  *
     486             :  * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all
     487             :  *   characters are valid or when @tld is null, %TLD_INVALID if a
     488             :  *   character is not allowed, or additional error codes on general
     489             :  *   failure conditions.
     490             :  */
     491             : int
     492         366 : tld_check_lz (const char *in, size_t *errpos, const Tld_table ** overrides)
     493             : {
     494             :   char *utf8;
     495             :   int rc;
     496             : 
     497         366 :   if (!in)
     498           1 :     return TLD_NODATA;
     499             : 
     500         365 :   utf8 = stringprep_locale_to_utf8 (in);
     501         365 :   if (!utf8)
     502         267 :     return TLD_ICONV_ERROR;
     503             : 
     504             : 
     505          98 :   rc = tld_check_8z (utf8, errpos, overrides);
     506             : 
     507          98 :   free (utf8);
     508             : 
     509          98 :   return rc;
     510             : }
     511             : 
     512             : /**
     513             :  * Tld_rc:
     514             :  * @TLD_SUCCESS: Successful operation.  This value is guaranteed to
     515             :  *   always be zero, the remaining ones are only guaranteed to hold
     516             :  *   non-zero values, for logical comparison purposes.
     517             :  * @TLD_INVALID: Invalid character found.
     518             :  * @TLD_NODATA: No input data was provided.
     519             :  * @TLD_MALLOC_ERROR: Error during memory allocation.
     520             :  * @TLD_ICONV_ERROR: Character encoding conversion error.
     521             :  * @TLD_NO_TLD: No top-level domain found in domain string.
     522             :  * @TLD_NOTLD: Same as @TLD_NO_TLD, for compatibility
     523             :  *   with typo in earlier versions.
     524             :  *
     525             :  * Enumerated return codes of the TLD checking functions.
     526             :  * The value 0 is guaranteed to always correspond to success.
     527             :  */

Generated by: LCOV version 1.13