Branch data Line data Source code
1 : : /* tld.c --- Declarations for TLD restriction checking.
2 : : Copyright (C) 2004-2012 Simon Josefsson.
3 : : Copyright (C) 2003-2012 Free Software Foundation, Inc.
4 : :
5 : : Author: Thomas Jacob, Internet24.de
6 : :
7 : : This file is part of GNU Libidn.
8 : :
9 : : GNU Libidn is free software: you can redistribute it and/or
10 : : modify it under the terms of either:
11 : :
12 : : * the GNU Lesser General Public License as published by the Free
13 : : Software Foundation; either version 3 of the License, or (at
14 : : your option) any later version.
15 : :
16 : : or
17 : :
18 : : * the GNU General Public License as published by the Free
19 : : Software Foundation; either version 2 of the License, or (at
20 : : your option) any later version.
21 : :
22 : : or both in parallel, as here.
23 : :
24 : : GNU Libidn is distributed in the hope that it will be useful,
25 : : but WITHOUT ANY WARRANTY; without even the implied warranty of
26 : : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
27 : : General Public License for more details.
28 : :
29 : : You should have received copies of the GNU General Public License and
30 : : the GNU Lesser General Public License along with this program. If
31 : : not, see <http://www.gnu.org/licenses/>. */
32 : :
33 : : #include <config.h>
34 : :
35 : : /* Get stringprep_utf8_to_ucs4, stringprep_locale_to_utf8. */
36 : : #include <stringprep.h>
37 : :
38 : : /* Get strcmp(). */
39 : : #include <string.h>
40 : :
41 : : /* Get specifications. */
42 : : #include <tld.h>
43 : :
44 : : /* Array of built-in domain restriction structures. See tlds.c. */
45 : : extern const Tld_table *_tld_tables[];
46 : :
47 : : /**
48 : : * tld_get_table:
49 : : * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string.
50 : : * @tables: Zero terminated array of #Tld_table info-structures for
51 : : * TLDs.
52 : : *
53 : : * Get the TLD table for a named TLD by searching through the given
54 : : * TLD table array.
55 : : *
56 : : * Return value: Return structure corresponding to TLD @tld by going
57 : : * thru @tables, or return %NULL if no such structure is found.
58 : : */
59 : : const Tld_table *
60 : 8 : tld_get_table (const char *tld, const Tld_table ** tables)
61 : : {
62 : 8 : const Tld_table **tldtable = NULL;
63 : :
64 [ + + ][ + + ]: 8 : if (!tld || !tables)
65 : 2 : return NULL;
66 : :
67 [ + - ]: 6 : for (tldtable = tables; *tldtable; tldtable++)
68 [ + - ]: 6 : if (!strcmp ((*tldtable)->name, tld))
69 : 6 : return *tldtable;
70 : :
71 : 8 : return NULL;
72 : : }
73 : :
74 : : /**
75 : : * tld_default_table:
76 : : * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string.
77 : : * @overrides: Additional zero terminated array of #Tld_table
78 : : * info-structures for TLDs, or %NULL to only use library deault
79 : : * tables.
80 : : *
81 : : * Get the TLD table for a named TLD, using the internal defaults,
82 : : * possibly overrided by the (optional) supplied tables.
83 : : *
84 : : * Return value: Return structure corresponding to TLD @tld_str, first
85 : : * looking through @overrides then thru built-in list, or %NULL if
86 : : * no such structure found.
87 : : */
88 : : const Tld_table *
89 : 8 : tld_default_table (const char *tld, const Tld_table ** overrides)
90 : : {
91 : 8 : const Tld_table *tldtable = NULL;
92 : :
93 [ + + ]: 8 : if (!tld)
94 : 2 : return NULL;
95 : :
96 [ + + ]: 6 : if (overrides)
97 : 1 : tldtable = tld_get_table (tld, overrides);
98 : :
99 [ + + ]: 6 : if (!tldtable)
100 : 5 : tldtable = tld_get_table (tld, _tld_tables);
101 : :
102 : 8 : return tldtable;
103 : : }
104 : :
/* Non-zero if code point C is one of the four dot characters this
   file treats as a label separator: U+002E, U+3002, U+FF0E or U+FF61.
   C may be expanded up to four times, so it must be free of side
   effects. */
#define DOTP(c) \
  ((c) == 0x002E || (c) == 0x3002 || (c) == 0xFF0E || (c) == 0xFF61)
107 : :
108 : : /**
109 : : * tld_get_4:
110 : : * @in: Array of unicode code points to process. Does not need to be
111 : : * zero terminated.
112 : : * @inlen: Number of unicode code points.
113 : : * @out: Zero terminated ascii result string pointer.
114 : : *
115 : : * Isolate the top-level domain of @in and return it as an ASCII
116 : : * string in @out.
117 : : *
118 : : * Return value: Return %TLD_SUCCESS on success, or the corresponding
119 : : * #Tld_rc error code otherwise.
120 : : */
121 : : int
122 : 12 : tld_get_4 (const uint32_t * in, size_t inlen, char **out)
123 : : {
124 : : const uint32_t *ipos;
125 : : size_t olen;
126 : :
127 : 12 : *out = NULL;
128 [ + + ][ + + ]: 12 : if (!in || inlen == 0)
129 : 2 : return TLD_NODATA;
130 : :
131 : 10 : ipos = &in[inlen - 1];
132 : 10 : olen = 0;
133 : : /* Scan backwards for non(latin)letters. */
134 [ + + ][ + + ]: 29 : while (ipos >= in && ((*ipos >= 0x41 && *ipos <= 0x5A) ||
[ - + ][ + + ]
135 [ + + ]: 20 : (*ipos >= 0x61 && *ipos <= 0x7A)))
136 : 19 : ipos--, olen++;
137 : :
138 [ + + ][ + + ]: 10 : if (olen > 0 && ipos >= in && DOTP (*ipos))
[ - + ][ # # ]
[ # # ][ # # ]
139 : : {
140 : : /* Found something that appears a TLD. */
141 : 8 : char *out_s = malloc (sizeof (char) * (olen + 1));
142 : 8 : char *opos = out_s;
143 : :
144 [ - + ]: 8 : if (!opos)
145 : 0 : return TLD_MALLOC_ERROR;
146 : :
147 : 8 : ipos++;
148 : : /* Transcribe to lowercase ascii string. */
149 [ + + ]: 24 : for (; ipos < &in[inlen]; ipos++, opos++)
150 [ + - ]: 16 : *opos = *ipos > 0x5A ? *ipos : *ipos + 0x20;
151 : 8 : *opos = 0;
152 : 8 : *out = out_s;
153 : 8 : return TLD_SUCCESS;
154 : : }
155 : :
156 : 12 : return TLD_NO_TLD;
157 : : }
158 : :
159 : : /**
160 : : * tld_get_4z:
161 : : * @in: Zero terminated array of unicode code points to process.
162 : : * @out: Zero terminated ascii result string pointer.
163 : : *
164 : : * Isolate the top-level domain of @in and return it as an ASCII
165 : : * string in @out.
166 : : *
167 : : * Return value: Return %TLD_SUCCESS on success, or the corresponding
168 : : * #Tld_rc error code otherwise.
169 : : */
170 : : int
171 : 1 : tld_get_4z (const uint32_t * in, char **out)
172 : : {
173 : 1 : const uint32_t *ipos = in;
174 : :
175 [ - + ]: 1 : if (!in)
176 : 0 : return TLD_NODATA;
177 : :
178 [ + + ]: 7 : while (*ipos)
179 : 6 : ipos++;
180 : :
181 : 1 : return tld_get_4 (in, ipos - in, out);
182 : : }
183 : :
184 : : /**
185 : : * tld_get_z:
186 : : * @in: Zero terminated character array to process.
187 : : * @out: Zero terminated ascii result string pointer.
188 : : *
189 : : * Isolate the top-level domain of @in and return it as an ASCII
190 : : * string in @out. The input string @in may be UTF-8, ISO-8859-1 or
191 : : * any ASCII compatible character encoding.
192 : : *
193 : : * Return value: Return %TLD_SUCCESS on success, or the corresponding
194 : : * #Tld_rc error code otherwise.
195 : : */
196 : : int
197 : 4 : tld_get_z (const char *in, char **out)
198 : : {
199 : : uint32_t *iucs;
200 : : size_t i, ilen;
201 : : int rc;
202 : :
203 : 4 : ilen = strlen (in);
204 : 4 : iucs = calloc (ilen, sizeof (*iucs));
205 : :
206 [ - + ]: 4 : if (!iucs)
207 : 0 : return TLD_MALLOC_ERROR;
208 : :
209 [ + + ]: 38 : for (i = 0; i < ilen; i++)
210 : 34 : iucs[i] = in[i];
211 : :
212 : 4 : rc = tld_get_4 (iucs, ilen, out);
213 : :
214 : 4 : free (iucs);
215 : :
216 : 4 : return rc;
217 : : }
218 : :
219 : : /*
220 : : * tld_checkchar - verify that character is permitted
221 : : * @ch: 32 bit unicode character to check.
222 : : * @tld: A #Tld_table data structure to check @ch against.
223 : : *
224 : : * Verify if @ch is either in [a-z0-9-.] or mentioned as a valid
225 : : * character in @tld.
226 : : *
227 : : * Return value: Return the #Tld_rc value %TLD_SUCCESS if @ch is a
228 : : * valid character for the TLD @tld or if @tld is %NULL,
229 : : * %TLD_INVALID if @ch is invalid as defined by @tld.
230 : : */
231 : : static int
232 : 18 : _tld_checkchar (uint32_t ch, const Tld_table * tld)
233 : : {
234 : : const Tld_table_element *s, *e, *m;
235 : :
236 [ - + ]: 18 : if (!tld)
237 : 0 : return TLD_SUCCESS;
238 : :
239 : : /* Check for [-a-z0-9.]. */
240 [ + + ][ + + ]: 18 : if ((ch >= 0x61 && ch <= 0x7A) ||
[ + + ]
241 [ + - ][ + - ]: 9 : (ch >= 0x30 && ch <= 0x39) || ch == 0x2D || DOTP (ch))
[ + + ][ + - ]
[ + - ][ - + ]
242 : 10 : return TLD_SUCCESS;
243 : :
244 : 8 : s = tld->valid;
245 : 8 : e = s + tld->nvalid;
246 [ + + ]: 27 : while (s < e)
247 : : {
248 : 25 : m = s + ((e - s) >> 1);
249 [ + + ]: 25 : if (ch < m->start)
250 : 10 : e = m;
251 [ + + ]: 15 : else if (ch > m->end)
252 : 9 : s = m + 1;
253 : : else
254 : 6 : return TLD_SUCCESS;
255 : : }
256 : :
257 : 18 : return TLD_INVALID;
258 : : }
259 : :
260 : : /**
261 : : * tld_check_4t:
262 : : * @in: Array of unicode code points to process. Does not need to be
263 : : * zero terminated.
264 : : * @inlen: Number of unicode code points.
265 : : * @errpos: Position of offending character is returned here.
266 : : * @tld: A #Tld_table data structure representing the restrictions for
267 : : * which the input should be tested.
268 : : *
269 : : * Test each of the code points in @in for whether or not
270 : : * they are allowed by the data structure in @tld, return
271 : : * the position of the first character for which this is not
272 : : * the case in @errpos.
273 : : *
274 : : * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
275 : : * points are valid or when @tld is null, %TLD_INVALID if a
276 : : * character is not allowed, or additional error codes on general
277 : : * failure conditions.
278 : : */
279 : : int
280 : 5 : tld_check_4t (const uint32_t * in, size_t inlen, size_t * errpos,
281 : : const Tld_table * tld)
282 : : {
283 : : const uint32_t *ipos;
284 : : int rc;
285 : :
286 [ + + ]: 5 : if (!tld) /* No data for TLD so everything is valid. */
287 : 1 : return TLD_SUCCESS;
288 : :
289 : 4 : ipos = in;
290 [ + + ]: 20 : while (ipos < &in[inlen])
291 : : {
292 : 18 : rc = _tld_checkchar (*ipos, tld);
293 [ + + ]: 18 : if (rc != TLD_SUCCESS)
294 : : {
295 [ + - ]: 2 : if (errpos)
296 : 2 : *errpos = ipos - in;
297 : 2 : return rc;
298 : : }
299 : 16 : ipos++;
300 : : }
301 : 5 : return TLD_SUCCESS;
302 : : }
303 : :
304 : : /**
305 : : * tld_check_4tz:
306 : : * @in: Zero terminated array of unicode code points to process.
307 : : * @errpos: Position of offending character is returned here.
308 : : * @tld: A #Tld_table data structure representing the restrictions for
309 : : * which the input should be tested.
310 : : *
311 : : * Test each of the code points in @in for whether or not
312 : : * they are allowed by the data structure in @tld, return
313 : : * the position of the first character for which this is not
314 : : * the case in @errpos.
315 : : *
316 : : * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
317 : : * points are valid or when @tld is null, %TLD_INVALID if a
318 : : * character is not allowed, or additional error codes on general
319 : : * failure conditions.
320 : : */
321 : : int
322 : 0 : tld_check_4tz (const uint32_t * in, size_t * errpos, const Tld_table * tld)
323 : : {
324 : 0 : const uint32_t *ipos = in;
325 : :
326 [ # # ]: 0 : if (!ipos)
327 : 0 : return TLD_NODATA;
328 : :
329 [ # # ]: 0 : while (*ipos)
330 : 0 : ipos++;
331 : :
332 : 0 : return tld_check_4t (in, ipos - in, errpos, tld);
333 : : }
334 : :
335 : : /**
336 : : * tld_check_4:
337 : : * @in: Array of unicode code points to process. Does not need to be
338 : : * zero terminated.
339 : : * @inlen: Number of unicode code points.
340 : : * @errpos: Position of offending character is returned here.
341 : : * @overrides: A #Tld_table array of additional domain restriction
342 : : * structures that complement and supersede the built-in information.
343 : : *
344 : : * Test each of the code points in @in for whether or not they are
345 : : * allowed by the information in @overrides or by the built-in TLD
346 : : * restriction data. When data for the same TLD is available both
347 : : * internally and in @overrides, the information in @overrides takes
348 : : * precedence. If several entries for a specific TLD are found, the
349 : : * first one is used. If @overrides is %NULL, only the built-in
350 : : * information is used. The position of the first offending character
351 : : * is returned in @errpos.
352 : : *
353 : : * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
354 : : * points are valid or when @tld is null, %TLD_INVALID if a
355 : : * character is not allowed, or additional error codes on general
356 : : * failure conditions.
357 : : */
358 : : int
359 : 4 : tld_check_4 (const uint32_t * in, size_t inlen, size_t * errpos,
360 : : const Tld_table ** overrides)
361 : : {
362 : : const Tld_table *tld;
363 : : char *domain;
364 : : int rc;
365 : :
366 [ + + ]: 4 : if (errpos)
367 : 2 : *errpos = 0;
368 : :
369 : : /* Get TLD name. */
370 : 4 : rc = tld_get_4 (in, inlen, &domain);
371 : :
372 [ + + ]: 4 : if (rc != TLD_SUCCESS)
373 : : {
374 [ + - ]: 2 : if (rc == TLD_NO_TLD) /* No TLD, say OK */
375 : 2 : return TLD_SUCCESS;
376 : : else
377 : 0 : return rc;
378 : : }
379 : :
380 : : /* Retrieve appropriate data structure. */
381 : 2 : tld = tld_default_table (domain, overrides);
382 : 2 : free (domain);
383 : :
384 : 4 : return tld_check_4t (in, inlen, errpos, tld);
385 : : }
386 : :
387 : : /**
388 : : * tld_check_4z:
389 : : * @in: Zero-terminated array of unicode code points to process.
390 : : * @errpos: Position of offending character is returned here.
391 : : * @overrides: A #Tld_table array of additional domain restriction
392 : : * structures that complement and supersede the built-in information.
393 : : *
394 : : * Test each of the code points in @in for whether or not they are
395 : : * allowed by the information in @overrides or by the built-in TLD
396 : : * restriction data. When data for the same TLD is available both
397 : : * internally and in @overrides, the information in @overrides takes
398 : : * precedence. If several entries for a specific TLD are found, the
399 : : * first one is used. If @overrides is %NULL, only the built-in
400 : : * information is used. The position of the first offending character
401 : : * is returned in @errpos.
402 : : *
403 : : * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
404 : : * points are valid or when @tld is null, %TLD_INVALID if a
405 : : * character is not allowed, or additional error codes on general
406 : : * failure conditions.
407 : : */
408 : : int
409 : 2 : tld_check_4z (const uint32_t * in, size_t * errpos,
410 : : const Tld_table ** overrides)
411 : : {
412 : 2 : const uint32_t *ipos = in;
413 : :
414 [ + + ]: 2 : if (!ipos)
415 : 1 : return TLD_NODATA;
416 : :
417 [ + + ]: 4 : while (*ipos)
418 : 3 : ipos++;
419 : :
420 : 2 : return tld_check_4 (in, ipos - in, errpos, overrides);
421 : : }
422 : :
423 : : /**
424 : : * tld_check_8z:
425 : : * @in: Zero-terminated UTF8 string to process.
426 : : * @errpos: Position of offending character is returned here.
427 : : * @overrides: A #Tld_table array of additional domain restriction
428 : : * structures that complement and supersede the built-in information.
429 : : *
430 : : * Test each of the characters in @in for whether or not they are
431 : : * allowed by the information in @overrides or by the built-in TLD
432 : : * restriction data. When data for the same TLD is available both
433 : : * internally and in @overrides, the information in @overrides takes
434 : : * precedence. If several entries for a specific TLD are found, the
435 : : * first one is used. If @overrides is %NULL, only the built-in
436 : : * information is used. The position of the first offending character
437 : : * is returned in @errpos. Note that the error position refers to the
438 : : * decoded character offset rather than the byte position in the
439 : : * string.
440 : : *
441 : : * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all
442 : : * characters are valid or when @tld is null, %TLD_INVALID if a
443 : : * character is not allowed, or additional error codes on general
444 : : * failure conditions.
445 : : */
446 : : int
447 : 4 : tld_check_8z (const char *in, size_t * errpos, const Tld_table ** overrides)
448 : : {
449 : : uint32_t *iucs;
450 : : size_t ilen;
451 : : int rc;
452 : :
453 [ + + ]: 4 : if (!in)
454 : 1 : return TLD_NODATA;
455 : :
456 : 3 : iucs = stringprep_utf8_to_ucs4 (in, -1, &ilen);
457 : :
458 [ - + ]: 3 : if (!iucs)
459 : 0 : return TLD_MALLOC_ERROR;
460 : :
461 : 3 : rc = tld_check_4 (iucs, ilen, errpos, overrides);
462 : :
463 : 3 : free (iucs);
464 : :
465 : 4 : return rc;
466 : : }
467 : :
468 : : /**
469 : : * tld_check_lz:
470 : : * @in: Zero-terminated string in the current locales encoding to process.
471 : : * @errpos: Position of offending character is returned here.
472 : : * @overrides: A #Tld_table array of additional domain restriction
473 : : * structures that complement and supersede the built-in information.
474 : : *
475 : : * Test each of the characters in @in for whether or not they are
476 : : * allowed by the information in @overrides or by the built-in TLD
477 : : * restriction data. When data for the same TLD is available both
478 : : * internally and in @overrides, the information in @overrides takes
479 : : * precedence. If several entries for a specific TLD are found, the
480 : : * first one is used. If @overrides is %NULL, only the built-in
481 : : * information is used. The position of the first offending character
482 : : * is returned in @errpos. Note that the error position refers to the
483 : : * decoded character offset rather than the byte position in the
484 : : * string.
485 : : *
486 : : * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all
487 : : * characters are valid or when @tld is null, %TLD_INVALID if a
488 : : * character is not allowed, or additional error codes on general
489 : : * failure conditions.
490 : : */
491 : : int
492 : 2 : tld_check_lz (const char *in, size_t * errpos, const Tld_table ** overrides)
493 : : {
494 : : char *utf8;
495 : : int rc;
496 : :
497 [ + + ]: 2 : if (!in)
498 : 1 : return TLD_NODATA;
499 : :
500 : 1 : utf8 = stringprep_locale_to_utf8 (in);
501 [ - + ]: 1 : if (!utf8)
502 : 0 : return TLD_ICONV_ERROR;
503 : :
504 : :
505 : 1 : rc = tld_check_8z (utf8, errpos, overrides);
506 : :
507 : 1 : free (utf8);
508 : :
509 : 2 : return rc;
510 : : }
511 : :
512 : : /**
513 : : * Tld_rc:
514 : : * @TLD_SUCCESS: Successful operation. This value is guaranteed to
515 : : * always be zero, the remaining ones are only guaranteed to hold
516 : : * non-zero values, for logical comparison purposes.
517 : : * @TLD_INVALID: Invalid character found.
518 : : * @TLD_NODATA: No input data was provided.
519 : : * @TLD_MALLOC_ERROR: Error during memory allocation.
520 : : * @TLD_ICONV_ERROR: Error during iconv string conversion.
521 : : * @TLD_NO_TLD: No top-level domain found in domain string.
522 : : * @TLD_NOTLD: Same as @TLD_NO_TLD, for compatibility
523 : : * with typo in earlier versions.
524 : : *
525 : : * Enumerated return codes of the TLD checking functions.
526 : : * The value 0 is guaranteed to always correspond to success.
527 : : */
|