Line data Source code
1 : /* tld.c --- Declarations for TLD restriction checking.
2 : Copyright (C) 2004-2020 Simon Josefsson.
3 : Copyright (C) 2003-2020 Free Software Foundation, Inc.
4 :
5 : Author: Thomas Jacob, Internet24.de
6 :
7 : This file is part of GNU Libidn.
8 :
9 : GNU Libidn is free software: you can redistribute it and/or
10 : modify it under the terms of either:
11 :
12 : * the GNU Lesser General Public License as published by the Free
13 : Software Foundation; either version 3 of the License, or (at
14 : your option) any later version.
15 :
16 : or
17 :
18 : * the GNU General Public License as published by the Free
19 : Software Foundation; either version 2 of the License, or (at
20 : your option) any later version.
21 :
22 : or both in parallel, as here.
23 :
24 : GNU Libidn is distributed in the hope that it will be useful,
25 : but WITHOUT ANY WARRANTY; without even the implied warranty of
26 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
27 : General Public License for more details.
28 :
29 : You should have received copies of the GNU General Public License and
30 : the GNU Lesser General Public License along with this program. If
31 : not, see <http://www.gnu.org/licenses/>. */
32 :
33 : #include <config.h>
34 :
35 : /* Get stringprep_utf8_to_ucs4, stringprep_locale_to_utf8. */
36 : #include <stringprep.h>
37 :
38 : /* Get strcmp(). */
39 : #include <string.h>
40 :
41 : /* Get specifications. */
42 : #include <tld.h>
43 :
/* Array of built-in domain restriction structures.  See tlds.c.
   tld_default_table() hands this array to tld_get_table(), which
   walks it until it reaches a NULL entry.  */
extern const Tld_table *_tld_tables[];
46 :
47 : /**
48 : * tld_get_table:
49 : * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string.
50 : * @tables: Zero terminated array of #Tld_table info-structures for
51 : * TLDs.
52 : *
53 : * Get the TLD table for a named TLD by searching through the given
54 : * TLD table array.
55 : *
56 : * Return value: Return structure corresponding to TLD @tld by going
57 : * thru @tables, or return %NULL if no such structure is found.
58 : */
59 : const Tld_table *
60 431 : tld_get_table (const char *tld, const Tld_table ** tables)
61 : {
62 431 : const Tld_table **tldtable = NULL;
63 :
64 431 : if (!tld || !tables)
65 2 : return NULL;
66 :
67 516 : for (tldtable = tables; *tldtable; tldtable++)
68 488 : if (!strcmp ((*tldtable)->name, tld))
69 401 : return *tldtable;
70 :
71 28 : return NULL;
72 : }
73 :
74 : /**
75 : * tld_default_table:
76 : * @tld: TLD name (e.g. "com") as zero terminated ASCII byte string.
77 : * @overrides: Additional zero terminated array of #Tld_table
78 : * info-structures for TLDs, or %NULL to only use library deault
79 : * tables.
80 : *
81 : * Get the TLD table for a named TLD, using the internal defaults,
82 : * possibly overrided by the (optional) supplied tables.
83 : *
84 : * Return value: Return structure corresponding to TLD @tld_str, first
85 : * looking through @overrides then thru built-in list, or %NULL if
86 : * no such structure found.
87 : */
88 : const Tld_table *
89 431 : tld_default_table (const char *tld, const Tld_table ** overrides)
90 : {
91 431 : const Tld_table *tldtable = NULL;
92 :
93 431 : if (!tld)
94 2 : return NULL;
95 :
96 429 : if (overrides)
97 1 : tldtable = tld_get_table (tld, overrides);
98 :
99 429 : if (!tldtable)
100 428 : tldtable = tld_get_table (tld, _tld_tables);
101 :
102 429 : return tldtable;
103 : }
104 :
/* Nonzero if code point C is one of the four characters this file
   treats as a dot: U+002E (full stop), U+3002 (ideographic full
   stop), U+FF0E (fullwidth full stop) or U+FF61 (halfwidth
   ideographic full stop).  C may be evaluated up to four times, so
   it must not have side effects.  */
#define DOTP(c) \
  ((c) == 0x002E || (c) == 0x3002 || (c) == 0xFF0E || (c) == 0xFF61)
107 :
108 : /**
109 : * tld_get_4:
110 : * @in: Array of unicode code points to process. Does not need to be
111 : * zero terminated.
112 : * @inlen: Number of unicode code points.
113 : * @out: Zero terminated ascii result string pointer.
114 : *
115 : * Isolate the top-level domain of @in and return it as an ASCII
116 : * string in @out.
117 : *
118 : * Return value: Return %TLD_SUCCESS on success, or the corresponding
119 : * #Tld_rc error code otherwise.
120 : */
121 : int
122 1187 : tld_get_4 (const uint32_t * in, size_t inlen, char **out)
123 : {
124 : const uint32_t *ipos;
125 : size_t olen;
126 :
127 1187 : *out = NULL;
128 1187 : if (!in || inlen == 0)
129 54 : return TLD_NODATA;
130 :
131 1133 : ipos = &in[inlen - 1];
132 1133 : olen = 0;
133 : /* Scan backwards for non(latin)letters. */
134 2961 : while (ipos >= in && ((*ipos >= 0x41 && *ipos <= 0x5A) ||
135 1473 : (*ipos >= 0x61 && *ipos <= 0x7A)))
136 1828 : ipos--, olen++;
137 :
138 1133 : if (olen > 0 && ipos >= in && DOTP (*ipos))
139 : {
140 : /* Found something that appears a TLD. */
141 104 : char *out_s = malloc (sizeof (char) * (olen + 1));
142 104 : char *opos = out_s;
143 :
144 104 : if (!opos)
145 0 : return TLD_MALLOC_ERROR;
146 :
147 104 : ipos++;
148 : /* Transcribe to lowercase ascii string. */
149 519 : for (; ipos < &in[inlen]; ipos++, opos++)
150 415 : *opos = *ipos > 0x5A ? *ipos : *ipos + 0x20;
151 104 : *opos = 0;
152 104 : *out = out_s;
153 104 : return TLD_SUCCESS;
154 : }
155 :
156 1029 : return TLD_NO_TLD;
157 : }
158 :
159 : /**
160 : * tld_get_4z:
161 : * @in: Zero terminated array of unicode code points to process.
162 : * @out: Zero terminated ascii result string pointer.
163 : *
164 : * Isolate the top-level domain of @in and return it as an ASCII
165 : * string in @out.
166 : *
167 : * Return value: Return %TLD_SUCCESS on success, or the corresponding
168 : * #Tld_rc error code otherwise.
169 : */
170 : int
171 221 : tld_get_4z (const uint32_t * in, char **out)
172 : {
173 221 : const uint32_t *ipos = in;
174 :
175 221 : if (!in)
176 0 : return TLD_NODATA;
177 :
178 5570 : while (*ipos)
179 5349 : ipos++;
180 :
181 221 : return tld_get_4 (in, ipos - in, out);
182 : }
183 :
184 : /**
185 : * tld_get_z:
186 : * @in: Zero terminated character array to process.
187 : * @out: Zero terminated ascii result string pointer.
188 : *
189 : * Isolate the top-level domain of @in and return it as an ASCII
190 : * string in @out. The input string @in may be UTF-8, ISO-8859-1 or
191 : * any ASCII compatible character encoding.
192 : *
193 : * Return value: Return %TLD_SUCCESS on success, or the corresponding
194 : * #Tld_rc error code otherwise.
195 : */
196 : int
197 368 : tld_get_z (const char *in, char **out)
198 : {
199 : uint32_t *iucs;
200 : size_t i, ilen;
201 : int rc;
202 :
203 368 : ilen = strlen (in);
204 368 : iucs = calloc (ilen, sizeof (*iucs));
205 :
206 368 : if (!iucs)
207 0 : return TLD_MALLOC_ERROR;
208 :
209 26326 : for (i = 0; i < ilen; i++)
210 25958 : iucs[i] = in[i];
211 :
212 368 : rc = tld_get_4 (iucs, ilen, out);
213 :
214 368 : free (iucs);
215 :
216 368 : return rc;
217 : }
218 :
219 : /*
220 : * tld_checkchar - verify that character is permitted
221 : * @ch: 32 bit unicode character to check.
222 : * @tld: A #Tld_table data structure to check @ch against.
223 : *
224 : * Verify if @ch is either in [a-z0-9-.] or mentioned as a valid
225 : * character in @tld.
226 : *
227 : * Return value: Return the #Tld_rc value %TLD_SUCCESS if @ch is a
228 : * valid character for the TLD @tld or if @tld is %NULL,
229 : * %TLD_INVALID if @ch is invalid as defined by @tld.
230 : */
231 : static int
232 2611 : _tld_checkchar (uint32_t ch, const Tld_table * tld)
233 : {
234 : const Tld_table_element *s, *e, *m;
235 :
236 2611 : if (!tld)
237 0 : return TLD_SUCCESS;
238 :
239 : /* Check for [-a-z0-9.]. */
240 2611 : if ((ch >= 0x61 && ch <= 0x7A) ||
241 2096 : (ch >= 0x30 && ch <= 0x39) || ch == 0x2D || DOTP (ch))
242 1924 : return TLD_SUCCESS;
243 :
244 687 : s = tld->valid;
245 687 : e = s + tld->nvalid;
246 2632 : while (s < e)
247 : {
248 2423 : m = s + ((e - s) >> 1);
249 2423 : if (ch < m->start)
250 525 : e = m;
251 1898 : else if (ch > m->end)
252 1420 : s = m + 1;
253 : else
254 478 : return TLD_SUCCESS;
255 : }
256 :
257 209 : return TLD_INVALID;
258 : }
259 :
260 : /**
261 : * tld_check_4t:
262 : * @in: Array of unicode code points to process. Does not need to be
263 : * zero terminated.
264 : * @inlen: Number of unicode code points.
265 : * @errpos: Position of offending character is returned here.
266 : * @tld: A #Tld_table data structure representing the restrictions for
267 : * which the input should be tested.
268 : *
269 : * Test each of the code points in @in for whether or not
270 : * they are allowed by the data structure in @tld, return
271 : * the position of the first character for which this is not
272 : * the case in @errpos.
273 : *
274 : * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
275 : * points are valid or when @tld is null, %TLD_INVALID if a
276 : * character is not allowed, or additional error codes on general
277 : * failure conditions.
278 : */
279 : int
280 284 : tld_check_4t (const uint32_t * in, size_t inlen, size_t *errpos,
281 : const Tld_table * tld)
282 : {
283 : const uint32_t *ipos;
284 : int rc;
285 :
286 284 : if (!tld) /* No data for TLD so everything is valid. */
287 29 : return TLD_SUCCESS;
288 :
289 255 : ipos = in;
290 2657 : while (ipos < &in[inlen])
291 : {
292 2611 : rc = _tld_checkchar (*ipos, tld);
293 2611 : if (rc != TLD_SUCCESS)
294 : {
295 209 : if (errpos)
296 209 : *errpos = ipos - in;
297 209 : return rc;
298 : }
299 2402 : ipos++;
300 : }
301 46 : return TLD_SUCCESS;
302 : }
303 :
304 : /**
305 : * tld_check_4tz:
306 : * @in: Zero terminated array of unicode code points to process.
307 : * @errpos: Position of offending character is returned here.
308 : * @tld: A #Tld_table data structure representing the restrictions for
309 : * which the input should be tested.
310 : *
311 : * Test each of the code points in @in for whether or not
312 : * they are allowed by the data structure in @tld, return
313 : * the position of the first character for which this is not
314 : * the case in @errpos.
315 : *
316 : * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
317 : * points are valid or when @tld is null, %TLD_INVALID if a
318 : * character is not allowed, or additional error codes on general
319 : * failure conditions.
320 : */
321 : int
322 220 : tld_check_4tz (const uint32_t * in, size_t *errpos, const Tld_table * tld)
323 : {
324 220 : const uint32_t *ipos = in;
325 :
326 220 : if (!ipos)
327 0 : return TLD_NODATA;
328 :
329 5563 : while (*ipos)
330 5343 : ipos++;
331 :
332 220 : return tld_check_4t (in, ipos - in, errpos, tld);
333 : }
334 :
335 : /**
336 : * tld_check_4:
337 : * @in: Array of unicode code points to process. Does not need to be
338 : * zero terminated.
339 : * @inlen: Number of unicode code points.
340 : * @errpos: Position of offending character is returned here.
341 : * @overrides: A #Tld_table array of additional domain restriction
342 : * structures that complement and supersede the built-in information.
343 : *
344 : * Test each of the code points in @in for whether or not they are
345 : * allowed by the information in @overrides or by the built-in TLD
346 : * restriction data. When data for the same TLD is available both
347 : * internally and in @overrides, the information in @overrides takes
348 : * precedence. If several entries for a specific TLD are found, the
349 : * first one is used. If @overrides is %NULL, only the built-in
350 : * information is used. The position of the first offending character
351 : * is returned in @errpos.
352 : *
353 : * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
354 : * points are valid or when @tld is null, %TLD_INVALID if a
355 : * character is not allowed, or additional error codes on general
356 : * failure conditions.
357 : */
358 : int
359 595 : tld_check_4 (const uint32_t * in, size_t inlen, size_t *errpos,
360 : const Tld_table ** overrides)
361 : {
362 : const Tld_table *tld;
363 : char *domain;
364 : int rc;
365 :
366 595 : if (errpos)
367 593 : *errpos = 0;
368 :
369 : /* Get TLD name. */
370 595 : rc = tld_get_4 (in, inlen, &domain);
371 :
372 595 : if (rc != TLD_SUCCESS)
373 : {
374 534 : if (rc == TLD_NO_TLD) /* No TLD, say OK */
375 500 : return TLD_SUCCESS;
376 : else
377 34 : return rc;
378 : }
379 :
380 : /* Retrieve appropriate data structure. */
381 61 : tld = tld_default_table (domain, overrides);
382 61 : free (domain);
383 :
384 61 : return tld_check_4t (in, inlen, errpos, tld);
385 : }
386 :
387 : /**
388 : * tld_check_4z:
389 : * @in: Zero-terminated array of unicode code points to process.
390 : * @errpos: Position of offending character is returned here.
391 : * @overrides: A #Tld_table array of additional domain restriction
392 : * structures that complement and supersede the built-in information.
393 : *
394 : * Test each of the code points in @in for whether or not they are
395 : * allowed by the information in @overrides or by the built-in TLD
396 : * restriction data. When data for the same TLD is available both
397 : * internally and in @overrides, the information in @overrides takes
398 : * precedence. If several entries for a specific TLD are found, the
399 : * first one is used. If @overrides is %NULL, only the built-in
400 : * information is used. The position of the first offending character
401 : * is returned in @errpos.
402 : *
403 : * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all code
404 : * points are valid or when @tld is null, %TLD_INVALID if a
405 : * character is not allowed, or additional error codes on general
406 : * failure conditions.
407 : */
408 : int
409 222 : tld_check_4z (const uint32_t * in, size_t *errpos,
410 : const Tld_table ** overrides)
411 : {
412 222 : const uint32_t *ipos = in;
413 :
414 222 : if (!ipos)
415 1 : return TLD_NODATA;
416 :
417 5567 : while (*ipos)
418 5346 : ipos++;
419 :
420 221 : return tld_check_4 (in, ipos - in, errpos, overrides);
421 : }
422 :
423 : /**
424 : * tld_check_8z:
425 : * @in: Zero-terminated UTF8 string to process.
426 : * @errpos: Position of offending character is returned here.
427 : * @overrides: A #Tld_table array of additional domain restriction
428 : * structures that complement and supersede the built-in information.
429 : *
430 : * Test each of the characters in @in for whether or not they are
431 : * allowed by the information in @overrides or by the built-in TLD
432 : * restriction data. When data for the same TLD is available both
433 : * internally and in @overrides, the information in @overrides takes
434 : * precedence. If several entries for a specific TLD are found, the
435 : * first one is used. If @overrides is %NULL, only the built-in
436 : * information is used. The position of the first offending character
437 : * is returned in @errpos. Note that the error position refers to the
438 : * decoded character offset rather than the byte position in the
439 : * string.
440 : *
441 : * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all
442 : * characters are valid or when @tld is null, %TLD_INVALID if a
443 : * character is not allowed, or additional error codes on general
444 : * failure conditions.
445 : */
446 : int
447 465 : tld_check_8z (const char *in, size_t *errpos, const Tld_table ** overrides)
448 : {
449 : uint32_t *iucs;
450 : size_t ilen;
451 : int rc;
452 :
453 465 : if (!in)
454 1 : return TLD_NODATA;
455 :
456 464 : iucs = stringprep_utf8_to_ucs4 (in, -1, &ilen);
457 :
458 464 : if (!iucs)
459 90 : return TLD_MALLOC_ERROR;
460 :
461 374 : rc = tld_check_4 (iucs, ilen, errpos, overrides);
462 :
463 374 : free (iucs);
464 :
465 374 : return rc;
466 : }
467 :
468 : /**
469 : * tld_check_lz:
470 : * @in: Zero-terminated string in the current locales encoding to process.
471 : * @errpos: Position of offending character is returned here.
472 : * @overrides: A #Tld_table array of additional domain restriction
473 : * structures that complement and supersede the built-in information.
474 : *
475 : * Test each of the characters in @in for whether or not they are
476 : * allowed by the information in @overrides or by the built-in TLD
477 : * restriction data. When data for the same TLD is available both
478 : * internally and in @overrides, the information in @overrides takes
479 : * precedence. If several entries for a specific TLD are found, the
480 : * first one is used. If @overrides is %NULL, only the built-in
481 : * information is used. The position of the first offending character
482 : * is returned in @errpos. Note that the error position refers to the
483 : * decoded character offset rather than the byte position in the
484 : * string.
485 : *
486 : * Return value: Returns the #Tld_rc value %TLD_SUCCESS if all
487 : * characters are valid or when @tld is null, %TLD_INVALID if a
488 : * character is not allowed, or additional error codes on general
489 : * failure conditions.
490 : */
491 : int
492 366 : tld_check_lz (const char *in, size_t *errpos, const Tld_table ** overrides)
493 : {
494 : char *utf8;
495 : int rc;
496 :
497 366 : if (!in)
498 1 : return TLD_NODATA;
499 :
500 365 : utf8 = stringprep_locale_to_utf8 (in);
501 365 : if (!utf8)
502 267 : return TLD_ICONV_ERROR;
503 :
504 :
505 98 : rc = tld_check_8z (utf8, errpos, overrides);
506 :
507 98 : free (utf8);
508 :
509 98 : return rc;
510 : }
511 :
512 : /**
513 : * Tld_rc:
514 : * @TLD_SUCCESS: Successful operation. This value is guaranteed to
515 : * always be zero, the remaining ones are only guaranteed to hold
516 : * non-zero values, for logical comparison purposes.
517 : * @TLD_INVALID: Invalid character found.
518 : * @TLD_NODATA: No input data was provided.
519 : * @TLD_MALLOC_ERROR: Error during memory allocation.
520 : * @TLD_ICONV_ERROR: Character encoding conversion error.
521 : * @TLD_NO_TLD: No top-level domain found in domain string.
522 : * @TLD_NOTLD: Same as @TLD_NO_TLD, for compatibility
523 : * with typo in earlier versions.
524 : *
525 : * Enumerated return codes of the TLD checking functions.
526 : * The value 0 is guaranteed to always correspond to success.
527 : */
|