Line data Source code
1 : /* nfkc.c --- Unicode normalization utilities.
2 : Copyright (C) 2002-2020 Simon Josefsson
3 :
4 : This file is part of GNU Libidn.
5 :
6 : GNU Libidn is free software: you can redistribute it and/or
7 : modify it under the terms of either:
8 :
9 : * the GNU Lesser General Public License as published by the Free
10 : Software Foundation; either version 3 of the License, or (at
11 : your option) any later version.
12 :
13 : or
14 :
15 : * the GNU General Public License as published by the Free
16 : Software Foundation; either version 2 of the License, or (at
17 : your option) any later version.
18 :
19 : or both in parallel, as here.
20 :
21 : GNU Libidn is distributed in the hope that it will be useful,
22 : but WITHOUT ANY WARRANTY; without even the implied warranty of
23 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 : General Public License for more details.
25 :
26 : You should have received copies of the GNU General Public License and
27 : the GNU Lesser General Public License along with this program. If
28 : not, see <http://www.gnu.org/licenses/>. */
29 :
30 : #ifdef HAVE_CONFIG_H
31 : # include "config.h"
32 : #endif
33 :
34 : #include <stdlib.h>
35 : #include <string.h>
36 :
37 : #include "stringprep.h"
38 :
/* Hacks to make syncing with GLIB code easier: the GLIB type and
   allocator names used by the imported code are mapped onto plain C
   equivalents. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define glong long
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc
#define g_free free

/* Return VAL from the enclosing function unless EXPR holds.  The
   expansion is wrapped in do/while (0) so that it is exactly one
   statement and remains correct as the body of an unbraced if/else;
   callers write it with a trailing semicolon. */
#define g_return_val_if_fail(expr,val)          \
  do                                            \
    {                                           \
      if (!(expr))                              \
        return (val);                           \
    }                                           \
  while (0)
58 :
59 : /* Code from GLIB gmacros.h starts here. */
60 :
61 : /* GLIB - Library of useful routines for C programming
62 : * Copyright (C) 1995-1997 Peter Mattis, Spencer Kimball and Josh MacDonald
63 : *
64 : * This library is free software; you can redistribute it and/or
65 : * modify it under the terms of the GNU Lesser General Public
66 : * License as published by the Free Software Foundation; either
67 : * version 2 of the License, or (at your option) any later version.
68 : *
69 : * This library is distributed in the hope that it will be useful,
70 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
71 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
72 : * Lesser General Public License for more details.
73 : *
74 : * You should have received a copy of the GNU Lesser General Public
75 : * License along with this library; if not, write to the
76 : * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
77 : * Boston, MA 02111-1307, USA.
78 : */
79 :
/* Boolean constants, provided only when nothing has defined them
   already. */
#if !defined FALSE
# define FALSE (0)
#endif

#if !defined TRUE
# define TRUE (!FALSE)
#endif

/* Element count of ARR; ARR must be a real array, not a pointer. */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))

/* Named after the GLIB macro; here it simply yields its argument. */
#define G_UNLIKELY(expr) (expr)
91 :
92 : /* Code from GLIB gunicode.h starts here. */
93 :
94 : /* gunicode.h - Unicode manipulation functions
95 : *
96 : * Copyright (C) 1999, 2000 Tom Tromey
97 : * Copyright 2000, 2005 Red Hat, Inc.
98 : *
99 : * The Gnome Library is free software; you can redistribute it and/or
100 : * modify it under the terms of the GNU Lesser General Public License as
101 : * published by the Free Software Foundation; either version 2 of the
102 : * License, or (at your option) any later version.
103 : *
104 : * The Gnome Library is distributed in the hope that it will be useful,
105 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
106 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
107 : * Lesser General Public License for more details.
108 : *
109 : * You should have received a copy of the GNU Lesser General Public
110 : * License along with the Gnome Library; see the file COPYING.LIB. If not,
111 : * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
112 : * Boston, MA 02111-1307, USA.
113 : */
114 :
/* Normalization forms.  Every G_NORMALIZE_DEFAULT/ALL name has an
   NF* alias with the same value. */
typedef enum
{
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE = 3,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
} GNormalizeMode;
127 :
/* Step P forward by the length that g_utf8_skip records for the byte
   P points at. */
#define g_utf8_next_char(p) \
  ((p) + g_utf8_skip[*(const guchar *) (p)])
129 :
130 : /* Code from GLIB gutf8.c starts here. */
131 :
132 : /* gutf8.c - Operations on UTF-8 strings.
133 : *
134 : * Copyright (C) 1999 Tom Tromey
135 : * Copyright (C) 2000 Red Hat, Inc.
136 : *
137 : * This library is free software; you can redistribute it and/or
138 : * modify it under the terms of the GNU Lesser General Public
139 : * License as published by the Free Software Foundation; either
140 : * version 2 of the License, or (at your option) any later version.
141 : *
142 : * This library is distributed in the hope that it will be useful,
143 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
144 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
145 : * Lesser General Public License for more details.
146 : *
147 : * You should have received a copy of the GNU Lesser General Public
148 : * License along with this library; if not, write to the
149 : * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
150 : * Boston, MA 02111-1307, USA.
151 : */
152 :
/* Classify the UTF-8 lead byte Char.  Len receives the sequence
   length (1-6) and Mask the bits of the lead byte that carry payload.
   When Char cannot start a sequence, Len becomes -1 and Mask is left
   untouched.  Char is evaluated several times.

   The expansion is wrapped in do/while (0) so it is a single
   statement (safe as the body of an unbraced if/else), and every
   argument is parenthesised. */
#define UTF8_COMPUTE(Char, Mask, Len)                   \
  do                                                    \
    {                                                   \
      if ((Char) < 128)                                 \
        {                                               \
          (Len) = 1;                                    \
          (Mask) = 0x7f;                                \
        }                                               \
      else if (((Char) & 0xe0) == 0xc0)                 \
        {                                               \
          (Len) = 2;                                    \
          (Mask) = 0x1f;                                \
        }                                               \
      else if (((Char) & 0xf0) == 0xe0)                 \
        {                                               \
          (Len) = 3;                                    \
          (Mask) = 0x0f;                                \
        }                                               \
      else if (((Char) & 0xf8) == 0xf0)                 \
        {                                               \
          (Len) = 4;                                    \
          (Mask) = 0x07;                                \
        }                                               \
      else if (((Char) & 0xfc) == 0xf8)                 \
        {                                               \
          (Len) = 5;                                    \
          (Mask) = 0x03;                                \
        }                                               \
      else if (((Char) & 0xfe) == 0xfc)                 \
        {                                               \
          (Len) = 6;                                    \
          (Mask) = 0x01;                                \
        }                                               \
      else                                              \
        (Len) = -1;                                     \
    }                                                   \
  while (0)
186 :
/* Number of bytes (1-6) this file's UTF-8 encoder emits for the code
   point Char.  Char is evaluated several times. */
#define UTF8_LENGTH(Char)                       \
  ((Char) < 0x80 ? 1                            \
   : (Char) < 0x800 ? 2                         \
   : (Char) < 0x10000 ? 3                       \
   : (Char) < 0x200000 ? 4                      \
   : (Char) < 0x4000000 ? 5                     \
   : 6)
193 :
/* Decode the Len-byte UTF-8 sequence at Chars into Result.  Mask
   selects the payload bits of the lead byte and Count is used as the
   loop index.  If a byte after the first is not of the form 10xxxxxx,
   Result is set to -1 and Count is left at the index of that byte.

   The expansion is wrapped in do/while (0) so that the initial
   assignment and the loop always execute together, even when the
   macro is the body of an unbraced if/else.  The break below leaves
   only the inner for loop. */
#define UTF8_GET(Result, Chars, Count, Mask, Len)               \
  do                                                            \
    {                                                           \
      (Result) = (Chars)[0] & (Mask);                           \
      for ((Count) = 1; (Count) < (Len); ++(Count))             \
        {                                                       \
          if (((Chars)[(Count)] & 0xc0) != 0x80)                \
            {                                                   \
              (Result) = -1;                                    \
              break;                                            \
            }                                                   \
          (Result) <<= 6;                                       \
          (Result) |= ((Chars)[(Count)] & 0x3f);                \
        }                                                       \
    }                                                           \
  while (0)
206 :
207 : static const gchar utf8_skip_data[256] = {
208 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
209 : 1, 1, 1, 1, 1, 1, 1,
210 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
211 : 1, 1, 1, 1, 1, 1, 1,
212 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
213 : 1, 1, 1, 1, 1, 1, 1,
214 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
215 : 1, 1, 1, 1, 1, 1, 1,
216 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
217 : 1, 1, 1, 1, 1, 1, 1,
218 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
219 : 1, 1, 1, 1, 1, 1, 1,
220 : 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
221 : 2, 2, 2, 2, 2, 2, 2,
222 : 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
223 : 5, 5, 5, 6, 6, 1, 1
224 : };
225 :
226 : static const gchar *const g_utf8_skip = utf8_skip_data;
227 :
228 : /*
229 : * g_utf8_strlen:
230 : * @p: pointer to the start of a UTF-8 encoded string
231 : * @max: the maximum number of bytes to examine. If @max
232 : * is less than 0, then the string is assumed to be
233 : * nul-terminated. If @max is 0, @p will not be examined and
234 : * may be %NULL.
235 : *
236 : * Computes the length of the string in characters, not including
237 : * the terminating nul character.
238 : *
239 : * Return value: the length of the string in characters
240 : **/
241 : static glong
242 237358 : g_utf8_strlen (const gchar * p)
243 : {
244 237358 : glong len = 0;
245 :
246 237358 : g_return_val_if_fail (p != NULL, 0);
247 :
248 1460478 : while (*p)
249 : {
250 1223120 : p = g_utf8_next_char (p);
251 1223120 : ++len;
252 : }
253 :
254 237358 : return len;
255 : }
256 :
257 : /*
258 : * g_utf8_get_char:
259 : * @p: a pointer to Unicode character encoded as UTF-8
260 : *
261 : * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
262 : * If @p does not point to a valid UTF-8 encoded character, results are
263 : * undefined. If you are not sure that the bytes are complete
264 : * valid Unicode characters, you should use g_utf8_get_char_validated()
265 : * instead.
266 : *
267 : * Return value: the resulting character
268 : **/
269 : static gunichar
270 2322340 : g_utf8_get_char (const gchar * p)
271 : {
272 2322340 : int i, mask = 0, len;
273 : gunichar result;
274 2322340 : unsigned char c = (unsigned char) *p;
275 :
276 2322340 : UTF8_COMPUTE (c, mask, len);
277 2322340 : if (len == -1)
278 16 : return (gunichar) - 1;
279 5152591 : UTF8_GET (result, p, i, mask, len);
280 :
281 2322324 : return result;
282 : }
283 :
284 : /*
285 : * g_unichar_to_utf8:
286 : * @c: a Unicode character code
287 : * @outbuf: output buffer, must have at least 6 bytes of space.
288 : * If %NULL, the length will be computed and returned
289 : * and nothing will be written to @outbuf.
290 : *
291 : * Converts a single character to UTF-8.
292 : *
293 : * Return value: number of bytes written
294 : **/
295 : static int
296 1544635 : g_unichar_to_utf8 (gunichar c, gchar * outbuf)
297 : {
298 : /* If this gets modified, also update the copy in g_string_insert_unichar() */
299 1544635 : guint len = 0;
300 : int first;
301 : int i;
302 :
303 1544635 : if (c < 0x80)
304 : {
305 883799 : first = 0;
306 883799 : len = 1;
307 : }
308 660836 : else if (c < 0x800)
309 : {
310 181377 : first = 0xc0;
311 181377 : len = 2;
312 : }
313 479459 : else if (c < 0x10000)
314 : {
315 434178 : first = 0xe0;
316 434178 : len = 3;
317 : }
318 45281 : else if (c < 0x200000)
319 : {
320 20222 : first = 0xf0;
321 20222 : len = 4;
322 : }
323 25059 : else if (c < 0x4000000)
324 : {
325 460 : first = 0xf8;
326 460 : len = 5;
327 : }
328 : else
329 : {
330 24599 : first = 0xfc;
331 24599 : len = 6;
332 : }
333 :
334 1544635 : if (outbuf)
335 : {
336 2779869 : for (i = len - 1; i > 0; --i)
337 : {
338 1235234 : outbuf[i] = (c & 0x3f) | 0x80;
339 1235234 : c >>= 6;
340 : }
341 1544635 : outbuf[0] = c | first;
342 : }
343 :
344 1544635 : return len;
345 : }
346 :
347 : /*
348 : * g_utf8_to_ucs4_fast:
349 : * @str: a UTF-8 encoded string
350 : * @len: the maximum length of @str to use, in bytes. If @len < 0,
351 : * then the string is nul-terminated.
352 : * @items_written: location to store the number of characters in the
353 : * result, or %NULL.
354 : *
355 : * Convert a string from UTF-8 to a 32-bit fixed width
356 : * representation as UCS-4, assuming valid UTF-8 input.
357 : * This function is roughly twice as fast as g_utf8_to_ucs4()
358 : * but does no error checking on the input. A trailing 0 character
359 : * will be added to the string after the converted text.
360 : *
361 : * Return value: a pointer to a newly allocated UCS-4 string.
362 : * This value must be freed with g_free().
363 : **/
364 : static gunichar *
365 73814 : g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
366 : {
367 : gunichar *result;
368 : gsize n_chars, i;
369 : const gchar *p;
370 :
371 73814 : g_return_val_if_fail (str != NULL, NULL);
372 :
373 73814 : p = str;
374 73814 : n_chars = 0;
375 73814 : if (len < 0)
376 : {
377 1140777 : while (*p)
378 : {
379 1066963 : p = g_utf8_next_char (p);
380 1066963 : ++n_chars;
381 : }
382 : }
383 : else
384 : {
385 0 : while (p < str + len && *p)
386 : {
387 0 : p = g_utf8_next_char (p);
388 0 : ++n_chars;
389 : }
390 : }
391 :
392 73814 : result = g_malloc (sizeof (gunichar) * (n_chars + 1));
393 73814 : if (!result)
394 0 : return NULL;
395 :
396 73814 : p = str;
397 1140777 : for (i = 0; i < n_chars; i++)
398 : {
399 1066963 : gunichar wc = (guchar) * p++;
400 :
401 1066963 : if (wc < 0x80)
402 : {
403 606540 : result[i] = wc;
404 : }
405 : else
406 : {
407 460423 : gunichar mask = 0x40;
408 :
409 460423 : if (G_UNLIKELY ((wc & mask) == 0))
410 : {
411 : /* It's an out-of-sequence 10xxxxxxx byte.
412 : * Rather than making an ugly hash of this and the next byte
413 : * and overrunning the buffer, it's more useful to treat it
414 : * with a replacement character */
415 0 : result[i] = 0xfffd;
416 0 : continue;
417 : }
418 :
419 : do
420 : {
421 822646 : wc <<= 6;
422 822646 : wc |= (guchar) (*p++) & 0x3f;
423 822646 : mask <<= 5;
424 : }
425 822646 : while ((wc & mask) != 0);
426 :
427 460423 : wc &= mask - 1;
428 :
429 460423 : result[i] = wc;
430 : }
431 : }
432 73814 : result[i] = 0;
433 :
434 73814 : if (items_written)
435 61919 : *items_written = i;
436 :
437 73814 : return result;
438 : }
439 :
440 : /*
441 : * g_ucs4_to_utf8:
442 : * @str: a UCS-4 encoded string
443 : * @len: the maximum length (number of characters) of @str to use.
444 : * If @len < 0, then the string is nul-terminated.
445 : * @items_read: location to store number of characters read, or %NULL.
446 : * @items_written: location to store number of bytes written or %NULL.
447 : * The value here stored does not include the trailing 0
448 : * byte.
449 : * @error: location to store the error occurring, or %NULL to ignore
450 : * errors. Any of the errors in #GConvertError other than
451 : * %G_CONVERT_ERROR_NO_CONVERSION may occur.
452 : *
453 : * Convert a string from a 32-bit fixed width representation as UCS-4.
454 : * to UTF-8. The result will be terminated with a 0 byte.
455 : *
456 : * Return value: a pointer to a newly allocated UTF-8 string.
457 : * This value must be freed with g_free(). If an
458 : * error occurs, %NULL will be returned and
459 : * @error set. In that case, @items_read will be
460 : * set to the position of the first invalid input
461 : * character.
462 : **/
463 : static gchar *
464 138364 : g_ucs4_to_utf8 (const gunichar * str,
465 : glong len, glong * items_read, glong * items_written)
466 : {
467 : gint result_length;
468 138364 : gchar *result = NULL;
469 : gchar *p;
470 : gint i;
471 :
472 138364 : result_length = 0;
473 1687002 : for (i = 0; len < 0 || i < len; i++)
474 : {
475 1552845 : if (!str[i])
476 3059 : break;
477 :
478 1549786 : if (str[i] >= 0x80000000)
479 1148 : goto err_out;
480 :
481 1548638 : result_length += UTF8_LENGTH (str[i]);
482 : }
483 :
484 137216 : result = g_malloc (result_length + 1);
485 137216 : if (!result)
486 0 : return NULL;
487 137216 : p = result;
488 :
489 137216 : i = 0;
490 1681487 : while (p < result + result_length)
491 1544271 : p += g_unichar_to_utf8 (str[i++], p);
492 :
493 137216 : *p = '\0';
494 :
495 137216 : if (items_written)
496 9 : *items_written = p - result;
497 :
498 137207 : err_out:
499 138364 : if (items_read)
500 9 : *items_read = i;
501 :
502 138364 : return result;
503 : }
504 :
505 : /* Code from GLIB gunidecomp.c starts here. */
506 :
507 : /* decomp.c - Character decomposition.
508 : *
509 : * Copyright (C) 1999, 2000 Tom Tromey
510 : * Copyright 2000 Red Hat, Inc.
511 : *
512 : * The Gnome Library is free software; you can redistribute it and/or
513 : * modify it under the terms of the GNU Lesser General Public License as
514 : * published by the Free Software Foundation; either version 2 of the
515 : * License, or (at your option) any later version.
516 : *
517 : * The Gnome Library is distributed in the hope that it will be useful,
518 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
519 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
520 : * Lesser General Public License for more details.
521 : *
522 : * You should have received a copy of the GNU Lesser General Public
523 : * License along with the Gnome Library; see the file COPYING.LIB. If not,
524 : * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
525 : * Boston, MA 02111-1307, USA.
526 : */
527 :
528 : #include "gunidecomp.h"
529 : #include "gunicomp.h"
530 :
/* Combining class lookup.  A page entry of at least
   G_UNICODE_MAX_TABLE_INDEX holds the class for the whole page (entry
   minus that constant); a smaller entry selects a row of cclass_data
   that is indexed by the low byte of the character. */
#define CC_PART1(Page, Char)                                            \
  ((combining_class_table_part1[Page] < G_UNICODE_MAX_TABLE_INDEX)      \
   ? (cclass_data[combining_class_table_part1[Page]][Char])             \
   : (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX))

#define CC_PART2(Page, Char)                                            \
  ((combining_class_table_part2[Page] < G_UNICODE_MAX_TABLE_INDEX)      \
   ? (cclass_data[combining_class_table_part2[Page]][Char])             \
   : (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Characters up to G_UNICODE_LAST_CHAR_PART1 use the part1 table,
   those from 0xe0000 up to G_UNICODE_LAST_CHAR the part2 table, and
   everything else has class 0.  Char is evaluated several times. */
#define COMBINING_CLASS(Char)                                           \
  (((Char) > G_UNICODE_LAST_CHAR_PART1)                                 \
   ? (((Char) < 0xe0000 || (Char) > G_UNICODE_LAST_CHAR)                \
      ? 0                                                               \
      : CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff))              \
   : CC_PART1 ((Char) >> 8, (Char) & 0xff))
547 :
/* Bases and counts used for Hangul syllable [de]composition by
   decompose_hangul() and combine_hangul(). */
#define SBase 0xAC00
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7

#define LCount 19
#define VCount 21
#define TCount 28

#define NCount (VCount * TCount)        /* 588 */
#define SCount (LCount * NCount)        /* 11172 */
558 :
559 : /*
560 : * g_unicode_canonical_ordering:
561 : * @string: a UCS-4 encoded string.
562 : * @len: the maximum length of @string to use.
563 : *
564 : * Computes the canonical ordering of a string in-place.
565 : * This rearranges decomposed characters in the string
566 : * according to their combining classes. See the Unicode
567 : * manual for more information.
568 : **/
569 : static void
570 576651 : g_unicode_canonical_ordering (gunichar * string, gsize len)
571 : {
572 : gsize i;
573 576651 : int swap = 1;
574 :
575 1162192 : while (swap)
576 : {
577 : int last;
578 585541 : swap = 0;
579 585541 : last = COMBINING_CLASS (string[0]);
580 3129202 : for (i = 0; i < len - 1; ++i)
581 : {
582 2543661 : int next = COMBINING_CLASS (string[i + 1]);
583 2543661 : if (next != 0 && last > next)
584 : {
585 : gsize j;
586 : /* Percolate item leftward through string. */
587 33974 : for (j = i + 1; j > 0; --j)
588 : {
589 : gunichar t;
590 33305 : if (COMBINING_CLASS (string[j - 1]) <= next)
591 13690 : break;
592 19615 : t = string[j];
593 19615 : string[j] = string[j - 1];
594 19615 : string[j - 1] = t;
595 19615 : swap = 1;
596 : }
597 : /* We're re-entering the loop looking at the old
598 : character again. */
599 14359 : next = last;
600 : }
601 2543661 : last = next;
602 : }
603 : }
604 576651 : }
605 :
606 : /* http://www.unicode.org/unicode/reports/tr15/#Hangul
607 : * r should be null or have sufficient space. Calling with r == NULL will
608 : * only calculate the result_len; however, a buffer with space for three
609 : * characters will always be big enough. */
610 : static void
611 13126 : decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
612 : {
613 13126 : gint SIndex = s - SBase;
614 13126 : gint TIndex = SIndex % TCount;
615 :
616 13126 : if (r)
617 : {
618 6563 : r[0] = LBase + SIndex / NCount;
619 6563 : r[1] = VBase + (SIndex % NCount) / TCount;
620 : }
621 :
622 13126 : if (TIndex)
623 : {
624 7320 : if (r)
625 3660 : r[2] = TBase + TIndex;
626 7320 : *result_len = 3;
627 : }
628 : else
629 5806 : *result_len = 2;
630 13126 : }
631 :
632 : /* returns a pointer to a null-terminated UTF-8 string */
633 : static const gchar *
634 1085730 : find_decomposition (gunichar ch, gboolean compat)
635 : {
636 1085730 : int start = 0;
637 1085730 : int end = G_N_ELEMENTS (decomp_table);
638 :
639 1085730 : if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
640 : {
641 : while (TRUE)
642 6672056 : {
643 7287978 : int half = (start + end) / 2;
644 7287978 : if (ch == decomp_table[half].ch)
645 : {
646 : int offset;
647 :
648 474716 : if (compat)
649 : {
650 474716 : offset = decomp_table[half].compat_offset;
651 474716 : if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
652 36212 : offset = decomp_table[half].canon_offset;
653 : }
654 : else
655 : {
656 0 : offset = decomp_table[half].canon_offset;
657 0 : if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
658 0 : return NULL;
659 : }
660 :
661 474716 : return &(decomp_expansion_string[offset]);
662 : }
663 6813262 : else if (half == start)
664 141206 : break;
665 6672056 : else if (ch > decomp_table[half].ch)
666 3084608 : start = half;
667 : else
668 3587448 : end = half;
669 : }
670 : }
671 :
672 611014 : return NULL;
673 : }
674 :
675 : /* L,V => LV and LV,T => LVT */
676 : static gboolean
677 1483980 : combine_hangul (gunichar a, gunichar b, gunichar * result)
678 : {
679 1483980 : if (a >= LBase && a < LCount + LBase && b >= VBase && b < VCount + VBase)
680 : {
681 7077 : gint LIndex = a - LBase;
682 7077 : gint VIndex = b - VBase;
683 :
684 7077 : *result = SBase + (LIndex * VCount + VIndex) * TCount;
685 7077 : return TRUE;
686 : }
687 :
688 1476903 : if (a >= SBase && a < SCount + SBase && b > TBase && b < TCount + TBase)
689 : {
690 3661 : gint SIndex = a - SBase;
691 :
692 3661 : if ((SIndex % TCount) == 0)
693 : {
694 3661 : gint TIndex = b - TBase;
695 :
696 3661 : *result = a + TIndex;
697 3661 : return TRUE;
698 : }
699 : }
700 :
701 1473242 : return FALSE;
702 : }
703 :
/* Composition index lookup.  A compose_table entry of at least
   G_UNICODE_MAX_TABLE_INDEX holds the index for the whole page (entry
   minus that constant); a smaller entry selects a row of compose_data
   that is indexed by the low byte of the character. */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of Char, or 0 when its page lies beyond
   COMPOSE_TABLE_LAST.  Every use of Char is parenthesised so that an
   argument such as `a | b' is shifted as a whole.  Char is evaluated
   several times. */
#define COMPOSE_INDEX(Char) \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) \
   ? 0 : CI ((Char) >> 8, (Char) & 0xff))
711 :
712 : static gboolean
713 1483980 : combine (gunichar a, gunichar b, gunichar * result)
714 : {
715 : gushort index_a, index_b;
716 :
717 1483980 : if (combine_hangul (a, b, result))
718 10738 : return TRUE;
719 :
720 1473242 : index_a = COMPOSE_INDEX (a);
721 :
722 1473242 : if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
723 : {
724 177288 : if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
725 : {
726 30806 : *result =
727 30806 : compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
728 30806 : return TRUE;
729 : }
730 : else
731 146482 : return FALSE;
732 : }
733 :
734 1295954 : index_b = COMPOSE_INDEX (b);
735 :
736 1295954 : if (index_b >= COMPOSE_SECOND_SINGLE_START)
737 : {
738 10403 : if (a ==
739 10403 : compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
740 : {
741 8837 : *result =
742 8837 : compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
743 8837 : return TRUE;
744 : }
745 : else
746 1566 : return FALSE;
747 : }
748 :
749 1285551 : if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
750 240731 : && index_b >= COMPOSE_SECOND_START
751 37700 : && index_b < COMPOSE_SECOND_SINGLE_START)
752 : {
753 37700 : gunichar res =
754 37700 : compose_array[index_a - COMPOSE_FIRST_START][index_b -
755 : COMPOSE_SECOND_START];
756 :
757 37700 : if (res)
758 : {
759 35402 : *result = res;
760 35402 : return TRUE;
761 : }
762 : }
763 :
764 1250149 : return FALSE;
765 : }
766 :
767 : static gunichar *
768 54130 : _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
769 : {
770 : gsize n_wc;
771 : gunichar *wc_buffer;
772 : const char *p;
773 : gsize last_start;
774 54130 : gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
775 54130 : gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
776 :
777 54130 : n_wc = 0;
778 54130 : p = str;
779 603558 : while ((max_len < 0 || p < str + max_len) && *p)
780 : {
781 : const gchar *decomp;
782 549428 : gunichar wc = g_utf8_get_char (p);
783 :
784 549428 : if (wc >= SBase && wc < SBase + SCount)
785 6563 : {
786 : gsize result_len;
787 6563 : decompose_hangul (wc, NULL, &result_len);
788 6563 : n_wc += result_len;
789 : }
790 : else
791 : {
792 542865 : decomp = find_decomposition (wc, do_compat);
793 :
794 542865 : if (decomp)
795 237358 : n_wc += g_utf8_strlen (decomp);
796 : else
797 305507 : n_wc++;
798 : }
799 :
800 549428 : p = g_utf8_next_char (p);
801 : }
802 :
803 54130 : wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1));
804 54130 : if (!wc_buffer)
805 0 : return NULL;
806 :
807 54130 : last_start = 0;
808 54130 : n_wc = 0;
809 54130 : p = str;
810 603558 : while ((max_len < 0 || p < str + max_len) && *p)
811 : {
812 549428 : gunichar wc = g_utf8_get_char (p);
813 : const gchar *decomp;
814 : int cc;
815 549428 : gsize old_n_wc = n_wc;
816 :
817 549428 : if (wc >= SBase && wc < SBase + SCount)
818 6563 : {
819 : gsize result_len;
820 6563 : decompose_hangul (wc, wc_buffer + n_wc, &result_len);
821 6563 : n_wc += result_len;
822 : }
823 : else
824 : {
825 542865 : decomp = find_decomposition (wc, do_compat);
826 :
827 542865 : if (decomp)
828 : {
829 : const char *pd;
830 1460478 : for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
831 1223120 : wc_buffer[n_wc++] = g_utf8_get_char (pd);
832 : }
833 : else
834 305507 : wc_buffer[n_wc++] = wc;
835 : }
836 :
837 549428 : if (n_wc > 0)
838 : {
839 549428 : cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
840 :
841 549428 : if (cc == 0)
842 : {
843 523684 : g_unicode_canonical_ordering (wc_buffer + last_start,
844 : n_wc - last_start);
845 523684 : last_start = old_n_wc;
846 : }
847 : }
848 :
849 549428 : p = g_utf8_next_char (p);
850 : }
851 :
852 54130 : if (n_wc > 0)
853 : {
854 52967 : g_unicode_canonical_ordering (wc_buffer + last_start,
855 : n_wc - last_start);
856 : /* dead assignment: last_start = n_wc; */
857 : }
858 :
859 54130 : wc_buffer[n_wc] = 0;
860 :
861 : /* All decomposed and reordered */
862 :
863 54130 : if (do_compose && n_wc > 0)
864 : {
865 : gsize i, j;
866 52967 : int last_cc = 0;
867 52967 : last_start = 0;
868 :
869 1598380 : for (i = 0; i < n_wc; i++)
870 : {
871 1545413 : int cc = COMBINING_CLASS (wc_buffer[i]);
872 :
873 1545413 : if (i > 0 &&
874 1514264 : (last_cc == 0 || last_cc != cc) &&
875 1483980 : combine (wc_buffer[last_start], wc_buffer[i],
876 1483980 : &wc_buffer[last_start]))
877 : {
878 13987439 : for (j = i + 1; j < n_wc; j++)
879 13901656 : wc_buffer[j - 1] = wc_buffer[j];
880 85783 : n_wc--;
881 85783 : i--;
882 :
883 85783 : if (i == last_start)
884 82904 : last_cc = 0;
885 : else
886 2879 : last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
887 :
888 85783 : continue;
889 : }
890 :
891 1459630 : if (cc == 0)
892 1426841 : last_start = i;
893 :
894 1459630 : last_cc = cc;
895 : }
896 : }
897 :
898 54130 : wc_buffer[n_wc] = 0;
899 :
900 54130 : return wc_buffer;
901 : }
902 :
903 : /*
904 : * g_utf8_normalize:
905 : * @str: a UTF-8 encoded string.
906 : * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
907 : * @mode: the type of normalization to perform.
908 : *
909 : * Converts a string into canonical form, standardizing
910 : * such issues as whether a character with an accent
911 : * is represented as a base character and combining
912 : * accent or as a single precomposed character. The
913 : * string has to be valid UTF-8, otherwise %NULL is
914 : * returned. You should generally call g_utf8_normalize()
915 : * before comparing two Unicode strings.
916 : *
917 : * The normalization mode %G_NORMALIZE_DEFAULT only
918 : * standardizes differences that do not affect the
919 : * text content, such as the above-mentioned accent
920 : * representation. %G_NORMALIZE_ALL also standardizes
921 : * the "compatibility" characters in Unicode, such
922 : * as SUPERSCRIPT THREE to the standard forms
923 : * (in this case DIGIT THREE). Formatting information
924 : * may be lost but for most text operations such
925 : * characters should be considered the same.
926 : *
927 : * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
928 : * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
929 : * but returned a result with composed forms rather
930 : * than a maximally decomposed form. This is often
931 : * useful if you intend to convert the string to
932 : * a legacy encoding or pass it to a system with
933 : * less capable Unicode handling.
934 : *
935 : * Return value: a newly allocated string, that is the
936 : * normalized form of @str, or %NULL if @str is not
937 : * valid UTF-8.
938 : **/
939 : static gchar *
940 189 : g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
941 : {
942 189 : gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
943 189 : gchar *result = NULL;
944 :
945 189 : if (result_wc)
946 189 : result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
947 :
948 189 : g_free (result_wc);
949 :
950 189 : return result;
951 : }
952 :
953 : /* Public Libidn API starts here. */
954 :
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Decodes the UTF-8 byte sequence at @p into a Unicode character.
 * The result is undefined when @p does not point to a valid UTF-8
 * encoded character.
 *
 * Return value: the resulting character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  const uint32_t wc = g_utf8_get_char (p);

  return wc;
}
970 :
/**
 * stringprep_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %NULL, only the length is computed and nothing is
 *       written.
 *
 * Encodes the single character @c as UTF-8.
 *
 * Return value: number of bytes written.
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  const int n_bytes = g_unichar_to_utf8 (c, outbuf);

  return n_bytes;
}
987 :
988 : #include <unistr.h>
989 :
990 : /**
991 : * stringprep_utf8_to_ucs4:
992 : * @str: a UTF-8 encoded string
993 : * @len: the maximum length of @str to use. If @len < 0, then
994 : * the string is nul-terminated.
995 : * @items_written: location to store the number of characters in the
996 : * result, or %NULL.
997 : *
998 : * Convert a string from UTF-8 to a 32-bit fixed width representation
999 : * as UCS-4. The function now performs error checking to verify that
1000 : * the input is valid UTF-8 (before it was documented to not do error
1001 : * checking).
1002 : *
1003 : * Return value: a pointer to a newly allocated UCS-4 string.
1004 : * This value must be deallocated by the caller.
1005 : **/
1006 : uint32_t *
1007 75525 : stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t *items_written)
1008 : {
1009 : size_t n;
1010 :
1011 75525 : if (len < 0)
1012 75525 : n = strlen (str);
1013 : else
1014 0 : n = len;
1015 :
1016 75525 : if (u8_check ((const uint8_t *) str, n))
1017 1711 : return NULL;
1018 :
1019 73814 : return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
1020 : }
1021 :
1022 : /**
1023 : * stringprep_ucs4_to_utf8:
1024 : * @str: a UCS-4 encoded string
1025 : * @len: the maximum length of @str to use. If @len < 0, then
1026 : * the string is terminated with a 0 character.
1027 : * @items_read: location to store number of characters read read, or %NULL.
1028 : * @items_written: location to store number of bytes written or %NULL.
1029 : * The value here stored does not include the trailing 0
1030 : * byte.
1031 : *
1032 : * Convert a string from a 32-bit fixed width representation as UCS-4.
1033 : * to UTF-8. The result will be terminated with a 0 byte.
1034 : *
1035 : * Return value: a pointer to a newly allocated UTF-8 string.
1036 : * This value must be deallocated by the caller.
1037 : * If an error occurs, %NULL will be returned.
1038 : **/
1039 : char *
1040 138175 : stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1041 : size_t *items_read, size_t *items_written)
1042 : {
1043 138175 : return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1044 : (glong *) items_written);
1045 : }
1046 :
1047 : /**
1048 : * stringprep_utf8_nfkc_normalize:
1049 : * @str: a UTF-8 encoded string.
1050 : * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1051 : *
1052 : * Converts a string into canonical form, standardizing
1053 : * such issues as whether a character with an accent
1054 : * is represented as a base character and combining
1055 : * accent or as a single precomposed character.
1056 : *
1057 : * The normalization mode is NFKC (ALL COMPOSE). It standardizes
1058 : * differences that do not affect the text content, such as the
1059 : * above-mentioned accent representation. It standardizes the
1060 : * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1061 : * the standard forms (in this case DIGIT THREE). Formatting
1062 : * information may be lost but for most text operations such
1063 : * characters should be considered the same. It returns a result with
1064 : * composed forms rather than a maximally decomposed form.
1065 : *
1066 : * Return value: a newly allocated string, that is the
1067 : * NFKC normalized form of @str.
1068 : **/
1069 : char *
1070 369 : stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1071 : {
1072 : size_t n;
1073 :
1074 369 : if (len < 0)
1075 1 : n = strlen (str);
1076 : else
1077 368 : n = len;
1078 :
1079 369 : if (u8_check ((const uint8_t *) str, n))
1080 180 : return NULL;
1081 :
1082 189 : return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1083 : }
1084 :
1085 : #include <stdio.h>
1086 : /**
1087 : * stringprep_ucs4_nfkc_normalize:
1088 : * @str: a Unicode string.
1089 : * @len: length of @str array, or -1 if @str is nul-terminated.
1090 : *
1091 : * Converts a UCS4 string into canonical form, see
1092 : * stringprep_utf8_nfkc_normalize() for more information.
1093 : *
1094 : * Return value: a newly allocated Unicode string, that is the NFKC
1095 : * normalized form of @str.
1096 : **/
1097 : uint32_t *
1098 54021 : stringprep_ucs4_nfkc_normalize (const uint32_t * str, ssize_t len)
1099 : {
1100 : char *p;
1101 : uint32_t *result_wc;
1102 :
1103 54021 : p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1104 54021 : if (!p)
1105 80 : return NULL;
1106 :
1107 53941 : result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1108 53941 : free (p);
1109 :
1110 53941 : return result_wc;
1111 : }
|