libidn  1.43
nfkc.c
Go to the documentation of this file.
1 /* nfkc.c --- Unicode normalization utilities.
2  Copyright (C) 2002-2025 Simon Josefsson
3 
4  This file is part of GNU Libidn.
5 
6  GNU Libidn is free software: you can redistribute it and/or
7  modify it under the terms of either:
8 
9  * the GNU Lesser General Public License as published by the Free
10  Software Foundation; either version 3 of the License, or (at
11  your option) any later version.
12 
13  or
14 
15  * the GNU General Public License as published by the Free
16  Software Foundation; either version 2 of the License, or (at
17  your option) any later version.
18 
19  or both in parallel, as here.
20 
21  GNU Libidn is distributed in the hope that it will be useful,
22  but WITHOUT ANY WARRANTY; without even the implied warranty of
23  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24  General Public License for more details.
25 
26  You should have received copies of the GNU General Public License and
27  the GNU Lesser General Public License along with this program. If
28  not, see <https://www.gnu.org/licenses/>. */
29 
30 #ifdef HAVE_CONFIG_H
31 # include "config.h"
32 #endif
33 
34 #include <stdlib.h>
35 #include <string.h>
36 
37 #include "stringprep.h"
38 
39 /* Hacks to make syncing with GLIB code easier. */
40 #define gboolean int
41 #define gchar char
42 #define guchar unsigned char
43 #define gint int
44 #define guint unsigned int
45 #define gushort unsigned short
46 #define gint16 int16_t
47 #define guint16 uint16_t
48 #define gunichar uint32_t
49 #define gsize size_t
50 #define gssize ssize_t
51 #define g_malloc malloc
52 #define g_free free
/* Return VAL from the enclosing function when EXPR is false.  The
   expansion is wrapped in do/while (0) so that it is a single
   statement and remains correct inside an unbraced if/else. */
#define g_return_val_if_fail(expr,val) \
  do \
    { \
      if (!(expr)) \
        return (val); \
    } \
  while (0)
57 
58 /* Code from GLIB gmacros.h starts here. */
59 
60 /* GLIB - Library of useful routines for C programming
61  * Copyright (C) 1995-1997 Peter Mattis, Spencer Kimball and Josh MacDonald
62  *
63  * This library is free software; you can redistribute it and/or
64  * modify it under the terms of the GNU Lesser General Public
65  * License as published by the Free Software Foundation; either
66  * version 2 of the License, or (at your option) any later version.
67  *
68  * This library is distributed in the hope that it will be useful,
69  * but WITHOUT ANY WARRANTY; without even the implied warranty of
70  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
71  * Lesser General Public License for more details.
72  */
73 
74 #ifndef FALSE
75 # define FALSE (0)
76 #endif
77 
78 #ifndef TRUE
79 # define TRUE (!FALSE)
80 #endif
81 
82 #define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
83 
84 #define G_UNLIKELY(expr) (expr)
85 
86 /* Code from GLIB gunicode.h starts here. */
87 
88 /* gunicode.h - Unicode manipulation functions
89  *
90  * Copyright (C) 1999, 2000 Tom Tromey
91  * Copyright 2000, 2005 Red Hat, Inc.
92  *
93  * The Gnome Library is free software; you can redistribute it and/or
94  * modify it under the terms of the GNU Lesser General Public License as
95  * published by the Free Software Foundation; either version 2 of the
96  * License, or (at your option) any later version.
97  *
98  * The Gnome Library is distributed in the hope that it will be useful,
99  * but WITHOUT ANY WARRANTY; without even the implied warranty of
100  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
101  * Lesser General Public License for more details.
102  */
103 
/* Normalization forms understood by the normalizer below.  Each GLib
   name has an alias carrying the corresponding Unicode form name. */
typedef enum
{
  G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
}
GNormalizeMode;
116 
117 #define g_utf8_next_char(p) ((p) + g_utf8_skip[*(const guchar *)(p)])
118 
119 /* Code from GLIB gutf8.c starts here. */
120 
121 /* gutf8.c - Operations on UTF-8 strings.
122  *
123  * Copyright (C) 1999 Tom Tromey
124  * Copyright (C) 2000 Red Hat, Inc.
125  *
126  * This library is free software; you can redistribute it and/or
127  * modify it under the terms of the GNU Lesser General Public
128  * License as published by the Free Software Foundation; either
129  * version 2 of the License, or (at your option) any later version.
130  *
131  * This library is distributed in the hope that it will be useful,
132  * but WITHOUT ANY WARRANTY; without even the implied warranty of
133  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
134  * Lesser General Public License for more details.
135  */
136 
/* Classify the UTF-8 lead byte Char.  Len receives the length of the
   sequence it starts (1..6), or -1 when Char cannot start a sequence.
   For a valid lead byte, Mask receives the mask that selects the
   payload bits of that byte; Mask is left untouched when Len is -1.
   The expansion is one statement, so it is safe in an unbraced
   if/else.  Char is evaluated several times. */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
        { \
          (Len) = 1; \
          (Mask) = 0x7f; \
        } \
      else if (((Char) & 0xe0) == 0xc0) \
        { \
          (Len) = 2; \
          (Mask) = 0x1f; \
        } \
      else if (((Char) & 0xf0) == 0xe0) \
        { \
          (Len) = 3; \
          (Mask) = 0x0f; \
        } \
      else if (((Char) & 0xf8) == 0xf0) \
        { \
          (Len) = 4; \
          (Mask) = 0x07; \
        } \
      else if (((Char) & 0xfc) == 0xf8) \
        { \
          (Len) = 5; \
          (Mask) = 0x03; \
        } \
      else if (((Char) & 0xfe) == 0xfc) \
        { \
          (Len) = 6; \
          (Mask) = 0x01; \
        } \
      else \
        (Len) = -1; \
    } \
  while (0)
170 
/* Number of bytes (1..6) needed to encode the code point Char in
   UTF-8: one byte, plus one more for every encoding boundary that
   Char has reached.  Char is evaluated several times. */
#define UTF8_LENGTH(Char) \
  (1 + ((Char) >= 0x80) + ((Char) >= 0x800) + ((Char) >= 0x10000) \
   + ((Char) >= 0x200000) + ((Char) >= 0x4000000))
177 
/* Decode the Len-byte UTF-8 sequence starting at Chars into Result.
   Mask selects the payload bits of the lead byte (as computed by
   UTF8_COMPUTE); Count is a scratch loop variable.  Result is set to
   -1 as soon as a byte that is not of the form 10xxxxxx is found where
   a continuation byte is expected.  The expansion is one statement, so
   it is safe in an unbraced if/else. */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
        { \
          if (((Chars)[(Count)] & 0xc0) != 0x80) \
            { \
              (Result) = -1; \
              break; \
            } \
          (Result) <<= 6; \
          (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } \
  while (0)
190 
191 static const gchar utf8_skip_data[256] = {
192  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
193  1, 1, 1, 1, 1, 1, 1,
194  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
195  1, 1, 1, 1, 1, 1, 1,
196  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
197  1, 1, 1, 1, 1, 1, 1,
198  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
199  1, 1, 1, 1, 1, 1, 1,
200  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
201  1, 1, 1, 1, 1, 1, 1,
202  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
203  1, 1, 1, 1, 1, 1, 1,
204  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
205  2, 2, 2, 2, 2, 2, 2,
206  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
207  5, 5, 5, 6, 6, 1, 1
208 };
209 
210 static const gchar *const g_utf8_skip = utf8_skip_data;
211 
212 /*
213  * g_utf8_strlen:
214  * @p: pointer to the start of a UTF-8 encoded string
215  * @max: the maximum number of bytes to examine. If @max
216  * is less than 0, then the string is assumed to be
217  * nul-terminated. If @max is 0, @p will not be examined and
218  * may be %NULL.
219  *
220  * Computes the length of the string in characters, not including
221  * the terminating nul character.
222  *
223  * Return value: the length of the string in characters
224  **/
225 static gsize
226 g_utf8_strlen (const gchar *p)
227 {
228  gsize len = 0;
229 
230  g_return_val_if_fail (p != NULL, 0);
231 
232  while (*p)
233  {
234  p = g_utf8_next_char (p);
235  ++len;
236  }
237 
238  return len;
239 }
240 
241 /*
242  * g_utf8_get_char:
243  * @p: a pointer to Unicode character encoded as UTF-8
244  *
245  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
246  * If @p does not point to a valid UTF-8 encoded character, results are
247  * undefined. If you are not sure that the bytes are complete
248  * valid Unicode characters, you should use g_utf8_get_char_validated()
249  * instead.
250  *
251  * Return value: the resulting character
252  **/
253 static gunichar
254 g_utf8_get_char (const gchar *p)
255 {
256  int i, mask = 0, len;
257  gunichar result;
258  unsigned char c = (unsigned char) *p;
259 
260  UTF8_COMPUTE (c, mask, len);
261  if (len == -1)
262  return (gunichar) - 1;
263  UTF8_GET (result, p, i, mask, len);
264 
265  return result;
266 }
267 
268 /*
269  * g_unichar_to_utf8:
270  * @c: a Unicode character code
271  * @outbuf: output buffer, must have at least 6 bytes of space.
272  * If %NULL, the length will be computed and returned
273  * and nothing will be written to @outbuf.
274  *
275  * Converts a single character to UTF-8.
276  *
277  * Return value: number of bytes written
278  **/
279 static int
280 g_unichar_to_utf8 (gunichar c, gchar *outbuf)
281 {
282  /* If this gets modified, also update the copy in g_string_insert_unichar() */
283  guint len = 0;
284  int first;
285  int i;
286 
287  if (c < 0x80)
288  {
289  first = 0;
290  len = 1;
291  }
292  else if (c < 0x800)
293  {
294  first = 0xc0;
295  len = 2;
296  }
297  else if (c < 0x10000)
298  {
299  first = 0xe0;
300  len = 3;
301  }
302  else if (c < 0x200000)
303  {
304  first = 0xf0;
305  len = 4;
306  }
307  else if (c < 0x4000000)
308  {
309  first = 0xf8;
310  len = 5;
311  }
312  else
313  {
314  first = 0xfc;
315  len = 6;
316  }
317 
318  if (outbuf)
319  {
320  for (i = len - 1; i > 0; --i)
321  {
322  outbuf[i] = (c & 0x3f) | 0x80;
323  c >>= 6;
324  }
325  outbuf[0] = c | first;
326  }
327 
328  return len;
329 }
330 
331 /*
332  * g_utf8_to_ucs4_fast:
333  * @str: a UTF-8 encoded string
334  * @len: the maximum length of @str to use, in bytes. If @len < 0,
335  * then the string is nul-terminated.
336  * @items_written: location to store the number of characters in the
337  * result, or %NULL.
338  *
339  * Convert a string from UTF-8 to a 32-bit fixed width
340  * representation as UCS-4, assuming valid UTF-8 input.
341  * This function is roughly twice as fast as g_utf8_to_ucs4()
342  * but does no error checking on the input. A trailing 0 character
343  * will be added to the string after the converted text.
344  *
345  * Return value: a pointer to a newly allocated UCS-4 string.
346  * This value must be freed with g_free().
347  **/
348 static gunichar *
349 g_utf8_to_ucs4_fast (const gchar *str, gssize len, gsize *items_written)
350 {
351  gunichar *result;
352  gsize n_chars, i;
353  const gchar *p;
354 
355  g_return_val_if_fail (str != NULL, NULL);
356 
357  p = str;
358  n_chars = 0;
359  if (len < 0)
360  {
361  while (*p)
362  {
363  p = g_utf8_next_char (p);
364  ++n_chars;
365  }
366  }
367  else
368  {
369  while (p < str + len && *p)
370  {
371  p = g_utf8_next_char (p);
372  ++n_chars;
373  }
374  }
375 
376  result = g_malloc (sizeof (gunichar) * (n_chars + 1));
377  if (!result)
378  return NULL;
379 
380  p = str;
381  for (i = 0; i < n_chars; i++)
382  {
383  gunichar wc = (guchar) * p++;
384 
385  if (wc < 0x80)
386  {
387  result[i] = wc;
388  }
389  else
390  {
391  gunichar mask = 0x40;
392 
393  if (G_UNLIKELY ((wc & mask) == 0))
394  {
395  /* It's an out-of-sequence 10xxxxxxx byte.
396  * Rather than making an ugly hash of this and the next byte
397  * and overrunning the buffer, it's more useful to treat it
398  * with a replacement character */
399  result[i] = 0xfffd;
400  continue;
401  }
402 
403  do
404  {
405  wc <<= 6;
406  wc |= (guchar) (*p++) & 0x3f;
407  mask <<= 5;
408  }
409  while ((wc & mask) != 0);
410 
411  wc &= mask - 1;
412 
413  result[i] = wc;
414  }
415  }
416  result[i] = 0;
417 
418  if (items_written)
419  *items_written = i;
420 
421  return result;
422 }
423 
424 /*
425  * g_ucs4_to_utf8:
426  * @str: a UCS-4 encoded string
427  * @len: the maximum length (number of characters) of @str to use.
428  * If @len < 0, then the string is nul-terminated.
429  * @items_read: location to store number of characters read, or %NULL.
430  * @items_written: location to store number of bytes written or %NULL.
431  * The value here stored does not include the trailing 0
432  * byte.
433  * @error: location to store the error occurring, or %NULL to ignore
434  * errors. Any of the errors in #GConvertError other than
435  * %G_CONVERT_ERROR_NO_CONVERSION may occur.
436  *
437  * Convert a string from a 32-bit fixed width representation as UCS-4.
438  * to UTF-8. The result will be terminated with a 0 byte.
439  *
440  * Return value: a pointer to a newly allocated UTF-8 string.
441  * This value must be freed with g_free(). If an
442  * error occurs, %NULL will be returned and
443  * @error set. In that case, @items_read will be
444  * set to the position of the first invalid input
445  * character.
446  **/
447 static gchar *
448 g_ucs4_to_utf8 (const gunichar *str,
449  gsize len, gsize *items_read, gsize *items_written)
450 {
451  gint result_length;
452  gchar *result = NULL;
453  gchar *p;
454  gsize i;
455 
456  result_length = 0;
457  for (i = 0; i < len; i++)
458  {
459  if (!str[i])
460  break;
461 
462  if (str[i] >= 0x80000000)
463  goto err_out;
464 
465  result_length += UTF8_LENGTH (str[i]);
466  }
467 
468  result = g_malloc (result_length + 1);
469  if (!result)
470  return NULL;
471  p = result;
472 
473  i = 0;
474  while (p < result + result_length)
475  p += g_unichar_to_utf8 (str[i++], p);
476 
477  *p = '\0';
478 
479  if (items_written)
480  *items_written = p - result;
481 
482 err_out:
483  if (items_read)
484  *items_read = i;
485 
486  return result;
487 }
488 
489 /* Code from GLIB gunidecomp.c starts here. */
490 
491 /* decomp.c - Character decomposition.
492  *
493  * Copyright (C) 1999, 2000 Tom Tromey
494  * Copyright 2000 Red Hat, Inc.
495  *
496  * The Gnome Library is free software; you can redistribute it and/or
497  * modify it under the terms of the GNU Lesser General Public License as
498  * published by the Free Software Foundation; either version 2 of the
499  * License, or (at your option) any later version.
500  *
501  * The Gnome Library is distributed in the hope that it will be useful,
502  * but WITHOUT ANY WARRANTY; without even the implied warranty of
503  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
504  * Lesser General Public License for more details.
505  */
506 
507 #include "gunidecomp.h"
508 #include "gunicomp.h"
509 
/* Look up one character in a two-level combining class table.  A page
   entry of at least G_UNICODE_MAX_TABLE_INDEX encodes a single class
   shared by the whole page; any other entry selects a row of
   cclass_data that holds one class per character of the page. */
#define CC_LOOKUP(Table, Page, Char) \
  (((Table)[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? ((Table)[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[(Table)[Page]][Char]))

#define CC_PART1(Page, Char) \
  CC_LOOKUP (combining_class_table_part1, Page, Char)

#define CC_PART2(Page, Char) \
  CC_LOOKUP (combining_class_table_part2, Page, Char)

/* Canonical combining class of Char.  Part 1 covers code points up to
   G_UNICODE_LAST_CHAR_PART1 and part 2 covers 0xe0000 up to
   G_UNICODE_LAST_CHAR; everything else has class 0. */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))
526 
527 /* constants for hangul syllable [de]composition */
528 #define SBase 0xAC00
529 #define LBase 0x1100
530 #define VBase 0x1161
531 #define TBase 0x11A7
532 #define LCount 19
533 #define VCount 21
534 #define TCount 28
535 #define NCount (VCount * TCount)
536 #define SCount (LCount * NCount)
537 
538 /*
539  * g_unicode_canonical_ordering:
540  * @string: a UCS-4 encoded string.
541  * @len: the maximum length of @string to use.
542  *
543  * Computes the canonical ordering of a string in-place.
544  * This rearranges decomposed characters in the string
545  * according to their combining classes. See the Unicode
546  * manual for more information.
547  **/
548 static void
549 g_unicode_canonical_ordering (gunichar *string, gsize len)
550 {
551  gsize i;
552  int swap = 1;
553 
554  while (swap)
555  {
556  int last;
557  swap = 0;
558  last = COMBINING_CLASS (string[0]);
559  for (i = 0; i < len - 1; ++i)
560  {
561  int next = COMBINING_CLASS (string[i + 1]);
562  if (next != 0 && last > next)
563  {
564  gsize j;
565  /* Percolate item leftward through string. */
566  for (j = i + 1; j > 0; --j)
567  {
568  gunichar t;
569  if (COMBINING_CLASS (string[j - 1]) <= next)
570  break;
571  t = string[j];
572  string[j] = string[j - 1];
573  string[j - 1] = t;
574  swap = 1;
575  }
576  /* We're re-entering the loop looking at the old
577  character again. */
578  next = last;
579  }
580  last = next;
581  }
582  }
583 }
584 
585 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
586  * r should be null or have sufficient space. Calling with r == NULL will
587  * only calculate the result_len; however, a buffer with space for three
588  * characters will always be big enough. */
589 static void
590 decompose_hangul (gunichar s, gunichar *r, gsize *result_len)
591 {
592  gint SIndex = s - SBase;
593  gint TIndex = SIndex % TCount;
594 
595  if (r)
596  {
597  r[0] = LBase + SIndex / NCount;
598  r[1] = VBase + (SIndex % NCount) / TCount;
599  }
600 
601  if (TIndex)
602  {
603  if (r)
604  r[2] = TBase + TIndex;
605  *result_len = 3;
606  }
607  else
608  *result_len = 2;
609 }
610 
611 /* returns a pointer to a null-terminated UTF-8 string */
612 static const gchar *
613 find_decomposition (gunichar ch, gboolean compat)
614 {
615  int start = 0;
616  int end = G_N_ELEMENTS (decomp_table);
617 
618  if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
619  {
620  while (TRUE)
621  {
622  int half = (start + end) / 2;
623  if (ch == decomp_table[half].ch)
624  {
625  int offset;
626 
627  if (compat)
628  {
629  offset = decomp_table[half].compat_offset;
630  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
631  offset = decomp_table[half].canon_offset;
632  }
633  else
634  {
635  offset = decomp_table[half].canon_offset;
636  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
637  return NULL;
638  }
639 
640  return &(decomp_expansion_string[offset]);
641  }
642  else if (half == start)
643  break;
644  else if (ch > decomp_table[half].ch)
645  start = half;
646  else
647  end = half;
648  }
649  }
650 
651  return NULL;
652 }
653 
654 /* L,V => LV and LV,T => LVT */
655 static gboolean
656 combine_hangul (gunichar a, gunichar b, gunichar *result)
657 {
658  if (a >= LBase && a < LCount + LBase && b >= VBase && b < VCount + VBase)
659  {
660  gint LIndex = a - LBase;
661  gint VIndex = b - VBase;
662 
663  *result = SBase + (LIndex * VCount + VIndex) * TCount;
664  return TRUE;
665  }
666 
667  if (a >= SBase && a < SCount + SBase && b > TBase && b < TCount + TBase)
668  {
669  gint SIndex = a - SBase;
670 
671  if ((SIndex % TCount) == 0)
672  {
673  gint TIndex = b - TBase;
674 
675  *result = a + TIndex;
676  return TRUE;
677  }
678  }
679 
680  return FALSE;
681 }
682 
/* Look up one character in the two-level compose index table.  A page
   entry of at least G_UNICODE_MAX_TABLE_INDEX encodes a single index
   shared by the whole page; any other entry selects a row of
   compose_data that holds one index per character of the page. */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Compose index of Char, or 0 for characters beyond the last page of
   compose_table.  Every use of Char is parenthesized so that an
   expression argument is shifted as a whole. */
#define COMPOSE_INDEX(Char) \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) \
   ? 0 \
   : CI ((Char) >> 8, (Char) & 0xff))
690 
691 static gboolean
692 combine (gunichar a, gunichar b, gunichar *result)
693 {
694  gushort index_a, index_b;
695 
696  if (combine_hangul (a, b, result))
697  return TRUE;
698 
699  index_a = COMPOSE_INDEX (a);
700 
701  if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
702  {
703  if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
704  {
705  *result =
706  compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
707  return TRUE;
708  }
709  else
710  return FALSE;
711  }
712 
713  index_b = COMPOSE_INDEX (b);
714 
715  if (index_b >= COMPOSE_SECOND_SINGLE_START)
716  {
717  if (a ==
718  compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
719  {
720  *result =
721  compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
722  return TRUE;
723  }
724  else
725  return FALSE;
726  }
727 
728  if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
729  && index_b >= COMPOSE_SECOND_START
730  && index_b < COMPOSE_SECOND_SINGLE_START)
731  {
732  gunichar res =
733  compose_array[index_a - COMPOSE_FIRST_START][index_b -
735 
736  if (res)
737  {
738  *result = res;
739  return TRUE;
740  }
741  }
742 
743  return FALSE;
744 }
745 
746 static gunichar *
747 _g_utf8_normalize_wc (const gchar *str, gssize max_len, GNormalizeMode mode)
748 {
749  gsize n_wc;
750  gunichar *wc_buffer;
751  const char *p;
752  gsize last_start;
753  gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
754  gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
755 
756  n_wc = 0;
757  p = str;
758  while ((max_len < 0 || p < str + max_len) && *p)
759  {
760  const gchar *decomp;
761  gunichar wc = g_utf8_get_char (p);
762 
763  if (wc >= SBase && wc < SBase + SCount)
764  {
765  gsize result_len;
766  decompose_hangul (wc, NULL, &result_len);
767  n_wc += result_len;
768  }
769  else
770  {
771  decomp = find_decomposition (wc, do_compat);
772 
773  if (decomp)
774  n_wc += g_utf8_strlen (decomp);
775  else
776  n_wc++;
777  }
778 
779  p = g_utf8_next_char (p);
780  }
781 
782  wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1));
783  if (!wc_buffer)
784  return NULL;
785 
786  last_start = 0;
787  n_wc = 0;
788  p = str;
789  while ((max_len < 0 || p < str + max_len) && *p)
790  {
791  gunichar wc = g_utf8_get_char (p);
792  const gchar *decomp;
793  int cc;
794  gsize old_n_wc = n_wc;
795 
796  if (wc >= SBase && wc < SBase + SCount)
797  {
798  gsize result_len;
799  decompose_hangul (wc, wc_buffer + n_wc, &result_len);
800  n_wc += result_len;
801  }
802  else
803  {
804  decomp = find_decomposition (wc, do_compat);
805 
806  if (decomp)
807  {
808  const char *pd;
809  for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
810  wc_buffer[n_wc++] = g_utf8_get_char (pd);
811  }
812  else
813  wc_buffer[n_wc++] = wc;
814  }
815 
816  if (n_wc > 0)
817  {
818  cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
819 
820  if (cc == 0)
821  {
822  g_unicode_canonical_ordering (wc_buffer + last_start,
823  n_wc - last_start);
824  last_start = old_n_wc;
825  }
826  }
827 
828  p = g_utf8_next_char (p);
829  }
830 
831  if (n_wc > 0)
832  {
833  g_unicode_canonical_ordering (wc_buffer + last_start,
834  n_wc - last_start);
835  /* dead assignment: last_start = n_wc; */
836  }
837 
838  wc_buffer[n_wc] = 0;
839 
840  /* All decomposed and reordered */
841 
842  if (do_compose && n_wc > 0)
843  {
844  gsize i, j;
845  int last_cc = 0;
846  last_start = 0;
847 
848  for (i = 0; i < n_wc; i++)
849  {
850  int cc = COMBINING_CLASS (wc_buffer[i]);
851 
852  if (i > 0 &&
853  (last_cc == 0 || last_cc != cc) &&
854  combine (wc_buffer[last_start], wc_buffer[i],
855  &wc_buffer[last_start]))
856  {
857  for (j = i + 1; j < n_wc; j++)
858  wc_buffer[j - 1] = wc_buffer[j];
859  n_wc--;
860  i--;
861 
862  if (i == last_start)
863  last_cc = 0;
864  else
865  last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
866 
867  continue;
868  }
869 
870  if (cc == 0)
871  last_start = i;
872 
873  last_cc = cc;
874  }
875  }
876 
877  wc_buffer[n_wc] = 0;
878 
879  return wc_buffer;
880 }
881 
882 /*
883  * g_utf8_normalize:
884  * @str: a UTF-8 encoded string.
885  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
886  * @mode: the type of normalization to perform.
887  *
888  * Converts a string into canonical form, standardizing
889  * such issues as whether a character with an accent
890  * is represented as a base character and combining
891  * accent or as a single precomposed character. The
892  * string has to be valid UTF-8, otherwise %NULL is
893  * returned. You should generally call g_utf8_normalize()
894  * before comparing two Unicode strings.
895  *
896  * The normalization mode %G_NORMALIZE_DEFAULT only
897  * standardizes differences that do not affect the
898  * text content, such as the above-mentioned accent
899  * representation. %G_NORMALIZE_ALL also standardizes
900  * the "compatibility" characters in Unicode, such
901  * as SUPERSCRIPT THREE to the standard forms
902  * (in this case DIGIT THREE). Formatting information
903  * may be lost but for most text operations such
904  * characters should be considered the same.
905  *
906  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
907  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
908  * but returned a result with composed forms rather
909  * than a maximally decomposed form. This is often
910  * useful if you intend to convert the string to
911  * a legacy encoding or pass it to a system with
912  * less capable Unicode handling.
913  *
914  * Return value: a newly allocated string, that is the
915  * normalized form of @str, or %NULL if @str is not
916  * valid UTF-8.
917  **/
918 static gchar *
919 g_utf8_normalize (const gchar *str, gssize len, GNormalizeMode mode)
920 {
921  gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
922  gchar *result = NULL;
923 
924  if (result_wc)
925  result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
926 
927  g_free (result_wc);
928 
929  return result;
930 }
931 
932 /* Public Libidn API starts here. */
933 
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to a Unicode character encoded as UTF-8
 *
 * Converts the UTF-8 sequence that starts at @p to a Unicode
 * character.  The bytes are not validated beyond the structural
 * checks made by the decoder, so @p should point to well-formed
 * UTF-8.
 *
 * Return value: the resulting character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  return g_utf8_get_char (p);
}
949 
/**
 * stringprep_unichar_to_utf8:
 * @c: a Unicode character code
 * @outbuf: output buffer with room for at least 6 bytes, or %NULL to
 *   only compute the encoded length
 *
 * Converts a single character to UTF-8.  No terminating nul byte is
 * written.
 *
 * Return value: number of bytes the encoding of @c occupies.
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  const int n_bytes = g_unichar_to_utf8 (c, outbuf);

  return n_bytes;
}
966 
967 #include <unistr.h>
968 
/**
 * stringprep_utf8_to_ucs4:
 * @str: a UTF-8 encoded string
 * @len: the maximum length of @str to use, in bytes.  If @len < 0,
 *   then the string is nul-terminated.
 * @items_written: location to store the number of characters in the
 *   result, or %NULL.
 *
 * Converts a string from UTF-8 to a 32-bit fixed width representation
 * as UCS-4.  The input is checked with u8_check() first; a trailing
 * 0 character is added after the converted text.
 *
 * Return value: a pointer to a newly allocated UCS-4 string that must
 *   be deallocated by the caller, or %NULL if the input is rejected by
 *   u8_check() or the conversion fails.
 **/
uint32_t *
stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t *items_written)
{
  const size_t n_bytes = (len < 0) ? strlen (str) : (size_t) len;

  /* u8_check() returns a pointer to the first invalid unit, if any. */
  if (u8_check ((const uint8_t *) str, n_bytes) != NULL)
    return NULL;

  return g_utf8_to_ucs4_fast (str, len, items_written);
}
1000 
/**
 * stringprep_ucs4_to_utf8:
 * @str: a UCS-4 encoded string
 * @len: the maximum number of characters of @str to use.  If
 *   @len < 0, then the string is terminated with a 0 character.
 * @items_read: location to store number of characters read, or %NULL.
 * @items_written: location to store number of bytes written or %NULL.
 *   The value stored here does not include the trailing 0 byte.
 *
 * Converts a string from a 32-bit fixed width representation as UCS-4
 * to UTF-8.  The result is terminated with a 0 byte.
 *
 * Return value: a pointer to a newly allocated UTF-8 string that must
 *   be deallocated by the caller, or %NULL if the conversion fails.
 **/
char *
stringprep_ucs4_to_utf8 (const uint32_t *str, ssize_t len,
                         size_t *items_read, size_t *items_written)
{
  /* The converter takes an unsigned count and stops at the first 0
     character, so a negative @len becomes "until the terminator". */
  const size_t max_chars = (size_t) len;

  return g_ucs4_to_utf8 (str, max_chars, items_read, items_written);
}
1024 
1047 char *
1048 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1049 {
1050  size_t n;
1051 
1052  if (len < 0)
1053  n = strlen (str);
1054  else
1055  n = len;
1056 
1057  if (u8_check ((const uint8_t *) str, n))
1058  return NULL;
1059 
1060  return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1061 }
1062 
1063 #include <stdio.h>
1075 uint32_t *
1076 stringprep_ucs4_nfkc_normalize (const uint32_t *str, ssize_t len)
1077 {
1078  char *p;
1079  uint32_t *result_wc;
1080 
1081  p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1082  if (!p)
1083  return NULL;
1084 
1085  result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1086  free (p);
1087 
1088  return result_wc;
1089 }
#define COMPOSE_SECOND_SINGLE_START
Definition: gunicomp.h:7
#define COMPOSE_SECOND_START
Definition: gunicomp.h:6
#define COMPOSE_FIRST_START
Definition: gunicomp.h:4
#define COMPOSE_FIRST_SINGLE_START
Definition: gunicomp.h:5
#define G_UNICODE_NOT_PRESENT_OFFSET
Definition: gunidecomp.h:15
#define g_return_val_if_fail(expr, val)
Definition: nfkc.c:53
#define SCount
Definition: nfkc.c:536
#define gssize
Definition: nfkc.c:50
#define gushort
Definition: nfkc.c:45
#define UTF8_COMPUTE(Char, Mask, Len)
Definition: nfkc.c:137
int stringprep_unichar_to_utf8(uint32_t c, char *outbuf)
Definition: nfkc.c:962
#define gunichar
Definition: nfkc.c:48
#define COMPOSE_INDEX(Char)
Definition: nfkc.c:688
#define guint
Definition: nfkc.c:44
#define g_free
Definition: nfkc.c:52
#define G_N_ELEMENTS(arr)
Definition: nfkc.c:82
#define gchar
Definition: nfkc.c:41
char * stringprep_utf8_nfkc_normalize(const char *str, ssize_t len)
Definition: nfkc.c:1048
#define LBase
Definition: nfkc.c:529
#define gint
Definition: nfkc.c:43
#define UTF8_LENGTH(Char)
Definition: nfkc.c:171
#define g_utf8_next_char(p)
Definition: nfkc.c:117
char * stringprep_ucs4_to_utf8(const uint32_t *str, ssize_t len, size_t *items_read, size_t *items_written)
Definition: nfkc.c:1019
#define TRUE
Definition: nfkc.c:79
#define FALSE
Definition: nfkc.c:75
#define G_UNLIKELY(expr)
Definition: nfkc.c:84
#define TBase
Definition: nfkc.c:531
#define UTF8_GET(Result, Chars, Count, Mask, Len)
Definition: nfkc.c:178
#define VBase
Definition: nfkc.c:530
uint32_t stringprep_utf8_to_unichar(const char *p)
Definition: nfkc.c:945
#define COMBINING_CLASS(Char)
Definition: nfkc.c:520
#define NCount
Definition: nfkc.c:535
#define guchar
Definition: nfkc.c:42
#define g_malloc
Definition: nfkc.c:51
GNormalizeMode
Definition: nfkc.c:105
@ G_NORMALIZE_DEFAULT_COMPOSE
Definition: nfkc.c:108
@ G_NORMALIZE_NFKC
Definition: nfkc.c:113
@ G_NORMALIZE_NFKD
Definition: nfkc.c:111
@ G_NORMALIZE_ALL
Definition: nfkc.c:110
@ G_NORMALIZE_NFD
Definition: nfkc.c:107
@ G_NORMALIZE_DEFAULT
Definition: nfkc.c:106
@ G_NORMALIZE_ALL_COMPOSE
Definition: nfkc.c:112
@ G_NORMALIZE_NFC
Definition: nfkc.c:109
#define SBase
Definition: nfkc.c:528
#define TCount
Definition: nfkc.c:534
uint32_t * stringprep_utf8_to_ucs4(const char *str, ssize_t len, size_t *items_written)
Definition: nfkc.c:986
#define gsize
Definition: nfkc.c:49
uint32_t * stringprep_ucs4_nfkc_normalize(const uint32_t *str, ssize_t len)
Definition: nfkc.c:1076
#define VCount
Definition: nfkc.c:533
#define gboolean
Definition: nfkc.c:40