libidn  1.42
nfkc.c
Go to the documentation of this file.
1 /* nfkc.c --- Unicode normalization utilities.
2  Copyright (C) 2002-2024 Simon Josefsson
3 
4  This file is part of GNU Libidn.
5 
6  GNU Libidn is free software: you can redistribute it and/or
7  modify it under the terms of either:
8 
9  * the GNU Lesser General Public License as published by the Free
10  Software Foundation; either version 3 of the License, or (at
11  your option) any later version.
12 
13  or
14 
15  * the GNU General Public License as published by the Free
16  Software Foundation; either version 2 of the License, or (at
17  your option) any later version.
18 
19  or both in parallel, as here.
20 
21  GNU Libidn is distributed in the hope that it will be useful,
22  but WITHOUT ANY WARRANTY; without even the implied warranty of
23  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24  General Public License for more details.
25 
26  You should have received copies of the GNU General Public License and
27  the GNU Lesser General Public License along with this program. If
28  not, see <https://www.gnu.org/licenses/>. */
29 
30 #ifdef HAVE_CONFIG_H
31 # include "config.h"
32 #endif
33 
34 #include <stdlib.h>
35 #include <string.h>
36 
37 #include "stringprep.h"
38 
39 /* Hacks to make syncing with GLIB code easier. */
/* GLib compatibility shims: map the GLib type and allocator names used
   by the imported code below onto their standard C equivalents. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc
#define g_free free

/* Return VAL from the enclosing function when EXPR is false.  The body
   is wrapped in do/while(0) so the macro expands to exactly one
   statement and remains safe inside an unbraced if/else. */
#define g_return_val_if_fail(expr,val) \
  do \
    { \
      if (!(expr)) \
        return (val); \
    } \
  while (0)
57 
58 /* Code from GLIB gmacros.h starts here. */
59 
60 /* GLIB - Library of useful routines for C programming
61  * Copyright (C) 1995-1997 Peter Mattis, Spencer Kimball and Josh MacDonald
62  *
63  * This library is free software; you can redistribute it and/or
64  * modify it under the terms of the GNU Lesser General Public
65  * License as published by the Free Software Foundation; either
66  * version 2 of the License, or (at your option) any later version.
67  *
68  * This library is distributed in the hope that it will be useful,
69  * but WITHOUT ANY WARRANTY; without even the implied warranty of
70  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
71  * Lesser General Public License for more details.
72  *
73  * You should have received a copy of the GNU Lesser General Public
74  * License along with this library; if not, write to the
75  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
76  * Boston, MA 02111-1307, USA.
77  */
78 
/* Boolean constants, supplied only when nothing has defined them yet. */
#if !defined (FALSE)
# define FALSE (0)
#endif

#if !defined (TRUE)
# define TRUE (!FALSE)
#endif

/* Number of elements in ARR.  ARR must be a real array object; applied
   to a pointer the division does not yield an element count. */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof (*(arr)))

/* Evaluates to EXPR unchanged; no branch hint is applied here. */
#define G_UNLIKELY(expr) (expr)
90 
91 /* Code from GLIB gunicode.h starts here. */
92 
93 /* gunicode.h - Unicode manipulation functions
94  *
95  * Copyright (C) 1999, 2000 Tom Tromey
96  * Copyright 2000, 2005 Red Hat, Inc.
97  *
98  * The Gnome Library is free software; you can redistribute it and/or
99  * modify it under the terms of the GNU Lesser General Public License as
100  * published by the Free Software Foundation; either version 2 of the
101  * License, or (at your option) any later version.
102  *
103  * The Gnome Library is distributed in the hope that it will be useful,
104  * but WITHOUT ANY WARRANTY; without even the implied warranty of
105  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
106  * Lesser General Public License for more details.
107  *
108  * You should have received a copy of the GNU Lesser General Public
109  * License along with the Gnome Library; see the file COPYING.LIB. If not,
110  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
111  * Boston, MA 02111-1307, USA.
112  */
113 
/* Normalization forms accepted by the normalizer.  Each NF* name is an
   alias of the GLib-style name listed just before it, so the enum has
   four distinct values. */
typedef enum
{
  G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
}
GNormalizeMode;

/* Step P past the character it points at: the byte at P indexes
   g_utf8_skip, which gives the number of bytes to advance. */
#define g_utf8_next_char(p) ((p) + g_utf8_skip[*(const guchar *)(p)])
128 
129 /* Code from GLIB gutf8.c starts here. */
130 
131 /* gutf8.c - Operations on UTF-8 strings.
132  *
133  * Copyright (C) 1999 Tom Tromey
134  * Copyright (C) 2000 Red Hat, Inc.
135  *
136  * This library is free software; you can redistribute it and/or
137  * modify it under the terms of the GNU Lesser General Public
138  * License as published by the Free Software Foundation; either
139  * version 2 of the License, or (at your option) any later version.
140  *
141  * This library is distributed in the hope that it will be useful,
142  * but WITHOUT ANY WARRANTY; without even the implied warranty of
143  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
144  * Lesser General Public License for more details.
145  *
146  * You should have received a copy of the GNU Lesser General Public
147  * License along with this library; if not, write to the
148  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
149  * Boston, MA 02111-1307, USA.
150  */
151 
/* Classify CHAR, the first byte of a UTF-8 sequence.  LEN receives the
   sequence length (1 to 6) and MASK the bits of the first byte that
   carry payload.  When CHAR cannot start a sequence LEN is set to -1 and
   MASK is left untouched.  Arguments are parenthesized and the body is a
   single do/while(0) statement, so expression arguments and unbraced
   if/else callers behave as expected. */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
        { \
          (Len) = 1; \
          (Mask) = 0x7f; \
        } \
      else if (((Char) & 0xe0) == 0xc0) \
        { \
          (Len) = 2; \
          (Mask) = 0x1f; \
        } \
      else if (((Char) & 0xf0) == 0xe0) \
        { \
          (Len) = 3; \
          (Mask) = 0x0f; \
        } \
      else if (((Char) & 0xf8) == 0xf0) \
        { \
          (Len) = 4; \
          (Mask) = 0x07; \
        } \
      else if (((Char) & 0xfc) == 0xf8) \
        { \
          (Len) = 5; \
          (Mask) = 0x03; \
        } \
      else if (((Char) & 0xfe) == 0xfc) \
        { \
          (Len) = 6; \
          (Mask) = 0x01; \
        } \
      else \
        (Len) = -1; \
    } \
  while (0)
185 
/* Number of bytes (1 to 6) used to encode the code point CHAR in UTF-8.
   Starting from one byte, every threshold CHAR reaches adds one more. */
#define UTF8_LENGTH(Char) \
  (1 \
   + ((Char) >= 0x80) \
   + ((Char) >= 0x800) \
   + ((Char) >= 0x10000) \
   + ((Char) >= 0x200000) \
   + ((Char) >= 0x4000000))
192 
/* Decode the LEN-byte UTF-8 sequence at CHARS into RESULT.  MASK selects
   the payload bits of the first byte; COUNT is a scratch loop variable.
   Each following byte must look like 10xxxxxx and contributes six bits;
   if one does not, RESULT is set to -1 and decoding stops.  Wrapped in
   do/while(0) so the two statements always act as one. */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
        { \
          if (((Chars)[(Count)] & 0xc0) != 0x80) \
            { \
              (Result) = -1; \
              break; \
            } \
          (Result) <<= 6; \
          (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } \
  while (0)
205 
206 static const gchar utf8_skip_data[256] = {
207  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
208  1, 1, 1, 1, 1, 1, 1,
209  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
210  1, 1, 1, 1, 1, 1, 1,
211  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
212  1, 1, 1, 1, 1, 1, 1,
213  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
214  1, 1, 1, 1, 1, 1, 1,
215  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
216  1, 1, 1, 1, 1, 1, 1,
217  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
218  1, 1, 1, 1, 1, 1, 1,
219  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
220  2, 2, 2, 2, 2, 2, 2,
221  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
222  5, 5, 5, 6, 6, 1, 1
223 };
224 
225 static const gchar *const g_utf8_skip = utf8_skip_data;
226 
227 /*
228  * g_utf8_strlen:
229  * @p: pointer to the start of a UTF-8 encoded string
230  * @max: the maximum number of bytes to examine. If @max
231  * is less than 0, then the string is assumed to be
232  * nul-terminated. If @max is 0, @p will not be examined and
233  * may be %NULL.
234  *
235  * Computes the length of the string in characters, not including
236  * the terminating nul character.
237  *
238  * Return value: the length of the string in characters
239  **/
240 static gsize
241 g_utf8_strlen (const gchar *p)
242 {
243  gsize len = 0;
244 
245  g_return_val_if_fail (p != NULL, 0);
246 
247  while (*p)
248  {
249  p = g_utf8_next_char (p);
250  ++len;
251  }
252 
253  return len;
254 }
255 
256 /*
257  * g_utf8_get_char:
258  * @p: a pointer to Unicode character encoded as UTF-8
259  *
260  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
261  * If @p does not point to a valid UTF-8 encoded character, results are
262  * undefined. If you are not sure that the bytes are complete
263  * valid Unicode characters, you should use g_utf8_get_char_validated()
264  * instead.
265  *
266  * Return value: the resulting character
267  **/
268 static gunichar
269 g_utf8_get_char (const gchar *p)
270 {
271  int i, mask = 0, len;
272  gunichar result;
273  unsigned char c = (unsigned char) *p;
274 
275  UTF8_COMPUTE (c, mask, len);
276  if (len == -1)
277  return (gunichar) - 1;
278  UTF8_GET (result, p, i, mask, len);
279 
280  return result;
281 }
282 
283 /*
284  * g_unichar_to_utf8:
285  * @c: a Unicode character code
286  * @outbuf: output buffer, must have at least 6 bytes of space.
287  * If %NULL, the length will be computed and returned
288  * and nothing will be written to @outbuf.
289  *
290  * Converts a single character to UTF-8.
291  *
292  * Return value: number of bytes written
293  **/
294 static int
295 g_unichar_to_utf8 (gunichar c, gchar *outbuf)
296 {
297  /* If this gets modified, also update the copy in g_string_insert_unichar() */
298  guint len = 0;
299  int first;
300  int i;
301 
302  if (c < 0x80)
303  {
304  first = 0;
305  len = 1;
306  }
307  else if (c < 0x800)
308  {
309  first = 0xc0;
310  len = 2;
311  }
312  else if (c < 0x10000)
313  {
314  first = 0xe0;
315  len = 3;
316  }
317  else if (c < 0x200000)
318  {
319  first = 0xf0;
320  len = 4;
321  }
322  else if (c < 0x4000000)
323  {
324  first = 0xf8;
325  len = 5;
326  }
327  else
328  {
329  first = 0xfc;
330  len = 6;
331  }
332 
333  if (outbuf)
334  {
335  for (i = len - 1; i > 0; --i)
336  {
337  outbuf[i] = (c & 0x3f) | 0x80;
338  c >>= 6;
339  }
340  outbuf[0] = c | first;
341  }
342 
343  return len;
344 }
345 
346 /*
347  * g_utf8_to_ucs4_fast:
348  * @str: a UTF-8 encoded string
349  * @len: the maximum length of @str to use, in bytes. If @len < 0,
350  * then the string is nul-terminated.
351  * @items_written: location to store the number of characters in the
352  * result, or %NULL.
353  *
354  * Convert a string from UTF-8 to a 32-bit fixed width
355  * representation as UCS-4, assuming valid UTF-8 input.
356  * This function is roughly twice as fast as g_utf8_to_ucs4()
357  * but does no error checking on the input. A trailing 0 character
358  * will be added to the string after the converted text.
359  *
360  * Return value: a pointer to a newly allocated UCS-4 string.
361  * This value must be freed with g_free().
362  **/
363 static gunichar *
364 g_utf8_to_ucs4_fast (const gchar *str, gssize len, gsize *items_written)
365 {
366  gunichar *result;
367  gsize n_chars, i;
368  const gchar *p;
369 
370  g_return_val_if_fail (str != NULL, NULL);
371 
372  p = str;
373  n_chars = 0;
374  if (len < 0)
375  {
376  while (*p)
377  {
378  p = g_utf8_next_char (p);
379  ++n_chars;
380  }
381  }
382  else
383  {
384  while (p < str + len && *p)
385  {
386  p = g_utf8_next_char (p);
387  ++n_chars;
388  }
389  }
390 
391  result = g_malloc (sizeof (gunichar) * (n_chars + 1));
392  if (!result)
393  return NULL;
394 
395  p = str;
396  for (i = 0; i < n_chars; i++)
397  {
398  gunichar wc = (guchar) * p++;
399 
400  if (wc < 0x80)
401  {
402  result[i] = wc;
403  }
404  else
405  {
406  gunichar mask = 0x40;
407 
408  if (G_UNLIKELY ((wc & mask) == 0))
409  {
410  /* It's an out-of-sequence 10xxxxxxx byte.
411  * Rather than making an ugly hash of this and the next byte
412  * and overrunning the buffer, it's more useful to treat it
413  * with a replacement character */
414  result[i] = 0xfffd;
415  continue;
416  }
417 
418  do
419  {
420  wc <<= 6;
421  wc |= (guchar) (*p++) & 0x3f;
422  mask <<= 5;
423  }
424  while ((wc & mask) != 0);
425 
426  wc &= mask - 1;
427 
428  result[i] = wc;
429  }
430  }
431  result[i] = 0;
432 
433  if (items_written)
434  *items_written = i;
435 
436  return result;
437 }
438 
439 /*
440  * g_ucs4_to_utf8:
441  * @str: a UCS-4 encoded string
442  * @len: the maximum length (number of characters) of @str to use.
443  * If @len < 0, then the string is nul-terminated.
444  * @items_read: location to store number of characters read, or %NULL.
445  * @items_written: location to store number of bytes written or %NULL.
446  * The value here stored does not include the trailing 0
447  * byte.
448  * @error: location to store the error occurring, or %NULL to ignore
449  * errors. Any of the errors in #GConvertError other than
450  * %G_CONVERT_ERROR_NO_CONVERSION may occur.
451  *
452  * Convert a string from a 32-bit fixed width representation as UCS-4.
453  * to UTF-8. The result will be terminated with a 0 byte.
454  *
455  * Return value: a pointer to a newly allocated UTF-8 string.
456  * This value must be freed with g_free(). If an
457  * error occurs, %NULL will be returned and
458  * @error set. In that case, @items_read will be
459  * set to the position of the first invalid input
460  * character.
461  **/
462 static gchar *
463 g_ucs4_to_utf8 (const gunichar *str,
464  gsize len, gsize *items_read, gsize *items_written)
465 {
466  gint result_length;
467  gchar *result = NULL;
468  gchar *p;
469  gsize i;
470 
471  result_length = 0;
472  for (i = 0; i < len; i++)
473  {
474  if (!str[i])
475  break;
476 
477  if (str[i] >= 0x80000000)
478  goto err_out;
479 
480  result_length += UTF8_LENGTH (str[i]);
481  }
482 
483  result = g_malloc (result_length + 1);
484  if (!result)
485  return NULL;
486  p = result;
487 
488  i = 0;
489  while (p < result + result_length)
490  p += g_unichar_to_utf8 (str[i++], p);
491 
492  *p = '\0';
493 
494  if (items_written)
495  *items_written = p - result;
496 
497 err_out:
498  if (items_read)
499  *items_read = i;
500 
501  return result;
502 }
503 
504 /* Code from GLIB gunidecomp.c starts here. */
505 
506 /* decomp.c - Character decomposition.
507  *
508  * Copyright (C) 1999, 2000 Tom Tromey
509  * Copyright 2000 Red Hat, Inc.
510  *
511  * The Gnome Library is free software; you can redistribute it and/or
512  * modify it under the terms of the GNU Lesser General Public License as
513  * published by the Free Software Foundation; either version 2 of the
514  * License, or (at your option) any later version.
515  *
516  * The Gnome Library is distributed in the hope that it will be useful,
517  * but WITHOUT ANY WARRANTY; without even the implied warranty of
518  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
519  * Lesser General Public License for more details.
520  *
521  * You should have received a copy of the GNU Lesser General Public
522  * License along with the Gnome Library; see the file COPYING.LIB. If not,
523  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
524  * Boston, MA 02111-1307, USA.
525  */
526 
527 #include "gunidecomp.h"
528 #include "gunicomp.h"
529 
/* Look up one entry through the page table TABLE.  A page entry at or
   above G_UNICODE_MAX_TABLE_INDEX holds the value for the whole page
   (offset by that constant); any other entry selects a row of
   cclass_data, which is then indexed by CHAR. */
#define CC_LOOKUP(Table, Page, Char) \
  (((Table)[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? ((Table)[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[(Table)[Page]][Char]))

#define CC_PART1(Page, Char) \
  CC_LOOKUP (combining_class_table_part1, Page, Char)

#define CC_PART2(Page, Char) \
  CC_LOOKUP (combining_class_table_part2, Page, Char)

/* Combining class of CHAR.  Code points up to G_UNICODE_LAST_CHAR_PART1
   use the first table; those from 0xe0000 to G_UNICODE_LAST_CHAR use the
   second, rebased to 0xe0000; every other code point yields 0. */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))
546 
/* Constants for Hangul syllable [de]composition: the first code point
   of each range (SBase, LBase, VBase, TBase) and the element counts
   used to split or build a syllable index. */
enum
{
  SBase = 0xAC00,
  LBase = 0x1100,
  VBase = 0x1161,
  TBase = 0x11A7,
  LCount = 19,
  VCount = 21,
  TCount = 28,
  NCount = VCount * TCount,
  SCount = LCount * NCount
};
557 
558 /*
559  * g_unicode_canonical_ordering:
560  * @string: a UCS-4 encoded string.
561  * @len: the maximum length of @string to use.
562  *
563  * Computes the canonical ordering of a string in-place.
564  * This rearranges decomposed characters in the string
565  * according to their combining classes. See the Unicode
566  * manual for more information.
567  **/
568 static void
569 g_unicode_canonical_ordering (gunichar *string, gsize len)
570 {
571  gsize i;
572  int swap = 1;
573 
574  while (swap)
575  {
576  int last;
577  swap = 0;
578  last = COMBINING_CLASS (string[0]);
579  for (i = 0; i < len - 1; ++i)
580  {
581  int next = COMBINING_CLASS (string[i + 1]);
582  if (next != 0 && last > next)
583  {
584  gsize j;
585  /* Percolate item leftward through string. */
586  for (j = i + 1; j > 0; --j)
587  {
588  gunichar t;
589  if (COMBINING_CLASS (string[j - 1]) <= next)
590  break;
591  t = string[j];
592  string[j] = string[j - 1];
593  string[j - 1] = t;
594  swap = 1;
595  }
596  /* We're re-entering the loop looking at the old
597  character again. */
598  next = last;
599  }
600  last = next;
601  }
602  }
603 }
604 
605 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
606  * r should be null or have sufficient space. Calling with r == NULL will
607  * only calculate the result_len; however, a buffer with space for three
608  * characters will always be big enough. */
609 static void
610 decompose_hangul (gunichar s, gunichar *r, gsize *result_len)
611 {
612  gint SIndex = s - SBase;
613  gint TIndex = SIndex % TCount;
614 
615  if (r)
616  {
617  r[0] = LBase + SIndex / NCount;
618  r[1] = VBase + (SIndex % NCount) / TCount;
619  }
620 
621  if (TIndex)
622  {
623  if (r)
624  r[2] = TBase + TIndex;
625  *result_len = 3;
626  }
627  else
628  *result_len = 2;
629 }
630 
631 /* returns a pointer to a null-terminated UTF-8 string */
632 static const gchar *
633 find_decomposition (gunichar ch, gboolean compat)
634 {
635  int start = 0;
636  int end = G_N_ELEMENTS (decomp_table);
637 
638  if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
639  {
640  while (TRUE)
641  {
642  int half = (start + end) / 2;
643  if (ch == decomp_table[half].ch)
644  {
645  int offset;
646 
647  if (compat)
648  {
649  offset = decomp_table[half].compat_offset;
650  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
651  offset = decomp_table[half].canon_offset;
652  }
653  else
654  {
655  offset = decomp_table[half].canon_offset;
656  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
657  return NULL;
658  }
659 
660  return &(decomp_expansion_string[offset]);
661  }
662  else if (half == start)
663  break;
664  else if (ch > decomp_table[half].ch)
665  start = half;
666  else
667  end = half;
668  }
669  }
670 
671  return NULL;
672 }
673 
674 /* L,V => LV and LV,T => LVT */
675 static gboolean
676 combine_hangul (gunichar a, gunichar b, gunichar *result)
677 {
678  if (a >= LBase && a < LCount + LBase && b >= VBase && b < VCount + VBase)
679  {
680  gint LIndex = a - LBase;
681  gint VIndex = b - VBase;
682 
683  *result = SBase + (LIndex * VCount + VIndex) * TCount;
684  return TRUE;
685  }
686 
687  if (a >= SBase && a < SCount + SBase && b > TBase && b < TCount + TBase)
688  {
689  gint SIndex = a - SBase;
690 
691  if ((SIndex % TCount) == 0)
692  {
693  gint TIndex = b - TBase;
694 
695  *result = a + TIndex;
696  return TRUE;
697  }
698  }
699 
700  return FALSE;
701 }
702 
/* Look up CHAR within page PAGE of the composition tables.  A page
   entry at or above G_UNICODE_MAX_TABLE_INDEX holds the value for the
   whole page (offset by that constant); any other entry selects a row
   of compose_data. */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of CHAR, or 0 when its page lies beyond
   COMPOSE_TABLE_LAST.  CHAR is parenthesized at every use so that an
   expression argument is shifted as a whole. */
#define COMPOSE_INDEX(Char) \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) \
   ? 0 \
   : CI ((Char) >> 8, (Char) & 0xff))
710 
711 static gboolean
712 combine (gunichar a, gunichar b, gunichar *result)
713 {
714  gushort index_a, index_b;
715 
716  if (combine_hangul (a, b, result))
717  return TRUE;
718 
719  index_a = COMPOSE_INDEX (a);
720 
721  if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
722  {
723  if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
724  {
725  *result =
726  compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
727  return TRUE;
728  }
729  else
730  return FALSE;
731  }
732 
733  index_b = COMPOSE_INDEX (b);
734 
735  if (index_b >= COMPOSE_SECOND_SINGLE_START)
736  {
737  if (a ==
738  compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
739  {
740  *result =
741  compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
742  return TRUE;
743  }
744  else
745  return FALSE;
746  }
747 
748  if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
749  && index_b >= COMPOSE_SECOND_START
750  && index_b < COMPOSE_SECOND_SINGLE_START)
751  {
752  gunichar res =
753  compose_array[index_a - COMPOSE_FIRST_START][index_b -
755 
756  if (res)
757  {
758  *result = res;
759  return TRUE;
760  }
761  }
762 
763  return FALSE;
764 }
765 
766 static gunichar *
767 _g_utf8_normalize_wc (const gchar *str, gssize max_len, GNormalizeMode mode)
768 {
769  gsize n_wc;
770  gunichar *wc_buffer;
771  const char *p;
772  gsize last_start;
773  gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
774  gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
775 
776  n_wc = 0;
777  p = str;
778  while ((max_len < 0 || p < str + max_len) && *p)
779  {
780  const gchar *decomp;
781  gunichar wc = g_utf8_get_char (p);
782 
783  if (wc >= SBase && wc < SBase + SCount)
784  {
785  gsize result_len;
786  decompose_hangul (wc, NULL, &result_len);
787  n_wc += result_len;
788  }
789  else
790  {
791  decomp = find_decomposition (wc, do_compat);
792 
793  if (decomp)
794  n_wc += g_utf8_strlen (decomp);
795  else
796  n_wc++;
797  }
798 
799  p = g_utf8_next_char (p);
800  }
801 
802  wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1));
803  if (!wc_buffer)
804  return NULL;
805 
806  last_start = 0;
807  n_wc = 0;
808  p = str;
809  while ((max_len < 0 || p < str + max_len) && *p)
810  {
811  gunichar wc = g_utf8_get_char (p);
812  const gchar *decomp;
813  int cc;
814  gsize old_n_wc = n_wc;
815 
816  if (wc >= SBase && wc < SBase + SCount)
817  {
818  gsize result_len;
819  decompose_hangul (wc, wc_buffer + n_wc, &result_len);
820  n_wc += result_len;
821  }
822  else
823  {
824  decomp = find_decomposition (wc, do_compat);
825 
826  if (decomp)
827  {
828  const char *pd;
829  for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
830  wc_buffer[n_wc++] = g_utf8_get_char (pd);
831  }
832  else
833  wc_buffer[n_wc++] = wc;
834  }
835 
836  if (n_wc > 0)
837  {
838  cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
839 
840  if (cc == 0)
841  {
842  g_unicode_canonical_ordering (wc_buffer + last_start,
843  n_wc - last_start);
844  last_start = old_n_wc;
845  }
846  }
847 
848  p = g_utf8_next_char (p);
849  }
850 
851  if (n_wc > 0)
852  {
853  g_unicode_canonical_ordering (wc_buffer + last_start,
854  n_wc - last_start);
855  /* dead assignment: last_start = n_wc; */
856  }
857 
858  wc_buffer[n_wc] = 0;
859 
860  /* All decomposed and reordered */
861 
862  if (do_compose && n_wc > 0)
863  {
864  gsize i, j;
865  int last_cc = 0;
866  last_start = 0;
867 
868  for (i = 0; i < n_wc; i++)
869  {
870  int cc = COMBINING_CLASS (wc_buffer[i]);
871 
872  if (i > 0 &&
873  (last_cc == 0 || last_cc != cc) &&
874  combine (wc_buffer[last_start], wc_buffer[i],
875  &wc_buffer[last_start]))
876  {
877  for (j = i + 1; j < n_wc; j++)
878  wc_buffer[j - 1] = wc_buffer[j];
879  n_wc--;
880  i--;
881 
882  if (i == last_start)
883  last_cc = 0;
884  else
885  last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
886 
887  continue;
888  }
889 
890  if (cc == 0)
891  last_start = i;
892 
893  last_cc = cc;
894  }
895  }
896 
897  wc_buffer[n_wc] = 0;
898 
899  return wc_buffer;
900 }
901 
902 /*
903  * g_utf8_normalize:
904  * @str: a UTF-8 encoded string.
905  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
906  * @mode: the type of normalization to perform.
907  *
908  * Converts a string into canonical form, standardizing
909  * such issues as whether a character with an accent
910  * is represented as a base character and combining
911  * accent or as a single precomposed character. The
912  * string has to be valid UTF-8, otherwise %NULL is
913  * returned. You should generally call g_utf8_normalize()
914  * before comparing two Unicode strings.
915  *
916  * The normalization mode %G_NORMALIZE_DEFAULT only
917  * standardizes differences that do not affect the
918  * text content, such as the above-mentioned accent
919  * representation. %G_NORMALIZE_ALL also standardizes
920  * the "compatibility" characters in Unicode, such
921  * as SUPERSCRIPT THREE to the standard forms
922  * (in this case DIGIT THREE). Formatting information
923  * may be lost but for most text operations such
924  * characters should be considered the same.
925  *
926  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
927  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
928  * but returned a result with composed forms rather
929  * than a maximally decomposed form. This is often
930  * useful if you intend to convert the string to
931  * a legacy encoding or pass it to a system with
932  * less capable Unicode handling.
933  *
934  * Return value: a newly allocated string, that is the
935  * normalized form of @str, or %NULL if @str is not
936  * valid UTF-8.
937  **/
938 static gchar *
939 g_utf8_normalize (const gchar *str, gssize len, GNormalizeMode mode)
940 {
941  gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
942  gchar *result = NULL;
943 
944  if (result_wc)
945  result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
946 
947  g_free (result_wc);
948 
949  return result;
950 }
951 
952 /* Public Libidn API starts here. */
953 
/*
 * stringprep_utf8_to_unichar:
 * @p: a pointer to a Unicode character encoded as UTF-8
 *
 * Decodes the UTF-8 sequence at @p by delegating to g_utf8_get_char().
 *
 * Return value: the decoded character, as returned by
 *               g_utf8_get_char()
 */
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  return g_utf8_get_char (p);
}
969 
/*
 * stringprep_unichar_to_utf8:
 * @c: a Unicode character code
 * @outbuf: output buffer, passed unchanged to g_unichar_to_utf8()
 *
 * Public entry point for g_unichar_to_utf8().
 *
 * Return value: whatever g_unichar_to_utf8() returns for @c
 */
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  const int nbytes = g_unichar_to_utf8 (c, outbuf);

  return nbytes;
}
986 
987 #include <unistr.h>
988 
/*
 * stringprep_utf8_to_ucs4:
 * @str: a UTF-8 encoded string
 * @len: number of bytes of @str to use, or a negative value if @str is
 *       nul-terminated
 * @items_written: forwarded to g_utf8_to_ucs4_fast()
 *
 * Checks the input with u8_check() and, if no problem is reported,
 * converts it with g_utf8_to_ucs4_fast().
 *
 * Return value: the pointer returned by g_utf8_to_ucs4_fast(), or %NULL
 *               when u8_check() reports a problem
 */
uint32_t *
stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t *items_written)
{
  /* A negative length means "measure it": use the nul terminator. */
  const size_t nbytes = (len < 0) ? strlen (str) : (size_t) len;

  if (u8_check ((const uint8_t *) str, nbytes) != NULL)
    return NULL;

  return g_utf8_to_ucs4_fast (str, len, items_written);
}
1020 
/*
 * stringprep_ucs4_to_utf8:
 * @str: a UCS-4 encoded string
 * @len: maximum number of characters of @str to use; the value is
 *       converted to the unsigned length that g_ucs4_to_utf8() takes
 * @items_read: forwarded to g_ucs4_to_utf8()
 * @items_written: forwarded to g_ucs4_to_utf8()
 *
 * Public entry point for g_ucs4_to_utf8().
 *
 * Return value: whatever g_ucs4_to_utf8() returns
 */
char *
stringprep_ucs4_to_utf8 (const uint32_t *str, ssize_t len,
                         size_t *items_read, size_t *items_written)
{
  char *utf8 = g_ucs4_to_utf8 (str, len, items_read, items_written);

  return utf8;
}
1044 
1067 char *
1068 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1069 {
1070  size_t n;
1071 
1072  if (len < 0)
1073  n = strlen (str);
1074  else
1075  n = len;
1076 
1077  if (u8_check ((const uint8_t *) str, n))
1078  return NULL;
1079 
1080  return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1081 }
1082 
1083 #include <stdio.h>
1095 uint32_t *
1096 stringprep_ucs4_nfkc_normalize (const uint32_t *str, ssize_t len)
1097 {
1098  char *p;
1099  uint32_t *result_wc;
1100 
1101  p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1102  if (!p)
1103  return NULL;
1104 
1105  result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1106  free (p);
1107 
1108  return result_wc;
1109 }
#define COMPOSE_SECOND_SINGLE_START
Definition: gunicomp.h:8
#define COMPOSE_SECOND_START
Definition: gunicomp.h:7
#define COMPOSE_FIRST_START
Definition: gunicomp.h:5
#define COMPOSE_FIRST_SINGLE_START
Definition: gunicomp.h:6
#define G_UNICODE_NOT_PRESENT_OFFSET
Definition: gunidecomp.h:16
#define g_return_val_if_fail(expr, val)
Definition: nfkc.c:53
#define SCount
Definition: nfkc.c:556
#define gssize
Definition: nfkc.c:50
#define gushort
Definition: nfkc.c:45
#define UTF8_COMPUTE(Char, Mask, Len)
Definition: nfkc.c:152
int stringprep_unichar_to_utf8(uint32_t c, char *outbuf)
Definition: nfkc.c:982
#define gunichar
Definition: nfkc.c:48
#define COMPOSE_INDEX(Char)
Definition: nfkc.c:708
#define guint
Definition: nfkc.c:44
#define g_free
Definition: nfkc.c:52
#define G_N_ELEMENTS(arr)
Definition: nfkc.c:87
#define gchar
Definition: nfkc.c:41
char * stringprep_utf8_nfkc_normalize(const char *str, ssize_t len)
Definition: nfkc.c:1068
#define LBase
Definition: nfkc.c:549
#define gint
Definition: nfkc.c:43
#define UTF8_LENGTH(Char)
Definition: nfkc.c:186
#define g_utf8_next_char(p)
Definition: nfkc.c:127
char * stringprep_ucs4_to_utf8(const uint32_t *str, ssize_t len, size_t *items_read, size_t *items_written)
Definition: nfkc.c:1039
#define TRUE
Definition: nfkc.c:84
#define FALSE
Definition: nfkc.c:80
#define G_UNLIKELY(expr)
Definition: nfkc.c:89
#define TBase
Definition: nfkc.c:551
#define UTF8_GET(Result, Chars, Count, Mask, Len)
Definition: nfkc.c:193
#define VBase
Definition: nfkc.c:550
uint32_t stringprep_utf8_to_unichar(const char *p)
Definition: nfkc.c:965
#define COMBINING_CLASS(Char)
Definition: nfkc.c:540
#define NCount
Definition: nfkc.c:555
#define guchar
Definition: nfkc.c:42
#define g_malloc
Definition: nfkc.c:51
GNormalizeMode
Definition: nfkc.c:115
@ G_NORMALIZE_DEFAULT_COMPOSE
Definition: nfkc.c:118
@ G_NORMALIZE_NFKC
Definition: nfkc.c:123
@ G_NORMALIZE_NFKD
Definition: nfkc.c:121
@ G_NORMALIZE_ALL
Definition: nfkc.c:120
@ G_NORMALIZE_NFD
Definition: nfkc.c:117
@ G_NORMALIZE_DEFAULT
Definition: nfkc.c:116
@ G_NORMALIZE_ALL_COMPOSE
Definition: nfkc.c:122
@ G_NORMALIZE_NFC
Definition: nfkc.c:119
#define SBase
Definition: nfkc.c:548
#define TCount
Definition: nfkc.c:554
uint32_t * stringprep_utf8_to_ucs4(const char *str, ssize_t len, size_t *items_written)
Definition: nfkc.c:1006
#define gsize
Definition: nfkc.c:49
uint32_t * stringprep_ucs4_nfkc_normalize(const uint32_t *str, ssize_t len)
Definition: nfkc.c:1096
#define VCount
Definition: nfkc.c:553
#define gboolean
Definition: nfkc.c:40