libidn  1.41
nfkc.c
Go to the documentation of this file.
1 /* nfkc.c --- Unicode normalization utilities.
2  Copyright (C) 2002-2022 Simon Josefsson
3 
4  This file is part of GNU Libidn.
5 
6  GNU Libidn is free software: you can redistribute it and/or
7  modify it under the terms of either:
8 
9  * the GNU Lesser General Public License as published by the Free
10  Software Foundation; either version 3 of the License, or (at
11  your option) any later version.
12 
13  or
14 
15  * the GNU General Public License as published by the Free
16  Software Foundation; either version 2 of the License, or (at
17  your option) any later version.
18 
19  or both in parallel, as here.
20 
21  GNU Libidn is distributed in the hope that it will be useful,
22  but WITHOUT ANY WARRANTY; without even the implied warranty of
23  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24  General Public License for more details.
25 
26  You should have received copies of the GNU General Public License and
27  the GNU Lesser General Public License along with this program. If
28  not, see <https://www.gnu.org/licenses/>. */
29 
30 #ifdef HAVE_CONFIG_H
31 # include "config.h"
32 #endif
33 
34 #include <stdlib.h>
35 #include <string.h>
36 
37 #include "stringprep.h"
38 
/* Hacks to make syncing with GLIB code easier: the GLIB type and
   allocator names used below are mapped onto their plain C equivalents. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define glong long
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc
#define g_free free

/* Return VAL from the enclosing function when EXPR is false.  The body
   is wrapped in do/while (0) so that the expansion is exactly one
   statement and remains correct inside an unbraced if/else. */
#define g_return_val_if_fail(expr,val) \
  do \
    { \
      if (!(expr)) \
        return (val); \
    } \
  while (0)
58 
59 /* Code from GLIB gmacros.h starts here. */
60 
61 /* GLIB - Library of useful routines for C programming
62  * Copyright (C) 1995-1997 Peter Mattis, Spencer Kimball and Josh MacDonald
63  *
64  * This library is free software; you can redistribute it and/or
65  * modify it under the terms of the GNU Lesser General Public
66  * License as published by the Free Software Foundation; either
67  * version 2 of the License, or (at your option) any later version.
68  *
69  * This library is distributed in the hope that it will be useful,
70  * but WITHOUT ANY WARRANTY; without even the implied warranty of
71  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
72  * Lesser General Public License for more details.
73  *
74  * You should have received a copy of the GNU Lesser General Public
75  * License along with this library; if not, write to the
76  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
77  * Boston, MA 02111-1307, USA.
78  */
79 
/* Boolean constants; left alone when something else already defined them. */
#if !defined (FALSE)
# define FALSE (0)
#endif

#if !defined (TRUE)
# define TRUE (!FALSE)
#endif

/* Element count of a real array (meaningless when applied to a pointer). */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof (*(arr)))

/* In this file the hint simply yields its argument unchanged. */
#define G_UNLIKELY(expr) (expr)
91 
92 /* Code from GLIB gunicode.h starts here. */
93 
94 /* gunicode.h - Unicode manipulation functions
95  *
96  * Copyright (C) 1999, 2000 Tom Tromey
97  * Copyright 2000, 2005 Red Hat, Inc.
98  *
99  * The Gnome Library is free software; you can redistribute it and/or
100  * modify it under the terms of the GNU Lesser General Public License as
101  * published by the Free Software Foundation; either version 2 of the
102  * License, or (at your option) any later version.
103  *
104  * The Gnome Library is distributed in the hope that it will be useful,
105  * but WITHOUT ANY WARRANTY; without even the implied warranty of
106  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
107  * Lesser General Public License for more details.
108  *
109  * You should have received a copy of the GNU Lesser General Public
110  * License along with the Gnome Library; see the file COPYING.LIB. If not,
111  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
112  * Boston, MA 02111-1307, USA.
113  */
114 
/* Normalization forms accepted by the normalization routines below.
   The NFD/NFC/NFKD/NFKC names are aliases of the DEFAULT/ALL names:
   the "ALL" forms also apply compatibility decompositions and the
   "COMPOSE" forms recompose after decomposing. */
typedef enum
{
  G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
}
GNormalizeMode;
127 
/* Step P past the UTF-8 sequence it points at.  The distance is looked
   up in g_utf8_skip from the first byte only.  P is evaluated twice. */
#define g_utf8_next_char(p) (&(p)[g_utf8_skip[*(const guchar *) (p)]])
129 
130 /* Code from GLIB gutf8.c starts here. */
131 
132 /* gutf8.c - Operations on UTF-8 strings.
133  *
134  * Copyright (C) 1999 Tom Tromey
135  * Copyright (C) 2000 Red Hat, Inc.
136  *
137  * This library is free software; you can redistribute it and/or
138  * modify it under the terms of the GNU Lesser General Public
139  * License as published by the Free Software Foundation; either
140  * version 2 of the License, or (at your option) any later version.
141  *
142  * This library is distributed in the hope that it will be useful,
143  * but WITHOUT ANY WARRANTY; without even the implied warranty of
144  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
145  * Lesser General Public License for more details.
146  *
147  * You should have received a copy of the GNU Lesser General Public
148  * License along with this library; if not, write to the
149  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
150  * Boston, MA 02111-1307, USA.
151  */
152 
/* Classify the UTF-8 lead byte CHAR.  On return LEN holds the length of
   the sequence it starts (1 to 6) and MASK the bits of the lead byte
   that carry payload.  When CHAR cannot start a sequence, LEN is set to
   -1 and MASK is left untouched.  CHAR is evaluated several times.

   The body is a single do/while (0) statement with parenthesized
   arguments, so the macro is safe in an unbraced if/else and with
   arbitrary expressions as arguments. */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
        { \
          (Len) = 1; \
          (Mask) = 0x7f; \
        } \
      else if (((Char) & 0xe0) == 0xc0) \
        { \
          (Len) = 2; \
          (Mask) = 0x1f; \
        } \
      else if (((Char) & 0xf0) == 0xe0) \
        { \
          (Len) = 3; \
          (Mask) = 0x0f; \
        } \
      else if (((Char) & 0xf8) == 0xf0) \
        { \
          (Len) = 4; \
          (Mask) = 0x07; \
        } \
      else if (((Char) & 0xfc) == 0xf8) \
        { \
          (Len) = 5; \
          (Mask) = 0x03; \
        } \
      else if (((Char) & 0xfe) == 0xfc) \
        { \
          (Len) = 6; \
          (Mask) = 0x01; \
        } \
      else \
        (Len) = -1; \
    } \
  while (0)
186 
/* Number of bytes (1 to 6) that the code point CHAR occupies when
   encoded as UTF-8.  CHAR may be evaluated several times. */
#define UTF8_LENGTH(Char) \
  ((Char) <= 0x7f ? 1 \
   : (Char) <= 0x7ff ? 2 \
   : (Char) <= 0xffff ? 3 \
   : (Char) <= 0x1fffff ? 4 \
   : (Char) <= 0x3ffffff ? 5 \
   : 6)
193 
/* Decode the LEN-byte UTF-8 sequence at CHARS into RESULT.  MASK selects
   the payload bits of the lead byte (see UTF8_COMPUTE) and COUNT is a
   scratch index variable.  RESULT is set to -1, and decoding stops, as
   soon as a byte that is not of the form 10xxxxxx is met where a
   continuation byte is required.

   The body is a single do/while (0) statement so that the macro can be
   used as the body of an unbraced if/else. */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
        { \
          if (((Chars)[(Count)] & 0xc0) != 0x80) \
            { \
              (Result) = -1; \
              break; \
            } \
          (Result) <<= 6; \
          (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } \
  while (0)
206 
207 static const gchar utf8_skip_data[256] = {
208  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
209  1, 1, 1, 1, 1, 1, 1,
210  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
211  1, 1, 1, 1, 1, 1, 1,
212  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
213  1, 1, 1, 1, 1, 1, 1,
214  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
215  1, 1, 1, 1, 1, 1, 1,
216  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
217  1, 1, 1, 1, 1, 1, 1,
218  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
219  1, 1, 1, 1, 1, 1, 1,
220  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
221  2, 2, 2, 2, 2, 2, 2,
222  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
223  5, 5, 5, 6, 6, 1, 1
224 };
225 
226 static const gchar *const g_utf8_skip = utf8_skip_data;
227 
228 /*
229  * g_utf8_strlen:
230  * @p: pointer to the start of a UTF-8 encoded string
231  * @max: the maximum number of bytes to examine. If @max
232  * is less than 0, then the string is assumed to be
233  * nul-terminated. If @max is 0, @p will not be examined and
234  * may be %NULL.
235  *
236  * Computes the length of the string in characters, not including
237  * the terminating nul character.
238  *
239  * Return value: the length of the string in characters
240  **/
241 static glong
242 g_utf8_strlen (const gchar * p)
243 {
244  glong len = 0;
245 
246  g_return_val_if_fail (p != NULL, 0);
247 
248  while (*p)
249  {
250  p = g_utf8_next_char (p);
251  ++len;
252  }
253 
254  return len;
255 }
256 
257 /*
258  * g_utf8_get_char:
259  * @p: a pointer to Unicode character encoded as UTF-8
260  *
261  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
262  * If @p does not point to a valid UTF-8 encoded character, results are
263  * undefined. If you are not sure that the bytes are complete
264  * valid Unicode characters, you should use g_utf8_get_char_validated()
265  * instead.
266  *
267  * Return value: the resulting character
268  **/
269 static gunichar
270 g_utf8_get_char (const gchar * p)
271 {
272  int i, mask = 0, len;
273  gunichar result;
274  unsigned char c = (unsigned char) *p;
275 
276  UTF8_COMPUTE (c, mask, len);
277  if (len == -1)
278  return (gunichar) - 1;
279  UTF8_GET (result, p, i, mask, len);
280 
281  return result;
282 }
283 
284 /*
285  * g_unichar_to_utf8:
286  * @c: a Unicode character code
287  * @outbuf: output buffer, must have at least 6 bytes of space.
288  * If %NULL, the length will be computed and returned
289  * and nothing will be written to @outbuf.
290  *
291  * Converts a single character to UTF-8.
292  *
293  * Return value: number of bytes written
294  **/
295 static int
296 g_unichar_to_utf8 (gunichar c, gchar * outbuf)
297 {
298  /* If this gets modified, also update the copy in g_string_insert_unichar() */
299  guint len = 0;
300  int first;
301  int i;
302 
303  if (c < 0x80)
304  {
305  first = 0;
306  len = 1;
307  }
308  else if (c < 0x800)
309  {
310  first = 0xc0;
311  len = 2;
312  }
313  else if (c < 0x10000)
314  {
315  first = 0xe0;
316  len = 3;
317  }
318  else if (c < 0x200000)
319  {
320  first = 0xf0;
321  len = 4;
322  }
323  else if (c < 0x4000000)
324  {
325  first = 0xf8;
326  len = 5;
327  }
328  else
329  {
330  first = 0xfc;
331  len = 6;
332  }
333 
334  if (outbuf)
335  {
336  for (i = len - 1; i > 0; --i)
337  {
338  outbuf[i] = (c & 0x3f) | 0x80;
339  c >>= 6;
340  }
341  outbuf[0] = c | first;
342  }
343 
344  return len;
345 }
346 
347 /*
348  * g_utf8_to_ucs4_fast:
349  * @str: a UTF-8 encoded string
350  * @len: the maximum length of @str to use, in bytes. If @len < 0,
351  * then the string is nul-terminated.
352  * @items_written: location to store the number of characters in the
353  * result, or %NULL.
354  *
355  * Convert a string from UTF-8 to a 32-bit fixed width
356  * representation as UCS-4, assuming valid UTF-8 input.
357  * This function is roughly twice as fast as g_utf8_to_ucs4()
358  * but does no error checking on the input. A trailing 0 character
359  * will be added to the string after the converted text.
360  *
361  * Return value: a pointer to a newly allocated UCS-4 string.
362  * This value must be freed with g_free().
363  **/
364 static gunichar *
365 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
366 {
367  gunichar *result;
368  gsize n_chars, i;
369  const gchar *p;
370 
371  g_return_val_if_fail (str != NULL, NULL);
372 
373  p = str;
374  n_chars = 0;
375  if (len < 0)
376  {
377  while (*p)
378  {
379  p = g_utf8_next_char (p);
380  ++n_chars;
381  }
382  }
383  else
384  {
385  while (p < str + len && *p)
386  {
387  p = g_utf8_next_char (p);
388  ++n_chars;
389  }
390  }
391 
392  result = g_malloc (sizeof (gunichar) * (n_chars + 1));
393  if (!result)
394  return NULL;
395 
396  p = str;
397  for (i = 0; i < n_chars; i++)
398  {
399  gunichar wc = (guchar) * p++;
400 
401  if (wc < 0x80)
402  {
403  result[i] = wc;
404  }
405  else
406  {
407  gunichar mask = 0x40;
408 
409  if (G_UNLIKELY ((wc & mask) == 0))
410  {
411  /* It's an out-of-sequence 10xxxxxxx byte.
412  * Rather than making an ugly hash of this and the next byte
413  * and overrunning the buffer, it's more useful to treat it
414  * with a replacement character */
415  result[i] = 0xfffd;
416  continue;
417  }
418 
419  do
420  {
421  wc <<= 6;
422  wc |= (guchar) (*p++) & 0x3f;
423  mask <<= 5;
424  }
425  while ((wc & mask) != 0);
426 
427  wc &= mask - 1;
428 
429  result[i] = wc;
430  }
431  }
432  result[i] = 0;
433 
434  if (items_written)
435  *items_written = i;
436 
437  return result;
438 }
439 
440 /*
441  * g_ucs4_to_utf8:
442  * @str: a UCS-4 encoded string
443  * @len: the maximum length (number of characters) of @str to use.
444  * If @len < 0, then the string is nul-terminated.
445  * @items_read: location to store number of characters read, or %NULL.
446  * @items_written: location to store number of bytes written or %NULL.
447  * The value here stored does not include the trailing 0
448  * byte.
449  * @error: location to store the error occurring, or %NULL to ignore
450  * errors. Any of the errors in #GConvertError other than
451  * %G_CONVERT_ERROR_NO_CONVERSION may occur.
452  *
453  * Convert a string from a 32-bit fixed width representation as UCS-4.
454  * to UTF-8. The result will be terminated with a 0 byte.
455  *
456  * Return value: a pointer to a newly allocated UTF-8 string.
457  * This value must be freed with g_free(). If an
458  * error occurs, %NULL will be returned and
459  * @error set. In that case, @items_read will be
460  * set to the position of the first invalid input
461  * character.
462  **/
463 static gchar *
464 g_ucs4_to_utf8 (const gunichar * str,
465  glong len, glong * items_read, glong * items_written)
466 {
467  gint result_length;
468  gchar *result = NULL;
469  gchar *p;
470  gint i;
471 
472  result_length = 0;
473  for (i = 0; len < 0 || i < len; i++)
474  {
475  if (!str[i])
476  break;
477 
478  if (str[i] >= 0x80000000)
479  goto err_out;
480 
481  result_length += UTF8_LENGTH (str[i]);
482  }
483 
484  result = g_malloc (result_length + 1);
485  if (!result)
486  return NULL;
487  p = result;
488 
489  i = 0;
490  while (p < result + result_length)
491  p += g_unichar_to_utf8 (str[i++], p);
492 
493  *p = '\0';
494 
495  if (items_written)
496  *items_written = p - result;
497 
498 err_out:
499  if (items_read)
500  *items_read = i;
501 
502  return result;
503 }
504 
505 /* Code from GLIB gunidecomp.c starts here. */
506 
507 /* decomp.c - Character decomposition.
508  *
509  * Copyright (C) 1999, 2000 Tom Tromey
510  * Copyright 2000 Red Hat, Inc.
511  *
512  * The Gnome Library is free software; you can redistribute it and/or
513  * modify it under the terms of the GNU Lesser General Public License as
514  * published by the Free Software Foundation; either version 2 of the
515  * License, or (at your option) any later version.
516  *
517  * The Gnome Library is distributed in the hope that it will be useful,
518  * but WITHOUT ANY WARRANTY; without even the implied warranty of
519  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
520  * Lesser General Public License for more details.
521  *
522  * You should have received a copy of the GNU Lesser General Public
523  * License along with the Gnome Library; see the file COPYING.LIB. If not,
524  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
525  * Boston, MA 02111-1307, USA.
526  */
527 
528 #include "gunidecomp.h"
529 #include "gunicomp.h"
530 
/* Look up a combining class through a page table.  A page entry at or
   above G_UNICODE_MAX_TABLE_INDEX encodes the class of the whole page
   directly (as the excess over that constant); any other entry selects
   a row of cclass_data that is indexed by the low byte. */
#define CC_FROM_TABLE(Table, Page, Char) \
  (((Table)[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? ((Table)[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[(Table)[Page]][Char]))

#define CC_PART1(Page, Char) \
  CC_FROM_TABLE (combining_class_table_part1, Page, Char)

#define CC_PART2(Page, Char) \
  CC_FROM_TABLE (combining_class_table_part2, Page, Char)

/* Combining class of the code point CHAR.  Part 1 of the table covers
   code points up to G_UNICODE_LAST_CHAR_PART1, part 2 covers 0xe0000
   up to G_UNICODE_LAST_CHAR; everything else has class 0.  CHAR is
   evaluated several times. */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))
547 
/* Constants for Hangul syllable [de]composition.  A syllable index is
   (L * VCount + V) * TCount + T, where T index 0 means "no trailing
   consonant"; that is why TBase sits one below the first T code point
   that combine_hangul accepts. */
#define SBase  0xAC00           /* first code point of the syllable block */
#define LBase  0x1100           /* first leading consonant (L) */
#define VBase  0x1161           /* first vowel (V) */
#define TBase  0x11A7           /* base for trailing consonants (T) */
#define LCount 19               /* number of L values */
#define VCount 21               /* number of V values */
#define TCount 28               /* number of T values, including "none" */
#define NCount (VCount * TCount)        /* syllables per L value */
#define SCount (LCount * NCount)        /* size of the syllable block */
558 
559 /*
560  * g_unicode_canonical_ordering:
561  * @string: a UCS-4 encoded string.
562  * @len: the maximum length of @string to use.
563  *
564  * Computes the canonical ordering of a string in-place.
565  * This rearranges decomposed characters in the string
566  * according to their combining classes. See the Unicode
567  * manual for more information.
568  **/
569 static void
570 g_unicode_canonical_ordering (gunichar * string, gsize len)
571 {
572  gsize i;
573  int swap = 1;
574 
575  while (swap)
576  {
577  int last;
578  swap = 0;
579  last = COMBINING_CLASS (string[0]);
580  for (i = 0; i < len - 1; ++i)
581  {
582  int next = COMBINING_CLASS (string[i + 1]);
583  if (next != 0 && last > next)
584  {
585  gsize j;
586  /* Percolate item leftward through string. */
587  for (j = i + 1; j > 0; --j)
588  {
589  gunichar t;
590  if (COMBINING_CLASS (string[j - 1]) <= next)
591  break;
592  t = string[j];
593  string[j] = string[j - 1];
594  string[j - 1] = t;
595  swap = 1;
596  }
597  /* We're re-entering the loop looking at the old
598  character again. */
599  next = last;
600  }
601  last = next;
602  }
603  }
604 }
605 
606 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
607  * r should be null or have sufficient space. Calling with r == NULL will
608  * only calculate the result_len; however, a buffer with space for three
609  * characters will always be big enough. */
610 static void
611 decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
612 {
613  gint SIndex = s - SBase;
614  gint TIndex = SIndex % TCount;
615 
616  if (r)
617  {
618  r[0] = LBase + SIndex / NCount;
619  r[1] = VBase + (SIndex % NCount) / TCount;
620  }
621 
622  if (TIndex)
623  {
624  if (r)
625  r[2] = TBase + TIndex;
626  *result_len = 3;
627  }
628  else
629  *result_len = 2;
630 }
631 
632 /* returns a pointer to a null-terminated UTF-8 string */
633 static const gchar *
634 find_decomposition (gunichar ch, gboolean compat)
635 {
636  int start = 0;
637  int end = G_N_ELEMENTS (decomp_table);
638 
639  if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
640  {
641  while (TRUE)
642  {
643  int half = (start + end) / 2;
644  if (ch == decomp_table[half].ch)
645  {
646  int offset;
647 
648  if (compat)
649  {
650  offset = decomp_table[half].compat_offset;
651  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
652  offset = decomp_table[half].canon_offset;
653  }
654  else
655  {
656  offset = decomp_table[half].canon_offset;
657  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
658  return NULL;
659  }
660 
661  return &(decomp_expansion_string[offset]);
662  }
663  else if (half == start)
664  break;
665  else if (ch > decomp_table[half].ch)
666  start = half;
667  else
668  end = half;
669  }
670  }
671 
672  return NULL;
673 }
674 
675 /* L,V => LV and LV,T => LVT */
676 static gboolean
677 combine_hangul (gunichar a, gunichar b, gunichar * result)
678 {
679  if (a >= LBase && a < LCount + LBase && b >= VBase && b < VCount + VBase)
680  {
681  gint LIndex = a - LBase;
682  gint VIndex = b - VBase;
683 
684  *result = SBase + (LIndex * VCount + VIndex) * TCount;
685  return TRUE;
686  }
687 
688  if (a >= SBase && a < SCount + SBase && b > TBase && b < TCount + TBase)
689  {
690  gint SIndex = a - SBase;
691 
692  if ((SIndex % TCount) == 0)
693  {
694  gint TIndex = b - TBase;
695 
696  *result = a + TIndex;
697  return TRUE;
698  }
699  }
700 
701  return FALSE;
702 }
703 
/* Look up the composition index for the low byte CHAR within PAGE.  A
   page entry at or above G_UNICODE_MAX_TABLE_INDEX encodes the index
   for the whole page directly; any other entry selects a row of
   compose_data. */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of the code point CHAR, or 0 when its page lies
   beyond COMPOSE_TABLE_LAST.  Every use of CHAR is parenthesized so
   that any expression can be passed; it is evaluated several times. */
#define COMPOSE_INDEX(Char) \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) \
   ? 0 : CI ((Char) >> 8, (Char) & 0xff))
711 
712 static gboolean
713 combine (gunichar a, gunichar b, gunichar * result)
714 {
715  gushort index_a, index_b;
716 
717  if (combine_hangul (a, b, result))
718  return TRUE;
719 
720  index_a = COMPOSE_INDEX (a);
721 
722  if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
723  {
724  if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
725  {
726  *result =
727  compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
728  return TRUE;
729  }
730  else
731  return FALSE;
732  }
733 
734  index_b = COMPOSE_INDEX (b);
735 
736  if (index_b >= COMPOSE_SECOND_SINGLE_START)
737  {
738  if (a ==
739  compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
740  {
741  *result =
742  compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
743  return TRUE;
744  }
745  else
746  return FALSE;
747  }
748 
749  if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
750  && index_b >= COMPOSE_SECOND_START
751  && index_b < COMPOSE_SECOND_SINGLE_START)
752  {
753  gunichar res =
754  compose_array[index_a - COMPOSE_FIRST_START][index_b -
756 
757  if (res)
758  {
759  *result = res;
760  return TRUE;
761  }
762  }
763 
764  return FALSE;
765 }
766 
767 static gunichar *
768 _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
769 {
770  gsize n_wc;
771  gunichar *wc_buffer;
772  const char *p;
773  gsize last_start;
774  gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
775  gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
776 
777  n_wc = 0;
778  p = str;
779  while ((max_len < 0 || p < str + max_len) && *p)
780  {
781  const gchar *decomp;
782  gunichar wc = g_utf8_get_char (p);
783 
784  if (wc >= SBase && wc < SBase + SCount)
785  {
786  gsize result_len;
787  decompose_hangul (wc, NULL, &result_len);
788  n_wc += result_len;
789  }
790  else
791  {
792  decomp = find_decomposition (wc, do_compat);
793 
794  if (decomp)
795  n_wc += g_utf8_strlen (decomp);
796  else
797  n_wc++;
798  }
799 
800  p = g_utf8_next_char (p);
801  }
802 
803  wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1));
804  if (!wc_buffer)
805  return NULL;
806 
807  last_start = 0;
808  n_wc = 0;
809  p = str;
810  while ((max_len < 0 || p < str + max_len) && *p)
811  {
812  gunichar wc = g_utf8_get_char (p);
813  const gchar *decomp;
814  int cc;
815  gsize old_n_wc = n_wc;
816 
817  if (wc >= SBase && wc < SBase + SCount)
818  {
819  gsize result_len;
820  decompose_hangul (wc, wc_buffer + n_wc, &result_len);
821  n_wc += result_len;
822  }
823  else
824  {
825  decomp = find_decomposition (wc, do_compat);
826 
827  if (decomp)
828  {
829  const char *pd;
830  for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
831  wc_buffer[n_wc++] = g_utf8_get_char (pd);
832  }
833  else
834  wc_buffer[n_wc++] = wc;
835  }
836 
837  if (n_wc > 0)
838  {
839  cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
840 
841  if (cc == 0)
842  {
843  g_unicode_canonical_ordering (wc_buffer + last_start,
844  n_wc - last_start);
845  last_start = old_n_wc;
846  }
847  }
848 
849  p = g_utf8_next_char (p);
850  }
851 
852  if (n_wc > 0)
853  {
854  g_unicode_canonical_ordering (wc_buffer + last_start,
855  n_wc - last_start);
856  /* dead assignment: last_start = n_wc; */
857  }
858 
859  wc_buffer[n_wc] = 0;
860 
861  /* All decomposed and reordered */
862 
863  if (do_compose && n_wc > 0)
864  {
865  gsize i, j;
866  int last_cc = 0;
867  last_start = 0;
868 
869  for (i = 0; i < n_wc; i++)
870  {
871  int cc = COMBINING_CLASS (wc_buffer[i]);
872 
873  if (i > 0 &&
874  (last_cc == 0 || last_cc != cc) &&
875  combine (wc_buffer[last_start], wc_buffer[i],
876  &wc_buffer[last_start]))
877  {
878  for (j = i + 1; j < n_wc; j++)
879  wc_buffer[j - 1] = wc_buffer[j];
880  n_wc--;
881  i--;
882 
883  if (i == last_start)
884  last_cc = 0;
885  else
886  last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
887 
888  continue;
889  }
890 
891  if (cc == 0)
892  last_start = i;
893 
894  last_cc = cc;
895  }
896  }
897 
898  wc_buffer[n_wc] = 0;
899 
900  return wc_buffer;
901 }
902 
903 /*
904  * g_utf8_normalize:
905  * @str: a UTF-8 encoded string.
906  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
907  * @mode: the type of normalization to perform.
908  *
909  * Converts a string into canonical form, standardizing
910  * such issues as whether a character with an accent
911  * is represented as a base character and combining
912  * accent or as a single precomposed character. The
913  * string has to be valid UTF-8, otherwise %NULL is
914  * returned. You should generally call g_utf8_normalize()
915  * before comparing two Unicode strings.
916  *
917  * The normalization mode %G_NORMALIZE_DEFAULT only
918  * standardizes differences that do not affect the
919  * text content, such as the above-mentioned accent
920  * representation. %G_NORMALIZE_ALL also standardizes
921  * the "compatibility" characters in Unicode, such
922  * as SUPERSCRIPT THREE to the standard forms
923  * (in this case DIGIT THREE). Formatting information
924  * may be lost but for most text operations such
925  * characters should be considered the same.
926  *
927  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
928  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
929  * but returned a result with composed forms rather
930  * than a maximally decomposed form. This is often
931  * useful if you intend to convert the string to
932  * a legacy encoding or pass it to a system with
933  * less capable Unicode handling.
934  *
935  * Return value: a newly allocated string, that is the
936  * normalized form of @str, or %NULL if @str is not
937  * valid UTF-8.
938  **/
939 static gchar *
940 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
941 {
942  gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
943  gchar *result = NULL;
944 
945  if (result_wc)
946  result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
947 
948  g_free (result_wc);
949 
950  return result;
951 }
952 
953 /* Public Libidn API starts here. */
954 
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to a Unicode character encoded as UTF-8
 *
 * Converts the UTF-8 sequence starting at @p to a Unicode character.
 * The sequence is not validated beyond what the decoder needs.
 *
 * Return value: the resulting character, or (uint32_t) -1 when the
 * first byte cannot start a sequence or a following byte is not a
 * continuation byte
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  return g_utf8_get_char (p);
}
970 
/**
 * stringprep_unichar_to_utf8:
 * @c: a Unicode character code
 * @outbuf: output buffer with at least 6 bytes of space, or %NULL to
 *          only compute the length
 *
 * Converts a single character to UTF-8.
 *
 * Return value: the number of bytes the encoding takes
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  const int n_bytes = g_unichar_to_utf8 (c, outbuf);

  return n_bytes;
}
987 
988 #include <unistr.h>
989 
1006 uint32_t *
1007 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t *items_written)
1008 {
1009  size_t n;
1010 
1011  if (len < 0)
1012  n = strlen (str);
1013  else
1014  n = len;
1015 
1016  if (u8_check ((const uint8_t *) str, n))
1017  return NULL;
1018 
1019  return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
1020 }
1021 
1039 char *
1040 stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1041  size_t *items_read, size_t *items_written)
1042 {
1043  return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1044  (glong *) items_written);
1045 }
1046 
1069 char *
1070 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1071 {
1072  size_t n;
1073 
1074  if (len < 0)
1075  n = strlen (str);
1076  else
1077  n = len;
1078 
1079  if (u8_check ((const uint8_t *) str, n))
1080  return NULL;
1081 
1082  return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1083 }
1084 
1085 #include <stdio.h>
1097 uint32_t *
1098 stringprep_ucs4_nfkc_normalize (const uint32_t * str, ssize_t len)
1099 {
1100  char *p;
1101  uint32_t *result_wc;
1102 
1103  p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1104  if (!p)
1105  return NULL;
1106 
1107  result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1108  free (p);
1109 
1110  return result_wc;
1111 }
#define COMPOSE_SECOND_SINGLE_START
Definition: gunicomp.h:8
#define COMPOSE_SECOND_START
Definition: gunicomp.h:7
#define COMPOSE_FIRST_START
Definition: gunicomp.h:5
#define COMPOSE_FIRST_SINGLE_START
Definition: gunicomp.h:6
#define G_UNICODE_NOT_PRESENT_OFFSET
Definition: gunidecomp.h:16
#define g_return_val_if_fail(expr, val)
Definition: nfkc.c:54
#define SCount
Definition: nfkc.c:557
#define glong
Definition: nfkc.c:43
#define gssize
Definition: nfkc.c:51
#define gushort
Definition: nfkc.c:46
#define UTF8_COMPUTE(Char, Mask, Len)
Definition: nfkc.c:153
int stringprep_unichar_to_utf8(uint32_t c, char *outbuf)
Definition: nfkc.c:983
#define gunichar
Definition: nfkc.c:49
#define COMPOSE_INDEX(Char)
Definition: nfkc.c:709
#define guint
Definition: nfkc.c:45
#define g_free
Definition: nfkc.c:53
#define G_N_ELEMENTS(arr)
Definition: nfkc.c:88
#define gchar
Definition: nfkc.c:41
char * stringprep_utf8_nfkc_normalize(const char *str, ssize_t len)
Definition: nfkc.c:1070
#define LBase
Definition: nfkc.c:550
#define gint
Definition: nfkc.c:44
#define UTF8_LENGTH(Char)
Definition: nfkc.c:187
#define g_utf8_next_char(p)
Definition: nfkc.c:128
char * stringprep_ucs4_to_utf8(const uint32_t *str, ssize_t len, size_t *items_read, size_t *items_written)
Definition: nfkc.c:1040
#define TRUE
Definition: nfkc.c:85
#define FALSE
Definition: nfkc.c:81
#define G_UNLIKELY(expr)
Definition: nfkc.c:90
#define TBase
Definition: nfkc.c:552
#define UTF8_GET(Result, Chars, Count, Mask, Len)
Definition: nfkc.c:194
#define VBase
Definition: nfkc.c:551
uint32_t stringprep_utf8_to_unichar(const char *p)
Definition: nfkc.c:966
#define COMBINING_CLASS(Char)
Definition: nfkc.c:541
#define NCount
Definition: nfkc.c:556
#define guchar
Definition: nfkc.c:42
#define g_malloc
Definition: nfkc.c:52
GNormalizeMode
Definition: nfkc.c:116
@ G_NORMALIZE_DEFAULT_COMPOSE
Definition: nfkc.c:119
@ G_NORMALIZE_NFKC
Definition: nfkc.c:124
@ G_NORMALIZE_NFKD
Definition: nfkc.c:122
@ G_NORMALIZE_ALL
Definition: nfkc.c:121
@ G_NORMALIZE_NFD
Definition: nfkc.c:118
@ G_NORMALIZE_DEFAULT
Definition: nfkc.c:117
@ G_NORMALIZE_ALL_COMPOSE
Definition: nfkc.c:123
@ G_NORMALIZE_NFC
Definition: nfkc.c:120
#define SBase
Definition: nfkc.c:549
#define TCount
Definition: nfkc.c:555
uint32_t * stringprep_utf8_to_ucs4(const char *str, ssize_t len, size_t *items_written)
Definition: nfkc.c:1007
#define gsize
Definition: nfkc.c:50
uint32_t * stringprep_ucs4_nfkc_normalize(const uint32_t *str, ssize_t len)
Definition: nfkc.c:1098
#define VCount
Definition: nfkc.c:554
#define gboolean
Definition: nfkc.c:40