libidn  1.29
nfkc.c
Go to the documentation of this file.
1 /* nfkc.c --- Unicode normalization utilities.
2  Copyright (C) 2002-2014 Simon Josefsson
3 
4  This file is part of GNU Libidn.
5 
6  GNU Libidn is free software: you can redistribute it and/or
7  modify it under the terms of either:
8 
9  * the GNU Lesser General Public License as published by the Free
10  Software Foundation; either version 3 of the License, or (at
11  your option) any later version.
12 
13  or
14 
15  * the GNU General Public License as published by the Free
16  Software Foundation; either version 2 of the License, or (at
17  your option) any later version.
18 
19  or both in parallel, as here.
20 
21  GNU Libidn is distributed in the hope that it will be useful,
22  but WITHOUT ANY WARRANTY; without even the implied warranty of
23  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24  General Public License for more details.
25 
26  You should have received copies of the GNU General Public License and
27  the GNU Lesser General Public License along with this program. If
28  not, see <http://www.gnu.org/licenses/>. */
29 
30 #ifdef HAVE_CONFIG_H
31 #include "config.h"
32 #endif
33 
34 #include <stdlib.h>
35 #include <string.h>
36 
37 #include "stringprep.h"
38 
/* Hacks to make syncing with GLIB code easier: the imported GLib code
   below keeps its own type and allocator names, which are mapped here
   onto the ISO C / POSIX equivalents. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define glong long
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc
#define g_free free

/* Return VAL from the enclosing function when EXPR is false.  The
   do/while (0) wrapper makes the expansion a single statement, so the
   macro followed by a semicolon can be used anywhere a statement can,
   including as the body of an `if' that has an `else' branch. */
#define g_return_val_if_fail(expr,val) \
  do \
    { \
      if (!(expr)) \
        return (val); \
    } \
  while (0)
58 
59 /* Code from GLIB gmacros.h starts here. */
60 
61 /* GLIB - Library of useful routines for C programming
62  * Copyright (C) 1995-1997 Peter Mattis, Spencer Kimball and Josh MacDonald
63  *
64  * This library is free software; you can redistribute it and/or
65  * modify it under the terms of the GNU Lesser General Public
66  * License as published by the Free Software Foundation; either
67  * version 2 of the License, or (at your option) any later version.
68  *
69  * This library is distributed in the hope that it will be useful,
70  * but WITHOUT ANY WARRANTY; without even the implied warranty of
71  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
72  * Lesser General Public License for more details.
73  *
74  * You should have received a copy of the GNU Lesser General Public
75  * License along with this library; if not, write to the
76  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
77  * Boston, MA 02111-1307, USA.
78  */
79 
/* Boolean constants, defined only if nothing earlier provided them. */
#if !defined (FALSE)
#define FALSE (0)
#endif

#if !defined (TRUE)
#define TRUE (!FALSE)
#endif

/* Element count of ARR; ARR must be a real array, not a pointer. */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof (*(arr)))

/* Here this simply yields EXPR unchanged. */
#define G_UNLIKELY(expr) (expr)
91 
92 /* Code from GLIB gunicode.h starts here. */
93 
94 /* gunicode.h - Unicode manipulation functions
95  *
96  * Copyright (C) 1999, 2000 Tom Tromey
97  * Copyright 2000, 2005 Red Hat, Inc.
98  *
99  * The Gnome Library is free software; you can redistribute it and/or
100  * modify it under the terms of the GNU Lesser General Public License as
101  * published by the Free Software Foundation; either version 2 of the
102  * License, or (at your option) any later version.
103  *
104  * The Gnome Library is distributed in the hope that it will be useful,
105  * but WITHOUT ANY WARRANTY; without even the implied warranty of
106  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
107  * Lesser General Public License for more details.
108  *
109  * You should have received a copy of the GNU Lesser General Public
110  * License along with the Gnome Library; see the file COPYING.LIB. If not,
111  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
112  * Boston, MA 02111-1307, USA.
113  */
114 
115 typedef enum
116 {
125 }
127 
/* Pointer to the character following the one at P: the byte at P is
   used as an index into g_utf8_skip to find how far to advance. */
#define g_utf8_next_char(p) \
  ((p) + g_utf8_skip[*(const guchar *) (p)])
129 
130 /* Code from GLIB gutf8.c starts here. */
131 
132 /* gutf8.c - Operations on UTF-8 strings.
133  *
134  * Copyright (C) 1999 Tom Tromey
135  * Copyright (C) 2000 Red Hat, Inc.
136  *
137  * This library is free software; you can redistribute it and/or
138  * modify it under the terms of the GNU Lesser General Public
139  * License as published by the Free Software Foundation; either
140  * version 2 of the License, or (at your option) any later version.
141  *
142  * This library is distributed in the hope that it will be useful,
143  * but WITHOUT ANY WARRANTY; without even the implied warranty of
144  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
145  * Lesser General Public License for more details.
146  *
147  * You should have received a copy of the GNU Lesser General Public
148  * License along with this library; if not, write to the
149  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
150  * Boston, MA 02111-1307, USA.
151  */
152 
/* Classify the lead byte CHAR of a UTF-8 sequence.  Sets LEN to the
   sequence length (1..6) and MASK to the bit mask that extracts the
   payload bits of the lead byte.  LEN is set to -1, and MASK is left
   untouched, when CHAR cannot start a sequence.

   Wrapped in do/while (0) so that the macro followed by a semicolon is
   exactly one statement. */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
        { \
          (Len) = 1; \
          (Mask) = 0x7f; \
        } \
      else if (((Char) & 0xe0) == 0xc0) \
        { \
          (Len) = 2; \
          (Mask) = 0x1f; \
        } \
      else if (((Char) & 0xf0) == 0xe0) \
        { \
          (Len) = 3; \
          (Mask) = 0x0f; \
        } \
      else if (((Char) & 0xf8) == 0xf0) \
        { \
          (Len) = 4; \
          (Mask) = 0x07; \
        } \
      else if (((Char) & 0xfc) == 0xf8) \
        { \
          (Len) = 5; \
          (Mask) = 0x03; \
        } \
      else if (((Char) & 0xfe) == 0xfc) \
        { \
          (Len) = 6; \
          (Mask) = 0x01; \
        } \
      else \
        (Len) = -1; \
    } \
  while (0)

/* Number of bytes needed to encode the code point CHAR. */
#define UTF8_LENGTH(Char) \
  ((Char) < 0x80 ? 1 : \
   ((Char) < 0x800 ? 2 : \
    ((Char) < 0x10000 ? 3 : \
     ((Char) < 0x200000 ? 4 : \
      ((Char) < 0x4000000 ? 5 : 6)))))

/* Decode the LEN-byte sequence starting at CHARS into RESULT, using
   MASK for the lead byte and COUNT as the loop index.  If a byte after
   the first is not of the form 10xxxxxx, RESULT is set to -1 and
   decoding stops at that byte.

   Wrapped in do/while (0) for the same reason as UTF8_COMPUTE; the
   `break' below leaves the inner `for', not the wrapper. */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
        { \
          if (((Chars)[(Count)] & 0xc0) != 0x80) \
            { \
              (Result) = -1; \
              break; \
            } \
          (Result) <<= 6; \
          (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } \
  while (0)
206 
207 static const gchar utf8_skip_data[256] = {
208  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
209  1, 1, 1, 1, 1, 1, 1,
210  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
211  1, 1, 1, 1, 1, 1, 1,
212  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
213  1, 1, 1, 1, 1, 1, 1,
214  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
215  1, 1, 1, 1, 1, 1, 1,
216  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
217  1, 1, 1, 1, 1, 1, 1,
218  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
219  1, 1, 1, 1, 1, 1, 1,
220  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
221  2, 2, 2, 2, 2, 2, 2,
222  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
223  5, 5, 5, 6, 6, 1, 1
224 };
225 
226 static const gchar *const g_utf8_skip = utf8_skip_data;
227 
228 /*
229  * g_utf8_strlen:
230  * @p: pointer to the start of a UTF-8 encoded string
231  * @max: the maximum number of bytes to examine. If @max
232  * is less than 0, then the string is assumed to be
233  * nul-terminated. If @max is 0, @p will not be examined and
234  * may be %NULL.
235  *
236  * Computes the length of the string in characters, not including
237  * the terminating nul character.
238  *
239  * Return value: the length of the string in characters
240  **/
241 static glong
242 g_utf8_strlen (const gchar * p, gssize max)
243 {
244  glong len = 0;
245  const gchar *start = p;
246  g_return_val_if_fail (p != NULL || max == 0, 0);
247 
248  if (max < 0)
249  {
250  while (*p)
251  {
252  p = g_utf8_next_char (p);
253  ++len;
254  }
255  }
256  else
257  {
258  if (max == 0 || !*p)
259  return 0;
260 
261  p = g_utf8_next_char (p);
262 
263  while (p - start < max && *p)
264  {
265  ++len;
266  p = g_utf8_next_char (p);
267  }
268 
269  /* only do the last len increment if we got a complete
270  * char (don't count partial chars)
271  */
272  if (p - start <= max)
273  ++len;
274  }
275 
276  return len;
277 }
278 
279 /*
280  * g_utf8_get_char:
281  * @p: a pointer to Unicode character encoded as UTF-8
282  *
283  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
284  * If @p does not point to a valid UTF-8 encoded character, results are
285  * undefined. If you are not sure that the bytes are complete
286  * valid Unicode characters, you should use g_utf8_get_char_validated()
287  * instead.
288  *
289  * Return value: the resulting character
290  **/
291 static gunichar
292 g_utf8_get_char (const gchar * p)
293 {
294  int i, mask = 0, len;
295  gunichar result;
296  unsigned char c = (unsigned char) *p;
297 
298  UTF8_COMPUTE (c, mask, len);
299  if (len == -1)
300  return (gunichar) - 1;
301  UTF8_GET (result, p, i, mask, len);
302 
303  return result;
304 }
305 
306 /*
307  * g_unichar_to_utf8:
308  * @c: a Unicode character code
309  * @outbuf: output buffer, must have at least 6 bytes of space.
310  * If %NULL, the length will be computed and returned
311  * and nothing will be written to @outbuf.
312  *
313  * Converts a single character to UTF-8.
314  *
315  * Return value: number of bytes written
316  **/
317 static int
318 g_unichar_to_utf8 (gunichar c, gchar * outbuf)
319 {
320  /* If this gets modified, also update the copy in g_string_insert_unichar() */
321  guint len = 0;
322  int first;
323  int i;
324 
325  if (c < 0x80)
326  {
327  first = 0;
328  len = 1;
329  }
330  else if (c < 0x800)
331  {
332  first = 0xc0;
333  len = 2;
334  }
335  else if (c < 0x10000)
336  {
337  first = 0xe0;
338  len = 3;
339  }
340  else if (c < 0x200000)
341  {
342  first = 0xf0;
343  len = 4;
344  }
345  else if (c < 0x4000000)
346  {
347  first = 0xf8;
348  len = 5;
349  }
350  else
351  {
352  first = 0xfc;
353  len = 6;
354  }
355 
356  if (outbuf)
357  {
358  for (i = len - 1; i > 0; --i)
359  {
360  outbuf[i] = (c & 0x3f) | 0x80;
361  c >>= 6;
362  }
363  outbuf[0] = c | first;
364  }
365 
366  return len;
367 }
368 
369 /*
370  * g_utf8_to_ucs4_fast:
371  * @str: a UTF-8 encoded string
372  * @len: the maximum length of @str to use, in bytes. If @len < 0,
373  * then the string is nul-terminated.
374  * @items_written: location to store the number of characters in the
375  * result, or %NULL.
376  *
377  * Convert a string from UTF-8 to a 32-bit fixed width
378  * representation as UCS-4, assuming valid UTF-8 input.
379  * This function is roughly twice as fast as g_utf8_to_ucs4()
380  * but does no error checking on the input. A trailing 0 character
381  * will be added to the string after the converted text.
382  *
383  * Return value: a pointer to a newly allocated UCS-4 string.
384  * This value must be freed with g_free().
385  **/
386 static gunichar *
387 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
388 {
389  gunichar *result;
390  gsize n_chars, i;
391  const gchar *p;
392 
393  g_return_val_if_fail (str != NULL, NULL);
394 
395  p = str;
396  n_chars = 0;
397  if (len < 0)
398  {
399  while (*p)
400  {
401  p = g_utf8_next_char (p);
402  ++n_chars;
403  }
404  }
405  else
406  {
407  while (p < str + len && *p)
408  {
409  p = g_utf8_next_char (p);
410  ++n_chars;
411  }
412  }
413 
414  result = g_malloc (sizeof (gunichar) * (n_chars + 1));
415  if (!result)
416  return NULL;
417 
418  p = str;
419  for (i = 0; i < n_chars; i++)
420  {
421  gunichar wc = (guchar) * p++;
422 
423  if (wc < 0x80)
424  {
425  result[i] = wc;
426  }
427  else
428  {
429  gunichar mask = 0x40;
430 
431  if (G_UNLIKELY ((wc & mask) == 0))
432  {
433  /* It's an out-of-sequence 10xxxxxxx byte.
434  * Rather than making an ugly hash of this and the next byte
435  * and overrunning the buffer, it's more useful to treat it
436  * with a replacement character */
437  result[i] = 0xfffd;
438  continue;
439  }
440 
441  do
442  {
443  wc <<= 6;
444  wc |= (guchar) (*p++) & 0x3f;
445  mask <<= 5;
446  }
447  while ((wc & mask) != 0);
448 
449  wc &= mask - 1;
450 
451  result[i] = wc;
452  }
453  }
454  result[i] = 0;
455 
456  if (items_written)
457  *items_written = i;
458 
459  return result;
460 }
461 
462 /*
463  * g_ucs4_to_utf8:
464  * @str: a UCS-4 encoded string
465  * @len: the maximum length (number of characters) of @str to use.
466  * If @len < 0, then the string is nul-terminated.
467  * @items_read: location to store number of characters read, or %NULL.
468  * @items_written: location to store number of bytes written or %NULL.
469  * The value here stored does not include the trailing 0
470  * byte.
471  * @error: location to store the error occurring, or %NULL to ignore
472  * errors. Any of the errors in #GConvertError other than
473  * %G_CONVERT_ERROR_NO_CONVERSION may occur.
474  *
475  * Convert a string from a 32-bit fixed width representation as UCS-4.
476  * to UTF-8. The result will be terminated with a 0 byte.
477  *
478  * Return value: a pointer to a newly allocated UTF-8 string.
479  * This value must be freed with g_free(). If an
480  * error occurs, %NULL will be returned and
481  * @error set. In that case, @items_read will be
482  * set to the position of the first invalid input
483  * character.
484  **/
485 static gchar *
486 g_ucs4_to_utf8 (const gunichar * str,
487  glong len,
488  glong * items_read, glong * items_written)
489 {
490  gint result_length;
491  gchar *result = NULL;
492  gchar *p;
493  gint i;
494 
495  result_length = 0;
496  for (i = 0; len < 0 || i < len; i++)
497  {
498  if (!str[i])
499  break;
500 
501  if (str[i] >= 0x80000000)
502  goto err_out;
503 
504  result_length += UTF8_LENGTH (str[i]);
505  }
506 
507  result = g_malloc (result_length + 1);
508  if (!result)
509  return NULL;
510  p = result;
511 
512  i = 0;
513  while (p < result + result_length)
514  p += g_unichar_to_utf8 (str[i++], p);
515 
516  *p = '\0';
517 
518  if (items_written)
519  *items_written = p - result;
520 
521 err_out:
522  if (items_read)
523  *items_read = i;
524 
525  return result;
526 }
527 
528 /* Code from GLIB gunidecomp.c starts here. */
529 
530 /* decomp.c - Character decomposition.
531  *
532  * Copyright (C) 1999, 2000 Tom Tromey
533  * Copyright 2000 Red Hat, Inc.
534  *
535  * The Gnome Library is free software; you can redistribute it and/or
536  * modify it under the terms of the GNU Lesser General Public License as
537  * published by the Free Software Foundation; either version 2 of the
538  * License, or (at your option) any later version.
539  *
540  * The Gnome Library is distributed in the hope that it will be useful,
541  * but WITHOUT ANY WARRANTY; without even the implied warranty of
542  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
543  * Lesser General Public License for more details.
544  *
545  * You should have received a copy of the GNU Lesser General Public
546  * License along with the Gnome Library; see the file COPYING.LIB. If not,
547  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
548  * Boston, MA 02111-1307, USA.
549  */
550 
551 #include "gunidecomp.h"
552 #include "gunicomp.h"
553 
/* Look up entry CHAR of page PAGE in the first combining-class table.
   A page entry at or above G_UNICODE_MAX_TABLE_INDEX encodes one value
   for the whole page; anything else is a row index into cclass_data. */
#define CC_PART1(Page, Char) \
  ((combining_class_table_part1[(Page)] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part1[(Page)] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part1[(Page)]][(Char)]))

/* Same lookup for the second table. */
#define CC_PART2(Page, Char) \
  ((combining_class_table_part2[(Page)] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part2[(Page)] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part2[(Page)]][(Char)]))

/* Combining class of CHAR: part 1 covers everything up to
   G_UNICODE_LAST_CHAR_PART1, part 2 covers 0xe0000 up to
   G_UNICODE_LAST_CHAR, and every other value yields 0. */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))

/* constants for hangul syllable [de]composition */
enum
{
  SBase = 0xAC00,
  LBase = 0x1100,
  VBase = 0x1161,
  TBase = 0x11A7,
  LCount = 19,
  VCount = 21,
  TCount = 28,
  NCount = VCount * TCount,
  SCount = LCount * NCount
};
581 
582 /*
583  * g_unicode_canonical_ordering:
584  * @string: a UCS-4 encoded string.
585  * @len: the maximum length of @string to use.
586  *
587  * Computes the canonical ordering of a string in-place.
588  * This rearranges decomposed characters in the string
589  * according to their combining classes. See the Unicode
590  * manual for more information.
591  **/
592 static void
593 g_unicode_canonical_ordering (gunichar * string, gsize len)
594 {
595  gsize i;
596  int swap = 1;
597 
598  while (swap)
599  {
600  int last;
601  swap = 0;
602  last = COMBINING_CLASS (string[0]);
603  for (i = 0; i < len - 1; ++i)
604  {
605  int next = COMBINING_CLASS (string[i + 1]);
606  if (next != 0 && last > next)
607  {
608  gsize j;
609  /* Percolate item leftward through string. */
610  for (j = i + 1; j > 0; --j)
611  {
612  gunichar t;
613  if (COMBINING_CLASS (string[j - 1]) <= next)
614  break;
615  t = string[j];
616  string[j] = string[j - 1];
617  string[j - 1] = t;
618  swap = 1;
619  }
620  /* We're re-entering the loop looking at the old
621  character again. */
622  next = last;
623  }
624  last = next;
625  }
626  }
627 }
628 
629 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
630  * r should be null or have sufficient space. Calling with r == NULL will
631  * only calculate the result_len; however, a buffer with space for three
632  * characters will always be big enough. */
633 static void
634 decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
635 {
636  gint SIndex = s - SBase;
637  gint TIndex = SIndex % TCount;
638 
639  if (r)
640  {
641  r[0] = LBase + SIndex / NCount;
642  r[1] = VBase + (SIndex % NCount) / TCount;
643  }
644 
645  if (TIndex)
646  {
647  if (r)
648  r[2] = TBase + TIndex;
649  *result_len = 3;
650  }
651  else
652  *result_len = 2;
653 }
654 
655 /* returns a pointer to a null-terminated UTF-8 string */
656 static const gchar *
657 find_decomposition (gunichar ch, gboolean compat)
658 {
659  int start = 0;
660  int end = G_N_ELEMENTS (decomp_table);
661 
662  if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
663  {
664  while (TRUE)
665  {
666  int half = (start + end) / 2;
667  if (ch == decomp_table[half].ch)
668  {
669  int offset;
670 
671  if (compat)
672  {
673  offset = decomp_table[half].compat_offset;
674  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
675  offset = decomp_table[half].canon_offset;
676  }
677  else
678  {
679  offset = decomp_table[half].canon_offset;
680  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
681  return NULL;
682  }
683 
684  return &(decomp_expansion_string[offset]);
685  }
686  else if (half == start)
687  break;
688  else if (ch > decomp_table[half].ch)
689  start = half;
690  else
691  end = half;
692  }
693  }
694 
695  return NULL;
696 }
697 
698 /* L,V => LV and LV,T => LVT */
699 static gboolean
700 combine_hangul (gunichar a, gunichar b, gunichar * result)
701 {
702  gint LIndex = a - LBase;
703  gint SIndex = a - SBase;
704 
705  gint VIndex = b - VBase;
706  gint TIndex = b - TBase;
707 
708  if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
709  {
710  *result = SBase + (LIndex * VCount + VIndex) * TCount;
711  return TRUE;
712  }
713  else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
714  && 0 < TIndex && TIndex < TCount)
715  {
716  *result = a + TIndex;
717  return TRUE;
718  }
719 
720  return FALSE;
721 }
722 
/* Look up entry CHAR of page PAGE in the composition index.  A page
   entry at or above G_UNICODE_MAX_TABLE_INDEX encodes one value for
   the whole page; anything else is a row index into compose_data. */
#define CI(Page, Char) \
  ((compose_table[(Page)] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[(Page)] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[(Page)]][(Char)]))

/* Composition index of CHAR, or 0 when its page lies beyond
   COMPOSE_TABLE_LAST.  Every use of the argument is parenthesised so
   that an expression argument such as `a | b' is shifted as a whole. */
#define COMPOSE_INDEX(Char) \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) \
   ? 0 \
   : CI ((Char) >> 8, (Char) & 0xff))
730 
731 static gboolean
732 combine (gunichar a, gunichar b, gunichar * result)
733 {
734  gushort index_a, index_b;
735 
736  if (combine_hangul (a, b, result))
737  return TRUE;
738 
739  index_a = COMPOSE_INDEX (a);
740 
741  if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
742  {
743  if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
744  {
745  *result =
746  compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
747  return TRUE;
748  }
749  else
750  return FALSE;
751  }
752 
753  index_b = COMPOSE_INDEX (b);
754 
755  if (index_b >= COMPOSE_SECOND_SINGLE_START)
756  {
757  if (a ==
758  compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
759  {
760  *result =
761  compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
762  return TRUE;
763  }
764  else
765  return FALSE;
766  }
767 
768  if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
769  && index_b >= COMPOSE_SECOND_START
770  && index_b < COMPOSE_SECOND_SINGLE_START)
771  {
772  gunichar res =
773  compose_array[index_a - COMPOSE_FIRST_START][index_b -
775 
776  if (res)
777  {
778  *result = res;
779  return TRUE;
780  }
781  }
782 
783  return FALSE;
784 }
785 
/* Normalize the UTF-8 string str into a newly allocated,
 * 0-terminated UCS-4 buffer.
 *
 * Input is consumed until a nul byte or, when max_len is not negative,
 * until str + max_len.  mode selects compatibility decomposition
 * (G_NORMALIZE_NFKC, G_NORMALIZE_NFKD) and composition
 * (G_NORMALIZE_NFC, G_NORMALIZE_NFKC).
 *
 * The input is not validated here.  Returns NULL if the buffer cannot
 * be allocated; otherwise the result is released with g_free() by the
 * caller. */
static gunichar *
_g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
{
  gsize n_wc;
  gunichar *wc_buffer;
  const char *p;
  gsize last_start;
  gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
  gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);

  /* Pass 1: count how many characters the decomposed string has, so
     that the buffer can be allocated in one go. */
  n_wc = 0;
  p = str;
  while ((max_len < 0 || p < str + max_len) && *p)
    {
      const gchar *decomp;
      gunichar wc = g_utf8_get_char (p);

      if (wc >= SBase && wc < SBase + SCount)
        {
          /* Hangul syllable: ask only for the length (r == NULL). */
          gsize result_len;
          decompose_hangul (wc, NULL, &result_len);
          n_wc += result_len;
        }
      else
        {
          decomp = find_decomposition (wc, do_compat);

          if (decomp)
            n_wc += g_utf8_strlen (decomp, -1);
          else
            n_wc++;
        }

      p = g_utf8_next_char (p);
    }

  /* One extra slot for the terminating 0. */
  wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1));
  if (!wc_buffer)
    return NULL;

  /* Pass 2: the same walk again, this time storing the decomposed
     characters.  last_start is the index where the run that still
     awaits canonical ordering begins. */
  last_start = 0;
  n_wc = 0;
  p = str;
  while ((max_len < 0 || p < str + max_len) && *p)
    {
      gunichar wc = g_utf8_get_char (p);
      const gchar *decomp;
      int cc;
      gsize old_n_wc = n_wc;

      if (wc >= SBase && wc < SBase + SCount)
        {
          gsize result_len;
          decompose_hangul (wc, wc_buffer + n_wc, &result_len);
          n_wc += result_len;
        }
      else
        {
          decomp = find_decomposition (wc, do_compat);

          if (decomp)
            {
              const char *pd;
              for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
                wc_buffer[n_wc++] = g_utf8_get_char (pd);
            }
          else
            wc_buffer[n_wc++] = wc;
        }

      if (n_wc > 0)
        {
          /* Combining class of the first character this input
             character produced. */
          cc = COMBINING_CLASS (wc_buffer[old_n_wc]);

          if (cc == 0)
            {
              /* Class 0: put the pending run into canonical order and
                 start the next run at this character. */
              g_unicode_canonical_ordering (wc_buffer + last_start,
                                            n_wc - last_start);
              last_start = old_n_wc;
            }
        }

      p = g_utf8_next_char (p);
    }

  if (n_wc > 0)
    {
      /* Order the final run.  last_start is not updated afterwards:
         the composition step below resets it before use. */
      g_unicode_canonical_ordering (wc_buffer + last_start,
                                    n_wc - last_start);
    }

  wc_buffer[n_wc] = 0;

  /* All decomposed and reordered */

  if (do_compose && n_wc > 0)
    {
      gsize i, j;
      int last_cc = 0;
      /* From here on last_start is the index of the character that
         later characters are combined into. */
      last_start = 0;

      for (i = 0; i < n_wc; i++)
        {
          int cc = COMBINING_CLASS (wc_buffer[i]);

          if (i > 0 &&
              (last_cc == 0 || last_cc != cc) &&
              combine (wc_buffer[last_start], wc_buffer[i],
                       &wc_buffer[last_start]))
            {
              /* wc_buffer[i] was merged into wc_buffer[last_start]:
                 close the gap and look at the same index again. */
              for (j = i + 1; j < n_wc; j++)
                wc_buffer[j - 1] = wc_buffer[j];
              n_wc--;
              i--;

              if (i == last_start)
                last_cc = 0;
              else
                last_cc = COMBINING_CLASS (wc_buffer[i - 1]);

              continue;
            }

          if (cc == 0)
            last_start = i;

          last_cc = cc;
        }
    }

  /* Terminate again: composition may have shortened the string. */
  wc_buffer[n_wc] = 0;

  return wc_buffer;
}
921 
922 /*
923  * g_utf8_normalize:
924  * @str: a UTF-8 encoded string.
925  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
926  * @mode: the type of normalization to perform.
927  *
928  * Converts a string into canonical form, standardizing
929  * such issues as whether a character with an accent
930  * is represented as a base character and combining
931  * accent or as a single precomposed character. The
932  * string has to be valid UTF-8, otherwise %NULL is
933  * returned. You should generally call g_utf8_normalize()
934  * before comparing two Unicode strings.
935  *
936  * The normalization mode %G_NORMALIZE_DEFAULT only
937  * standardizes differences that do not affect the
938  * text content, such as the above-mentioned accent
939  * representation. %G_NORMALIZE_ALL also standardizes
940  * the "compatibility" characters in Unicode, such
941  * as SUPERSCRIPT THREE to the standard forms
942  * (in this case DIGIT THREE). Formatting information
943  * may be lost but for most text operations such
944  * characters should be considered the same.
945  *
946  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
947  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
948  * but returned a result with composed forms rather
949  * than a maximally decomposed form. This is often
950  * useful if you intend to convert the string to
951  * a legacy encoding or pass it to a system with
952  * less capable Unicode handling.
953  *
954  * Return value: a newly allocated string, that is the
955  * normalized form of @str, or %NULL if @str is not
956  * valid UTF-8.
957  **/
958 static gchar *
959 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
960 {
961  gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
962  gchar *result;
963 
964  result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
965  g_free (result_wc);
966 
967  return result;
968 }
969 
970 /* Public Libidn API starts here. */
971 
982 uint32_t
984 {
985  return g_utf8_get_char (p);
986 }
987 
/* Encode the character c as UTF-8 into outbuf and return the number
   of bytes of the encoding.  This forwards to g_unichar_to_utf8, which
   needs room for up to 6 bytes in outbuf and only computes the length
   when outbuf is NULL. */
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  const int n_bytes = g_unichar_to_utf8 (c, outbuf);

  return n_bytes;
}
1004 
1020 uint32_t *
1021 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
1022 {
1023  return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
1024 }
1025 
1043 char *
1044 stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1045  size_t * items_read, size_t * items_written)
1046 {
1047  return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1048  (glong *) items_written);
1049 }
1050 
1073 char *
1074 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1075 {
1076  return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1077 }
1078 
1090 uint32_t *
1091 stringprep_ucs4_nfkc_normalize (const uint32_t * str, ssize_t len)
1092 {
1093  char *p;
1094  uint32_t *result_wc;
1095 
1096  p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1097  result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1098  free (p);
1099 
1100  return result_wc;
1101 }