libidn  1.25
nfkc.c
Go to the documentation of this file.
00001 /* nfkc.c --- Unicode normalization utilities.
00002    Copyright (C) 2002-2012 Simon Josefsson
00003 
00004    This file is part of GNU Libidn.
00005 
00006    GNU Libidn is free software: you can redistribute it and/or
00007    modify it under the terms of either:
00008 
00009      * the GNU Lesser General Public License as published by the Free
00010        Software Foundation; either version 3 of the License, or (at
00011        your option) any later version.
00012 
00013    or
00014 
00015      * the GNU General Public License as published by the Free
00016        Software Foundation; either version 2 of the License, or (at
00017        your option) any later version.
00018 
00019    or both in parallel, as here.
00020 
00021    GNU Libidn is distributed in the hope that it will be useful,
00022    but WITHOUT ANY WARRANTY; without even the implied warranty of
00023    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00024    General Public License for more details.
00025 
00026    You should have received copies of the GNU General Public License and
00027    the GNU Lesser General Public License along with this program.  If
00028    not, see <http://www.gnu.org/licenses/>. */
00029 
00030 #ifdef HAVE_CONFIG_H
00031 #include "config.h"
00032 #endif
00033 
00034 #include <stdlib.h>
00035 #include <string.h>
00036 
00037 #include "stringprep.h"
00038 
/* Hacks to make syncing with GLIB code easier: the GLib type and
   allocator names used by the imported code are mapped onto their plain
   C equivalents. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define glong long
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc
#define g_free free

/* Return VAL from the enclosing function when EXPR is false.  The body
   is wrapped in do { } while (0) so that the macro followed by a
   semicolon is exactly one statement and can be used safely as the
   body of an unbraced if/else. */
#define g_return_val_if_fail(expr,val)                  \
  do                                                    \
    {                                                   \
      if (!(expr))                                      \
        return (val);                                   \
    }                                                   \
  while (0)
00058 
00059 /* Code from GLIB gmacros.h starts here. */
00060 
00061 /* GLIB - Library of useful routines for C programming
00062  * Copyright (C) 1995-1997  Peter Mattis, Spencer Kimball and Josh MacDonald
00063  *
00064  * This library is free software; you can redistribute it and/or
00065  * modify it under the terms of the GNU Lesser General Public
00066  * License as published by the Free Software Foundation; either
00067  * version 2 of the License, or (at your option) any later version.
00068  *
00069  * This library is distributed in the hope that it will be useful,
00070  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00071  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00072  * Lesser General Public License for more details.
00073  *
00074  * You should have received a copy of the GNU Lesser General Public
00075  * License along with this library; if not, write to the
00076  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
00077  * Boston, MA 02111-1307, USA.
00078  */
00079 
/* Boolean constants, supplied only when nothing else has defined them. */
#if !defined (FALSE)
# define FALSE (0)
#endif

#if !defined (TRUE)
# define TRUE (!FALSE)
#endif

/* Number of elements in an array object (not valid for pointers). */
#define G_N_ELEMENTS(array) (sizeof (array) / sizeof ((array)[0]))

/* In this file the hint macro simply yields its argument. */
#define G_UNLIKELY(cond) (cond)
00091 
00092 /* Code from GLIB gunicode.h starts here. */
00093 
00094 /* gunicode.h - Unicode manipulation functions
00095  *
00096  *  Copyright (C) 1999, 2000 Tom Tromey
00097  *  Copyright 2000, 2005 Red Hat, Inc.
00098  *
00099  * The Gnome Library is free software; you can redistribute it and/or
00100  * modify it under the terms of the GNU Lesser General Public License as
00101  * published by the Free Software Foundation; either version 2 of the
00102  * License, or (at your option) any later version.
00103  *
00104  * The Gnome Library is distributed in the hope that it will be useful,
00105  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00106  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00107  * Lesser General Public License for more details.
00108  *
00109  * You should have received a copy of the GNU Lesser General Public
00110  * License along with the Gnome Library; see the file COPYING.LIB.  If not,
00111  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
00112  *   Boston, MA 02111-1307, USA.
00113  */
00114 
/* Normalization forms understood by the normalizer.  Each GLib-style
   name has a Unicode-style alias with the same value. */
typedef enum
{
  G_NORMALIZE_DEFAULT = 0,
  G_NORMALIZE_NFD = 0,
  G_NORMALIZE_DEFAULT_COMPOSE = 1,
  G_NORMALIZE_NFC = 1,
  G_NORMALIZE_ALL = 2,
  G_NORMALIZE_NFKD = 2,
  G_NORMALIZE_ALL_COMPOSE = 3,
  G_NORMALIZE_NFKC = 3
} GNormalizeMode;
00127 
/* Advance PTR by the byte length that the g_utf8_skip table assigns to
   the lead byte PTR points at.  No validation is performed. */
#define g_utf8_next_char(ptr) (&(ptr)[g_utf8_skip[*(const guchar *) (ptr)]])
00129 
00130 /* Code from GLIB gutf8.c starts here. */
00131 
00132 /* gutf8.c - Operations on UTF-8 strings.
00133  *
00134  * Copyright (C) 1999 Tom Tromey
00135  * Copyright (C) 2000 Red Hat, Inc.
00136  *
00137  * This library is free software; you can redistribute it and/or
00138  * modify it under the terms of the GNU Lesser General Public
00139  * License as published by the Free Software Foundation; either
00140  * version 2 of the License, or (at your option) any later version.
00141  *
00142  * This library is distributed in the hope that it will be useful,
00143  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00144  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00145  * Lesser General Public License for more details.
00146  *
00147  * You should have received a copy of the GNU Lesser General Public
00148  * License along with this library; if not, write to the
00149  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
00150  * Boston, MA 02111-1307, USA.
00151  */
00152 
/* Classify the UTF-8 lead byte Char: store the sequence length (1..6)
   in Len and the payload mask for the lead byte in Mask.  For a byte
   that cannot start a sequence Len is set to -1 and Mask is left
   untouched.  Char is evaluated several times, so it must be free of
   side effects.  The do { } while (0) wrapper makes the expansion a
   single statement. */
#define UTF8_COMPUTE(Char, Mask, Len)           \
  do                                            \
    {                                           \
      if ((Char) < 128)                         \
        {                                       \
          (Len) = 1;                            \
          (Mask) = 0x7f;                        \
        }                                       \
      else if (((Char) & 0xe0) == 0xc0)         \
        {                                       \
          (Len) = 2;                            \
          (Mask) = 0x1f;                        \
        }                                       \
      else if (((Char) & 0xf0) == 0xe0)         \
        {                                       \
          (Len) = 3;                            \
          (Mask) = 0x0f;                        \
        }                                       \
      else if (((Char) & 0xf8) == 0xf0)         \
        {                                       \
          (Len) = 4;                            \
          (Mask) = 0x07;                        \
        }                                       \
      else if (((Char) & 0xfc) == 0xf8)         \
        {                                       \
          (Len) = 5;                            \
          (Mask) = 0x03;                        \
        }                                       \
      else if (((Char) & 0xfe) == 0xfc)         \
        {                                       \
          (Len) = 6;                            \
          (Mask) = 0x01;                        \
        }                                       \
      else                                      \
        (Len) = -1;                             \
    }                                           \
  while (0)
00186 
/* Number of bytes (1..6) needed to encode the code point Char in
   UTF-8.  Char is evaluated more than once. */
#define UTF8_LENGTH(Char)                       \
  ((Char) >= 0x4000000 ? 6 :                    \
   (Char) >= 0x200000 ? 5 :                     \
   (Char) >= 0x10000 ? 4 :                      \
   (Char) >= 0x800 ? 3 :                        \
   (Char) >= 0x80 ? 2 : 1)
00193 
/* Decode the Len-byte UTF-8 sequence at Chars into Result, using Mask
   to extract the payload bits of the lead byte and Count as the loop
   index.  If a trailing byte is not of the form 10xxxxxx, Result is set
   to -1 and decoding stops at that byte.  Wrapped in do { } while (0)
   so the whole expansion is one statement. */
#define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
  do                                                                          \
    {                                                                         \
      (Result) = (Chars)[0] & (Mask);                                         \
      for ((Count) = 1; (Count) < (Len); ++(Count))                           \
        {                                                                     \
          if (((Chars)[(Count)] & 0xc0) != 0x80)                              \
            {                                                                 \
              (Result) = -1;                                                  \
              break;                                                          \
            }                                                                 \
          (Result) <<= 6;                                                     \
          (Result) |= ((Chars)[(Count)] & 0x3f);                              \
        }                                                                     \
    }                                                                         \
  while (0)
00206 
00207 static const gchar utf8_skip_data[256] = {
00208   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00209   1, 1, 1, 1, 1, 1, 1,
00210   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00211   1, 1, 1, 1, 1, 1, 1,
00212   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00213   1, 1, 1, 1, 1, 1, 1,
00214   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00215   1, 1, 1, 1, 1, 1, 1,
00216   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00217   1, 1, 1, 1, 1, 1, 1,
00218   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00219   1, 1, 1, 1, 1, 1, 1,
00220   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
00221   2, 2, 2, 2, 2, 2, 2,
00222   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
00223   5, 5, 5, 6, 6, 1, 1
00224 };
00225 
00226 const gchar *const g_utf8_skip = utf8_skip_data;
00227 
00228 /*
00229  * g_utf8_strlen:
00230  * @p: pointer to the start of a UTF-8 encoded string
00231  * @max: the maximum number of bytes to examine. If @max
00232  *       is less than 0, then the string is assumed to be
00233  *       nul-terminated. If @max is 0, @p will not be examined and
00234  *       may be %NULL.
00235  *
00236  * Computes the length of the string in characters, not including
00237  * the terminating nul character.
00238  *
00239  * Return value: the length of the string in characters
00240  **/
00241 static glong
00242 g_utf8_strlen (const gchar * p, gssize max)
00243 {
00244   glong len = 0;
00245   const gchar *start = p;
00246   g_return_val_if_fail (p != NULL || max == 0, 0);
00247 
00248   if (max < 0)
00249     {
00250       while (*p)
00251         {
00252           p = g_utf8_next_char (p);
00253           ++len;
00254         }
00255     }
00256   else
00257     {
00258       if (max == 0 || !*p)
00259         return 0;
00260 
00261       p = g_utf8_next_char (p);
00262 
00263       while (p - start < max && *p)
00264         {
00265           ++len;
00266           p = g_utf8_next_char (p);
00267         }
00268 
00269       /* only do the last len increment if we got a complete
00270        * char (don't count partial chars)
00271        */
00272       if (p - start <= max)
00273         ++len;
00274     }
00275 
00276   return len;
00277 }
00278 
00279 /*
00280  * g_utf8_get_char:
00281  * @p: a pointer to Unicode character encoded as UTF-8
00282  *
00283  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
00284  * If @p does not point to a valid UTF-8 encoded character, results are
00285  * undefined. If you are not sure that the bytes are complete
00286  * valid Unicode characters, you should use g_utf8_get_char_validated()
00287  * instead.
00288  *
00289  * Return value: the resulting character
00290  **/
00291 static gunichar
00292 g_utf8_get_char (const gchar * p)
00293 {
00294   int i, mask = 0, len;
00295   gunichar result;
00296   unsigned char c = (unsigned char) *p;
00297 
00298   UTF8_COMPUTE (c, mask, len);
00299   if (len == -1)
00300     return (gunichar) - 1;
00301   UTF8_GET (result, p, i, mask, len);
00302 
00303   return result;
00304 }
00305 
00306 /*
00307  * g_unichar_to_utf8:
00308  * @c: a Unicode character code
00309  * @outbuf: output buffer, must have at least 6 bytes of space.
00310  *       If %NULL, the length will be computed and returned
00311  *       and nothing will be written to @outbuf.
00312  *
00313  * Converts a single character to UTF-8.
00314  *
00315  * Return value: number of bytes written
00316  **/
00317 static int
00318 g_unichar_to_utf8 (gunichar c, gchar * outbuf)
00319 {
00320   /* If this gets modified, also update the copy in g_string_insert_unichar() */
00321   guint len = 0;
00322   int first;
00323   int i;
00324 
00325   if (c < 0x80)
00326     {
00327       first = 0;
00328       len = 1;
00329     }
00330   else if (c < 0x800)
00331     {
00332       first = 0xc0;
00333       len = 2;
00334     }
00335   else if (c < 0x10000)
00336     {
00337       first = 0xe0;
00338       len = 3;
00339     }
00340   else if (c < 0x200000)
00341     {
00342       first = 0xf0;
00343       len = 4;
00344     }
00345   else if (c < 0x4000000)
00346     {
00347       first = 0xf8;
00348       len = 5;
00349     }
00350   else
00351     {
00352       first = 0xfc;
00353       len = 6;
00354     }
00355 
00356   if (outbuf)
00357     {
00358       for (i = len - 1; i > 0; --i)
00359         {
00360           outbuf[i] = (c & 0x3f) | 0x80;
00361           c >>= 6;
00362         }
00363       outbuf[0] = c | first;
00364     }
00365 
00366   return len;
00367 }
00368 
00369 /*
00370  * g_utf8_to_ucs4_fast:
00371  * @str: a UTF-8 encoded string
00372  * @len: the maximum length of @str to use, in bytes. If @len < 0,
00373  *       then the string is nul-terminated.
00374  * @items_written: location to store the number of characters in the
00375  *                 result, or %NULL.
00376  *
00377  * Convert a string from UTF-8 to a 32-bit fixed width
00378  * representation as UCS-4, assuming valid UTF-8 input.
00379  * This function is roughly twice as fast as g_utf8_to_ucs4()
00380  * but does no error checking on the input. A trailing 0 character
00381  * will be added to the string after the converted text.
00382  *
00383  * Return value: a pointer to a newly allocated UCS-4 string.
00384  *               This value must be freed with g_free().
00385  **/
00386 static gunichar *
00387 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
00388 {
00389   gunichar *result;
00390   gsize n_chars, i;
00391   const gchar *p;
00392 
00393   g_return_val_if_fail (str != NULL, NULL);
00394 
00395   p = str;
00396   n_chars = 0;
00397   if (len < 0)
00398     {
00399       while (*p)
00400         {
00401           p = g_utf8_next_char (p);
00402           ++n_chars;
00403         }
00404     }
00405   else
00406     {
00407       while (p < str + len && *p)
00408         {
00409           p = g_utf8_next_char (p);
00410           ++n_chars;
00411         }
00412     }
00413 
00414   result = g_malloc (sizeof (gunichar) * (n_chars + 1));
00415   if (!result)
00416     return NULL;
00417 
00418   p = str;
00419   for (i = 0; i < n_chars; i++)
00420     {
00421       gunichar wc = (guchar) * p++;
00422 
00423       if (wc < 0x80)
00424         {
00425           result[i] = wc;
00426         }
00427       else
00428         {
00429           gunichar mask = 0x40;
00430 
00431           if (G_UNLIKELY ((wc & mask) == 0))
00432             {
00433               /* It's an out-of-sequence 10xxxxxxx byte.
00434                * Rather than making an ugly hash of this and the next byte
00435                * and overrunning the buffer, it's more useful to treat it
00436                * with a replacement character */
00437               result[i] = 0xfffd;
00438               continue;
00439             }
00440 
00441           do
00442             {
00443               wc <<= 6;
00444               wc |= (guchar) (*p++) & 0x3f;
00445               mask <<= 5;
00446             }
00447           while ((wc & mask) != 0);
00448 
00449           wc &= mask - 1;
00450 
00451           result[i] = wc;
00452         }
00453     }
00454   result[i] = 0;
00455 
00456   if (items_written)
00457     *items_written = i;
00458 
00459   return result;
00460 }
00461 
00462 /*
00463  * g_ucs4_to_utf8:
00464  * @str: a UCS-4 encoded string
00465  * @len: the maximum length (number of characters) of @str to use.
00466  *       If @len < 0, then the string is nul-terminated.
00467  * @items_read: location to store number of characters read, or %NULL.
00468  * @items_written: location to store number of bytes written or %NULL.
00469  *                 The value here stored does not include the trailing 0
00470  *                 byte.
00471  * @error: location to store the error occurring, or %NULL to ignore
00472  *         errors. Any of the errors in #GConvertError other than
00473  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
00474  *
00475  * Convert a string from a 32-bit fixed width representation as UCS-4.
00476  * to UTF-8. The result will be terminated with a 0 byte.
00477  *
00478  * Return value: a pointer to a newly allocated UTF-8 string.
00479  *               This value must be freed with g_free(). If an
00480  *               error occurs, %NULL will be returned and
00481  *               @error set. In that case, @items_read will be
00482  *               set to the position of the first invalid input
00483  *               character.
00484  **/
00485 static gchar *
00486 g_ucs4_to_utf8 (const gunichar * str,
00487                 glong len,
00488                 glong * items_read, glong * items_written)
00489 {
00490   gint result_length;
00491   gchar *result = NULL;
00492   gchar *p;
00493   gint i;
00494 
00495   result_length = 0;
00496   for (i = 0; len < 0 || i < len; i++)
00497     {
00498       if (!str[i])
00499         break;
00500 
00501       if (str[i] >= 0x80000000)
00502         goto err_out;
00503 
00504       result_length += UTF8_LENGTH (str[i]);
00505     }
00506 
00507   result = g_malloc (result_length + 1);
00508   if (!result)
00509     return NULL;
00510   p = result;
00511 
00512   i = 0;
00513   while (p < result + result_length)
00514     p += g_unichar_to_utf8 (str[i++], p);
00515 
00516   *p = '\0';
00517 
00518   if (items_written)
00519     *items_written = p - result;
00520 
00521 err_out:
00522   if (items_read)
00523     *items_read = i;
00524 
00525   return result;
00526 }
00527 
00528 /* Code from GLIB gunidecomp.c starts here. */
00529 
00530 /* decomp.c - Character decomposition.
00531  *
00532  *  Copyright (C) 1999, 2000 Tom Tromey
00533  *  Copyright 2000 Red Hat, Inc.
00534  *
00535  * The Gnome Library is free software; you can redistribute it and/or
00536  * modify it under the terms of the GNU Lesser General Public License as
00537  * published by the Free Software Foundation; either version 2 of the
00538  * License, or (at your option) any later version.
00539  *
00540  * The Gnome Library is distributed in the hope that it will be useful,
00541  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00542  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00543  * Lesser General Public License for more details.
00544  *
00545  * You should have received a copy of the GNU Lesser General Public
00546  * License along with the Gnome Library; see the file COPYING.LIB.  If not,
00547  * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
00548  *   Boston, MA 02111-1307, USA.
00549  */
00550 
00551 #include "gunidecomp.h"
00552 #include "gunicomp.h"
00553 
/* Shared lookup for both halves of the combining class table.  A page
   entry at or above G_UNICODE_MAX_TABLE_INDEX encodes one class for
   the whole page; any other entry selects a row of cclass_data. */
#define CC_LOOKUP(Table, Page, Char)                                    \
  (((Table)[Page] >= G_UNICODE_MAX_TABLE_INDEX)                         \
   ? ((Table)[Page] - G_UNICODE_MAX_TABLE_INDEX)                        \
   : (cclass_data[(Table)[Page]][Char]))

#define CC_PART1(Page, Char)                                            \
  CC_LOOKUP (combining_class_table_part1, Page, Char)

#define CC_PART2(Page, Char)                                            \
  CC_LOOKUP (combining_class_table_part2, Page, Char)

/* Combining class of Char: part 1 covers code points up to
   G_UNICODE_LAST_CHAR_PART1, part 2 covers 0xe0000 up to
   G_UNICODE_LAST_CHAR, and everything else yields 0. */
#define COMBINING_CLASS(Char)                                           \
  (((Char) > G_UNICODE_LAST_CHAR_PART1)                                 \
   ? (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR)              \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff)               \
      : 0)                                                              \
   : CC_PART1 ((Char) >> 8, (Char) & 0xff))
00570 
/* Constants for Hangul syllable [de]composition: the first code point
   of the syllable block and of the leading (L), vowel (V) and trailing
   (T) jamo ranges, followed by the size of each range. */
enum
{
  SBase = 0xAC00,
  LBase = 0x1100,
  VBase = 0x1161,
  TBase = 0x11A7,
  LCount = 19,
  VCount = 21,
  TCount = 28,
  NCount = VCount * TCount,
  SCount = LCount * NCount
};
00581 
00582 /*
00583  * g_unicode_canonical_ordering:
00584  * @string: a UCS-4 encoded string.
00585  * @len: the maximum length of @string to use.
00586  *
00587  * Computes the canonical ordering of a string in-place.
00588  * This rearranges decomposed characters in the string
00589  * according to their combining classes.  See the Unicode
00590  * manual for more information.
00591  **/
00592 static void
00593 g_unicode_canonical_ordering (gunichar * string, gsize len)
00594 {
00595   gsize i;
00596   int swap = 1;
00597 
00598   while (swap)
00599     {
00600       int last;
00601       swap = 0;
00602       last = COMBINING_CLASS (string[0]);
00603       for (i = 0; i < len - 1; ++i)
00604         {
00605           int next = COMBINING_CLASS (string[i + 1]);
00606           if (next != 0 && last > next)
00607             {
00608               gsize j;
00609               /* Percolate item leftward through string.  */
00610               for (j = i + 1; j > 0; --j)
00611                 {
00612                   gunichar t;
00613                   if (COMBINING_CLASS (string[j - 1]) <= next)
00614                     break;
00615                   t = string[j];
00616                   string[j] = string[j - 1];
00617                   string[j - 1] = t;
00618                   swap = 1;
00619                 }
00620               /* We're re-entering the loop looking at the old
00621                  character again.  */
00622               next = last;
00623             }
00624           last = next;
00625         }
00626     }
00627 }
00628 
00629 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
00630  * r should be null or have sufficient space. Calling with r == NULL will
00631  * only calculate the result_len; however, a buffer with space for three
00632  * characters will always be big enough. */
00633 static void
00634 decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
00635 {
00636   gint SIndex = s - SBase;
00637   gint TIndex = SIndex % TCount;
00638 
00639   if (r)
00640     {
00641       r[0] = LBase + SIndex / NCount;
00642       r[1] = VBase + (SIndex % NCount) / TCount;
00643     }
00644 
00645   if (TIndex)
00646     {
00647       if (r)
00648         r[2] = TBase + TIndex;
00649       *result_len = 3;
00650     }
00651   else
00652     *result_len = 2;
00653 }
00654 
00655 /* returns a pointer to a null-terminated UTF-8 string */
00656 static const gchar *
00657 find_decomposition (gunichar ch, gboolean compat)
00658 {
00659   int start = 0;
00660   int end = G_N_ELEMENTS (decomp_table);
00661 
00662   if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
00663     {
00664       while (TRUE)
00665         {
00666           int half = (start + end) / 2;
00667           if (ch == decomp_table[half].ch)
00668             {
00669               int offset;
00670 
00671               if (compat)
00672                 {
00673                   offset = decomp_table[half].compat_offset;
00674                   if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
00675                     offset = decomp_table[half].canon_offset;
00676                 }
00677               else
00678                 {
00679                   offset = decomp_table[half].canon_offset;
00680                   if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
00681                     return NULL;
00682                 }
00683 
00684               return &(decomp_expansion_string[offset]);
00685             }
00686           else if (half == start)
00687             break;
00688           else if (ch > decomp_table[half].ch)
00689             start = half;
00690           else
00691             end = half;
00692         }
00693     }
00694 
00695   return NULL;
00696 }
00697 
00698 /* L,V => LV and LV,T => LVT  */
00699 static gboolean
00700 combine_hangul (gunichar a, gunichar b, gunichar * result)
00701 {
00702   gint LIndex = a - LBase;
00703   gint SIndex = a - SBase;
00704 
00705   gint VIndex = b - VBase;
00706   gint TIndex = b - TBase;
00707 
00708   if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
00709     {
00710       *result = SBase + (LIndex * VCount + VIndex) * TCount;
00711       return TRUE;
00712     }
00713   else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
00714            && 0 < TIndex && TIndex < TCount)
00715     {
00716       *result = a + TIndex;
00717       return TRUE;
00718     }
00719 
00720   return FALSE;
00721 }
00722 
/* Look up the composition index for byte Char of page Page.  A page
   entry at or above G_UNICODE_MAX_TABLE_INDEX encodes one index for
   the whole page; any other entry selects a row of compose_data. */
#define CI(Page, Char)                                  \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX)   \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX)  \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of the code point Char, or 0 when its page lies
   beyond COMPOSE_TABLE_LAST.  Every use of Char is parenthesized so
   that an expression argument is shifted as a whole. */
#define COMPOSE_INDEX(Char)                                             \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
00730 
00731 static gboolean
00732 combine (gunichar a, gunichar b, gunichar * result)
00733 {
00734   gushort index_a, index_b;
00735 
00736   if (combine_hangul (a, b, result))
00737     return TRUE;
00738 
00739   index_a = COMPOSE_INDEX (a);
00740 
00741   if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
00742     {
00743       if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
00744         {
00745           *result =
00746             compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
00747           return TRUE;
00748         }
00749       else
00750         return FALSE;
00751     }
00752 
00753   index_b = COMPOSE_INDEX (b);
00754 
00755   if (index_b >= COMPOSE_SECOND_SINGLE_START)
00756     {
00757       if (a ==
00758           compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
00759         {
00760           *result =
00761             compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
00762           return TRUE;
00763         }
00764       else
00765         return FALSE;
00766     }
00767 
00768   if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
00769       && index_b >= COMPOSE_SECOND_START
00770       && index_b < COMPOSE_SECOND_SINGLE_START)
00771     {
00772       gunichar res =
00773         compose_array[index_a - COMPOSE_FIRST_START][index_b -
00774                                                      COMPOSE_SECOND_START];
00775 
00776       if (res)
00777         {
00778           *result = res;
00779           return TRUE;
00780         }
00781     }
00782 
00783   return FALSE;
00784 }
00785 
      /* Normalize the UTF-8 string STR into a newly allocated,
         0-terminated UCS-4 buffer.
         MAX_LEN is the number of bytes of STR to use; a negative value
         means STR is nul-terminated.  MODE selects compatibility
         decomposition (NFKC, NFKD) and/or recomposition (NFC, NFKC).
         Returns NULL if the buffer cannot be allocated; the caller
         frees the result with g_free().
         NOTE(review): STR is not validated here; whatever
         g_utf8_get_char() returns for a malformed sequence is stored
         as a character -- confirm callers validate their input. */
00786 static gunichar *
00787 _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
00788 {
00789   gsize n_wc;
00790   gunichar *wc_buffer;
00791   const char *p;
00792   gsize last_start;
00793   gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
00794   gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
00795 
        /* Pass 1: count how many characters the decomposed string
           needs, so the buffer can be allocated in one go. */
00796   n_wc = 0;
00797   p = str;
00798   while ((max_len < 0 || p < str + max_len) && *p)
00799     {
00800       const gchar *decomp;
00801       gunichar wc = g_utf8_get_char (p);
00802 
00803       if (wc >= SBase && wc < SBase + SCount)
00804         {
00805           gsize result_len;
              /* NULL buffer: only the length is computed. */
00806           decompose_hangul (wc, NULL, &result_len);
00807           n_wc += result_len;
00808         }
00809       else
00810         {
00811           decomp = find_decomposition (wc, do_compat);
00812 
00813           if (decomp)
00814             n_wc += g_utf8_strlen (decomp, -1);
00815           else
00816             n_wc++;
00817         }
00818 
00819       p = g_utf8_next_char (p);
00820     }
00821 
00822   wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1));
00823   if (!wc_buffer)
00824     return NULL;
00825 
        /* Pass 2: decompose into wc_buffer.  last_start is the index
           where the run still awaiting canonical ordering begins. */
00826   last_start = 0;
00827   n_wc = 0;
00828   p = str;
00829   while ((max_len < 0 || p < str + max_len) && *p)
00830     {
00831       gunichar wc = g_utf8_get_char (p);
00832       const gchar *decomp;
00833       int cc;
00834       gsize old_n_wc = n_wc;
00835 
00836       if (wc >= SBase && wc < SBase + SCount)
00837         {
00838           gsize result_len;
00839           decompose_hangul (wc, wc_buffer + n_wc, &result_len);
00840           n_wc += result_len;
00841         }
00842       else
00843         {
00844           decomp = find_decomposition (wc, do_compat);
00845 
00846           if (decomp)
00847             {
00848               const char *pd;
00849               for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
00850                 wc_buffer[n_wc++] = g_utf8_get_char (pd);
00851             }
00852           else
00853             wc_buffer[n_wc++] = wc;
00854         }
00855 
00856       if (n_wc > 0)
00857         {
              /* Combining class of the first character just appended. */
00858           cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
00859 
              /* A class-0 character ends the pending run: order
                 everything from last_start on, then start a new run at
                 the character just appended. */
00860           if (cc == 0)
00861             {
00862               g_unicode_canonical_ordering (wc_buffer + last_start,
00863                                             n_wc - last_start);
00864               last_start = old_n_wc;
00865             }
00866         }
00867 
00868       p = g_utf8_next_char (p);
00869     }
00870 
        /* Order the final run, which no later class-0 character
           flushed. */
00871   if (n_wc > 0)
00872     {
00873       g_unicode_canonical_ordering (wc_buffer + last_start,
00874                                     n_wc - last_start);
00875       last_start = n_wc;
00876     }
00877 
00878   wc_buffer[n_wc] = 0;
00879 
00880   /* All decomposed and reordered */
00881 
        /* Composition pass: last_start now indexes the most recent
           class-0 character, and each later character is offered to
           combine() together with it. */
00882   if (do_compose && n_wc > 0)
00883     {
00884       gsize i, j;
00885       int last_cc = 0;
00886       last_start = 0;
00887 
00888       for (i = 0; i < n_wc; i++)
00889         {
00890           int cc = COMBINING_CLASS (wc_buffer[i]);
00891 
              /* Composition is attempted only when the previous
                 character has class 0 or a class different from this
                 one. */
00892           if (i > 0 &&
00893               (last_cc == 0 || last_cc != cc) &&
00894               combine (wc_buffer[last_start], wc_buffer[i],
00895                        &wc_buffer[last_start]))
00896             {
                  /* The pair was merged into wc_buffer[last_start];
                     close the gap left at position i. */
00897               for (j = i + 1; j < n_wc; j++)
00898                 wc_buffer[j - 1] = wc_buffer[j];
00899               n_wc--;
00900               i--;
00901 
00902               if (i == last_start)
00903                 last_cc = 0;
00904               else
00905                 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
00906 
00907               continue;
00908             }
00909 
00910           if (cc == 0)
00911             last_start = i;
00912 
00913           last_cc = cc;
00914         }
00915     }
00916 
        /* Terminate again: composition may have shortened the string. */
00917   wc_buffer[n_wc] = 0;
00918 
00919   return wc_buffer;
00920 }
00921 
00922 /*
00923  * g_utf8_normalize:
00924  * @str: a UTF-8 encoded string.
00925  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
00926  * @mode: the type of normalization to perform.
00927  *
00928  * Converts a string into canonical form, standardizing
00929  * such issues as whether a character with an accent
00930  * is represented as a base character and combining
00931  * accent or as a single precomposed character. The
00932  * string has to be valid UTF-8, otherwise %NULL is
00933  * returned. You should generally call g_utf8_normalize()
00934  * before comparing two Unicode strings.
00935  *
00936  * The normalization mode %G_NORMALIZE_DEFAULT only
00937  * standardizes differences that do not affect the
00938  * text content, such as the above-mentioned accent
00939  * representation. %G_NORMALIZE_ALL also standardizes
00940  * the "compatibility" characters in Unicode, such
00941  * as SUPERSCRIPT THREE to the standard forms
00942  * (in this case DIGIT THREE). Formatting information
00943  * may be lost but for most text operations such
00944  * characters should be considered the same.
00945  *
00946  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
00947  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
00948  * but returned a result with composed forms rather
00949  * than a maximally decomposed form. This is often
00950  * useful if you intend to convert the string to
00951  * a legacy encoding or pass it to a system with
00952  * less capable Unicode handling.
00953  *
00954  * Return value: a newly allocated string, that is the
00955  *   normalized form of @str, or %NULL if @str is not
00956  *   valid UTF-8.
00957  **/
00958 static gchar *
00959 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
00960 {
00961   gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
00962   gchar *result;
00963 
00964   result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
00965   g_free (result_wc);
00966 
00967   return result;
00968 }
00969 
00970 /* Public Libidn API starts here. */
00971 
/* Decode the UTF-8 sequence at P into a code point.  This is the
   public entry point for g_utf8_get_char() and returns its result
   unchanged, including the (uint32_t) -1 it yields for a malformed
   sequence. */
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  const uint32_t code_point = g_utf8_get_char (p);

  return code_point;
}
00987 
/* Encode the code point C as UTF-8 into OUTBUF, which needs room for
   at least 6 bytes; with OUTBUF == NULL only the length is computed.
   This is the public entry point for g_unichar_to_utf8() and returns
   its byte count unchanged. */
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  const int n_bytes = g_unichar_to_utf8 (c, outbuf);

  return n_bytes;
}
01004 
01020 uint32_t *
01021 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
01022 {
01023   return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
01024 }
01025 
01043 char *
01044 stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
01045                          size_t * items_read, size_t * items_written)
01046 {
01047   return g_ucs4_to_utf8 (str, len, (glong *) items_read,
01048                          (glong *) items_written);
01049 }
01050 
01073 char *
01074 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
01075 {
01076   return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
01077 }
01078 
01090 uint32_t *
01091 stringprep_ucs4_nfkc_normalize (const uint32_t * str, ssize_t len)
01092 {
01093   char *p;
01094   uint32_t *result_wc;
01095 
01096   p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
01097   result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
01098   free (p);
01099 
01100   return result_wc;
01101 }