|
libidn
1.25
|
00001 /* nfkc.c --- Unicode normalization utilities. 00002 Copyright (C) 2002-2012 Simon Josefsson 00003 00004 This file is part of GNU Libidn. 00005 00006 GNU Libidn is free software: you can redistribute it and/or 00007 modify it under the terms of either: 00008 00009 * the GNU Lesser General Public License as published by the Free 00010 Software Foundation; either version 3 of the License, or (at 00011 your option) any later version. 00012 00013 or 00014 00015 * the GNU General Public License as published by the Free 00016 Software Foundation; either version 2 of the License, or (at 00017 your option) any later version. 00018 00019 or both in parallel, as here. 00020 00021 GNU Libidn is distributed in the hope that it will be useful, 00022 but WITHOUT ANY WARRANTY; without even the implied warranty of 00023 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00024 General Public License for more details. 00025 00026 You should have received copies of the GNU General Public License and 00027 the GNU Lesser General Public License along with this program. If 00028 not, see <http://www.gnu.org/licenses/>. */ 00029 00030 #ifdef HAVE_CONFIG_H 00031 #include "config.h" 00032 #endif 00033 00034 #include <stdlib.h> 00035 #include <string.h> 00036 00037 #include "stringprep.h" 00038 00039 /* Hacks to make syncing with GLIB code easier. */ 00040 #define gboolean int 00041 #define gchar char 00042 #define guchar unsigned char 00043 #define glong long 00044 #define gint int 00045 #define guint unsigned int 00046 #define gushort unsigned short 00047 #define gint16 int16_t 00048 #define guint16 uint16_t 00049 #define gunichar uint32_t 00050 #define gsize size_t 00051 #define gssize ssize_t 00052 #define g_malloc malloc 00053 #define g_free free 00054 #define g_return_val_if_fail(expr,val) { \ 00055 if (!(expr)) \ 00056 return (val); \ 00057 } 00058 00059 /* Code from GLIB gmacros.h starts here. */ 00060 00061 /* GLIB - Library of useful routines for C programming 00062 * Copyright (C) 1995-1997 Peter Mattis, Spencer Kimball and Josh MacDonald 00063 * 00064 * This library is free software; you can redistribute it and/or 00065 * modify it under the terms of the GNU Lesser General Public 00066 * License as published by the Free Software Foundation; either 00067 * version 2 of the License, or (at your option) any later version. 00068 * 00069 * This library is distributed in the hope that it will be useful, 00070 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00071 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00072 * Lesser General Public License for more details. 00073 * 00074 * You should have received a copy of the GNU Lesser General Public 00075 * License along with this library; if not, write to the 00076 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 00077 * Boston, MA 02111-1307, USA. 00078 */ 00079 00080 #ifndef FALSE 00081 #define FALSE (0) 00082 #endif 00083 00084 #ifndef TRUE 00085 #define TRUE (!FALSE) 00086 #endif 00087 00088 #define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0])) 00089 00090 #define G_UNLIKELY(expr) (expr) 00091 00092 /* Code from GLIB gunicode.h starts here. */ 00093 00094 /* gunicode.h - Unicode manipulation functions 00095 * 00096 * Copyright (C) 1999, 2000 Tom Tromey 00097 * Copyright 2000, 2005 Red Hat, Inc. 00098 * 00099 * The Gnome Library is free software; you can redistribute it and/or 00100 * modify it under the terms of the GNU Lesser General Public License as 00101 * published by the Free Software Foundation; either version 2 of the 00102 * License, or (at your option) any later version. 00103 * 00104 * The Gnome Library is distributed in the hope that it will be useful, 00105 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00106 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00107 * Lesser General Public License for more details. 00108 * 00109 * You should have received a copy of the GNU Lesser General Public 00110 * License along with the Gnome Library; see the file COPYING.LIB. If not, 00111 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, 00112 * Boston, MA 02111-1307, USA. 00113 */ 00114 00115 typedef enum 00116 { 00117 G_NORMALIZE_DEFAULT, 00118 G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT, 00119 G_NORMALIZE_DEFAULT_COMPOSE, 00120 G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE, 00121 G_NORMALIZE_ALL, 00122 G_NORMALIZE_NFKD = G_NORMALIZE_ALL, 00123 G_NORMALIZE_ALL_COMPOSE, 00124 G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE 00125 } 00126 GNormalizeMode; 00127 00128 #define g_utf8_next_char(p) ((p) + g_utf8_skip[*(const guchar *)(p)]) 00129 00130 /* Code from GLIB gutf8.c starts here. */ 00131 00132 /* gutf8.c - Operations on UTF-8 strings. 00133 * 00134 * Copyright (C) 1999 Tom Tromey 00135 * Copyright (C) 2000 Red Hat, Inc. 00136 * 00137 * This library is free software; you can redistribute it and/or 00138 * modify it under the terms of the GNU Lesser General Public 00139 * License as published by the Free Software Foundation; either 00140 * version 2 of the License, or (at your option) any later version. 00141 * 00142 * This library is distributed in the hope that it will be useful, 00143 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00144 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00145 * Lesser General Public License for more details. 00146 * 00147 * You should have received a copy of the GNU Lesser General Public 00148 * License along with this library; if not, write to the 00149 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 00150 * Boston, MA 02111-1307, USA. 00151 */ 00152 00153 #define UTF8_COMPUTE(Char, Mask, Len) \ 00154 if (Char < 128) \ 00155 { \ 00156 Len = 1; \ 00157 Mask = 0x7f; \ 00158 } \ 00159 else if ((Char & 0xe0) == 0xc0) \ 00160 { \ 00161 Len = 2; \ 00162 Mask = 0x1f; \ 00163 } \ 00164 else if ((Char & 0xf0) == 0xe0) \ 00165 { \ 00166 Len = 3; \ 00167 Mask = 0x0f; \ 00168 } \ 00169 else if ((Char & 0xf8) == 0xf0) \ 00170 { \ 00171 Len = 4; \ 00172 Mask = 0x07; \ 00173 } \ 00174 else if ((Char & 0xfc) == 0xf8) \ 00175 { \ 00176 Len = 5; \ 00177 Mask = 0x03; \ 00178 } \ 00179 else if ((Char & 0xfe) == 0xfc) \ 00180 { \ 00181 Len = 6; \ 00182 Mask = 0x01; \ 00183 } \ 00184 else \ 00185 Len = -1; 00186 00187 #define UTF8_LENGTH(Char) \ 00188 ((Char) < 0x80 ? 1 : \ 00189 ((Char) < 0x800 ? 2 : \ 00190 ((Char) < 0x10000 ? 3 : \ 00191 ((Char) < 0x200000 ? 4 : \ 00192 ((Char) < 0x4000000 ? 5 : 6))))) 00193 00194 #define UTF8_GET(Result, Chars, Count, Mask, Len) \ 00195 (Result) = (Chars)[0] & (Mask); \ 00196 for ((Count) = 1; (Count) < (Len); ++(Count)) \ 00197 { \ 00198 if (((Chars)[(Count)] & 0xc0) != 0x80) \ 00199 { \ 00200 (Result) = -1; \ 00201 break; \ 00202 } \ 00203 (Result) <<= 6; \ 00204 (Result) |= ((Chars)[(Count)] & 0x3f); \ 00205 } 00206 00207 static const gchar utf8_skip_data[256] = { 00208 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00209 1, 1, 1, 1, 1, 1, 1, 00210 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00211 1, 1, 1, 1, 1, 1, 1, 00212 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00213 1, 1, 1, 1, 1, 1, 1, 00214 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00215 1, 1, 1, 1, 1, 1, 1, 00216 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00217 1, 1, 1, 1, 1, 1, 1, 00218 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00219 1, 1, 1, 1, 1, 1, 1, 00220 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 00221 2, 2, 2, 2, 2, 2, 2, 00222 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 00223 5, 5, 5, 6, 6, 1, 1 00224 }; 00225 00226 const gchar *const g_utf8_skip = utf8_skip_data; 00227 00228 /* 00229 * g_utf8_strlen: 00230 * @p: pointer to the start of a UTF-8 encoded string 00231 * @max: the maximum number of bytes to examine. If @max 00232 * is less than 0, then the string is assumed to be 00233 * nul-terminated. If @max is 0, @p will not be examined and 00234 * may be %NULL. 00235 * 00236 * Computes the length of the string in characters, not including 00237 * the terminating nul character. 00238 * 00239 * Return value: the length of the string in characters 00240 **/ 00241 static glong 00242 g_utf8_strlen (const gchar * p, gssize max) 00243 { 00244 glong len = 0; 00245 const gchar *start = p; 00246 g_return_val_if_fail (p != NULL || max == 0, 0); 00247 00248 if (max < 0) 00249 { 00250 while (*p) 00251 { 00252 p = g_utf8_next_char (p); 00253 ++len; 00254 } 00255 } 00256 else 00257 { 00258 if (max == 0 || !*p) 00259 return 0; 00260 00261 p = g_utf8_next_char (p); 00262 00263 while (p - start < max && *p) 00264 { 00265 ++len; 00266 p = g_utf8_next_char (p); 00267 } 00268 00269 /* only do the last len increment if we got a complete 00270 * char (don't count partial chars) 00271 */ 00272 if (p - start <= max) 00273 ++len; 00274 } 00275 00276 return len; 00277 } 00278 00279 /* 00280 * g_utf8_get_char: 00281 * @p: a pointer to Unicode character encoded as UTF-8 00282 * 00283 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. 00284 * If @p does not point to a valid UTF-8 encoded character, results are 00285 * undefined. If you are not sure that the bytes are complete 00286 * valid Unicode characters, you should use g_utf8_get_char_validated() 00287 * instead. 00288 * 00289 * Return value: the resulting character 00290 **/ 00291 static gunichar 00292 g_utf8_get_char (const gchar * p) 00293 { 00294 int i, mask = 0, len; 00295 gunichar result; 00296 unsigned char c = (unsigned char) *p; 00297 00298 UTF8_COMPUTE (c, mask, len); 00299 if (len == -1) 00300 return (gunichar) - 1; 00301 UTF8_GET (result, p, i, mask, len); 00302 00303 return result; 00304 } 00305 00306 /* 00307 * g_unichar_to_utf8: 00308 * @c: a Unicode character code 00309 * @outbuf: output buffer, must have at least 6 bytes of space. 00310 * If %NULL, the length will be computed and returned 00311 * and nothing will be written to @outbuf. 00312 * 00313 * Converts a single character to UTF-8. 00314 * 00315 * Return value: number of bytes written 00316 **/ 00317 static int 00318 g_unichar_to_utf8 (gunichar c, gchar * outbuf) 00319 { 00320 /* If this gets modified, also update the copy in g_string_insert_unichar() */ 00321 guint len = 0; 00322 int first; 00323 int i; 00324 00325 if (c < 0x80) 00326 { 00327 first = 0; 00328 len = 1; 00329 } 00330 else if (c < 0x800) 00331 { 00332 first = 0xc0; 00333 len = 2; 00334 } 00335 else if (c < 0x10000) 00336 { 00337 first = 0xe0; 00338 len = 3; 00339 } 00340 else if (c < 0x200000) 00341 { 00342 first = 0xf0; 00343 len = 4; 00344 } 00345 else if (c < 0x4000000) 00346 { 00347 first = 0xf8; 00348 len = 5; 00349 } 00350 else 00351 { 00352 first = 0xfc; 00353 len = 6; 00354 } 00355 00356 if (outbuf) 00357 { 00358 for (i = len - 1; i > 0; --i) 00359 { 00360 outbuf[i] = (c & 0x3f) | 0x80; 00361 c >>= 6; 00362 } 00363 outbuf[0] = c | first; 00364 } 00365 00366 return len; 00367 } 00368 00369 /* 00370 * g_utf8_to_ucs4_fast: 00371 * @str: a UTF-8 encoded string 00372 * @len: the maximum length of @str to use, in bytes. If @len < 0, 00373 * then the string is nul-terminated. 00374 * @items_written: location to store the number of characters in the 00375 * result, or %NULL. 00376 * 00377 * Convert a string from UTF-8 to a 32-bit fixed width 00378 * representation as UCS-4, assuming valid UTF-8 input. 00379 * This function is roughly twice as fast as g_utf8_to_ucs4() 00380 * but does no error checking on the input. A trailing 0 character 00381 * will be added to the string after the converted text. 00382 * 00383 * Return value: a pointer to a newly allocated UCS-4 string. 00384 * This value must be freed with g_free(). 00385 **/ 00386 static gunichar * 00387 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written) 00388 { 00389 gunichar *result; 00390 gsize n_chars, i; 00391 const gchar *p; 00392 00393 g_return_val_if_fail (str != NULL, NULL); 00394 00395 p = str; 00396 n_chars = 0; 00397 if (len < 0) 00398 { 00399 while (*p) 00400 { 00401 p = g_utf8_next_char (p); 00402 ++n_chars; 00403 } 00404 } 00405 else 00406 { 00407 while (p < str + len && *p) 00408 { 00409 p = g_utf8_next_char (p); 00410 ++n_chars; 00411 } 00412 } 00413 00414 result = g_malloc (sizeof (gunichar) * (n_chars + 1)); 00415 if (!result) 00416 return NULL; 00417 00418 p = str; 00419 for (i = 0; i < n_chars; i++) 00420 { 00421 gunichar wc = (guchar) * p++; 00422 00423 if (wc < 0x80) 00424 { 00425 result[i] = wc; 00426 } 00427 else 00428 { 00429 gunichar mask = 0x40; 00430 00431 if (G_UNLIKELY ((wc & mask) == 0)) 00432 { 00433 /* It's an out-of-sequence 10xxxxxxx byte. 00434 * Rather than making an ugly hash of this and the next byte 00435 * and overrunning the buffer, it's more useful to treat it 00436 * with a replacement character */ 00437 result[i] = 0xfffd; 00438 continue; 00439 } 00440 00441 do 00442 { 00443 wc <<= 6; 00444 wc |= (guchar) (*p++) & 0x3f; 00445 mask <<= 5; 00446 } 00447 while ((wc & mask) != 0); 00448 00449 wc &= mask - 1; 00450 00451 result[i] = wc; 00452 } 00453 } 00454 result[i] = 0; 00455 00456 if (items_written) 00457 *items_written = i; 00458 00459 return result; 00460 } 00461 00462 /* 00463 * g_ucs4_to_utf8: 00464 * @str: a UCS-4 encoded string 00465 * @len: the maximum length (number of characters) of @str to use. 00466 * If @len < 0, then the string is nul-terminated. 00467 * @items_read: location to store number of characters read, or %NULL. 00468 * @items_written: location to store number of bytes written or %NULL. 00469 * The value here stored does not include the trailing 0 00470 * byte. 00471 * @error: location to store the error occurring, or %NULL to ignore 00472 * errors. Any of the errors in #GConvertError other than 00473 * %G_CONVERT_ERROR_NO_CONVERSION may occur. 00474 * 00475 * Convert a string from a 32-bit fixed width representation as UCS-4. 00476 * to UTF-8. The result will be terminated with a 0 byte. 00477 * 00478 * Return value: a pointer to a newly allocated UTF-8 string. 00479 * This value must be freed with g_free(). If an 00480 * error occurs, %NULL will be returned and 00481 * @error set. In that case, @items_read will be 00482 * set to the position of the first invalid input 00483 * character. 00484 **/ 00485 static gchar * 00486 g_ucs4_to_utf8 (const gunichar * str, 00487 glong len, 00488 glong * items_read, glong * items_written) 00489 { 00490 gint result_length; 00491 gchar *result = NULL; 00492 gchar *p; 00493 gint i; 00494 00495 result_length = 0; 00496 for (i = 0; len < 0 || i < len; i++) 00497 { 00498 if (!str[i]) 00499 break; 00500 00501 if (str[i] >= 0x80000000) 00502 goto err_out; 00503 00504 result_length += UTF8_LENGTH (str[i]); 00505 } 00506 00507 result = g_malloc (result_length + 1); 00508 if (!result) 00509 return NULL; 00510 p = result; 00511 00512 i = 0; 00513 while (p < result + result_length) 00514 p += g_unichar_to_utf8 (str[i++], p); 00515 00516 *p = '\0'; 00517 00518 if (items_written) 00519 *items_written = p - result; 00520 00521 err_out: 00522 if (items_read) 00523 *items_read = i; 00524 00525 return result; 00526 } 00527 00528 /* Code from GLIB gunidecomp.c starts here. */ 00529 00530 /* decomp.c - Character decomposition. 00531 * 00532 * Copyright (C) 1999, 2000 Tom Tromey 00533 * Copyright 2000 Red Hat, Inc. 00534 * 00535 * The Gnome Library is free software; you can redistribute it and/or 00536 * modify it under the terms of the GNU Lesser General Public License as 00537 * published by the Free Software Foundation; either version 2 of the 00538 * License, or (at your option) any later version. 00539 * 00540 * The Gnome Library is distributed in the hope that it will be useful, 00541 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00542 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00543 * Lesser General Public License for more details. 00544 * 00545 * You should have received a copy of the GNU Lesser General Public 00546 * License along with the Gnome Library; see the file COPYING.LIB. If not, 00547 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, 00548 * Boston, MA 02111-1307, USA. 00549 */ 00550 00551 #include "gunidecomp.h" 00552 #include "gunicomp.h" 00553 00554 #define CC_PART1(Page, Char) \ 00555 ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ 00556 ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \ 00557 : (cclass_data[combining_class_table_part1[Page]][Char])) 00558 00559 #define CC_PART2(Page, Char) \ 00560 ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ 00561 ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \ 00562 : (cclass_data[combining_class_table_part2[Page]][Char])) 00563 00564 #define COMBINING_CLASS(Char) \ 00565 (((Char) <= G_UNICODE_LAST_CHAR_PART1) \ 00566 ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \ 00567 : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \ 00568 ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \ 00569 : 0)) 00570 00571 /* constants for hangul syllable [de]composition */ 00572 #define SBase 0xAC00 00573 #define LBase 0x1100 00574 #define VBase 0x1161 00575 #define TBase 0x11A7 00576 #define LCount 19 00577 #define VCount 21 00578 #define TCount 28 00579 #define NCount (VCount * TCount) 00580 #define SCount (LCount * NCount) 00581 00582 /* 00583 * g_unicode_canonical_ordering: 00584 * @string: a UCS-4 encoded string. 00585 * @len: the maximum length of @string to use. 00586 * 00587 * Computes the canonical ordering of a string in-place. 00588 * This rearranges decomposed characters in the string 00589 * according to their combining classes. See the Unicode 00590 * manual for more information. 00591 **/ 00592 static void 00593 g_unicode_canonical_ordering (gunichar * string, gsize len) 00594 { 00595 gsize i; 00596 int swap = 1; 00597 00598 while (swap) 00599 { 00600 int last; 00601 swap = 0; 00602 last = COMBINING_CLASS (string[0]); 00603 for (i = 0; i < len - 1; ++i) 00604 { 00605 int next = COMBINING_CLASS (string[i + 1]); 00606 if (next != 0 && last > next) 00607 { 00608 gsize j; 00609 /* Percolate item leftward through string. */ 00610 for (j = i + 1; j > 0; --j) 00611 { 00612 gunichar t; 00613 if (COMBINING_CLASS (string[j - 1]) <= next) 00614 break; 00615 t = string[j]; 00616 string[j] = string[j - 1]; 00617 string[j - 1] = t; 00618 swap = 1; 00619 } 00620 /* We're re-entering the loop looking at the old 00621 character again. */ 00622 next = last; 00623 } 00624 last = next; 00625 } 00626 } 00627 } 00628 00629 /* http://www.unicode.org/unicode/reports/tr15/#Hangul 00630 * r should be null or have sufficient space. Calling with r == NULL will 00631 * only calculate the result_len; however, a buffer with space for three 00632 * characters will always be big enough. */ 00633 static void 00634 decompose_hangul (gunichar s, gunichar * r, gsize * result_len) 00635 { 00636 gint SIndex = s - SBase; 00637 gint TIndex = SIndex % TCount; 00638 00639 if (r) 00640 { 00641 r[0] = LBase + SIndex / NCount; 00642 r[1] = VBase + (SIndex % NCount) / TCount; 00643 } 00644 00645 if (TIndex) 00646 { 00647 if (r) 00648 r[2] = TBase + TIndex; 00649 *result_len = 3; 00650 } 00651 else 00652 *result_len = 2; 00653 } 00654 00655 /* returns a pointer to a null-terminated UTF-8 string */ 00656 static const gchar * 00657 find_decomposition (gunichar ch, gboolean compat) 00658 { 00659 int start = 0; 00660 int end = G_N_ELEMENTS (decomp_table); 00661 00662 if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch) 00663 { 00664 while (TRUE) 00665 { 00666 int half = (start + end) / 2; 00667 if (ch == decomp_table[half].ch) 00668 { 00669 int offset; 00670 00671 if (compat) 00672 { 00673 offset = decomp_table[half].compat_offset; 00674 if (offset == G_UNICODE_NOT_PRESENT_OFFSET) 00675 offset = decomp_table[half].canon_offset; 00676 } 00677 else 00678 { 00679 offset = decomp_table[half].canon_offset; 00680 if (offset == G_UNICODE_NOT_PRESENT_OFFSET) 00681 return NULL; 00682 } 00683 00684 return &(decomp_expansion_string[offset]); 00685 } 00686 else if (half == start) 00687 break; 00688 else if (ch > decomp_table[half].ch) 00689 start = half; 00690 else 00691 end = half; 00692 } 00693 } 00694 00695 return NULL; 00696 } 00697 00698 /* L,V => LV and LV,T => LVT */ 00699 static gboolean 00700 combine_hangul (gunichar a, gunichar b, gunichar * result) 00701 { 00702 gint LIndex = a - LBase; 00703 gint SIndex = a - SBase; 00704 00705 gint VIndex = b - VBase; 00706 gint TIndex = b - TBase; 00707 00708 if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount) 00709 { 00710 *result = SBase + (LIndex * VCount + VIndex) * TCount; 00711 return TRUE; 00712 } 00713 else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0 00714 && 0 < TIndex && TIndex < TCount) 00715 { 00716 *result = a + TIndex; 00717 return TRUE; 00718 } 00719 00720 return FALSE; 00721 } 00722 00723 #define CI(Page, Char) \ 00724 ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ 00725 ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \ 00726 : (compose_data[compose_table[Page]][Char])) 00727 00728 #define COMPOSE_INDEX(Char) \ 00729 (((Char >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff)) 00730 00731 static gboolean 00732 combine (gunichar a, gunichar b, gunichar * result) 00733 { 00734 gushort index_a, index_b; 00735 00736 if (combine_hangul (a, b, result)) 00737 return TRUE; 00738 00739 index_a = COMPOSE_INDEX (a); 00740 00741 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START) 00742 { 00743 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0]) 00744 { 00745 *result = 00746 compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1]; 00747 return TRUE; 00748 } 00749 else 00750 return FALSE; 00751 } 00752 00753 index_b = COMPOSE_INDEX (b); 00754 00755 if (index_b >= COMPOSE_SECOND_SINGLE_START) 00756 { 00757 if (a == 00758 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0]) 00759 { 00760 *result = 00761 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1]; 00762 return TRUE; 00763 } 00764 else 00765 return FALSE; 00766 } 00767 00768 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START 00769 && index_b >= COMPOSE_SECOND_START 00770 && index_b < COMPOSE_SECOND_SINGLE_START) 00771 { 00772 gunichar res = 00773 compose_array[index_a - COMPOSE_FIRST_START][index_b - 00774 COMPOSE_SECOND_START]; 00775 00776 if (res) 00777 { 00778 *result = res; 00779 return TRUE; 00780 } 00781 } 00782 00783 return FALSE; 00784 } 00785 00786 static gunichar * 00787 _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode) 00788 { 00789 gsize n_wc; 00790 gunichar *wc_buffer; 00791 const char *p; 00792 gsize last_start; 00793 gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD); 00794 gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC); 00795 00796 n_wc = 0; 00797 p = str; 00798 while ((max_len < 0 || p < str + max_len) && *p) 00799 { 00800 const gchar *decomp; 00801 gunichar wc = g_utf8_get_char (p); 00802 00803 if (wc >= SBase && wc < SBase + SCount) 00804 { 00805 gsize result_len; 00806 decompose_hangul (wc, NULL, &result_len); 00807 n_wc += result_len; 00808 } 00809 else 00810 { 00811 decomp = find_decomposition (wc, do_compat); 00812 00813 if (decomp) 00814 n_wc += g_utf8_strlen (decomp, -1); 00815 else 00816 n_wc++; 00817 } 00818 00819 p = g_utf8_next_char (p); 00820 } 00821 00822 wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1)); 00823 if (!wc_buffer) 00824 return NULL; 00825 00826 last_start = 0; 00827 n_wc = 0; 00828 p = str; 00829 while ((max_len < 0 || p < str + max_len) && *p) 00830 { 00831 gunichar wc = g_utf8_get_char (p); 00832 const gchar *decomp; 00833 int cc; 00834 gsize old_n_wc = n_wc; 00835 00836 if (wc >= SBase && wc < SBase + SCount) 00837 { 00838 gsize result_len; 00839 decompose_hangul (wc, wc_buffer + n_wc, &result_len); 00840 n_wc += result_len; 00841 } 00842 else 00843 { 00844 decomp = find_decomposition (wc, do_compat); 00845 00846 if (decomp) 00847 { 00848 const char *pd; 00849 for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd)) 00850 wc_buffer[n_wc++] = g_utf8_get_char (pd); 00851 } 00852 else 00853 wc_buffer[n_wc++] = wc; 00854 } 00855 00856 if (n_wc > 0) 00857 { 00858 cc = COMBINING_CLASS (wc_buffer[old_n_wc]); 00859 00860 if (cc == 0) 00861 { 00862 g_unicode_canonical_ordering (wc_buffer + last_start, 00863 n_wc - last_start); 00864 last_start = old_n_wc; 00865 } 00866 } 00867 00868 p = g_utf8_next_char (p); 00869 } 00870 00871 if (n_wc > 0) 00872 { 00873 g_unicode_canonical_ordering (wc_buffer + last_start, 00874 n_wc - last_start); 00875 last_start = n_wc; 00876 } 00877 00878 wc_buffer[n_wc] = 0; 00879 00880 /* All decomposed and reordered */ 00881 00882 if (do_compose && n_wc > 0) 00883 { 00884 gsize i, j; 00885 int last_cc = 0; 00886 last_start = 0; 00887 00888 for (i = 0; i < n_wc; i++) 00889 { 00890 int cc = COMBINING_CLASS (wc_buffer[i]); 00891 00892 if (i > 0 && 00893 (last_cc == 0 || last_cc != cc) && 00894 combine (wc_buffer[last_start], wc_buffer[i], 00895 &wc_buffer[last_start])) 00896 { 00897 for (j = i + 1; j < n_wc; j++) 00898 wc_buffer[j - 1] = wc_buffer[j]; 00899 n_wc--; 00900 i--; 00901 00902 if (i == last_start) 00903 last_cc = 0; 00904 else 00905 last_cc = COMBINING_CLASS (wc_buffer[i - 1]); 00906 00907 continue; 00908 } 00909 00910 if (cc == 0) 00911 last_start = i; 00912 00913 last_cc = cc; 00914 } 00915 } 00916 00917 wc_buffer[n_wc] = 0; 00918 00919 return wc_buffer; 00920 } 00921 00922 /* 00923 * g_utf8_normalize: 00924 * @str: a UTF-8 encoded string. 00925 * @len: length of @str, in bytes, or -1 if @str is nul-terminated. 00926 * @mode: the type of normalization to perform. 00927 * 00928 * Converts a string into canonical form, standardizing 00929 * such issues as whether a character with an accent 00930 * is represented as a base character and combining 00931 * accent or as a single precomposed character. The 00932 * string has to be valid UTF-8, otherwise %NULL is 00933 * returned. You should generally call g_utf8_normalize() 00934 * before comparing two Unicode strings. 00935 * 00936 * The normalization mode %G_NORMALIZE_DEFAULT only 00937 * standardizes differences that do not affect the 00938 * text content, such as the above-mentioned accent 00939 * representation. %G_NORMALIZE_ALL also standardizes 00940 * the "compatibility" characters in Unicode, such 00941 * as SUPERSCRIPT THREE to the standard forms 00942 * (in this case DIGIT THREE). Formatting information 00943 * may be lost but for most text operations such 00944 * characters should be considered the same. 00945 * 00946 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE 00947 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL, 00948 * but returned a result with composed forms rather 00949 * than a maximally decomposed form. This is often 00950 * useful if you intend to convert the string to 00951 * a legacy encoding or pass it to a system with 00952 * less capable Unicode handling. 00953 * 00954 * Return value: a newly allocated string, that is the 00955 * normalized form of @str, or %NULL if @str is not 00956 * valid UTF-8. 00957 **/ 00958 static gchar * 00959 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode) 00960 { 00961 gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode); 00962 gchar *result; 00963 00964 result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL); 00965 g_free (result_wc); 00966 00967 return result; 00968 } 00969 00970 /* Public Libidn API starts here. */ 00971 00982 uint32_t 00983 stringprep_utf8_to_unichar (const char *p) 00984 { 00985 return g_utf8_get_char (p); 00986 } 00987 00999 int 01000 stringprep_unichar_to_utf8 (uint32_t c, char *outbuf) 01001 { 01002 return g_unichar_to_utf8 (c, outbuf); 01003 } 01004 01020 uint32_t * 01021 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written) 01022 { 01023 return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written); 01024 } 01025 01043 char * 01044 stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len, 01045 size_t * items_read, size_t * items_written) 01046 { 01047 return g_ucs4_to_utf8 (str, len, (glong *) items_read, 01048 (glong *) items_written); 01049 } 01050 01073 char * 01074 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len) 01075 { 01076 return g_utf8_normalize (str, len, G_NORMALIZE_NFKC); 01077 } 01078 01090 uint32_t * 01091 stringprep_ucs4_nfkc_normalize (const uint32_t * str, ssize_t len) 01092 { 01093 char *p; 01094 uint32_t *result_wc; 01095 01096 p = stringprep_ucs4_to_utf8 (str, len, 0, 0); 01097 result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC); 01098 free (p); 01099 01100 return result_wc; 01101 }
1.7.6.1