Branch data Line data Source code
1 : : /* Determine a canonical name for the current locale's character encoding.
2 : :
3 : : Copyright (C) 2000-2006, 2008-2012 Free Software Foundation, Inc.
4 : :
5 : : This program is free software; you can redistribute it and/or modify
6 : : it under the terms of the GNU General Public License as published by
7 : : the Free Software Foundation; either version 3, or (at your option)
8 : : any later version.
9 : :
10 : : This program is distributed in the hope that it will be useful,
11 : : but WITHOUT ANY WARRANTY; without even the implied warranty of
12 : : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 : : GNU General Public License for more details.
14 : :
15 : : You should have received a copy of the GNU General Public License along
16 : : with this program; if not, see <http://www.gnu.org/licenses/>. */
17 : :
18 : : /* Written by Bruno Haible <bruno@clisp.org>. */
19 : :
20 : : #include <config.h>
21 : :
22 : : /* Specification. */
23 : : #include "localcharset.h"
24 : :
25 : : #include <fcntl.h>
26 : : #include <stddef.h>
27 : : #include <stdio.h>
28 : : #include <string.h>
29 : : #include <stdlib.h>
30 : :
31 : : #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
32 : : # define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
33 : : #endif
34 : :
35 : : #if defined _WIN32 || defined __WIN32__
36 : : # define WINDOWS_NATIVE
37 : : #endif
38 : :
39 : : #if defined __EMX__
40 : : /* Assume EMX program runs on OS/2, even if compiled under DOS. */
41 : : # ifndef OS2
42 : : # define OS2
43 : : # endif
44 : : #endif
45 : :
46 : : #if !defined WINDOWS_NATIVE
47 : : # include <unistd.h>
48 : : # if HAVE_LANGINFO_CODESET
49 : : # include <langinfo.h>
50 : : # else
51 : : # if 0 /* see comment below */
52 : : # include <locale.h>
53 : : # endif
54 : : # endif
55 : : # ifdef __CYGWIN__
56 : : # define WIN32_LEAN_AND_MEAN
57 : : # include <windows.h>
58 : : # endif
59 : : #elif defined WINDOWS_NATIVE
60 : : # define WIN32_LEAN_AND_MEAN
61 : : # include <windows.h>
62 : : #endif
63 : : #if defined OS2
64 : : # define INCL_DOS
65 : : # include <os2.h>
66 : : #endif
67 : :
68 : : #if ENABLE_RELOCATABLE
69 : : # include "relocatable.h"
70 : : #else
71 : : # define relocate(pathname) (pathname)
72 : : #endif
73 : :
74 : : /* Get LIBDIR. */
75 : : #ifndef LIBDIR
76 : : # include "configmake.h"
77 : : #endif
78 : :
79 : : /* Define O_NOFOLLOW to 0 on platforms where it does not exist. */
80 : : #ifndef O_NOFOLLOW
81 : : # define O_NOFOLLOW 0
82 : : #endif
83 : :
84 : : #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
85 : : /* Native Windows, Cygwin, OS/2, DOS */
86 : : # define ISSLASH(C) ((C) == '/' || (C) == '\\')
87 : : #endif
88 : :
89 : : #ifndef DIRECTORY_SEPARATOR
90 : : # define DIRECTORY_SEPARATOR '/'
91 : : #endif
92 : :
93 : : #ifndef ISSLASH
94 : : # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
95 : : #endif
96 : :
97 : : #if HAVE_DECL_GETC_UNLOCKED
98 : : # undef getc
99 : : # define getc getc_unlocked
100 : : #endif
101 : :
/* Cache for the charset alias table used by get_charset_aliases.

   The pointer itself is 'volatile' as a precaution against a possible
   multithread problem: if two threads initialize the cache at the same
   time, both compute the same value, and everything is fine as long as
   each of the two stores to 'charset_aliases' is atomic.  What happens if
   the two stores get mixed is unknown.  */
#if __STDC__ != 1
# define volatile /* empty */
#endif
/* NULL as long as the charset.alias data has not been read yet; afterwards
   it points to the data, laid out as:
   ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
static const char * volatile charset_aliases = NULL;
115 : :
116 : : /* Return a pointer to the contents of the charset.alias file. */
117 : : static const char *
118 : 88 : get_charset_aliases (void)
119 : : {
120 : : const char *cp;
121 : :
122 : 88 : cp = charset_aliases;
123 [ + + ]: 88 : if (cp == NULL)
124 : : {
125 : : #if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__)
126 : : const char *dir;
127 : 1 : const char *base = "charset.alias";
128 : : char *file_name;
129 : :
130 : : /* Make it possible to override the charset.alias location. This is
131 : : necessary for running the testsuite before "make install". */
132 : 1 : dir = getenv ("CHARSETALIASDIR");
133 [ - + ][ # # ]: 1 : if (dir == NULL || dir[0] == '\0')
134 : 1 : dir = relocate (LIBDIR);
135 : :
136 : : /* Concatenate dir and base into freshly allocated file_name. */
137 : : {
138 : 1 : size_t dir_len = strlen (dir);
139 : 1 : size_t base_len = strlen (base);
140 [ + - ][ + - ]: 1 : int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
141 : 1 : file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
142 [ + - ]: 1 : if (file_name != NULL)
143 : : {
144 : 1 : memcpy (file_name, dir, dir_len);
145 [ + - ]: 1 : if (add_slash)
146 : 1 : file_name[dir_len] = DIRECTORY_SEPARATOR;
147 : 1 : memcpy (file_name + dir_len + add_slash, base, base_len + 1);
148 : : }
149 : : }
150 : :
151 [ - + ]: 1 : if (file_name == NULL)
152 : : /* Out of memory. Treat the file as empty. */
153 : 0 : cp = "";
154 : : else
155 : : {
156 : : int fd;
157 : :
158 : : /* Open the file. Reject symbolic links on platforms that support
159 : : O_NOFOLLOW. This is a security feature. Without it, an attacker
160 : : could retrieve parts of the contents (namely, the tail of the
161 : : first line that starts with "* ") of an arbitrary file by placing
162 : : a symbolic link to that file under the name "charset.alias" in
163 : : some writable directory and defining the environment variable
164 : : CHARSETALIASDIR to point to that directory. */
165 : 1 : fd = open (file_name,
166 : : O_RDONLY | (HAVE_WORKING_O_NOFOLLOW ? O_NOFOLLOW : 0));
167 [ + - ]: 1 : if (fd < 0)
168 : : /* File not found. Treat it as empty. */
169 : 1 : cp = "";
170 : : else
171 : : {
172 : : FILE *fp;
173 : :
174 : 0 : fp = fdopen (fd, "r");
175 [ # # ]: 0 : if (fp == NULL)
176 : : {
177 : : /* Out of memory. Treat the file as empty. */
178 : 0 : close (fd);
179 : 0 : cp = "";
180 : : }
181 : : else
182 : : {
183 : : /* Parse the file's contents. */
184 : 0 : char *res_ptr = NULL;
185 : 0 : size_t res_size = 0;
186 : :
187 : : for (;;)
188 : : {
189 : : int c;
190 : : char buf1[50+1];
191 : : char buf2[50+1];
192 : : size_t l1, l2;
193 : : char *old_res_ptr;
194 : :
195 : 0 : c = getc (fp);
196 [ # # ]: 0 : if (c == EOF)
197 : 0 : break;
198 [ # # ][ # # ]: 0 : if (c == '\n' || c == ' ' || c == '\t')
[ # # ]
199 : 0 : continue;
200 [ # # ]: 0 : if (c == '#')
201 : : {
202 : : /* Skip comment, to end of line. */
203 : : do
204 : 0 : c = getc (fp);
205 [ # # ][ # # ]: 0 : while (!(c == EOF || c == '\n'));
206 [ # # ]: 0 : if (c == EOF)
207 : 0 : break;
208 : 0 : continue;
209 : : }
210 : 0 : ungetc (c, fp);
211 [ # # ]: 0 : if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
212 : 0 : break;
213 : 0 : l1 = strlen (buf1);
214 : 0 : l2 = strlen (buf2);
215 : 0 : old_res_ptr = res_ptr;
216 [ # # ]: 0 : if (res_size == 0)
217 : : {
218 : 0 : res_size = l1 + 1 + l2 + 1;
219 : 0 : res_ptr = (char *) malloc (res_size + 1);
220 : : }
221 : : else
222 : : {
223 : 0 : res_size += l1 + 1 + l2 + 1;
224 : 0 : res_ptr = (char *) realloc (res_ptr, res_size + 1);
225 : : }
226 [ # # ]: 0 : if (res_ptr == NULL)
227 : : {
228 : : /* Out of memory. */
229 : 0 : res_size = 0;
230 : 0 : free (old_res_ptr);
231 : 0 : break;
232 : : }
233 : 0 : strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
234 : 0 : strcpy (res_ptr + res_size - (l2 + 1), buf2);
235 : 0 : }
236 : 0 : fclose (fp);
237 [ # # ]: 0 : if (res_size == 0)
238 : 0 : cp = "";
239 : : else
240 : : {
241 : 0 : *(res_ptr + res_size) = '\0';
242 : 0 : cp = res_ptr;
243 : : }
244 : : }
245 : : }
246 : :
247 : 1 : free (file_name);
248 : : }
249 : :
250 : : #else
251 : :
252 : : # if defined DARWIN7
253 : : /* To avoid the trouble of installing a file that is shared by many
254 : : GNU packages -- many packaging systems have problems with this --,
255 : : simply inline the aliases here. */
256 : : cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
257 : : "ISO8859-2" "\0" "ISO-8859-2" "\0"
258 : : "ISO8859-4" "\0" "ISO-8859-4" "\0"
259 : : "ISO8859-5" "\0" "ISO-8859-5" "\0"
260 : : "ISO8859-7" "\0" "ISO-8859-7" "\0"
261 : : "ISO8859-9" "\0" "ISO-8859-9" "\0"
262 : : "ISO8859-13" "\0" "ISO-8859-13" "\0"
263 : : "ISO8859-15" "\0" "ISO-8859-15" "\0"
264 : : "KOI8-R" "\0" "KOI8-R" "\0"
265 : : "KOI8-U" "\0" "KOI8-U" "\0"
266 : : "CP866" "\0" "CP866" "\0"
267 : : "CP949" "\0" "CP949" "\0"
268 : : "CP1131" "\0" "CP1131" "\0"
269 : : "CP1251" "\0" "CP1251" "\0"
270 : : "eucCN" "\0" "GB2312" "\0"
271 : : "GB2312" "\0" "GB2312" "\0"
272 : : "eucJP" "\0" "EUC-JP" "\0"
273 : : "eucKR" "\0" "EUC-KR" "\0"
274 : : "Big5" "\0" "BIG5" "\0"
275 : : "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
276 : : "GBK" "\0" "GBK" "\0"
277 : : "GB18030" "\0" "GB18030" "\0"
278 : : "SJIS" "\0" "SHIFT_JIS" "\0"
279 : : "ARMSCII-8" "\0" "ARMSCII-8" "\0"
280 : : "PT154" "\0" "PT154" "\0"
281 : : /*"ISCII-DEV" "\0" "?" "\0"*/
282 : : "*" "\0" "UTF-8" "\0";
283 : : # endif
284 : :
285 : : # if defined VMS
286 : : /* To avoid the troubles of an extra file charset.alias_vms in the
287 : : sources of many GNU packages, simply inline the aliases here. */
288 : : /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
289 : : "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
290 : : section 10.7 "Handling Different Character Sets". */
291 : : cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
292 : : "ISO8859-2" "\0" "ISO-8859-2" "\0"
293 : : "ISO8859-5" "\0" "ISO-8859-5" "\0"
294 : : "ISO8859-7" "\0" "ISO-8859-7" "\0"
295 : : "ISO8859-8" "\0" "ISO-8859-8" "\0"
296 : : "ISO8859-9" "\0" "ISO-8859-9" "\0"
297 : : /* Japanese */
298 : : "eucJP" "\0" "EUC-JP" "\0"
299 : : "SJIS" "\0" "SHIFT_JIS" "\0"
300 : : "DECKANJI" "\0" "DEC-KANJI" "\0"
301 : : "SDECKANJI" "\0" "EUC-JP" "\0"
302 : : /* Chinese */
303 : : "eucTW" "\0" "EUC-TW" "\0"
304 : : "DECHANYU" "\0" "DEC-HANYU" "\0"
305 : : "DECHANZI" "\0" "GB2312" "\0"
306 : : /* Korean */
307 : : "DECKOREAN" "\0" "EUC-KR" "\0";
308 : : # endif
309 : :
310 : : # if defined WINDOWS_NATIVE || defined __CYGWIN__
311 : : /* To avoid the troubles of installing a separate file in the same
312 : : directory as the DLL and of retrieving the DLL's directory at
313 : : runtime, simply inline the aliases here. */
314 : :
315 : : cp = "CP936" "\0" "GBK" "\0"
316 : : "CP1361" "\0" "JOHAB" "\0"
317 : : "CP20127" "\0" "ASCII" "\0"
318 : : "CP20866" "\0" "KOI8-R" "\0"
319 : : "CP20936" "\0" "GB2312" "\0"
320 : : "CP21866" "\0" "KOI8-RU" "\0"
321 : : "CP28591" "\0" "ISO-8859-1" "\0"
322 : : "CP28592" "\0" "ISO-8859-2" "\0"
323 : : "CP28593" "\0" "ISO-8859-3" "\0"
324 : : "CP28594" "\0" "ISO-8859-4" "\0"
325 : : "CP28595" "\0" "ISO-8859-5" "\0"
326 : : "CP28596" "\0" "ISO-8859-6" "\0"
327 : : "CP28597" "\0" "ISO-8859-7" "\0"
328 : : "CP28598" "\0" "ISO-8859-8" "\0"
329 : : "CP28599" "\0" "ISO-8859-9" "\0"
330 : : "CP28605" "\0" "ISO-8859-15" "\0"
331 : : "CP38598" "\0" "ISO-8859-8" "\0"
332 : : "CP51932" "\0" "EUC-JP" "\0"
333 : : "CP51936" "\0" "GB2312" "\0"
334 : : "CP51949" "\0" "EUC-KR" "\0"
335 : : "CP51950" "\0" "EUC-TW" "\0"
336 : : "CP54936" "\0" "GB18030" "\0"
337 : : "CP65001" "\0" "UTF-8" "\0";
338 : : # endif
339 : : #endif
340 : :
341 : 1 : charset_aliases = cp;
342 : : }
343 : :
344 : 88 : return cp;
345 : : }
346 : :
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.
   The result must not be freed by the caller: depending on the platform it
   points into a static buffer, into the environment, into the alias table
   or to a string literal.
   If the canonical name cannot be determined, the result is a non-canonical
   name.  The result is never NULL and never the empty string.  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset (void)
{
  const char *codeset;
  const char *aliases;

#if !(defined WINDOWS_NATIVE || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

#  ifdef __CYGWIN__
  /* Cygwin < 1.7 does not have locales.  nl_langinfo (CODESET) always
     returns "US-ASCII".  Return the suffix of the locale name from the
     environment variables (if present) or the codepage as a number.  */
  if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
    {
      const char *locale;
      static char buf[2 + 10 + 1];

      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
      if (locale != NULL && locale[0] != '\0')
        {
          /* If the locale name contains an encoding after the dot, return
             it.  */
          const char *dot = strchr (locale, '.');

          if (dot != NULL)
            {
              const char *modifier;

              dot++;
              /* Look for the possible @... trailer and remove it, if any.  */
              modifier = strchr (dot, '@');
              if (modifier == NULL)
                {
                  /* Never return an empty encoding name (e.g. for
                     "en_US."); fall through to the codepage instead.  */
                  if (dot[0] != '\0')
                    return dot;
                }
              else
                {
                  size_t len = (size_t) (modifier - dot);

                  /* Likewise skip an empty name ("en_US.@euro"), and a
                     name that does not fit into buf.  */
                  if (len > 0 && len < sizeof (buf))
                    {
                      memcpy (buf, dot, len);
                      buf[len] = '\0';
                      return buf;
                    }
                }
            }
        }

      /* The Windows API has a function returning the locale's codepage as a
         number: GetACP().  This encoding is used by Cygwin, unless the user
         has set the environment variable CYGWIN=codepage:oem (which very few
         people do).
         Output directed to console windows needs to be converted (to
         GetOEMCP() if the console is using a raster font, or to
         GetConsoleOutputCP() if it is using a TrueType font).  Cygwin does
         this conversion transparently (see winsup/cygwin/fhandler_console.cc),
         converting to GetConsoleOutputCP().  This leads to correct results,
         except when SetConsoleOutputCP has been called and a raster font is
         in use.  */
      sprintf (buf, "CP%u", GetACP ());
      codeset = buf;
    }
#  endif

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1".  On others,
     you set it to "language_COUNTRY.charset".  In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WINDOWS_NATIVE

  static char buf[2 + 10 + 1];

  /* The Windows API has a function returning the locale's codepage as a
     number: GetACP().
     When the output goes to a console window, it needs to be provided in
     GetOEMCP() encoding if the console is using a raster font, or in
     GetConsoleOutputCP() encoding if it is using a TrueType font.
     But in GUI programs and for output sent to files and pipes, GetACP()
     encoding is the best bet.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            {
              /* Never return an empty encoding name (e.g. for "en_US.");
                 resolve the whole locale name through the aliases
                 instead.  */
              if (dot[0] != '\0')
                return dot;
            }
          else
            {
              size_t len = (size_t) (modifier - dot);

              /* Likewise skip an empty name ("en_US.@euro"), and a name
                 that does not fit into buf.  */
              if (len > 0 && len < sizeof (buf))
                {
                  memcpy (buf, dot, len);
                  buf[len] = '\0';
                  return buf;
                }
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* cp[0] is a ULONG; convert it explicitly so that the argument
             type matches the %u conversion.  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table is a sequence of ALIAS '\0' CANONICAL '\0'
     pairs ended by an empty string; an alias of "*" matches anything.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        /* The canonical name directly follows the alias.  */
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  /* Don't return an empty string.  GNU libc and GNU libiconv interpret
     the empty string as denoting "the locale's character encoding",
     thus GNU libiconv would call this function a second time.  */
  if (codeset[0] == '\0')
    codeset = "ASCII";

  return codeset;
}
|