Line data Source code
1 : /* Charset conversion.
2 : Copyright (C) 2001-2007, 2010-2020 Free Software Foundation, Inc.
3 : Written by Bruno Haible and Simon Josefsson.
4 :
5 : This program is free software; you can redistribute it and/or modify
6 : it under the terms of the GNU Lesser General Public License as published by
7 : the Free Software Foundation; either version 2.1, or (at your option)
8 : any later version.
9 :
10 : This program is distributed in the hope that it will be useful,
11 : but WITHOUT ANY WARRANTY; without even the implied warranty of
12 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 : GNU Lesser General Public License for more details.
14 :
15 : You should have received a copy of the GNU Lesser General Public License
16 : along with this program; if not, see <https://www.gnu.org/licenses/>. */
17 :
18 : #include <config.h>
19 :
20 : /* Specification. */
21 : #include "striconv.h"
22 :
23 : #include <errno.h>
24 : #include <stdlib.h>
25 : #include <string.h>
26 :
27 : #if HAVE_ICONV
28 : # include <iconv.h>
29 : /* Get MB_LEN_MAX, CHAR_BIT. */
30 : # include <limits.h>
31 : #endif
32 :
33 : #include "c-strcase.h"
34 :
35 : #ifndef SIZE_MAX
36 : # define SIZE_MAX ((size_t) -1)
37 : #endif
38 :
39 :
40 : #if HAVE_ICONV
41 :
/* Convert the SRCLEN bytes starting at SRC through the conversion
   descriptor CD.

   The work is done in two passes over the input: the first pass converts
   into a scratch buffer only to learn how many output bytes are needed,
   the second pass converts into the final buffer.

   On entry, *RESULTP may be NULL, or may point to a caller-supplied buffer
   whose size is *LENGTHP.  If the converted text is empty, *LENGTHP is set
   to 0 and *RESULTP is left untouched.  Otherwise, if *RESULTP is non-NULL
   and *LENGTHP is large enough, the output is stored in that buffer; else
   a block obtained from malloc() is stored in *RESULTP.  In both cases
   *LENGTHP receives the number of output bytes.

   An incomplete multibyte sequence at the end of the input (iconv()
   failing with EINVAL) is not treated as an error: the conversion stops in
   front of it and everything converted before it is returned.

   Return value: 0 if successful, otherwise -1 with errno set.  On failure
   *RESULTP and *LENGTHP are not modified.  */
int
mem_cd_iconv (const char *src, size_t srclen, iconv_t cd,
              char **resultp, size_t *lengthp)
{
# define tmpbufsize 4096
  size_t length;
  char *result;

  /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
# if defined _LIBICONV_VERSION \
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
          || defined __sun)
  /* Set to the initial state.  */
  iconv (cd, NULL, NULL, NULL, NULL);
# endif

  /* Determine the length we need.  */
  {
    size_t count = 0;
    /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
       libiconv's UCS-4-INTERNAL encoding.  */
    union { unsigned int align; char buf[tmpbufsize]; } tmp;
# define tmpbuf tmp.buf
    const char *inptr = src;
    size_t insize = srclen;

    while (insize > 0)
      {
        char *outptr = tmpbuf;
        size_t outsize = tmpbufsize;
        size_t res = iconv (cd,
                            (ICONV_CONST char **) &inptr, &insize,
                            &outptr, &outsize);

        if (res == (size_t)(-1))
          {
            if (errno == E2BIG)
              /* The scratch buffer is full.  Count this chunk below and
                 continue with the rest of the input.  */
              ;
            else if (errno == EINVAL)
              {
                /* Incomplete multibyte sequence at the end of the input.
                   Whatever this call converted before reaching it is part
                   of the output, so it must be counted; otherwise the
                   second pass would run with a buffer that is too small.  */
                count += outptr - tmpbuf;
                break;
              }
            else
              return -1;
          }
# if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
        /* Irix iconv() inserts a NUL byte if it cannot convert.
           NetBSD iconv() inserts a question mark if it cannot convert.
           Only GNU libiconv and GNU libc are known to prefer to fail rather
           than doing a lossy conversion.  */
        else if (res > 0)
          {
            errno = EILSEQ;
            return -1;
          }
# endif
        count += outptr - tmpbuf;
      }
    /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
# if defined _LIBICONV_VERSION \
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
          || defined __sun)
    {
      /* Account for whatever the descriptor emits when it is flushed.  */
      char *outptr = tmpbuf;
      size_t outsize = tmpbufsize;
      size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);

      if (res == (size_t)(-1))
        return -1;
      count += outptr - tmpbuf;
    }
# endif
    length = count;
# undef tmpbuf
  }

  if (length == 0)
    {
      /* Empty output: no buffer is needed and *RESULTP stays as it was.  */
      *lengthp = 0;
      return 0;
    }
  if (*resultp != NULL && *lengthp >= length)
    /* The caller-supplied buffer is large enough.  */
    result = *resultp;
  else
    {
      result = (char *) malloc (length);
      if (result == NULL)
        {
          errno = ENOMEM;
          return -1;
        }
    }

  /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
# if defined _LIBICONV_VERSION \
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
          || defined __sun)
  /* Return to the initial state.  */
  iconv (cd, NULL, NULL, NULL, NULL);
# endif

  /* Do the conversion for real.  */
  {
    const char *inptr = src;
    size_t insize = srclen;
    char *outptr = result;
    size_t outsize = length;

    while (insize > 0)
      {
        size_t res = iconv (cd,
                            (ICONV_CONST char **) &inptr, &insize,
                            &outptr, &outsize);

        if (res == (size_t)(-1))
          {
            if (errno == EINVAL)
              /* Same incomplete trailing sequence as in the first pass.  */
              break;
            else
              goto fail;
          }
# if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
        /* Irix iconv() inserts a NUL byte if it cannot convert.
           NetBSD iconv() inserts a question mark if it cannot convert.
           Only GNU libiconv and GNU libc are known to prefer to fail rather
           than doing a lossy conversion.  */
        else if (res > 0)
          {
            errno = EILSEQ;
            goto fail;
          }
# endif
      }
    /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
# if defined _LIBICONV_VERSION \
     || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
          || defined __sun)
    {
      size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);

      if (res == (size_t)(-1))
        goto fail;
    }
# endif
    /* Both passes must have produced exactly the same number of bytes.  */
    if (outsize != 0)
      abort ();
  }

  *resultp = result;
  *lengthp = length;

  return 0;

 fail:
  {
    /* Free only a buffer allocated here, never the caller's; keep the
       errno of the failed conversion across free().  */
    if (result != *resultp)
      {
        int saved_errno = errno;
        free (result);
        errno = saved_errno;
      }
    return -1;
  }
# undef tmpbufsize
}
205 :
206 : char *
207 3617 : str_cd_iconv (const char *src, iconv_t cd)
208 : {
209 : /* For most encodings, a trailing NUL byte in the input will be converted
210 : to a trailing NUL byte in the output. But not for UTF-7. So that this
211 : function is usable for UTF-7, we have to exclude the NUL byte from the
212 : conversion and add it by hand afterwards. */
213 : # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
214 : /* Irix iconv() inserts a NUL byte if it cannot convert.
215 : NetBSD iconv() inserts a question mark if it cannot convert.
216 : Only GNU libiconv and GNU libc are known to prefer to fail rather
217 : than doing a lossy conversion. For other iconv() implementations,
218 : we have to look at the number of irreversible conversions returned;
219 : but this information is lost when iconv() returns for an E2BIG reason.
220 : Therefore we cannot use the second, faster algorithm. */
221 :
222 : char *result = NULL;
223 : size_t length = 0;
224 : int retval = mem_cd_iconv (src, strlen (src), cd, &result, &length);
225 : char *final_result;
226 :
227 : if (retval < 0)
228 : {
229 : if (result != NULL)
230 : abort ();
231 : return NULL;
232 : }
233 :
234 : /* Add the terminating NUL byte. */
235 : final_result =
236 : (result != NULL ? realloc (result, length + 1) : malloc (length + 1));
237 : if (final_result == NULL)
238 : {
239 : free (result);
240 : errno = ENOMEM;
241 : return NULL;
242 : }
243 : final_result[length] = '\0';
244 :
245 : return final_result;
246 :
247 : # else
248 : /* This algorithm is likely faster than the one above. But it may produce
249 : iconv() returns for an E2BIG reason, when the output size guess is too
250 : small. Therefore it can only be used when we don't need the number of
251 : irreversible conversions performed. */
252 : char *result;
253 : size_t result_size;
254 : size_t length;
255 3617 : const char *inptr = src;
256 3617 : size_t inbytes_remaining = strlen (src);
257 :
258 : /* Make a guess for the worst-case output size, in order to avoid a
259 : realloc. It's OK if the guess is wrong as long as it is not zero and
260 : doesn't lead to an integer overflow. */
261 3617 : result_size = inbytes_remaining;
262 : {
263 3617 : size_t approx_sqrt_SIZE_MAX = SIZE_MAX >> (sizeof (size_t) * CHAR_BIT / 2);
264 3617 : if (result_size <= approx_sqrt_SIZE_MAX / MB_LEN_MAX)
265 3617 : result_size *= MB_LEN_MAX;
266 : }
267 3617 : result_size += 1; /* for the terminating NUL */
268 :
269 3617 : result = (char *) malloc (result_size);
270 3617 : if (result == NULL)
271 : {
272 0 : errno = ENOMEM;
273 0 : return NULL;
274 : }
275 :
276 : /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
277 : # if defined _LIBICONV_VERSION \
278 : || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
279 : || defined __sun)
280 : /* Set to the initial state. */
281 3617 : iconv (cd, NULL, NULL, NULL, NULL);
282 : # endif
283 :
284 : /* Do the conversion. */
285 : {
286 3617 : char *outptr = result;
287 3617 : size_t outbytes_remaining = result_size - 1;
288 :
289 : for (;;)
290 0 : {
291 : /* Here inptr + inbytes_remaining = src + strlen (src),
292 : outptr + outbytes_remaining = result + result_size - 1. */
293 3617 : size_t res = iconv (cd,
294 : (ICONV_CONST char **) &inptr, &inbytes_remaining,
295 : &outptr, &outbytes_remaining);
296 :
297 3617 : if (res == (size_t)(-1))
298 : {
299 1776 : if (errno == EINVAL)
300 2 : break;
301 1774 : else if (errno == E2BIG)
302 : {
303 0 : size_t used = outptr - result;
304 0 : size_t newsize = result_size * 2;
305 : char *newresult;
306 :
307 0 : if (!(newsize > result_size))
308 : {
309 0 : errno = ENOMEM;
310 1774 : goto failed;
311 : }
312 0 : newresult = (char *) realloc (result, newsize);
313 0 : if (newresult == NULL)
314 : {
315 0 : errno = ENOMEM;
316 0 : goto failed;
317 : }
318 0 : result = newresult;
319 0 : result_size = newsize;
320 0 : outptr = result + used;
321 0 : outbytes_remaining = result_size - 1 - used;
322 : }
323 : else
324 1774 : goto failed;
325 : }
326 : else
327 1841 : break;
328 : }
329 : /* Avoid glibc-2.1 bug and Solaris 2.7 bug. */
330 : # if defined _LIBICONV_VERSION \
331 : || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
332 : || defined __sun)
333 : for (;;)
334 0 : {
335 : /* Here outptr + outbytes_remaining = result + result_size - 1. */
336 1843 : size_t res = iconv (cd, NULL, NULL, &outptr, &outbytes_remaining);
337 :
338 1843 : if (res == (size_t)(-1))
339 : {
340 0 : if (errno == E2BIG)
341 : {
342 0 : size_t used = outptr - result;
343 0 : size_t newsize = result_size * 2;
344 : char *newresult;
345 :
346 0 : if (!(newsize > result_size))
347 : {
348 0 : errno = ENOMEM;
349 0 : goto failed;
350 : }
351 0 : newresult = (char *) realloc (result, newsize);
352 0 : if (newresult == NULL)
353 : {
354 0 : errno = ENOMEM;
355 0 : goto failed;
356 : }
357 0 : result = newresult;
358 0 : result_size = newsize;
359 0 : outptr = result + used;
360 0 : outbytes_remaining = result_size - 1 - used;
361 : }
362 : else
363 0 : goto failed;
364 : }
365 : else
366 1843 : break;
367 : }
368 : # endif
369 :
370 : /* Add the terminating NUL byte. */
371 1843 : *outptr++ = '\0';
372 :
373 1843 : length = outptr - result;
374 : }
375 :
376 : /* Give away unused memory. */
377 1843 : if (length < result_size)
378 : {
379 1843 : char *smaller_result = (char *) realloc (result, length);
380 :
381 1843 : if (smaller_result != NULL)
382 1843 : result = smaller_result;
383 : }
384 :
385 1843 : return result;
386 :
387 1774 : failed:
388 : {
389 1774 : int saved_errno = errno;
390 1774 : free (result);
391 1774 : errno = saved_errno;
392 1774 : return NULL;
393 : }
394 :
395 : # endif
396 : }
397 :
398 : #endif
399 :
/* Convert the NUL-terminated string SRC from encoding FROM_CODESET to
   encoding TO_CODESET.  Return a freshly allocated, NUL-terminated string,
   or NULL with errno set upon failure.

   If SRC is empty, or the two codeset names are equal ignoring case, no
   conversion takes place and a plain copy of SRC is returned.  When iconv
   support is not compiled in, every other request fails with ENOSYS.  */
char *
str_iconv (const char *src, const char *from_codeset, const char *to_codeset)
{
  /* Trivial cases: nothing to convert, so just duplicate the input.  */
  if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
    {
      char *copy = strdup (src);

      if (copy == NULL)
        errno = ENOMEM;
      return copy;
    }

#if HAVE_ICONV
  {
    iconv_t cd;
    char *converted;
    int saved_errno;

    /* Avoid glibc-2.1 bug with EUC-KR.  */
# if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
     && !defined _LIBICONV_VERSION
    if (c_strcasecmp (from_codeset, "EUC-KR") == 0
        || c_strcasecmp (to_codeset, "EUC-KR") == 0)
      {
        errno = EINVAL;
        return NULL;
      }
# endif

    cd = iconv_open (to_codeset, from_codeset);
    if (cd == (iconv_t) -1)
      return NULL;

    converted = str_cd_iconv (src, cd);
    if (converted == NULL)
      {
        /* Close cd, but report the errno from str_cd_iconv.  */
        saved_errno = errno;
        iconv_close (cd);
        errno = saved_errno;
        return NULL;
      }

    if (iconv_close (cd) < 0)
      {
        /* Closing failed: drop the converted string and report the errno
           from iconv_close, protecting it across free().  */
        saved_errno = errno;
        free (converted);
        errno = saved_errno;
        return NULL;
      }

    return converted;
  }
#else
  /* This is a different error code than if iconv_open existed but didn't
     support from_codeset and to_codeset, so that the caller can emit
     an error message such as
       "iconv() is not supported. Installing GNU libiconv and
        then reinstalling this package would fix this."  */
  errno = ENOSYS;
  return NULL;
#endif
}
|