1 /* Encoding utility from UTF-8 to another charset and vice versa
2 Copyright (C) 2001, 2002 Peter Verthez
4 The UTF8 tools library is free software; you can redistribute it
5 and/or modify it under the terms of the GNU Lesser General Public
6 License as published by the Free Software Foundation; either
7 version 2.1 of the License, or (at your option) any later version.
9 The Gedcom parser library is distributed in the hope that it will be
10 useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public
15 License along with the Gedcom parser library; if not, write to the
16 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
23 #include "utf8tools.h"
/* Size in bytes given to a conversion buffer when create_conv_buffer is
   called with size 0; also the size requested for the buffers created by
   initialize_utf8_conversion. */
30 #define INITIAL_BUFSIZE 256
/* Initial replacement string copied into a converter's `unknown` field; that
   field is what convert_from_utf8 appends to the output on an EILSEQ error. */
31 #define DEFAULT_UNKNOWN "?"
/* Values for a conversion buffer's `type` field.  create_conv_buffer tags new
   buffers EXTERNAL_BUFFER; initialize_utf8_conversion re-tags the buffers it
   creates as INTERNAL_BUFFER, and cleanup_utf8_conversion frees only those. */
33 #define INTERNAL_BUFFER 0
34 #define EXTERNAL_BUFFER 1
39 int type; /* For internal use: INTERNAL_BUFFER or EXTERNAL_BUFFER */
/* Staging buffer for input bytes: convert_to_utf8_incremental appends each
   new chunk here and keeps the not-yet-converted remainder at its head. */
45 struct conv_buffer* inbuf;
/* Buffer that receives conversion results; convert_from_utf8 and
   convert_to_utf8 return a pointer to its storage. */
47 struct conv_buffer* outbuf;
/* Zero-fill the entire storage of `buf` (buf->size bytes starting at
   buf->buffer).  No NULL check on `buf` or buf->buffer is visible here, so
   callers are expected to pass a fully allocated buffer. */
51 void reset_conv_buffer(conv_buffer_t buf)
53 memset(buf->buffer, 0, buf->size);
/* Allocate a conversion buffer object together with `size` bytes of storage.
   A `size` of 0 selects INITIAL_BUFSIZE.  The new buffer is tagged
   EXTERNAL_BUFFER and its storage is zeroed through reset_conv_buffer.
   NOTE(review): the statements between the visible lines (allocation checks,
   the assignment of buf->size, the return) are not visible in this view --
   confirm both malloc results are checked and buf->size is set before
   reset_conv_buffer reads it.
   NOTE(review): `size` is a signed int passed straight to malloc; no
   rejection of negative values is visible. */
56 conv_buffer_t create_conv_buffer(int size)
58 struct conv_buffer* buf = NULL;
/* 0 means "use the default size" */
60 if (size == 0) size = INITIAL_BUFSIZE;
62 buf = (struct conv_buffer*) malloc(sizeof(struct conv_buffer));
65 buf->buffer = (char*)malloc(size);
/* Buffers are external by default; initialize_utf8_conversion overrides
   this for the buffers it owns. */
66 buf->type = EXTERNAL_BUFFER;
68 reset_conv_buffer(buf);
/* Release a conversion buffer.  cleanup_utf8_conversion calls this for the
   converter's INTERNAL_BUFFER buffers.
   NOTE(review): the body is not visible in this view; presumably it frees
   both the storage and the buffer object -- confirm, including the behavior
   for a NULL `buf`. */
76 void free_conv_buffer(conv_buffer_t buf)
/* Grow the storage of `buf` to twice its current size using realloc.
   `curr_pos` points into the old storage; its offset from the start of the
   buffer is saved first so the equivalent position can be recomputed in the
   (possibly moved) new storage.  Everything from that position to the end of
   the buffer is then zeroed.
   NOTE(review): the realloc failure check, the update of buf->size and the
   return statement are not visible in this view.  Callers assign the result
   back to their write pointer, so presumably the recomputed `curr_pos` is
   returned -- confirm, including what is returned when realloc fails.
   NOTE(review): no overflow check on `buf->size * 2` is visible, and a
   buffer whose size is 0 would not grow. */
84 char* grow_conv_buffer(conv_buffer_t buf, char* curr_pos)
86 size_t outlen, new_size;
/* offset of the caller's position; stays meaningful if realloc moves the
   storage */
88 outlen = curr_pos - buf->buffer;
89 new_size = buf->size * 2;
90 new_buffer = realloc(buf->buffer, new_size);
92 buf->buffer = new_buffer;
/* re-derive the caller's position inside the new storage */
94 curr_pos = buf->buffer + outlen;
/* clear the part of the buffer at and after the caller's position */
95 memset(curr_pos, 0, buf->size - (curr_pos - buf->buffer));
/* Create a converter between UTF-8 and `charset`.
   Opens two iconv descriptors (UTF-8 -> charset and charset -> UTF-8), stores
   a private copy of DEFAULT_UNKNOWN as the replacement string, and creates
   the converter's buffers, tagging them INTERNAL_BUFFER so that
   cleanup_utf8_conversion will free them.  A `cleanup` flag is cleared only
   when every step succeeded; cleanup_utf8_conversion is called on the
   converter near the end of the function.
   NOTE(review): the use of `external_outbuf`, the check of the malloc and
   strdup results, the condition guarding the cleanup call and the return
   statement are not visible in this view -- presumably `external_outbuf`
   suppresses creation of the internal output buffer and NULL is returned on
   failure; confirm. */
102 convert_t initialize_utf8_conversion(const char* charset, int external_outbuf)
104 struct convert *conv = NULL;
108 conv = (struct convert *)malloc(sizeof(struct convert));
110 /* Unless reset to 0 at the end, this will force cleanup */
112 /* First initialize to default values */
/* (iconv_t)-1 marks a descriptor as "not open"; cleanup_utf8_conversion
   tests for this value before calling iconv_close */
113 conv->from_utf8 = (iconv_t)-1;
114 conv->to_utf8 = (iconv_t)-1;
118 conv->unknown = NULL;
120 /* Now initialize everything to what it should be */
121 conv->from_utf8 = iconv_open(charset, "UTF-8");
122 if (conv->from_utf8 != (iconv_t)-1) {
123 conv->to_utf8 = iconv_open("UTF-8", charset);
124 if (conv->to_utf8 != (iconv_t)-1) {
125 conv->unknown = strdup(DEFAULT_UNKNOWN);
/* NOTE(review): the result of create_conv_buffer is dereferenced on the very
   next line, with no check in between; if create_conv_buffer can return
   NULL this is a NULL dereference -- confirm. */
127 conv->inbuf = create_conv_buffer(INITIAL_BUFSIZE);
128 conv->inbuf->type = INTERNAL_BUFFER;
/* NOTE(review): same unchecked dereference pattern as for inbuf above. */
133 conv->outbuf = create_conv_buffer(INITIAL_BUFSIZE);
134 conv->outbuf->type = INTERNAL_BUFFER;
136 cleanup = 0; /* All successful */
146 cleanup_utf8_conversion(conv);
/* Replace the converter's replacement ("unknown character") string with a
   private copy of `unknown`.  The replacement is attempted only when both
   `conv` and `unknown` are non-NULL; the previous string is freed and the
   copy is installed in conv->unknown.
   NOTE(review): the check of the strdup result and the returned values are
   not visible in this view -- confirm which value signals success. */
154 int conversion_set_unknown(convert_t conv, const char* unknown)
158 if (conv && unknown) {
/* copy first, so the converter never points at caller-owned memory */
159 char* unknown_copy = strdup(unknown);
161 if (conv->unknown) free(conv->unknown);
162 conv->unknown = unknown_copy;
/* Set the output buffer of a converter.
   The visible branch accepts `buf` only when it is a non-NULL buffer tagged
   EXTERNAL_BUFFER and the converter currently has either no output buffer or
   one that is itself tagged EXTERNAL_BUFFER; an INTERNAL_BUFFER output buffer
   does not satisfy this condition.
   NOTE(review): the preceding `if` branch, the body of this branch and the
   return values are not visible in this view -- confirm. */
171 int conversion_set_output_buffer(convert_t conv, conv_buffer_t buf)
175 else if ((!conv->outbuf || conv->outbuf->type == EXTERNAL_BUFFER)
176 && buf && buf->type == EXTERNAL_BUFFER) {
/* Release the resources held by a converter: close each iconv descriptor
   that was successfully opened and free the input/output buffers that the
   converter owns (those tagged INTERNAL_BUFFER).  Buffers tagged
   EXTERNAL_BUFFER are left alone.
   NOTE(review): a NULL check on `conv`, the release of conv->unknown and the
   release of the converter object itself are not visible in this view --
   confirm they exist. */
184 void cleanup_utf8_conversion(convert_t conv)
/* (iconv_t)-1 means the descriptor was never opened */
187 if (conv->from_utf8 != (iconv_t)-1)
188 iconv_close(conv->from_utf8);
189 if (conv->to_utf8 != (iconv_t)-1)
190 iconv_close(conv->to_utf8);
191 if (conv->inbuf && conv->inbuf->type == INTERNAL_BUFFER)
192 free_conv_buffer(conv->inbuf);
193 if (conv->outbuf && conv->outbuf->type == INTERNAL_BUFFER)
194 free_conv_buffer(conv->outbuf);
/* Convert the NUL-terminated UTF-8 string `input` to the converter's target
   charset.  The result is written into conv->outbuf, which is grown on
   E2BIG, and a pointer to that storage is returned.  Input characters that
   iconv rejects with EILSEQ are skipped, replaced by conv->unknown and
   counted in *conv_fails (when `conv_fails` is non-NULL).  If `output_len`
   is non-NULL it receives the number of bytes written.
   The returned pointer is the converter's own output buffer, which is
   cleared at the start of every conversion.
   NOTE(review): the rest of the parameter list, several local declarations
   and the early-return statements are not visible in this view. */
201 char* convert_from_utf8(convert_t conv, const char* input, int* conv_fails,
/* NOTE(review): strlen runs in this initializer, i.e. before `conv` is
   validated; no NULL check on `input` is visible. */
204 size_t insize = strlen(input);
206 ICONV_CONST char* inptr = (ICONV_CONST char*) input;
209 struct conv_buffer* outbuf;
/* without a usable converter, report the whole input as failed */
211 if (!conv || !conv->outbuf) {
212 if (conv_fails != NULL) *conv_fails = insize;
215 /* make sure we start from an empty state */
216 iconv(conv->from_utf8, NULL, NULL, NULL, NULL);
217 if (conv_fails != NULL) *conv_fails = 0;
218 /* set up output buffer (empty it) */
219 outbuf = conv->outbuf;
220 outptr = outbuf->buffer;
221 outsize = outbuf->size;
222 reset_conv_buffer(conv->outbuf);
223 nconv = iconv(conv->from_utf8, &inptr, &insize, &outptr, &outsize);
/* keep retrying until iconv has consumed the input without error */
224 while (nconv == (size_t)-1) {
225 if (errno == E2BIG) {
226 /* grow the output buffer */
227 outptr = grow_conv_buffer(outbuf, outptr);
/* remaining room, measured from the relocated write position */
229 outsize = outbuf->size - (outptr - outbuf->buffer);
235 else if (errno == EILSEQ) {
236 /* skip over character */
237 const char* unkn_ptr = conv->unknown;
238 if (conv_fails != NULL) (*conv_fails)++;
239 if ((*inptr & 0x80) == 0) {
240 /* an ASCII character, just skip one (this case is very improbable) */
244 /* a general UTF-8 character, skip all continuation bytes (binary 10xxxxxx) */
246 while ((*inptr & 0xC0) == 0x80) {
250 /* append the "unknown" string to the output */
/* NOTE(review): this copy does not test `outsize`; if the replacement
   string is longer than the remaining room it writes past the buffer and
   the unsigned `outsize` wraps -- confirm and bound the copy. */
251 while (*unkn_ptr) { *outptr++ = *unkn_ptr++; outsize--; }
254 /* EINVAL should not happen, since we convert entire strings */
255 /* EBADF is an error which should be captured by the first if above */
/* any other error: count every remaining input byte as a failure */
256 if (conv_fails != NULL) *conv_fails += insize;
259 nconv = iconv(conv->from_utf8, &inptr, &insize, &outptr, &outsize);
/* number of bytes produced = distance of the write pointer from the start */
261 if (output_len) *output_len = outptr - outbuf->buffer;
262 return outbuf->buffer;
/* Convert `input_len` bytes of `input`, encoded in the converter's charset,
   to UTF-8.  The result is written into conv->outbuf, which is grown on
   E2BIG, and a pointer to that storage is returned.  According to the
   comments in the error branch, input that does not match the source
   encoding (EILSEQ) makes the function return NULL.
   The returned pointer is the converter's own output buffer, which is
   cleared at the start of every conversion.
   NOTE(review): several local declarations and the early-return statements
   are not visible in this view. */
265 char* convert_to_utf8(convert_t conv, const char* input, size_t input_len)
268 ICONV_CONST char *inptr = (ICONV_CONST char*) input;
271 struct conv_buffer* outbuf;
273 if (!conv || !conv->outbuf)
275 /* make sure we start from an empty state */
276 iconv(conv->to_utf8, NULL, NULL, NULL, NULL);
277 /* set up output buffer (empty it) */
278 outbuf = conv->outbuf;
279 outptr = outbuf->buffer;
280 outsize = outbuf->size;
281 reset_conv_buffer(conv->outbuf);
282 nconv = iconv(conv->to_utf8, &inptr, &input_len, &outptr, &outsize);
/* keep retrying until iconv has consumed the input without error */
283 while (nconv == (size_t)-1) {
284 if (errno == E2BIG) {
285 /* grow the output buffer */
286 outptr = grow_conv_buffer(outbuf, outptr);
/* remaining room, measured from the relocated write position */
288 outsize = outbuf->size - (outptr - outbuf->buffer);
295 /* EILSEQ happens when the input doesn't match the source encoding,
296 return NULL in this case */
297 /* EINVAL should not happen, since we convert entire strings */
298 /* EBADF is an error which should be captured by the first if above */
301 nconv = iconv(conv->to_utf8, &inptr, &input_len, &outptr, &outsize);
303 return outbuf->buffer;
/* Incrementally convert `input` (`input_len` bytes, which may contain NUL
   bytes) from the converter's charset to UTF-8.  The new bytes are appended
   to whatever conv->inbuf still holds from the previous call, iconv is run
   once over the accumulated input, and the bytes it did not consume are
   moved to the head of inbuf for the next call.  The output goes to
   conv->outbuf, which is cleared first.
   NOTE(review): the handling inside the EILSEQ/EINVAL/other-error branches
   and the return statement are not visible in this view; `retval` starts out
   as the output storage, so presumably that is returned on success --
   confirm. */
306 char* convert_to_utf8_incremental(convert_t conv,
307 const char* input, size_t input_len)
/* NOTE(review): these initializers dereference `conv`, conv->outbuf and
   conv->inbuf before the NULL check further down, so that check cannot
   protect against a NULL converter -- confirm and reorder. */
310 struct conv_buffer* outbuf = conv->outbuf;
311 struct conv_buffer* inbuf = conv->inbuf;
312 size_t outsize = outbuf->size;
313 char* wrptr = outbuf->buffer;
314 ICONV_CONST char* rdptr = (ICONV_CONST char*) inbuf->buffer;
315 char* retval = outbuf->buffer;
317 if (!conv || !conv->outbuf)
320 /* set up input buffer (concatenate to what was left previous time) */
321 /* can't use strcpy, because possible null bytes from unicode */
/* NOTE(review): the result of grow_conv_buffer is ignored here, and `rdptr`
   was taken from inbuf->buffer before any growth; if the storage moves
   `rdptr` is stale, and if growing fails without changing inbuf->size this
   loop would not terminate -- confirm. */
322 while (conv->insize + input_len > inbuf->size)
323 grow_conv_buffer(inbuf, inbuf->buffer + conv->insize);
324 memcpy(inbuf->buffer + conv->insize, input, input_len);
325 conv->insize += input_len;
327 /* set up output buffer (empty it) */
328 reset_conv_buffer(outbuf);
330 /* do the conversion */
/* iconv advances rdptr and decrements conv->insize by the bytes consumed */
331 res = iconv(conv->to_utf8, &rdptr, &conv->insize, &wrptr, &outsize);
332 if (res == (size_t)-1) {
333 if (errno == EILSEQ) {
334 /* restart from an empty state and return NULL */
339 else if (errno == EINVAL) {
340 /* Do nothing, leave it to next iteration */
347 /* then shift what is left over to the head of the input buffer */
348 memmove(inbuf->buffer, rdptr, conv->insize);
/* clear everything behind the leftover bytes */
349 memset(inbuf->buffer + conv->insize, 0, inbuf->size - conv->insize);