1 /* Encoding utility from UTF-8 to another charset and vice versa
2 Copyright (C) 2001, 2002 Peter Verthez
4 The UTF8 tools library is free software; you can redistribute it
5 and/or modify it under the terms of the GNU Lesser General Public
6 License as published by the Free Software Foundation; either
7 version 2.1 of the License, or (at your option) any later version.
9 The Gedcom parser library is distributed in the hope that it will be
10 useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
14 You should have received a copy of the GNU Lesser General Public
15 License along with the Gedcom parser library; if not, write to the
16 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
23 #include "utf8tools.h"
/* Size in bytes used for a conversion buffer when create_conv_buffer() is
   called with size 0; also the size requested for the buffers allocated by
   initialize_utf8_conversion(). */
30 #define INITIAL_BUFSIZE 256
/* Initial replacement string stored in conv->unknown.  convert_from_utf8()
   appends that string to the output for every input character that iconv
   rejects with EILSEQ. */
31 #define DEFAULT_UNKNOWN "?"
/* Values for the 'type' member of struct conv_buffer.  Buffers created inside
   initialize_utf8_conversion() are tagged INTERNAL_BUFFER and are the only
   ones released by cleanup_utf8_conversion(); create_conv_buffer() tags its
   result EXTERNAL_BUFFER. */
33 #define INTERNAL_BUFFER 0
34 #define EXTERNAL_BUFFER 1
/* Ownership tag of the buffer: INTERNAL_BUFFER or EXTERNAL_BUFFER. */
39 int type; /* For internal use */
/* Staging area used by convert_to_utf8_incremental(): new input is appended
   here and bytes iconv has not consumed yet are kept for the next call. */
45 struct conv_buffer* inbuf;
/* Buffer that receives conversion results; the conversion functions return
   a pointer to its data area. */
47 struct conv_buffer* outbuf;
/* Zero-fill the complete data area of BUF (buf->size bytes). */
51 static void reset_conv_buffer(conv_buffer_t buf)
/* NOTE(review): neither buf nor buf->buffer is checked for NULL in the
   visible code, so callers must pass a fully allocated buffer. */
53 memset(buf->buffer, 0, buf->size);
/* Allocate a conversion buffer whose data area holds SIZE bytes.
   A SIZE of 0 selects INITIAL_BUFSIZE.  The new buffer is tagged
   EXTERNAL_BUFFER and its data area is zero-filled.
   NOTE(review): SIZE is a signed int passed straight to malloc(); a negative
   value would be converted to a very large size_t. */
56 conv_buffer_t create_conv_buffer(int size)
58 struct conv_buffer* buf = NULL;
/* 0 means "use the default size" */
60 if (size == 0) size = INITIAL_BUFSIZE;
/* NOTE(review): the checks on both malloc() results and the assignment of
   buf->size are not visible in this excerpt -- confirm they exist before
   reset_conv_buffer() reads buf->buffer and buf->size. */
62 buf = (struct conv_buffer*) malloc(sizeof(struct conv_buffer));
65 buf->buffer = (char*)malloc(size);
66 buf->type = EXTERNAL_BUFFER;
68 reset_conv_buffer(buf);
/* Release a conversion buffer.  cleanup_utf8_conversion() calls this for the
   INTERNAL_BUFFER buffers of a converter.  The body is not visible in this
   excerpt, so what exactly is freed cannot be stated here. */
76 void free_conv_buffer(conv_buffer_t buf)
/* Enlarge the data area of BUF to twice its current size with realloc().
   CURR_POS is a position inside the old data area; the same offset inside
   the (possibly moved) new data area is computed, and the bytes from that
   position onward are zero-filled.  convert_from_utf8() and
   convert_to_utf8() use the return value as their new write pointer.
   NOTE(review): a buf->size of 0 would never grow, since 0 * 2 == 0. */
84 static char* grow_conv_buffer(conv_buffer_t buf, char* curr_pos)
86 size_t outlen, new_size;
/* remember the position as an offset: realloc() may move the block */
88 outlen = curr_pos - buf->buffer;
89 new_size = buf->size * 2;
/* NOTE(review): the failure check on realloc() and the update of buf->size
   to new_size are not visible in this excerpt -- confirm both; the memset
   below relies on buf->size already describing the enlarged area. */
90 new_buffer = realloc(buf->buffer, new_size);
92 buf->buffer = new_buffer;
94 curr_pos = buf->buffer + outlen;
95 memset(curr_pos, 0, buf->size - (curr_pos - buf->buffer));
/* Create a converter between UTF-8 and CHARSET.
   Two iconv descriptors are opened (UTF-8 -> CHARSET and CHARSET -> UTF-8),
   the replacement string is set to a copy of DEFAULT_UNKNOWN, and input and
   output buffers of INITIAL_BUFSIZE bytes are created and tagged
   INTERNAL_BUFFER.  If a step fails, cleanup_utf8_conversion() is called on
   the partially built converter.
   NOTE(review): the lines that use EXTERNAL_OUTBUF are not visible in this
   excerpt; presumably it controls whether the output buffer is created
   here -- confirm. */
102 convert_t initialize_utf8_conversion(const char* charset, int external_outbuf)
104 struct convert *conv = NULL;
/* NOTE(review): the check of this malloc() result is not visible here. */
108 conv = (struct convert *)malloc(sizeof(struct convert));
110 /* Unless reset to 0 at the end, this will force cleanup */
112 /* First initialize to default values */
/* (iconv_t)-1 and NULL are the "not opened / not allocated" markers that
   cleanup_utf8_conversion() tests before releasing anything */
113 conv->from_utf8 = (iconv_t)-1;
114 conv->to_utf8 = (iconv_t)-1;
118 conv->unknown = NULL;
120 /* Now initialize everything to what it should be */
121 conv->from_utf8 = iconv_open(charset, "UTF-8");
122 if (conv->from_utf8 != (iconv_t)-1) {
123 conv->to_utf8 = iconv_open("UTF-8", charset);
124 if (conv->to_utf8 != (iconv_t)-1) {
125 conv->unknown = strdup(DEFAULT_UNKNOWN);
/* NOTE(review): no check of the create_conv_buffer() results is visible
   before their 'type' member is written. */
127 conv->inbuf = create_conv_buffer(INITIAL_BUFSIZE);
128 conv->inbuf->type = INTERNAL_BUFFER;
133 conv->outbuf = create_conv_buffer(INITIAL_BUFSIZE);
134 conv->outbuf->type = INTERNAL_BUFFER;
136 cleanup = 0; /* All successful */
146 cleanup_utf8_conversion(conv);
/* Replace the string that convert_from_utf8() emits for unconvertible
   characters with a private copy of UNKNOWN.  Nothing is changed unless
   both CONV and UNKNOWN are non-NULL.  The return value is set on lines
   not visible in this excerpt. */
154 int conversion_set_unknown(convert_t conv, const char* unknown)
158 if (conv && unknown) {
/* copy first, so the old string is only released once a new one exists
   (NOTE(review): the check of the strdup() result is not visible here) */
159 char* unknown_copy = strdup(unknown);
/* the guard is redundant: free(NULL) is a no-op */
161 if (conv->unknown) free(conv->unknown);
162 conv->unknown = unknown_copy;
/* Install BUF as the output buffer of CONV.  Only one condition of the body
   is visible in this excerpt; the preceding branch, the action taken and
   the return value are on lines not shown. */
171 int conversion_set_output_buffer(convert_t conv, conv_buffer_t buf)
/* This branch applies when the converter has no output buffer or an
   EXTERNAL_BUFFER one, and BUF itself is a non-NULL EXTERNAL_BUFFER; an
   INTERNAL_BUFFER output buffer therefore does not take this branch. */
175 else if ((!conv->outbuf || conv->outbuf->type == EXTERNAL_BUFFER)
176 && buf && buf->type == EXTERNAL_BUFFER) {
/* Release the resources held by CONV: both iconv descriptors (if they were
   opened) and the input/output buffers that are tagged INTERNAL_BUFFER.
   Buffers tagged EXTERNAL_BUFFER are left alone.
   NOTE(review): in the visible lines there is no NULL check on CONV and no
   free() of conv->unknown (allocated with strdup()) or of CONV itself --
   confirm these happen on the lines not shown. */
184 void cleanup_utf8_conversion(convert_t conv)
/* (iconv_t)-1 marks a descriptor that was never opened */
187 if (conv->from_utf8 != (iconv_t)-1)
188 iconv_close(conv->from_utf8);
189 if (conv->to_utf8 != (iconv_t)-1)
190 iconv_close(conv->to_utf8);
191 if (conv->inbuf && conv->inbuf->type == INTERNAL_BUFFER)
192 free_conv_buffer(conv->inbuf);
193 if (conv->outbuf && conv->outbuf->type == INTERNAL_BUFFER)
194 free_conv_buffer(conv->outbuf);
/* Convert the NUL-terminated UTF-8 string INPUT to the converter's charset.
   The result is written to conv->outbuf, which is cleared at the start of
   every call, and a pointer to its data area is returned.
   CONV_FAILS, if non-NULL, receives the number of failures: one per
   character replaced by conv->unknown, plus the count of unprocessed input
   bytes when conversion stops on another error.
   OUTPUT_LEN, if non-NULL, receives the number of bytes written. */
201 char* convert_from_utf8(convert_t conv, const char* input, int* conv_fails,
206 ICONV_CONST char* inptr = (ICONV_CONST char*) input;
209 struct conv_buffer* outbuf;
/* Invalid arguments: the whole input (if any) is reported as failed.  The
   statement that leaves the function is not visible in this excerpt. */
211 if (!conv || !conv->outbuf || !input) {
212 if (conv_fails != NULL) *conv_fails = (input ? strlen(input) : 0);
215 insize = strlen(input);
216 /* make sure we start from an empty state */
217 iconv(conv->from_utf8, NULL, NULL, NULL, NULL);
218 if (conv_fails != NULL) *conv_fails = 0;
219 /* set up output buffer (empty it) */
220 outbuf = conv->outbuf;
221 outptr = outbuf->buffer;
222 outsize = outbuf->size;
223 reset_conv_buffer(conv->outbuf);
224 nconv = iconv(conv->from_utf8, &inptr, &insize, &outptr, &outsize);
/* each pass handles one iconv() error and then retries with the remaining
   input */
225 while (nconv == (size_t)-1) {
226 if (errno == E2BIG) {
227 /* grow the output buffer */
228 outptr = grow_conv_buffer(outbuf, outptr);
/* recompute the free space behind the write pointer */
230 outsize = outbuf->size - (outptr - outbuf->buffer);
236 else if (errno == EILSEQ) {
237 /* skip over character */
238 const char* unkn_ptr = conv->unknown;
239 if (conv_fails != NULL) (*conv_fails)++;
240 if ((*inptr & 0x80) == 0) {
241 /* an ASCII character, just skip one (this case is very improbable) */
245 /* a general UTF-8 character, skip all continuation bytes (binary 10xxxxxx) */
247 while ((*inptr & 0xC0) == 0x80) {
251 /* append the "unknown" string to the output */
/* NOTE(review): the copy below does not compare the length of the
   replacement string with outsize.  If fewer bytes remain, it writes past
   the end of the output buffer and outsize is decremented below zero. */
252 while (*unkn_ptr) { *outptr++ = *unkn_ptr++; outsize--; }
255 /* EINVAL should not happen, since we convert entire strings */
256 /* EBADF is an error which should be captured by the first if above */
/* any other error: count every remaining input byte as a failure */
257 if (conv_fails != NULL) *conv_fails += insize;
260 nconv = iconv(conv->from_utf8, &inptr, &insize, &outptr, &outsize);
262 if (output_len) *output_len = outptr - outbuf->buffer;
263 return outbuf->buffer;
/* Convert INPUT_LEN bytes at INPUT from the converter's charset to UTF-8.
   The result is written to conv->outbuf, which is cleared at the start of
   every call, and a pointer to its data area is returned.  The output
   buffer is enlarged whenever iconv() reports E2BIG. */
266 char* convert_to_utf8(convert_t conv, const char* input, size_t input_len)
269 ICONV_CONST char *inptr = (ICONV_CONST char*) input;
272 struct conv_buffer* outbuf;
/* argument check; the statement it guards is not visible in this excerpt */
274 if (!conv || !conv->outbuf || !input)
276 /* make sure we start from an empty state */
277 iconv(conv->to_utf8, NULL, NULL, NULL, NULL);
278 /* set up output buffer (empty it) */
279 outbuf = conv->outbuf;
280 outptr = outbuf->buffer;
281 outsize = outbuf->size;
282 reset_conv_buffer(conv->outbuf);
283 nconv = iconv(conv->to_utf8, &inptr, &input_len, &outptr, &outsize);
/* each pass handles one iconv() error and then retries with the remaining
   input */
284 while (nconv == (size_t)-1) {
285 if (errno == E2BIG) {
286 /* grow the output buffer */
287 outptr = grow_conv_buffer(outbuf, outptr);
/* recompute the free space behind the write pointer */
289 outsize = outbuf->size - (outptr - outbuf->buffer);
296 /* EILSEQ happens when the input doesn't match the source encoding,
297 return NULL in this case */
298 /* EINVAL should not happen, since we convert entire strings */
299 /* EBADF is an error which should be captured by the first if above */
302 nconv = iconv(conv->to_utf8, &inptr, &input_len, &outptr, &outsize);
304 return outbuf->buffer;
/* Convert a chunk of INPUT_LEN bytes at INPUT to UTF-8, keeping state
   between calls.  The chunk is appended to conv->inbuf behind the
   conv->insize bytes left over from the previous call, iconv() is run over
   the combined data, and whatever it did not consume is moved to the head
   of conv->inbuf for the next call.  Output goes to conv->outbuf, which is
   cleared on each call. */
307 char* convert_to_utf8_incremental(convert_t conv,
308 const char* input, size_t input_len)
/* NOTE(review): conv is dereferenced by these initializers before the NULL
   check further down, so a NULL conv faults before the guard is reached;
   conv->inbuf is not checked in the visible lines. */
311 struct conv_buffer* outbuf = conv->outbuf;
312 struct conv_buffer* inbuf = conv->inbuf;
313 size_t outsize = outbuf->size;
314 char* wrptr = outbuf->buffer;
315 ICONV_CONST char* rdptr = (ICONV_CONST char*) inbuf->buffer;
316 char* retval = outbuf->buffer;
318 if (!conv || !conv->outbuf)
/* Reset path: clears the iconv state and the pending input.  The condition
   selecting this path is not visible in this excerpt. */
322 iconv(conv->to_utf8, NULL, NULL, NULL, NULL);
323 reset_conv_buffer(inbuf);
328 /* set up input buffer (concatenate to what was left previous time) */
329 /* can't use strcpy, because possible null bytes from unicode */
/* NOTE(review): grow_conv_buffer() calls realloc(), which may move
   inbuf->buffer, but rdptr was captured before this loop and no
   re-assignment is visible afterwards -- it may be stale when used by
   iconv() and memmove() below.  The return value of grow_conv_buffer() is
   also ignored here. */
330 while (conv->insize + input_len > inbuf->size)
331 grow_conv_buffer(inbuf, inbuf->buffer + conv->insize);
332 memcpy(inbuf->buffer + conv->insize, input, input_len);
333 conv->insize += input_len;
335 /* set up output buffer (empty it) */
336 reset_conv_buffer(outbuf);
338 /* do the conversion */
/* iconv() advances rdptr and decreases conv->insize by the bytes it
   consumed.  NOTE(review): no E2BIG handling is visible, so the output
   buffer is not enlarged in this function. */
339 res = iconv(conv->to_utf8, &rdptr, &conv->insize, &wrptr, &outsize);
340 if (res == (size_t)-1) {
341 if (errno == EILSEQ) {
342 /* restart from an empty state and return NULL */
347 else if (errno == EINVAL) {
348 /* Do nothing, leave it to next iteration */
355 /* then shift what is left over to the head of the input buffer */
/* memmove() because source and destination lie in the same buffer and may
   overlap; the now unused tail is zero-filled */
356 memmove(inbuf->buffer, rdptr, conv->insize);
357 memset(inbuf->buffer + conv->insize, 0, inbuf->size - conv->insize);