1 /* Encoding utility from UTF-8 to another charset and vice versa
2 Copyright (C) 2001, 2002 Peter Verthez
4 Permission granted to do anything with this file that you want, as long
5 as the above copyright is retained in all copies.
6 THERE IS NO WARRANTY - USE AT YOUR OWN RISK */
/* Byte size used when create_conv_buffer() is called with size 0, and the
   size requested for the converter's own input and output buffers. */
19 #define INITIAL_BUFSIZE 256
/* Replacement text installed as a new converter's "unknown" string. */
20 #define DEFAULT_UNKNOWN "?"
/* Values of the conv_buffer 'type' field.  create_conv_buffer() tags every
   new buffer EXTERNAL_BUFFER; initialize_utf8_conversion() retags the buffers
   it creates for itself as INTERNAL_BUFFER, and cleanup_utf8_conversion()
   frees only buffers carrying that tag. */
22 #define INTERNAL_BUFFER 0
23 #define EXTERNAL_BUFFER 1
/* Buffer tag: holds INTERNAL_BUFFER or EXTERNAL_BUFFER. */
28 int type; /* For internal use */
/* Staging buffer in which convert_to_utf8_incremental() accumulates input
   bytes, including bytes left over from the previous call. */
34 struct conv_buffer* inbuf;
/* Buffer that receives converted text; the convert_* functions return its
   'buffer' pointer. */
36 struct conv_buffer* outbuf;
/* Zero-fill the data area of BUF: buf->size bytes starting at buf->buffer.
   The size and the buffer pointer themselves are left untouched.
   NOTE(review): no NULL test on buf or buf->buffer is visible here --
   confirm callers guarantee both. */
40 void reset_conv_buffer(conv_buffer_t buf)
42 memset(buf->buffer, 0, buf->size);
/* Allocate a conversion buffer with a data area of SIZE bytes.  A SIZE of 0
   selects INITIAL_BUFSIZE.  The new buffer is tagged EXTERNAL_BUFFER and its
   data area is cleared with reset_conv_buffer().
   NOTE(review): SIZE is a signed int handed directly to malloc(); a negative
   value would convert to a very large size_t -- confirm no caller passes
   one. */
45 conv_buffer_t create_conv_buffer(int size)
47 struct conv_buffer* buf = NULL;
/* 0 means "use the default size" */
49 if (size == 0) size = INITIAL_BUFSIZE;
51 buf = (struct conv_buffer*) malloc(sizeof(struct conv_buffer));
54 buf->buffer = (char*)malloc(size);
/* default tag; initialize_utf8_conversion() overwrites it for the buffers
   the converter owns */
55 buf->type = EXTERNAL_BUFFER;
57 reset_conv_buffer(buf);
/* Release the conversion buffer BUF.  cleanup_utf8_conversion() calls this
   only for buffers tagged INTERNAL_BUFFER.
   NOTE(review): presumably frees both buf->buffer and buf itself and
   tolerates a NULL BUF -- confirm against the body. */
65 void free_conv_buffer(conv_buffer_t buf)
/* Enlarge the data area of BUF to twice its current size with realloc().
   CURR_POS is a position inside the old data area; because realloc() may
   move the storage, the position is carried over as an offset and rebuilt
   against the new base.  The converters assign the return value to their
   write pointer, so it is expected to be the rebuilt position. */
73 char* grow_conv_buffer(conv_buffer_t buf, char* curr_pos)
75 size_t outlen, new_size;
/* offset of the caller's position, valid across a move of the storage */
77 outlen = curr_pos - buf->buffer;
78 new_size = buf->size * 2;
/* result goes to a temporary first, not straight into buf->buffer */
79 new_buffer = realloc(buf->buffer, new_size);
81 buf->buffer = new_buffer;
83 curr_pos = buf->buffer + outlen;
/* clear everything from the caller's position to the end of the buffer.
   NOTE(review): this covers the newly added half only if buf->size already
   holds new_size at this point -- confirm. */
84 memset(curr_pos, 0, buf->size - (curr_pos - buf->buffer));
/* Create a converter between UTF-8 and CHARSET.
   Opens two iconv descriptors (UTF-8 -> CHARSET and CHARSET -> UTF-8),
   installs a copy of DEFAULT_UNKNOWN as the replacement string and creates
   converter-owned input and output buffers of INITIAL_BUFSIZE bytes.
   EXTERNAL_OUTBUF: NOTE(review): presumably non-zero means the caller will
   supply the output buffer through conversion_set_output_buffer() -- its
   handling is not visible here; confirm.
   On failure the partially built converter is passed to
   cleanup_utf8_conversion(). */
91 convert_t initialize_utf8_conversion(const char* charset, int external_outbuf)
93 struct convert *conv = NULL;
97 conv = (struct convert *)malloc(sizeof(struct convert));
99 /* Unless reset to 0 at the end, this will force cleanup */
101 /* First initialize to default values */
/* (iconv_t)-1 is iconv_open()'s failure value; cleanup_utf8_conversion()
   closes only descriptors that differ from it */
102 conv->from_utf8 = (iconv_t)-1;
103 conv->to_utf8 = (iconv_t)-1;
107 conv->unknown = NULL;
109 /* Now initialize everything to what it should be */
/* iconv_open() takes (tocode, fromcode): this descriptor reads UTF-8 and
   produces CHARSET */
110 conv->from_utf8 = iconv_open(charset, "UTF-8");
111 if (conv->from_utf8 != (iconv_t)-1) {
/* the reverse direction: reads CHARSET and produces UTF-8 */
112 conv->to_utf8 = iconv_open("UTF-8", charset);
113 if (conv->to_utf8 != (iconv_t)-1) {
114 conv->unknown = strdup(DEFAULT_UNKNOWN);
/* NOTE(review): the result of create_conv_buffer() is dereferenced on the
   very next line; if it can be NULL after an allocation failure this is a
   NULL dereference -- confirm and test before tagging. */
116 conv->inbuf = create_conv_buffer(INITIAL_BUFSIZE);
117 conv->inbuf->type = INTERNAL_BUFFER;
/* NOTE(review): same pattern as for inbuf above */
122 conv->outbuf = create_conv_buffer(INITIAL_BUFSIZE);
123 conv->outbuf->type = INTERNAL_BUFFER;
125 cleanup = 0; /* All successful */
/* tear down whatever was set up so far */
135 cleanup_utf8_conversion(conv);
/* Replace the converter's replacement string with a private copy of UNKNOWN.
   This is the text convert_from_utf8() appends to the output in place of a
   character it cannot convert.  Nothing is changed when CONV or UNKNOWN is
   NULL.  The copy is made before the old string is released. */
143 int conversion_set_unknown(convert_t conv, const char* unknown)
147 if (conv && unknown) {
/* the converter keeps its own copy; the caller's string is not retained */
148 char* unknown_copy = strdup(unknown);
150 if (conv->unknown) free(conv->unknown);
151 conv->unknown = unknown_copy;
/* Make BUF the output buffer of CONV.
   The branch shown below requires BUF to be non-NULL and tagged
   EXTERNAL_BUFFER, and requires the converter either to have no output
   buffer yet or to hold one tagged EXTERNAL_BUFFER; a converter-owned
   (INTERNAL_BUFFER) output buffer therefore does not satisfy it. */
160 int conversion_set_output_buffer(convert_t conv, conv_buffer_t buf)
164 else if ((!conv->outbuf || conv->outbuf->type == EXTERNAL_BUFFER)
165 && buf && buf->type == EXTERNAL_BUFFER) {
/* Release the resources held by CONV: close each iconv descriptor that was
   successfully opened and free the input and output buffers that are tagged
   INTERNAL_BUFFER.  Buffers tagged EXTERNAL_BUFFER are left alone.
   NOTE(review): release of conv->unknown and of CONV itself, and any NULL
   test on CONV, are not visible here -- confirm. */
173 void cleanup_utf8_conversion(convert_t conv)
/* (iconv_t)-1 marks a descriptor that was never opened */
176 if (conv->from_utf8 != (iconv_t)-1)
177 iconv_close(conv->from_utf8);
178 if (conv->to_utf8 != (iconv_t)-1)
179 iconv_close(conv->to_utf8);
/* only converter-owned buffers are freed here */
180 if (conv->inbuf && conv->inbuf->type == INTERNAL_BUFFER)
181 free_conv_buffer(conv->inbuf);
182 if (conv->outbuf && conv->outbuf->type == INTERNAL_BUFFER)
183 free_conv_buffer(conv->outbuf);
/* Convert the NUL-terminated UTF-8 string INPUT to the converter's charset.
   The result is written into conv->outbuf and the function returns
   outbuf->buffer; that storage is cleared at the start of every conversion
   call, so the result must be used or copied before the next call.
   CONV_FAILS, if non-NULL, receives a failure count: the input length when
   CONV or its output buffer is missing, otherwise 0 plus one per input
   character that could not be converted, plus the number of unconverted
   bytes on the remaining error path.
   OUTPUT_LEN, if non-NULL, receives the number of bytes written.
   NOTE(review): strlen(input) runs in the initializer below, before any
   check -- INPUT must not be NULL. */
190 char* convert_from_utf8(convert_t conv, const char* input, int* conv_fails,
193 size_t insize = strlen(input);
/* iconv() wants a non-const input pointer on some platforms */
195 ICONV_CONST char* inptr = (ICONV_CONST char*) input;
198 struct conv_buffer* outbuf;
200 if (!conv || !conv->outbuf) {
/* nothing can be converted: count the whole input as failed */
201 if (conv_fails != NULL) *conv_fails = insize;
204 /* make sure we start from an empty state */
205 iconv(conv->from_utf8, NULL, NULL, NULL, NULL);
206 if (conv_fails != NULL) *conv_fails = 0;
207 /* set up output buffer (empty it) */
208 outbuf = conv->outbuf;
209 outptr = outbuf->buffer;
210 outsize = outbuf->size;
211 reset_conv_buffer(conv->outbuf);
212 nconv = iconv(conv->from_utf8, &inptr, &insize, &outptr, &outsize);
/* iconv() stops at the first problem; handle it and resume until the whole
   input has been consumed */
213 while (nconv == (size_t)-1) {
214 if (errno == E2BIG) {
215 /* grow the output buffer */
216 outptr = grow_conv_buffer(outbuf, outptr);
/* remaining room after the current write position */
218 outsize = outbuf->size - (outptr - outbuf->buffer);
224 else if (errno == EILSEQ) {
225 /* skip over character */
226 const char* unkn_ptr = conv->unknown;
227 if (conv_fails != NULL) (*conv_fails)++;
/* high bit clear: a single-byte (ASCII) character */
228 if ((*inptr & 0x80) == 0) {
229 /* an ASCII character, just skip one (this case is very improbable) */
233 /* a general UTF-8 character, skip all continuation bytes (binary 10xxxxxx) */
235 while ((*inptr & 0xC0) == 0x80) {
239 /* append the "unknown" string to the output */
/* NOTE(review): no test against outsize is visible before these writes; if
   the "unknown" string is longer than the remaining room this writes past
   the end of the buffer and the unsigned outsize wraps -- confirm and grow
   the buffer first if needed. */
240 while (*unkn_ptr) { *outptr++ = *unkn_ptr++; outsize--; }
243 /* EINVAL should not happen, since we convert entire strings */
244 /* EBADF is an error which should be captured by the first if above */
/* everything still unconverted counts as failed */
245 if (conv_fails != NULL) *conv_fails += insize;
/* resume with the rest of the input */
248 nconv = iconv(conv->from_utf8, &inptr, &insize, &outptr, &outsize);
/* bytes produced = distance from the start of the buffer to the write pointer */
250 if (output_len) *output_len = outptr - outbuf->buffer;
251 return outbuf->buffer;
/* Convert INPUT_LEN bytes of INPUT from the converter's charset to UTF-8.
   The result is written into conv->outbuf and the function returns
   outbuf->buffer; that storage is cleared at the start of every conversion
   call, so the result must be used or copied before the next call.
   The output buffer is grown whenever iconv() reports E2BIG.  Input that
   does not match the source encoding (EILSEQ) makes the function return
   NULL, per the comment in the loop. */
254 char* convert_to_utf8(convert_t conv, const char* input, size_t input_len)
/* iconv() wants a non-const input pointer on some platforms */
257 ICONV_CONST char *inptr = (ICONV_CONST char*) input;
260 struct conv_buffer* outbuf;
262 if (!conv || !conv->outbuf)
264 /* make sure we start from an empty state */
265 iconv(conv->to_utf8, NULL, NULL, NULL, NULL);
266 /* set up output buffer (empty it) */
267 outbuf = conv->outbuf;
268 outptr = outbuf->buffer;
269 outsize = outbuf->size;
270 reset_conv_buffer(conv->outbuf);
271 nconv = iconv(conv->to_utf8, &inptr, &input_len, &outptr, &outsize);
/* iconv() stops at the first problem; handle it and resume */
272 while (nconv == (size_t)-1) {
273 if (errno == E2BIG) {
274 /* grow the output buffer */
275 outptr = grow_conv_buffer(outbuf, outptr);
/* remaining room after the current write position */
277 outsize = outbuf->size - (outptr - outbuf->buffer);
284 /* EILSEQ happens when the input doesn't match the source encoding,
285 return NULL in this case */
286 /* EINVAL should not happen, since we convert entire strings */
287 /* EBADF is an error which should be captured by the first if above */
/* resume with the rest of the input */
290 nconv = iconv(conv->to_utf8, &inptr, &input_len, &outptr, &outsize);
292 return outbuf->buffer;
/* Incremental variant of convert_to_utf8(): INPUT may end in the middle of a
   multi-byte character.  The INPUT_LEN new bytes are appended to whatever
   the previous call left in conv->inbuf, the combined data is converted to
   UTF-8 into conv->outbuf, and the bytes iconv() did not consume are moved
   to the front of the input buffer for the next call.
   NOTE(review): CONV is dereferenced in the initializers below, before the
   NULL test further down, so that test cannot protect against a NULL CONV --
   confirm and move the dereferences after the test. */
295 char* convert_to_utf8_incremental(convert_t conv,
296 const char* input, size_t input_len)
299 struct conv_buffer* outbuf = conv->outbuf;
300 struct conv_buffer* inbuf = conv->inbuf;
301 size_t outsize = outbuf->size;
302 char* wrptr = outbuf->buffer;
303 ICONV_CONST char* rdptr = (ICONV_CONST char*) inbuf->buffer;
304 char* retval = outbuf->buffer;
306 if (!conv || !conv->outbuf)
309 /* set up input buffer (concatenate to what was left previous time) */
310 /* can't use strcpy, because possible null bytes from unicode */
/* NOTE(review): the return value of grow_conv_buffer() is ignored; if a
   failed grow leaves inbuf->size unchanged this loop would not terminate.
   rdptr and retval were also captured before any growth or reallocation of
   the buffers -- confirm they are refreshed where needed. */
311 while (conv->insize + input_len > inbuf->size)
312 grow_conv_buffer(inbuf, inbuf->buffer + conv->insize);
313 memcpy(inbuf->buffer + conv->insize, input, input_len);
314 conv->insize += input_len;
316 /* set up output buffer (empty it) */
317 reset_conv_buffer(outbuf);
319 /* do the conversion */
/* iconv() advances rdptr and decrements conv->insize, so afterwards
   conv->insize is the number of bytes not yet consumed */
320 res = iconv(conv->to_utf8, &rdptr, &conv->insize, &wrptr, &outsize);
321 if (res == (size_t)-1) {
322 if (errno == EILSEQ) {
323 /* restart from an empty state and return NULL */
/* EINVAL: the input ends with an incomplete multi-byte sequence */
328 else if (errno == EINVAL) {
329 /* Do nothing, leave it to next iteration */
336 /* then shift what is left over to the head of the input buffer */
/* memmove, not memcpy: source and destination lie in the same buffer */
337 memmove(inbuf->buffer, rdptr, conv->insize);
/* clear the part of the input buffer behind the leftover bytes */
338 memset(inbuf->buffer + conv->insize, 0, inbuf->size - conv->insize);