1 /* Encoding utility from UTF-8 to another charset and vice versa
2 Copyright (C) 2001, 2002 Peter Verthez
4 Permission granted to do anything with this file that you want, as long
5 as the above copyright is retained in all copies.
6 THERE IS NO WARRANTY - USE AT YOUR OWN RISK
19 #define INITIAL_BUFSIZE 256
20 #define DEFAULT_UNKNOWN "?"
22 #define INTERNAL_BUFFER 0
23 #define EXTERNAL_BUFFER 1
28 int type; /* For internal use */
34 struct conv_buffer* inbuf;
36 struct conv_buffer* outbuf;
/* Clear a conversion buffer: every one of the buf->size bytes in
   buf->buffer is overwritten with a zero byte.
   NOTE(review): buf is dereferenced unconditionally -- confirm that every
   caller passes a valid, fully allocated buffer. */
40 void reset_conv_buffer(conv_buffer_t buf)
42 memset(buf->buffer, 0, buf->size);
/* Allocate a conversion buffer whose data area holds 'size' bytes.
   A size of 0 selects INITIAL_BUFSIZE.  The new buffer is tagged
   EXTERNAL_BUFFER and its data area is zeroed through reset_conv_buffer.
   NOTE(review): 'size' is a signed int that is passed on to malloc; a
   negative value would be converted to a very large allocation request --
   confirm callers never pass one. */
45 conv_buffer_t create_conv_buffer(int size)
47 struct conv_buffer* buf = NULL;
/* 0 means "use the default size" */
49 if (size == 0) size = INITIAL_BUFSIZE;
51 buf = (struct conv_buffer*) malloc(sizeof(struct conv_buffer));
/* NOTE(review): both malloc results must be checked before they are used;
   reset_conv_buffer below writes through buf->buffer -- confirm. */
54 buf->buffer = (char*)malloc(size);
/* Buffers start out as caller-owned; initialize_utf8_conversion re-tags
   the ones it creates for itself as INTERNAL_BUFFER. */
55 buf->type = EXTERNAL_BUFFER;
57 reset_conv_buffer(buf);
/* Dispose of a conversion buffer.
   cleanup_utf8_conversion calls this only for buffers tagged
   INTERNAL_BUFFER, so a buffer tagged EXTERNAL_BUFFER has to be passed here
   by whoever owns it.
   NOTE(review): presumably releases both the data area and the struct
   itself, and tolerates a NULL buf -- confirm. */
65 void free_conv_buffer(conv_buffer_t buf)
/* Double the capacity of a conversion buffer with realloc.
   curr_pos points into the buffer's current data area.  Since realloc may
   move the block, the position is re-derived from its byte offset, and the
   bytes from that position to the end of the buffer are zeroed.
   Callers use the return value as the relocated write position.
   NOTE(review): confirm which value is returned when realloc fails, and
   that callers check for it. */
73 char* grow_conv_buffer(conv_buffer_t buf, char* curr_pos)
75 size_t outlen, new_size;
/* Keep the position as an offset: the pointer itself is stale as soon as
   realloc moves the block. */
77 outlen = curr_pos - buf->buffer;
78 new_size = buf->size * 2;
/* The result goes into a temporary so the old block is not lost if
   realloc fails. */
79 new_buffer = realloc(buf->buffer, new_size);
81 buf->buffer = new_buffer;
83 curr_pos = buf->buffer + outlen;
/* Zero the tail of the buffer, starting at the current position.
   NOTE(review): the length is computed from buf->size, so buf->size must
   already hold new_size at this point -- confirm. */
84 memset(curr_pos, 0, buf->size - (curr_pos - buf->buffer));
/* Create a conversion context between UTF-8 and 'charset'.
   Two iconv descriptors are opened (UTF-8 -> charset and charset -> UTF-8),
   the replacement string for unconvertible characters is set to a copy of
   DEFAULT_UNKNOWN, and input/output buffers of INITIAL_BUFSIZE bytes are
   created and tagged INTERNAL_BUFFER so that cleanup_utf8_conversion frees
   them.
   NOTE(review): presumably the output buffer is only created when
   external_outbuf is zero, and NULL is returned after cleanup on failure --
   confirm both. */
91 convert_t initialize_utf8_conversion(const char* charset, int external_outbuf)
93 struct convert *conv = NULL;
97 conv = (struct convert *)malloc(sizeof(struct convert));
99 /* Unless reset to 0 at the end, this will force cleanup */
101 /* First initialize to default values */
/* (iconv_t)-1 is the "not open" marker that cleanup_utf8_conversion
   tests before calling iconv_close. */
102 conv->from_utf8 = (iconv_t)-1;
103 conv->to_utf8 = (iconv_t)-1;
107 conv->unknown = NULL;
109 /* Now initialize everything to what it should be */
110 conv->from_utf8 = iconv_open(charset, "UTF-8");
111 if (conv->from_utf8 != (iconv_t)-1) {
112 conv->to_utf8 = iconv_open("UTF-8", charset);
113 if (conv->to_utf8 != (iconv_t)-1) {
114 conv->unknown = strdup(DEFAULT_UNKNOWN);
/* NOTE(review): the type field is written immediately after creation,
   with no NULL check in between; if create_conv_buffer can return NULL
   this dereferences a NULL pointer -- confirm. */
116 conv->inbuf = create_conv_buffer(INITIAL_BUFSIZE);
117 conv->inbuf->type = INTERNAL_BUFFER;
/* NOTE(review): same unchecked dereference as for inbuf above. */
122 conv->outbuf = create_conv_buffer(INITIAL_BUFSIZE);
123 conv->outbuf->type = INTERNAL_BUFFER;
125 cleanup = 0; /* All successful */
/* Tear down whatever was set up so far. */
135 cleanup_utf8_conversion(conv);
/* Replace the string that is substituted for unconvertible characters.
   The update is only attempted when both conv and unknown are non-NULL.
   'unknown' is duplicated with strdup, so the caller keeps ownership of its
   own string; the previously stored copy is freed.
   NOTE(review): the meaning of the int return value should be documented
   here -- confirm. */
143 int conversion_set_unknown(convert_t conv, const char* unknown)
147 if (conv && unknown) {
148 char* unknown_copy = strdup(unknown);
/* NOTE(review): the old string should only be released once strdup is
   known to have succeeded -- confirm a check guards these two lines. */
150 if (conv->unknown) free(conv->unknown);
151 conv->unknown = unknown_copy;
/* Attach a caller-supplied output buffer to a conversion context.
   The branch below is taken only when the context currently has no output
   buffer or one tagged EXTERNAL_BUFFER, and 'buf' is non-NULL and itself
   tagged EXTERNAL_BUFFER; an output buffer tagged INTERNAL_BUFFER therefore
   does not satisfy this condition.
   NOTE(review): presumably the branch stores buf in conv->outbuf and the
   return value signals success -- confirm. */
160 int conversion_set_output_buffer(convert_t conv, conv_buffer_t buf)
164 else if ((!conv->outbuf || conv->outbuf->type == EXTERNAL_BUFFER)
165 && buf && buf->type == EXTERNAL_BUFFER) {
/* Release the resources held by a conversion context.
   Each iconv descriptor is closed only if it was actually opened (i.e. is
   not (iconv_t)-1), and the input/output buffers are freed only when they
   are tagged INTERNAL_BUFFER; buffers tagged EXTERNAL_BUFFER are left
   alone.
   NOTE(review): confirm that conv is checked for NULL before these
   dereferences, and that conv->unknown and conv itself are released too. */
173 void cleanup_utf8_conversion(convert_t conv)
176 if (conv->from_utf8 != (iconv_t)-1)
177 iconv_close(conv->from_utf8);
178 if (conv->to_utf8 != (iconv_t)-1)
179 iconv_close(conv->to_utf8);
180 if (conv->inbuf && conv->inbuf->type == INTERNAL_BUFFER)
181 free_conv_buffer(conv->inbuf);
182 if (conv->outbuf && conv->outbuf->type == INTERNAL_BUFFER)
183 free_conv_buffer(conv->outbuf);
/* Convert the NUL-terminated UTF-8 string 'input' to the context's other
   charset.
   The result is written into the context's output buffer, which is emptied
   first and grown whenever iconv reports E2BIG.  The function returns a
   pointer to that buffer's data, so the result is overwritten by the next
   conversion that uses the same buffer.
   Characters that iconv rejects with EILSEQ are skipped and replaced by
   conv->unknown.  When conv_fails is non-NULL it is set to the number of
   such replacements; the remaining input length is added (or stored, when
   the context or its output buffer is missing) if conversion cannot
   proceed at all.
   NOTE(review): strlen(input) runs in the initializer, before any check,
   so input must not be NULL -- confirm callers guarantee this. */
190 char* convert_from_utf8(convert_t conv, const char* input, int* conv_fails)
192 size_t insize = strlen(input);
194 ICONV_CONST char* inptr = (ICONV_CONST char*) input;
197 struct conv_buffer* outbuf;
199 if (!conv || !conv->outbuf) {
/* Nothing can be converted: report the whole input as failed. */
200 if (conv_fails != NULL) *conv_fails = insize;
203 /* make sure we start from an empty state */
204 iconv(conv->from_utf8, NULL, NULL, NULL, NULL);
205 if (conv_fails != NULL) *conv_fails = 0;
206 /* set up output buffer (empty it) */
207 outbuf = conv->outbuf;
208 outptr = outbuf->buffer;
209 outsize = outbuf->size;
210 reset_conv_buffer(conv->outbuf);
211 nconv = iconv(conv->from_utf8, &inptr, &insize, &outptr, &outsize);
/* iconv stops at the first problem; handle it and call iconv again until
   the whole input has been consumed. */
212 while (nconv == (size_t)-1) {
213 if (errno == E2BIG) {
214 /* grow the output buffer */
215 outptr = grow_conv_buffer(outbuf, outptr);
/* Recompute the free space behind the relocated write position. */
217 outsize = outbuf->size - (outptr - outbuf->buffer);
223 else if (errno == EILSEQ) {
224 /* skip over character */
225 const char* unkn_ptr = conv->unknown;
226 if (conv_fails != NULL) (*conv_fails)++;
/* High bit clear means a single-byte (ASCII) character. */
227 if ((*inptr & 0x80) == 0) {
228 /* an ASCII character, just skip one (this case is very improbable) */
232 /* a general UTF-8 character, skip all continuation bytes (binary 10xxxxxx) */
/* Continuation bytes have their top two bits equal to binary 10. */
234 while ((*inptr & 0xC0) == 0x80) {
238 /* append the "unknown" string to the output */
/* NOTE(review): the copy does not test outsize; if the replacement string
   is longer than the space left, this writes past the end of the buffer
   and outsize underflows -- confirm that room is guaranteed here. */
239 while (*unkn_ptr) { *outptr++ = *unkn_ptr++; outsize--; }
242 /* EINVAL should not happen, since we convert entire strings */
243 /* EBADF is an error which should be captured by the first if above */
/* Any other error: count everything not yet converted as failed. */
244 if (conv_fails != NULL) *conv_fails += insize;
247 nconv = iconv(conv->from_utf8, &inptr, &insize, &outptr, &outsize);
249 return outbuf->buffer;
/* Convert the NUL-terminated string 'input' from the context's charset to
   UTF-8.
   The result is written into the context's output buffer, which is emptied
   first and grown whenever iconv reports E2BIG.  The function returns a
   pointer to that buffer's data, so the result is overwritten by the next
   conversion that uses the same buffer.  As the comment in the loop states,
   NULL is returned when the input does not match the source encoding
   (EILSEQ).
   NOTE(review): strlen(input) runs in the initializer, before any check,
   so input must not be NULL -- confirm callers guarantee this. */
252 char* convert_to_utf8(convert_t conv, const char* input)
254 size_t insize = strlen(input);
256 ICONV_CONST char *inptr = (ICONV_CONST char*) input;
259 struct conv_buffer* outbuf;
261 if (!conv || !conv->outbuf)
263 /* make sure we start from an empty state */
264 iconv(conv->to_utf8, NULL, NULL, NULL, NULL);
265 /* set up output buffer (empty it) */
266 outbuf = conv->outbuf;
267 outptr = outbuf->buffer;
268 outsize = outbuf->size;
269 reset_conv_buffer(conv->outbuf);
270 nconv = iconv(conv->to_utf8, &inptr, &insize, &outptr, &outsize);
/* iconv stops at the first problem; handle it and call iconv again until
   the whole input has been consumed. */
271 while (nconv == (size_t)-1) {
272 if (errno == E2BIG) {
273 /* grow the output buffer */
274 outptr = grow_conv_buffer(outbuf, outptr);
/* Recompute the free space behind the relocated write position. */
276 outsize = outbuf->size - (outptr - outbuf->buffer);
283 /* EILSEQ happens when the input doesn't match the source encoding,
284 return NULL in this case */
285 /* EINVAL should not happen, since we convert entire strings */
286 /* EBADF is an error which should be captured by the first if above */
289 nconv = iconv(conv->to_utf8, &inptr, &insize, &outptr, &outsize);
291 return outbuf->buffer;
/* Convert a chunk of input_len bytes from the context's charset to UTF-8,
   carrying unconverted bytes over between calls.
   The new bytes are appended to whatever the previous call left in
   conv->inbuf (conv->insize counts the pending bytes), one iconv call is
   made into the emptied output buffer, and the bytes iconv did not consume
   are moved to the front of the input buffer for the next call.  An
   incomplete sequence at the end of the chunk (EINVAL) is therefore not an
   error; on EILSEQ the comments below say the state is reset and NULL is
   returned.
   NOTE(review): conv->outbuf and conv->inbuf are read in the initializers
   below, i.e. before the '!conv' test, so that test cannot protect against
   a NULL conv -- confirm and reorder if needed. */
294 char* convert_to_utf8_incremental(convert_t conv,
295 const char* input, size_t input_len)
298 struct conv_buffer* outbuf = conv->outbuf;
299 struct conv_buffer* inbuf = conv->inbuf;
300 size_t outsize = outbuf->size;
301 char* wrptr = outbuf->buffer;
302 ICONV_CONST char* rdptr = (ICONV_CONST char*) inbuf->buffer;
303 char* retval = outbuf->buffer;
305 if (!conv || !conv->outbuf)
308 /* set up input buffer (concatenate to what was left previous time) */
309 /* can't use strcpy, because possible null bytes from unicode */
/* NOTE(review): the result of grow_conv_buffer is ignored; if growing
   fails and inbuf->size stays the same this loop would not terminate, and
   rdptr (taken above) would be stale after a successful realloc that moves
   the block -- confirm. */
310 while (conv->insize + input_len > inbuf->size)
311 grow_conv_buffer(inbuf, inbuf->buffer + conv->insize);
312 memcpy(inbuf->buffer + conv->insize, input, input_len);
313 conv->insize += input_len;
315 /* set up output buffer (empty it) */
316 reset_conv_buffer(outbuf);
318 /* do the conversion */
/* iconv advances rdptr and decreases conv->insize as it consumes input. */
319 res = iconv(conv->to_utf8, &rdptr, &conv->insize, &wrptr, &outsize);
/* NOTE(review): only EILSEQ and EINVAL are handled in the lines below;
   confirm what happens when the output buffer is too small (E2BIG). */
320 if (res == (size_t)-1) {
321 if (errno == EILSEQ) {
322 /* restart from an empty state and return NULL */
327 else if (errno == EINVAL) {
328 /* Do nothing, leave it to next iteration */
335 /* then shift what is left over to the head of the input buffer */
/* memmove rather than memcpy: source and destination lie in the same
   buffer and may overlap. */
336 memmove(inbuf->buffer, rdptr, conv->insize);
/* Zero the part of the input buffer behind the pending bytes. */
337 memset(inbuf->buffer + conv->insize, 0, inbuf->size - conv->insize);