/* Encoding utility from UTF-8 to another charset and vice versa
   Copyright (C) 2001, 2002 Peter Verthez

   Permission granted to do anything with this file that you want, as long
   as the above copyright is retained in all copies.
   THERE IS NO WARRANTY - USE AT YOUR OWN RISK
*/
/* Size in bytes used for a conversion buffer when create_conv_buffer() is
   called with size 0, and for the converter's own input/output buffers. */
#define INITIAL_BUFSIZE 256

/* Replacement text installed as a new converter's "unknown" string; it is
   what convert_from_utf8() writes for characters it cannot convert. */
#define DEFAULT_UNKNOWN "?"

/* Values of the `type` field of struct conv_buffer.
   cleanup_utf8_conversion() only frees buffers marked INTERNAL_BUFFER;
   create_conv_buffer() marks every new buffer EXTERNAL_BUFFER. */
#define INTERNAL_BUFFER 0
#define EXTERNAL_BUFFER 1
25 void reset_conv_buffer(struct conv_buffer* buf)
27 memset(buf->buffer, 0, buf->size);
/* Allocate a new conversion buffer.
 *
 * size: requested size of the data area in bytes; 0 selects
 *       INITIAL_BUFSIZE.
 *
 * The visible code allocates the struct and a data area of `size` bytes,
 * marks the buffer EXTERNAL_BUFFER and calls reset_conv_buffer() on it.
 *
 * NOTE(review): this listing has gaps (embedded numbers jump 36->39 and
 * 42->50).  The checks on the malloc() results, the assignment of
 * buf->size and the return statement are not visible here - confirm
 * against the complete file.
 * NOTE(review): `size` is an int and only 0 is special-cased; a negative
 * value would be passed to malloc() converted to size_t - verify callers.
 */
30 struct conv_buffer* create_conv_buffer(int size)
32 struct conv_buffer* buf = NULL;
/* a size of 0 means "use the default" */
34 if (size == 0) size = INITIAL_BUFSIZE;
36 buf = (struct conv_buffer*) malloc(sizeof(struct conv_buffer));
39 buf->buffer = (char*)malloc(size);
/* new buffers start out as EXTERNAL; initialize_utf8_conversion() re-marks
   the ones it owns as INTERNAL_BUFFER */
40 buf->type = EXTERNAL_BUFFER;
42 reset_conv_buffer(buf);
/* Presumably releases a buffer obtained from create_conv_buffer().
 *
 * NOTE(review): the body is not visible in this listing (embedded numbers
 * jump 50->58), so what exactly is freed cannot be confirmed here.  Within
 * this file it is only called from cleanup_utf8_conversion(), and only for
 * buffers whose type is INTERNAL_BUFFER.
 */
50 void free_conv_buffer(struct conv_buffer* buf)
/* Enlarge a conversion buffer and carry a position pointer across.
 *
 * buf:      buffer to grow; the new size requested from realloc() is twice
 *           buf->size.
 * curr_pos: pointer into the current data area of buf.  Its offset from
 *           buf->buffer is recorded before the reallocation and re-applied
 *           to the new data area afterwards, because realloc() may move
 *           the block.
 *
 * After the move, the bytes from the translated position up to buf->size
 * are set to zero.  Callers in this file assign the result to their write
 * pointer.
 *
 * NOTE(review): the declaration of new_buffer, the handling of a failed
 * realloc(), the update of buf->size and the return statement fall in gaps
 * of this listing (embedded numbers 60->62, 64->66, 66->68, 69->76) -
 * confirm against the complete file.
 */
58 char* grow_conv_buffer(struct conv_buffer* buf, char* curr_pos)
60 size_t outlen, new_size;
/* remember how far into the buffer the caller had got */
62 outlen = curr_pos - buf->buffer;
63 new_size = buf->size * 2;
/* the result goes into a temporary first, so buf->buffer is not overwritten
   directly by the realloc() result */
64 new_buffer = realloc(buf->buffer, new_size);
66 buf->buffer = new_buffer;
/* re-derive the position inside the (possibly moved) data area */
68 curr_pos = buf->buffer + outlen;
/* zero everything from the current position to the end of the buffer */
69 memset(curr_pos, 0, buf->size - (curr_pos - buf->buffer));
/* Create a converter between UTF-8 and `charset`.
 *
 * charset:         name of the other character set, passed to iconv_open()
 *                  for both directions.
 * external_outbuf: NOTE(review): its use is not visible in this listing;
 *                  presumably it selects whether an internal output buffer
 *                  is created - confirm against the complete file.
 *
 * The visible code allocates a struct convert, opens one iconv descriptor
 * per direction, installs a copy of DEFAULT_UNKNOWN as the "unknown"
 * string, and creates input and output buffers of INITIAL_BUFSIZE bytes
 * that are marked INTERNAL_BUFFER.  A `cleanup` flag is cleared once
 * everything succeeded; cleanup_utf8_conversion() is called from a part of
 * the function that is only partly visible.
 *
 * NOTE(review): declarations, NULL checks, closing braces and the return
 * statement fall in gaps of this listing.
 * NOTE(review): embedded listing lines 101/102 and 107/108 are consecutive,
 * so ->type is written straight after create_conv_buffer() with no NULL
 * check in between - verify how create_conv_buffer() reports failure.
 */
76 convert_t initialize_utf8_conversion(const char* charset, int external_outbuf)
78 struct convert *conv = NULL;
82 conv = (struct convert *)malloc(sizeof(struct convert));
84 /* Unless reset to 0 at the end, this will force cleanup */
86 /* First initialize to default values */
/* (iconv_t)-1 is the "not open" marker that cleanup_utf8_conversion()
   tests before calling iconv_close() */
87 conv->from_utf8 = (iconv_t)-1;
88 conv->to_utf8 = (iconv_t)-1;
94 /* Now initialize everything to what it should be */
/* iconv_open(tocode, fromcode): this descriptor converts UTF-8 -> charset */
95 conv->from_utf8 = iconv_open(charset, "UTF-8");
96 if (conv->from_utf8 != (iconv_t)-1) {
/* and this one converts charset -> UTF-8 */
97 conv->to_utf8 = iconv_open("UTF-8", charset);
98 if (conv->to_utf8 != (iconv_t)-1) {
/* heap copy, so conversion_set_unknown() can free and replace it later */
99 conv->unknown = strdup(DEFAULT_UNKNOWN);
101 conv->inbuf = create_conv_buffer(INITIAL_BUFSIZE);
/* owned by the converter: cleanup_utf8_conversion() frees INTERNAL buffers */
102 conv->inbuf->type = INTERNAL_BUFFER;
107 conv->outbuf = create_conv_buffer(INITIAL_BUFSIZE);
108 conv->outbuf->type = INTERNAL_BUFFER;
110 cleanup = 0; /* All successful */
120 cleanup_utf8_conversion(conv);
/* Replace the converter's "unknown" string (the text written for
 * characters that cannot be converted).
 *
 * conv:    converter to modify; nothing visible is done when it is NULL.
 * unknown: new replacement text; nothing visible is done when it is NULL.
 *          The converter stores its own strdup() copy, so the caller keeps
 *          ownership of `unknown`.
 *
 * NOTE(review): the return value and the check of the strdup() result fall
 * in gaps of this listing (embedded numbers 128->132, 133->135, 136->145) -
 * confirm against the complete file.
 */
128 int conversion_set_unknown(convert_t conv, const char* unknown)
132 if (conv && unknown) {
/* copy first, so the old string is only released once a new one exists */
133 char* unknown_copy = strdup(unknown);
/* the guard is redundant: free(NULL) is a no-op */
135 if (conv->unknown) free(conv->unknown);
136 conv->unknown = unknown_copy;
/* Presumably attaches a caller-provided output buffer to the converter.
 *
 * NOTE(review): only one condition of this function is visible in this
 * listing.  That branch is taken when the converter currently has no
 * output buffer or has an EXTERNAL_BUFFER one, and `buf` is a non-NULL
 * buffer that is itself marked EXTERNAL_BUFFER.  The preceding branch, the
 * action taken and the return values are not visible - confirm against the
 * complete file.
 */
145 int conversion_set_output_buffer(convert_t conv, struct conv_buffer* buf)
/* never replace an INTERNAL buffer, and only accept an EXTERNAL one */
149 else if ((!conv->outbuf || conv->outbuf->type == EXTERNAL_BUFFER)
150 && buf && buf->type == EXTERNAL_BUFFER) {
/* Release the resources held by a converter.
 *
 * The visible code closes each iconv descriptor that was successfully
 * opened and frees the input and output buffers, but only when they are
 * marked INTERNAL_BUFFER; EXTERNAL_BUFFER buffers are left alone.
 *
 * NOTE(review): a guard on `conv`, and the release of conv->unknown and of
 * the struct itself, are not visible in this listing (embedded numbers
 * 158->161 and 168->175) - confirm against the complete file.
 */
158 void cleanup_utf8_conversion(convert_t conv)
/* (iconv_t)-1 marks a descriptor that was never opened */
161 if (conv->from_utf8 != (iconv_t)-1)
162 iconv_close(conv->from_utf8);
163 if (conv->to_utf8 != (iconv_t)-1)
164 iconv_close(conv->to_utf8);
/* only buffers marked INTERNAL are freed here */
165 if (conv->inbuf && conv->inbuf->type == INTERNAL_BUFFER)
166 free_conv_buffer(conv->inbuf);
167 if (conv->outbuf && conv->outbuf->type == INTERNAL_BUFFER)
168 free_conv_buffer(conv->outbuf);
/* Convert a NUL-terminated UTF-8 string to the converter's charset.
 *
 * conv:       converter; the early branch below is taken when it is NULL
 *             or has no output buffer.
 * input:      UTF-8 text.  NOTE(review): strlen(input) runs in an
 *             initializer, i.e. before `conv` is checked, so input must
 *             not be NULL.
 * conv_fails: optional (may be NULL).  Set to the input length in the
 *             early branch, reset to 0 before converting, incremented once
 *             per character iconv() rejects with EILSEQ, and increased by
 *             the number of unconverted input bytes in the fallback branch.
 *
 * Returns the data area of the converter's output buffer.  That buffer is
 * cleared at the start of every call, so the result is only valid until
 * the next conversion on the same converter.
 *
 * NOTE(review): declarations, several branch bodies, closing braces and
 * the early return fall in gaps of this listing; the comments below cover
 * only what is visible.
 */
175 char* convert_from_utf8(convert_t conv, const char* input, int* conv_fails)
177 size_t insize = strlen(input);
/* iconv() takes a non-const input pointer on some platforms; ICONV_CONST
   is defined outside this listing (presumably by configure) */
179 ICONV_CONST char* inptr = (ICONV_CONST char*) input;
182 struct conv_buffer* outbuf;
184 if (!conv || !conv->outbuf) {
/* nothing can be converted: count the whole input as failed
   (size_t narrowed to int here) */
185 if (conv_fails != NULL) *conv_fails = insize;
188 /* make sure we start from an empty state */
189 iconv(conv->from_utf8, NULL, NULL, NULL, NULL);
190 if (conv_fails != NULL) *conv_fails = 0;
191 /* set up output buffer (empty it) */
192 outbuf = conv->outbuf;
193 outptr = outbuf->buffer;
194 outsize = outbuf->size;
195 reset_conv_buffer(conv->outbuf);
196 nconv = iconv(conv->from_utf8, &inptr, &insize, &outptr, &outsize);
/* iconv() stops at the first problem; fix it up and resume until the
   whole input is consumed */
197 while (nconv == (size_t)-1) {
198 if (errno == E2BIG) {
199 /* grow the output buffer */
200 outptr = grow_conv_buffer(outbuf, outptr);
/* space left behind the (relocated) write position */
202 outsize = outbuf->size - (outptr - outbuf->buffer);
208 else if (errno == EILSEQ) {
209 /* skip over character */
210 const char* unkn_ptr = conv->unknown;
211 if (conv_fails != NULL) (*conv_fails)++;
212 if ((*inptr & 0x80) == 0) {
213 /* an ASCII character, just skip one (this case is very improbable) */
217 /* a general UTF-8 character, skip all continuation bytes (binary 10xxxxxx) */
219 while ((*inptr & 0xC0) == 0x80) {
223 /* append the "unknown" string to the output */
/* NOTE(review): no check of the remaining space is visible before this
   copy; outsize is decremented per byte written, so a long "unknown"
   string near the end of the buffer could overrun it - confirm */
224 while (*unkn_ptr) { *outptr++ = *unkn_ptr++; outsize--; }
227 /* EINVAL should not happen, since we convert entire strings */
228 /* EBADF is an error which should be captured by the first if above */
229 if (conv_fails != NULL) *conv_fails += insize;
/* resume with whatever input is left */
232 nconv = iconv(conv->from_utf8, &inptr, &insize, &outptr, &outsize);
234 return outbuf->buffer;
/* Convert a NUL-terminated string in the converter's charset to UTF-8.
 *
 * conv:  converter; the early branch below is taken when it is NULL or has
 *        no output buffer.
 * input: text in the converter's charset.  NOTE(review): strlen(input)
 *        runs in an initializer, i.e. before `conv` is checked, so input
 *        must not be NULL.
 *
 * Returns the data area of the converter's output buffer.  That buffer is
 * cleared at the start of every call, so the result is only valid until
 * the next conversion on the same converter.  According to the comment in
 * the loop, NULL is returned when the input does not match the source
 * encoding (EILSEQ); the statement doing so is not visible here.
 *
 * NOTE(review): declarations, the early return, the non-E2BIG branch and
 * closing braces fall in gaps of this listing.
 */
237 char* convert_to_utf8(convert_t conv, const char* input)
239 size_t insize = strlen(input);
/* iconv() takes a non-const input pointer on some platforms; ICONV_CONST
   is defined outside this listing (presumably by configure) */
241 ICONV_CONST char *inptr = (ICONV_CONST char*) input;
244 struct conv_buffer* outbuf;
246 if (!conv || !conv->outbuf)
248 /* make sure we start from an empty state */
249 iconv(conv->to_utf8, NULL, NULL, NULL, NULL);
250 /* set up output buffer (empty it) */
251 outbuf = conv->outbuf;
252 outptr = outbuf->buffer;
253 outsize = outbuf->size;
254 reset_conv_buffer(conv->outbuf);
255 nconv = iconv(conv->to_utf8, &inptr, &insize, &outptr, &outsize);
/* keep going for as long as iconv() reports a problem */
256 while (nconv == (size_t)-1) {
257 if (errno == E2BIG) {
258 /* grow the output buffer */
259 outptr = grow_conv_buffer(outbuf, outptr);
/* space left behind the (relocated) write position */
261 outsize = outbuf->size - (outptr - outbuf->buffer);
268 /* EILSEQ happens when the input doesn't match the source encoding,
269 return NULL in this case */
270 /* EINVAL should not happen, since we convert entire strings */
271 /* EBADF is an error which should be captured by the first if above */
/* resume with whatever input is left */
274 nconv = iconv(conv->to_utf8, &inptr, &insize, &outptr, &outsize);
276 return outbuf->buffer;
/* Convert a chunk of charset-encoded input to UTF-8, keeping unconsumed
 * bytes for the next call.
 *
 * conv:      converter; its input buffer accumulates bytes across calls
 *            and conv->insize counts how many are pending.
 * input:     next chunk of input bytes; may contain NUL bytes and may end
 *            in the middle of a multi-byte character.
 * input_len: number of bytes in `input`.
 *
 * The chunk is appended to the pending input, iconv() is run once over all
 * of it into the (cleared) output buffer, and whatever iconv() did not
 * consume is moved to the front of the input buffer.  An incomplete
 * sequence at the end (EINVAL) is simply left for the next call.
 *
 * NOTE(review): conv is dereferenced in the initializers below, before the
 * `!conv` test, so that test cannot protect against a NULL converter.
 * NOTE(review): rdptr is taken from inbuf->buffer before the grow loop;
 * grow_conv_buffer() uses realloc(), which may move the data area, and no
 * visible line refreshes rdptr before iconv() uses it - confirm.
 * NOTE(review): the grow loop ignores grow_conv_buffer()'s result; if a
 * failed grow leaves inbuf->size unchanged the loop would not terminate -
 * confirm against the hidden part of grow_conv_buffer().
 * NOTE(review): the declaration of `res`, the bodies of the error
 * branches, any handling of a full output buffer and the return statement
 * fall in gaps of this listing.
 */
279 char* convert_to_utf8_incremental(convert_t conv,
280 const char* input, size_t input_len)
283 struct conv_buffer* outbuf = conv->outbuf;
284 struct conv_buffer* inbuf = conv->inbuf;
285 size_t outsize = outbuf->size;
286 char* wrptr = outbuf->buffer;
287 ICONV_CONST char* rdptr = (ICONV_CONST char*) inbuf->buffer;
288 char* retval = outbuf->buffer;
290 if (!conv || !conv->outbuf)
293 /* set up input buffer (concatenate to what was left previous time) */
294 /* can't use strcpy, because possible null bytes from unicode */
/* double the input buffer until pending bytes plus the new chunk fit */
295 while (conv->insize + input_len > inbuf->size)
296 grow_conv_buffer(inbuf, inbuf->buffer + conv->insize);
297 memcpy(inbuf->buffer + conv->insize, input, input_len);
298 conv->insize += input_len;
300 /* set up output buffer (empty it) */
301 reset_conv_buffer(outbuf);
303 /* do the conversion */
/* iconv() advances rdptr and decreases conv->insize by the number of bytes
   it consumed */
304 res = iconv(conv->to_utf8, &rdptr, &conv->insize, &wrptr, &outsize);
305 if (res == (size_t)-1) {
306 if (errno == EILSEQ) {
307 /* restart from an empty state and return NULL */
312 else if (errno == EINVAL) {
313 /* Do nothing, leave it to next iteration */
320 /* then shift what is left over to the head of the input buffer */
/* source and destination may overlap, hence memmove() rather than memcpy() */
321 memmove(inbuf->buffer, rdptr, conv->insize);
/* zero the rest of the buffer behind the kept bytes */
322 memset(inbuf->buffer + conv->insize, 0, inbuf->size - conv->insize);