1 /* Encoding utility from UTF-8 to locale and vice versa
2 Copyright (C) 2001, 2002 Peter Verthez
4 Permission granted to do anything with this file that you want, as long
5 as the above copyright is retained in all copies.
6 THERE IS NO WARRANTY - USE AT YOUR OWN RISK
/* Size handed to create_conv_buffer() for a new converter's output buffer */
19 #define INITIAL_OUTSIZE 256
/* Replacement text duplicated into conv->unknown when a converter is set up */
20 #define DEFAULT_UNKNOWN "?"
/* Allocate a conv_buffer descriptor plus `size` bytes of character storage
   for it.
   NOTE(review): only the two allocations are visible in this listing; the
   handling of a failed malloc, the initialization of buf->size and the
   return statement are not shown and need confirming against the full
   file. */
22 struct conv_buffer* create_conv_buffer(int size)
24 struct conv_buffer* buf = NULL;
/* the descriptor itself */
26 buf = (struct conv_buffer*) malloc(sizeof(struct conv_buffer));
/* the byte storage the descriptor points at */
29 buf->buffer = (char*)malloc(size);
/* Release a conversion buffer; cleanup_utf8_conversion() calls this on
   conv->outbuf.
   NOTE(review): the body is not visible in this listing, so exactly what is
   freed (and whether a NULL `buf` is accepted) needs confirming. */
37 void free_conv_buffer(struct conv_buffer* buf)
/* Enlarge the storage of `buf` to twice buf->size with realloc().
   `curr_pos` is the caller's current write position inside the old storage;
   because realloc() may move the block, the position is re-derived from its
   byte offset.  The conversion routines assign the result back to their
   output pointer.
   NOTE(review): the declaration of new_buffer, the check of the realloc()
   result and the return statement are not visible in this listing. */
45 char* grow_conv_buffer(struct conv_buffer* buf, char* curr_pos)
47 size_t outlen, new_size;
/* number of bytes already written, i.e. the offset that must be preserved */
49 outlen = curr_pos - buf->buffer;
50 new_size = buf->size * 2;
51 new_buffer = realloc(buf->buffer, new_size);
53 buf->buffer = new_buffer;
/* same offset, but inside the (possibly relocated) storage */
55 curr_pos = buf->buffer + outlen;
/* Zero everything from the write position up to buf->size.
   NOTE(review): this only reaches the newly added half if buf->size has
   already been set to new_size; that assignment is not visible here --
   confirm. */
56 memset(curr_pos, 0, buf->size - (curr_pos - buf->buffer));
/* Create a converter between UTF-8 and `charset`.
   Two iconv descriptors are opened (UTF-8 -> charset and charset -> UTF-8),
   an output buffer of INITIAL_OUTSIZE is created and the "unknown"
   replacement string is set to a copy of DEFAULT_UNKNOWN.  If any step
   leaves the `cleanup` flag set, cleanup_utf8_conversion() is called on the
   partially built converter.
   NOTE(review): the declaration of `cleanup`, the check of the malloc()
   result, the checks on outbuf/unknown and the return statement are not
   visible in this listing; presumably NULL is returned on failure --
   confirm. */
63 convert_t initialize_utf8_conversion(const char* charset)
65 struct convert *conv = NULL;
68 conv = (struct convert *)malloc(sizeof(struct convert));
70 /* Unless reset to 0 at the end, this will force cleanup */
72 /* First initialize to default values */
/* (iconv_t)-1 marks "not opened", which is what cleanup_utf8_conversion()
   tests before calling iconv_close() */
73 conv->from_utf8 = (iconv_t)-1;
74 conv->to_utf8 = (iconv_t)-1;
78 /* Now initialize everything to what it should be */
/* iconv_open() takes (tocode, fromcode): this descriptor converts UTF-8
   input into `charset` */
79 conv->from_utf8 = iconv_open(charset, "UTF-8");
80 if (conv->from_utf8 != (iconv_t)-1) {
/* and this one converts `charset` input into UTF-8 */
81 conv->to_utf8 = iconv_open("UTF-8", charset);
82 if (conv->to_utf8 != (iconv_t)-1) {
/* single output buffer, used by both conversion directions */
83 conv->outbuf = create_conv_buffer(INITIAL_OUTSIZE);
/* heap copy, so conversion_set_unknown() can free and replace it later */
85 conv->unknown = strdup(DEFAULT_UNKNOWN);
87 cleanup = 0; /* All successful */
/* failure path: release whatever was acquired so far */
94 cleanup_utf8_conversion(conv);
/* Replace the string that convert_from_utf8() writes for characters it
   cannot convert.  The converter stores its own strdup() copy of `unknown`
   and frees the previous one.  Nothing is changed when `conv` or `unknown`
   is NULL.
   NOTE(review): the check of the strdup() result and the returned status
   value are not visible in this listing. */
101 int conversion_set_unknown(convert_t conv, const char* unknown)
105 if (conv && unknown) {
/* copy first, so the old string is only released once a replacement exists */
106 char* unknown_copy = strdup(unknown);
108 if (conv->unknown) free(conv->unknown);
109 conv->unknown = unknown_copy;
/* Release the resources held by a converter: close each iconv descriptor
   that was actually opened and free the output buffer.
   NOTE(review): a NULL check on `conv` and the release of conv->unknown and
   of `conv` itself are not visible in this listing -- confirm against the
   full file. */
118 void cleanup_utf8_conversion(convert_t conv)
/* (iconv_t)-1 is the "never opened" marker set during initialization */
121 if (conv->from_utf8 != (iconv_t)-1)
122 iconv_close(conv->from_utf8);
123 if (conv->to_utf8 != (iconv_t)-1)
124 iconv_close(conv->to_utf8);
126 free_conv_buffer(conv->outbuf);
/* Convert the NUL-terminated UTF-8 string `input` to the converter's target
   character set.
   The result is written into conv->outbuf and a pointer to that storage is
   returned; the storage is cleared at the start of every conversion call, so
   the caller has to copy the result if it must outlive the next call.
   Characters that cannot be represented are replaced by conv->unknown.
   If `conv_fails` is not NULL it receives a failure count: one per replaced
   character, plus the number of unconverted input bytes when another iconv
   error occurs.
   NOTE(review): several statements (declarations of outptr/outsize/nconv,
   the guard around the first conv_fails assignment, the pointer advances in
   the skip logic, the NULL check after growing, and the exits from the loop)
   are not visible in this listing. */
133 char* convert_from_utf8(convert_t conv, const char* input, int* conv_fails)
135 size_t insize = strlen(input);
137 ICONV_CONST char* inptr = (ICONV_CONST char*) input;
140 struct conv_buffer* outbuf;
/* presumably the early-exit path for an unusable converter, counting every
   input byte as failed -- the guarding condition is not visible here */
143 if (conv_fails != NULL) *conv_fails = insize;
146 /* make sure we start from an empty state */
147 iconv(conv->from_utf8, NULL, NULL, NULL, NULL);
148 if (conv_fails != NULL) *conv_fails = 0;
149 /* set up output buffer (empty it) */
150 outbuf = conv->outbuf;
151 outptr = outbuf->buffer;
152 outsize = outbuf->size;
153 memset(outbuf->buffer, 0, outbuf->size);
154 nconv = iconv(conv->from_utf8, &inptr, &insize, &outptr, &outsize);
/* iconv() stops at the first problem; handle it and resume until it
   finishes without error */
155 while (nconv == (size_t)-1) {
156 if (errno == E2BIG) {
157 /* grow the output buffer */
158 outptr = grow_conv_buffer(outbuf, outptr);
/* space remaining behind the write position in the enlarged buffer */
160 outsize = outbuf->size - (outptr - outbuf->buffer);
166 else if (errno == EILSEQ) {
167 /* skip over character */
168 const char* unkn_ptr = conv->unknown;
169 if (conv_fails != NULL) (*conv_fails)++;
170 if ((*inptr & 0x80) == 0) {
171 /* an ASCII character, just skip one (this case is very improbable) */
175 /* a general UTF-8 character: skip all continuation bytes (binary 10xxxxxx) */
177 while ((*inptr & 0xC0) == 0x80) {
181 /* append the "unknown" string to the output */
/* NOTE(review): no check against outsize is visible before these writes;
   confirm the buffer cannot be overrun when the replacement string is longer
   than the space that is left */
182 while (*unkn_ptr) { *outptr++ = *unkn_ptr++; outsize--; }
185 /* EINVAL should not happen, since we convert entire strings */
186 /* EBADF is an error which should be captured by the first if above */
/* any other error: count the whole unconverted remainder as failed */
187 if (conv_fails != NULL) *conv_fails += insize;
/* resume the conversion from the updated input and output positions */
190 nconv = iconv(conv->from_utf8, &inptr, &insize, &outptr, &outsize);
192 return outbuf->buffer;
/* Convert the NUL-terminated string `input` from the converter's character
   set to UTF-8.
   The result is written into conv->outbuf and a pointer to that storage is
   returned; the storage is cleared at the start of every conversion call, so
   the caller has to copy the result if it must outlive the next call.
   According to the comment inside the loop, NULL is returned when the input
   does not match the source encoding (EILSEQ).
   NOTE(review): the declarations of outptr/outsize/nconv, the initial
   validity check on the converter and the statements that actually return
   NULL are not visible in this listing. */
195 char* convert_to_utf8(convert_t conv, const char* input)
197 size_t insize = strlen(input);
199 ICONV_CONST char *inptr = (ICONV_CONST char*) input;
202 struct conv_buffer* outbuf;
206 /* make sure we start from an empty state */
207 iconv(conv->to_utf8, NULL, NULL, NULL, NULL);
208 /* set up output buffer (empty it) */
209 outbuf = conv->outbuf;
210 outptr = outbuf->buffer;
211 outsize = outbuf->size;
212 memset(outbuf->buffer, 0, outbuf->size);
213 nconv = iconv(conv->to_utf8, &inptr, &insize, &outptr, &outsize);
/* keep going only while the failure is a full output buffer */
214 while (nconv == (size_t)-1) {
215 if (errno == E2BIG) {
216 /* grow the output buffer */
217 outptr = grow_conv_buffer(outbuf, outptr);
/* space remaining behind the write position in the enlarged buffer */
219 outsize = outbuf->size - (outptr - outbuf->buffer);
226 /* EILSEQ happens when the input doesn't match the source encoding,
227 return NULL in this case */
228 /* EINVAL should not happen, since we convert entire strings */
229 /* EBADF is an error which should be captured by the first if above */
/* resume the conversion from the updated input and output positions */
232 nconv = iconv(conv->to_utf8, &inptr, &insize, &outptr, &outsize);
234 return outbuf->buffer;