CFLAGS = -g -O2 -W -Wall -pedantic -Wno-long-long
noinst_LTLIBRARIES = libutf8.la
-libutf8_la_SOURCES = utf8-locale.c utf8.c
+libutf8_la_SOURCES = utf8-locale.c utf8.c utf8-convert.c
libutf8_la_LIBADD = @LCS_LIBADD@ $(LIBICONV)
noinst_HEADERS = utf8.h
--- /dev/null
+/* Encoding utility from UTF-8 to locale and vice versa
+ Copyright (C) 2001, 2002 Peter Verthez
+
+ Permission granted to do anything with this file that you want, as long
+ as the above copyright is retained in all copies.
+ THERE IS NO WARRANTY - USE AT YOUR OWN RISK
+*/
+
+/* $Id$ */
+/* $Name$ */
+
+#include "utf8.h"
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <iconv.h>
+#include "config.h"
+
+#define INITIAL_OUTSIZE 256
+#define DEFAULT_UNKNOWN "?"
+
+/* Allocate a conversion buffer with `size` bytes of storage.
+
+   Returns a new buffer that has to be released with free_conv_buffer(), or
+   NULL when `size` is not positive or memory could not be obtained.  A
+   descriptor without storage is never handed out, so callers only need to
+   compare the result against NULL. */
+struct conv_buffer* create_conv_buffer(int size)
+{
+  struct conv_buffer* buf;
+
+  /* A zero-sized buffer could never hold a terminator, and a negative size
+     would turn into a huge size_t */
+  if (size <= 0)
+    return NULL;
+
+  buf = (struct conv_buffer*) malloc(sizeof(struct conv_buffer));
+  if (!buf)
+    return NULL;
+
+  buf->buffer = (char*) malloc((size_t) size);
+  if (!buf->buffer) {
+    /* Do not leak (or return) the descriptor when its storage is missing */
+    free(buf);
+    return NULL;
+  }
+  buf->size = (size_t) size;
+
+  return buf;
+}
+
+/* Release a conversion buffer: its storage first, then the descriptor.
+   A NULL argument is accepted and ignored. */
+void free_conv_buffer(struct conv_buffer* buf)
+{
+  if (buf == NULL)
+    return;
+
+  free(buf->buffer);
+  free(buf);
+}
+
+/* Enlarge the storage of `buf` (normally doubling it) while keeping the
+   bytes already written.
+
+   buf       buffer to enlarge
+   curr_pos  current write position inside the old storage
+
+   Returns the write position translated into the new storage; every byte
+   from there to the end of the new storage is set to 0.  Returns NULL when
+   the buffer cannot grow (out of memory, or the size would overflow); in
+   that case `buf` is left untouched and still owns its old storage. */
+char* grow_conv_buffer(struct conv_buffer* buf, char* curr_pos)
+{
+  size_t used, new_size;
+  char* new_buffer;
+
+  /* No storage yet means nothing has been written */
+  used = buf->buffer ? (size_t)(curr_pos - buf->buffer) : 0;
+
+  if (buf->size == 0)
+    new_size = INITIAL_OUTSIZE;       /* doubling 0 would never grow */
+  else if (buf->size > (size_t)-1 / 2)
+    return NULL;                      /* doubling would wrap around */
+  else
+    new_size = buf->size * 2;
+
+  new_buffer = (char*) realloc(buf->buffer, new_size);
+  if (!new_buffer)
+    return NULL;
+
+  buf->buffer = new_buffer;
+  buf->size = new_size;
+  memset(new_buffer + used, 0, new_size - used);
+  return new_buffer + used;
+}
+
+/* Create a converter between UTF-8 and `charset` (a name that is handed to
+   iconv_open() for both directions).
+
+   Returns the new converter, to be released with cleanup_utf8_conversion(),
+   or NULL if any of its parts (either iconv descriptor, the output buffer,
+   the copy of the default "unknown" string) could not be set up. */
+convert_t initialize_utf8_conversion(const char* charset)
+{
+  struct convert *ctx = (struct convert *)malloc(sizeof(struct convert));
+
+  if (!ctx)
+    return NULL;
+
+  /* Mark every member as "not acquired" so that a partially built converter
+     can be handed to cleanup_utf8_conversion() */
+  ctx->from_utf8 = (iconv_t)-1;
+  ctx->to_utf8 = (iconv_t)-1;
+  ctx->outbuf = NULL;
+  ctx->unknown = NULL;
+
+  /* Acquire the parts in order; the chain stops at the first failure */
+  if ((ctx->from_utf8 = iconv_open(charset, "UTF-8")) != (iconv_t)-1
+      && (ctx->to_utf8 = iconv_open("UTF-8", charset)) != (iconv_t)-1
+      && (ctx->outbuf = create_conv_buffer(INITIAL_OUTSIZE)) != NULL
+      && (ctx->unknown = strdup(DEFAULT_UNKNOWN)) != NULL)
+    return ctx;
+
+  /* Something failed: undo whatever was acquired so far */
+  cleanup_utf8_conversion(ctx);
+  return NULL;
+}
+
+/* Replace the string that convert_from_utf8() substitutes for characters
+   it cannot convert.  The converter keeps its own copy of `unknown`.
+
+   Returns 0 only when the copy could not be allocated (the previous string
+   then stays in effect).  Returns 1 otherwise, which includes the case
+   where `conv` or `unknown` is NULL and nothing is changed. */
+int conversion_set_unknown(convert_t conv, const char* unknown)
+{
+  char* replacement;
+
+  if (!conv || !unknown)
+    return 1;
+
+  replacement = strdup(unknown);
+  if (!replacement)
+    return 0;
+
+  /* Swap only after the copy succeeded; free() accepts NULL */
+  free(conv->unknown);
+  conv->unknown = replacement;
+  return 1;
+}
+
+/* Destroy a converter: close the iconv descriptors that are open, release
+   the output buffer and the "unknown" string, then the converter itself.
+   A NULL argument is accepted and ignored; partially initialized converters
+   (unopened descriptors, NULL members) are handled as well. */
+void cleanup_utf8_conversion(convert_t conv)
+{
+  if (!conv)
+    return;
+
+  if (conv->from_utf8 != (iconv_t)-1)
+    iconv_close(conv->from_utf8);
+  if (conv->to_utf8 != (iconv_t)-1)
+    iconv_close(conv->to_utf8);
+
+  /* Both free_conv_buffer() and free() do nothing for NULL */
+  free_conv_buffer(conv->outbuf);
+  free(conv->unknown);
+  free(conv);
+}
+
+/* Convert the NUL-terminated UTF-8 string `input` to the charset the
+   converter was created for.
+
+   conv        converter from initialize_utf8_conversion(); may be NULL
+   input       UTF-8 text to convert
+   conv_fails  if not NULL, receives the number of characters that could not
+               be converted; when the conversion has to be given up, the
+               number of unprocessed input bytes is added to it
+
+   Every character iconv rejects with EILSEQ is replaced by the converter's
+   "unknown" string.  Returns the converter's own output buffer, which is
+   always NUL-terminated and is overwritten by the next conversion done with
+   the same converter; the caller must not free it.  Returns NULL when `conv`
+   is NULL, when memory runs out (errno is ENOMEM) or when iconv fails with
+   an error other than E2BIG or EILSEQ. */
+char* convert_from_utf8(convert_t conv, const char* input, int* conv_fails)
+{
+  size_t insize = strlen(input);
+  size_t outsize;
+  size_t unknown_len;
+  ICONV_CONST char* inptr = (ICONV_CONST char*) input;
+  char *outptr;
+  size_t nconv;
+  struct conv_buffer* outbuf;
+
+  if (!conv) {
+    if (conv_fails != NULL) *conv_fails = (int) insize;
+    return NULL;
+  }
+  /* a converter without usable output storage cannot produce anything */
+  outbuf = conv->outbuf;
+  if (!outbuf || !outbuf->buffer || outbuf->size == 0) {
+    if (conv_fails != NULL) *conv_fails = (int) insize;
+    errno = ENOMEM;
+    return NULL;
+  }
+  /* make sure we start from an empty state */
+  iconv(conv->from_utf8, NULL, NULL, NULL, NULL);
+  if (conv_fails != NULL) *conv_fails = 0;
+  /* set up output buffer (empty it); the last byte is never offered to
+     iconv so that the result always ends in a 0 byte */
+  outptr = outbuf->buffer;
+  outsize = outbuf->size - 1;
+  memset(outbuf->buffer, 0, outbuf->size);
+  unknown_len = conv->unknown ? strlen(conv->unknown) : 0;
+
+  nconv = iconv(conv->from_utf8, &inptr, &insize, &outptr, &outsize);
+  while (nconv == (size_t)-1) {
+    if (errno == E2BIG) {
+      /* grow the output buffer */
+      outptr = grow_conv_buffer(outbuf, outptr);
+      if (!outptr) {
+        errno = ENOMEM;
+        return NULL;
+      }
+      outsize = outbuf->size - 1 - (size_t)(outptr - outbuf->buffer);
+    }
+    else if (errno == EILSEQ) {
+      /* skip over character */
+      if (conv_fails != NULL) (*conv_fails)++;
+      if ((*inptr & 0x80) == 0) {
+        /* an ASCII character, just skip one (this case is very improbable) */
+        inptr++; insize--;
+      }
+      else {
+        /* a general UTF-8 character: skip the lead byte and all
+           continuation bytes (bit pattern 10xxxxxx) that follow it */
+        inptr++; insize--;
+        while (insize > 0 && (*inptr & 0xC0) == 0x80) {
+          inptr++; insize--;
+        }
+      }
+      /* make room for the "unknown" string before appending it */
+      while (outsize < unknown_len) {
+        outptr = grow_conv_buffer(outbuf, outptr);
+        if (!outptr) {
+          errno = ENOMEM;
+          return NULL;
+        }
+        outsize = outbuf->size - 1 - (size_t)(outptr - outbuf->buffer);
+      }
+      /* append the "unknown" string to the output */
+      if (unknown_len > 0) {
+        memcpy(outptr, conv->unknown, unknown_len);
+        outptr += unknown_len;
+        outsize -= unknown_len;
+      }
+    }
+    else {
+      /* EINVAL should not happen, since we convert entire strings */
+      /* EBADF is an error which should be captured by the first if above */
+      if (conv_fails != NULL) *conv_fails += (int) insize;
+      return NULL;
+    }
+    nconv = iconv(conv->from_utf8, &inptr, &insize, &outptr, &outsize);
+  }
+  return outbuf->buffer;
+}
+
+/* Convert the NUL-terminated string `input`, encoded in the converter's
+   charset, to UTF-8.
+
+   Returns the converter's own output buffer, which is always NUL-terminated
+   and is overwritten by the next conversion done with the same converter;
+   the caller must not free it.  Returns NULL when `conv` is NULL, when
+   memory runs out (errno is ENOMEM) or when iconv fails with any error
+   other than E2BIG, e.g. EILSEQ for input that does not match the source
+   encoding. */
+char* convert_to_utf8(convert_t conv, const char* input)
+{
+  size_t insize = strlen(input);
+  size_t outsize;
+  ICONV_CONST char *inptr = (ICONV_CONST char*) input;
+  char *outptr;
+  size_t nconv;
+  struct conv_buffer* outbuf;
+
+  if (!conv)
+    return NULL;
+  /* a converter without usable output storage cannot produce anything */
+  outbuf = conv->outbuf;
+  if (!outbuf || !outbuf->buffer || outbuf->size == 0) {
+    errno = ENOMEM;
+    return NULL;
+  }
+  /* make sure we start from an empty state */
+  iconv(conv->to_utf8, NULL, NULL, NULL, NULL);
+  /* set up output buffer (empty it); the last byte is never offered to
+     iconv so that the result always ends in a 0 byte */
+  outptr = outbuf->buffer;
+  outsize = outbuf->size - 1;
+  memset(outbuf->buffer, 0, outbuf->size);
+
+  nconv = iconv(conv->to_utf8, &inptr, &insize, &outptr, &outsize);
+  while (nconv == (size_t)-1) {
+    if (errno != E2BIG) {
+      /* EILSEQ happens when the input doesn't match the source encoding,
+         return NULL in this case */
+      /* EINVAL should not happen, since we convert entire strings */
+      /* EBADF is an error which should be captured by the first if above */
+      return NULL;
+    }
+    /* grow the output buffer */
+    outptr = grow_conv_buffer(outbuf, outptr);
+    if (!outptr) {
+      errno = ENOMEM;
+      return NULL;
+    }
+    outsize = outbuf->size - 1 - (size_t)(outptr - outbuf->buffer);
+    nconv = iconv(conv->to_utf8, &inptr, &insize, &outptr, &outsize);
+  }
+  return outbuf->buffer;
+}
/* $Id$ */
/* $Name$ */
+#include "utf8.h"
#include <stdlib.h>
-#include <iconv.h>
#include <assert.h>
-#include <errno.h>
-#include <string.h>
-#include "config.h"
#include "libcharset.h"
-#include "utf8.h"
-
-#define INITIAL_OUTSIZE 256
-static iconv_t utf8_to_locale = (iconv_t) -1;
-static iconv_t locale_to_utf8 = (iconv_t) -1;
-static char* outbuffer = NULL;
-static size_t outbufsize = 0;
-static const char* the_unknown = "?";
+static convert_t locale_conv = NULL; /* converter for the locale charset; NULL until open_conversion_contexts() succeeds */
void convert_set_unknown(const char* unknown)
{
- the_unknown = unknown;
+ conversion_set_unknown(locale_conv, unknown); /* Sets the replacement string of the locale
+    converter.  conversion_set_unknown() does nothing for a NULL converter,
+    so a call made before the converter has been opened has no effect; the
+    result of the call is not checked here. */
}
void close_conversion_contexts()
{
- iconv_close(utf8_to_locale);
- iconv_close(locale_to_utf8);
- utf8_to_locale = (iconv_t) -1;
- locale_to_utf8 = (iconv_t) -1;
- free(outbuffer);
+ /* Release the locale converter (a NULL converter is accepted) and forget
+    the handle.  This function is also registered with atexit(), so it can
+    run again after a manual call: resetting the pointer keeps that second
+    run from freeing the converter twice, and lets the next conversion
+    reopen it instead of using freed memory. */
+ cleanup_utf8_conversion(locale_conv);
+ locale_conv = NULL;
}
int open_conversion_contexts()
{
- assert(utf8_to_locale == (iconv_t) -1);
- assert(locale_to_utf8 == (iconv_t) -1);
- utf8_to_locale = iconv_open(locale_charset(), "UTF-8");
- if (utf8_to_locale == (iconv_t) -1)
- return -1;
+ assert (locale_conv == NULL); /* Opens the locale converter; must not be called while one is
+    already open.  Returns 0 on success and -1 on failure. */
+ locale_conv = initialize_utf8_conversion(locale_charset()); /* charset name comes from locale_charset() */
+
+ if (locale_conv) {
+ atexit(close_conversion_contexts); /* release the converter when the program exits */
+ return 0;
+ }
 else {
- locale_to_utf8 = iconv_open("UTF-8", locale_charset());
- if (locale_to_utf8 == (iconv_t) -1) {
- close_conversion_contexts();
- return -1;
- }
- else {
- outbufsize = INITIAL_OUTSIZE;
- outbuffer = (char*)malloc(outbufsize);
- atexit(close_conversion_contexts);
- return 0;
- }
+ return -1; /* the converter could not be created; locale_conv stays NULL */
 }
}
char* convert_utf8_to_locale(const char* input, int *conv_fails)
{
- size_t insize = strlen(input);
- size_t outsize;
- ICONV_CONST char *inptr = (ICONV_CONST char*) input;
- char *outptr;
- size_t nconv;
+ if (!locale_conv) /* Converts UTF-8 input to the locale charset through the
+    locale converter, which is opened on first use. */
+ open_conversion_contexts(); /* result not checked: after a failure locale_conv is still NULL */
- if (utf8_to_locale == (iconv_t) -1 && (open_conversion_contexts() == -1)) {
- if (conv_fails != NULL) *conv_fails = insize;
- return NULL;
- }
- assert(utf8_to_locale != (iconv_t) -1);
- /* make sure we start from an empty state */
- iconv(utf8_to_locale, NULL, NULL, NULL, NULL);
- if (conv_fails != NULL) *conv_fails = 0;
- /* set up output buffer (empty it) */
- outptr = outbuffer;
- outsize = outbufsize;
- memset(outbuffer, 0, outbufsize);
- nconv = iconv(utf8_to_locale, &inptr, &insize, &outptr, &outsize);
- while (nconv == (size_t)-1) {
- if (errno == E2BIG) {
- /* grow the output buffer */
- size_t outlen;
- outlen = outptr - outbuffer;
- outbufsize *= 2;
- outbuffer = realloc(outbuffer, outbufsize);
- outptr = outbuffer + outlen;
- outsize = outbufsize - outlen;
- memset(outptr, 0, outsize);
- }
- else if (errno == EILSEQ) {
- /* skip over character */
- const char* unkn_ptr = the_unknown;
- if (conv_fails != NULL) (*conv_fails)++;
- if ((*inptr & 0x80) == 0) {
- /* an ASCII character, just skip one (this case is very improbable) */
- inptr++; insize--;
- }
- else {
- /* a general UTF-8 character, skip all 0x10xxxxxx bytes */
- inptr++; insize--;
- while ((*inptr & 0xC0) == 0x80) {
- inptr++; insize--;
- }
- }
- /* append the "unknown" string to the output */
- while (*unkn_ptr) { *outptr++ = *unkn_ptr++; outsize--; }
- }
- else {
- /* EINVAL should not happen, since we convert entire strings */
- /* EBADF is an error which should be captured by the assert above */
- if (conv_fails != NULL) *conv_fails += insize;
- return NULL;
- }
- nconv = iconv(utf8_to_locale, &inptr, &insize, &outptr, &outsize);
- }
- return outbuffer;
+ return convert_from_utf8(locale_conv, input, conv_fails); /* with a NULL converter this returns NULL
+    and stores the input length in *conv_fails (when conv_fails is not NULL) */
}
char* convert_locale_to_utf8(const char* input)
{
- size_t insize = strlen(input);
- size_t outsize;
- ICONV_CONST char *inptr = (ICONV_CONST char*) input;
- char *outptr;
- size_t nconv;
+ if (!locale_conv) /* Converts input in the locale charset to UTF-8 through the
+    locale converter, which is opened on first use. */
+ open_conversion_contexts(); /* result not checked: after a failure locale_conv is still NULL */
- if (locale_to_utf8 == (iconv_t) -1 && (open_conversion_contexts() == -1))
- return NULL;
- assert(locale_to_utf8 != (iconv_t) -1);
- /* make sure we start from an empty state */
- iconv(locale_to_utf8, NULL, NULL, NULL, NULL);
- /* set up output buffer (empty it) */
- outptr = outbuffer;
- outsize = outbufsize;
- memset(outbuffer, 0, outbufsize);
- nconv = iconv(locale_to_utf8, &inptr, &insize, &outptr, &outsize);
- while (nconv == (size_t)-1) {
- if (errno == E2BIG) {
- /* grow the output buffer */
- size_t outlen;
- outlen = outptr - outbuffer;
- outbufsize *= 2;
- outbuffer = realloc(outbuffer, outbufsize);
- outptr = outbuffer + outlen;
- outsize = outbufsize - outlen;
- memset(outptr, 0, outsize);
- }
- else {
- /* EILSEQ should not happen, because UTF-8 can represent anything */
- /* EINVAL should not happen, since we convert entire strings */
- /* EBADF is an error which should be captured by the assert above */
- return NULL;
- }
- nconv = iconv(locale_to_utf8, &inptr, &insize, &outptr, &outsize);
- }
- return outbuffer;
+ return convert_to_utf8(locale_conv, input); /* returns NULL when the converter is NULL */
}
extern "C" {
#endif
+#include "iconv.h"
+
+struct conv_buffer { /* Output buffer that the conversion routines enlarge when iconv reports E2BIG */
+ char* buffer; /* heap-allocated storage */
+ size_t size; /* number of bytes allocated for buffer */
+};
+
+struct convert { /* State of one converter between UTF-8 and another charset */
+ iconv_t from_utf8; /* iconv descriptor for UTF-8 -> charset; (iconv_t)-1 when not open */
+ iconv_t to_utf8; /* iconv descriptor for charset -> UTF-8; (iconv_t)-1 when not open */
+ struct conv_buffer* outbuf; /* output buffer used by both conversion directions */
+ char* unknown; /* heap-allocated string substituted for characters that cannot be converted */
+};
+
+typedef struct convert *convert_t; /* converter handle used by the general conversion interface */
+
/* Returns -1 if the string is not a valid UTF-8 string, returns its
string length otherwise */
int utf8_strlen(const char* input);
/* Returns 1 if string is valid UTF-8 string, 0 otherwise */
int is_utf8_string(const char* input);
-
+
+ /* General conversion interface (is bidirectional)
+
+    initialize_utf8_conversion(charset)
+      Creates a converter between UTF-8 and `charset`.  Returns NULL on
+      failure.  Release the result with cleanup_utf8_conversion().
+
+    conversion_set_unknown(conv, unknown)
+      Sets the string that convert_from_utf8() writes in place of a
+      character it cannot convert; the converter keeps its own copy.
+      Returns 0 if the copy could not be allocated, 1 otherwise (also when
+      `conv` or `unknown` is NULL, in which case nothing changes).
+
+    cleanup_utf8_conversion(conv)
+      Destroys a converter.  NULL is accepted.
+
+    convert_from_utf8(conv, input, conv_fails)
+      Converts UTF-8 `input` to the converter's charset.  If `conv_fails`
+      is not NULL it receives the number of conversion failures.  Returns
+      the converter's internal buffer (do not free it; the next conversion
+      with the same converter overwrites it) or NULL on error.
+
+    convert_to_utf8(conv, input)
+      Converts `input` from the converter's charset to UTF-8.  Returns the
+      converter's internal buffer (same rules as above) or NULL on error,
+      including input that is not valid in the source charset. */
+convert_t initialize_utf8_conversion(const char* charset);
+int conversion_set_unknown(convert_t conv, const char* unknown);
+void cleanup_utf8_conversion(convert_t conv);
+char* convert_from_utf8(convert_t conv, const char* input, int* conv_fails);
+char* convert_to_utf8(convert_t conv, const char* input);
+
+ /* Specific locale conversion interface */
void convert_set_unknown(const char* unknown);
char* convert_utf8_to_locale(const char* input, int *conv_fails);
char* convert_locale_to_utf8(const char* input);