From: Peter Verthez
Date: Sat, 30 Nov 2002 15:50:13 +0000 (+0000)
Subject: Made conversion interface more general.
X-Git-Url: https://git.dlugolecki.net.pl/?a=commitdiff_plain;h=af581cf02cbbed0d24636be0b3533587448e7e0f;p=gedcom-parse.git

Made conversion interface more general.
---

diff --git a/utf8/Makefile.am b/utf8/Makefile.am
index 670425f..7b67f46 100644
--- a/utf8/Makefile.am
+++ b/utf8/Makefile.am
@@ -8,6 +8,6 @@
 INCLUDES = @LCS_INCLUDES@
 CFLAGS = -g -O2 -W -Wall -pedantic -Wno-long-long
 noinst_LTLIBRARIES = libutf8.la
-libutf8_la_SOURCES = utf8-locale.c utf8.c
+libutf8_la_SOURCES = utf8-locale.c utf8.c utf8-convert.c
 libutf8_la_LIBADD = @LCS_LIBADD@ $(LIBICONV)
 noinst_HEADERS = utf8.h
diff --git a/utf8/utf8-convert.c b/utf8/utf8-convert.c
new file mode 100644
index 0000000..83c3608
--- /dev/null
+++ b/utf8/utf8-convert.c
@@ -0,0 +1,306 @@
+/* Encoding utility from UTF-8 to locale and vice versa
+   Copyright (C) 2001, 2002 Peter Verthez
+
+   Permission granted to do anything with this file that you want, as long
+   as the above copyright is retained in all copies.
+   THERE IS NO WARRANTY - USE AT YOUR OWN RISK
+*/
+
+/* $Id$ */
+/* $Name$ */
+
+#include "utf8.h"
+#include <stdlib.h>
+#include <string.h>
+#include <iconv.h>
+#include <errno.h>
+#include "config.h"
+
+#define INITIAL_OUTSIZE 256
+#define DEFAULT_UNKNOWN "?"
+
+/* Allocate a conversion buffer holding 'size' bytes.
+   Returns NULL if 'size' is not positive or if memory runs out; a buffer
+   without storage is never handed back, so callers only have to test the
+   returned pointer. */
+struct conv_buffer* create_conv_buffer(int size)
+{
+  struct conv_buffer* buf = NULL;
+
+  if (size <= 0)
+    return NULL;
+
+  buf = (struct conv_buffer*) malloc(sizeof(struct conv_buffer));
+  if (buf) {
+    buf->size   = size;
+    buf->buffer = (char*)malloc(size);
+    if (!buf->buffer) {
+      free(buf);
+      buf = NULL;
+    }
+  }
+
+  return buf;
+}
+
+/* Release a buffer obtained from create_conv_buffer (NULL is allowed) */
+void free_conv_buffer(struct conv_buffer* buf)
+{
+  if (buf) {
+    free(buf->buffer);
+    free(buf);
+  }
+}
+
+/* Double the size of the buffer, keeping its contents.
+   'curr_pos' is the current write position in the old storage; the return
+   value is the same position in the (possibly moved) new storage, and all
+   bytes from there to the end are zeroed.
+   Returns NULL and leaves the buffer as it was if it can't be grown. */
+char* grow_conv_buffer(struct conv_buffer* buf, char* curr_pos)
+{
+  size_t outlen, new_size;
+  char*  new_buffer;
+  outlen   = curr_pos - buf->buffer;
+  new_size = buf->size * 2;
+  if (new_size <= buf->size)
+    return NULL;  /* size was 0, or doubling it overflowed */
+  new_buffer = realloc(buf->buffer, new_size);
+  if (new_buffer) {
+    buf->buffer = new_buffer;
+    buf->size   = new_size;
+    curr_pos    = buf->buffer + outlen;
+    memset(curr_pos, 0, buf->size - (curr_pos - buf->buffer));
+    return curr_pos;
+  }
+  else
+    return NULL;
+}
+
+/* Make sure at least 'needed' bytes are free at *outptr, growing the
+   buffer as often as that takes; *outptr and *outsize are updated.
+   Returns 1 on success, 0 (with errno set to ENOMEM) otherwise. */
+static int reserve_conv_space(struct conv_buffer* buf, char** outptr,
+                              size_t* outsize, size_t needed)
+{
+  while (*outsize < needed) {
+    char* new_pos = grow_conv_buffer(buf, *outptr);
+    if (!new_pos) {
+      errno = ENOMEM;
+      return 0;
+    }
+    *outptr  = new_pos;
+    *outsize = buf->size - (new_pos - buf->buffer);
+  }
+  return 1;
+}
+
+/* Set up a bidirectional converter between UTF-8 and 'charset'.
+   Returns NULL if 'charset' is unknown to iconv or if memory runs out;
+   otherwise the result must be released with cleanup_utf8_conversion. */
+convert_t initialize_utf8_conversion(const char* charset)
+{
+  struct convert *conv = NULL;
+  int cleanup = 0;
+
+  conv = (struct convert *)malloc(sizeof(struct convert));
+  if (conv) {
+    /* Unless reset to 0 at the end, this will force cleanup */
+    cleanup = 1;
+    /* First initialize to default values */
+    conv->from_utf8 = (iconv_t)-1;
+    conv->to_utf8   = (iconv_t)-1;
+    conv->outbuf    = NULL;
+    conv->unknown   = NULL;
+
+    /* Now initialize everything to what it should be */
+    conv->from_utf8 = iconv_open(charset, "UTF-8");
+    if (conv->from_utf8 != (iconv_t)-1) {
+      conv->to_utf8 = iconv_open("UTF-8", charset);
+      if (conv->to_utf8 != (iconv_t)-1) {
+        conv->outbuf = create_conv_buffer(INITIAL_OUTSIZE);
+        if (conv->outbuf) {
+          conv->unknown = strdup(DEFAULT_UNKNOWN);
+          if (conv->unknown)
+            cleanup = 0;    /* All successful */
+        }
+      }
+    }
+  }
+
+  if (cleanup) {
+    cleanup_utf8_conversion(conv);
+    conv = NULL;
+  }
+
+  return conv;
+}
+
+/* Replace the string that convert_from_utf8 writes for characters that
+   can't be represented (a private copy of 'unknown' is kept).
+   Returns 0 if the copy could not be allocated, in which case the previous
+   string stays in effect; returns 1 otherwise, also when 'conv' or
+   'unknown' is NULL and nothing was changed. */
+int conversion_set_unknown(convert_t conv, const char* unknown)
+{
+  int result = 1;
+
+  if (conv && unknown) {
+    char* unknown_copy = strdup(unknown);
+    if (unknown_copy) {
+      if (conv->unknown) free(conv->unknown);
+      conv->unknown = unknown_copy;
+    }
+    else
+      result = 0;
+  }
+
+  return result;
+}
+
+/* Release everything held by 'conv', including 'conv' itself.
+   Accepts NULL and partially initialized converters. */
+void cleanup_utf8_conversion(convert_t conv)
+{
+  if (conv) {
+    if (conv->from_utf8 != (iconv_t)-1)
+      iconv_close(conv->from_utf8);
+    if (conv->to_utf8 != (iconv_t)-1)
+      iconv_close(conv->to_utf8);
+    if (conv->outbuf)
+      free_conv_buffer(conv->outbuf);
+    if (conv->unknown)
+      free(conv->unknown);
+    free(conv);
+  }
+}
+
+/* Convert the UTF-8 string 'input' to the charset of 'conv'.
+   Characters that can't be represented are replaced by the "unknown"
+   string and counted in *conv_fails (if 'conv_fails' is not NULL).
+   Returns the converter's own output buffer, which is overwritten by the
+   next conversion and must not be freed by the caller; returns NULL on
+   failure (no converter, out of memory, or an iconv error other than an
+   unconvertible character). */
+char* convert_from_utf8(convert_t conv, const char* input, int* conv_fails)
+{
+  size_t insize = strlen(input);
+  size_t outsize;
+  ICONV_CONST char* inptr = (ICONV_CONST char*) input;
+  char *outptr;
+  size_t nconv;
+  struct conv_buffer* outbuf;
+
+  if (!conv) {
+    if (conv_fails != NULL) *conv_fails = insize;
+    return NULL;
+  }
+  /* make sure we start from an empty state */
+  iconv(conv->from_utf8, NULL, NULL, NULL, NULL);
+  if (conv_fails != NULL) *conv_fails = 0;
+  /* set up output buffer (empty it) */
+  outbuf  = conv->outbuf;
+  outptr  = outbuf->buffer;
+  outsize = outbuf->size;
+  memset(outbuf->buffer, 0, outbuf->size);
+  nconv = iconv(conv->from_utf8, &inptr, &insize, &outptr, &outsize);
+  while (nconv == (size_t)-1) {
+    if (errno == E2BIG) {
+      /* grow the output buffer */
+      outptr = grow_conv_buffer(outbuf, outptr);
+      if (outptr)
+        outsize = outbuf->size - (outptr - outbuf->buffer);
+      else {
+        errno = ENOMEM;
+        return NULL;
+      }
+    }
+    else if (errno == EILSEQ) {
+      /* skip over character */
+      const char* unkn_ptr = conv->unknown;
+      size_t unkn_len = strlen(unkn_ptr);
+      if (conv_fails != NULL) (*conv_fails)++;
+      if ((*inptr & 0x80) == 0) {
+        /* an ASCII character, just skip one (this case is very improbable) */
+        inptr++; insize--;
+      }
+      else {
+        /* a general UTF-8 character, skip all 10xxxxxx (binary) bytes */
+        inptr++; insize--;
+        while ((*inptr & 0xC0) == 0x80) {
+          inptr++; insize--;
+        }
+      }
+      /* append the "unknown" string to the output; iconv only checks the
+         space for what it writes itself, so make room first */
+      if (!reserve_conv_space(outbuf, &outptr, &outsize, unkn_len))
+        return NULL;
+      memcpy(outptr, unkn_ptr, unkn_len);
+      outptr  += unkn_len;
+      outsize -= unkn_len;
+    }
+    else {
+      /* EINVAL should not happen, since we convert entire strings */
+      /* EBADF is an error which should be captured by the first if above */
+      if (conv_fails != NULL) *conv_fails += insize;
+      return NULL;
+    }
+    nconv = iconv(conv->from_utf8, &inptr, &insize, &outptr, &outsize);
+  }
+  /* the output may fill the buffer exactly: keep a zeroed byte behind it
+     so that the result is always terminated */
+  if (!reserve_conv_space(outbuf, &outptr, &outsize, 1))
+    return NULL;
+  return outbuf->buffer;
+}
+
+/* Convert 'input', given in the charset of 'conv', to UTF-8.
+   Returns the converter's own output buffer, which is overwritten by the
+   next conversion and must not be freed by the caller; returns NULL if
+   there is no converter, if memory runs out, or if 'input' is not valid
+   in the source charset. */
+char* convert_to_utf8(convert_t conv, const char* input)
+{
+  size_t insize = strlen(input);
+  size_t outsize;
+  ICONV_CONST char *inptr = (ICONV_CONST char*) input;
+  char *outptr;
+  size_t nconv;
+  struct conv_buffer* outbuf;
+
+  if (!conv)
+    return NULL;
+  /* make sure we start from an empty state */
+  iconv(conv->to_utf8, NULL, NULL, NULL, NULL);
+  /* set up output buffer (empty it) */
+  outbuf  = conv->outbuf;
+  outptr  = outbuf->buffer;
+  outsize = outbuf->size;
+  memset(outbuf->buffer, 0, outbuf->size);
+  nconv = iconv(conv->to_utf8, &inptr, &insize, &outptr, &outsize);
+  while (nconv == (size_t)-1) {
+    if (errno == E2BIG) {
+      /* grow the output buffer */
+      outptr = grow_conv_buffer(outbuf, outptr);
+      if (outptr)
+        outsize = outbuf->size - (outptr - outbuf->buffer);
+      else {
+        errno = ENOMEM;
+        return NULL;
+      }
+    }
+    else {
+      /* EILSEQ happens when the input doesn't match the source encoding,
+         return NULL in this case */
+      /* EINVAL should not happen, since we convert entire strings */
+      /* EBADF is an error which should be captured by the first if above */
+      return NULL;
+    }
+    nconv = iconv(conv->to_utf8, &inptr, &insize, &outptr, &outsize);
+  }
+  /* the output may fill the buffer exactly: keep a zeroed byte behind it
+     so that the result is always terminated */
+  if (!reserve_conv_space(outbuf, &outptr, &outsize, 1))
+    return NULL;
+  return outbuf->buffer;
+}
diff --git a/utf8/utf8-locale.c b/utf8/utf8-locale.c
index 28eeb4b..115c66e 100644
--- a/utf8/utf8-locale.c
+++ b/utf8/utf8-locale.c
@@ -9,156 +9,62 @@
 /* $Id$ */
 /* $Name$ */
 
+#include "utf8.h"
 #include <stdlib.h>
-#include <string.h>
 #include <assert.h>
-#include <iconv.h>
-#include <errno.h>
-#include "config.h"
 #include "libcharset.h"
-#include "utf8.h"
-
-#define INITIAL_OUTSIZE 256
 
-static iconv_t utf8_to_locale = (iconv_t) -1;
-static iconv_t locale_to_utf8 = (iconv_t) -1;
-static char* outbuffer = NULL;
-static size_t outbufsize = 0;
-static const char* the_unknown = "?";
+static convert_t locale_conv = NULL;
+
+/* defined further down, but already needed by convert_set_unknown */
+int open_conversion_contexts();
 
 void convert_set_unknown(const char* unknown)
 {
-  the_unknown = unknown;
+  /* the string is stored in the converter, so that has to exist first */
+  if (!locale_conv)
+    open_conversion_contexts();
+  conversion_set_unknown(locale_conv, unknown);
 }
 
 void close_conversion_contexts()
 {
-  iconv_close(utf8_to_locale);
-  iconv_close(locale_to_utf8);
-  utf8_to_locale = (iconv_t) -1;
-  locale_to_utf8 = (iconv_t) -1;
-  free(outbuffer);
+  cleanup_utf8_conversion(locale_conv);
+  /* forget the released converter: it must not be used or freed again */
+  locale_conv = NULL;
 }
 
 int open_conversion_contexts()
 {
-  assert(utf8_to_locale == (iconv_t) -1);
-  assert(locale_to_utf8 == (iconv_t) -1);
-  utf8_to_locale = iconv_open(locale_charset(), "UTF-8");
-  if (utf8_to_locale == (iconv_t) -1)
-    return -1;
+  /* atexit handlers can't be removed, so register only once even if the
+     contexts are closed and opened again */
+  static int exit_hook_installed = 0;
+
+  assert (locale_conv == NULL);
+  locale_conv = initialize_utf8_conversion(locale_charset());
+
+  if (locale_conv) {
+    if (!exit_hook_installed && atexit(close_conversion_contexts) == 0)
+      exit_hook_installed = 1;
+    return 0;
+  }
   else {
-    locale_to_utf8 = iconv_open("UTF-8", locale_charset());
-    if (locale_to_utf8 == (iconv_t) -1) {
-      close_conversion_contexts();
-      return -1;
-    }
-    else {
-      outbufsize = INITIAL_OUTSIZE;
-      outbuffer = (char*)malloc(outbufsize);
-      atexit(close_conversion_contexts);
-      return 0;
-    }
+    return -1;
   }
 }
 
 char* convert_utf8_to_locale(const char* input, int *conv_fails)
 {
-  size_t insize = strlen(input);
-  size_t outsize;
-  ICONV_CONST char *inptr = (ICONV_CONST char*) input;
-  char *outptr;
-  size_t nconv;
+  if (!locale_conv)
+    open_conversion_contexts();
 
-  if (utf8_to_locale == (iconv_t) -1 && (open_conversion_contexts() == -1)) {
-    if (conv_fails != NULL) *conv_fails = insize;
-    return NULL;
-  }
-  assert(utf8_to_locale != (iconv_t) -1);
-  /* make sure we start from an empty state */
-  iconv(utf8_to_locale, NULL, NULL, NULL, NULL);
-  if (conv_fails != NULL) *conv_fails = 0;
-  /* set up output buffer (empty it) */
-  outptr = outbuffer;
-  outsize = outbufsize;
-  memset(outbuffer, 0, outbufsize);
-  nconv = iconv(utf8_to_locale, &inptr, &insize, &outptr, &outsize);
-  while (nconv == (size_t)-1) {
-    if (errno == E2BIG) {
-      /* grow the output buffer */
-      size_t outlen;
-      outlen = outptr - outbuffer;
-      outbufsize *= 2;
-      outbuffer = realloc(outbuffer, outbufsize);
-      outptr = outbuffer + outlen;
-      outsize = outbufsize - outlen;
-      memset(outptr, 0, outsize);
-    }
-    else if (errno == EILSEQ) {
-      /* skip over character */
-      const char* unkn_ptr = the_unknown;
-      if (conv_fails != NULL) (*conv_fails)++;
-      if ((*inptr & 0x80) == 0) {
-        /* an ASCII character, just skip one (this case is very improbable) */
-        inptr++; insize--;
-      }
-      else {
-        /* a general UTF-8 character, skip all 0x10xxxxxx bytes */
-        inptr++; insize--;
-        while ((*inptr & 0xC0) == 0x80) {
-          inptr++; insize--;
-        }
-      }
-      /* append the "unknown" string to the output */
-      while (*unkn_ptr) { *outptr++ = *unkn_ptr++; outsize--; }
-    }
-    else {
-      /* EINVAL should not happen, since we convert entire strings */
-      /* EBADF is an error which should be captured by the assert above */
-      if (conv_fails != NULL) *conv_fails += insize;
-      return NULL;
-    }
-    nconv = iconv(utf8_to_locale, &inptr, &insize, &outptr, &outsize);
-  }
-  return outbuffer;
+  return convert_from_utf8(locale_conv, input, conv_fails);
 }
 
 char* convert_locale_to_utf8(const char* input)
 {
-  size_t insize = strlen(input);
-  size_t outsize;
-  ICONV_CONST char *inptr = (ICONV_CONST char*) input;
-  char *outptr;
-  size_t nconv;
+  if (!locale_conv)
+    open_conversion_contexts();
 
-  if (locale_to_utf8 == (iconv_t) -1 && (open_conversion_contexts() == -1))
-    return NULL;
-  assert(locale_to_utf8 != (iconv_t) -1);
-  /* make sure we start from an empty state */
-  iconv(locale_to_utf8, NULL, NULL, NULL, NULL);
-  /* set up output buffer (empty it) */
-  outptr = outbuffer;
-  outsize = outbufsize;
-  memset(outbuffer, 0, outbufsize);
-  nconv = iconv(locale_to_utf8, &inptr, &insize, &outptr, &outsize);
-  while (nconv == (size_t)-1) {
-    if (errno == E2BIG) {
-      /* grow the output buffer */
-      size_t outlen;
-      outlen = outptr - outbuffer;
-      outbufsize *= 2;
-      outbuffer = realloc(outbuffer, outbufsize);
-      outptr = outbuffer + outlen;
-      outsize = outbufsize - outlen;
-      memset(outptr, 0, outsize);
-    }
-    else {
-      /* EILSEQ should not happen, because UTF-8 can represent anything */
-      /* EINVAL should not happen, since we convert entire strings */
-      /* EBADF is an error which should be captured by the assert above */
-      return NULL;
-    }
-    nconv = iconv(locale_to_utf8, &inptr, &insize, &outptr, &outsize);
-  }
-  return outbuffer;
+  return convert_to_utf8(locale_conv, input);
 }
diff --git a/utf8/utf8.h b/utf8/utf8.h
index 79836dd..425b7a4 100644
--- a/utf8/utf8.h
+++ b/utf8/utf8.h
@@ -16,13 +16,40 @@
 extern "C" {
 #endif
 
+#include "iconv.h"
+
+struct conv_buffer {
+  char* buffer;
+  size_t size;
+};
+
+struct convert {
+  iconv_t from_utf8;
+  iconv_t to_utf8;
+  struct conv_buffer* outbuf;
+  char* unknown;
+};
+
+typedef struct convert *convert_t;
+
 /* Returns -1 if the string is not a valid UTF-8 string, returns its
    string length otherwise */
 int utf8_strlen(const char* input);
 
 /* Returns 1 if string is valid UTF-8 string, 0 otherwise */
 int is_utf8_string(const char* input);
- 
+
+/* General conversion interface (is bidirectional) */
+/* The strings returned by the convert_* functions live in a buffer owned
+   by the converter: they are overwritten by the next conversion and must
+   not be freed by the caller */
+convert_t initialize_utf8_conversion(const char* charset);
+int conversion_set_unknown(convert_t conv, const char* unknown);
+void cleanup_utf8_conversion(convert_t conv);
+char* convert_from_utf8(convert_t conv, const char* input, int* conv_fails);
+char* convert_to_utf8(convert_t conv, const char* input);
+
+/* Specific locale conversion interface */
 void convert_set_unknown(const char* unknown);
 char* convert_utf8_to_locale(const char* input, int *conv_fails);
 char* convert_locale_to_utf8(const char* input);