From: Peter Verthez
Date: Sun, 13 Jan 2002 12:22:35 +0000 (+0000)
Subject: Character encoding (UTF-8 to locale) example and use in standalone.c.
X-Git-Url: https://git.dlugolecki.net.pl/?a=commitdiff_plain;h=5c40e570e3bd0f3d7139e4d1e429527400c50133;p=gedcom-parse.git

Character encoding (UTF-8 to locale) example and use in standalone.c.
---

diff --git a/Makefile.am b/Makefile.am
index 031d2a3..3e5d93c 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -3,11 +3,13 @@
 # $Name$
 
 SUBDIRS = intl ansel gedcom include . t doc po
 INCLUDES = -I $(srcdir)/include
+CFLAGS = -g -O2
 
 pkgdata_DATA = gedcom.enc
 
 noinst_PROGRAMS = testgedcom
-testgedcom_SOURCES = standalone.c
+testgedcom_SOURCES = standalone.c utf8-locale.c
+noinst_HEADERS = utf8-locale.h
 testgedcom_LDFLAGS = -L gedcom/.libs -lgedcom
 testgedcom_LDADD = @INTLLIBS@
diff --git a/standalone.c b/standalone.c
index 70a3ac1..9c8dc15 100644
--- a/standalone.c
+++ b/standalone.c
@@ -25,7 +25,10 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <locale.h>
+#include <langinfo.h>
 #include "gedcom.h"
+#include "utf8-locale.h"
 
 #define OUTFILE "testgedcom.out"
 FILE* outfile = NULL;
@@ -168,8 +171,11 @@ Gedcom_ctxt source_date_start(Gedcom_ctxt parent, int level, char *tag,
 void default_cb(Gedcom_ctxt ctxt, int level, char *tag, char *raw_value,
                 int tag_value)
 {
+  char *converted = NULL;
+  if (raw_value)
+    converted = convert_utf8_to_locale(raw_value);
   output(0, "== %d %s (%d) %s (ctxt is %d)\n",
-         level, tag, tag_value, raw_value, (int)ctxt);
+         level, tag, tag_value, converted ? converted : "(null)", (int)ctxt);
 }
 
 void subscribe_callbacks()
@@ -252,6 +258,7 @@ int main(int argc, char* argv[])
     exit(1);
   }
 
+  setlocale(LC_ALL, "");
   gedcom_set_debug_level(debug_level, NULL);
   gedcom_set_compat_handling(compat_enabled);
   gedcom_set_error_handling(mech);
diff --git a/utf8-locale.c b/utf8-locale.c
new file mode 100644
index 0000000..c89aa5c
--- /dev/null
+++ b/utf8-locale.c
@@ -0,0 +1,204 @@
+/* Encoding utility from UTF-8 to locale and vice versa
+   Copyright (C) 2001, 2002 Peter Verthez
+
+   Permission granted to do anything with this file that you want, as long
+   as the above copyright is retained in all copies.
+   THERE IS NO WARRANTY - USE AT YOUR OWN RISK
+*/
+
+/* $Id$ */
+/* $Name$ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <iconv.h>
+#include <langinfo.h>
+#include <assert.h>
+#include <errno.h>
+#include "utf8-locale.h"
+
+#define INITIAL_OUTSIZE 256
+#define ICONV_ERROR ((size_t) -1)
+
+/* Conversion descriptors; (iconv_t) -1 means "not opened" */
+static iconv_t utf8_to_locale = (iconv_t) -1;
+static iconv_t locale_to_utf8 = (iconv_t) -1;
+/* Output buffer shared by both directions; it grows on demand and one
+   byte is always kept free for the terminating NUL */
+static char* outbuffer = NULL;
+static size_t outbufsize = 0;
+/* Replacement text for input that cannot be converted */
+static const char* the_unknown = "?";
+
+/* Set the string that replaces UTF-8 characters which have no
+   representation in the locale encoding.  Only the pointer is stored,
+   so the string must stay valid while conversions are done.
+   Passing NULL restores the default "?". */
+void convert_set_unknown(const char* unknown)
+{
+  the_unknown = (unknown != NULL) ? unknown : "?";
+}
+
+/* Close the conversion descriptors that are open and free the output
+   buffer.  Safe to call when only one (or no) descriptor is open. */
+void close_conversion_contexts(void)
+{
+  if (utf8_to_locale != (iconv_t) -1) {
+    iconv_close(utf8_to_locale);
+    utf8_to_locale = (iconv_t) -1;
+  }
+  if (locale_to_utf8 != (iconv_t) -1) {
+    iconv_close(locale_to_utf8);
+    locale_to_utf8 = (iconv_t) -1;
+  }
+  free(outbuffer);
+  outbuffer = NULL;
+  outbufsize = 0;
+}
+
+/* Open both conversion descriptors for the codeset of the current locale
+   and allocate the output buffer.
+   Returns 0 on success, -1 on failure (nothing is left open then). */
+int open_conversion_contexts(void)
+{
+  static int exit_hook_installed = 0;
+
+  assert(utf8_to_locale == (iconv_t) -1);
+  assert(locale_to_utf8 == (iconv_t) -1);
+
+  utf8_to_locale = iconv_open(nl_langinfo(CODESET), "UTF-8");
+  if (utf8_to_locale == (iconv_t) -1)
+    return -1;
+
+  locale_to_utf8 = iconv_open("UTF-8", nl_langinfo(CODESET));
+  if (locale_to_utf8 == (iconv_t) -1) {
+    close_conversion_contexts();
+    return -1;
+  }
+
+  outbuffer = malloc(INITIAL_OUTSIZE);
+  if (outbuffer == NULL) {
+    close_conversion_contexts();
+    return -1;
+  }
+  outbufsize = INITIAL_OUTSIZE;
+
+  /* register the cleanup only once, even if the contexts are reopened */
+  if (!exit_hook_installed && atexit(close_conversion_contexts) == 0)
+    exit_hook_installed = 1;
+  return 0;
+}
+
+/* Double the output buffer, keeping what was already written.
+   *outptr and *outsize are updated to describe the free space of the new
+   buffer, not counting the byte reserved for the terminating NUL.
+   Returns 0 on success, -1 if the buffer cannot grow (the old buffer is
+   left untouched then). */
+static int grow_outbuffer(char** outptr, size_t* outsize)
+{
+  size_t outlen = (size_t) (*outptr - outbuffer);
+  char* newbuffer;
+
+  if (outbufsize > ((size_t) -1) / 2)
+    return -1;
+  newbuffer = realloc(outbuffer, outbufsize * 2);
+  if (newbuffer == NULL)
+    return -1;
+  outbuffer = newbuffer;
+  outbufsize *= 2;
+  *outptr = outbuffer + outlen;
+  *outsize = outbufsize - 1 - outlen;
+  return 0;
+}
+
+/* Step over the input character that iconv rejected: one byte for ASCII,
+   otherwise the first byte plus all continuation bytes (binary 10xxxxxx)
+   that follow it.  Never moves past the end of the input. */
+static void skip_utf8_char(char** inptr, size_t* insize)
+{
+  unsigned char first;
+
+  if (*insize == 0)
+    return;
+  first = (unsigned char) **inptr;
+  (*inptr)++; (*insize)--;
+  if ((first & 0x80) == 0)
+    return;   /* an ASCII character (this case is very improbable) */
+  while (*insize > 0 && ((unsigned char) **inptr & 0xC0) == 0x80) {
+    (*inptr)++; (*insize)--;
+  }
+}
+
+/* Convert INPUT with descriptor CD into the shared output buffer.
+   If SUBSTITUTE is non-zero, input that cannot be converted (EILSEQ) is
+   skipped and replaced by the "unknown" string; otherwise it is an error.
+   Returns the NUL-terminated output buffer, or NULL on error. */
+static char* convert_string(iconv_t cd, char* input, int substitute)
+{
+  char* inptr = input;
+  size_t insize = strlen(input);
+  char* outptr = outbuffer;
+  /* keep one byte free for the terminating NUL */
+  size_t outsize = outbufsize - 1;
+
+  /* make sure we start from an empty state */
+  iconv(cd, NULL, NULL, NULL, NULL);
+
+  while (iconv(cd, &inptr, &insize, &outptr, &outsize) == ICONV_ERROR) {
+    if (errno == E2BIG) {
+      if (grow_outbuffer(&outptr, &outsize) == -1)
+        return NULL;
+    }
+    else if (errno == EILSEQ && substitute) {
+      size_t unkn_len = strlen(the_unknown);
+
+      skip_utf8_char(&inptr, &insize);
+      /* the replacement text needs room in the output as well */
+      while (outsize < unkn_len) {
+        if (grow_outbuffer(&outptr, &outsize) == -1)
+          return NULL;
+      }
+      memcpy(outptr, the_unknown, unkn_len);
+      outptr += unkn_len;
+      outsize -= unkn_len;
+    }
+    else {
+      /* EINVAL (the input ends in the middle of a character) or an
+         unexpected error: give up */
+      return NULL;
+    }
+  }
+  *outptr = '\0';
+  return outbuffer;
+}
+
+/* Convert a NUL-terminated UTF-8 string to the encoding of the current
+   locale.  Characters the locale cannot represent are replaced by the
+   "unknown" string (see convert_set_unknown).
+   Returns a pointer to an internal buffer that is overwritten by the
+   next conversion in either direction (do not free it), or NULL on
+   error or when INPUT is NULL. */
+char* convert_utf8_to_locale(char* input)
+{
+  if (input == NULL)
+    return NULL;
+  if (utf8_to_locale == (iconv_t) -1 && (open_conversion_contexts() == -1))
+    return NULL;
+  assert(utf8_to_locale != (iconv_t) -1);
+  return convert_string(utf8_to_locale, input, 1);
+}
+
+/* Convert a NUL-terminated string in the encoding of the current locale
+   to UTF-8.  Input that is not valid in the locale encoding is an error.
+   Returns a pointer to an internal buffer that is overwritten by the
+   next conversion in either direction (do not free it), or NULL on
+   error or when INPUT is NULL. */
+char* convert_locale_to_utf8(char* input)
+{
+  if (input == NULL)
+    return NULL;
+  if (locale_to_utf8 == (iconv_t) -1 && (open_conversion_contexts() == -1))
+    return NULL;
+  assert(locale_to_utf8 != (iconv_t) -1);
+  return convert_string(locale_to_utf8, input, 0);
+}
diff --git a/utf8-locale.h b/utf8-locale.h
new file mode 100644
index 0000000..4a06187
--- /dev/null
+++ b/utf8-locale.h
@@ -0,0 +1,24 @@
+/* Encoding utility from UTF-8 to locale and vice versa
+   Copyright (C) 2001, 2002 Peter Verthez
+
+   Permission granted to do anything with this file that you want, as long
+   as the above copyright is retained in all copies.
+   THERE IS NO WARRANTY - USE AT YOUR OWN RISK
+*/
+
+/* $Id$ */
+/* $Name$ */
+
+#ifndef __UTF8_LOCALE_H
+#define __UTF8_LOCALE_H
+
+/* Set the replacement for characters the locale cannot represent
+   (default "?").  The string is not copied. */
+void convert_set_unknown(const char* unknown);
+
+/* Both return an internal buffer that is reused by the next conversion
+   (do not free it), or NULL on error. */
+char* convert_utf8_to_locale(char* input);
+char* convert_locale_to_utf8(char* input);
+
+#endif /* __UTF8_LOCALE_H */