From 74fde9789d7acaa629a576298d45421953a51bb1 Mon Sep 17 00:00:00 2001 From: Peter Verthez Date: Sat, 19 Jan 2002 13:29:02 +0000 Subject: [PATCH] Added parameter to conversion from UTF-8 to locale, to return the number of conversion failures. --- doc/usage.html | 69 ++++++++++++++++++++++++++++++-------------------- standalone.c | 7 ++--- utf8-locale.c | 4 ++- utf8-locale.h | 2 +- 4 files changed, 49 insertions(+), 33 deletions(-) diff --git a/doc/usage.html b/doc/usage.html index 00003be..5d36b89 100644 --- a/doc/usage.html +++ b/doc/usage.html @@ -452,7 +452,43 @@ All strings passed by the GEDCOM parser to the application are in UTF-8 encoding be able to display it.

The most common case is that the output character set is controlled by the locale mechanism (i.e. via the LANG, LC_ALL or LC_CTYPE environment variables), which also controls the gettext - mechanism in the application.  For this, the following steps need to + mechanism in the application.  
+
+
+ + + + + The source distribution of gedcom-parse contains an example implementation (utf8-locale.c and utf8-locale.h + in the top directory).   Feel free to use it in +your source code (it is not part of the library, and it isn't installed anywhere, +so you need to copy the source and header file into your application). + 
+
+ +Its interface is:
+
char *convert_utf8_to_locale (char *input, int *conv_failures);
char *convert_locale_to_utf8 (char *input);
+ +Both functions return a pointer to a static buffer that is overwritten on +each call.  To function properly, the application must first set the +locale using the setlocale function (the second step detailed below). + All other steps given below, including setting up and closing down the conversion +handles, are transparently handled by the two functions.  
+
+If you pass a pointer to an integer to the first function, it will be set +to the number of conversion failures, i.e. characters that couldn't be converted; +you can also just pass NULL if you are not interested (note that usually, the interesting information is just whether there were + any conversion failures at all, i.e. whether the integer is greater +than zero).  The second function doesn't need this, because text in any +locale's character set can be converted to UTF-8.
+
+ +You can change the "?" that is output for characters that can't be converted +to any string you want, using the following function before the conversion +calls:
+
void convert_set_unknown (const char *unknown);
+
+If you want to have your own functions for it instead of this example implementation, the following steps need to be taken by the application (more detailed info can be found in the info file of the GNU libc library in the "Generic Charset Conversion" section under "Character Set Handling" or online here):
@@ -527,33 +563,10 @@ characters can't be represented in the target character set).  The ic
iconv_close(iconv_handle);
- - - - The source distribution of gedcom-parse contains an example implementation (utf8-locale.c and utf8-locale.h - in the top directory) that grows the output buffer dynamically and outputs -"?" for characters that can't be converted.  Feel free to use it in -your source code (it is not part of the library, and it isn't installed anywhere, -so you need to take over the source and header file in your application). - 
-
-Its interface is:
-
-
char *convert_utf8_to_locale (char *input);
char *convert_locale_to_utf8 (char *input);
-
-Both functions return a pointer to a static buffer that is overwritten on -each call.  To function properly, the application must first set the -locale using the setlocale function (the second step above). - All other steps, including setting up and closing down the conversion -handles, are transparantly handled by the two functions.
-
-You can change the "?" that is output for characters that can't be converted -to any string you want, using the following function before the conversion -calls:
-
-
void convert_set_unknown (const char *unknown);
-
-
+ The example implementation mentioned above grows the output buffer dynamically and outputs +"?" for characters that can't be converted.
+
+
$Id$
$Name$

diff --git a/standalone.c b/standalone.c index 9c8dc15..a4d4477 100644 --- a/standalone.c +++ b/standalone.c @@ -172,10 +172,11 @@ void default_cb(Gedcom_ctxt ctxt, int level, char *tag, char *raw_value, int tag_value) { char *converted = NULL; + int conv_fails; if (raw_value) - converted = convert_utf8_to_locale(raw_value); - output(0, "== %d %s (%d) %s (ctxt is %d)\n", - level, tag, tag_value, converted, (int)ctxt); + converted = convert_utf8_to_locale(raw_value, &conv_fails); + output(0, "== %d %s (%d) %s (ctxt is %d, conversion failures: %d)\n", + level, tag, tag_value, converted, (int)ctxt, conv_fails); } void subscribe_callbacks() diff --git a/utf8-locale.c b/utf8-locale.c index c89aa5c..9946cc9 100644 --- a/utf8-locale.c +++ b/utf8-locale.c @@ -60,7 +60,7 @@ int open_conversion_contexts() } } -char* convert_utf8_to_locale(char* input) +char* convert_utf8_to_locale(char* input, int *conv_fails) { size_t insize = strlen(input); size_t outsize; @@ -73,6 +73,7 @@ char* convert_utf8_to_locale(char* input) assert(utf8_to_locale != (iconv_t) -1); /* make sure we start from an empty state */ iconv(utf8_to_locale, NULL, NULL, NULL, NULL); + if (conv_fails != NULL) *conv_fails = 0; /* set up output buffer (empty it) */ outptr = outbuffer; outsize = outbufsize; @@ -92,6 +93,7 @@ char* convert_utf8_to_locale(char* input) else if (errno == EILSEQ) { /* skip over character */ const char* unkn_ptr = the_unknown; + if (conv_fails != NULL) (*conv_fails)++; if ((*inptr & 0x80) == 0) { /* an ASCII character, just skip one (this case is very improbable) */ inptr++; insize--; diff --git a/utf8-locale.h b/utf8-locale.h index dec20ef..ffe974b 100644 --- a/utf8-locale.h +++ b/utf8-locale.h @@ -17,7 +17,7 @@ __BEGIN_DECLS void convert_set_unknown(const char* unknown); -char* convert_utf8_to_locale(char* input); +char* convert_utf8_to_locale(char* input, int *conv_fails); char* convert_locale_to_utf8(char* input); __END_DECLS -- 2.30.2