X-Git-Url: https://git.dlugolecki.net.pl/?a=blobdiff_plain;f=utf8%2Futf8-convert.c;h=9dbfaf50e8b15d1ca03613e458faa07ca0913f01;hb=18ff02c2f0dff12904dbd2dc4d6c40ef3ad4a6d2;hp=83c3608b50a2eeb6de287ee1bbed35c7a515121e;hpb=af581cf02cbbed0d24636be0b3533587448e7e0f;p=gedcom-parse.git diff --git a/utf8/utf8-convert.c b/utf8/utf8-convert.c index 83c3608..9dbfaf5 100644 --- a/utf8/utf8-convert.c +++ b/utf8/utf8-convert.c @@ -1,40 +1,79 @@ -/* Encoding utility from UTF-8 to locale and vice versa +/* Encoding utility from UTF-8 to another charset and vice versa Copyright (C) 2001, 2002 Peter Verthez - Permission granted to do anything with this file that you want, as long - as the above copyright is retained in all copies. - THERE IS NO WARRANTY - USE AT YOUR OWN RISK + The UTF8 tools library is free software; you can redistribute it + and/or modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The Gedcom parser library is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the Gedcom parser library; if not, write to the + Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ /* $Id$ */ /* $Name$ */ -#include "utf8.h" +#include "utf8tools.h" #include #include #include #include #include "config.h" -#define INITIAL_OUTSIZE 256 +#define INITIAL_BUFSIZE 256 #define DEFAULT_UNKNOWN "?" -struct conv_buffer* create_conv_buffer(int size) +#define INTERNAL_BUFFER 0 +#define EXTERNAL_BUFFER 1 + +struct conv_buffer { + char* buffer; + size_t size; + int type; /* For internal use */ +}; + +struct convert { + iconv_t from_utf8; + iconv_t to_utf8; + struct conv_buffer* inbuf; + size_t insize; + struct conv_buffer* outbuf; + char* unknown; +}; + +void reset_conv_buffer(conv_buffer_t buf) +{ + memset(buf->buffer, 0, buf->size); +} + +conv_buffer_t create_conv_buffer(int size) { struct conv_buffer* buf = NULL; + if (size == 0) size = INITIAL_BUFSIZE; + buf = (struct conv_buffer*) malloc(sizeof(struct conv_buffer)); if (buf) { buf->size = size; buf->buffer = (char*)malloc(size); - if (!buf->buffer) + buf->type = EXTERNAL_BUFFER; + if (buf->buffer) + reset_conv_buffer(buf); + else buf->size = 0; } return buf; } -void free_conv_buffer(struct conv_buffer* buf) +void free_conv_buffer(conv_buffer_t buf) { if (buf) { free(buf->buffer); @@ -42,7 +81,7 @@ void free_conv_buffer(struct conv_buffer* buf) } } -char* grow_conv_buffer(struct conv_buffer* buf, char* curr_pos) +char* grow_conv_buffer(conv_buffer_t buf, char* curr_pos) { size_t outlen, new_size; char* new_buffer; @@ -60,9 +99,10 @@ char* grow_conv_buffer(struct conv_buffer* buf, char* curr_pos) return NULL; } -convert_t initialize_utf8_conversion(const char* charset) +convert_t initialize_utf8_conversion(const char* charset, int external_outbuf) { struct convert *conv = NULL; + int save_errno = 0; int cleanup = 0; conv = (struct convert *)malloc(sizeof(struct convert)); @@ -72,6 +112,8 @@ convert_t initialize_utf8_conversion(const char* charset) /* First initialize to default values */ conv->from_utf8 = (iconv_t)-1; conv->to_utf8 = (iconv_t)-1; + conv->inbuf = NULL; + conv->insize = 0; conv->outbuf = NULL; conv->unknown = NULL; @@ -80,18 +122,29 @@ convert_t initialize_utf8_conversion(const char* charset) if (conv->from_utf8 != (iconv_t)-1) { conv->to_utf8 = iconv_open("UTF-8", charset); if (conv->to_utf8 != (iconv_t)-1) { - conv->outbuf = create_conv_buffer(INITIAL_OUTSIZE); - if (conv->outbuf) { - conv->unknown = strdup(DEFAULT_UNKNOWN); - if (conv->unknown) - cleanup = 0; /* All successful */ + conv->unknown = strdup(DEFAULT_UNKNOWN); + if (conv->unknown) { + conv->inbuf = create_conv_buffer(INITIAL_BUFSIZE); + conv->inbuf->type = INTERNAL_BUFFER; + if (conv->inbuf) { + if (external_outbuf) + cleanup = 0; + else { + conv->outbuf = create_conv_buffer(INITIAL_BUFSIZE); + conv->outbuf->type = INTERNAL_BUFFER; + if (conv->outbuf) + cleanup = 0; /* All successful */ + } + } } } } } if (cleanup) { + save_errno = errno; cleanup_utf8_conversion(conv); + errno = save_errno; conv = NULL; } @@ -115,6 +168,19 @@ int conversion_set_unknown(convert_t conv, const char* unknown) return result; } +int conversion_set_output_buffer(convert_t conv, conv_buffer_t buf) +{ + if (!conv) + return 0; + else if ((!conv->outbuf || conv->outbuf->type == EXTERNAL_BUFFER) + && buf && buf->type == EXTERNAL_BUFFER) { + conv->outbuf = buf; + return 1; + } + else + return 0; +} + void cleanup_utf8_conversion(convert_t conv) { if (conv) { @@ -122,7 +188,9 @@ void cleanup_utf8_conversion(convert_t conv) iconv_close(conv->from_utf8); if (conv->to_utf8 != (iconv_t)-1) iconv_close(conv->to_utf8); - if (conv->outbuf) + if (conv->inbuf && conv->inbuf->type == INTERNAL_BUFFER) + free_conv_buffer(conv->inbuf); + if (conv->outbuf && conv->outbuf->type == INTERNAL_BUFFER) free_conv_buffer(conv->outbuf); if (conv->unknown) free(conv->unknown); @@ -130,7 +198,8 @@ void cleanup_utf8_conversion(convert_t conv) } } -char* convert_from_utf8(convert_t conv, const char* input, int* conv_fails) +char* convert_from_utf8(convert_t conv, const char* input, int* conv_fails, + size_t* output_len) { size_t insize = strlen(input); size_t outsize; @@ -139,7 +208,7 @@ char* convert_from_utf8(convert_t conv, const char* input, int* conv_fails) size_t nconv; struct conv_buffer* outbuf; - if (!conv) { + if (!conv || !conv->outbuf) { if (conv_fails != NULL) *conv_fails = insize; return NULL; } @@ -150,7 +219,7 @@ char* convert_from_utf8(convert_t conv, const char* input, int* conv_fails) outbuf = conv->outbuf; outptr = outbuf->buffer; outsize = outbuf->size; - memset(outbuf->buffer, 0, outbuf->size); + reset_conv_buffer(conv->outbuf); nconv = iconv(conv->from_utf8, &inptr, &insize, &outptr, &outsize); while (nconv == (size_t)-1) { if (errno == E2BIG) { @@ -189,19 +258,19 @@ char* convert_from_utf8(convert_t conv, const char* input, int* conv_fails) } nconv = iconv(conv->from_utf8, &inptr, &insize, &outptr, &outsize); } + if (output_len) *output_len = outptr - outbuf->buffer; return outbuf->buffer; } -char* convert_to_utf8(convert_t conv, const char* input) +char* convert_to_utf8(convert_t conv, const char* input, size_t input_len) { - size_t insize = strlen(input); size_t outsize; ICONV_CONST char *inptr = (ICONV_CONST char*) input; char *outptr; size_t nconv; struct conv_buffer* outbuf; - if (!conv) + if (!conv || !conv->outbuf) return NULL; /* make sure we start from an empty state */ iconv(conv->to_utf8, NULL, NULL, NULL, NULL); @@ -209,8 +278,8 @@ char* convert_to_utf8(convert_t conv, const char* input) outbuf = conv->outbuf; outptr = outbuf->buffer; outsize = outbuf->size; - memset(outbuf->buffer, 0, outbuf->size); - nconv = iconv(conv->to_utf8, &inptr, &insize, &outptr, &outsize); + reset_conv_buffer(conv->outbuf); + nconv = iconv(conv->to_utf8, &inptr, &input_len, &outptr, &outsize); while (nconv == (size_t)-1) { if (errno == E2BIG) { /* grow the output buffer */ @@ -229,7 +298,54 @@ char* convert_to_utf8(convert_t conv, const char* input) /* EBADF is an error which should be captured by the first if above */ return NULL; } - nconv = iconv(conv->to_utf8, &inptr, &insize, &outptr, &outsize); + nconv = iconv(conv->to_utf8, &inptr, &input_len, &outptr, &outsize); } return outbuf->buffer; } + +char* convert_to_utf8_incremental(convert_t conv, + const char* input, size_t input_len) +{ + size_t res; + struct conv_buffer* outbuf = conv->outbuf; + struct conv_buffer* inbuf = conv->inbuf; + size_t outsize = outbuf->size; + char* wrptr = outbuf->buffer; + ICONV_CONST char* rdptr = (ICONV_CONST char*) inbuf->buffer; + char* retval = outbuf->buffer; + + if (!conv || !conv->outbuf) + return NULL; + + /* set up input buffer (concatenate to what was left previous time) */ + /* can't use strcpy, because possible null bytes from unicode */ + while (conv->insize + input_len > inbuf->size) + grow_conv_buffer(inbuf, inbuf->buffer + conv->insize); + memcpy(inbuf->buffer + conv->insize, input, input_len); + conv->insize += input_len; + + /* set up output buffer (empty it) */ + reset_conv_buffer(outbuf); + + /* do the conversion */ + res = iconv(conv->to_utf8, &rdptr, &conv->insize, &wrptr, &outsize); + if (res == (size_t)-1) { + if (errno == EILSEQ) { + /* restart from an empty state and return NULL */ + retval = NULL; + rdptr++; + conv->insize--; + } + else if (errno == EINVAL) { + /* Do nothing, leave it to next iteration */ + } + else { + retval = NULL; + } + } + + /* then shift what is left over to the head of the input buffer */ + memmove(inbuf->buffer, rdptr, conv->insize); + memset(inbuf->buffer + conv->insize, 0, inbuf->size - conv->insize); + return retval; +}