From 845f7ac49d5e63b96aca7a4cd1f60ac5e3230033 Mon Sep 17 00:00:00 2001 From: Peter Verthez Date: Fri, 23 Nov 2001 21:04:57 +0000 Subject: [PATCH] Added encoding mapping via configuration file. --- Makefile | 14 ++++++-- encoding.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++- encoding.h | 2 ++ gedcom.enc | 8 +++++ gedcom.y | 4 +-- gedcom_lohi.lex | 7 ++-- standalone.c | 2 +- 7 files changed, 114 insertions(+), 8 deletions(-) create mode 100644 gedcom.enc diff --git a/Makefile b/Makefile index 5b3e9b1..125b82e 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ YACC=bison LEX=flex -CFLAGS=-g -Wall -pedantic +CFLAGS=-g -W -Wall -pedantic YFLAGS=--debug --defines LFLAGS=-8 @@ -13,6 +13,8 @@ gedcom_parse: standalone.o lex.gedcom_1byte_.o lex.gedcom_hilo_.o \ encoding.o $(CC) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@ +libgedcom.so: + lex.gedcom_1byte_.c: gedcom_1byte.lex gedcom.tab.h gedcom.h multilex.h $(LEX) $(LFLAGS) -Pgedcom_1byte_ gedcom_1byte.lex @@ -29,7 +31,7 @@ clean: rm -f core gedcom_parse test_* *.o lex.gedcom_* \ gedcom.tab.* gedcom.output -# Test programs +# Lexer test programs test_1byte: lex.gedcom_1byte_.test.o message.o encoding.o $(CC) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@ @@ -48,3 +50,11 @@ test_lohi: lex.gedcom_lohi_.test.o message.o encoding.o lex.gedcom_lohi_.test.o: lex.gedcom_lohi_.c $(CC) -DLEXER_TEST -c $(CPPFLAGS) $(CFLAGS) $^ -o $@ + +# Test of parser + +test: gedcom_parse + @for file in t/*.ged; do \ + echo "=== testing $$file"; \ + ./gedcom_parse $$file; \ + done diff --git a/encoding.c b/encoding.c index 08f69db..ba5aa1d 100644 --- a/encoding.c +++ b/encoding.c @@ -1,24 +1,107 @@ #include #include +#include +#include #include "gedcom.h" #include "encoding.h" #define INTERNAL_ENCODING "UTF8" +#define ENCODING_CONF_FILE "gedcom.enc" +#define MAXBUF 255 static iconv_t cd_to_internal = (iconv_t) -1; static char int_buf[MAXGEDCLINELEN*2]; +static void *encoding_mapping = NULL; + +struct node { + char *gedcom_name; + char *iconv_name; +}; + +int node_compare(const void *node1, const void *node2) +{ + return strcmp(((const struct node *) node1)->gedcom_name, + ((const struct node *) node2)->gedcom_name); +} + +void add_encoding(char *gedcom_n, char *iconv_n) +{ + void **datum; + struct node *nodeptr = (struct node *) malloc(sizeof *nodeptr); + nodeptr->gedcom_name = (char *) malloc(strlen(gedcom_n) + 1); + nodeptr->iconv_name = (char *) malloc(strlen(iconv_n) + 1); + strcpy(nodeptr->gedcom_name, gedcom_n); + strcpy(nodeptr->iconv_name, iconv_n); + datum = tsearch(nodeptr, &encoding_mapping, node_compare); + if ((datum == NULL) || (*datum != nodeptr)) { + gedcom_warning("Duplicate entry found for encoding '%s', ignoring", + gedcom_n); + } +} + +char* get_encoding(char* gedcom_n) +{ + void **datum; + struct node search_node; + search_node.gedcom_name = gedcom_n; + datum = tfind(&search_node, &encoding_mapping, node_compare); + if (datum == NULL) { + gedcom_error("No encoding found for '%s'", gedcom_n); + return NULL; + } + else { + return ((const struct node *) *datum)->iconv_name; + } +} + +void init_encodings() +{ + if (encoding_mapping == NULL) { + FILE *in; + char buffer[MAXBUF + 1]; + char gedcom_n[MAXBUF + 1]; + char iconv_n[MAXBUF + 1]; + in = fopen(ENCODING_CONF_FILE, "r"); + if (in != NULL) { + while (fgets(buffer, sizeof(buffer), in) != NULL) { + if (buffer[strlen(buffer) - 1] != '\n') { + gedcom_error("Line too long in encoding configuration file '%s'", + ENCODING_CONF_FILE); + return; + } + else if (buffer[0] != '#') { + if (sscanf(buffer, "%s %s", gedcom_n, iconv_n) == 2) { + add_encoding(gedcom_n, iconv_n); + } + } + } + fclose(in); + } + else { + gedcom_warning("Could not open encoding configuration file '%s'", + ENCODING_CONF_FILE); + } + } +} int open_conv_to_internal(char* fromcode) { + char *encoding = get_encoding(fromcode); if (cd_to_internal != (iconv_t) -1) iconv_close(cd_to_internal); - cd_to_internal = iconv_open(INTERNAL_ENCODING, fromcode); + if (encoding == NULL) { + cd_to_internal = (iconv_t) -1; + } + else { + cd_to_internal = iconv_open(INTERNAL_ENCODING, encoding); + } return (cd_to_internal != (iconv_t) -1); } void close_conv_to_internal() { iconv_close(cd_to_internal); + cd_to_internal = (iconv_t) -1; } char* to_internal(char* str, size_t len) diff --git a/encoding.h b/encoding.h index c4b70ee..287e478 100644 --- a/encoding.h +++ b/encoding.h @@ -1,3 +1,5 @@ int open_conv_to_internal(char* fromcode); void close_conv_to_internal(); char* to_internal(char* str, size_t len); +void init_encodings(); +char* get_encoding(char* gedcom_name); diff --git a/gedcom.enc b/gedcom.enc new file mode 100644 index 0000000..8c2a56b --- /dev/null +++ b/gedcom.enc @@ -0,0 +1,8 @@ +# Mapping of charsets for gedcom parsing +# Each line contains the gedcom name, appended with (LOHI) or (HILO) +# for 2 byte encodings, and the iconv name of the charset, separated +# by whitespace +UNICODE(LOHI) UTF16LE +UNICODE(HILO) UTF16BE +ASCII ASCII +ANSI CP1252 diff --git a/gedcom.y b/gedcom.y index 7b8b2ef..8aad4fd 100644 --- a/gedcom.y +++ b/gedcom.y @@ -2110,7 +2110,7 @@ opt_line_item : /* empty */ { } | DELIM line_item { } ; -line_item : anychar { int i; +line_item : anychar { size_t i; CLEAR_BUFFER(string_buf); string_buf_ptr = string_buf; /* The following also takes care of '@@' */ @@ -2132,7 +2132,7 @@ line_item : anychar { int i; YYERROR; } else { - int i; + size_t i; /* The following also takes care of '@@' */ if (!strncmp($2, "@@", 3)) *string_buf_ptr++ = '@'; diff --git a/gedcom_lohi.lex b/gedcom_lohi.lex index 6d88b43..bc85468 100644 --- a/gedcom_lohi.lex +++ b/gedcom_lohi.lex @@ -9,6 +9,8 @@ #include "gedcom.h" #include "multilex.h" #include "encoding.h" + +#define YY_NO_UNPUT %} %s NORMAL @@ -328,8 +330,9 @@ int yywrap() int main() { - int tok; - int res = open_conv_to_internal("UTF16LE"); + int tok, res; + init_encodings(); + res = open_conv_to_internal("UNICODE(LOHI)"); if (!res) { gedcom_error("Unable to open conversion context: %s", strerror(errno)); diff --git a/standalone.c b/standalone.c index 452c1db..cedfca2 100644 --- a/standalone.c +++ b/standalone.c @@ -66,7 +66,7 @@ int main(int argc, char* argv[]) gedcom_set_compat_handling(compat_enabled); gedcom_set_error_handling(mech); - if (gedcom_xxx_parse(file_name) == 0) { + if (gedcom_parse_file(file_name) == 0) { printf("Parse succeeded\n"); return 0; } -- 2.30.2