From 95deed7400e87af12a987af2abf5187ab1ddf9aa Mon Sep 17 00:00:00 2001 From: Peter Verthez Date: Sun, 11 Nov 2001 17:39:41 +0000 Subject: [PATCH] Basic framework for recognizing Unicode encoding. --- gedcom.h | 8 ++++++ gedcom.lex | 3 +- standalone.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 87 insertions(+), 4 deletions(-) diff --git a/gedcom.h b/gedcom.h index be18280..6f35d4e 100644 --- a/gedcom.h +++ b/gedcom.h @@ -18,6 +18,13 @@ typedef enum _MECH { IGNORE_ERRORS } MECHANISM; +/* Basic file encoding */ +typedef enum _ENC { + ONE_BYTE, + TWO_BYTE_HILO, + TWO_BYTE_LOHI +} ENCODING; + int gedcom_error(char* s, ...); int gedcom_warning(char* s, ...); int gedcom_debug_print(char* s, ...); @@ -27,3 +34,4 @@ void gedcom_set_compat_handling(int enable_compat); int gedcom_parse(); int gedcom_lex(); extern int line_no; +extern FILE *gedcom_in; diff --git a/gedcom.lex b/gedcom.lex index 7d3f2e2..62fd3f6 100644 --- a/gedcom.lex +++ b/gedcom.lex @@ -301,7 +301,8 @@ else { } } -. { gedcom_error("Unexpected character: '%s'", gedcom_text); +. { gedcom_error("Unexpected character: '%s' (0x%02x)", + gedcom_text, gedcom_text[0]); return BADTOKEN; } diff --git a/standalone.c b/standalone.c index 06ac82a..41c7752 100644 --- a/standalone.c +++ b/standalone.c @@ -5,7 +5,8 @@ void show_help () { - printf("gedcom-parse test program for libgedcom\n"); + printf("gedcom-parse test program for libgedcom\n\n"); + printf("Usage: gedcom-parse [options] file\n"); printf("Options:\n"); printf(" -h Show this help text\n"); printf(" -nc Disable compatibility mode\n"); @@ -16,11 +17,72 @@ void show_help () printf(" -da Debug setting: libgedcom + yacc debug messages\n"); } +int determine_encoding(FILE* f) +{ + char first[2]; + + fread(first, 1, 2, f); + if ((first[0] == '0') && (first[1] == ' ')) { + gedcom_warning("One-byte encoding"); + fseek(f, 0, 0); + return ONE_BYTE; + } + else if ((first[0] == '\0') && (first[1] == '0')) + { + gedcom_warning("Two-byte encoding, high-low"); + fseek(f, 0, 0); + return TWO_BYTE_HILO; + } + else if ((first[0] == '\xFE') && (first[1] == '\xFF')) + { + gedcom_warning("Two-byte encoding, high-low, with BOM"); + return TWO_BYTE_HILO; + } + else if ((first[0] == '0') && (first[1] == '\0')) + { + gedcom_warning("Two-byte encoding, low-high"); + fseek(f, 0, 0); + return TWO_BYTE_LOHI; + } + else if ((first[0] == '\xFF') && (first[1] == '\xFE')) + { + gedcom_warning("Two-byte encoding, low-high, with BOM"); + return TWO_BYTE_LOHI; + } + else { + gedcom_warning("Unknown encoding, falling back to one-byte"); + fseek(f, 0, 0); + return ONE_BYTE; + } +} + +int gedcom_xxx_parse(char* file_name) +{ + ENCODING enc; + FILE* file = fopen (file_name, "r"); + if (!file) { + printf("Could not open file '%s'\n", file_name); + exit(1); + } + enc = determine_encoding(file); + + if (enc == ONE_BYTE) { + gedcom_in = file; + return gedcom_parse(); + } + else { + printf("No parser yet for encoding\n"); + exit(1); + } +} + int main(int argc, char* argv[]) { MECHANISM mech = IMMED_FAIL; int compat_enabled = 1; int debug_level = 0; + char* file_name = NULL; + if (argc > 1) { int i; for (i=1; i