X-Git-Url: https://git.dlugolecki.net.pl/?a=blobdiff_plain;f=gedcom%2Fmultilex.c;h=e4128174d5e6393ea497a8b6faa9f0be079a44c2;hb=681b182b6f003e91a47a098b7e8ac632315f61a6;hp=420bfcb8b1c88b7a684328313f85055601df6af2;hpb=32da62601457ba994c6b71d71470ae066fc3969b;p=gedcom-parse.git diff --git a/gedcom/multilex.c b/gedcom/multilex.c index 420bfcb..e412817 100644 --- a/gedcom/multilex.c +++ b/gedcom/multilex.c @@ -24,6 +24,7 @@ #include "gedcom_internal.h" #include "multilex.h" #include "encoding.h" +#include "encoding_state.h" #include "xref.h" int line_no = 0; @@ -38,19 +39,19 @@ int lexer_init(Encoding enc, FILE* f) if (enc == ONE_BYTE) { lf = &gedcom_1byte_lex; gedcom_1byte_myinit(f); - set_encoding_width(enc); + set_read_encoding_width(enc); return open_conv_to_internal("ASCII"); } else if (enc == TWO_BYTE_HILO) { lf = &gedcom_hilo_lex; gedcom_hilo_myinit(f); - set_encoding_width(enc); + set_read_encoding_width(enc); return open_conv_to_internal("UNICODE"); } else if (enc == TWO_BYTE_LOHI) { lf = &gedcom_lohi_lex; gedcom_lohi_myinit(f); - set_encoding_width(enc); + set_read_encoding_width(enc); return open_conv_to_internal("UNICODE"); } else { @@ -79,6 +80,7 @@ int determine_encoding(FILE* f) char first[2]; int read; + set_read_encoding_bom(WITHOUT_BOM); read = fread(first, 1, 2, f); if (read != 2) { gedcom_warning(_("Error reading from input file: %s"), strerror(errno)); @@ -86,26 +88,28 @@ int determine_encoding(FILE* f) return ONE_BYTE; } else if ((first[0] == '0') && (first[1] == ' ')) { - gedcom_debug_print(_("One-byte encoding")); + gedcom_debug_print("One-byte encoding"); rewind_file(f); return ONE_BYTE; } else if ((first[0] == '\0') && (first[1] == '0')) { - gedcom_debug_print(_("Two-byte encoding, high-low")); + gedcom_debug_print("Two-byte encoding, high-low"); rewind_file(f); return TWO_BYTE_HILO; } else if ((first[0] == '\xFE') && (first[1] == '\xFF')) { - gedcom_debug_print(_("Two-byte encoding, high-low, with BOM")); + gedcom_debug_print("Two-byte encoding, high-low, with BOM"); + set_read_encoding_bom(WITH_BOM); return TWO_BYTE_HILO; } else if ((first[0] == '0') && (first[1] == '\0')) { - gedcom_debug_print(_("Two-byte encoding, low-high")); + gedcom_debug_print("Two-byte encoding, low-high"); rewind_file(f); return TWO_BYTE_LOHI; } else if ((first[0] == '\xFF') && (first[1] == '\xFE')) { - gedcom_debug_print(_("Two-byte encoding, low-high, with BOM")); + gedcom_debug_print("Two-byte encoding, low-high, with BOM"); + set_read_encoding_bom(WITH_BOM); return TWO_BYTE_LOHI; } else if ((first[0] == '\xEF') && (first[1] == '\xBB')) { @@ -115,7 +119,12 @@ int determine_encoding(FILE* f) rewind_file(f); } else if (first[0] == '\xBF') { - gedcom_debug_print(_("UTF-8 encoding, with BOM")); + set_read_encoding_bom(WITH_BOM); + gedcom_debug_print("UTF-8 encoding, with BOM"); + } + else { + gedcom_warning(_("Unknown encoding, falling back to one-byte")); + rewind_file(f); } return ONE_BYTE; }