From a2f7d56476e81f8689d56e6a5641469f6497c487 Mon Sep 17 00:00:00 2001 From: Peter Verthez Date: Fri, 16 Nov 2001 10:27:29 +0000 Subject: [PATCH] Full unicode support. --- Makefile | 48 ++++++- encoding.c | 34 +++++ encoding.h | 3 + gedcom.h | 12 +- gedcom.lex | 314 ------------------------------------------ gedcom.y | 20 ++- gedcom_lohi.lex | 357 ++++++++++++++++++++++++++++++++++++++++++++++++ standalone.c | 88 +----------- 8 files changed, 457 insertions(+), 419 deletions(-) create mode 100644 encoding.c create mode 100644 encoding.h delete mode 100644 gedcom.lex create mode 100644 gedcom_lohi.lex diff --git a/Makefile b/Makefile index a0ca68a..5b3e9b1 100644 --- a/Makefile +++ b/Makefile @@ -1,16 +1,50 @@ # $Id$ # $Name$ -CFLAGS=-Wall -pedantic +YACC=bison +LEX=flex -gedcom_parse: standalone.o lex.gedcom_.o gedcom.tab.o - cc standalone.o lex.gedcom_.o gedcom.tab.o -o gedcom_parse +CFLAGS=-g -Wall -pedantic +YFLAGS=--debug --defines +LFLAGS=-8 -lex.gedcom_.c: gedcom.lex gedcom.tab.h gedcom.h - flex -8 -Pgedcom_ gedcom.lex +gedcom_parse: standalone.o lex.gedcom_1byte_.o lex.gedcom_hilo_.o \ + lex.gedcom_lohi_.o gedcom.tab.o message.o multilex.o \ + encoding.o + $(CC) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@ + +lex.gedcom_1byte_.c: gedcom_1byte.lex gedcom.tab.h gedcom.h multilex.h + $(LEX) $(LFLAGS) -Pgedcom_1byte_ gedcom_1byte.lex + +lex.gedcom_hilo_.c: gedcom_hilo.lex gedcom.tab.h gedcom.h multilex.h + $(LEX) $(LFLAGS) -Pgedcom_hilo_ gedcom_hilo.lex + +lex.gedcom_lohi_.c: gedcom_lohi.lex gedcom.tab.h gedcom.h multilex.h + $(LEX) $(LFLAGS) -Pgedcom_lohi_ gedcom_lohi.lex gedcom.tab.c gedcom.tab.h: gedcom.y gedcom.h - bison --debug --defines --name-prefix=gedcom_ gedcom.y + $(YACC) $(YFLAGS) --name-prefix=gedcom_ gedcom.y clean: - rm -f core gedcom_parse *.o lex.gedcom_.c gedcom.tab.* gedcom.output + rm -f core gedcom_parse test_* *.o lex.gedcom_* \ + gedcom.tab.* gedcom.output + +# Test programs + +test_1byte: lex.gedcom_1byte_.test.o message.o encoding.o + $(CC) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@ + +lex.gedcom_1byte_.test.o: lex.gedcom_1byte_.c + $(CC) -DLEXER_TEST -c $(CPPFLAGS) $(CFLAGS) $^ -o $@ + +test_hilo: lex.gedcom_hilo_.test.o message.o encoding.o + $(CC) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@ + +lex.gedcom_hilo_.test.o: lex.gedcom_hilo_.c + $(CC) -DLEXER_TEST -c $(CPPFLAGS) $(CFLAGS) $^ -o $@ + +test_lohi: lex.gedcom_lohi_.test.o message.o encoding.o + $(CC) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@ + +lex.gedcom_lohi_.test.o: lex.gedcom_lohi_.c + $(CC) -DLEXER_TEST -c $(CPPFLAGS) $(CFLAGS) $^ -o $@ diff --git a/encoding.c b/encoding.c new file mode 100644 index 0000000..08f69db --- /dev/null +++ b/encoding.c @@ -0,0 +1,34 @@ +#include +#include +#include "gedcom.h" +#include "encoding.h" + +#define INTERNAL_ENCODING "UTF8" + +static iconv_t cd_to_internal = (iconv_t) -1; +static char int_buf[MAXGEDCLINELEN*2]; + +int open_conv_to_internal(char* fromcode) +{ + if (cd_to_internal != (iconv_t) -1) + iconv_close(cd_to_internal); + cd_to_internal = iconv_open(INTERNAL_ENCODING, fromcode); + return (cd_to_internal != (iconv_t) -1); +} + +void close_conv_to_internal() +{ + iconv_close(cd_to_internal); +} + +char* to_internal(char* str, size_t len) +{ + size_t insize = len; + size_t outsize = MAXGEDCLINELEN * 2; + char *wrptr = int_buf; + char *rdptr = str; + memset(int_buf, 0, sizeof(int_buf)); + iconv(cd_to_internal, &rdptr, &insize, &wrptr, &outsize); + return int_buf; +} + diff --git a/encoding.h b/encoding.h new file mode 100644 index 0000000..c4b70ee --- /dev/null +++ b/encoding.h @@ -0,0 +1,3 @@ +int open_conv_to_internal(char* fromcode); +void close_conv_to_internal(); +char* to_internal(char* str, size_t len); diff --git a/gedcom.h b/gedcom.h index 6f35d4e..5fdf018 100644 --- a/gedcom.h +++ b/gedcom.h @@ -4,6 +4,7 @@ #include #include #include +#include #define MAXGEDCLEVEL 99 #define MAXGEDCLINELEN 256 @@ -18,20 +19,17 @@ typedef enum _MECH { IGNORE_ERRORS } MECHANISM; -/* Basic file encoding */ -typedef enum _ENC { - ONE_BYTE, - TWO_BYTE_HILO, - TWO_BYTE_LOHI -} ENCODING; int gedcom_error(char* s, ...); int gedcom_warning(char* s, ...); +int gedcom_message(char* s, ...); int gedcom_debug_print(char* s, ...); void gedcom_set_debug_level(int level); void gedcom_set_error_handling(MECHANISM mechanism); void gedcom_set_compat_handling(int enable_compat); + int gedcom_parse(); + int gedcom_lex(); + extern int line_no; -extern FILE *gedcom_in; diff --git a/gedcom.lex b/gedcom.lex deleted file mode 100644 index 62fd3f6..0000000 --- a/gedcom.lex +++ /dev/null @@ -1,314 +0,0 @@ -/* $Id$ */ -/* $Name$ */ - -%{ -#include "gedcom.tab.h" -#include "gedcom.h" -%} - -%s NORMAL -%s EXPECT_TAG - -alpha [A-Za-z_] -digit [0-9] -delim " " -tab [\t] -hash # -literal_at @@ -otherchar [\x21-\x22\x24-\x2F\x3A-\x3F\x5B-\x5E\x60\x7B-\x7E\x80-\xFE] -terminator \x0D|\x0A|\x0D\x0A|\x0A\x0D - -any_char {alpha}|{digit}|{otherchar}|{delim}|{hash}|{literal_at} -any_but_delim {alpha}|{digit}|{otherchar}|{hash}|{literal_at} -non_at {alpha}|{digit}|{otherchar}|{delim}|{hash} -alphanum {alpha}|{digit} -gen_delim {delim}|{tab} - -escape @#{any_char}+@ -pointer @{alphanum}{non_at}+@ - -%{ -int current_level=-1; -int level_diff=MAXGEDCLEVEL; -int line_no=1; -%} - -%% - - /* The GEDCOM level number is converted into a sequence of opening - and closing brackets. Simply put, the following GEDCOM fragment: - - 0 HEAD - 1 SOUR genes - 2 VERS 1.6 - 2 NAME Genes - 1 DATE 07 OCT 2001 - ... - 0 TRLR - - is converted into: - - { HEAD (initial) - { SOUR genes (1 higher: no closing brackets) - { VERS 1.6 (1 higher: no closing brackets) - } { NAME Genes (same level: 1 closing bracket) - } } { DATE 07 OCT 2001 (1 lower: 2 closing brackets) - ... - } { TRLR } - - or more clearly: - - { HEAD - { SOUR genes - { VERS 1.6 } - { NAME Genes } } - { DATE 07 OCT 2001 - ... } - { TRLR } - - But because this means that one token is converted into a series - of tokens, there is some initial code following immediately here - that returns "pending" tokens. */ - -%{ -char string_buf[MAXGEDCLINELEN+1]; - -if (level_diff < 1) { - level_diff++; - return CLOSE; -} -else if (level_diff == 1) { - level_diff++; - return OPEN; -} -else { - /* out of brackets... */ -} - -#define MKTAGACTION(tag) \ - { gedcom_lval.string = gedcom_text; \ - BEGIN(NORMAL); \ - return TAG_##tag; } - -%} - -{gen_delim}* /* ignore leading whitespace (also tabs) */ - -0{digit}+ { gedcom_error ("Level number with leading zero"); - return BADTOKEN; - } - -{digit}+ { int level = atoi(gedcom_text); - if ((level < 0) || (level > MAXGEDCLEVEL)) { - gedcom_error ("Level number out of range [0..%d]", - MAXGEDCLEVEL); - return BADTOKEN; - } - level_diff = level - current_level; - BEGIN(EXPECT_TAG); - current_level = level; - if (level_diff < 1) { - level_diff++; - return CLOSE; - } - else if (level_diff == 1) { - level_diff++; - return OPEN; - } - else { - /* should never happen (error to GEDCOM spec) */ - gedcom_error ("GEDCOM level number is %d higher than " - "previous", - level_diff); - return BADTOKEN; - } - } - -ABBR MKTAGACTION(ABBR) -ADDR MKTAGACTION(ADDR) -ADR1 MKTAGACTION(ADR1) -ADR2 MKTAGACTION(ADR2) -ADOP MKTAGACTION(ADOP) -AFN MKTAGACTION(AFN) -AGE MKTAGACTION(AGE) -AGNC MKTAGACTION(AGNC) -ALIA MKTAGACTION(ALIA) -ANCE MKTAGACTION(ANCE) -ANCI MKTAGACTION(ANCI) -ANUL MKTAGACTION(ANUL) -ASSO MKTAGACTION(ASSO) -AUTH MKTAGACTION(AUTH) -BAPL MKTAGACTION(BAPL) -BAPM MKTAGACTION(BAPM) -BARM MKTAGACTION(BARM) -BASM MKTAGACTION(BASM) -BIRT MKTAGACTION(BIRT) -BLES MKTAGACTION(BLES) -BLOB MKTAGACTION(BLOB) -BURI MKTAGACTION(BURI) -CALN MKTAGACTION(CALN) -CAST MKTAGACTION(CAST) -CAUS MKTAGACTION(CAUS) -CENS MKTAGACTION(CENS) -CHAN MKTAGACTION(CHAN) -CHAR MKTAGACTION(CHAR) -CHIL MKTAGACTION(CHIL) -CHR MKTAGACTION(CHR) -CHRA MKTAGACTION(CHRA) -CITY MKTAGACTION(CITY) -CONC MKTAGACTION(CONC) -CONF MKTAGACTION(CONF) -CONL MKTAGACTION(CONL) -CONT MKTAGACTION(CONT) -COPR MKTAGACTION(COPR) -CORP MKTAGACTION(CORP) -CREM MKTAGACTION(CREM) -CTRY MKTAGACTION(CTRY) -DATA MKTAGACTION(DATA) -DATE MKTAGACTION(DATE) -DEAT MKTAGACTION(DEAT) -DESC MKTAGACTION(DESC) -DESI MKTAGACTION(DESI) -DEST MKTAGACTION(DEST) -DIV MKTAGACTION(DIV) -DIVF MKTAGACTION(DIVF) -DSCR MKTAGACTION(DSCR) -EDUC MKTAGACTION(EDUC) -EMIG MKTAGACTION(EMIG) -ENDL MKTAGACTION(ENDL) -ENGA MKTAGACTION(ENGA) -EVEN MKTAGACTION(EVEN) -FAM MKTAGACTION(FAM) -FAMC MKTAGACTION(FAMC) -FAMF MKTAGACTION(FAMF) -FAMS MKTAGACTION(FAMS) -FCOM MKTAGACTION(FCOM) -FILE MKTAGACTION(FILE) -FORM MKTAGACTION(FORM) -GEDC MKTAGACTION(GEDC) -GIVN MKTAGACTION(GIVN) -GRAD MKTAGACTION(GRAD) -HEAD MKTAGACTION(HEAD) -HUSB MKTAGACTION(HUSB) -IDNO MKTAGACTION(IDNO) -IMMI MKTAGACTION(IMMI) -INDI MKTAGACTION(INDI) -LANG MKTAGACTION(LANG) -LEGA MKTAGACTION(LEGA) -MARB MKTAGACTION(MARB) -MARC MKTAGACTION(MARC) -MARL MKTAGACTION(MARL) -MARR MKTAGACTION(MARR) -MARS MKTAGACTION(MARS) -MEDI MKTAGACTION(MEDI) -NAME MKTAGACTION(NAME) -NATI MKTAGACTION(NATI) -NATU MKTAGACTION(NATU) -NCHI MKTAGACTION(NCHI) -NICK MKTAGACTION(NICK) -NMR MKTAGACTION(NMR) -NOTE MKTAGACTION(NOTE) -NPFX MKTAGACTION(NPFX) -NSFX MKTAGACTION(NSFX) -OBJE MKTAGACTION(OBJE) -OCCU MKTAGACTION(OCCU) -ORDI MKTAGACTION(ORDI) -ORDN MKTAGACTION(ORDN) -PAGE MKTAGACTION(PAGE) -PEDI MKTAGACTION(PEDI) -PHON MKTAGACTION(PHON) -PLAC MKTAGACTION(PLAC) -POST MKTAGACTION(POST) -PROB MKTAGACTION(PROB) -PROP MKTAGACTION(PROP) -PUBL MKTAGACTION(PUBL) -QUAY MKTAGACTION(QUAY) -REFN MKTAGACTION(REFN) -RELA MKTAGACTION(RELA) -RELI MKTAGACTION(RELI) -REPO MKTAGACTION(REPO) -RESI MKTAGACTION(RESI) -RESN MKTAGACTION(RESN) -RETI MKTAGACTION(RETI) -RFN MKTAGACTION(RFN) -RIN MKTAGACTION(RIN) -ROLE MKTAGACTION(ROLE) -SEX MKTAGACTION(SEX) -SLGC MKTAGACTION(SLGC) -SLGS MKTAGACTION(SLGS) -SOUR MKTAGACTION(SOUR) -SPFX MKTAGACTION(SPFX) -SSN MKTAGACTION(SSN) -STAE MKTAGACTION(STAE) -STAT MKTAGACTION(STAT) -SUBM MKTAGACTION(SUBM) -SUBN MKTAGACTION(SUBN) -SURN MKTAGACTION(SURN) -TEMP MKTAGACTION(TEMP) -TEXT MKTAGACTION(TEXT) -TIME MKTAGACTION(TIME) -TITL MKTAGACTION(TITL) -TRLR MKTAGACTION(TRLR) -TYPE MKTAGACTION(TYPE) -VERS MKTAGACTION(VERS) -WIFE MKTAGACTION(WIFE) -WILL MKTAGACTION(WILL) - -{alphanum}+ { if (strlen(gedcom_text) > MAXGEDCTAGLEN) { - gedcom_error("Tag '%s' too long, max %d chars"); - return BADTOKEN; - } - strncpy(string_buf, gedcom_text, MAXGEDCTAGLEN+1); - gedcom_lval.string = string_buf; - BEGIN(NORMAL); - return USERTAG; - } - -{delim} { gedcom_lval.string = gedcom_text; - return DELIM; - } - -{any_but_delim} { gedcom_lval.string = gedcom_text; - return ANYCHAR; - } - -{escape}/{non_at} { gedcom_lval.string = gedcom_text; - return ESCAPE; - } - -{pointer} { gedcom_lval.string = gedcom_text; - return POINTER; - } - - /* Due to the conversion of level numbers into brackets, the - terminator is not important, so no token is returned here. - Although not strictly according to the GEDCOM spec, we'll ignore - whitespace just before the terminator. - */ - -{gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); } - - /* Eventually we have to return 1 closing bracket (for the trailer). - We can detect whether we have sent the closing bracket using the - level_diff (at eof, first it is 2, then we increment it ourselves) */ - -<> { if (level_diff == 2) { - level_diff++; - return CLOSE; - } - else { - yyterminate(); - } - } - -. { gedcom_error("Unexpected character: '%s' (0x%02x)", - gedcom_text, gedcom_text[0]); - return BADTOKEN; - } - -%% - -int gedcom_wrap() -{ - return 1; -} diff --git a/gedcom.y b/gedcom.y index b455587..7b8b2ef 100644 --- a/gedcom.y +++ b/gedcom.y @@ -122,6 +122,7 @@ %{ #include "gedcom.h" +#include "multilex.h" int count_level = 0; int fail = 0; @@ -129,7 +130,7 @@ int compat_enabled = 1; int gedcom_high_level_debug = 0; int compatibility = 0; MECHANISM error_mechanism=IMMED_FAIL; -char string_buf[MAXGEDCLINELEN+1]; +char string_buf[MAXGEDCLINELEN*4+1]; char *string_buf_ptr; enum _COMPAT { @@ -2109,10 +2110,15 @@ opt_line_item : /* empty */ { } | DELIM line_item { } ; -line_item : anychar { CLEAR_BUFFER(string_buf); +line_item : anychar { int i; + CLEAR_BUFFER(string_buf); string_buf_ptr = string_buf; /* The following also takes care of '@@' */ - *string_buf_ptr++ = $1[0]; + if (!strncmp($1, "@@", 3)) + *string_buf_ptr++ = '@'; + else + for (i=0; i < strlen($1); i++) + *string_buf_ptr++ = $1[i]; $$ = string_buf; } | ESCAPE { CLEAR_BUFFER(string_buf); @@ -2126,8 +2132,13 @@ line_item : anychar { CLEAR_BUFFER(string_buf); YYERROR; } else { + int i; /* The following also takes care of '@@' */ - *string_buf_ptr++ = $2[0]; + if (!strncmp($2, "@@", 3)) + *string_buf_ptr++ = '@'; + else + for (i=0; i < strlen($2); i++) + *string_buf_ptr++ = $2[i]; $$ = string_buf; } } @@ -2436,3 +2447,4 @@ int compat_mode(int compat_flags) { return (compat_flags & compatibility); } + diff --git a/gedcom_lohi.lex b/gedcom_lohi.lex new file mode 100644 index 0000000..6d88b43 --- /dev/null +++ b/gedcom_lohi.lex @@ -0,0 +1,357 @@ +/* $Id$ */ +/* $Name$ */ + +/* In low-high order, a space is encoded as 0x20 0x00 */ +/* i.e. this is utf-16-le */ + +%{ +#include "gedcom.tab.h" +#include "gedcom.h" +#include "multilex.h" +#include "encoding.h" +%} + +%s NORMAL +%s EXPECT_TAG + +alpha [A-Za-z_]\x00 +digit [0-9]\x00 +delim \x20\x00 +tab [\t]\x00 +hash #\x00 +literal_at @\x00@\x00 +otherchar [\x21-\x22\x24-\x2F\x3A-\x3F\x5B-\x5E\x60\x7B-\x7E\x80-\xFF]\x00|[\x00-\xFF][\x01-\xFF] +terminator \x0D\x00|\x0A\x00|\x0D\x00\x0A\x00|\x0A\x00\x0D\x00 + +any_char {alpha}|{digit}|{otherchar}|{delim}|{hash}|{literal_at} +any_but_delim {alpha}|{digit}|{otherchar}|{hash}|{literal_at} +non_at {alpha}|{digit}|{otherchar}|{delim}|{hash} +alphanum {alpha}|{digit} +gen_delim {delim}|{tab} + +escape @\x00#\x00{any_char}+@\x00 +pointer @\x00{alphanum}{non_at}+@\x00 + +%{ +static int current_level=-1; +static int level_diff=MAXGEDCLEVEL; + +#ifdef LEXER_TEST +YYSTYPE gedcom_lval; +int line_no = 1; +#endif +%} + +%% + + /* The GEDCOM level number is converted into a sequence of opening + and closing brackets. Simply put, the following GEDCOM fragment: + + 0 HEAD + 1 SOUR genes + 2 VERS 1.6 + 2 NAME Genes + 1 DATE 07 OCT 2001 + ... + 0 TRLR + + is converted into: + + { HEAD (initial) + { SOUR genes (1 higher: no closing brackets) + { VERS 1.6 (1 higher: no closing brackets) + } { NAME Genes (same level: 1 closing bracket) + } } { DATE 07 OCT 2001 (1 lower: 2 closing brackets) + ... + } { TRLR } + + or more clearly: + + { HEAD + { SOUR genes + { VERS 1.6 } + { NAME Genes } } + { DATE 07 OCT 2001 + ... } + { TRLR } + + But because this means that one token is converted into a series + of tokens, there is some initial code following immediately here + that returns "pending" tokens. */ + +%{ +char string_buf[MAXGEDCLINELEN+1]; + +if (level_diff < 1) { + level_diff++; + return CLOSE; +} +else if (level_diff == 1) { + level_diff++; + return OPEN; +} +else { + /* out of brackets... */ +} + +#define TO_INTERNAL(str) to_internal(str, yyleng) + +#define MKTAGACTION(tag) \ + { gedcom_lval.string = TO_INTERNAL(yytext); \ + BEGIN(NORMAL); \ + return TAG_##tag; } + +%} + +{gen_delim}* /* ignore leading whitespace (also tabs) */ + +\x00[0]{digit}+ { gedcom_error ("Level number with leading zero"); + return BADTOKEN; + } + +{digit}+ { int level = atoi(TO_INTERNAL(yytext)); + if ((level < 0) || (level > MAXGEDCLEVEL)) { + gedcom_error ("Level number out of range [0..%d]", + MAXGEDCLEVEL); + return BADTOKEN; + } + level_diff = level - current_level; + BEGIN(EXPECT_TAG); + current_level = level; + if (level_diff < 1) { + level_diff++; + return CLOSE; + } + else if (level_diff == 1) { + level_diff++; + return OPEN; + } + else { + /* should never happen (error to GEDCOM spec) */ + gedcom_error ("GEDCOM level number is %d higher than " + "previous", + level_diff); + return BADTOKEN; + } + } + +A\x00B\x00B\x00R\x00 MKTAGACTION(ABBR) +A\x00D\x00D\x00R\x00 MKTAGACTION(ADDR) +A\x00D\x00R\x001\x00 MKTAGACTION(ADR1) +A\x00D\x00R\x002\x00 MKTAGACTION(ADR2) +A\x00D\x00O\x00P\x00 MKTAGACTION(ADOP) +A\x00F\x00N\x00 MKTAGACTION(AFN) +A\x00G\x00E\x00 MKTAGACTION(AGE) +A\x00G\x00N\x00C\x00 MKTAGACTION(AGNC) +A\x00L\x00I\x00A\x00 MKTAGACTION(ALIA) +A\x00N\x00C\x00E\x00 MKTAGACTION(ANCE) +A\x00N\x00C\x00I\x00 MKTAGACTION(ANCI) +A\x00N\x00U\x00L\x00 MKTAGACTION(ANUL) +A\x00S\x00S\x00O\x00 MKTAGACTION(ASSO) +A\x00U\x00T\x00H\x00 MKTAGACTION(AUTH) +B\x00A\x00P\x00L\x00 MKTAGACTION(BAPL) +B\x00A\x00P\x00M\x00 MKTAGACTION(BAPM) +B\x00A\x00R\x00M\x00 MKTAGACTION(BARM) +B\x00A\x00S\x00M\x00 MKTAGACTION(BASM) +B\x00I\x00R\x00T\x00 MKTAGACTION(BIRT) +B\x00L\x00E\x00S\x00 MKTAGACTION(BLES) +B\x00L\x00O\x00B\x00 MKTAGACTION(BLOB) +B\x00U\x00R\x00I\x00 MKTAGACTION(BURI) +C\x00A\x00L\x00N\x00 MKTAGACTION(CALN) +C\x00A\x00S\x00T\x00 MKTAGACTION(CAST) +C\x00A\x00U\x00S\x00 MKTAGACTION(CAUS) +C\x00E\x00N\x00S\x00 MKTAGACTION(CENS) +C\x00H\x00A\x00N\x00 MKTAGACTION(CHAN) +C\x00H\x00A\x00R\x00 MKTAGACTION(CHAR) +C\x00H\x00I\x00L\x00 MKTAGACTION(CHIL) +C\x00H\x00R\x00 MKTAGACTION(CHR) +C\x00H\x00R\x00A\x00 MKTAGACTION(CHRA) +C\x00I\x00T\x00Y\x00 MKTAGACTION(CITY) +C\x00O\x00N\x00C\x00 MKTAGACTION(CONC) +C\x00O\x00N\x00F\x00 MKTAGACTION(CONF) +C\x00O\x00N\x00L\x00 MKTAGACTION(CONL) +C\x00O\x00N\x00T\x00 MKTAGACTION(CONT) +C\x00O\x00P\x00R\x00 MKTAGACTION(COPR) +C\x00O\x00R\x00P\x00 MKTAGACTION(CORP) +C\x00R\x00E\x00M\x00 MKTAGACTION(CREM) +C\x00T\x00R\x00Y\x00 MKTAGACTION(CTRY) +D\x00A\x00T\x00A\x00 MKTAGACTION(DATA) +D\x00A\x00T\x00E\x00 MKTAGACTION(DATE) +D\x00E\x00A\x00T\x00 MKTAGACTION(DEAT) +D\x00E\x00S\x00C\x00 MKTAGACTION(DESC) +D\x00E\x00S\x00I\x00 MKTAGACTION(DESI) +D\x00E\x00S\x00T\x00 MKTAGACTION(DEST) +D\x00I\x00V\x00 MKTAGACTION(DIV) +D\x00I\x00V\x00F\x00 MKTAGACTION(DIVF) +D\x00S\x00C\x00R\x00 MKTAGACTION(DSCR) +E\x00D\x00U\x00C\x00 MKTAGACTION(EDUC) +E\x00M\x00I\x00G\x00 MKTAGACTION(EMIG) +E\x00N\x00D\x00L\x00 MKTAGACTION(ENDL) +E\x00N\x00G\x00A\x00 MKTAGACTION(ENGA) +E\x00V\x00E\x00N\x00 MKTAGACTION(EVEN) +F\x00A\x00M\x00 MKTAGACTION(FAM) +F\x00A\x00M\x00C\x00 MKTAGACTION(FAMC) +F\x00A\x00M\x00F\x00 MKTAGACTION(FAMF) +F\x00A\x00M\x00S\x00 MKTAGACTION(FAMS) +F\x00C\x00O\x00M\x00 MKTAGACTION(FCOM) +F\x00I\x00L\x00E\x00 MKTAGACTION(FILE) +F\x00O\x00R\x00M\x00 MKTAGACTION(FORM) +G\x00E\x00D\x00C\x00 MKTAGACTION(GEDC) +G\x00I\x00V\x00N\x00 MKTAGACTION(GIVN) +G\x00R\x00A\x00D\x00 MKTAGACTION(GRAD) +H\x00E\x00A\x00D\x00 MKTAGACTION(HEAD) +H\x00U\x00S\x00B\x00 MKTAGACTION(HUSB) +I\x00D\x00N\x00O\x00 MKTAGACTION(IDNO) +I\x00M\x00M\x00I\x00 MKTAGACTION(IMMI) +I\x00N\x00D\x00I\x00 MKTAGACTION(INDI) +L\x00A\x00N\x00G\x00 MKTAGACTION(LANG) +L\x00E\x00G\x00A\x00 MKTAGACTION(LEGA) +M\x00A\x00R\x00B\x00 MKTAGACTION(MARB) +M\x00A\x00R\x00C\x00 MKTAGACTION(MARC) +M\x00A\x00R\x00L\x00 MKTAGACTION(MARL) +M\x00A\x00R\x00R\x00 MKTAGACTION(MARR) +M\x00A\x00R\x00S\x00 MKTAGACTION(MARS) +M\x00E\x00D\x00I\x00 MKTAGACTION(MEDI) +N\x00A\x00M\x00E\x00 MKTAGACTION(NAME) +N\x00A\x00T\x00I\x00 MKTAGACTION(NATI) +N\x00A\x00T\x00U\x00 MKTAGACTION(NATU) +N\x00C\x00H\x00I\x00 MKTAGACTION(NCHI) +N\x00I\x00C\x00K\x00 MKTAGACTION(NICK) +N\x00M\x00R\x00 MKTAGACTION(NMR) +N\x00O\x00T\x00E\x00 MKTAGACTION(NOTE) +N\x00P\x00F\x00X\x00 MKTAGACTION(NPFX) +N\x00S\x00F\x00X\x00 MKTAGACTION(NSFX) +O\x00B\x00J\x00E\x00 MKTAGACTION(OBJE) +O\x00C\x00C\x00U\x00 MKTAGACTION(OCCU) +O\x00R\x00D\x00I\x00 MKTAGACTION(ORDI) +O\x00R\x00D\x00N\x00 MKTAGACTION(ORDN) +P\x00A\x00G\x00E\x00 MKTAGACTION(PAGE) +P\x00E\x00D\x00I\x00 MKTAGACTION(PEDI) +P\x00H\x00O\x00N\x00 MKTAGACTION(PHON) +P\x00L\x00A\x00C\x00 MKTAGACTION(PLAC) +P\x00O\x00S\x00T\x00 MKTAGACTION(POST) +P\x00R\x00O\x00B\x00 MKTAGACTION(PROB) +P\x00R\x00O\x00P\x00 MKTAGACTION(PROP) +P\x00U\x00B\x00L\x00 MKTAGACTION(PUBL) +Q\x00U\x00A\x00Y\x00 MKTAGACTION(QUAY) +R\x00E\x00F\x00N\x00 MKTAGACTION(REFN) +R\x00E\x00L\x00A\x00 MKTAGACTION(RELA) +R\x00E\x00L\x00I\x00 MKTAGACTION(RELI) +R\x00E\x00P\x00O\x00 MKTAGACTION(REPO) +R\x00E\x00S\x00I\x00 MKTAGACTION(RESI) +R\x00E\x00S\x00N\x00 MKTAGACTION(RESN) +R\x00E\x00T\x00I\x00 MKTAGACTION(RETI) +R\x00F\x00N\x00 MKTAGACTION(RFN) +R\x00I\x00N\x00 MKTAGACTION(RIN) +R\x00O\x00L\x00E\x00 MKTAGACTION(ROLE) +S\x00E\x00X\x00 MKTAGACTION(SEX) +S\x00L\x00G\x00C\x00 MKTAGACTION(SLGC) +S\x00L\x00G\x00S\x00 MKTAGACTION(SLGS) +S\x00O\x00U\x00R\x00 MKTAGACTION(SOUR) +S\x00P\x00F\x00X\x00 MKTAGACTION(SPFX) +S\x00S\x00N\x00 MKTAGACTION(SSN) +S\x00T\x00A\x00E\x00 MKTAGACTION(STAE) +S\x00T\x00A\x00T\x00 MKTAGACTION(STAT) +S\x00U\x00B\x00M\x00 MKTAGACTION(SUBM) +S\x00U\x00B\x00N\x00 MKTAGACTION(SUBN) +S\x00U\x00R\x00N\x00 MKTAGACTION(SURN) +T\x00E\x00M\x00P\x00 MKTAGACTION(TEMP) +T\x00E\x00X\x00T\x00 MKTAGACTION(TEXT) +T\x00I\x00M\x00E\x00 MKTAGACTION(TIME) +T\x00I\x00T\x00L\x00 MKTAGACTION(TITL) +T\x00R\x00L\x00R\x00 MKTAGACTION(TRLR) +T\x00Y\x00P\x00E\x00 MKTAGACTION(TYPE) +V\x00E\x00R\x00S\x00 MKTAGACTION(VERS) +W\x00I\x00F\x00E\x00 MKTAGACTION(WIFE) +W\x00I\x00L\x00L\x00 MKTAGACTION(WILL) + +{alphanum}+ { if (strlen(yytext) > MAXGEDCTAGLEN) { + gedcom_error("Tag '%s' too long, max %d chars"); + return BADTOKEN; + } + strncpy(string_buf, yytext, MAXGEDCTAGLEN+1); + gedcom_lval.string = TO_INTERNAL(string_buf); + BEGIN(NORMAL); + return USERTAG; + } + +{delim} { gedcom_lval.string = TO_INTERNAL(yytext); + return DELIM; + } + +{any_but_delim} { gedcom_lval.string = TO_INTERNAL(yytext); + return ANYCHAR; + } + +{escape}/{non_at} { gedcom_lval.string = TO_INTERNAL(yytext); + return ESCAPE; + } + +{pointer} { gedcom_lval.string = TO_INTERNAL(yytext); + return POINTER; + } + + /* Due to the conversion of level numbers into brackets, the + terminator is not important, so no token is returned here. + Although not strictly according to the GEDCOM spec, we'll ignore + whitespace just before the terminator. + */ + +{gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); } + + /* Eventually we have to return 1 closing bracket (for the trailer). + We can detect whether we have sent the closing bracket using the + level_diff (at eof, first it is 2, then we increment it ourselves) */ + +<> { if (level_diff == 2) { + level_diff++; + return CLOSE; + } + else { + yyterminate(); + } + } + +. { gedcom_error("Unexpected character: '%s' (0x%02x)", + yytext, yytext[0]); + return BADTOKEN; + } + +%% + +int yywrap() +{ + return 1; +} + +#ifdef LEXER_TEST + +int main() +{ + int tok; + int res = open_conv_to_internal("UTF16LE"); + if (!res) { + gedcom_error("Unable to open conversion context: %s", + strerror(errno)); + return 1; + } + tok = gedcom_lohi_lex(); + while (tok) { + switch(tok) { + case BADTOKEN: printf("BADTOKEN "); break; + case OPEN: printf("OPEN "); break; + case CLOSE: printf("CLOSE "); break; + case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break; + case DELIM: printf("DELIM "); break; + case ANYCHAR: printf("%s ", gedcom_lval.string); break; + case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break; + case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break; + default: printf("TAG(%s) ", gedcom_lval.string); break; + } + tok = gedcom_lohi_lex(); + } + printf("\n"); + close_conv_to_internal(); + return 0; +} +#endif diff --git a/standalone.c b/standalone.c index 41c7752..452c1db 100644 --- a/standalone.c +++ b/standalone.c @@ -2,6 +2,7 @@ /* $Name$ */ #include "gedcom.h" +#include "multilex.h" void show_help () { @@ -17,65 +18,6 @@ void show_help () printf(" -da Debug setting: libgedcom + yacc debug messages\n"); } -int determine_encoding(FILE* f) -{ - char first[2]; - - fread(first, 1, 2, f); - if ((first[0] == '0') && (first[1] == ' ')) { - gedcom_warning("One-byte encoding"); - fseek(f, 0, 0); - return ONE_BYTE; - } - else if ((first[0] == '\0') && (first[1] == '0')) - { - gedcom_warning("Two-byte encoding, high-low"); - fseek(f, 0, 0); - return TWO_BYTE_HILO; - } - else if ((first[0] == '\xFE') && (first[1] == '\xFF')) - { - gedcom_warning("Two-byte encoding, high-low, with BOM"); - return TWO_BYTE_HILO; - } - else if ((first[0] == '0') && (first[1] == '\0')) - { - gedcom_warning("Two-byte encoding, low-high"); - fseek(f, 0, 0); - return TWO_BYTE_LOHI; - } - else if ((first[0] == '\xFF') && (first[1] == '\xFE')) - { - gedcom_warning("Two-byte encoding, low-high, with BOM"); - return TWO_BYTE_LOHI; - } - else { - gedcom_warning("Unknown encoding, falling back to one-byte"); - fseek(f, 0, 0); - return ONE_BYTE; - } -} - -int gedcom_xxx_parse(char* file_name) -{ - ENCODING enc; - FILE* file = fopen (file_name, "r"); - if (!file) { - printf("Could not open file '%s'\n", file_name); - exit(1); - } - enc = determine_encoding(file); - - if (enc == ONE_BYTE) { - gedcom_in = file; - return gedcom_parse(); - } - else { - printf("No parser yet for encoding\n"); - exit(1); - } -} - int main(int argc, char* argv[]) { MECHANISM mech = IMMED_FAIL; @@ -133,31 +75,3 @@ int main(int argc, char* argv[]) return 1; } } - -int gedcom_warning(char* s, ...) -{ - int res; - va_list ap; - - va_start(ap, s); - fprintf(stderr, "Warning on line %d: ", line_no); - res = vfprintf(stderr, s, ap); - fprintf(stderr, "\n"); - va_end(ap); - - return res; -} - -int gedcom_error(char* s, ...) -{ - int res; - va_list ap; - - va_start(ap, s); - fprintf(stderr, "Error on line %d: ", line_no); - res = vfprintf(stderr, s, ap); - fprintf(stderr, "\n"); - va_end(ap); - - return res; -} -- 2.30.2