From a54348309c92f4d7a2dd66b4055122a7be19ca28 Mon Sep 17 00:00:00 2001 From: Peter Verthez Date: Sun, 25 Nov 2001 12:53:20 +0000 Subject: [PATCH] General cleanup. --- Makefile | 8 +- ansel/ANSI_Z39.47.c | 4 +- ansel/Makefile | 8 +- ansel/gconv-modules | 2 + encoding.c | 9 +- encoding.h | 3 + gedcom.enc | 9 +- gedcom.h | 1 + gedcom_1byte.lex | 364 ++++++++++++++++++++++++++++++++++++++++++++ gedcom_hilo.lex | 361 +++++++++++++++++++++++++++++++++++++++++++ message.c | 45 ++++++ multilex.c | 107 +++++++++++++ multilex.h | 18 +++ 13 files changed, 932 insertions(+), 7 deletions(-) create mode 100644 gedcom_1byte.lex create mode 100644 gedcom_hilo.lex create mode 100644 message.c create mode 100644 multilex.c create mode 100644 multilex.h diff --git a/Makefile b/Makefile index b7585f3..63be764 100644 --- a/Makefile +++ b/Makefile @@ -8,12 +8,15 @@ CFLAGS=-g -W -Wall -pedantic YFLAGS=--debug --defines LFLAGS=-8 +all: ansel_module gedcom_parse + gedcom_parse: standalone.o lex.gedcom_1byte_.o lex.gedcom_hilo_.o \ lex.gedcom_lohi_.o gedcom.tab.o message.o multilex.o \ encoding.o $(CC) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@ -libgedcom.so: +ansel_module: + cd ansel && $(MAKE) lex.gedcom_1byte_.c: gedcom_1byte.lex gedcom.tab.h gedcom.h multilex.h $(LEX) $(LFLAGS) -Pgedcom_1byte_ gedcom_1byte.lex @@ -30,6 +33,7 @@ gedcom.tab.c gedcom.tab.h: gedcom.y gedcom.h clean: rm -f core gedcom_parse test_* *.o lex.gedcom_* \ gedcom.tab.* gedcom.output + cd ansel && $(MAKE) clean # Lexer test programs @@ -53,7 +57,7 @@ lex.gedcom_lohi_.test.o: lex.gedcom_lohi_.c # Test of parser -test: gedcom_parse +test: all @export GCONV_PATH=./ansel; \ for file in t/*.ged; do \ echo "=== testing $$file"; \ diff --git a/ansel/ANSI_Z39.47.c b/ansel/ANSI_Z39.47.c index 5826d39..a8f3cee 100644 --- a/ansel/ANSI_Z39.47.c +++ b/ansel/ANSI_Z39.47.c @@ -1,3 +1,6 @@ +/* $Id$ */ +/* $Name$ */ + /* Generic conversion to and from ANSI Z39.47 (also known as ANSEL) Based on the ansi_x3.110.c file from the glibc 
sources Data coming from: @@ -11,7 +14,6 @@ #include #include #include -#include static const uint32_t to_ucs4[256] = { diff --git a/ansel/Makefile b/ansel/Makefile index 1110388..da733bd 100644 --- a/ansel/Makefile +++ b/ansel/Makefile @@ -1,15 +1,19 @@ +# $Id$ +# $Name$ + LIBTOOL=libtool MODPATH=/usr/local/lib MODULES=ANSI_Z39.47.so +CFLAGS=-g all: $(MODULES) %.so: %.lo - $(LIBTOOL) $(CC) -module -avoid-version -o $*.la $^ -rpath $(MODPATH) + $(LIBTOOL) $(CC) -module -avoid-version $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $*.la -rpath $(MODPATH) mv .libs/$@ $@ %.lo: %.c - $(LIBTOOL) $(CC) -c $^ + $(LIBTOOL) $(CC) -c $(CPPFLAGS) $(CFLAGS) $^ clean: rm -rf .libs diff --git a/ansel/gconv-modules b/ansel/gconv-modules index 5329bb2..16a7aa7 100644 --- a/ansel/gconv-modules +++ b/ansel/gconv-modules @@ -1,3 +1,5 @@ +# $Id$ +# $Name$ # All lines contain the following information: # If the lines start with `module' diff --git a/encoding.c b/encoding.c index 6c342e0..9d842cc 100644 --- a/encoding.c +++ b/encoding.c @@ -1,3 +1,6 @@ +/* $Id$ */ +/* $Name$ */ + #include #include #include @@ -76,7 +79,7 @@ void init_encodings() ENCODING_CONF_FILE); return; } - else if (buffer[0] != '#') { + else if ((buffer[0] != '#') && (strcmp(buffer, "\n") != 0)) { if (sscanf(buffer, "%s %s %s", gedcom_n, charwidth, iconv_n) == 3) { add_encoding(gedcom_n, charwidth, iconv_n); } @@ -116,6 +119,10 @@ int open_conv_to_internal(char* fromcode) memset(conv_buf, 0, sizeof(conv_buf)); conv_buf_size = 0; cd_to_internal = iconv_open(INTERNAL_ENCODING, encoding); + if (cd_to_internal == (iconv_t) -1) { + gedcom_error("Error opening conversion context for encoding %s: %s", + encoding, strerror(errno)); + } } return (cd_to_internal != (iconv_t) -1); } diff --git a/encoding.h b/encoding.h index 12473d2..3e9af43 100644 --- a/encoding.h +++ b/encoding.h @@ -1,3 +1,6 @@ +/* $Id$ */ +/* $Name$ */ + /* Basic file encoding */ #ifndef __ENCODING_H #define __ENCODING_H diff --git a/gedcom.enc b/gedcom.enc 
index b47dc56..7f370a7 100644 --- a/gedcom.enc +++ b/gedcom.enc @@ -1,11 +1,18 @@ +# $Id$ +# $Name$ + # Mapping of charsets for gedcom parsing # Each line contains (separated by whitespace): # - the gedcom name # - a token identifying the width of characters and the ordering; # currently supported values: 1, 2_LOHI, 2_HILO # - the iconv name of the charset + +# First the encodings supported by the GEDCOM standard UNICODE 2_LOHI UTF16LE UNICODE 2_HILO UTF16BE ASCII 1 ASCII -ANSI 1 CP1252 ANSEL 1 ANSEL + +# Then some very frequently used non-standard encodings: +ANSI 1 CP1252 diff --git a/gedcom.h b/gedcom.h index 82836e8..987a382 100644 --- a/gedcom.h +++ b/gedcom.h @@ -1,5 +1,6 @@ /* $Id$ */ /* $Name$ */ + #ifndef __GEDCOM_H #define __GEDCOM_H #include diff --git a/gedcom_1byte.lex b/gedcom_1byte.lex new file mode 100644 index 0000000..7121973 --- /dev/null +++ b/gedcom_1byte.lex @@ -0,0 +1,364 @@ +/* $Id$ */ +/* $Name$ */ + +%{ +#include "gedcom.tab.h" +#include "gedcom.h" +#include "multilex.h" +#include "encoding.h" + +#define YY_NO_UNPUT +%} + +%s NORMAL +%s EXPECT_TAG + +alpha [A-Za-z_] +digit [0-9] +delim " " +tab [\t] +hash # +literal_at @@ +otherchar [\x21-\x22\x24-\x2F\x3A-\x3F\x5B-\x5E\x60\x7B-\x7E\x80-\xFE] +terminator \x0D|\x0A|\x0D\x0A|\x0A\x0D + +any_char {alpha}|{digit}|{otherchar}|{delim}|{hash}|{literal_at} +any_but_delim {alpha}|{digit}|{otherchar}|{hash}|{literal_at} +non_at {alpha}|{digit}|{otherchar}|{delim}|{hash} +alphanum {alpha}|{digit} +gen_delim {delim}|{tab} + +escape @#{any_char}+@ +pointer @{alphanum}{non_at}+@ + +%{ +static int current_level=-1; +static int level_diff=MAXGEDCLEVEL; + +#ifdef LEXER_TEST +YYSTYPE gedcom_lval; +int line_no = 1; +#endif + +%} + +%% + + /* The GEDCOM level number is converted into a sequence of opening + and closing brackets. Simply put, the following GEDCOM fragment: + + 0 HEAD + 1 SOUR genes + 2 VERS 1.6 + 2 NAME Genes + 1 DATE 07 OCT 2001 + ... 
+ 0 TRLR + + is converted into: + + { HEAD (initial) + { SOUR genes (1 higher: no closing brackets) + { VERS 1.6 (1 higher: no closing brackets) + } { NAME Genes (same level: 1 closing bracket) + } } { DATE 07 OCT 2001 (1 lower: 2 closing brackets) + ... + } { TRLR } + + or more clearly: + + { HEAD + { SOUR genes + { VERS 1.6 } + { NAME Genes } } + { DATE 07 OCT 2001 + ... } + { TRLR } + + But because this means that one token is converted into a series + of tokens, there is some initial code following immediately here + that returns "pending" tokens. */ + +%{ +char string_buf[MAXGEDCLINELEN+1]; + +if (level_diff < 1) { + level_diff++; + return CLOSE; +} +else if (level_diff == 1) { + level_diff++; + return OPEN; +} +else { + /* out of brackets... */ +} + +#define TO_INTERNAL(str) to_internal(str, yyleng) + +#define MKTAGACTION(tag) \ + { gedcom_lval.string = TO_INTERNAL(yytext); \ + BEGIN(NORMAL); \ + return TAG_##tag; } + +%} + +{gen_delim}* /* ignore leading whitespace (also tabs) */ + +0{digit}+ { gedcom_error ("Level number with leading zero"); + return BADTOKEN; + } + +{digit}+ { int level = atoi(TO_INTERNAL(yytext)); + if ((level < 0) || (level > MAXGEDCLEVEL)) { + gedcom_error ("Level number out of range [0..%d]", + MAXGEDCLEVEL); + return BADTOKEN; + } + level_diff = level - current_level; + BEGIN(EXPECT_TAG); + current_level = level; + if (level_diff < 1) { + level_diff++; + return CLOSE; + } + else if (level_diff == 1) { + level_diff++; + return OPEN; + } + else { + /* should never happen (error to GEDCOM spec) */ + gedcom_error ("GEDCOM level number is %d higher than " + "previous", + level_diff); + return BADTOKEN; + } + } + +ABBR MKTAGACTION(ABBR) +ADDR MKTAGACTION(ADDR) +ADR1 MKTAGACTION(ADR1) +ADR2 MKTAGACTION(ADR2) +ADOP MKTAGACTION(ADOP) +AFN MKTAGACTION(AFN) +AGE MKTAGACTION(AGE) +AGNC MKTAGACTION(AGNC) +ALIA MKTAGACTION(ALIA) +ANCE MKTAGACTION(ANCE) +ANCI MKTAGACTION(ANCI) +ANUL MKTAGACTION(ANUL) +ASSO MKTAGACTION(ASSO) +AUTH MKTAGACTION(AUTH) 
+BAPL MKTAGACTION(BAPL) +BAPM MKTAGACTION(BAPM) +BARM MKTAGACTION(BARM) +BASM MKTAGACTION(BASM) +BIRT MKTAGACTION(BIRT) +BLES MKTAGACTION(BLES) +BLOB MKTAGACTION(BLOB) +BURI MKTAGACTION(BURI) +CALN MKTAGACTION(CALN) +CAST MKTAGACTION(CAST) +CAUS MKTAGACTION(CAUS) +CENS MKTAGACTION(CENS) +CHAN MKTAGACTION(CHAN) +CHAR MKTAGACTION(CHAR) +CHIL MKTAGACTION(CHIL) +CHR MKTAGACTION(CHR) +CHRA MKTAGACTION(CHRA) +CITY MKTAGACTION(CITY) +CONC MKTAGACTION(CONC) +CONF MKTAGACTION(CONF) +CONL MKTAGACTION(CONL) +CONT MKTAGACTION(CONT) +COPR MKTAGACTION(COPR) +CORP MKTAGACTION(CORP) +CREM MKTAGACTION(CREM) +CTRY MKTAGACTION(CTRY) +DATA MKTAGACTION(DATA) +DATE MKTAGACTION(DATE) +DEAT MKTAGACTION(DEAT) +DESC MKTAGACTION(DESC) +DESI MKTAGACTION(DESI) +DEST MKTAGACTION(DEST) +DIV MKTAGACTION(DIV) +DIVF MKTAGACTION(DIVF) +DSCR MKTAGACTION(DSCR) +EDUC MKTAGACTION(EDUC) +EMIG MKTAGACTION(EMIG) +ENDL MKTAGACTION(ENDL) +ENGA MKTAGACTION(ENGA) +EVEN MKTAGACTION(EVEN) +FAM MKTAGACTION(FAM) +FAMC MKTAGACTION(FAMC) +FAMF MKTAGACTION(FAMF) +FAMS MKTAGACTION(FAMS) +FCOM MKTAGACTION(FCOM) +FILE MKTAGACTION(FILE) +FORM MKTAGACTION(FORM) +GEDC MKTAGACTION(GEDC) +GIVN MKTAGACTION(GIVN) +GRAD MKTAGACTION(GRAD) +HEAD MKTAGACTION(HEAD) +HUSB MKTAGACTION(HUSB) +IDNO MKTAGACTION(IDNO) +IMMI MKTAGACTION(IMMI) +INDI MKTAGACTION(INDI) +LANG MKTAGACTION(LANG) +LEGA MKTAGACTION(LEGA) +MARB MKTAGACTION(MARB) +MARC MKTAGACTION(MARC) +MARL MKTAGACTION(MARL) +MARR MKTAGACTION(MARR) +MARS MKTAGACTION(MARS) +MEDI MKTAGACTION(MEDI) +NAME MKTAGACTION(NAME) +NATI MKTAGACTION(NATI) +NATU MKTAGACTION(NATU) +NCHI MKTAGACTION(NCHI) +NICK MKTAGACTION(NICK) +NMR MKTAGACTION(NMR) +NOTE MKTAGACTION(NOTE) +NPFX MKTAGACTION(NPFX) +NSFX MKTAGACTION(NSFX) +OBJE MKTAGACTION(OBJE) +OCCU MKTAGACTION(OCCU) +ORDI MKTAGACTION(ORDI) +ORDN MKTAGACTION(ORDN) +PAGE MKTAGACTION(PAGE) +PEDI MKTAGACTION(PEDI) +PHON MKTAGACTION(PHON) +PLAC MKTAGACTION(PLAC) +POST MKTAGACTION(POST) +PROB MKTAGACTION(PROB) +PROP MKTAGACTION(PROP) +PUBL 
MKTAGACTION(PUBL)
+QUAY  MKTAGACTION(QUAY)
+REFN  MKTAGACTION(REFN)
+RELA  MKTAGACTION(RELA)
+RELI  MKTAGACTION(RELI)
+REPO  MKTAGACTION(REPO)
+RESI  MKTAGACTION(RESI)
+RESN  MKTAGACTION(RESN)
+RETI  MKTAGACTION(RETI)
+RFN   MKTAGACTION(RFN)
+RIN   MKTAGACTION(RIN)
+ROLE  MKTAGACTION(ROLE)
+SEX   MKTAGACTION(SEX)
+SLGC  MKTAGACTION(SLGC)
+SLGS  MKTAGACTION(SLGS)
+SOUR  MKTAGACTION(SOUR)
+SPFX  MKTAGACTION(SPFX)
+SSN   MKTAGACTION(SSN)
+STAE  MKTAGACTION(STAE)
+STAT  MKTAGACTION(STAT)
+SUBM  MKTAGACTION(SUBM)
+SUBN  MKTAGACTION(SUBN)
+SURN  MKTAGACTION(SURN)
+TEMP  MKTAGACTION(TEMP)
+TEXT  MKTAGACTION(TEXT)
+TIME  MKTAGACTION(TIME)
+TITL  MKTAGACTION(TITL)
+TRLR  MKTAGACTION(TRLR)
+TYPE  MKTAGACTION(TYPE)
+VERS  MKTAGACTION(VERS)
+WIFE  MKTAGACTION(WIFE)
+WILL  MKTAGACTION(WILL)
+
+{alphanum}+ { if (strlen(yytext) > MAXGEDCTAGLEN) {
+                gedcom_error("Tag '%s' too long, max %d chars", yytext, MAXGEDCTAGLEN);
+                return BADTOKEN;
+              }
+              strncpy(string_buf, yytext, MAXGEDCTAGLEN+1);
+              gedcom_lval.string = TO_INTERNAL(string_buf);
+              BEGIN(NORMAL);
+              return USERTAG;
+            }
+
+{delim}      { gedcom_lval.string = TO_INTERNAL(yytext);
+               return DELIM;
+             }
+
+{any_but_delim} { gedcom_lval.string = TO_INTERNAL(yytext);
+                  /* Due to character conversions, it is possible
+                     that the current character will be combined with
+                     the next, and so now we don't have a character yet...
+                     This is only applicable to the 1byte case (e.g. ANSEL).
+                  */
+                  if (strlen(gedcom_lval.string) > 0)
+                    return ANYCHAR;
+                }
+
+{escape}/{non_at} { gedcom_lval.string = TO_INTERNAL(yytext);
+                    return ESCAPE;
+                  }
+
+{pointer}    { gedcom_lval.string = TO_INTERNAL(yytext);
+               return POINTER;
+             }
+
+ /* Due to the conversion of level numbers into brackets, the
+    terminator is not important, so no token is returned here.
+    Although not strictly according to the GEDCOM spec, we'll ignore
+    whitespace just before the terminator.
+ */
+
+{gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); }
+
+ /* Eventually we have to return 1 closing bracket (for the trailer).
+    We can detect whether we have sent the closing bracket using the
+    level_diff (at eof, first it is 2, then we increment it ourselves) */
+
+<<EOF>>      { if (level_diff == 2) {
+                 level_diff++;
+                 return CLOSE;
+               }
+               else {
+                 yyterminate();
+               }
+             }
+
+.            { gedcom_error("Unexpected character: '%s' (0x%02x)",
+                            yytext, (unsigned char)yytext[0]); /* no sign extension for bytes >= 0x80 */
+               return BADTOKEN;
+             }
+
+%%
+
+int yywrap()
+{
+  return 1;
+}
+
+#ifdef LEXER_TEST
+int main()
+{
+  int tok, res;
+  init_encodings();
+  set_encoding_width(ONE_BYTE);
+  res = open_conv_to_internal("ASCII");
+  if (!res) {
+    gedcom_error("Unable to open conversion context: %s",
+                 strerror(errno));
+    return 1;
+  }
+  tok = gedcom_1byte_lex();
+  while (tok) {
+    switch(tok) {
+      case BADTOKEN: printf("BADTOKEN "); break;
+      case OPEN: printf("OPEN "); break;
+      case CLOSE: printf("CLOSE "); break;
+      case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
+      case DELIM: printf("DELIM "); break;
+      case ANYCHAR: printf("%s ", gedcom_lval.string); break;
+      case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
+      case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break;
+      default: printf("TAG(%s) ", gedcom_lval.string); break;
+    }
+    tok = gedcom_1byte_lex();
+  }
+  printf("\n");
+  close_conv_to_internal();
+  return 0;
+}
+#endif
diff --git a/gedcom_hilo.lex b/gedcom_hilo.lex
new file mode 100644
index 0000000..d8a1da0
--- /dev/null
+++ b/gedcom_hilo.lex
@@ -0,0 +1,361 @@
+/* $Id$ */
+/* $Name$ */
+
+/* In high-low order, a space is encoded as 0x00 0x20 */
+/* i.e.
this is utf-16-be */ + +%{ +#include "gedcom.tab.h" +#include "gedcom.h" +#include "multilex.h" +#include "encoding.h" + +#define YY_NO_UNPUT +%} + +%s NORMAL +%s EXPECT_TAG + +alpha \x00[A-Za-z_] +digit \x00[0-9] +delim \x00\x20 +tab \x00[\t] +hash \x00# +literal_at \x00@\x00@ +otherchar \x00[\x21-\x22\x24-\x2F\x3A-\x3F\x5B-\x5E\x60\x7B-\x7E\x80-\xFF]|[\x01-\xFF][\x00-\xFF] +terminator \x00\x0D|\x00\x0A|\x00\x0D\x00\x0A|\x00\x0A\x00\x0D + +any_char {alpha}|{digit}|{otherchar}|{delim}|{hash}|{literal_at} +any_but_delim {alpha}|{digit}|{otherchar}|{hash}|{literal_at} +non_at {alpha}|{digit}|{otherchar}|{delim}|{hash} +alphanum {alpha}|{digit} +gen_delim {delim}|{tab} + +escape \x00@\x00#{any_char}+\x00@ +pointer \x00@{alphanum}{non_at}+\x00@ + +%{ +static int current_level=-1; +static int level_diff=MAXGEDCLEVEL; + +#ifdef LEXER_TEST +YYSTYPE gedcom_lval; +int line_no = 1; +#endif +%} + +%% + + /* The GEDCOM level number is converted into a sequence of opening + and closing brackets. Simply put, the following GEDCOM fragment: + + 0 HEAD + 1 SOUR genes + 2 VERS 1.6 + 2 NAME Genes + 1 DATE 07 OCT 2001 + ... + 0 TRLR + + is converted into: + + { HEAD (initial) + { SOUR genes (1 higher: no closing brackets) + { VERS 1.6 (1 higher: no closing brackets) + } { NAME Genes (same level: 1 closing bracket) + } } { DATE 07 OCT 2001 (1 lower: 2 closing brackets) + ... + } { TRLR } + + or more clearly: + + { HEAD + { SOUR genes + { VERS 1.6 } + { NAME Genes } } + { DATE 07 OCT 2001 + ... } + { TRLR } + + But because this means that one token is converted into a series + of tokens, there is some initial code following immediately here + that returns "pending" tokens. */ + +%{ +char string_buf[MAXGEDCLINELEN+1]; + +if (level_diff < 1) { + level_diff++; + return CLOSE; +} +else if (level_diff == 1) { + level_diff++; + return OPEN; +} +else { + /* out of brackets... 
*/ +} + +#define TO_INTERNAL(str) to_internal(str, yyleng) + +#define MKTAGACTION(tag) \ + { gedcom_lval.string = TO_INTERNAL(yytext); \ + BEGIN(NORMAL); \ + return TAG_##tag; } + +%} + +{gen_delim}* /* ignore leading whitespace (also tabs) */ + +\x00[0]{digit}+ { gedcom_error ("Level number with leading zero"); + return BADTOKEN; + } + +{digit}+ { int level = atoi(TO_INTERNAL(yytext)); + if ((level < 0) || (level > MAXGEDCLEVEL)) { + gedcom_error ("Level number out of range [0..%d]", + MAXGEDCLEVEL); + return BADTOKEN; + } + level_diff = level - current_level; + BEGIN(EXPECT_TAG); + current_level = level; + if (level_diff < 1) { + level_diff++; + return CLOSE; + } + else if (level_diff == 1) { + level_diff++; + return OPEN; + } + else { + /* should never happen (error to GEDCOM spec) */ + gedcom_error ("GEDCOM level number is %d higher than " + "previous", + level_diff); + return BADTOKEN; + } + } + +\x00A\x00B\x00B\x00R MKTAGACTION(ABBR) +\x00A\x00D\x00D\x00R MKTAGACTION(ADDR) +\x00A\x00D\x00R\x001 MKTAGACTION(ADR1) +\x00A\x00D\x00R\x002 MKTAGACTION(ADR2) +\x00A\x00D\x00O\x00P MKTAGACTION(ADOP) +\x00A\x00F\x00N MKTAGACTION(AFN) +\x00A\x00G\x00E MKTAGACTION(AGE) +\x00A\x00G\x00N\x00C MKTAGACTION(AGNC) +\x00A\x00L\x00I\x00A MKTAGACTION(ALIA) +\x00A\x00N\x00C\x00E MKTAGACTION(ANCE) +\x00A\x00N\x00C\x00I MKTAGACTION(ANCI) +\x00A\x00N\x00U\x00L MKTAGACTION(ANUL) +\x00A\x00S\x00S\x00O MKTAGACTION(ASSO) +\x00A\x00U\x00T\x00H MKTAGACTION(AUTH) +\x00B\x00A\x00P\x00L MKTAGACTION(BAPL) +\x00B\x00A\x00P\x00M MKTAGACTION(BAPM) +\x00B\x00A\x00R\x00M MKTAGACTION(BARM) +\x00B\x00A\x00S\x00M MKTAGACTION(BASM) +\x00B\x00I\x00R\x00T MKTAGACTION(BIRT) +\x00B\x00L\x00E\x00S MKTAGACTION(BLES) +\x00B\x00L\x00O\x00B MKTAGACTION(BLOB) +\x00B\x00U\x00R\x00I MKTAGACTION(BURI) +\x00C\x00A\x00L\x00N MKTAGACTION(CALN) +\x00C\x00A\x00S\x00T MKTAGACTION(CAST) +\x00C\x00A\x00U\x00S MKTAGACTION(CAUS) +\x00C\x00E\x00N\x00S MKTAGACTION(CENS) +\x00C\x00H\x00A\x00N MKTAGACTION(CHAN) 
+\x00C\x00H\x00A\x00R MKTAGACTION(CHAR) +\x00C\x00H\x00I\x00L MKTAGACTION(CHIL) +\x00C\x00H\x00R MKTAGACTION(CHR) +\x00C\x00H\x00R\x00A MKTAGACTION(CHRA) +\x00C\x00I\x00T\x00Y MKTAGACTION(CITY) +\x00C\x00O\x00N\x00C MKTAGACTION(CONC) +\x00C\x00O\x00N\x00F MKTAGACTION(CONF) +\x00C\x00O\x00N\x00L MKTAGACTION(CONL) +\x00C\x00O\x00N\x00T MKTAGACTION(CONT) +\x00C\x00O\x00P\x00R MKTAGACTION(COPR) +\x00C\x00O\x00R\x00P MKTAGACTION(CORP) +\x00C\x00R\x00E\x00M MKTAGACTION(CREM) +\x00C\x00T\x00R\x00Y MKTAGACTION(CTRY) +\x00D\x00A\x00T\x00A MKTAGACTION(DATA) +\x00D\x00A\x00T\x00E MKTAGACTION(DATE) +\x00D\x00E\x00A\x00T MKTAGACTION(DEAT) +\x00D\x00E\x00S\x00C MKTAGACTION(DESC) +\x00D\x00E\x00S\x00I MKTAGACTION(DESI) +\x00D\x00E\x00S\x00T MKTAGACTION(DEST) +\x00D\x00I\x00V MKTAGACTION(DIV) +\x00D\x00I\x00V\x00F MKTAGACTION(DIVF) +\x00D\x00S\x00C\x00R MKTAGACTION(DSCR) +\x00E\x00D\x00U\x00C MKTAGACTION(EDUC) +\x00E\x00M\x00I\x00G MKTAGACTION(EMIG) +\x00E\x00N\x00D\x00L MKTAGACTION(ENDL) +\x00E\x00N\x00G\x00A MKTAGACTION(ENGA) +\x00E\x00V\x00E\x00N MKTAGACTION(EVEN) +\x00F\x00A\x00M MKTAGACTION(FAM) +\x00F\x00A\x00M\x00C MKTAGACTION(FAMC) +\x00F\x00A\x00M\x00F MKTAGACTION(FAMF) +\x00F\x00A\x00M\x00S MKTAGACTION(FAMS) +\x00F\x00C\x00O\x00M MKTAGACTION(FCOM) +\x00F\x00I\x00L\x00E MKTAGACTION(FILE) +\x00F\x00O\x00R\x00M MKTAGACTION(FORM) +\x00G\x00E\x00D\x00C MKTAGACTION(GEDC) +\x00G\x00I\x00V\x00N MKTAGACTION(GIVN) +\x00G\x00R\x00A\x00D MKTAGACTION(GRAD) +\x00H\x00E\x00A\x00D MKTAGACTION(HEAD) +\x00H\x00U\x00S\x00B MKTAGACTION(HUSB) +\x00I\x00D\x00N\x00O MKTAGACTION(IDNO) +\x00I\x00M\x00M\x00I MKTAGACTION(IMMI) +\x00I\x00N\x00D\x00I MKTAGACTION(INDI) +\x00L\x00A\x00N\x00G MKTAGACTION(LANG) +\x00L\x00E\x00G\x00A MKTAGACTION(LEGA) +\x00M\x00A\x00R\x00B MKTAGACTION(MARB) +\x00M\x00A\x00R\x00C MKTAGACTION(MARC) +\x00M\x00A\x00R\x00L MKTAGACTION(MARL) +\x00M\x00A\x00R\x00R MKTAGACTION(MARR) +\x00M\x00A\x00R\x00S MKTAGACTION(MARS) +\x00M\x00E\x00D\x00I MKTAGACTION(MEDI) 
+\x00N\x00A\x00M\x00E MKTAGACTION(NAME) +\x00N\x00A\x00T\x00I MKTAGACTION(NATI) +\x00N\x00A\x00T\x00U MKTAGACTION(NATU) +\x00N\x00C\x00H\x00I MKTAGACTION(NCHI) +\x00N\x00I\x00C\x00K MKTAGACTION(NICK) +\x00N\x00M\x00R MKTAGACTION(NMR) +\x00N\x00O\x00T\x00E MKTAGACTION(NOTE) +\x00N\x00P\x00F\x00X MKTAGACTION(NPFX) +\x00N\x00S\x00F\x00X MKTAGACTION(NSFX) +\x00O\x00B\x00J\x00E MKTAGACTION(OBJE) +\x00O\x00C\x00C\x00U MKTAGACTION(OCCU) +\x00O\x00R\x00D\x00I MKTAGACTION(ORDI) +\x00O\x00R\x00D\x00N MKTAGACTION(ORDN) +\x00P\x00A\x00G\x00E MKTAGACTION(PAGE) +\x00P\x00E\x00D\x00I MKTAGACTION(PEDI) +\x00P\x00H\x00O\x00N MKTAGACTION(PHON) +\x00P\x00L\x00A\x00C MKTAGACTION(PLAC) +\x00P\x00O\x00S\x00T MKTAGACTION(POST) +\x00P\x00R\x00O\x00B MKTAGACTION(PROB) +\x00P\x00R\x00O\x00P MKTAGACTION(PROP) +\x00P\x00U\x00B\x00L MKTAGACTION(PUBL) +\x00Q\x00U\x00A\x00Y MKTAGACTION(QUAY) +\x00R\x00E\x00F\x00N MKTAGACTION(REFN) +\x00R\x00E\x00L\x00A MKTAGACTION(RELA) +\x00R\x00E\x00L\x00I MKTAGACTION(RELI) +\x00R\x00E\x00P\x00O MKTAGACTION(REPO) +\x00R\x00E\x00S\x00I MKTAGACTION(RESI) +\x00R\x00E\x00S\x00N MKTAGACTION(RESN) +\x00R\x00E\x00T\x00I MKTAGACTION(RETI) +\x00R\x00F\x00N MKTAGACTION(RFN) +\x00R\x00I\x00N MKTAGACTION(RIN) +\x00R\x00O\x00L\x00E MKTAGACTION(ROLE) +\x00S\x00E\x00X MKTAGACTION(SEX) +\x00S\x00L\x00G\x00C MKTAGACTION(SLGC) +\x00S\x00L\x00G\x00S MKTAGACTION(SLGS) +\x00S\x00O\x00U\x00R MKTAGACTION(SOUR) +\x00S\x00P\x00F\x00X MKTAGACTION(SPFX) +\x00S\x00S\x00N MKTAGACTION(SSN) +\x00S\x00T\x00A\x00E MKTAGACTION(STAE) +\x00S\x00T\x00A\x00T MKTAGACTION(STAT) +\x00S\x00U\x00B\x00M MKTAGACTION(SUBM) +\x00S\x00U\x00B\x00N MKTAGACTION(SUBN) +\x00S\x00U\x00R\x00N MKTAGACTION(SURN) +\x00T\x00E\x00M\x00P MKTAGACTION(TEMP) +\x00T\x00E\x00X\x00T MKTAGACTION(TEXT) +\x00T\x00I\x00M\x00E MKTAGACTION(TIME) +\x00T\x00I\x00T\x00L MKTAGACTION(TITL) +\x00T\x00R\x00L\x00R MKTAGACTION(TRLR) +\x00T\x00Y\x00P\x00E MKTAGACTION(TYPE) +\x00V\x00E\x00R\x00S MKTAGACTION(VERS) +\x00W\x00I\x00F\x00E 
MKTAGACTION(WIFE)
+\x00W\x00I\x00L\x00L  MKTAGACTION(WILL)
+
+{alphanum}+ { /* UTF-16BE: 2 bytes per char and yytext[0] is 0x00, so no strlen/strncpy */
+              if (yyleng > 2 * MAXGEDCTAGLEN) {
+                gedcom_error("Tag '%s' too long, max %d chars", TO_INTERNAL(yytext), MAXGEDCTAGLEN);
+                return BADTOKEN;
+              }
+              gedcom_lval.string = TO_INTERNAL(yytext);
+              BEGIN(NORMAL);
+              return USERTAG;
+            }
+
+{delim}      { gedcom_lval.string = TO_INTERNAL(yytext);
+               return DELIM;
+             }
+
+{any_but_delim} { gedcom_lval.string = TO_INTERNAL(yytext);
+                  return ANYCHAR;
+                }
+
+{escape}/{non_at} { gedcom_lval.string = TO_INTERNAL(yytext);
+                    return ESCAPE;
+                  }
+
+{pointer}    { gedcom_lval.string = TO_INTERNAL(yytext);
+               return POINTER;
+             }
+
+ /* Due to the conversion of level numbers into brackets, the
+    terminator is not important, so no token is returned here.
+    Although not strictly according to the GEDCOM spec, we'll ignore
+    whitespace just before the terminator.
+ */
+
+{gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); }
+
+ /* Eventually we have to return 1 closing bracket (for the trailer).
+    We can detect whether we have sent the closing bracket using the
+    level_diff (at eof, first it is 2, then we increment it ourselves) */
+
+<<EOF>>      { if (level_diff == 2) {
+                 level_diff++;
+                 return CLOSE;
+               }
+               else {
+                 yyterminate();
+               }
+             }
+
+.            { gedcom_error("Unexpected character: '%s' (0x%02x)",
+                            yytext, (unsigned char)yytext[0]); /* no sign extension for bytes >= 0x80 */
+               return BADTOKEN;
+             }
+
+%%
+
+int yywrap()
+{
+  return 1;
+}
+
+#ifdef LEXER_TEST
+
+int main()
+{
+  int tok, res;
+  init_encodings();
+  set_encoding_width(TWO_BYTE_HILO);
+  res = open_conv_to_internal("UNICODE");
+  if (!res) {
+    gedcom_error("Unable to open conversion context: %s",
+                 strerror(errno));
+    return 1;
+  }
+  tok = gedcom_hilo_lex();
+  while (tok) {
+    switch(tok) {
+      case BADTOKEN: printf("BADTOKEN "); break;
+      case OPEN: printf("OPEN "); break;
+      case CLOSE: printf("CLOSE "); break;
+      case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
+      case DELIM: printf("DELIM "); break;
+      case ANYCHAR: printf("%s ", gedcom_lval.string); break;
+      case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
+      case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break;
+      default: printf("TAG(%s) ", gedcom_lval.string); break;
+    }
+    tok = gedcom_hilo_lex();
+  }
+  printf("\n");
+  close_conv_to_internal();
+  return 0;
+}
+#endif
diff --git a/message.c b/message.c
new file mode 100644
index 0000000..abde4b9
--- /dev/null
+++ b/message.c
@@ -0,0 +1,49 @@
+/* $Id$ */
+/* $Name$ */
+
+#include "gedcom.h"
+
+/* Print a printf-style message followed by a newline on stderr.
+   Returns the result of vfprintf for the message itself. */
+int gedcom_message(char* s, ...)
+{
+  int res;
+  va_list ap;
+
+  va_start(ap, s);
+  res = vfprintf(stderr, s, ap);
+  fprintf(stderr, "\n");
+  va_end(ap);
+
+  return res;
+}
+
+/* Like gedcom_message, but prefixed with "Warning on line <line_no>: ". */
+int gedcom_warning(char* s, ...)
+{
+  int res;
+  va_list ap;
+
+  va_start(ap, s);
+  fprintf(stderr, "Warning on line %d: ", line_no);
+  res = vfprintf(stderr, s, ap);
+  fprintf(stderr, "\n");
+  va_end(ap);
+
+  return res;
+}
+
+/* Like gedcom_message, but prefixed with "Error on line <line_no>: ". */
+int gedcom_error(char* s, ...)
+{
+  int res;
+  va_list ap;
+
+  va_start(ap, s);
+  fprintf(stderr, "Error on line %d: ", line_no);
+  res = vfprintf(stderr, s, ap);
+  fprintf(stderr, "\n");
+  va_end(ap);
+
+  return res;
+}
diff --git a/multilex.c b/multilex.c
new file mode 100644
index 0000000..b0097cc
--- /dev/null
+++ b/multilex.c
@@ -0,0 +1,126 @@
+/* $Id$ */
+/* $Name$ */
+
+#include "gedcom.h"
+#include "multilex.h"
+#include "encoding.h"
+
+int line_no = 1;
+
+typedef int (*lex_func)(void);
+lex_func lf;
+
+/* Select the lexer matching the given encoding width, hand it the input
+   file and open the conversion to the internal encoding.
+   Returns the result of open_conv_to_internal (non-zero on success), or 0
+   for an unknown encoding. */
+int lexer_init(ENCODING enc, FILE* f)
+{
+  if (enc == ONE_BYTE) {
+    gedcom_1byte_in = f;
+    lf = &gedcom_1byte_lex;
+    set_encoding_width(enc);
+    return open_conv_to_internal("ASCII");
+  }
+  else if (enc == TWO_BYTE_HILO) {
+    gedcom_hilo_in = f;
+    lf = &gedcom_hilo_lex;
+    set_encoding_width(enc);
+    return open_conv_to_internal("UNICODE");
+  }
+  else if (enc == TWO_BYTE_LOHI) {
+    gedcom_lohi_in = f;
+    lf = &gedcom_lohi_lex;
+    set_encoding_width(enc);
+    return open_conv_to_internal("UNICODE");
+  }
+  else {
+    return 0;
+  }
+}
+
+/* Close the conversion to the internal encoding (see lexer_init). */
+void lexer_close()
+{
+  close_conv_to_internal();
+}
+
+/* Call the lexer that was selected by lexer_init. */
+int gedcom_lex()
+{
+  return (*lf)();
+}
+
+/* Guess the character width and byte order from the first two bytes
+   of the file.  Without a BOM the file is rewound so that the lexer
+   sees the complete first line; a BOM is left consumed.  Files that are
+   too short or not recognised fall back to one-byte. */
+int determine_encoding(FILE* f)
+{
+  char first[2];
+
+  /* A short read would leave 'first' (partly) uninitialised. */
+  if (fread(first, 1, 2, f) != 2) {
+    gedcom_message("Unknown encoding, falling back to one-byte");
+    fseek(f, 0, SEEK_SET);
+    return ONE_BYTE;
+  }
+  if ((first[0] == '0') && (first[1] == ' ')) {
+    gedcom_message("One-byte encoding");
+    fseek(f, 0, SEEK_SET);
+    return ONE_BYTE;
+  }
+  else if ((first[0] == '\0') && (first[1] == '0'))
+  {
+    gedcom_message("Two-byte encoding, high-low");
+    fseek(f, 0, SEEK_SET);
+    return TWO_BYTE_HILO;
+  }
+  else if ((first[0] == '\xFE') && (first[1] == '\xFF'))
+  {
+    gedcom_message("Two-byte encoding, high-low, with BOM");
+    return TWO_BYTE_HILO;
+  }
+  else if ((first[0] == '0') && (first[1] == '\0'))
+  {
+    gedcom_message("Two-byte encoding, low-high");
+    fseek(f, 0, SEEK_SET);
+    return TWO_BYTE_LOHI;
+  }
+  else if ((first[0] == '\xFF') && (first[1] == '\xFE'))
+  {
+    gedcom_message("Two-byte encoding, low-high, with BOM");
+    return TWO_BYTE_LOHI;
+  }
+  else {
+    gedcom_message("Unknown encoding, falling back to one-byte");
+    fseek(f, 0, SEEK_SET);
+    return ONE_BYTE;
+  }
+}
+
+/* Parse the given GEDCOM file.  Returns the result of gedcom_parse(),
+   or 1 if the file cannot be opened or the lexer cannot be set up. */
+int gedcom_parse_file(char* file_name)
+{
+  ENCODING enc;
+  int result = 1;
+  /* Binary mode, so no newline translation is applied to two-byte data. */
+  FILE* file = fopen (file_name, "rb");
+  if (!file) {
+    gedcom_error("Could not open file '%s'\n", file_name);
+    return 1;
+  }
+
+  init_encodings();
+  enc = determine_encoding(file);
+
+  if (lexer_init(enc, file)) {
+    result = gedcom_parse();
+  }
+  lexer_close();
+  fclose(file);
+
+  return result;
+}
+
diff --git a/multilex.h b/multilex.h
new file mode 100644
index 0000000..c8f81ef
--- /dev/null
+++ b/multilex.h
@@ -0,0 +1,18 @@
+/* $Id$ */
+/* $Name$ */
+
+#ifndef __MULTILEX_H
+#define __MULTILEX_H
+#include <stdio.h>
+
+int gedcom_parse_file(char* file_name);
+
+int gedcom_1byte_lex();
+extern FILE *gedcom_1byte_in;
+
+int gedcom_hilo_lex();
+extern FILE *gedcom_hilo_in;
+
+int gedcom_lohi_lex();
+extern FILE *gedcom_lohi_in;
+#endif /* __MULTILEX_H */
-- 
2.30.2