/*  This program is free software; you can redistribute it and/or modify  *
 *  it under the terms of the GNU General Public License as published by  *
 *  the Free Software Foundation; either version 2 of the License, or     *
 *  (at your option) any later version.                                   *

 (C) 2001 by The Genes Development Team
 Original author: Peter Verthez (Peter.Verthez@advalvas.be)
*/

/* $Id$ */
/* $Name$ */

/* In low-high order, a space is encoded as 0x20 0x00 */
/* i.e. this is utf-16-le */

%{
/* Standard headers for atoi, printf, strerror and errno used below;
   included explicitly instead of relying on what the generated scanner
   happens to pull in. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "gedcom.tab.h"
#include "gedcom.h"
#include "multilex.h"
#include "encoding.h"

#define YY_NO_UNPUT
%}

/* Start conditions (inclusive):
     INITIAL     at the start of a line, where the level number is expected
     EXPECT_TAG  after the level number, until a tag has been returned
     NORMAL      after the tag, for the rest of the line */
%s NORMAL
%s EXPECT_TAG

/* Character classes.  Every character is two bytes, low byte first, so an
   ASCII character is followed by 0x00.  The second alternative of
   'otherchar' accepts any two-byte unit whose high byte is non-zero. */
alpha        [A-Za-z_]\x00
digit        [0-9]\x00
delim        \x20\x00
tab          [\t]\x00
hash         #\x00
literal_at   @\x00@\x00
otherchar    [\x21-\x22\x24-\x2F\x3A-\x3F\x5B-\x5E\x60\x7B-\x7E\x80-\xFF]\x00|[\x00-\xFF][\x01-\xFF]
terminator   \x0D\x00|\x0A\x00|\x0D\x00\x0A\x00|\x0A\x00\x0D\x00

any_char      {alpha}|{digit}|{otherchar}|{delim}|{hash}|{literal_at}
any_but_delim {alpha}|{digit}|{otherchar}|{hash}|{literal_at}
non_at        {alpha}|{digit}|{otherchar}|{delim}|{hash}
alphanum      {alpha}|{digit}
gen_delim     {delim}|{tab}

escape       @\x00#\x00{any_char}+@\x00
pointer      @\x00{alphanum}{non_at}+@\x00

%{
/* Level of the most recently read line; -1 before the first line. */
static int current_level=-1;
/* Difference between the new and the previous level, incremented once per
   bracket token returned.  The initial value keeps the "pending token"
   code at the top of the rules section from firing before any level
   number has been read. */
static int level_diff=MAXGEDCLEVEL;

#ifdef LEXER_TEST
YYSTYPE gedcom_lval;
int line_no = 1;
#endif
%}

%%

    /* The GEDCOM level number is converted into a sequence of opening
       and closing brackets.  Simply put, the following GEDCOM fragment:

         0 HEAD
         1 SOUR genes
         2 VERS 1.6
         2 NAME Genes
         1 DATE 07 OCT 2001
         ...
         0 TRLR

       is converted into:

         { HEAD                     (initial)
         { SOUR genes               (1 higher: no closing brackets)
         { VERS 1.6                 (1 higher: no closing brackets)
         } { NAME Genes             (same level: 1 closing bracket)
         } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
         ...
         } { TRLR }

       or more clearly:

         { HEAD
           { SOUR genes
             { VERS 1.6 }
             { NAME Genes } }
           { DATE 07 OCT 2001
         ... }
         { TRLR }

       But because this means that one token is converted into a series
       of tokens, there is some initial code following immediately here
       that returns "pending" tokens. */

%{
/* This code runs on every call of the scanner function, before any input
   is matched: it hands out the brackets still owed for the last level
   number that was read. */
if (level_diff < 1) {
  level_diff++;
  return CLOSE;
}
else if (level_diff == 1) {
  level_diff++;
  gedcom_lval.number = current_level;
  return OPEN;
}
else {
  /* out of brackets... */
}

/* Hands the current match and its length in bytes to to_internal()
   (declared elsewhere; presumably converts to the internal encoding). */
#define TO_INTERNAL(str) to_internal(str, yyleng)

/* Action shared by all standard tags: store the tag text, switch to the
   NORMAL state and return the matching TAG_xxx token. */
#define MKTAGACTION(tag)                                                   \
  { gedcom_lval.string = TO_INTERNAL(yytext);                              \
    BEGIN(NORMAL);                                                         \
    return TAG_##tag; }
%}

<INITIAL>{gen_delim}* /* ignore leading whitespace (also tabs) */

<INITIAL>0\x00{digit}+ { /* '0' in low-high order is 0x30 0x00 */
                         gedcom_error ("Level number with leading zero");
                         return BADTOKEN;
                       }

<INITIAL>{digit}+ { int level = atoi(TO_INTERNAL(yytext));
                    if ((level < 0) || (level > MAXGEDCLEVEL)) {
                      gedcom_error ("Level number out of range [0..%d]",
                                    MAXGEDCLEVEL);
                      return BADTOKEN;
                    }
                    level_diff = level - current_level;
                    BEGIN(EXPECT_TAG);
                    current_level = level;
                    if (level_diff < 1) {
                      level_diff++;
                      return CLOSE;
                    }
                    else if (level_diff == 1) {
                      level_diff++;
                      gedcom_lval.number = current_level;
                      return OPEN;
                    }
                    else {
                      /* should never happen (error to GEDCOM spec) */
                      gedcom_error ("GEDCOM level number is %d higher than "
                                    "previous",
                                    level_diff);
                      return BADTOKEN;
                    }
                  }

<EXPECT_TAG>A\x00B\x00B\x00R\x00  MKTAGACTION(ABBR)
<EXPECT_TAG>A\x00D\x00D\x00R\x00  MKTAGACTION(ADDR)
<EXPECT_TAG>A\x00D\x00R\x001\x00  MKTAGACTION(ADR1)
<EXPECT_TAG>A\x00D\x00R\x002\x00  MKTAGACTION(ADR2)
<EXPECT_TAG>A\x00D\x00O\x00P\x00  MKTAGACTION(ADOP)
<EXPECT_TAG>A\x00F\x00N\x00  MKTAGACTION(AFN)
<EXPECT_TAG>A\x00G\x00E\x00  MKTAGACTION(AGE)
<EXPECT_TAG>A\x00G\x00N\x00C\x00  MKTAGACTION(AGNC)
<EXPECT_TAG>A\x00L\x00I\x00A\x00  MKTAGACTION(ALIA)
<EXPECT_TAG>A\x00N\x00C\x00E\x00  MKTAGACTION(ANCE)
<EXPECT_TAG>A\x00N\x00C\x00I\x00  MKTAGACTION(ANCI)
<EXPECT_TAG>A\x00N\x00U\x00L\x00  MKTAGACTION(ANUL)
<EXPECT_TAG>A\x00S\x00S\x00O\x00  MKTAGACTION(ASSO)
<EXPECT_TAG>A\x00U\x00T\x00H\x00  MKTAGACTION(AUTH)
<EXPECT_TAG>B\x00A\x00P\x00L\x00  MKTAGACTION(BAPL)
<EXPECT_TAG>B\x00A\x00P\x00M\x00  MKTAGACTION(BAPM)
<EXPECT_TAG>B\x00A\x00R\x00M\x00  MKTAGACTION(BARM)
<EXPECT_TAG>B\x00A\x00S\x00M\x00  MKTAGACTION(BASM)
<EXPECT_TAG>B\x00I\x00R\x00T\x00  MKTAGACTION(BIRT)
<EXPECT_TAG>B\x00L\x00E\x00S\x00  MKTAGACTION(BLES)
<EXPECT_TAG>B\x00L\x00O\x00B\x00  MKTAGACTION(BLOB)
<EXPECT_TAG>B\x00U\x00R\x00I\x00  MKTAGACTION(BURI)
<EXPECT_TAG>C\x00A\x00L\x00N\x00  MKTAGACTION(CALN)
<EXPECT_TAG>C\x00A\x00S\x00T\x00  MKTAGACTION(CAST)
<EXPECT_TAG>C\x00A\x00U\x00S\x00  MKTAGACTION(CAUS)
<EXPECT_TAG>C\x00E\x00N\x00S\x00  MKTAGACTION(CENS)
<EXPECT_TAG>C\x00H\x00A\x00N\x00  MKTAGACTION(CHAN)
<EXPECT_TAG>C\x00H\x00A\x00R\x00  MKTAGACTION(CHAR)
<EXPECT_TAG>C\x00H\x00I\x00L\x00  MKTAGACTION(CHIL)
<EXPECT_TAG>C\x00H\x00R\x00  MKTAGACTION(CHR)
<EXPECT_TAG>C\x00H\x00R\x00A\x00  MKTAGACTION(CHRA)
<EXPECT_TAG>C\x00I\x00T\x00Y\x00  MKTAGACTION(CITY)
<EXPECT_TAG>C\x00O\x00N\x00C\x00  MKTAGACTION(CONC)
<EXPECT_TAG>C\x00O\x00N\x00F\x00  MKTAGACTION(CONF)
<EXPECT_TAG>C\x00O\x00N\x00L\x00  MKTAGACTION(CONL)
<EXPECT_TAG>C\x00O\x00N\x00T\x00  MKTAGACTION(CONT)
<EXPECT_TAG>C\x00O\x00P\x00R\x00  MKTAGACTION(COPR)
<EXPECT_TAG>C\x00O\x00R\x00P\x00  MKTAGACTION(CORP)
<EXPECT_TAG>C\x00R\x00E\x00M\x00  MKTAGACTION(CREM)
<EXPECT_TAG>C\x00T\x00R\x00Y\x00  MKTAGACTION(CTRY)
<EXPECT_TAG>D\x00A\x00T\x00A\x00  MKTAGACTION(DATA)
<EXPECT_TAG>D\x00A\x00T\x00E\x00  MKTAGACTION(DATE)
<EXPECT_TAG>D\x00E\x00A\x00T\x00  MKTAGACTION(DEAT)
<EXPECT_TAG>D\x00E\x00S\x00C\x00  MKTAGACTION(DESC)
<EXPECT_TAG>D\x00E\x00S\x00I\x00  MKTAGACTION(DESI)
<EXPECT_TAG>D\x00E\x00S\x00T\x00  MKTAGACTION(DEST)
<EXPECT_TAG>D\x00I\x00V\x00  MKTAGACTION(DIV)
<EXPECT_TAG>D\x00I\x00V\x00F\x00  MKTAGACTION(DIVF)
<EXPECT_TAG>D\x00S\x00C\x00R\x00  MKTAGACTION(DSCR)
<EXPECT_TAG>E\x00D\x00U\x00C\x00  MKTAGACTION(EDUC)
<EXPECT_TAG>E\x00M\x00I\x00G\x00  MKTAGACTION(EMIG)
<EXPECT_TAG>E\x00N\x00D\x00L\x00  MKTAGACTION(ENDL)
<EXPECT_TAG>E\x00N\x00G\x00A\x00  MKTAGACTION(ENGA)
<EXPECT_TAG>E\x00V\x00E\x00N\x00  MKTAGACTION(EVEN)
<EXPECT_TAG>F\x00A\x00M\x00  MKTAGACTION(FAM)
<EXPECT_TAG>F\x00A\x00M\x00C\x00  MKTAGACTION(FAMC)
<EXPECT_TAG>F\x00A\x00M\x00F\x00  MKTAGACTION(FAMF)
<EXPECT_TAG>F\x00A\x00M\x00S\x00  MKTAGACTION(FAMS)
<EXPECT_TAG>F\x00C\x00O\x00M\x00  MKTAGACTION(FCOM)
<EXPECT_TAG>F\x00I\x00L\x00E\x00  MKTAGACTION(FILE)
<EXPECT_TAG>F\x00O\x00R\x00M\x00  MKTAGACTION(FORM)
<EXPECT_TAG>G\x00E\x00D\x00C\x00  MKTAGACTION(GEDC)
<EXPECT_TAG>G\x00I\x00V\x00N\x00  MKTAGACTION(GIVN)
<EXPECT_TAG>G\x00R\x00A\x00D\x00  MKTAGACTION(GRAD)
<EXPECT_TAG>H\x00E\x00A\x00D\x00  MKTAGACTION(HEAD)
<EXPECT_TAG>H\x00U\x00S\x00B\x00  MKTAGACTION(HUSB)
<EXPECT_TAG>I\x00D\x00N\x00O\x00  MKTAGACTION(IDNO)
<EXPECT_TAG>I\x00M\x00M\x00I\x00  MKTAGACTION(IMMI)
<EXPECT_TAG>I\x00N\x00D\x00I\x00  MKTAGACTION(INDI)
<EXPECT_TAG>L\x00A\x00N\x00G\x00  MKTAGACTION(LANG)
<EXPECT_TAG>L\x00E\x00G\x00A\x00  MKTAGACTION(LEGA)
<EXPECT_TAG>M\x00A\x00R\x00B\x00  MKTAGACTION(MARB)
<EXPECT_TAG>M\x00A\x00R\x00C\x00  MKTAGACTION(MARC)
<EXPECT_TAG>M\x00A\x00R\x00L\x00  MKTAGACTION(MARL)
<EXPECT_TAG>M\x00A\x00R\x00R\x00  MKTAGACTION(MARR)
<EXPECT_TAG>M\x00A\x00R\x00S\x00  MKTAGACTION(MARS)
<EXPECT_TAG>M\x00E\x00D\x00I\x00  MKTAGACTION(MEDI)
<EXPECT_TAG>N\x00A\x00M\x00E\x00  MKTAGACTION(NAME)
<EXPECT_TAG>N\x00A\x00T\x00I\x00  MKTAGACTION(NATI)
<EXPECT_TAG>N\x00A\x00T\x00U\x00  MKTAGACTION(NATU)
<EXPECT_TAG>N\x00C\x00H\x00I\x00  MKTAGACTION(NCHI)
<EXPECT_TAG>N\x00I\x00C\x00K\x00  MKTAGACTION(NICK)
<EXPECT_TAG>N\x00M\x00R\x00  MKTAGACTION(NMR)
<EXPECT_TAG>N\x00O\x00T\x00E\x00  MKTAGACTION(NOTE)
<EXPECT_TAG>N\x00P\x00F\x00X\x00  MKTAGACTION(NPFX)
<EXPECT_TAG>N\x00S\x00F\x00X\x00  MKTAGACTION(NSFX)
<EXPECT_TAG>O\x00B\x00J\x00E\x00  MKTAGACTION(OBJE)
<EXPECT_TAG>O\x00C\x00C\x00U\x00  MKTAGACTION(OCCU)
<EXPECT_TAG>O\x00R\x00D\x00I\x00  MKTAGACTION(ORDI)
<EXPECT_TAG>O\x00R\x00D\x00N\x00  MKTAGACTION(ORDN)
<EXPECT_TAG>P\x00A\x00G\x00E\x00  MKTAGACTION(PAGE)
<EXPECT_TAG>P\x00E\x00D\x00I\x00  MKTAGACTION(PEDI)
<EXPECT_TAG>P\x00H\x00O\x00N\x00  MKTAGACTION(PHON)
<EXPECT_TAG>P\x00L\x00A\x00C\x00  MKTAGACTION(PLAC)
<EXPECT_TAG>P\x00O\x00S\x00T\x00  MKTAGACTION(POST)
<EXPECT_TAG>P\x00R\x00O\x00B\x00  MKTAGACTION(PROB)
<EXPECT_TAG>P\x00R\x00O\x00P\x00  MKTAGACTION(PROP)
<EXPECT_TAG>P\x00U\x00B\x00L\x00  MKTAGACTION(PUBL)
<EXPECT_TAG>Q\x00U\x00A\x00Y\x00  MKTAGACTION(QUAY)
<EXPECT_TAG>R\x00E\x00F\x00N\x00  MKTAGACTION(REFN)
<EXPECT_TAG>R\x00E\x00L\x00A\x00  MKTAGACTION(RELA)
<EXPECT_TAG>R\x00E\x00L\x00I\x00  MKTAGACTION(RELI)
<EXPECT_TAG>R\x00E\x00P\x00O\x00  MKTAGACTION(REPO)
<EXPECT_TAG>R\x00E\x00S\x00I\x00  MKTAGACTION(RESI)
<EXPECT_TAG>R\x00E\x00S\x00N\x00  MKTAGACTION(RESN)
<EXPECT_TAG>R\x00E\x00T\x00I\x00  MKTAGACTION(RETI)
<EXPECT_TAG>R\x00F\x00N\x00  MKTAGACTION(RFN)
<EXPECT_TAG>R\x00I\x00N\x00  MKTAGACTION(RIN)
<EXPECT_TAG>R\x00O\x00L\x00E\x00  MKTAGACTION(ROLE)
<EXPECT_TAG>S\x00E\x00X\x00  MKTAGACTION(SEX)
<EXPECT_TAG>S\x00L\x00G\x00C\x00  MKTAGACTION(SLGC)
<EXPECT_TAG>S\x00L\x00G\x00S\x00  MKTAGACTION(SLGS)
<EXPECT_TAG>S\x00O\x00U\x00R\x00  MKTAGACTION(SOUR)
<EXPECT_TAG>S\x00P\x00F\x00X\x00  MKTAGACTION(SPFX)
<EXPECT_TAG>S\x00S\x00N\x00  MKTAGACTION(SSN)
<EXPECT_TAG>S\x00T\x00A\x00E\x00  MKTAGACTION(STAE)
<EXPECT_TAG>S\x00T\x00A\x00T\x00  MKTAGACTION(STAT)
<EXPECT_TAG>S\x00U\x00B\x00M\x00  MKTAGACTION(SUBM)
<EXPECT_TAG>S\x00U\x00B\x00N\x00  MKTAGACTION(SUBN)
<EXPECT_TAG>S\x00U\x00R\x00N\x00  MKTAGACTION(SURN)
<EXPECT_TAG>T\x00E\x00M\x00P\x00  MKTAGACTION(TEMP)
<EXPECT_TAG>T\x00E\x00X\x00T\x00  MKTAGACTION(TEXT)
<EXPECT_TAG>T\x00I\x00M\x00E\x00  MKTAGACTION(TIME)
<EXPECT_TAG>T\x00I\x00T\x00L\x00  MKTAGACTION(TITL)
<EXPECT_TAG>T\x00R\x00L\x00R\x00  MKTAGACTION(TRLR)
<EXPECT_TAG>T\x00Y\x00P\x00E\x00  MKTAGACTION(TYPE)
<EXPECT_TAG>V\x00E\x00R\x00S\x00  MKTAGACTION(VERS)
<EXPECT_TAG>W\x00I\x00F\x00E\x00  MKTAGACTION(WIFE)
<EXPECT_TAG>W\x00I\x00L\x00L\x00  MKTAGACTION(WILL)

<EXPECT_TAG>{alphanum}+ { /* Any other tag is a user-defined tag.  The match
                             holds embedded 0x00 bytes, so C string
                             functions cannot be applied to yytext; every
                             {alphanum} is exactly two bytes, which makes
                             yyleng / 2 the tag length in characters. */
                          if ((int) yyleng / 2 > MAXGEDCTAGLEN) {
                            gedcom_error("Tag '%s' too long, max %d chars",
                                         TO_INTERNAL(yytext),
                                         MAXGEDCTAGLEN);
                            return BADTOKEN;
                          }
                          gedcom_lval.string = TO_INTERNAL(yytext);
                          BEGIN(NORMAL);
                          return USERTAG;
                        }

{delim}      { gedcom_lval.string = TO_INTERNAL(yytext);
               return DELIM;
             }

{any_but_delim} { gedcom_lval.string = TO_INTERNAL(yytext);
                  return ANYCHAR;
                }

{escape}/{non_at}  { gedcom_lval.string = TO_INTERNAL(yytext);
                     return ESCAPE;
                   }

{pointer}    { gedcom_lval.string = TO_INTERNAL(yytext);
               return POINTER;
             }

   /* Due to the conversion of level numbers into brackets, the
      terminator is not important, so no token is returned here.
      Although not strictly according to the GEDCOM spec, we'll ignore
      whitespace just before the terminator.
   */

{gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); }

   /* Eventually we have to return 1 closing bracket (for the trailer).
      We can detect whether we have sent the closing bracket using the
      level_diff (at eof, first it is 2, then we increment it ourselves)
   */

<<EOF>> { if (level_diff == 2) {
            level_diff++;
            return CLOSE;
          }
          else {
            yyterminate();
          }
        }

.  { /* Convert through unsigned char so that bytes >= 0x80 are not
        sign-extended when printed with %02x. */
     gedcom_error("Unexpected character: '%s' (0x%02x)",
                  yytext, (unsigned int) (unsigned char) yytext[0]);
     return BADTOKEN;
   }

%%

/* Called by the scanner at end of input; returning 1 reports that there
   is no further input to switch to. */
int yywrap()
{
  return 1;
}

#ifdef LEXER_TEST

/* Stand-alone test driver: runs gedcom_lohi_lex() until it returns 0 and
   prints every token on a single line.  Returns 1 if the conversion
   context cannot be opened, 0 otherwise. */
int main()
{
  int tok, res;
  init_encodings();
  set_encoding_width(TWO_BYTE_LOHI);
  res = open_conv_to_internal("UNICODE");
  if (!res) {
    gedcom_error("Unable to open conversion context: %s",
                 strerror(errno));
    return 1;
  }
  tok = gedcom_lohi_lex();
  while (tok) {
    switch(tok) {
      case BADTOKEN: printf("BADTOKEN "); break;
      case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
      case CLOSE: printf("CLOSE "); break;
      case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
      case DELIM: printf("DELIM "); break;
      case ANYCHAR: printf("%s ", gedcom_lval.string); break;
      case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
      case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break;
      default: printf("TAG(%s) ", gedcom_lval.string); break;
    }
    tok = gedcom_lohi_lex();
  }
  printf("\n");
  close_conv_to_internal();
  return 0;
}
#endif