X-Git-Url: https://git.dlugolecki.net.pl/?a=blobdiff_plain;f=gedcom_1byte.lex;h=a4a5659df39a24977b2a067da5bffea39a304a50;hb=c75da58b11b6b5f18e1d3f2200aa1e2f9ba5ac64;hp=df4559abc7da5b45b3f143700a2b6d674e2689dd;hpb=0c341b31ca4de86b12b8b98f724b4f746e98a5d2;p=gedcom-parse.git diff --git a/gedcom_1byte.lex b/gedcom_1byte.lex index df4559a..a4a5659 100644 --- a/gedcom_1byte.lex +++ b/gedcom_1byte.lex @@ -11,12 +11,10 @@ /* $Name$ */ %{ -#include "gedcom.tab.h" -#include "gedcom.h" -#include "multilex.h" -#include "encoding.h" +#undef IN_LEX /* include only a specific part of the following file */ +#include "gedcom_lex_common.c" -#define YY_NO_UNPUT +static size_t encoding_width = 1; %} %s NORMAL @@ -40,111 +38,21 @@ gen_delim {delim}|{tab} escape @#{any_char}+@ pointer @{alphanum}{non_at}+@ -%{ -static int current_level=-1; -static int level_diff=MAXGEDCLEVEL; - -#ifdef LEXER_TEST -YYSTYPE gedcom_lval; -int line_no = 1; -#endif - -%} - %% - /* The GEDCOM level number is converted into a sequence of opening - and closing brackets. Simply put, the following GEDCOM fragment: - - 0 HEAD - 1 SOUR genes - 2 VERS 1.6 - 2 NAME Genes - 1 DATE 07 OCT 2001 - ... - 0 TRLR - - is converted into: - - { HEAD (initial) - { SOUR genes (1 higher: no closing brackets) - { VERS 1.6 (1 higher: no closing brackets) - } { NAME Genes (same level: 1 closing bracket) - } } { DATE 07 OCT 2001 (1 lower: 2 closing brackets) - ... - } { TRLR } - - or more clearly: - - { HEAD - { SOUR genes - { VERS 1.6 } - { NAME Genes } } - { DATE 07 OCT 2001 - ... } - { TRLR } - - But because this means that one token is converted into a series - of tokens, there is some initial code following immediately here - that returns "pending" tokens. */ - %{ -char string_buf[MAXGEDCLINELEN+1]; - -if (level_diff < 1) { - level_diff++; - return CLOSE; -} -else if (level_diff == 1) { - level_diff++; - gedcom_lval.number = current_level; - return OPEN; -} -else { - /* out of brackets... */ -} - -#define TO_INTERNAL(str) to_internal(str, yyleng) - -#define MKTAGACTION(tag) \ - { gedcom_lval.string = TO_INTERNAL(yytext); \ - BEGIN(NORMAL); \ - return TAG_##tag; } +#define IN_LEX /* include only a specific part of the following file */ +#include "gedcom_lex_common.c" +ACTION_BEFORE_REGEXPS + %} -{gen_delim}* /* ignore leading whitespace (also tabs) */ +{gen_delim}* ACTION_INITIAL_WHITESPACE -0{digit}+ { gedcom_error ("Level number with leading zero"); - return BADTOKEN; - } +0{digit}+ ACTION_0_DIGITS -{digit}+ { int level = atoi(TO_INTERNAL(yytext)); - if ((level < 0) || (level > MAXGEDCLEVEL)) { - gedcom_error ("Level number out of range [0..%d]", - MAXGEDCLEVEL); - return BADTOKEN; - } - level_diff = level - current_level; - BEGIN(EXPECT_TAG); - current_level = level; - if (level_diff < 1) { - level_diff++; - return CLOSE; - } - else if (level_diff == 1) { - level_diff++; - gedcom_lval.number = current_level; - return OPEN; - } - else { - /* should never happen (error to GEDCOM spec) */ - gedcom_error ("GEDCOM level number is %d higher than " - "previous", - level_diff); - return BADTOKEN; - } - } +{digit}+ ACTION_DIGITS ABBR MKTAGACTION(ABBR) ADDR MKTAGACTION(ADDR) @@ -276,63 +184,21 @@ else { WIFE MKTAGACTION(WIFE) WILL MKTAGACTION(WILL) -{alphanum}+ { if (strlen(yytext) > MAXGEDCTAGLEN) { - gedcom_error("Tag '%s' too long, max %d chars"); - return BADTOKEN; - } - strncpy(string_buf, yytext, MAXGEDCTAGLEN+1); - gedcom_lval.string = TO_INTERNAL(string_buf); - BEGIN(NORMAL); - return USERTAG; - } +{alphanum}+ ACTION_ALPHANUM -{delim} { gedcom_lval.string = TO_INTERNAL(yytext); - return DELIM; - } +{delim} ACTION_DELIM -{any_but_delim} { gedcom_lval.string = TO_INTERNAL(yytext); - /* Due to character conversions, it is possible - that the current character will be combined with - the next, and so now we don't have a character yet... - This is only applicable to the 1byte case (e.g. ANSEL). - */ - if (strlen(gedcom_lval.string) > 0) - return ANYCHAR; - } +{any_but_delim} ACTION_ANY -{escape}/{non_at} { gedcom_lval.string = TO_INTERNAL(yytext); - return ESCAPE; - } +{escape}/{non_at} ACTION_ESCAPE -{pointer} { gedcom_lval.string = TO_INTERNAL(yytext); - return POINTER; - } +{pointer} ACTION_POINTER - /* Due to the conversion of level numbers into brackets, the - terminator is not important, so no token is returned here. - Although not strictly according to the GEDCOM spec, we'll ignore - whitespace just before the terminator. - */ +{gen_delim}*{terminator} ACTION_TERMINATOR -{gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); } +<> ACTION_EOF - /* Eventually we have to return 1 closing bracket (for the trailer). - We can detect whether we have sent the closing bracket using the - level_diff (at eof, first it is 2, then we increment it ourselves) */ - -<> { if (level_diff == 2) { - level_diff++; - return CLOSE; - } - else { - yyterminate(); - } - } - -. { gedcom_error("Unexpected character: '%s' (0x%02x)", - yytext, yytext[0]); - return BADTOKEN; - } +. ACTION_UNEXPECTED %% @@ -342,34 +208,13 @@ int yywrap() } #ifdef LEXER_TEST +int gedcom_lex() +{ + return gedcom_1byte_lex(); +} + int main() { - int tok, res; - init_encodings(); - set_encoding_width(ONE_BYTE); - res = open_conv_to_internal("ASCII"); - if (!res) { - gedcom_error("Unable to open conversion context: %s", - strerror(errno)); - return 1; - } - tok = gedcom_1byte_lex(); - while (tok) { - switch(tok) { - case BADTOKEN: printf("BADTOKEN "); break; - case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break; - case CLOSE: printf("CLOSE "); break; - case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break; - case DELIM: printf("DELIM "); break; - case ANYCHAR: printf("%s ", gedcom_lval.string); break; - case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break; - case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break; - default: printf("TAG(%s) ", gedcom_lval.string); break; - } - tok = gedcom_1byte_lex(); - } - printf("\n"); - close_conv_to_internal(); - return 0; + return test_loop(ONE_BYTE, "ASCII"); } #endif