X-Git-Url: https://git.dlugolecki.net.pl/?a=blobdiff_plain;f=gedcom_hilo.lex;h=5c674a7a17287cbabb4537f9450797a8293fabe3;hb=c881bdbd69886d7f37055a9f2802a2e106e50fdb;hp=d8a1da0c34c461d694ba186dc7879078f93705a5;hpb=a54348309c92f4d7a2dd66b4055122a7be19ca28;p=gedcom-parse.git diff --git a/gedcom_hilo.lex b/gedcom_hilo.lex index d8a1da0..5c674a7 100644 --- a/gedcom_hilo.lex +++ b/gedcom_hilo.lex @@ -1,3 +1,12 @@ +/* This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + + (C) 2001 by The Genes Development Team + Original author: Peter Verthez (Peter.Verthez@advalvas.be) +*/ + /* $Id$ */ /* $Name$ */ @@ -5,12 +14,10 @@ /* i.e. this is utf-16-be */ %{ -#include "gedcom.tab.h" -#include "gedcom.h" -#include "multilex.h" -#include "encoding.h" - -#define YY_NO_UNPUT +#undef IN_LEX /* include only a specific part of the following file */ +#include "gedcom_lex_common.c" + +static size_t encoding_width = 2; %} %s NORMAL @@ -34,108 +41,21 @@ gen_delim {delim}|{tab} escape \x00@\x00#{any_char}+\x00@ pointer \x00@{alphanum}{non_at}+\x00@ -%{ -static int current_level=-1; -static int level_diff=MAXGEDCLEVEL; - -#ifdef LEXER_TEST -YYSTYPE gedcom_lval; -int line_no = 1; -#endif -%} - %% - /* The GEDCOM level number is converted into a sequence of opening - and closing brackets. Simply put, the following GEDCOM fragment: - - 0 HEAD - 1 SOUR genes - 2 VERS 1.6 - 2 NAME Genes - 1 DATE 07 OCT 2001 - ... - 0 TRLR - - is converted into: - - { HEAD (initial) - { SOUR genes (1 higher: no closing brackets) - { VERS 1.6 (1 higher: no closing brackets) - } { NAME Genes (same level: 1 closing bracket) - } } { DATE 07 OCT 2001 (1 lower: 2 closing brackets) - ... - } { TRLR } - - or more clearly: - - { HEAD - { SOUR genes - { VERS 1.6 } - { NAME Genes } } - { DATE 07 OCT 2001 - ... 
} - { TRLR } - - But because this means that one token is converted into a series - of tokens, there is some initial code following immediately here - that returns "pending" tokens. */ - %{ -char string_buf[MAXGEDCLINELEN+1]; - -if (level_diff < 1) { - level_diff++; - return CLOSE; -} -else if (level_diff == 1) { - level_diff++; - return OPEN; -} -else { - /* out of brackets... */ -} - -#define TO_INTERNAL(str) to_internal(str, yyleng) - -#define MKTAGACTION(tag) \ - { gedcom_lval.string = TO_INTERNAL(yytext); \ - BEGIN(NORMAL); \ - return TAG_##tag; } +#define IN_LEX /* include only a specific part of the following file */ +#include "gedcom_lex_common.c" +ACTION_BEFORE_REGEXPS + %} -{gen_delim}* /* ignore leading whitespace (also tabs) */ +{gen_delim}* ACTION_INITIAL_WHITESPACE -\x00[0]{digit}+ { gedcom_error ("Level number with leading zero"); - return BADTOKEN; - } +\x00[0]{digit}+ ACTION_0_DIGITS -{digit}+ { int level = atoi(TO_INTERNAL(yytext)); - if ((level < 0) || (level > MAXGEDCLEVEL)) { - gedcom_error ("Level number out of range [0..%d]", - MAXGEDCLEVEL); - return BADTOKEN; - } - level_diff = level - current_level; - BEGIN(EXPECT_TAG); - current_level = level; - if (level_diff < 1) { - level_diff++; - return CLOSE; - } - else if (level_diff == 1) { - level_diff++; - return OPEN; - } - else { - /* should never happen (error to GEDCOM spec) */ - gedcom_error ("GEDCOM level number is %d higher than " - "previous", - level_diff); - return BADTOKEN; - } - } +{digit}+ ACTION_DIGITS \x00A\x00B\x00B\x00R MKTAGACTION(ABBR) \x00A\x00D\x00D\x00R MKTAGACTION(ADDR) @@ -267,57 +187,21 @@ else { \x00W\x00I\x00F\x00E MKTAGACTION(WIFE) \x00W\x00I\x00L\x00L MKTAGACTION(WILL) -{alphanum}+ { if (strlen(yytext) > MAXGEDCTAGLEN) { - gedcom_error("Tag '%s' too long, max %d chars"); - return BADTOKEN; - } - strncpy(string_buf, yytext, MAXGEDCTAGLEN+1); - gedcom_lval.string = TO_INTERNAL(string_buf); - BEGIN(NORMAL); - return USERTAG; - } - -{delim} { gedcom_lval.string = 
TO_INTERNAL(yytext); - return DELIM; - } - -{any_but_delim} { gedcom_lval.string = TO_INTERNAL(yytext); - return ANYCHAR; - } +{alphanum}+ ACTION_ALPHANUM -{escape}/{non_at} { gedcom_lval.string = TO_INTERNAL(yytext); - return ESCAPE; - } +{delim} ACTION_DELIM -{pointer} { gedcom_lval.string = TO_INTERNAL(yytext); - return POINTER; - } +{any_but_delim} ACTION_ANY - /* Due to the conversion of level numbers into brackets, the - terminator is not important, so no token is returned here. - Although not strictly according to the GEDCOM spec, we'll ignore - whitespace just before the terminator. - */ +{escape}/{non_at} ACTION_ESCAPE -{gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); } +{pointer} ACTION_POINTER - /* Eventually we have to return 1 closing bracket (for the trailer). - We can detect whether we have sent the closing bracket using the - level_diff (at eof, first it is 2, then we increment it ourselves) */ +{gen_delim}*{terminator} ACTION_TERMINATOR -<<EOF>> { if (level_diff == 2) { - level_diff++; - return CLOSE; - } - else { - yyterminate(); - } - } +<<EOF>> ACTION_EOF -. { gedcom_error("Unexpected character: '%s' (0x%02x)", - yytext, yytext[0]); - return BADTOKEN; - } +. 
ACTION_UNEXPECTED %% @@ -327,35 +211,13 @@ int yywrap() } #ifdef LEXER_TEST +int gedcom_lex() +{ + return gedcom_hilo_lex(); +} int main() { - int tok, res; - init_encodings(); - set_encoding_width(TWO_BYTE_HILO); - res = open_conv_to_internal("UNICODE"); - if (!res) { - gedcom_error("Unable to open conversion context: %s", - strerror(errno)); - return 1; - } - tok = gedcom_hilo_lex(); - while (tok) { - switch(tok) { - case BADTOKEN: printf("BADTOKEN "); break; - case OPEN: printf("OPEN "); break; - case CLOSE: printf("CLOSE "); break; - case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break; - case DELIM: printf("DELIM "); break; - case ANYCHAR: printf("%s ", gedcom_lval.string); break; - case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break; - case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break; - default: printf("TAG(%s) ", gedcom_lval.string); break; - } - tok = gedcom_hilo_lex(); - } - printf("\n"); - close_conv_to_internal(); - return 0; + return test_loop(TWO_BYTE_HILO, "UNICODE"); } #endif