X-Git-Url: https://git.dlugolecki.net.pl/?a=blobdiff_plain;f=gedcom_lohi.lex;h=e91d4f0693d1a40738ad6c1b2cb81f9f38c4548a;hb=4a9af1fa889f85ce33ae94abec4ff5df002036be;hp=6d88b435ed979f3daa309bc10e96417fd0cf354a;hpb=a2f7d56476e81f8689d56e6a5641469f6497c487;p=gedcom-parse.git diff --git a/gedcom_lohi.lex b/gedcom_lohi.lex index 6d88b43..e91d4f0 100644 --- a/gedcom_lohi.lex +++ b/gedcom_lohi.lex @@ -1,3 +1,12 @@ +/* This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + + (C) 2001 by The Genes Development Team + Original author: Peter Verthez (Peter.Verthez@advalvas.be) +*/ + /* $Id$ */ /* $Name$ */ @@ -9,6 +18,8 @@ #include "gedcom.h" #include "multilex.h" #include "encoding.h" + +#define YY_NO_UNPUT %} %s NORMAL @@ -44,96 +55,18 @@ int line_no = 1; %% - /* The GEDCOM level number is converted into a sequence of opening - and closing brackets. Simply put, the following GEDCOM fragment: - - 0 HEAD - 1 SOUR genes - 2 VERS 1.6 - 2 NAME Genes - 1 DATE 07 OCT 2001 - ... - 0 TRLR - - is converted into: - - { HEAD (initial) - { SOUR genes (1 higher: no closing brackets) - { VERS 1.6 (1 higher: no closing brackets) - } { NAME Genes (same level: 1 closing bracket) - } } { DATE 07 OCT 2001 (1 lower: 2 closing brackets) - ... - } { TRLR } - - or more clearly: - - { HEAD - { SOUR genes - { VERS 1.6 } - { NAME Genes } } - { DATE 07 OCT 2001 - ... } - { TRLR } - - But because this means that one token is converted into a series - of tokens, there is some initial code following immediately here - that returns "pending" tokens. */ - %{ -char string_buf[MAXGEDCLINELEN+1]; - -if (level_diff < 1) { - level_diff++; - return CLOSE; -} -else if (level_diff == 1) { - level_diff++; - return OPEN; -} -else { - /* out of brackets... */ -} - -#define TO_INTERNAL(str) to_internal(str, yyleng) - -#define MKTAGACTION(tag) \ - { gedcom_lval.string = TO_INTERNAL(yytext); \ - BEGIN(NORMAL); \ - return TAG_##tag; } +#include "gedcom_lex_common.c" +ACTION_BEFORE_REGEXPS + %} {gen_delim}* /* ignore leading whitespace (also tabs) */ -\x00[0]{digit}+ { gedcom_error ("Level number with leading zero"); - return BADTOKEN; - } +\x00[0]{digit}+ ACTION_0_DIGITS -{digit}+ { int level = atoi(TO_INTERNAL(yytext)); - if ((level < 0) || (level > MAXGEDCLEVEL)) { - gedcom_error ("Level number out of range [0..%d]", - MAXGEDCLEVEL); - return BADTOKEN; - } - level_diff = level - current_level; - BEGIN(EXPECT_TAG); - current_level = level; - if (level_diff < 1) { - level_diff++; - return CLOSE; - } - else if (level_diff == 1) { - level_diff++; - return OPEN; - } - else { - /* should never happen (error to GEDCOM spec) */ - gedcom_error ("GEDCOM level number is %d higher than " - "previous", - level_diff); - return BADTOKEN; - } - } +{digit}+ ACTION_DIGITS A\x00B\x00B\x00R\x00 MKTAGACTION(ABBR) A\x00D\x00D\x00R\x00 MKTAGACTION(ADDR) @@ -265,57 +198,21 @@ else { W\x00I\x00F\x00E\x00 MKTAGACTION(WIFE) W\x00I\x00L\x00L\x00 MKTAGACTION(WILL) -{alphanum}+ { if (strlen(yytext) > MAXGEDCTAGLEN) { - gedcom_error("Tag '%s' too long, max %d chars"); - return BADTOKEN; - } - strncpy(string_buf, yytext, MAXGEDCTAGLEN+1); - gedcom_lval.string = TO_INTERNAL(string_buf); - BEGIN(NORMAL); - return USERTAG; - } - -{delim} { gedcom_lval.string = TO_INTERNAL(yytext); - return DELIM; - } - -{any_but_delim} { gedcom_lval.string = TO_INTERNAL(yytext); - return ANYCHAR; - } +{alphanum}+ ACTION_ALPHANUM -{escape}/{non_at} { gedcom_lval.string = TO_INTERNAL(yytext); - return ESCAPE; - } +{delim} ACTION_DELIM -{pointer} { gedcom_lval.string = TO_INTERNAL(yytext); - return POINTER; - } +{any_but_delim} ACTION_ANY - /* Due to the conversion of level numbers into brackets, the - terminator is not important, so no token is returned here. - Although not strictly according to the GEDCOM spec, we'll ignore - whitespace just before the terminator. - */ +{escape}/{non_at} ACTION_ESCAPE -{gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); } +{pointer} ACTION_POINTER - /* Eventually we have to return 1 closing bracket (for the trailer). - We can detect whether we have sent the closing bracket using the - level_diff (at eof, first it is 2, then we increment it ourselves) */ +{gen_delim}*{terminator} ACTION_TERMINATOR -<> { if (level_diff == 2) { - level_diff++; - return CLOSE; - } - else { - yyterminate(); - } - } +<> ACTION_EOF -. { gedcom_error("Unexpected character: '%s' (0x%02x)", - yytext, yytext[0]); - return BADTOKEN; - } +. ACTION_UNEXPECTED %% @@ -328,8 +225,10 @@ int yywrap() int main() { - int tok; - int res = open_conv_to_internal("UTF16LE"); + int tok, res; + init_encodings(); + set_encoding_width(TWO_BYTE_LOHI); + res = open_conv_to_internal("UNICODE"); if (!res) { gedcom_error("Unable to open conversion context: %s", strerror(errno)); @@ -339,14 +238,14 @@ int main() while (tok) { switch(tok) { case BADTOKEN: printf("BADTOKEN "); break; - case OPEN: printf("OPEN "); break; + case OPEN: printf("OPEN(%d) ", gedcom_lval.level); break; case CLOSE: printf("CLOSE "); break; case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break; case DELIM: printf("DELIM "); break; case ANYCHAR: printf("%s ", gedcom_lval.string); break; - case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break; - case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break; - default: printf("TAG(%s) ", gedcom_lval.string); break; + case POINTER: printf("POINTER(%s) ", gedcom_lval.pointer); break; + case USERTAG: printf("USERTAG(%s) ", gedcom_lval.tag); break; + default: printf("TAG(%s) ", gedcom_lval.tag); break; } tok = gedcom_lohi_lex(); }