/* $Name$ */
%{
-#include "gedcom.tab.h"
-#include "gedcom.h"
-#include "multilex.h"
-#include "encoding.h"
+#undef IN_LEX /* first of two inclusions: with IN_LEX undefined, include only a specific part of the following file */
+#include "gedcom_lex_common.c"
-#define YY_NO_UNPUT
+static size_t encoding_width = 1; /* NOTE(review): presumably the width in bytes of one input character for this 1-byte lexer, read by the shared code -- confirm */
%}
%s NORMAL
escape @#{any_char}+@
pointer @{alphanum}{non_at}+@
-%{
-static int current_level=-1;
-static int level_diff=MAXGEDCLEVEL;
-
-#ifdef LEXER_TEST
-YYSTYPE gedcom_lval;
-int line_no = 1;
-#endif
-
-%}
-
%%
- /* The GEDCOM level number is converted into a sequence of opening
- and closing brackets. Simply put, the following GEDCOM fragment:
-
- 0 HEAD
- 1 SOUR genes
- 2 VERS 1.6
- 2 NAME Genes
- 1 DATE 07 OCT 2001
- ...
- 0 TRLR
-
- is converted into:
-
- { HEAD (initial)
- { SOUR genes (1 higher: no closing brackets)
- { VERS 1.6 (1 higher: no closing brackets)
- } { NAME Genes (same level: 1 closing bracket)
- } } { DATE 07 OCT 2001 (1 lower: 2 closing brackets)
- ...
- } { TRLR }
-
- or more clearly:
-
- { HEAD
- { SOUR genes
- { VERS 1.6 }
- { NAME Genes } }
- { DATE 07 OCT 2001
- ... }
- { TRLR }
-
- But because this means that one token is converted into a series
- of tokens, there is some initial code following immediately here
- that returns "pending" tokens. */
-
%{
-char string_buf[MAXGEDCLINELEN+1];
-
-if (level_diff < 1) {
- level_diff++;
- return CLOSE;
-}
-else if (level_diff == 1) {
- level_diff++;
- return OPEN;
-}
-else {
- /* out of brackets... */
-}
-
-#define TO_INTERNAL(str) to_internal(str, yyleng)
-
-#define MKTAGACTION(tag) \
- { gedcom_lval.string = TO_INTERNAL(yytext); \
- BEGIN(NORMAL); \
- return TAG_##tag; }
+#define IN_LEX /* second inclusion, now with IN_LEX defined: include only a specific part of the following file */
+#include "gedcom_lex_common.c"
+ACTION_BEFORE_REGEXPS /* NOTE(review): macro body not visible here; presumably the code run at the start of each lexer call (e.g. returning pending OPEN/CLOSE tokens) -- confirm */
+
%}
-<INITIAL>{gen_delim}* /* ignore leading whitespace (also tabs) */
+<INITIAL>{gen_delim}* ACTION_INITIAL_WHITESPACE /* NOTE(review): the ACTION_* macros are not defined in this view (presumably in gedcom_lex_common.c) */
-<INITIAL>0{digit}+ { gedcom_error ("Level number with leading zero");
- return BADTOKEN;
- }
+<INITIAL>0{digit}+ ACTION_0_DIGITS
-<INITIAL>{digit}+ { int level = atoi(TO_INTERNAL(yytext));
- if ((level < 0) || (level > MAXGEDCLEVEL)) {
- gedcom_error ("Level number out of range [0..%d]",
- MAXGEDCLEVEL);
- return BADTOKEN;
- }
- level_diff = level - current_level;
- BEGIN(EXPECT_TAG);
- current_level = level;
- if (level_diff < 1) {
- level_diff++;
- return CLOSE;
- }
- else if (level_diff == 1) {
- level_diff++;
- return OPEN;
- }
- else {
- /* should never happen (error to GEDCOM spec) */
- gedcom_error ("GEDCOM level number is %d higher than "
- "previous",
- level_diff);
- return BADTOKEN;
- }
- }
+<INITIAL>{digit}+ ACTION_DIGITS /* presumably converts the level number into OPEN/CLOSE tokens and enters EXPECT_TAG -- confirm */
<EXPECT_TAG>ABBR MKTAGACTION(ABBR)
<EXPECT_TAG>ADDR MKTAGACTION(ADDR)
<EXPECT_TAG>WIFE MKTAGACTION(WIFE)
<EXPECT_TAG>WILL MKTAGACTION(WILL)
-<EXPECT_TAG>{alphanum}+ { if (strlen(yytext) > MAXGEDCTAGLEN) {
- gedcom_error("Tag '%s' too long, max %d chars");
- return BADTOKEN;
- }
- strncpy(string_buf, yytext, MAXGEDCTAGLEN+1);
- gedcom_lval.string = TO_INTERNAL(string_buf);
- BEGIN(NORMAL);
- return USERTAG;
- }
+<EXPECT_TAG>{alphanum}+ ACTION_ALPHANUM /* NOTE(review): the inline action this replaces called gedcom_error with '%s'/'%d' but no arguments -- verify the macro supplies them */
-{delim} { gedcom_lval.string = TO_INTERNAL(yytext);
- return DELIM;
- }
+{delim} ACTION_DELIM
-{any_but_delim} { gedcom_lval.string = TO_INTERNAL(yytext);
- /* Due to character conversions, it is possible
- that the current character will be combined with
- the next, and so now we don't have a character yet...
- This is only applicable to the 1byte case (e.g. ANSEL).
- */
- if (strlen(gedcom_lval.string) > 0)
- return ANYCHAR;
- }
+{any_but_delim} ACTION_ANY
-{escape}/{non_at} { gedcom_lval.string = TO_INTERNAL(yytext);
- return ESCAPE;
- }
+{escape}/{non_at} ACTION_ESCAPE
-{pointer} { gedcom_lval.string = TO_INTERNAL(yytext);
- return POINTER;
- }
+{pointer} ACTION_POINTER
- /* Due to the conversion of level numbers into brackets, the
- terminator is not important, so no token is returned here.
- Although not strictly according to the GEDCOM spec, we'll ignore
- whitespace just before the terminator.
- */
+{gen_delim}*{terminator} ACTION_TERMINATOR /* presumably counts the line and switches back to INITIAL without returning a token -- confirm */
-{gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); }
+<<EOF>> ACTION_EOF /* presumably returns the final CLOSE (for the trailer) before terminating -- confirm */
- /* Eventually we have to return 1 closing bracket (for the trailer).
- We can detect whether we have sent the closing bracket using the
- level_diff (at eof, first it is 2, then we increment it ourselves) */
-
-<<EOF>> { if (level_diff == 2) {
- level_diff++;
- return CLOSE;
- }
- else {
- yyterminate();
- }
- }
-
-. { gedcom_error("Unexpected character: '%s' (0x%02x)",
- yytext, yytext[0]);
- return BADTOKEN;
- }
+. ACTION_UNEXPECTED
%%
}
#ifdef LEXER_TEST
+int gedcom_lex() /* LEXER_TEST-only wrapper: returns whatever gedcom_1byte_lex()
+                    returns. NOTE(review): presumably lets shared test code call
+                    this lexer under the generic name gedcom_lex -- confirm. */
+{
+ return gedcom_1byte_lex();
+}
+
/* Entry point of the lexer test program; only compiled when LEXER_TEST is
   defined. Its exit status is the value returned by test_loop(). */
int main()
{
- int tok, res;
- init_encodings();
- set_encoding_width(ONE_BYTE);
- res = open_conv_to_internal("ASCII");
- if (!res) {
- gedcom_error("Unable to open conversion context: %s",
- strerror(errno));
- return 1;
- }
- tok = gedcom_1byte_lex();
- while (tok) {
- switch(tok) {
- case BADTOKEN: printf("BADTOKEN "); break;
- case OPEN: printf("OPEN "); break;
- case CLOSE: printf("CLOSE "); break;
- case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
- case DELIM: printf("DELIM "); break;
- case ANYCHAR: printf("%s ", gedcom_lval.string); break;
- case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
- case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break;
- default: printf("TAG(%s) ", gedcom_lval.string); break;
- }
- tok = gedcom_1byte_lex();
- }
- printf("\n");
- close_conv_to_internal();
- return 0;
+ return test_loop(ONE_BYTE, "ASCII"); /* NOTE(review): test_loop is not defined in this view; presumably it does the encoding setup and token-printing loop for the given width and encoding -- confirm */
}
#endif