%%
- /* The GEDCOM level number is converted into a sequence of opening
- and closing brackets. Simply put, the following GEDCOM fragment:
-
- 0 HEAD
- 1 SOUR genes
- 2 VERS 1.6
- 2 NAME Genes
- 1 DATE 07 OCT 2001
- ...
- 0 TRLR
-
- is converted into:
-
- { HEAD (initial)
- { SOUR genes (1 higher: no closing brackets)
- { VERS 1.6 (1 higher: no closing brackets)
- } { NAME Genes (same level: 1 closing bracket)
- } } { DATE 07 OCT 2001 (1 lower: 2 closing brackets)
- ...
- } { TRLR }
-
- or more clearly:
-
- { HEAD
- { SOUR genes
- { VERS 1.6 }
- { NAME Genes } }
- { DATE 07 OCT 2001
- ... }
- { TRLR }
-
- But because this means that one token is converted into a series
- of tokens, there is some initial code following immediately here
- that returns "pending" tokens. */
-
%{
-char string_buf[MAXGEDCLINELEN+1];
-
-if (level_diff < 1) {
- level_diff++;
- return CLOSE;
-}
-else if (level_diff == 1) {
- level_diff++;
- gedcom_lval.number = current_level;
- return OPEN;
-}
-else {
- /* out of brackets... */
-}
-
-#define TO_INTERNAL(str) to_internal(str, yyleng)
-
-#define MKTAGACTION(tag) \
- { gedcom_lval.string = TO_INTERNAL(yytext); \
- BEGIN(NORMAL); \
- return TAG_##tag; }
+#include "gedcom_lex_common.c"
+ACTION_BEFORE_REGEXPS
+
%}
<INITIAL>{gen_delim}* /* ignore leading whitespace (also tabs) */
-<INITIAL>0{digit}+ { gedcom_error ("Level number with leading zero");
- return BADTOKEN;
- }
+<INITIAL>0{digit}+ ACTION_0_DIGITS
-<INITIAL>{digit}+ { int level = atoi(TO_INTERNAL(yytext));
- if ((level < 0) || (level > MAXGEDCLEVEL)) {
- gedcom_error ("Level number out of range [0..%d]",
- MAXGEDCLEVEL);
- return BADTOKEN;
- }
- level_diff = level - current_level;
- BEGIN(EXPECT_TAG);
- current_level = level;
- if (level_diff < 1) {
- level_diff++;
- return CLOSE;
- }
- else if (level_diff == 1) {
- level_diff++;
- gedcom_lval.number = current_level;
- return OPEN;
- }
- else {
- /* should never happen (error to GEDCOM spec) */
- gedcom_error ("GEDCOM level number is %d higher than "
- "previous",
- level_diff);
- return BADTOKEN;
- }
- }
+<INITIAL>{digit}+ ACTION_DIGITS
<EXPECT_TAG>ABBR MKTAGACTION(ABBR)
<EXPECT_TAG>ADDR MKTAGACTION(ADDR)
<EXPECT_TAG>WIFE MKTAGACTION(WIFE)
<EXPECT_TAG>WILL MKTAGACTION(WILL)
-<EXPECT_TAG>{alphanum}+ { if (strlen(yytext) > MAXGEDCTAGLEN) {
- gedcom_error("Tag '%s' too long, max %d chars");
- return BADTOKEN;
- }
- strncpy(string_buf, yytext, MAXGEDCTAGLEN+1);
- gedcom_lval.string = TO_INTERNAL(string_buf);
- BEGIN(NORMAL);
- return USERTAG;
- }
-
-{delim} { gedcom_lval.string = TO_INTERNAL(yytext);
- return DELIM;
- }
-
-{any_but_delim} { gedcom_lval.string = TO_INTERNAL(yytext);
- /* Due to character conversions, it is possible
- that the current character will be combined with
- the next, and so now we don't have a character yet...
- This is only applicable to the 1byte case (e.g. ANSEL).
- */
- if (strlen(gedcom_lval.string) > 0)
- return ANYCHAR;
- }
+<EXPECT_TAG>{alphanum}+ ACTION_ALPHANUM
-{escape}/{non_at} { gedcom_lval.string = TO_INTERNAL(yytext);
- return ESCAPE;
- }
+{delim} ACTION_DELIM
-{pointer} { gedcom_lval.string = TO_INTERNAL(yytext);
- return POINTER;
- }
+{any_but_delim} ACTION_ANY
- /* Due to the conversion of level numbers into brackets, the
- terminator is not important, so no token is returned here.
- Although not strictly according to the GEDCOM spec, we'll ignore
- whitespace just before the terminator.
- */
+{escape}/{non_at} ACTION_ESCAPE
-{gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); }
+{pointer} ACTION_POINTER
- /* Eventually we have to return 1 closing bracket (for the trailer).
- We can detect whether we have sent the closing bracket using the
- level_diff (at eof, first it is 2, then we increment it ourselves) */
+{gen_delim}*{terminator} ACTION_TERMINATOR
-<<EOF>> { if (level_diff == 2) {
- level_diff++;
- return CLOSE;
- }
- else {
- yyterminate();
- }
- }
+<<EOF>> ACTION_EOF
-. { gedcom_error("Unexpected character: '%s' (0x%02x)",
- yytext, yytext[0]);
- return BADTOKEN;
- }
+. ACTION_UNEXPECTED
%%
while (tok) {
switch(tok) {
case BADTOKEN: printf("BADTOKEN "); break;
- case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
+ case OPEN: printf("OPEN(%d) ", gedcom_lval.level); break;
case CLOSE: printf("CLOSE "); break;
case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
case DELIM: printf("DELIM "); break;
case ANYCHAR: printf("%s ", gedcom_lval.string); break;
- case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
- case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break;
- default: printf("TAG(%s) ", gedcom_lval.string); break;
+ case POINTER: printf("POINTER(%s) ", gedcom_lval.pointer); break;
+ case USERTAG: printf("USERTAG(%s) ", gedcom_lval.tag); break;
+ default: printf("TAG(%s) ", gedcom_lval.tag); break;
}
tok = gedcom_1byte_lex();
}
%%
- /* The GEDCOM level number is converted into a sequence of opening
- and closing brackets. Simply put, the following GEDCOM fragment:
-
- 0 HEAD
- 1 SOUR genes
- 2 VERS 1.6
- 2 NAME Genes
- 1 DATE 07 OCT 2001
- ...
- 0 TRLR
-
- is converted into:
-
- { HEAD (initial)
- { SOUR genes (1 higher: no closing brackets)
- { VERS 1.6 (1 higher: no closing brackets)
- } { NAME Genes (same level: 1 closing bracket)
- } } { DATE 07 OCT 2001 (1 lower: 2 closing brackets)
- ...
- } { TRLR }
-
- or more clearly:
-
- { HEAD
- { SOUR genes
- { VERS 1.6 }
- { NAME Genes } }
- { DATE 07 OCT 2001
- ... }
- { TRLR }
-
- But because this means that one token is converted into a series
- of tokens, there is some initial code following immediately here
- that returns "pending" tokens. */
-
%{
-char string_buf[MAXGEDCLINELEN+1];
-
-if (level_diff < 1) {
- level_diff++;
- return CLOSE;
-}
-else if (level_diff == 1) {
- level_diff++;
- gedcom_lval.number = current_level;
- return OPEN;
-}
-else {
- /* out of brackets... */
-}
-
-#define TO_INTERNAL(str) to_internal(str, yyleng)
-
-#define MKTAGACTION(tag) \
- { gedcom_lval.string = TO_INTERNAL(yytext); \
- BEGIN(NORMAL); \
- return TAG_##tag; }
+#include "gedcom_lex_common.c"
+ACTION_BEFORE_REGEXPS
+
%}
<INITIAL>{gen_delim}* /* ignore leading whitespace (also tabs) */
-<INITIAL>\x00[0]{digit}+ { gedcom_error ("Level number with leading zero");
- return BADTOKEN;
- }
+<INITIAL>\x00[0]{digit}+ ACTION_0_DIGITS
-<INITIAL>{digit}+ { int level = atoi(TO_INTERNAL(yytext));
- if ((level < 0) || (level > MAXGEDCLEVEL)) {
- gedcom_error ("Level number out of range [0..%d]",
- MAXGEDCLEVEL);
- return BADTOKEN;
- }
- level_diff = level - current_level;
- BEGIN(EXPECT_TAG);
- current_level = level;
- if (level_diff < 1) {
- level_diff++;
- return CLOSE;
- }
- else if (level_diff == 1) {
- level_diff++;
- gedcom_lval.number = current_level;
- return OPEN;
- }
- else {
- /* should never happen (error to GEDCOM spec) */
- gedcom_error ("GEDCOM level number is %d higher than "
- "previous",
- level_diff);
- return BADTOKEN;
- }
- }
+<INITIAL>{digit}+ ACTION_DIGITS
<EXPECT_TAG>\x00A\x00B\x00B\x00R MKTAGACTION(ABBR)
<EXPECT_TAG>\x00A\x00D\x00D\x00R MKTAGACTION(ADDR)
<EXPECT_TAG>\x00W\x00I\x00F\x00E MKTAGACTION(WIFE)
<EXPECT_TAG>\x00W\x00I\x00L\x00L MKTAGACTION(WILL)
-<EXPECT_TAG>{alphanum}+ { if (strlen(yytext) > MAXGEDCTAGLEN) {
- gedcom_error("Tag '%s' too long, max %d chars");
- return BADTOKEN;
- }
- strncpy(string_buf, yytext, MAXGEDCTAGLEN+1);
- gedcom_lval.string = TO_INTERNAL(string_buf);
- BEGIN(NORMAL);
- return USERTAG;
- }
-
-{delim} { gedcom_lval.string = TO_INTERNAL(yytext);
- return DELIM;
- }
-
-{any_but_delim} { gedcom_lval.string = TO_INTERNAL(yytext);
- return ANYCHAR;
- }
+<EXPECT_TAG>{alphanum}+ ACTION_ALPHANUM
-{escape}/{non_at} { gedcom_lval.string = TO_INTERNAL(yytext);
- return ESCAPE;
- }
+{delim} ACTION_DELIM
-{pointer} { gedcom_lval.string = TO_INTERNAL(yytext);
- return POINTER;
- }
+{any_but_delim} ACTION_ANY
- /* Due to the conversion of level numbers into brackets, the
- terminator is not important, so no token is returned here.
- Although not strictly according to the GEDCOM spec, we'll ignore
- whitespace just before the terminator.
- */
+{escape}/{non_at} ACTION_ESCAPE
-{gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); }
+{pointer} ACTION_POINTER
- /* Eventually we have to return 1 closing bracket (for the trailer).
- We can detect whether we have sent the closing bracket using the
- level_diff (at eof, first it is 2, then we increment it ourselves) */
+{gen_delim}*{terminator} ACTION_TERMINATOR
-<<EOF>> { if (level_diff == 2) {
- level_diff++;
- return CLOSE;
- }
- else {
- yyterminate();
- }
- }
+<<EOF>> ACTION_EOF
-. { gedcom_error("Unexpected character: '%s' (0x%02x)",
- yytext, yytext[0]);
- return BADTOKEN;
- }
+. ACTION_UNEXPECTED
%%
while (tok) {
switch(tok) {
case BADTOKEN: printf("BADTOKEN "); break;
- case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
+ case OPEN: printf("OPEN(%d) ", gedcom_lval.level); break;
case CLOSE: printf("CLOSE "); break;
case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
case DELIM: printf("DELIM "); break;
case ANYCHAR: printf("%s ", gedcom_lval.string); break;
- case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
- case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break;
- default: printf("TAG(%s) ", gedcom_lval.string); break;
+ case POINTER: printf("POINTER(%s) ", gedcom_lval.pointer); break;
+ case USERTAG: printf("USERTAG(%s) ", gedcom_lval.tag); break;
+ default: printf("TAG(%s) ", gedcom_lval.tag); break;
}
tok = gedcom_hilo_lex();
}
%%
- /* The GEDCOM level number is converted into a sequence of opening
- and closing brackets. Simply put, the following GEDCOM fragment:
-
- 0 HEAD
- 1 SOUR genes
- 2 VERS 1.6
- 2 NAME Genes
- 1 DATE 07 OCT 2001
- ...
- 0 TRLR
-
- is converted into:
-
- { HEAD (initial)
- { SOUR genes (1 higher: no closing brackets)
- { VERS 1.6 (1 higher: no closing brackets)
- } { NAME Genes (same level: 1 closing bracket)
- } } { DATE 07 OCT 2001 (1 lower: 2 closing brackets)
- ...
- } { TRLR }
-
- or more clearly:
-
- { HEAD
- { SOUR genes
- { VERS 1.6 }
- { NAME Genes } }
- { DATE 07 OCT 2001
- ... }
- { TRLR }
-
- But because this means that one token is converted into a series
- of tokens, there is some initial code following immediately here
- that returns "pending" tokens. */
-
%{
-char string_buf[MAXGEDCLINELEN+1];
-
-if (level_diff < 1) {
- level_diff++;
- return CLOSE;
-}
-else if (level_diff == 1) {
- level_diff++;
- gedcom_lval.number = current_level;
- return OPEN;
-}
-else {
- /* out of brackets... */
-}
-
-#define TO_INTERNAL(str) to_internal(str, yyleng)
-
-#define MKTAGACTION(tag) \
- { gedcom_lval.string = TO_INTERNAL(yytext); \
- BEGIN(NORMAL); \
- return TAG_##tag; }
+#include "gedcom_lex_common.c"
+ACTION_BEFORE_REGEXPS
+
%}
<INITIAL>{gen_delim}* /* ignore leading whitespace (also tabs) */
-<INITIAL>\x00[0]{digit}+ { gedcom_error ("Level number with leading zero");
- return BADTOKEN;
- }
+<INITIAL>[0]\x00{digit}+ ACTION_0_DIGITS /* lo-hi order: low byte first, then NUL high byte (cf. tag rules) */
-<INITIAL>{digit}+ { int level = atoi(TO_INTERNAL(yytext));
- if ((level < 0) || (level > MAXGEDCLEVEL)) {
- gedcom_error ("Level number out of range [0..%d]",
- MAXGEDCLEVEL);
- return BADTOKEN;
- }
- level_diff = level - current_level;
- BEGIN(EXPECT_TAG);
- current_level = level;
- if (level_diff < 1) {
- level_diff++;
- return CLOSE;
- }
- else if (level_diff == 1) {
- level_diff++;
- gedcom_lval.number = current_level;
- return OPEN;
- }
- else {
- /* should never happen (error to GEDCOM spec) */
- gedcom_error ("GEDCOM level number is %d higher than "
- "previous",
- level_diff);
- return BADTOKEN;
- }
- }
+<INITIAL>{digit}+ ACTION_DIGITS
<EXPECT_TAG>A\x00B\x00B\x00R\x00 MKTAGACTION(ABBR)
<EXPECT_TAG>A\x00D\x00D\x00R\x00 MKTAGACTION(ADDR)
<EXPECT_TAG>W\x00I\x00F\x00E\x00 MKTAGACTION(WIFE)
<EXPECT_TAG>W\x00I\x00L\x00L\x00 MKTAGACTION(WILL)
-<EXPECT_TAG>{alphanum}+ { if (strlen(yytext) > MAXGEDCTAGLEN) {
- gedcom_error("Tag '%s' too long, max %d chars");
- return BADTOKEN;
- }
- strncpy(string_buf, yytext, MAXGEDCTAGLEN+1);
- gedcom_lval.string = TO_INTERNAL(string_buf);
- BEGIN(NORMAL);
- return USERTAG;
- }
-
-{delim} { gedcom_lval.string = TO_INTERNAL(yytext);
- return DELIM;
- }
-
-{any_but_delim} { gedcom_lval.string = TO_INTERNAL(yytext);
- return ANYCHAR;
- }
+<EXPECT_TAG>{alphanum}+ ACTION_ALPHANUM
-{escape}/{non_at} { gedcom_lval.string = TO_INTERNAL(yytext);
- return ESCAPE;
- }
+{delim} ACTION_DELIM
-{pointer} { gedcom_lval.string = TO_INTERNAL(yytext);
- return POINTER;
- }
+{any_but_delim} ACTION_ANY
- /* Due to the conversion of level numbers into brackets, the
- terminator is not important, so no token is returned here.
- Although not strictly according to the GEDCOM spec, we'll ignore
- whitespace just before the terminator.
- */
+{escape}/{non_at} ACTION_ESCAPE
-{gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); }
+{pointer} ACTION_POINTER
- /* Eventually we have to return 1 closing bracket (for the trailer).
- We can detect whether we have sent the closing bracket using the
- level_diff (at eof, first it is 2, then we increment it ourselves) */
+{gen_delim}*{terminator} ACTION_TERMINATOR
-<<EOF>> { if (level_diff == 2) {
- level_diff++;
- return CLOSE;
- }
- else {
- yyterminate();
- }
- }
+<<EOF>> ACTION_EOF
-. { gedcom_error("Unexpected character: '%s' (0x%02x)",
- yytext, yytext[0]);
- return BADTOKEN;
- }
+. ACTION_UNEXPECTED
%%
while (tok) {
switch(tok) {
case BADTOKEN: printf("BADTOKEN "); break;
- case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
+ case OPEN: printf("OPEN(%d) ", gedcom_lval.level); break;
case CLOSE: printf("CLOSE "); break;
case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
case DELIM: printf("DELIM "); break;
case ANYCHAR: printf("%s ", gedcom_lval.string); break;
- case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
- case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break;
- default: printf("TAG(%s) ", gedcom_lval.string); break;
+ case POINTER: printf("POINTER(%s) ", gedcom_lval.pointer); break;
+ case USERTAG: printf("USERTAG(%s) ", gedcom_lval.tag); break;
+ default: printf("TAG(%s) ", gedcom_lval.tag); break;
}
tok = gedcom_lohi_lex();
}