X-Git-Url: https://git.dlugolecki.net.pl/?a=blobdiff_plain;f=gedcom_lex_common.c;h=bc7387de4f8797aabaf61916c1fc6cd3437c3cd5;hb=471aa410dba0a5604cc40f4d23fc34efb098c778;hp=6d298222206d1afc96991744643222d1cbb08d7d;hpb=45a9528c2705679e8556c9bbfe9196233ec14d29;p=gedcom-parse.git diff --git a/gedcom_lex_common.c b/gedcom_lex_common.c index 6d29822..bc7387d 100644 --- a/gedcom_lex_common.c +++ b/gedcom_lex_common.c @@ -10,15 +10,90 @@ /* $Id$ */ /* $Name$ */ -char string_buf[MAXGEDCLINELEN+1]; +#ifndef IN_LEX + +#include "external.h" +#include "gedcom.h" +#include "gedcom.tab.h" +#include "multilex.h" +#include "encoding.h" + +#define YY_NO_UNPUT + +static size_t encoding_width; +static int current_level = -1; +static int level_diff=MAXGEDCLEVEL; +static size_t line_len = 0; + +static char ptr_buf[MAXGEDCPTRLEN * UTF_FACTOR + 1]; +static char tag_buf[MAXGEDCTAGLEN * UTF_FACTOR + 1]; +static char str_buf[MAXGEDCLINELEN * UTF_FACTOR + 1]; + +#ifdef LEXER_TEST +YYSTYPE gedcom_lval; +int line_no = 1; + +int gedcom_lex(); + +int test_loop(ENCODING enc, char* code) +{ + int tok, res; + init_encodings(); + set_encoding_width(enc); + res = open_conv_to_internal(code); + if (!res) { + gedcom_error("Unable to open conversion context: %s", + strerror(errno)); + return 1; + } + tok = gedcom_lex(); + while (tok) { + switch(tok) { + case BADTOKEN: printf("BADTOKEN "); break; + case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break; + case CLOSE: printf("CLOSE "); break; + case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break; + case DELIM: printf("DELIM "); break; + case ANYCHAR: printf("%s ", gedcom_lval.string); break; + case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break; + case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break; + default: printf("TAG(%s) ", gedcom_lval.string); break; + } + tok = gedcom_lex(); + } + printf("\n"); + close_conv_to_internal(); + return 0; +} -#define TO_INTERNAL(str) to_internal(str, yyleng) +#endif /* of #ifdef LEXER_TEST */ + +#else /* of #ifndef IN_LEX */ + +#define TO_INTERNAL(STR,OUTBUF) \ + to_internal(STR, yyleng, OUTBUF, sizeof(OUTBUF)) -#define MKTAGACTION(the_tag) \ - { gedcom_lval.tag = TO_INTERNAL(yytext); \ - BEGIN(NORMAL); \ - return TAG_##the_tag; } +#define INIT_LINE_LEN \ + line_len = 0; +#define CHECK_LINE_LEN \ + { if (line_len != (size_t)-1) { \ + line_len += strlen(yytext); \ + if (line_len > MAXGEDCLINELEN * encoding_width) { \ + gedcom_error("Line too long, max %d characters", \ + MAXGEDCLINELEN); \ + line_len = (size_t)-1; \ + return BADTOKEN; \ + } \ + } \ + } + +#define MKTAGACTION(THETAG) \ + { CHECK_LINE_LEN; \ + gedcom_lval.string = TO_INTERNAL(yytext, tag_buf); \ + BEGIN(NORMAL); \ + return TAG_##THETAG; \ + } /* The GEDCOM level number is converted into a sequence of opening and closing brackets. Simply put, the following GEDCOM fragment: @@ -62,13 +137,19 @@ char string_buf[MAXGEDCLINELEN+1]; } \ else if (level_diff == 1) { \ level_diff++; \ - gedcom_lval.level = current_level; \ + gedcom_lval.number = current_level; \ return OPEN; \ } \ else { \ /* out of brackets... */ \ } \ - } + } + + +#define ACTION_INITIAL_WHITESPACE \ + { CHECK_LINE_LEN; \ + /* ignore initial whitespace further */ \ + } #define ACTION_0_DIGITS \ @@ -78,7 +159,8 @@ char string_buf[MAXGEDCLINELEN+1]; #define ACTION_DIGITS \ - { int level = atoi(TO_INTERNAL(yytext)); \ + { int level = atoi(TO_INTERNAL(yytext, str_buf)); \ + CHECK_LINE_LEN; \ if ((level < 0) || (level > MAXGEDCLEVEL)) { \ gedcom_error ("Level number out of range [0..%d]", \ MAXGEDCLEVEL); \ @@ -93,7 +175,7 @@ char string_buf[MAXGEDCLINELEN+1]; } \ else if (level_diff == 1) { \ level_diff++; \ - gedcom_lval.level = current_level; \ + gedcom_lval.number = current_level; \ return OPEN; \ } \ else { \ @@ -107,28 +189,31 @@ char string_buf[MAXGEDCLINELEN+1]; #define ACTION_ALPHANUM \ - { if (strlen(yytext) > MAXGEDCTAGLEN) { \ - gedcom_error("Tag '%s' too long, max %d chars"); \ + { if (strlen(yytext) > MAXGEDCTAGLEN * encoding_width) { \ + gedcom_error("Tag '%s' too long, max %d characters", \ + yytext, MAXGEDCTAGLEN); \ return BADTOKEN; \ } \ - strncpy(string_buf, yytext, MAXGEDCTAGLEN+1); \ - gedcom_lval.tag = TO_INTERNAL(string_buf); \ + CHECK_LINE_LEN; \ + gedcom_lval.string = TO_INTERNAL(yytext, tag_buf); \ BEGIN(NORMAL); \ return USERTAG; \ } #define ACTION_DELIM \ - { gedcom_lval.string = TO_INTERNAL(yytext); \ + { CHECK_LINE_LEN; \ + gedcom_lval.string = TO_INTERNAL(yytext, str_buf); \ return DELIM; \ } #define ACTION_ANY \ - { gedcom_lval.string = TO_INTERNAL(yytext); \ - /* Due to character conversions, it is possible \ - that the current character will be combined with \ - the next, and so now we don't have a character yet... \ + { CHECK_LINE_LEN; \ + gedcom_lval.string = TO_INTERNAL(yytext, str_buf); \ + /* Due to character conversions, it is possible that the current \ + character will be combined with the next, and so now we don't have a \ + character yet... \ In principle, this is only applicable to the 1byte case (e.g. ANSEL), \ but it doesn't harm the unicode case. \ */ \ @@ -138,13 +223,20 @@ char string_buf[MAXGEDCLINELEN+1]; #define ACTION_ESCAPE \ - { gedcom_lval.string = TO_INTERNAL(yytext); \ + { CHECK_LINE_LEN; \ + gedcom_lval.string = TO_INTERNAL(yytext, str_buf); \ return ESCAPE; \ } #define ACTION_POINTER \ - { gedcom_lval.pointer = TO_INTERNAL(yytext); \ + { CHECK_LINE_LEN; \ + if (strlen(yytext) > MAXGEDCPTRLEN * encoding_width) { \ + gedcom_error("Pointer '%s' too long, max %d characters", \ + yytext, MAXGEDCPTRLEN); \ + return BADTOKEN; \ + } \ + gedcom_lval.string = TO_INTERNAL(yytext, ptr_buf); \ return POINTER; \ } @@ -156,7 +248,9 @@ char string_buf[MAXGEDCLINELEN+1]; */ #define ACTION_TERMINATOR \ - { line_no++; \ + { CHECK_LINE_LEN; \ + INIT_LINE_LEN; \ + line_no++; \ BEGIN(INITIAL); \ } @@ -172,6 +266,10 @@ char string_buf[MAXGEDCLINELEN+1]; return CLOSE; \ } \ else { \ + /* Reset our state */ \ + current_level = -1; \ + level_diff = MAXGEDCLEVEL; \ + /* ... then terminate lex */ \ yyterminate(); \ } \ } @@ -182,3 +280,5 @@ char string_buf[MAXGEDCLINELEN+1]; yytext, yytext[0]); \ return BADTOKEN; \ } + +#endif /* IN_LEX */