X-Git-Url: https://git.dlugolecki.net.pl/?a=blobdiff_plain;f=gedcom_lex_common.c;h=bc7387de4f8797aabaf61916c1fc6cd3437c3cd5;hb=c881bdbd69886d7f37055a9f2802a2e106e50fdb;hp=41f2da1bb19653927c4c2a6cc65244a8fe38cf6d;hpb=edc367259f736f18ad9426a68efc153d75178e4d;p=gedcom-parse.git diff --git a/gedcom_lex_common.c b/gedcom_lex_common.c index 41f2da1..bc7387d 100644 --- a/gedcom_lex_common.c +++ b/gedcom_lex_common.c @@ -12,16 +12,23 @@ #ifndef IN_LEX -#include "gedcom.tab.h" +#include "external.h" #include "gedcom.h" +#include "gedcom.tab.h" #include "multilex.h" #include "encoding.h" #define YY_NO_UNPUT - -static int current_level=-1; + +static size_t encoding_width; +static int current_level = -1; static int level_diff=MAXGEDCLEVEL; - +static size_t line_len = 0; + +static char ptr_buf[MAXGEDCPTRLEN * UTF_FACTOR + 1]; +static char tag_buf[MAXGEDCTAGLEN * UTF_FACTOR + 1]; +static char str_buf[MAXGEDCLINELEN * UTF_FACTOR + 1]; + #ifdef LEXER_TEST YYSTYPE gedcom_lval; int line_no = 1; @@ -43,14 +50,14 @@ int test_loop(ENCODING enc, char* code) while (tok) { switch(tok) { case BADTOKEN: printf("BADTOKEN "); break; - case OPEN: printf("OPEN(%d) ", gedcom_lval.level); break; + case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break; case CLOSE: printf("CLOSE "); break; case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break; case DELIM: printf("DELIM "); break; case ANYCHAR: printf("%s ", gedcom_lval.string); break; - case POINTER: printf("POINTER(%s) ", gedcom_lval.pointer); break; - case USERTAG: printf("USERTAG(%s) ", gedcom_lval.tag); break; - default: printf("TAG(%s) ", gedcom_lval.tag); break; + case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break; + case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break; + default: printf("TAG(%s) ", gedcom_lval.string); break; } tok = gedcom_lex(); } @@ -63,17 +70,31 @@ int test_loop(ENCODING enc, char* code) #else /* of #ifndef IN_LEX */ -char string_buf[MAXGEDCLINELEN+1]; - -#define TO_INTERNAL(str) to_internal(str, yyleng) +#define TO_INTERNAL(STR,OUTBUF) \ + to_internal(STR, yyleng, OUTBUF, sizeof(OUTBUF)) + +#define INIT_LINE_LEN \ + line_len = 0; + +#define CHECK_LINE_LEN \ + { if (line_len != (size_t)-1) { \ + line_len += strlen(yytext); \ + if (line_len > MAXGEDCLINELEN * encoding_width) { \ + gedcom_error("Line too long, max %d characters", \ + MAXGEDCLINELEN); \ + line_len = (size_t)-1; \ + return BADTOKEN; \ + } \ + } \ + } -#define MKTAGACTION(the_tag) \ - { gedcom_lval.tag = TO_INTERNAL(yytext); \ +#define MKTAGACTION(THETAG) \ + { CHECK_LINE_LEN; \ + gedcom_lval.string = TO_INTERNAL(yytext, tag_buf); \ BEGIN(NORMAL); \ - return TAG_##the_tag; \ + return TAG_##THETAG; \ } - /* The GEDCOM level number is converted into a sequence of opening and closing brackets. Simply put, the following GEDCOM fragment: @@ -116,13 +137,19 @@ char string_buf[MAXGEDCLINELEN+1]; } \ else if (level_diff == 1) { \ level_diff++; \ - gedcom_lval.level = current_level; \ + gedcom_lval.number = current_level; \ return OPEN; \ } \ else { \ /* out of brackets... */ \ } \ - } + } + + +#define ACTION_INITIAL_WHITESPACE \ + { CHECK_LINE_LEN; \ + /* ignore initial whitespace further */ \ + } #define ACTION_0_DIGITS \ @@ -132,7 +159,8 @@ char string_buf[MAXGEDCLINELEN+1]; #define ACTION_DIGITS \ - { int level = atoi(TO_INTERNAL(yytext)); \ + { int level = atoi(TO_INTERNAL(yytext, str_buf)); \ + CHECK_LINE_LEN; \ if ((level < 0) || (level > MAXGEDCLEVEL)) { \ gedcom_error ("Level number out of range [0..%d]", \ MAXGEDCLEVEL); \ @@ -147,7 +175,7 @@ char string_buf[MAXGEDCLINELEN+1]; } \ else if (level_diff == 1) { \ level_diff++; \ - gedcom_lval.level = current_level; \ + gedcom_lval.number = current_level; \ return OPEN; \ } \ else { \ @@ -161,28 +189,31 @@ char string_buf[MAXGEDCLINELEN+1]; #define ACTION_ALPHANUM \ - { if (strlen(yytext) > MAXGEDCTAGLEN) { \ - gedcom_error("Tag '%s' too long, max %d chars"); \ + { if (strlen(yytext) > MAXGEDCTAGLEN * encoding_width) { \ + gedcom_error("Tag '%s' too long, max %d characters", \ + yytext, MAXGEDCTAGLEN); \ return BADTOKEN; \ } \ - strncpy(string_buf, yytext, MAXGEDCTAGLEN+1); \ - gedcom_lval.tag = TO_INTERNAL(string_buf); \ + CHECK_LINE_LEN; \ + gedcom_lval.string = TO_INTERNAL(yytext, tag_buf); \ BEGIN(NORMAL); \ return USERTAG; \ } #define ACTION_DELIM \ - { gedcom_lval.string = TO_INTERNAL(yytext); \ + { CHECK_LINE_LEN; \ + gedcom_lval.string = TO_INTERNAL(yytext, str_buf); \ return DELIM; \ } #define ACTION_ANY \ - { gedcom_lval.string = TO_INTERNAL(yytext); \ - /* Due to character conversions, it is possible \ - that the current character will be combined with \ - the next, and so now we don't have a character yet... \ + { CHECK_LINE_LEN; \ + gedcom_lval.string = TO_INTERNAL(yytext, str_buf); \ + /* Due to character conversions, it is possible that the current \ + character will be combined with the next, and so now we don't have a \ + character yet... \ In principle, this is only applicable to the 1byte case (e.g. ANSEL), \ but it doesn't harm the unicode case. \ */ \ @@ -192,13 +223,20 @@ char string_buf[MAXGEDCLINELEN+1]; #define ACTION_ESCAPE \ - { gedcom_lval.string = TO_INTERNAL(yytext); \ + { CHECK_LINE_LEN; \ + gedcom_lval.string = TO_INTERNAL(yytext, str_buf); \ return ESCAPE; \ } #define ACTION_POINTER \ - { gedcom_lval.pointer = TO_INTERNAL(yytext); \ + { CHECK_LINE_LEN; \ + if (strlen(yytext) > MAXGEDCPTRLEN * encoding_width) { \ + gedcom_error("Pointer '%s' too long, max %d characters", \ + yytext, MAXGEDCPTRLEN); \ + return BADTOKEN; \ + } \ + gedcom_lval.string = TO_INTERNAL(yytext, ptr_buf); \ return POINTER; \ } @@ -210,7 +248,9 @@ char string_buf[MAXGEDCLINELEN+1]; */ #define ACTION_TERMINATOR \ - { line_no++; \ + { CHECK_LINE_LEN; \ + INIT_LINE_LEN; \ + line_no++; \ BEGIN(INITIAL); \ } @@ -226,6 +266,10 @@ char string_buf[MAXGEDCLINELEN+1]; return CLOSE; \ } \ else { \ + /* Reset our state */ \ + current_level = -1; \ + level_diff = MAXGEDCLEVEL; \ + /* ... then terminate lex */ \ yyterminate(); \ } \ }