X-Git-Url: https://git.dlugolecki.net.pl/?a=blobdiff_plain;f=gedcom%2Fgedcom_lex_common.c;h=06c1f77a8a5fefb51696d6fb7ac63e9b1c7f917c;hb=6103dd898c4de86c68891cc0222543988a2caab7;hp=109da8a1c33035ca9d26197fca562e45267c8ba3;hpb=44bfd161d5274ff6a39f3640d745b9bbabe715ad;p=gedcom-parse.git diff --git a/gedcom/gedcom_lex_common.c b/gedcom/gedcom_lex_common.c index 109da8a..06c1f77 100644 --- a/gedcom/gedcom_lex_common.c +++ b/gedcom/gedcom_lex_common.c @@ -21,28 +21,34 @@ /* $Id$ */ /* $Name$ */ -#ifndef IN_LEX +#if LEX_SECTION == 1 #include "gedcom_internal.h" #include "multilex.h" #include "encoding.h" +#include "encoding_state.h" #include "gedcom.h" #include "gedcom.tabgen.h" #include "compat.h" static size_t encoding_width; static int current_level = -1; -static int level_diff=MAXGEDCLEVEL; +static int level_diff = MAXGEDCLEVEL; static size_t line_len = 0; +static int tab_space = 0; +static int current_tag = -1; -static char ptr_buf[MAXGEDCPTRLEN * UTF_FACTOR + 1]; -static char tag_buf[MAXGEDCTAGLEN * UTF_FACTOR + 1]; -static char str_buf[MAXGEDCLINELEN * UTF_FACTOR + 1]; +static struct conv_buffer* ptr_buffer = NULL; +static struct conv_buffer* tag_buffer = NULL; +static struct conv_buffer* str_buffer = NULL; + +#define INITIAL_PTR_BUFFER_LEN MAXGEDCPTRLEN * UTF_FACTOR + 1 +#define INITIAL_TAG_BUFFER_LEN MAXGEDCTAGLEN * UTF_FACTOR + 1 +#define INITIAL_STR_BUFFER_LEN MAXGEDCLINELEN * UTF_FACTOR + 1 #ifdef LEXER_TEST YYSTYPE gedcom_lval; int line_no = 1; -int compat_at = 0; int gedcom_lex(); @@ -51,7 +57,7 @@ void message_handler(Gedcom_msg_type type, char *msg) fprintf(stderr, "(%d) %s\n", type, msg); } -int test_loop(ENCODING enc, char* code) +int test_loop(ENCODING enc, const char* code) { int tok, res; init_encodings(); @@ -85,10 +91,71 @@ int test_loop(ENCODING enc, char* code) #endif /* of #ifdef LEXER_TEST */ -#else /* of #ifndef IN_LEX */ +/* These are defined as functions here, because xgettext has trouble + extracting the strings out of long pre-processor defined */ + +static void error_line_too_long() +{ + gedcom_error(_("Line too long, max %d characters allowed"), + MAXGEDCLINELEN); +} + +static void error_level_leading_zero() +{ + gedcom_error (_("Level number with leading zero not allowed")); +} + +static void error_level_out_of_range() +{ + gedcom_error (_("Level number out of range [0..%d]"), MAXGEDCLEVEL); +} + +static void error_level_too_high(int level_diff) +{ + gedcom_error (_("GEDCOM level number is %d higher than previous"), + level_diff); +} + +static void error_tag_too_long(const char *tag) +{ + gedcom_error(_("Tag '%s' too long, max %d characters allowed"), + tag, MAXGEDCTAGLEN); +} + +static void error_invalid_character(const char *str, char ch) +{ + gedcom_error(_("Invalid character for encoding: '%s' (0x%02x)"), str, ch); +} + +static void error_pointer_too_long(const char *ptr) +{ + gedcom_error(_("Pointer '%s' too long, max %d characters allowed"), + ptr, MAXGEDCPTRLEN); +} + +static void error_at_character() +{ + gedcom_error(_("'@' character should be written as '@@' in values")); +} + +static void error_tab_character() +{ + gedcom_error(_("Tab character is not allowed in values")); +} + +static void error_unexpected_character(const char* str, char ch) +{ + gedcom_error(_("Unexpected character: '%s' (0x%02x)"), str, ch); +} + +/* This is to bypass the iconv conversion (if the input is UTF-8 coming + from the program) */ +static int dummy_conv = 0; + +#elif LEX_SECTION == 2 #define TO_INTERNAL(STR,OUTBUF) \ - to_internal(STR, yyleng, OUTBUF, sizeof(OUTBUF)) + (dummy_conv ? STR : to_internal(STR, yyleng, OUTBUF)) #define INIT_LINE_LEN \ line_len = 0; @@ -96,21 +163,29 @@ int test_loop(ENCODING enc, char* code) #define CHECK_LINE_LEN \ { if (line_len != (size_t)-1) { \ line_len += strlen(yytext); \ - if (line_len > MAXGEDCLINELEN * encoding_width) { \ - gedcom_error(_("Line too long, max %d characters allowed"), \ - MAXGEDCLINELEN); \ + if (line_len > MAXGEDCLINELEN * encoding_width \ + && ! compat_long_line(current_level, current_tag)) { \ + error_line_too_long(); \ line_len = (size_t)-1; \ return BADTOKEN; \ } \ } \ } +#define GENERATE_TAB_SPACE \ + { gedcom_lval.string = " "; \ + tab_space--; \ + return DELIM; \ + } + #define MKTAGACTION(THETAG) \ { CHECK_LINE_LEN; \ - gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buf); \ - gedcom_lval.tag.value = TAG_##THETAG; \ + gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buffer); \ + current_tag = TAG_##THETAG; \ + gedcom_lval.tag.value = current_tag; \ BEGIN(NORMAL); \ - return TAG_##THETAG; \ + line_no++; \ + return current_tag; \ } /* The GEDCOM level number is converted into a sequence of opening @@ -146,10 +221,16 @@ int test_loop(ENCODING enc, char* code) But because this means that one token is converted into a series of tokens, there is some initial code following immediately here - that returns "pending" tokens. */ + that returns "pending" tokens. + + Also, for compatibility tabs are converted into spaces, which is + also handled here */ #define ACTION_BEFORE_REGEXPS \ - { if (level_diff < 1) { \ + { if (compat_mode(C_TAB_CHARACTER) && tab_space-- > 0) { \ + GENERATE_TAB_SPACE; \ + } \ + else if (level_diff < 1) { \ level_diff++; \ return CLOSE; \ } \ @@ -171,17 +252,17 @@ int test_loop(ENCODING enc, char* code) #define ACTION_0_DIGITS \ - { gedcom_error (_("Level number with leading zero not allowed")); \ + { error_level_leading_zero(); \ return BADTOKEN; \ } #define ACTION_DIGITS \ - { int level = atoi(TO_INTERNAL(yytext, str_buf)); \ + { int level = atoi(TO_INTERNAL(yytext, str_buffer)); \ CHECK_LINE_LEN; \ if ((level < 0) || (level > MAXGEDCLEVEL)) { \ - gedcom_error (_("Level number out of range [0..%d]"), \ - MAXGEDCLEVEL); \ + error_level_out_of_range(); \ + line_no++; \ return BADTOKEN; \ } \ level_diff = level - current_level; \ @@ -198,8 +279,8 @@ int test_loop(ENCODING enc, char* code) } \ else { \ /* should never happen (error to GEDCOM spec) */ \ - gedcom_error (_("GEDCOM level number is %d higher than previous"), \ - level_diff); \ + error_level_too_high(level_diff); \ + line_no++; \ return BADTOKEN; \ } \ } @@ -207,21 +288,22 @@ int test_loop(ENCODING enc, char* code) #define ACTION_ALPHANUM \ { if (strlen(yytext) > MAXGEDCTAGLEN * encoding_width) { \ - gedcom_error(_("Tag '%s' too long, max %d characters allowed"), \ - yytext, MAXGEDCTAGLEN); \ + error_tag_too_long(yytext); \ + line_no++; \ return BADTOKEN; \ } \ CHECK_LINE_LEN; \ - gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buf); \ + gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buffer); \ gedcom_lval.tag.value = USERTAG; \ BEGIN(NORMAL); \ + line_no++; \ return USERTAG; \ } #define ACTION_DELIM \ { CHECK_LINE_LEN; \ - gedcom_lval.string = TO_INTERNAL(yytext, str_buf); \ + gedcom_lval.string = TO_INTERNAL(yytext, str_buffer); \ return DELIM; \ } @@ -229,11 +311,10 @@ int test_loop(ENCODING enc, char* code) #define ACTION_ANY \ { char* tmp; \ CHECK_LINE_LEN; \ - tmp = TO_INTERNAL(yytext, str_buf); \ + tmp = TO_INTERNAL(yytext, str_buffer); \ if (!tmp) { \ /* Something went wrong during conversion... */ \ - gedcom_error(_("Invalid character for encoding: '%s' (0x%02x)"), \ - yytext, yytext[0]); \ + error_invalid_character(yytext, yytext[0]); \ return BADTOKEN; \ } \ else { \ @@ -252,7 +333,7 @@ int test_loop(ENCODING enc, char* code) #define ACTION_ESCAPE \ { CHECK_LINE_LEN; \ - gedcom_lval.string = TO_INTERNAL(yytext, str_buf); \ + gedcom_lval.string = TO_INTERNAL(yytext, str_buffer); \ return ESCAPE; \ } @@ -260,11 +341,10 @@ int test_loop(ENCODING enc, char* code) #define ACTION_POINTER \ { CHECK_LINE_LEN; \ if (strlen(yytext) > MAXGEDCPTRLEN * encoding_width) { \ - gedcom_error(_("Pointer '%s' too long, max %d characters allowed"), \ - yytext, MAXGEDCPTRLEN); \ + error_pointer_too_long(yytext); \ return BADTOKEN; \ } \ - gedcom_lval.string = TO_INTERNAL(yytext, ptr_buf); \ + gedcom_lval.string = TO_INTERNAL(yytext, ptr_buffer); \ return POINTER; \ } @@ -278,7 +358,8 @@ int test_loop(ENCODING enc, char* code) #define ACTION_TERMINATOR \ { CHECK_LINE_LEN; \ INIT_LINE_LEN; \ - line_no++; \ + if (line_no == 1) \ + set_read_encoding_terminator(TO_INTERNAL(yytext, str_buffer)); \ BEGIN(INITIAL); \ } @@ -295,10 +376,7 @@ int test_loop(ENCODING enc, char* code) } \ else { \ char* ptr; int size; \ - /* Reset our state */ \ - current_level = -1; \ - level_diff = MAXGEDCLEVEL; \ - /* ... then terminate lex */ \ + /* ... terminate lex */ \ yyterminate(); \ /* Get rid of f*cking compiler warning from lex generated code */ \ /* yyterminate does return(), so program will never come here */ \ @@ -307,24 +385,85 @@ int test_loop(ENCODING enc, char* code) } #define ACTION_NORMAL_AT \ - { if (compat_at) { \ + { if (compat_mode(C_NO_DOUBLE_AT)) { \ int i, j; \ char *yycopy = strdup(yytext); \ - for (i = 0; i < 2; i++) \ - for (j = yyleng - 1; j >= 0; --j) \ - unput(yycopy[j]); \ - free(yycopy); \ + if (yycopy) { \ + for (i = 0; i < 2; i++) \ + for (j = yyleng - 1; j >= 0; --j) \ + unput(yycopy[j]); \ + free(yycopy); \ + } \ + else { \ + MEMORY_ERROR; \ + } \ } \ else { \ - gedcom_error(_("'@' character should be written as '@@' in values")); \ + error_at_character(); \ + return BADTOKEN; \ + } \ + } + +#define ACTION_TAB \ + { if (compat_mode(C_TAB_CHARACTER)) { \ + tab_space = 8; \ + GENERATE_TAB_SPACE; \ + } \ + else { \ + error_tab_character(); \ return BADTOKEN; \ } \ } #define ACTION_UNEXPECTED \ - { gedcom_error(_("Unexpected character: '%s' (0x%02x)"), \ - yytext, yytext[0]); \ + { error_unexpected_character(yytext, yytext[0]); \ return BADTOKEN; \ } -#endif /* IN_LEX */ +#elif LEX_SECTION == 3 + +int yywrap() +{ + return 1; +} + +static void free_conv_buffers() +{ + free_conv_buffer(ptr_buffer); + free_conv_buffer(tag_buffer); + free_conv_buffer(str_buffer); +} + +static void yylex_cleanup() +{ + /* fix memory leak in lex */ + yy_delete_buffer(yy_current_buffer); + yy_current_buffer = NULL; + free_conv_buffers(); +} + +static void init_conv_buffers() +{ + if (!ptr_buffer) { + ptr_buffer = create_conv_buffer(INITIAL_PTR_BUFFER_LEN); + tag_buffer = create_conv_buffer(INITIAL_TAG_BUFFER_LEN); + str_buffer = create_conv_buffer(INITIAL_STR_BUFFER_LEN); + } +} + +static int exitfuncregistered = 0; + +void yymyinit(FILE *f) +{ + if (! exitfuncregistered && atexit(yylex_cleanup) == 0) + exitfuncregistered = 1; + init_conv_buffers(); + yyin = f; + yyrestart(f); + /* Reset our state */ + current_level = -1; + level_diff = MAXGEDCLEVEL; + BEGIN(INITIAL); +} + +#endif