2 Copyright (C) 2001, 2002 The Genes Development Team
3 This file is part of the Gedcom parser library.
4 Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
6 The Gedcom parser library is free software; you can redistribute it
7 and/or modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The Gedcom parser library is distributed in the hope that it will be
12 useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the Gedcom parser library; if not, write to the
18 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
26 #include "gedcom_internal.h"
29 #include "encoding_state.h"
31 #include "gedcom.tabgen.h"
/* Multiplier applied to the MAXGEDC*LEN limits when the byte length of
   raw input text (line, tag, pointer) is checked. */
34 static size_t encoding_width;
/* Level number of the most recently lexed line; -1 before any line. */
35 static int current_level = -1;
/* Difference between the level of the current line and the previous one;
   it drives how many pending bracket tokens are still to be returned. */
36 static int level_diff = MAXGEDCLEVEL;
/* Accumulated length of the tokens seen on the current line; set to
   (size_t)-1 once a "line too long" error has been reported for it. */
37 static size_t line_len = 0;
/* Number of space tokens still to be generated for a tab character
   (only consumed when the tab compatibility mode is active). */
38 static int tab_space = 0;
/* Token value of the last standard tag that was recognized; -1 if none. */
39 static int current_tag = -1;
/* Conversion buffers handed to to_internal() for pointers, tags and
   other strings respectively. */
41 static struct conv_buffer* ptr_buffer = NULL;
42 static struct conv_buffer* tag_buffer = NULL;
43 static struct conv_buffer* str_buffer = NULL;
/* Initial sizes of the conversion buffers: the maximum GEDCOM length of
   the item scaled by UTF_FACTOR, plus one (presumably for the terminating
   NUL -- confirm against create_conv_buffer).
   The expansions are fully parenthesized so that each macro stays a single
   operand when it is used inside a larger expression. */
#define INITIAL_PTR_BUFFER_LEN ((MAXGEDCPTRLEN) * (UTF_FACTOR) + 1)
#define INITIAL_TAG_BUFFER_LEN ((MAXGEDCTAGLEN) * (UTF_FACTOR) + 1)
#define INITIAL_STR_BUFFER_LEN ((MAXGEDCLINELEN) * (UTF_FACTOR) + 1)
55 void message_handler(Gedcom_msg_type type, char *msg)
57 fprintf(stderr, "(%d) %s\n", type, msg);
60 int test_loop(ENCODING enc, const char* code)
64 set_encoding_width(enc);
65 gedcom_set_message_handler(message_handler);
66 res = open_conv_to_internal(code);
68 gedcom_error("Unable to open conversion context: %s",
75 case BADTOKEN: printf("BADTOKEN "); break;
76 case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
77 case CLOSE: printf("CLOSE "); break;
78 case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
79 case DELIM: printf("DELIM "); break;
80 case ANYCHAR: printf("%s ", gedcom_lval.string); break;
81 case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
82 case USERTAG: printf("USERTAG(%s) ", gedcom_lval.tag.string); break;
83 default: printf("TAG(%s) ", gedcom_lval.tag.string); break;
88 close_conv_to_internal();
92 #endif /* of #ifdef LEXER_TEST */
94 /* These are defined as functions here, because xgettext has trouble
95 extracting the strings out of long pre-processor defines */
97 static void error_line_too_long()
99 gedcom_error(_("Line too long, max %d characters allowed"),
/* Reports, through gedcom_error(), a level number that was written with a
   leading zero.  Takes no arguments (declared with a real prototype). */
static void error_level_leading_zero(void)
{
  gedcom_error(_("Level number with leading zero not allowed"));
}
108 static void error_level_out_of_range()
110 gedcom_error (_("Level number out of range [0..%d]"), MAXGEDCLEVEL);
113 static void error_level_too_high(int level_diff)
115 gedcom_error (_("GEDCOM level number is %d higher than previous"),
119 static void error_tag_too_long(const char *tag)
121 gedcom_error(_("Tag '%s' too long, max %d characters allowed"),
/* Reports a character that could not be converted for the current
   encoding.
     str: the offending text, printed as a string
     ch:  its first byte, printed as a two-digit hexadecimal value */
static void error_invalid_character(const char *str, char ch)
{
  /* Go through unsigned char first: where plain char is signed, a byte
     >= 0x80 would otherwise be sign-extended and shown as 0xffffffXX. */
  gedcom_error(_("Invalid character for encoding: '%s' (0x%02x)"), str,
               (unsigned int)(unsigned char)ch);
}
130 static void error_pointer_too_long(const char *ptr)
132 gedcom_error(_("Pointer '%s' too long, max %d characters allowed"),
/* Reports, through gedcom_error(), a single '@' found in a value where the
   doubled form '@@' is required.  Takes no arguments (declared with a real
   prototype). */
static void error_at_character(void)
{
  gedcom_error(_("'@' character should be written as '@@' in values"));
}
/* Reports, through gedcom_error(), a tab character found in a value.
   Takes no arguments (declared with a real prototype). */
static void error_tab_character(void)
{
  gedcom_error(_("Tab character is not allowed in values"));
}
/* Reports a character that matched none of the lexer rules.
     str: the offending text, printed as a string
     ch:  its first byte, printed as a two-digit hexadecimal value */
static void error_unexpected_character(const char* str, char ch)
{
  /* Go through unsigned char first: where plain char is signed, a byte
     >= 0x80 would otherwise be sign-extended and shown as 0xffffffXX. */
  gedcom_error(_("Unexpected character: '%s' (0x%02x)"), str,
               (unsigned int)(unsigned char)ch);
}
151 /* This is to bypass the iconv conversion (if the input is UTF-8 coming
153 static int dummy_conv = 0;
155 #elif LEX_SECTION == 2
/* Yields STR unchanged when dummy_conv is set; otherwise yields the result
   of to_internal() called with STR, the current token length yyleng and
   the conversion buffer OUTBUF.  The macro arguments are parenthesized so
   that any expression can be passed safely. */
#define TO_INTERNAL(STR,OUTBUF) \
  (dummy_conv ? (STR) : to_internal((STR), yyleng, (OUTBUF)))
160 #define INIT_LINE_LEN \
163 #define CHECK_LINE_LEN \
164 { if (line_len != (size_t)-1) { \
165 line_len += strlen(yytext); \
166 if (line_len > MAXGEDCLINELEN * encoding_width \
167 && ! compat_long_line(current_level, current_tag)) { \
168 error_line_too_long(); \
169 line_len = (size_t)-1; \
175 #define GENERATE_TAB_SPACE \
176 { gedcom_lval.string = " "; \
181 #define MKTAGACTION(THETAG) \
183 gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buffer); \
184 current_tag = TAG_##THETAG; \
185 gedcom_lval.tag.value = current_tag; \
188 return current_tag; \
191 /* The GEDCOM level number is converted into a sequence of opening
192 and closing brackets. Simply put, the following GEDCOM fragment:
205 { SOUR genes (1 higher: no closing brackets)
206 { VERS 1.6 (1 higher: no closing brackets)
207 } { NAME Genes (same level: 1 closing bracket)
208 } } { DATE 07 OCT 2001 (1 lower: 2 closing brackets)
222 But because this means that one token is converted into a series
223 of tokens, there is some initial code following immediately here
224 that returns "pending" tokens.
226 Also, for compatibility tabs are converted into spaces, which is
229 #define ACTION_BEFORE_REGEXPS \
230 { if (compat_mode(C_TAB_CHARACTER) && tab_space-- > 0) { \
231 GENERATE_TAB_SPACE; \
233 else if (level_diff < 1) { \
237 else if (level_diff == 1) { \
239 gedcom_lval.number = current_level; \
243 /* out of brackets... */ \
248 #define ACTION_INITIAL_WHITESPACE \
250 /* ignore initial whitespace further */ \
254 #define ACTION_0_DIGITS \
255 { error_level_leading_zero(); \
260 #define ACTION_DIGITS \
261 { int level = atoi(TO_INTERNAL(yytext, str_buffer)); \
263 if ((level < 0) || (level > MAXGEDCLEVEL)) { \
264 error_level_out_of_range(); \
268 level_diff = level - current_level; \
270 current_level = level; \
271 if (level_diff < 1) { \
275 else if (level_diff == 1) { \
277 gedcom_lval.number = current_level; \
281 /* should never happen (error to GEDCOM spec) */ \
282 error_level_too_high(level_diff); \
289 #define ACTION_ALPHANUM \
290 { if (strlen(yytext) > MAXGEDCTAGLEN * encoding_width) { \
291 error_tag_too_long(yytext); \
296 gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buffer); \
297 gedcom_lval.tag.value = USERTAG; \
304 #define ACTION_DELIM \
306 gedcom_lval.string = TO_INTERNAL(yytext, str_buffer); \
314 tmp = TO_INTERNAL(yytext, str_buffer); \
316 /* Something went wrong during conversion... */ \
317 error_invalid_character(yytext, yytext[0]); \
321 gedcom_lval.string = tmp; \
322 /* Due to character conversions, it is possible that the current \
323 character will be combined with the next, and so now we don't have a \
325 In principle, this is only applicable to the 1byte case (e.g. ANSEL),\
326 but it doesn't harm the unicode case. \
328 if (strlen(gedcom_lval.string) > 0) \
334 #define ACTION_ESCAPE \
336 gedcom_lval.string = TO_INTERNAL(yytext, str_buffer); \
341 #define ACTION_POINTER \
343 if (strlen(yytext) > MAXGEDCPTRLEN * encoding_width) { \
344 error_pointer_too_long(yytext); \
347 gedcom_lval.string = TO_INTERNAL(yytext, ptr_buffer); \
352 /* Due to the conversion of level numbers into brackets, the
353 terminator is not important, so no token is returned here.
354 Although not strictly according to the GEDCOM spec, we'll ignore
355 whitespace just before the terminator.
358 #define ACTION_TERMINATOR \
362 set_read_encoding_terminator(TO_INTERNAL(yytext, str_buffer)); \
367 /* Eventually we have to return 1 closing bracket (for the trailer).
368 We can detect whether we have sent the closing bracket using the
369 level_diff (at eof, first it is 2, then we increment it ourselves)
373 { if (level_diff == 2) { \
378 char* ptr; int size; \
379 /* ... terminate lex */ \
381 /* Get rid of f*cking compiler warning from lex generated code */ \
382 /* yyterminate does return(), so program will never come here */ \
383 yy_flex_realloc(ptr, size); \
387 #define ACTION_NORMAL_AT \
388 { if (compat_mode(C_NO_DOUBLE_AT)) { \
390 char *yycopy = strdup(yytext); \
392 for (i = 0; i < 2; i++) \
393 for (j = yyleng - 1; j >= 0; --j) \
402 error_at_character(); \
408 { if (compat_mode(C_TAB_CHARACTER)) { \
410 GENERATE_TAB_SPACE; \
413 error_tab_character(); \
418 #define ACTION_UNEXPECTED \
419 { error_unexpected_character(yytext, yytext[0]); \
423 #elif LEX_SECTION == 3
430 static void free_conv_buffers()
432 free_conv_buffer(ptr_buffer);
433 free_conv_buffer(tag_buffer);
434 free_conv_buffer(str_buffer);
437 static void yylex_cleanup()
439 /* fix memory leak in lex */
440 yy_delete_buffer(yy_current_buffer);
441 yy_current_buffer = NULL;
445 static void init_conv_buffers()
448 ptr_buffer = create_conv_buffer(INITIAL_PTR_BUFFER_LEN);
449 tag_buffer = create_conv_buffer(INITIAL_TAG_BUFFER_LEN);
450 str_buffer = create_conv_buffer(INITIAL_STR_BUFFER_LEN);
454 static int exitfuncregistered = 0;
456 void yymyinit(FILE *f)
458 if (! exitfuncregistered && atexit(yylex_cleanup) == 0)
459 exitfuncregistered = 1;
463 /* Reset our state */
465 level_diff = MAXGEDCLEVEL;