Use safe buffer mechanism for storing gedcom value (to avoid fixed
[gedcom-parse.git] / gedcom / gedcom_lex_common.c
1 /* Common lexer code.
2    Copyright (C) 2001, 2002 The Genes Development Team
3    This file is part of the Gedcom parser library.
4    Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
5
6    The Gedcom parser library is free software; you can redistribute it
7    and/or modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10
11    The Gedcom parser library is distributed in the hope that it will be
12    useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15
16    You should have received a copy of the GNU Lesser General Public
17    License along with the Gedcom parser library; if not, write to the
18    Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20
21 /* $Id$ */
22 /* $Name$ */
23
24 #if LEX_SECTION == 1
25
26 #include "gedcom_internal.h"
27 #include "multilex.h"
28 #include "encoding.h"
29 #include "gedcom.h"
30 #include "gedcom.tabgen.h"
31 #include "compat.h"
32 #include "utf8.h"
33
34 static size_t encoding_width;
35 static int current_level = -1;
36 static int level_diff=MAXGEDCLEVEL;
37 static size_t line_len = 0;
38
39 static struct conv_buffer* ptr_buffer = NULL;
40 static struct conv_buffer* tag_buffer = NULL;
41 static struct conv_buffer* str_buffer = NULL;
42
/* Initial sizes requested for the conversion buffers: the maximum
   length of the item times UTF_FACTOR, plus one.
   The expansions are fully parenthesized so that the macros keep their
   value when used inside a larger expression
   (e.g. 2 * INITIAL_PTR_BUFFER_LEN). */
#define INITIAL_PTR_BUFFER_LEN ((MAXGEDCPTRLEN) * (UTF_FACTOR) + 1)
#define INITIAL_TAG_BUFFER_LEN ((MAXGEDCTAGLEN) * (UTF_FACTOR) + 1)
#define INITIAL_STR_BUFFER_LEN ((MAXGEDCLINELEN) * (UTF_FACTOR) + 1)
46
47 #ifdef LEXER_TEST 
48 YYSTYPE gedcom_lval;
49 int line_no = 1;
50 int compat_at = 0;
51
52 int gedcom_lex();
53
54 void message_handler(Gedcom_msg_type type, char *msg)
55 {
56   fprintf(stderr, "(%d) %s\n", type, msg);
57 }
58
59 int test_loop(ENCODING enc, const char* code)
60 {
61   int tok, res;
62   init_encodings();
63   set_encoding_width(enc);
64   gedcom_set_message_handler(message_handler);
65   res = open_conv_to_internal(code);
66   if (!res) {
67     gedcom_error("Unable to open conversion context: %s",
68                  strerror(errno));
69     return 1;
70   }
71   tok = gedcom_lex();
72   while (tok) {
73     switch(tok) {
74       case BADTOKEN: printf("BADTOKEN "); break;
75       case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
76       case CLOSE: printf("CLOSE "); break;
77       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
78       case DELIM: printf("DELIM "); break;
79       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
80       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
81       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.tag.string); break;
82       default: printf("TAG(%s) ", gedcom_lval.tag.string); break;
83     }
84     tok = gedcom_lex();
85   }
86   printf("\n");
87   close_conv_to_internal();
88   return 0;  
89 }
90  
91 #endif /* of #ifdef LEXER_TEST */
92
/* These are defined as functions here, because xgettext has trouble
   extracting the strings out of long preprocessor defines. */
95
96 static void error_line_too_long()
97 {
98   gedcom_error(_("Line too long, max %d characters allowed"), MAXGEDCLINELEN); 
99 }
100
/* Report a level number that is written with a leading zero. */
static void
error_level_leading_zero()
{
  gedcom_error(_("Level number with leading zero not allowed"));
}
105
106 static void error_level_out_of_range()
107 {
108   gedcom_error (_("Level number out of range [0..%d]"), MAXGEDCLEVEL); 
109 }
110
/* Report a level number that is LEVEL_DIFF higher than the level of
   the previous line. */
static void
error_level_too_high(int level_diff)
{
  gedcom_error(_("GEDCOM level number is %d higher than previous"), level_diff);
}
116
117 static void error_tag_too_long(const char *tag)
118 {
119   gedcom_error(_("Tag '%s' too long, max %d characters allowed"),
120                tag, MAXGEDCTAGLEN); 
121 }
122
/* Report a character that could not be converted from the input
   encoding.
   STR is the offending text, CH its first byte (printed in hex).

   CH is converted to unsigned char before it is passed on: where plain
   char is signed, a byte >= 0x80 would otherwise be sign-extended by
   the default argument promotion and show up as e.g. 0xffffffe9
   instead of 0xe9.  %x also formally requires an unsigned int. */
static void error_invalid_character(const char *str, char ch)
{
  gedcom_error(_("Invalid character for encoding: '%s' (0x%02x)"),
               str, (unsigned int)(unsigned char)ch);
}
127
128 static void error_pointer_too_long(const char *ptr)
129 {
130   gedcom_error(_("Pointer '%s' too long, max %d characters allowed"),
131                ptr, MAXGEDCPTRLEN);
132 }
133
/* Report a single '@' that appears inside a value. */
static void
error_at_character()
{
  gedcom_error(
    _("'@' character should be written as '@@' in values"));
}
138
/* Report a character that is not allowed at the current position.
   STR is the offending text, CH its first byte (printed in hex).

   CH is converted to unsigned char before it is passed on: where plain
   char is signed, a byte >= 0x80 would otherwise be sign-extended by
   the default argument promotion and show up as e.g. 0xffffffe9
   instead of 0xe9.  %x also formally requires an unsigned int. */
static void error_unexpected_character(const char* str, char ch)
{
  gedcom_error(_("Unexpected character: '%s' (0x%02x)"),
               str, (unsigned int)(unsigned char)ch);
}
143
/* When non-zero, the iconv conversion is bypassed and the lexer text is
   used as is (for input that is already UTF-8 coming from the
   program).  Static storage: starts out as 0. */
static int dummy_conv;
147
148 #elif LEX_SECTION == 2
149
/* Convert the lexer text STR (yyleng bytes long) to the internal
   encoding, using OUTBUF as output buffer.  When dummy_conv is set the
   conversion is skipped and STR itself is the result. */
#define TO_INTERNAL(STR,OUTBUF)                                               \
  ((dummy_conv) ? (STR) : to_internal((STR), yyleng, (OUTBUF)))
152
/* Start counting the length of a new line.  The expansion carries its
   own terminating semicolon. */
#define INIT_LINE_LEN line_len = 0;
155
/* Add the length of the current token to the running line length and
   make the enclosing lexer function return BADTOKEN (after reporting
   an error) when the line becomes longer than
   MAXGEDCLINELEN * encoding_width.
   Once the error has been reported, line_len is set to (size_t)-1 so
   that the remaining tokens of the same line are not checked again.

   The token length is taken from yyleng rather than strlen(yytext):
   the limit is scaled by encoding_width, i.e. it is expressed in input
   bytes, and the bytes of a multi-byte encoded token can include zero
   bytes at which strlen() would stop counting. */
#define CHECK_LINE_LEN                                                        \
  { if (line_len != (size_t)-1) {                                             \
      line_len += (size_t)yyleng;                                             \
      if (line_len > MAXGEDCLINELEN * encoding_width) {                       \
        error_line_too_long();                                                \
        line_len = (size_t)-1;                                                \
        return BADTOKEN;                                                      \
      }                                                                       \
    }                                                                         \
  }
166
/* Lexer action for the predefined tag THETAG: checks the line length,
   stores the converted tag text and the token value TAG_<THETAG> in
   gedcom_lval, switches to the NORMAL start condition, counts the line
   and returns TAG_<THETAG>. */
#define MKTAGACTION(THETAG)                                                  \
  { CHECK_LINE_LEN;                                                          \
    gedcom_lval.tag.value  = TAG_##THETAG;                                   \
    gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buffer);                \
    line_no++;                                                               \
    BEGIN(NORMAL);                                                           \
    return TAG_##THETAG;                                                     \
  }
175
176 /* The GEDCOM level number is converted into a sequence of opening
177    and closing brackets.  Simply put, the following GEDCOM fragment:
178    
179    0 HEAD
180    1 SOUR genes
181    2 VERS 1.6
182    2 NAME Genes
183    1 DATE 07 OCT 2001
184    ...
185    0 TRLR
186    
187    is converted into:
188    
189    { HEAD                     (initial)  
190    { SOUR genes               (1 higher: no closing brackets)
191    { VERS 1.6                 (1 higher: no closing brackets)
192    } { NAME Genes             (same level: 1 closing bracket)
193    } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
194    ...
195    } { TRLR }
196    
197    or more clearly:
198    
199    { HEAD
200      { SOUR genes
201        { VERS 1.6 }
202        { NAME Genes } }
203      { DATE 07 OCT 2001
204      ... }
205    { TRLR }
206
207    But because this means that one token is converted into a series
208    of tokens, there is some initial code following immediately here
209    that returns "pending" tokens. */
210
/* Runs before any pattern is matched: hands out the tokens that are
   still pending from the last level number.  While level_diff is below
   1 a CLOSE is returned, at exactly 1 the OPEN for current_level is
   returned; above that nothing is pending and normal matching goes
   on. */
#define ACTION_BEFORE_REGEXPS                                                 \
   { if (level_diff <= 0) {                                                   \
       ++level_diff;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     if (level_diff == 1) {                                                   \
       gedcom_lval.number = current_level;                                    \
       ++level_diff;                                                          \
       return OPEN;                                                           \
     }                                                                        \
     /* otherwise: out of brackets, nothing to return */                      \
   }
225
226
/* Whitespace at the start of a line: it counts towards the line
   length, but is ignored otherwise. */
#define ACTION_INITIAL_WHITESPACE                                             \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
  }
231
232
/* A level number with a leading zero: report it and return
   BADTOKEN. */
#define ACTION_0_DIGITS                                                       \
   {                                                                          \
     error_level_leading_zero();                                              \
     return BADTOKEN;                                                         \
   }
237
238
/* Lexer action for a level number.
   The number is converted and range-checked, the EXPECT_TAG start
   condition is entered, and the first bracket token for the level
   change is returned: CLOSE when the level is the same or lower than
   the previous one, OPEN when it is exactly one higher.  The remaining
   brackets are handed out by ACTION_BEFORE_REGEXPS.
   A level outside [0..MAXGEDCLEVEL], or one that is more than one
   higher than the previous level, is reported and yields BADTOKEN.

   The conversion result is checked for NULL before it is parsed (the
   conversion can fail, see ACTION_ANY), and strtol() is used instead
   of atoi(): atoi() has undefined behaviour when the value does not
   fit, whereas strtol() saturates to LONG_MAX, which the range check
   below rejects. */
#define ACTION_DIGITS                                                         \
   { char* level_str = TO_INTERNAL(yytext, str_buffer);                       \
     long level;                                                              \
     CHECK_LINE_LEN;                                                          \
     if (!level_str) {                                                        \
       /* Something went wrong during conversion... */                        \
       error_invalid_character(yytext, yytext[0]);                            \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
     level = strtol(level_str, NULL, 10);                                     \
     if ((level < 0) || (level > MAXGEDCLEVEL)) {                             \
       error_level_out_of_range();                                            \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
     level_diff = (int)level - current_level;                                 \
     BEGIN(EXPECT_TAG);                                                       \
     current_level = (int)level;                                              \
     if (level_diff < 1) {                                                    \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     else if (level_diff == 1) {                                              \
       level_diff++;                                                          \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     else {                                                                   \
       /* should never happen (error to GEDCOM spec) */                       \
       error_level_too_high(level_diff);                                      \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
   }
266
267
/* Lexer action for a tag that is not one of the predefined ones.
   A tag longer than MAXGEDCTAGLEN * encoding_width is reported and
   yields BADTOKEN.  Otherwise the converted tag text is stored in
   gedcom_lval, the NORMAL start condition is entered, the line is
   counted and USERTAG is returned.

   The tag length is taken from yyleng rather than strlen(yytext): the
   limit is scaled by encoding_width, i.e. expressed in input bytes,
   and the bytes of a multi-byte encoded token can include zero bytes
   at which strlen() would stop counting. */
#define ACTION_ALPHANUM                                                       \
   { if ((size_t)yyleng > MAXGEDCTAGLEN * encoding_width) {                   \
       error_tag_too_long(yytext);                                            \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
     CHECK_LINE_LEN;                                                          \
     gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buffer);                \
     gedcom_lval.tag.value  = USERTAG;                                        \
     BEGIN(NORMAL);                                                           \
     line_no++;                                                               \
     return USERTAG;                                                          \
   }
281
282
/* Lexer action for a delimiter: checks the line length, stores the
   converted text in gedcom_lval.string and returns DELIM. */
#define ACTION_DELIM                                                          \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buffer);                     \
    return DELIM;                                                             \
  }
288
289
/* Lexer action for any other character of a value.
   A failed conversion is reported and yields BADTOKEN.  Otherwise the
   converted text is stored in gedcom_lval.string and ANYCHAR is
   returned, unless the converted text is empty, in which case no token
   is returned at all. */
#define ACTION_ANY                                                            \
  { char* converted;                                                          \
    CHECK_LINE_LEN;                                                           \
    converted = TO_INTERNAL(yytext, str_buffer);                              \
    if (!converted) {                                                         \
      /* Something went wrong during conversion... */                         \
      error_invalid_character(yytext, yytext[0]);                             \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = converted;                                           \
    /* Due to character conversions, the current character may still be      \
       combined with the next one, so that there is no character to return   \
       yet.  In principle this only applies to the 1byte case (e.g. ANSEL),  \
       but it doesn't harm the unicode case.                                 \
    */                                                                        \
    if (*gedcom_lval.string != '\0')                                          \
      return ANYCHAR;                                                         \
  }
311
312
/* Lexer action for an escape sequence: checks the line length, stores
   the converted text in gedcom_lval.string and returns ESCAPE. */
#define ACTION_ESCAPE                                                         \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buffer);                     \
    return ESCAPE;                                                            \
  }
318
319
/* Lexer action for a pointer.
   After the line length check, a pointer longer than
   MAXGEDCPTRLEN * encoding_width is reported and yields BADTOKEN.
   Otherwise the converted text is stored in gedcom_lval.string and
   POINTER is returned.

   The pointer length is taken from yyleng rather than strlen(yytext):
   the limit is scaled by encoding_width, i.e. expressed in input
   bytes, and the bytes of a multi-byte encoded token can include zero
   bytes at which strlen() would stop counting. */
#define ACTION_POINTER                                                        \
  { CHECK_LINE_LEN;                                                           \
    if ((size_t)yyleng > MAXGEDCPTRLEN * encoding_width) {                    \
      error_pointer_too_long(yytext);                                         \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = TO_INTERNAL(yytext, ptr_buffer);                     \
    return POINTER;                                                           \
  }
329
330
331 /* Due to the conversion of level numbers into brackets, the
332    terminator is not important, so no token is returned here.
333    Although not strictly according to the GEDCOM spec, we'll ignore
334    whitespace just before the terminator.
335 */
336
/* Lexer action for the line terminator: after a final line length
   check, go back to the INITIAL start condition and restart the line
   length count.  No token is returned. */
#define ACTION_TERMINATOR                                                     \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    BEGIN(INITIAL);                                                           \
    INIT_LINE_LEN;                                                            \
  }
342
343
344 /* Eventually we have to return 1 closing bracket (for the trailer).
345    We can detect whether we have sent the closing bracket using the
346    level_diff (at eof, first it is 2, then we increment it ourselves)
347 */
348
/* Lexer action at end of file: the first time (level_diff is 2) the
   last closing bracket is returned; after that the lexer is
   terminated. */
#define ACTION_EOF                                                            \
  { if (level_diff != 2) {                                                    \
      char* ptr; int size;                                                    \
      /* ... terminate lex */                                                 \
      yyterminate();                                                          \
      /* Never reached, because yyterminate does a return(); the call */      \
      /* only silences a compiler warning from the lex generated code */      \
      yy_flex_realloc(ptr, size);                                             \
    }                                                                         \
    else {                                                                    \
      level_diff++;                                                           \
      return CLOSE;                                                           \
    }                                                                         \
  }
363
/* Lexer action for a single '@' inside a value.
   When compat_at is set, the matched text is pushed back twice onto
   the input, so that it is read again as a doubled '@'; otherwise the
   error is reported and BADTOKEN is returned.

   The matched text is saved first because unput() is applied to it
   repeatedly.  The copy is made with malloc()/memcpy() of exactly
   yyleng bytes instead of strdup(): the loop below indexes the copy up
   to yyleng - 1, whereas strdup() stops at the first zero byte, which
   may come earlier for a multi-byte encoded character. */
#define ACTION_NORMAL_AT                                                      \
  { if (compat_at) {                                                          \
      int i, j;                                                               \
      char *yycopy = malloc((size_t)yyleng);                                  \
      if (yycopy) {                                                           \
        memcpy(yycopy, yytext, (size_t)yyleng);                               \
        for (i = 0; i < 2; i++)                                               \
          for (j = yyleng - 1; j >= 0; --j)                                   \
            unput(yycopy[j]);                                                 \
        free(yycopy);                                                         \
      }                                                                       \
      else {                                                                  \
        MEMORY_ERROR;                                                         \
      }                                                                       \
    }                                                                         \
    else {                                                                    \
      error_at_character();                                                   \
      return BADTOKEN;                                                        \
    }                                                                         \
  }
383
/* Lexer action for text that matches nothing else: report it together
   with its first byte and return BADTOKEN. */
#define ACTION_UNEXPECTED                                                     \
  {                                                                           \
    error_unexpected_character(yytext, yytext[0]);                            \
    return BADTOKEN;                                                          \
  }
388
389 #elif LEX_SECTION == 3
390
/* End-of-input hook of the lexer: always returns 1. */
int
yywrap()
{
  return 1;
}
395
396 static void free_conv_buffers()
397 {
398   free_conv_buffer(ptr_buffer);
399   free_conv_buffer(tag_buffer);
400   free_conv_buffer(str_buffer);
401 }
402
403 static void yylex_cleanup()
404 {
405   /* fix memory leak in lex */
406   yy_delete_buffer(yy_current_buffer);
407   yy_current_buffer = NULL;
408   free_conv_buffers();
409 }
410
411 static void init_conv_buffers()
412 {
413   if (!ptr_buffer) {
414     ptr_buffer = create_conv_buffer(INITIAL_PTR_BUFFER_LEN);
415     tag_buffer = create_conv_buffer(INITIAL_TAG_BUFFER_LEN);
416     str_buffer = create_conv_buffer(INITIAL_STR_BUFFER_LEN);
417   }
418 }
419
420 static int exitfuncregistered = 0;
421
422 void yymyinit(FILE *f)
423 {
424   if (! exitfuncregistered && atexit(yylex_cleanup) == 0)
425     exitfuncregistered = 1;
426   init_conv_buffers();
427   yyin = f;
428   yyrestart(f);
429   /* Reset our state */
430   current_level = -1;
431   level_diff = MAXGEDCLEVEL;
432   BEGIN(INITIAL);
433 }
434
435 #endif