Don't lose the HEAD.TIME section of Lifelines.
[gedcom-parse.git] / gedcom / gedcom_lex_common.c
1 /* Common lexer code.
2    Copyright (C) 2001, 2002 The Genes Development Team
3    This file is part of the Gedcom parser library.
4    Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
5
6    The Gedcom parser library is free software; you can redistribute it
7    and/or modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10
11    The Gedcom parser library is distributed in the hope that it will be
12    useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15
16    You should have received a copy of the GNU Lesser General Public
17    License along with the Gedcom parser library; if not, write to the
18    Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20
21 /* $Id$ */
22 /* $Name$ */
23
24 #if LEX_SECTION == 1
25
26 #include "gedcom_internal.h"
27 #include "multilex.h"
28 #include "encoding.h"
29 #include "encoding_state.h"
30 #include "gedcom.h"
31 #include "gedcom.tabgen.h"
32 #include "compat.h"
33
34 static size_t encoding_width;
35 static int current_level = -1;
36 static int level_diff=MAXGEDCLEVEL;
37 static size_t line_len = 0;
38 static int tab_space = 0;
39
40 static struct conv_buffer* ptr_buffer = NULL;
41 static struct conv_buffer* tag_buffer = NULL;
42 static struct conv_buffer* str_buffer = NULL;
43
/* Initial sizes of the conversion buffers: the maximum length allowed by
   GEDCOM, multiplied by UTF_FACTOR, plus one extra byte.
   The expansions are fully parenthesized so that they keep their value
   when used inside a larger expression. */
#define INITIAL_PTR_BUFFER_LEN ((MAXGEDCPTRLEN) * (UTF_FACTOR) + 1)
#define INITIAL_TAG_BUFFER_LEN ((MAXGEDCTAGLEN) * (UTF_FACTOR) + 1)
#define INITIAL_STR_BUFFER_LEN ((MAXGEDCLINELEN) * (UTF_FACTOR) + 1)
47
48 #ifdef LEXER_TEST 
49 YYSTYPE gedcom_lval;
50 int line_no = 1;
51
52 int gedcom_lex();
53
54 void message_handler(Gedcom_msg_type type, char *msg)
55 {
56   fprintf(stderr, "(%d) %s\n", type, msg);
57 }
58
59 int test_loop(ENCODING enc, const char* code)
60 {
61   int tok, res;
62   init_encodings();
63   set_encoding_width(enc);
64   gedcom_set_message_handler(message_handler);
65   res = open_conv_to_internal(code);
66   if (!res) {
67     gedcom_error("Unable to open conversion context: %s",
68                  strerror(errno));
69     return 1;
70   }
71   tok = gedcom_lex();
72   while (tok) {
73     switch(tok) {
74       case BADTOKEN: printf("BADTOKEN "); break;
75       case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
76       case CLOSE: printf("CLOSE "); break;
77       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
78       case DELIM: printf("DELIM "); break;
79       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
80       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
81       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.tag.string); break;
82       default: printf("TAG(%s) ", gedcom_lval.tag.string); break;
83     }
84     tok = gedcom_lex();
85   }
86   printf("\n");
87   close_conv_to_internal();
88   return 0;  
89 }
90  
91 #endif /* of #ifdef LEXER_TEST */
92
93 /* These are defined as functions here, because xgettext has trouble
94    extracting the strings out of long preprocessor defines */
95
96 static void error_line_too_long(const char *line)
97 {
98   gedcom_error(_("Line too long, max %d characters allowed: %s"),
99                MAXGEDCLINELEN, line); 
100 }
101
/* Report a level number that is written with a leading zero. */
static void error_level_leading_zero(void)
{
  gedcom_error(_("Level number with leading zero not allowed"));
}
106
107 static void error_level_out_of_range()
108 {
109   gedcom_error (_("Level number out of range [0..%d]"), MAXGEDCLEVEL); 
110 }
111
/* Report a level number that is more than one higher than the level of
   the previous line.  The parameter is the difference between the two;
   it intentionally carries the same name as the file level variable. */
static void error_level_too_high(int level_diff)
{
  gedcom_error(_("GEDCOM level number is %d higher than previous"),
               level_diff);
}
117
118 static void error_tag_too_long(const char *tag)
119 {
120   gedcom_error(_("Tag '%s' too long, max %d characters allowed"),
121                tag, MAXGEDCTAGLEN); 
122 }
123
/* Report a character that could not be converted from the input
   encoding.  'str' is the offending text, 'ch' its first byte.
   The byte is converted to unsigned char before printing: a plain char
   may be signed, and a byte >= 0x80 would then be sign extended and
   printed as 0xffffffXX instead of 0xXX. */
static void error_invalid_character(const char *str, char ch)
{
  gedcom_error(_("Invalid character for encoding: '%s' (0x%02x)"),
               str, (unsigned int)(unsigned char)ch);
}
128
129 static void error_pointer_too_long(const char *ptr)
130 {
131   gedcom_error(_("Pointer '%s' too long, max %d characters allowed"),
132                ptr, MAXGEDCPTRLEN);
133 }
134
/* Report a single '@' in a value (it has to be doubled). */
static void error_at_character(void)
{
  gedcom_error(_("'@' character should be written as '@@' in values"));
}
139
/* Report a tab character in a value. */
static void error_tab_character(void)
{
  gedcom_error(_("Tab character is not allowed in values"));
}
144
/* Report a character that matches none of the lexer rules.  'str' is
   the offending text, 'ch' its first byte.
   The byte is converted to unsigned char before printing: a plain char
   may be signed, and a byte >= 0x80 would then be sign extended and
   printed as 0xffffffXX instead of 0xXX. */
static void error_unexpected_character(const char* str, char ch)
{
  gedcom_error(_("Unexpected character: '%s' (0x%02x)"),
               str, (unsigned int)(unsigned char)ch);
}
149
/* When non-zero, TO_INTERNAL returns its input unchanged instead of
   running it through the iconv conversion (for input that is UTF-8
   coming from the program itself). */
static int dummy_conv = 0;
153
154 #elif LEX_SECTION == 2
155
/* Convert STR (yyleng bytes of input) via to_internal(), using OUTBUF as
   conversion buffer.  If dummy_conv is set, STR itself is the result. */
#define TO_INTERNAL(STR,OUTBUF) \
  (!dummy_conv ? to_internal(STR, yyleng, OUTBUF) : STR)
158
/* Restart the length count for a new line (the expansion carries its own
   terminating semicolon). */
#define INIT_LINE_LEN line_len = 0;
161
/* Add the length of the current token to the length of the line and
   return BADTOKEN from the lexer once the line exceeds the maximum of
   MAXGEDCLINELEN * encoding_width bytes.
   After the error has been reported, line_len is set to (size_t)-1, which
   switches the check off until the counter is initialized again. */
#define CHECK_LINE_LEN \
  { \
    if (line_len != (size_t)-1) { \
      line_len += strlen(yytext); \
      if (line_len > MAXGEDCLINELEN * encoding_width) { \
        error_line_too_long(yytext); \
        /* report only once per line */ \
        line_len = (size_t)-1; \
        return BADTOKEN; \
      } \
    } \
  }
172
/* Return one space as a DELIM token and count it off from the spaces
   that are still pending for a tab character. */
#define GENERATE_TAB_SPACE \
  { \
    tab_space--; \
    gedcom_lval.string = " "; \
    return DELIM; \
  }
178
/* Action for the predefined tag THETAG: checks the line length, stores
   the converted tag text and the token value TAG_<THETAG> in gedcom_lval,
   counts the line, switches to the NORMAL start condition and returns
   TAG_<THETAG>. */
#define MKTAGACTION(THETAG) \
  { \
    CHECK_LINE_LEN; \
    gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buffer); \
    gedcom_lval.tag.value  = TAG_##THETAG; \
    line_no++; \
    BEGIN(NORMAL); \
    return TAG_##THETAG; \
  }
187
188 /* The GEDCOM level number is converted into a sequence of opening
189    and closing brackets.  Simply put, the following GEDCOM fragment:
190    
191    0 HEAD
192    1 SOUR genes
193    2 VERS 1.6
194    2 NAME Genes
195    1 DATE 07 OCT 2001
196    ...
197    0 TRLR
198    
199    is converted into:
200    
201    { HEAD                     (initial)  
202    { SOUR genes               (1 higher: no closing brackets)
203    { VERS 1.6                 (1 higher: no closing brackets)
204    } { NAME Genes             (same level: 1 closing bracket)
205    } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
206    ...
207    } { TRLR }
208    
209    or more clearly:
210    
211    { HEAD
212      { SOUR genes
213        { VERS 1.6 }
214        { NAME Genes } }
215      { DATE 07 OCT 2001
216      ... }
217    { TRLR }
218
219    But because this means that one token is converted into a series
220    of tokens, there is some initial code following immediately here
221    that returns "pending" tokens.
222
223    Also, for compatibility tabs are converted into spaces, which is
224    also handled here */
225
/* Code that runs before any pattern is matched: it hands out the tokens
   that are still pending from an earlier match.
   - In tab compatibility mode: the spaces that remain for a tab.
     tab_space is only tested here, because GENERATE_TAB_SPACE already
     decrements it for each space it returns.  (Decrementing it here as
     well consumed two counts per space, so a tab gave 5 spaces instead
     of the 8 set by ACTION_TAB, and the counter kept falling on every
     later token.)
   - level_diff < 1:  one more closing bracket.
   - level_diff == 1: the opening bracket, with the current level as
     its value.
   - otherwise nothing is pending and normal matching continues. */
#define ACTION_BEFORE_REGEXPS                                                 \
   { if (compat_mode(C_TAB_CHARACTER) && tab_space > 0) {                     \
       GENERATE_TAB_SPACE;                                                    \
     }                                                                        \
     else if (level_diff < 1) {                                               \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     else if (level_diff == 1) {                                              \
       level_diff++;                                                          \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     else {                                                                   \
       /* out of brackets... */                                               \
     }                                                                        \
   }
243
244
/* Whitespace at the start of a line: it counts for the line length, but
   is ignored otherwise (no token is returned). */
#define ACTION_INITIAL_WHITESPACE \
  { \
    CHECK_LINE_LEN; \
  }
249
250
/* Level number with a leading zero: report it and return BADTOKEN. */
#define ACTION_0_DIGITS \
  { \
    error_level_leading_zero(); \
    return BADTOKEN; \
  }
255
256
/* Action for the level number at the start of a line.
   The number is converted and checked against [0..MAXGEDCLEVEL]; its
   difference with the previous level decides which bracket token is
   returned first (the remaining ones are handed out later by
   ACTION_BEFORE_REGEXPS):
   - same or lower level: CLOSE
   - one level higher:    OPEN, with the new level as value
   - more than one higher: error, BADTOKEN
   Every error path counts the line itself, because the tag action that
   normally increments line_no will not run for it.
   The conversion result is checked for NULL before use, and strtol is
   used instead of atoi: on overflow strtol returns LONG_MAX, which the
   range check rejects, whereas atoi has undefined behavior there. */
#define ACTION_DIGITS                                                         \
   { const char* level_str = TO_INTERNAL(yytext, str_buffer);                 \
     long level;                                                              \
     CHECK_LINE_LEN;                                                          \
     if (!level_str) {                                                        \
       /* Something went wrong during conversion... */                        \
       error_invalid_character(yytext, yytext[0]);                            \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
     level = strtol(level_str, NULL, 10);                                     \
     if ((level < 0) || (level > MAXGEDCLEVEL)) {                             \
       error_level_out_of_range();                                            \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
     level_diff = (int)level - current_level;                                 \
     BEGIN(EXPECT_TAG);                                                       \
     current_level = (int)level;                                              \
     if (level_diff < 1) {                                                    \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     else if (level_diff == 1) {                                              \
       level_diff++;                                                          \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     else {                                                                   \
       /* should never happen (error to GEDCOM spec) */                       \
       error_level_too_high(level_diff);                                      \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
   }
284
285
/* Action for a tag that is not one of the predefined ones.
   A tag longer than MAXGEDCTAGLEN * encoding_width bytes is reported and
   gives BADTOKEN; otherwise the converted tag text is stored in
   gedcom_lval and USERTAG is returned.  The line is counted in both
   cases. */
#define ACTION_ALPHANUM \
   { \
     if (strlen(yytext) > MAXGEDCTAGLEN * encoding_width) { \
       error_tag_too_long(yytext); \
       line_no++; \
       return BADTOKEN; \
     } \
     CHECK_LINE_LEN; \
     gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buffer); \
     gedcom_lval.tag.value  = USERTAG; \
     line_no++; \
     BEGIN(NORMAL); \
     return USERTAG; \
   }
299
300
/* Delimiter: return it as a DELIM token, with the converted text as
   value. */
#define ACTION_DELIM \
  { \
    CHECK_LINE_LEN; \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buffer); \
    return DELIM; \
  }
306
307
/* Any other character in a value.
   A failed conversion is reported as an invalid character and gives
   BADTOKEN.  Otherwise the converted text becomes the token value, and
   ANYCHAR is returned unless that text is empty: due to character
   conversions the current character may still have to be combined with
   the next one, so that there is no complete character yet.  (In
   principle this only applies to the 1byte case, e.g. ANSEL, but it
   doesn't harm the unicode case.) */
#define ACTION_ANY \
  { \
    char* converted; \
    CHECK_LINE_LEN; \
    converted = TO_INTERNAL(yytext, str_buffer); \
    if (converted) { \
      gedcom_lval.string = converted; \
      if (gedcom_lval.string[0] != '\0') \
        return ANYCHAR; \
    } \
    else { \
      /* Something went wrong during conversion... */ \
      error_invalid_character(yytext, yytext[0]); \
      return BADTOKEN; \
    } \
  }
329
330
/* Escape sequence: return it as an ESCAPE token, with the converted text
   as value. */
#define ACTION_ESCAPE \
  { \
    CHECK_LINE_LEN; \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buffer); \
    return ESCAPE; \
  }
336
337
/* Pointer: reported and answered with BADTOKEN if it is longer than
   MAXGEDCPTRLEN * encoding_width bytes, otherwise returned as a POINTER
   token with the converted text as value. */
#define ACTION_POINTER \
  { \
    CHECK_LINE_LEN; \
    if (strlen(yytext) <= MAXGEDCPTRLEN * encoding_width) { \
      gedcom_lval.string = TO_INTERNAL(yytext, ptr_buffer); \
      return POINTER; \
    } \
    error_pointer_too_long(yytext); \
    return BADTOKEN; \
  }
347
348
349 /* Due to the conversion of level numbers into brackets, the
350    terminator is not important, so no token is returned here.
351    Although not strictly according to the GEDCOM spec, we'll ignore
352    whitespace just before the terminator.
353 */
354
/* Line terminator: no token is returned.  The terminator still counts
   for the length of the line that it ends, after which the length count
   is restarted.  The terminator met while line_no is 1 is passed on to
   set_read_encoding_terminator().  The lexer then goes back to the
   INITIAL start condition. */
#define ACTION_TERMINATOR \
  { \
    CHECK_LINE_LEN; \
    INIT_LINE_LEN; \
    if (line_no == 1) { \
      set_read_encoding_terminator(TO_INTERNAL(yytext, str_buffer)); \
    } \
    BEGIN(INITIAL); \
  }
362
363
364 /* Eventually we have to return 1 closing bracket (for the trailer).
365    We can detect whether we have sent the closing bracket using the
366    level_diff (at eof, first it is 2, then we increment it ourselves)
367 */
368
/* End of file.
   While level_diff is still 2, the closing bracket for the trailer has
   not been sent yet: send it now and increment level_diff to remember
   that.  On the next call, lex is terminated. */
#define ACTION_EOF \
  { \
    if (level_diff != 2) { \
      char* ptr; int size; \
      /* ... terminate lex */ \
      yyterminate(); \
      /* yyterminate does return(), so the program never gets here.      \
         The call below is only present to get rid of a compiler warning \
         from the lex generated code. */ \
      yy_flex_realloc(ptr, size); \
    } \
    else { \
      level_diff++; \
      return CLOSE; \
    } \
  }
383
/* A single '@' in a value.
   In the C_NO_DOUBLE_AT compatibility mode the matched text is pushed
   back onto the input twice (from a copy, last character first), so that
   it is read again in doubled form.  If the copy cannot be allocated,
   MEMORY_ERROR is invoked.
   Outside of that compatibility mode this is an error: BADTOKEN. */
#define ACTION_NORMAL_AT \
  { \
    if (!compat_mode(C_NO_DOUBLE_AT)) { \
      error_at_character(); \
      return BADTOKEN; \
    } \
    else { \
      int pass, pos; \
      char *saved = strdup(yytext); \
      if (!saved) { \
        MEMORY_ERROR; \
      } \
      else { \
        for (pass = 0; pass < 2; pass++) { \
          pos = yyleng - 1; \
          while (pos >= 0) { \
            unput(saved[pos]); \
            --pos; \
          } \
        } \
        free(saved); \
      } \
    } \
  }
403
/* Tab character in a value.
   Only accepted in the C_TAB_CHARACTER compatibility mode: the counter
   of pending spaces is then set to 8 and the first space is returned
   right away.  Otherwise it is an error: BADTOKEN. */
#define ACTION_TAB \
  { \
    if (!compat_mode(C_TAB_CHARACTER)) { \
      error_tab_character(); \
      return BADTOKEN; \
    } \
    else { \
      tab_space = 8; \
      GENERATE_TAB_SPACE; \
    } \
  }
414
/* Character that is not expected here: report it, together with its
   first byte, and return BADTOKEN. */
#define ACTION_UNEXPECTED \
  { \
    error_unexpected_character(yytext, yytext[0]); \
    return BADTOKEN; \
  }
419
420 #elif LEX_SECTION == 3
421
/* Called by the lexer at the end of the input.  Always returns 1, which
   tells lex that there is no further input to continue with. */
int yywrap(void)
{
  return 1;
}
426
427 static void free_conv_buffers()
428 {
429   free_conv_buffer(ptr_buffer);
430   free_conv_buffer(tag_buffer);
431   free_conv_buffer(str_buffer);
432 }
433
434 static void yylex_cleanup()
435 {
436   /* fix memory leak in lex */
437   yy_delete_buffer(yy_current_buffer);
438   yy_current_buffer = NULL;
439   free_conv_buffers();
440 }
441
442 static void init_conv_buffers()
443 {
444   if (!ptr_buffer) {
445     ptr_buffer = create_conv_buffer(INITIAL_PTR_BUFFER_LEN);
446     tag_buffer = create_conv_buffer(INITIAL_TAG_BUFFER_LEN);
447     str_buffer = create_conv_buffer(INITIAL_STR_BUFFER_LEN);
448   }
449 }
450
451 static int exitfuncregistered = 0;
452
453 void yymyinit(FILE *f)
454 {
455   if (! exitfuncregistered && atexit(yylex_cleanup) == 0)
456     exitfuncregistered = 1;
457   init_conv_buffers();
458   yyin = f;
459   yyrestart(f);
460   /* Reset our state */
461   current_level = -1;
462   level_diff = MAXGEDCLEVEL;
463   BEGIN(INITIAL);
464 }
465
466 #endif