Copied from old documentation. Removed all Gedcom_val details.
[gedcom-parse.git] / gedcom / gedcom_lex_common.c
1 /* Common lexer code.
2    Copyright (C) 2001, 2002 The Genes Development Team
3    This file is part of the Gedcom parser library.
4    Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
5
6    The Gedcom parser library is free software; you can redistribute it
7    and/or modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10
11    The Gedcom parser library is distributed in the hope that it will be
12    useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15
16    You should have received a copy of the GNU Lesser General Public
17    License along with the Gedcom parser library; if not, write to the
18    Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20
21 /* $Id$ */
22 /* $Name$ */
23
24 #if LEX_SECTION == 1
25
26 #include "gedcom_internal.h"
27 #include "multilex.h"
28 #include "encoding.h"
29 #include "encoding_state.h"
30 #include "gedcom.h"
31 #include "gedcom.tabgen.h"
32 #include "compat.h"
33
/* Multiplier applied to the MAXGEDC*LEN limits in the length checks below;
   not assigned in this file (presumably set via set_encoding_width() --
   TODO confirm). */
34 static size_t encoding_width;
/* Level number of the most recently scanned line; -1 before the first line. */
35 static int current_level = -1;
/* Drives the pending CLOSE/OPEN tokens: values below 1 mean CLOSE tokens are
   still due, 1 means the OPEN is due, larger values mean nothing is pending. */
36 static int level_diff = MAXGEDCLEVEL;
/* Running length of the current input line as summed by CHECK_LINE_LEN;
   (size_t)-1 once a "line too long" error has been reported for the line. */
37 static size_t line_len = 0;
/* Countdown used while a tab is being replaced by space DELIM tokens. */
38 static int tab_space = 0;
/* Token value of the last standard tag (set by MKTAGACTION); -1 initially. */
39 static int current_tag = -1;
40
/* Buffers handed to to_internal() for pointers, tags and all other strings;
   created by init_conv_buffers() and released by free_conv_buffers(). */
41 static struct conv_buffer* ptr_buffer = NULL;
42 static struct conv_buffer* tag_buffer = NULL;
43 static struct conv_buffer* str_buffer = NULL;
44
/* Initial sizes passed to create_conv_buffer(): the maximum GEDCOM length of
   the item multiplied by UTF_FACTOR, plus one (presumably for the string
   terminator).  Fully parenthesized so that the macros also expand correctly
   inside larger expressions such as `2 * INITIAL_PTR_BUFFER_LEN`. */
#define INITIAL_PTR_BUFFER_LEN ((MAXGEDCPTRLEN) * (UTF_FACTOR) + 1)
#define INITIAL_TAG_BUFFER_LEN ((MAXGEDCTAGLEN) * (UTF_FACTOR) + 1)
#define INITIAL_STR_BUFFER_LEN ((MAXGEDCLINELEN) * (UTF_FACTOR) + 1)
48
49 #ifdef LEXER_TEST 
/* Definitions needed only by the standalone lexer test (LEXER_TEST): the
   semantic value filled in by the lexer actions and the line counter that
   the tag actions increment. */
50 YYSTYPE gedcom_lval;
51 int line_no = 1;
52
/* The scanner entry point called by test_loop(). */
53 int gedcom_lex();
54
55 void message_handler(Gedcom_msg_type type, char *msg)
56 {
57   fprintf(stderr, "(%d) %s\n", type, msg);
58 }
59
60 int test_loop(ENCODING enc, const char* code)
61 {
62   int tok, res;
63   init_encodings();
64   set_encoding_width(enc);
65   gedcom_set_message_handler(message_handler);
66   res = open_conv_to_internal(code);
67   if (!res) {
68     gedcom_error("Unable to open conversion context: %s",
69                  strerror(errno));
70     return 1;
71   }
72   tok = gedcom_lex();
73   while (tok) {
74     switch(tok) {
75       case BADTOKEN: printf("BADTOKEN "); break;
76       case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
77       case CLOSE: printf("CLOSE "); break;
78       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
79       case DELIM: printf("DELIM "); break;
80       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
81       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
82       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.tag.string); break;
83       default: printf("TAG(%s) ", gedcom_lval.tag.string); break;
84     }
85     tok = gedcom_lex();
86   }
87   printf("\n");
88   close_conv_to_internal();
89   return 0;  
90 }
91  
92 #endif /* of #ifdef LEXER_TEST */
93
94 /* These are defined as functions here, because xgettext has trouble
95    extracting the strings out of long preprocessor defines */
96
97 static void error_line_too_long()
98 {
99   gedcom_error(_("Line too long, max %d characters allowed"),
100                MAXGEDCLINELEN); 
101 }
102
/* Report a level number that is written with a leading zero.
   Declared with a (void) prototype so that calls with arguments are
   diagnosed by the compiler. */
static void error_level_leading_zero(void)
{
  gedcom_error(_("Level number with leading zero not allowed"));
}
107
108 static void error_level_out_of_range()
109 {
110   gedcom_error (_("Level number out of range [0..%d]"), MAXGEDCLEVEL); 
111 }
112
/* Report a line whose level number is more than one higher than that of the
   previous line.  JUMP is the difference between the two level numbers.
   (The parameter is deliberately not called level_diff, so that it does not
   shadow the file-scope variable of that name.) */
static void error_level_too_high(int jump)
{
  gedcom_error(_("GEDCOM level number is %d higher than previous"),
               jump);
}
118
119 static void error_tag_too_long(const char *tag)
120 {
121   gedcom_error(_("Tag '%s' too long, max %d characters allowed"),
122                tag, MAXGEDCTAGLEN); 
123 }
124
/* Report a character that could not be converted from the input encoding.
   STR is the offending token text, CH its first byte. */
static void error_invalid_character(const char *str, char ch)
{
  /* %x expects an unsigned int.  Going through unsigned char prevents bytes
     >= 0x80 from being sign-extended (printed as 0xffffffe9 instead of 0xe9)
     on platforms where plain char is signed. */
  gedcom_error(_("Invalid character for encoding: '%s' (0x%02x)"),
               str, (unsigned int)(unsigned char)ch);
}
129
130 static void error_pointer_too_long(const char *ptr)
131 {
132   gedcom_error(_("Pointer '%s' too long, max %d characters allowed"),
133                ptr, MAXGEDCPTRLEN);
134 }
135
/* Report a single '@' found in a value.
   Declared with a (void) prototype so that calls with arguments are
   diagnosed by the compiler. */
static void error_at_character(void)
{
  gedcom_error(_("'@' character should be written as '@@' in values"));
}
140
/* Report a tab character found in a value.
   Declared with a (void) prototype so that calls with arguments are
   diagnosed by the compiler. */
static void error_tab_character(void)
{
  gedcom_error(_("Tab character is not allowed in values"));
}
145
/* Report a character that is not valid at the current position.
   STR is the offending token text, CH its first byte. */
static void error_unexpected_character(const char* str, char ch)
{
  /* %x expects an unsigned int.  Going through unsigned char prevents bytes
     >= 0x80 from being sign-extended (printed as 0xffffffe9 instead of 0xe9)
     on platforms where plain char is signed. */
  gedcom_error(_("Unexpected character: '%s' (0x%02x)"),
               str, (unsigned int)(unsigned char)ch);
}
150
/* When non-zero, TO_INTERNAL hands back the token text unchanged instead of
   calling to_internal().  Nothing in this file sets it to non-zero. */
151 /* This is to bypass the iconv conversion (if the input is UTF-8 coming
152    from the program) */
153 static int dummy_conv = 0;
154
155 #elif LEX_SECTION == 2
156
/* Convert the token text STR (yyleng bytes long) to the internal encoding by
   calling to_internal() with OUTBUF.  When dummy_conv is non-zero, STR itself
   is the result and no conversion takes place.
   The macro parameters are parenthesized so that any expression can be
   passed safely. */
#define TO_INTERNAL(STR,OUTBUF) \
  (dummy_conv ? (STR) : to_internal((STR), yyleng, (OUTBUF)))
159
/* Reset the running length of the current input line.
   The macro is an expression without a trailing semicolon, so that
   `INIT_LINE_LEN;` is exactly one statement and can be used as the body of
   an if/else without braces. */
#define INIT_LINE_LEN \
  (line_len = 0)
162
/* Add the length of the current token to line_len and return BADTOKEN from
   the scanner (after reporting the error once) when the line becomes longer
   than MAXGEDCLINELEN * encoding_width, unless compat_long_line() allows it.
   A line_len of (size_t)-1 means the error was already reported for this
   line, so nothing is checked any more.

   The token length is taken from yyleng rather than strlen(yytext): in the
   multi-byte encodings that encoding_width accounts for, the token can
   contain zero bytes, which would make strlen() stop too early. */
#define CHECK_LINE_LEN                                                        \
  { if (line_len != (size_t)-1) {                                             \
      line_len += (size_t)yyleng;                                             \
      if (line_len > MAXGEDCLINELEN * encoding_width                          \
          && ! compat_long_line(current_level, current_tag)) {                \
        error_line_too_long();                                                \
        line_len = (size_t)-1;                                                \
        return BADTOKEN;                                                      \
      }                                                                       \
    }                                                                         \
  }
174
/* Emit one of the spaces that replace a tab character: count it off in
   tab_space and return a DELIM token whose value is a single space. */
#define GENERATE_TAB_SPACE                                                    \
  { tab_space--;                                                              \
    gedcom_lval.string = " ";                                                 \
    return DELIM;                                                             \
  }
180
/* Action for a standard tag THETAG: after the line length check, store the
   converted tag text and the token value TAG_<THETAG> in gedcom_lval.tag,
   remember the token in current_tag, count the line, switch to the NORMAL
   start condition and return the token. */
#define MKTAGACTION(THETAG)                                                  \
  { CHECK_LINE_LEN;                                                          \
    gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buffer);                \
    gedcom_lval.tag.value  = current_tag = TAG_##THETAG;                     \
    line_no++;                                                               \
    BEGIN(NORMAL);                                                           \
    return current_tag;                                                      \
  }
190
191 /* The GEDCOM level number is converted into a sequence of opening
192    and closing brackets.  Simply put, the following GEDCOM fragment:
193    
194    0 HEAD
195    1 SOUR genes
196    2 VERS 1.6
197    2 NAME Genes
198    1 DATE 07 OCT 2001
199    ...
200    0 TRLR
201    
202    is converted into:
203    
204    { HEAD                     (initial)  
205    { SOUR genes               (1 higher: no closing brackets)
206    { VERS 1.6                 (1 higher: no closing brackets)
207    } { NAME Genes             (same level: 1 closing bracket)
208    } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
209    ...
210    } { TRLR }
211    
212    or more clearly:
213    
214    { HEAD
215      { SOUR genes
216        { VERS 1.6 }
217        { NAME Genes } }
218      { DATE 07 OCT 2001
219      ... }
220    { TRLR }
221
222    But because this means that one token is converted into a series
223    of tokens, there is some initial code following immediately here
224    that returns "pending" tokens.
225
226    Also, for compatibility tabs are converted into spaces, which is
227    also handled here */
228
/* Code run at the start of every scanner call, before any pattern is
   matched.  It returns the tokens that are still pending from an earlier
   match:
     - in tab compatibility mode, the remaining spaces generated for a tab;
     - the CLOSE tokens (while level_diff < 1) and then the OPEN token
       (level_diff == 1) that a level number was converted into.
   Otherwise control falls through to normal pattern matching.

   tab_space is only decremented while it is still positive.  Decrementing it
   on every call (as `tab_space-- > 0` in the condition would do) lets it run
   towards INT_MIN on long inputs, which ends in signed overflow.

   NOTE(review): tab_space is decremented here and once more inside
   GENERATE_TAB_SPACE, so a tab that starts with tab_space = 8 produces 5
   spaces in total.  That count is left as it was; confirm whether 8 spaces
   were intended. */
#define ACTION_BEFORE_REGEXPS                                                 \
   { if (compat_mode(C_TAB_CHARACTER) && tab_space > 0) {                     \
       tab_space--;                                                           \
       GENERATE_TAB_SPACE;                                                    \
     }                                                                        \
     else if (level_diff < 1) {                                               \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     else if (level_diff == 1) {                                              \
       level_diff++;                                                          \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     else {                                                                   \
       /* out of brackets... */                                               \
     }                                                                        \
   }
246
247
/* Whitespace at the start of a line only counts towards the line length;
   no token is returned for it. */
#define ACTION_INITIAL_WHITESPACE                                             \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
  }
252
253
/* A level number with a leading zero: report it and hand BADTOKEN to the
   parser. */
#define ACTION_0_DIGITS                                                       \
   {                                                                          \
     error_level_leading_zero();                                              \
     return BADTOKEN;                                                         \
   }
258
259
/* Action for the level number at the start of a line.

   The level is compared with the level of the previous line and translated
   into brackets: this action returns the first CLOSE (level not higher than
   before) or the OPEN (level exactly one higher); any further pending
   tokens are returned by ACTION_BEFORE_REGEXPS on the following calls.
   Afterwards the scanner is in the EXPECT_TAG start condition.

   BADTOKEN is returned (and the line is counted) when
     - the text could not be converted to the internal encoding,
     - the level is outside 0..MAXGEDCLEVEL, or
     - the level is more than one higher than the previous level.

   The number is parsed with strtol() instead of atoi(): a failed conversion
   (NULL) is never passed to the parser function, and a digit string that is
   too large for a long yields LONG_MAX, which the range check rejects,
   instead of undefined behaviour. */
#define ACTION_DIGITS                                                         \
   { const char *level_text = TO_INTERNAL(yytext, str_buffer);                \
     long level_value;                                                        \
     int level;                                                               \
     CHECK_LINE_LEN;                                                          \
     if (!level_text) {                                                       \
       /* Something went wrong during conversion... */                        \
       error_invalid_character(yytext, yytext[0]);                            \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
     level_value = strtol(level_text, NULL, 10);                              \
     if ((level_value < 0) || (level_value > MAXGEDCLEVEL)) {                 \
       error_level_out_of_range();                                            \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
     level = (int)level_value;                                                \
     level_diff = level - current_level;                                      \
     BEGIN(EXPECT_TAG);                                                       \
     current_level = level;                                                   \
     if (level_diff < 1) {                                                    \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     else if (level_diff == 1) {                                              \
       level_diff++;                                                          \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     else {                                                                   \
       /* should never happen (error to GEDCOM spec) */                       \
       error_level_too_high(level_diff);                                      \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
   }
287
288
/* Action for a tag that is not one of the standard tags: it is returned as
   USERTAG with the converted tag text in gedcom_lval.tag.string.  A tag that
   is longer than MAXGEDCTAGLEN * encoding_width is reported and returned as
   BADTOKEN.  Either way the line is counted here, and on success the scanner
   switches to the NORMAL start condition.

   The tag length is taken from yyleng rather than strlen(yytext): in the
   multi-byte encodings that encoding_width accounts for, the token can
   contain zero bytes, which would make strlen() stop too early. */
#define ACTION_ALPHANUM                                                       \
   { if ((size_t)yyleng > MAXGEDCTAGLEN * encoding_width) {                   \
       error_tag_too_long(yytext);                                            \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
     CHECK_LINE_LEN;                                                          \
     gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buffer);                \
     gedcom_lval.tag.value  = USERTAG;                                        \
     BEGIN(NORMAL);                                                           \
     line_no++;                                                               \
     return USERTAG;                                                          \
   }
302
303
/* Action for a delimiter: after the line length check, return DELIM with
   the converted delimiter text as its value. */
#define ACTION_DELIM                                                          \
  { char* delim_text;                                                         \
    CHECK_LINE_LEN;                                                           \
    delim_text = TO_INTERNAL(yytext, str_buffer);                             \
    gedcom_lval.string = delim_text;                                          \
    return DELIM;                                                             \
  }
309
310
/* Action for any other character in a value.

   The character is converted to the internal encoding.  If the conversion
   fails, the character is reported and BADTOKEN is returned.  Otherwise
   ANYCHAR is returned with the converted text, except when the conversion
   produced an empty string: the current character may still be combined
   with the next one (in principle only in the 1-byte case, e.g. ANSEL, but
   it does no harm for unicode), so no token is returned yet. */
#define ACTION_ANY                                                            \
  { char* converted;                                                          \
    CHECK_LINE_LEN;                                                           \
    converted = TO_INTERNAL(yytext, str_buffer);                              \
    if (!converted) {                                                         \
      /* Something went wrong during conversion... */                         \
      error_invalid_character(yytext, yytext[0]);                             \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = converted;                                           \
    if (converted[0] != '\0')                                                 \
      return ANYCHAR;                                                         \
  }
332
333
/* Action for an escape sequence: after the line length check, return ESCAPE
   with the converted text of the escape as its value. */
#define ACTION_ESCAPE                                                         \
  { char* escape_text;                                                        \
    CHECK_LINE_LEN;                                                           \
    escape_text = TO_INTERNAL(yytext, str_buffer);                            \
    gedcom_lval.string = escape_text;                                         \
    return ESCAPE;                                                            \
  }
339
340
/* Action for a pointer (cross-reference): after the line length check,
   return POINTER with the converted pointer text as its value.  A pointer
   that is longer than MAXGEDCPTRLEN * encoding_width is reported and
   returned as BADTOKEN.

   The pointer length is taken from yyleng rather than strlen(yytext): in the
   multi-byte encodings that encoding_width accounts for, the token can
   contain zero bytes, which would make strlen() stop too early. */
#define ACTION_POINTER                                                        \
  { CHECK_LINE_LEN;                                                           \
    if ((size_t)yyleng > MAXGEDCPTRLEN * encoding_width) {                    \
      error_pointer_too_long(yytext);                                         \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = TO_INTERNAL(yytext, ptr_buffer);                     \
    return POINTER;                                                           \
  }
350
351
352 /* Due to the conversion of level numbers into brackets, the
353    terminator is not important, so no token is returned here.
354    Although not strictly according to the GEDCOM spec, we'll ignore
355    whitespace just before the terminator.
356 */
357
/* Action for the line terminator: count it towards the line length, reset
   the line length for the next line, pass the converted terminator text to
   set_read_encoding_terminator() while line_no is still 1, and switch back
   to the INITIAL start condition.  No token is returned on this path.
   NOTE(review): when CHECK_LINE_LEN detects an over-long line at this point
   it returns BADTOKEN before INIT_LINE_LEN and BEGIN(INITIAL) are executed,
   which leaves line_len at (size_t)-1 and the start condition unchanged --
   confirm that this is intended. */
358 #define ACTION_TERMINATOR                                                     \
359   { CHECK_LINE_LEN;                                                           \
360     INIT_LINE_LEN;                                                            \
361     if (line_no == 1)                                                         \
362       set_read_encoding_terminator(TO_INTERNAL(yytext, str_buffer));          \
363     BEGIN(INITIAL);                                                           \
364   }
365
366
367 /* Eventually we have to return 1 closing bracket (for the trailer).
368    We can detect whether we have sent the closing bracket using the
369    level_diff (at eof, first it is 2, then we increment it ourselves)
370 */
371
/* Action at end of input.  While level_diff is 2, it is incremented and one
   CLOSE is returned; on any other value the scanner is stopped with
   yyterminate().  The yy_flex_realloc() call that follows yyterminate() is,
   according to the comments inside the macro, never reached and only there
   to silence a compiler warning about the generated code; ptr and size are
   therefore left uninitialized on purpose. */
372 #define ACTION_EOF                                                            \
373   { if (level_diff == 2) {                                                    \
374       level_diff++;                                                           \
375       return CLOSE;                                                           \
376     }                                                                         \
377     else {                                                                    \
378       char* ptr; int size;                                                    \
379       /* ... terminate lex */                                                 \
380       yyterminate();                                                          \
381       /* Get rid of f*cking compiler warning from lex generated code */       \
382       /* yyterminate does return(), so program will never come here  */       \
383       yy_flex_realloc(ptr, size);                                             \
384     }                                                                         \
385   } 
386
/* Action for a single '@' in a value.

   Outside the C_NO_DOUBLE_AT compatibility mode this is an error: it is
   reported and BADTOKEN is returned.  In that compatibility mode the matched
   text is pushed back onto the input twice, so that the scanner sees it
   doubled.  The text is pushed back from a private copy of yytext, last
   character first; if the copy cannot be allocated, MEMORY_ERROR is
   invoked. */
#define ACTION_NORMAL_AT                                                      \
  { if (! compat_mode(C_NO_DOUBLE_AT)) {                                      \
      error_at_character();                                                   \
      return BADTOKEN;                                                        \
    }                                                                         \
    else {                                                                    \
      char *saved_text = strdup(yytext);                                      \
      if (! saved_text) {                                                     \
        MEMORY_ERROR;                                                         \
      }                                                                       \
      else {                                                                  \
        int pass, pos;                                                        \
        for (pass = 0; pass < 2; pass++) {                                    \
          for (pos = yyleng - 1; pos >= 0; --pos) {                           \
            unput(saved_text[pos]);                                           \
          }                                                                   \
        }                                                                     \
        free(saved_text);                                                     \
      }                                                                       \
    }                                                                         \
  }
406
/* Action for a tab character in a value.  Outside the C_TAB_CHARACTER
   compatibility mode the tab is reported and BADTOKEN is returned; in that
   mode tab_space is set to 8 and the first replacement space is returned
   through GENERATE_TAB_SPACE. */
#define ACTION_TAB                                                            \
  { if (! compat_mode(C_TAB_CHARACTER)) {                                     \
      error_tab_character();                                                  \
      return BADTOKEN;                                                        \
    }                                                                         \
    tab_space = 8;                                                            \
    GENERATE_TAB_SPACE;                                                       \
  }
417
/* Action for a character that no other rule accepts: report the token text
   together with its first byte and return BADTOKEN. */
#define ACTION_UNEXPECTED                                                     \
  {                                                                           \
    error_unexpected_character(yytext, yytext[0]);                            \
    return BADTOKEN;                                                          \
  }
422
423 #elif LEX_SECTION == 3
424
/* Called by the scanner at end of file.  Always returns 1: there is no
   further input to continue with.
   Declared with a (void) prototype so that calls with arguments are
   diagnosed by the compiler. */
int yywrap(void)
{
  return 1;
}
429
430 static void free_conv_buffers()
431 {
432   free_conv_buffer(ptr_buffer);
433   free_conv_buffer(tag_buffer);
434   free_conv_buffer(str_buffer);
435 }
436
437 static void yylex_cleanup()
438 {
439   /* fix memory leak in lex */
440   yy_delete_buffer(yy_current_buffer);
441   yy_current_buffer = NULL;
442   free_conv_buffers();
443 }
444
445 static void init_conv_buffers()
446 {
447   if (!ptr_buffer) {
448     ptr_buffer = create_conv_buffer(INITIAL_PTR_BUFFER_LEN);
449     tag_buffer = create_conv_buffer(INITIAL_TAG_BUFFER_LEN);
450     str_buffer = create_conv_buffer(INITIAL_STR_BUFFER_LEN);
451   }
452 }
453
454 static int exitfuncregistered = 0;
455
456 void yymyinit(FILE *f)
457 {
458   if (! exitfuncregistered && atexit(yylex_cleanup) == 0)
459     exitfuncregistered = 1;
460   init_conv_buffers();
461   yyin = f;
462   yyrestart(f);
463   /* Reset our state */
464   current_level = -1;
465   level_diff = MAXGEDCLEVEL;
466   BEGIN(INITIAL);
467 }
468
469 #endif