045cea22d3ba46e2bb3c64500efafe5c784263d1
[gedcom-parse.git] / gedcom / gedcom_lex_common.c
1 /* Common lexer code.
2    Copyright (C) 2001, 2002 The Genes Development Team
3    This file is part of the Gedcom parser library.
4    Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
5
6    The Gedcom parser library is free software; you can redistribute it
7    and/or modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10
11    The Gedcom parser library is distributed in the hope that it will be
12    useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15
16    You should have received a copy of the GNU Lesser General Public
17    License along with the Gedcom parser library; if not, write to the
18    Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20
21 /* $Id$ */
22 /* $Name$ */
23
24 #ifndef IN_LEX
25
26 #include "gedcom_internal.h"
27 #include "multilex.h"
28 #include "encoding.h"
29 #include "gedcom.h"
30 #include "gedcom.tabgen.h"
31 #include "compat.h"
32
33 static size_t encoding_width;
34 static int current_level = -1;
35 static int level_diff=MAXGEDCLEVEL;
36 static size_t line_len = 0;
37
38 static char ptr_buf[MAXGEDCPTRLEN * UTF_FACTOR + 1];
39 static char tag_buf[MAXGEDCTAGLEN * UTF_FACTOR + 1];
40 static char str_buf[MAXGEDCLINELEN * UTF_FACTOR + 1];
41
42 #ifdef LEXER_TEST 
43 YYSTYPE gedcom_lval;
44 int line_no = 1;
45 int compat_at = 0;
46
47 int gedcom_lex();
48
49 void message_handler(Gedcom_msg_type type, char *msg)
50 {
51   fprintf(stderr, "(%d) %s\n", type, msg);
52 }
53
54 int test_loop(ENCODING enc, const char* code)
55 {
56   int tok, res;
57   init_encodings();
58   set_encoding_width(enc);
59   gedcom_set_message_handler(message_handler);
60   res = open_conv_to_internal(code);
61   if (!res) {
62     gedcom_error("Unable to open conversion context: %s",
63                  strerror(errno));
64     return 1;
65   }
66   tok = gedcom_lex();
67   while (tok) {
68     switch(tok) {
69       case BADTOKEN: printf("BADTOKEN "); break;
70       case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
71       case CLOSE: printf("CLOSE "); break;
72       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
73       case DELIM: printf("DELIM "); break;
74       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
75       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
76       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.tag.string); break;
77       default: printf("TAG(%s) ", gedcom_lval.tag.string); break;
78     }
79     tok = gedcom_lex();
80   }
81   printf("\n");
82   close_conv_to_internal();
83   return 0;  
84 }
85  
86 #endif /* of #ifdef LEXER_TEST */
87
/* These are defined as functions here, because xgettext has trouble
   extracting the strings out of long pre-processor defines */
90
91 static void error_line_too_long()
92 {
93   gedcom_error(_("Line too long, max %d characters allowed"), MAXGEDCLINELEN); 
94 }
95
/* Report a level number that is written with a leading zero. */
static void error_level_leading_zero(void)
{
  gedcom_error(_("Level number with leading zero not allowed"));
}
100
101 static void error_level_out_of_range()
102 {
103   gedcom_error (_("Level number out of range [0..%d]"), MAXGEDCLEVEL); 
104 }
105
/* Report a line whose level is more than one above that of the previous
   line; LEVEL_DIFF is the size of the jump. */
static void error_level_too_high(int level_diff)
{
  gedcom_error(_("GEDCOM level number is %d higher than previous"), level_diff);
}
111
112 static void error_tag_too_long(const char *tag)
113 {
114   gedcom_error(_("Tag '%s' too long, max %d characters allowed"),
115                tag, MAXGEDCTAGLEN); 
116 }
117
/* Report input that could not be converted from the input encoding.
   STR is the text as matched by the lexer, CH is its first byte, which is
   shown in hexadecimal. */
static void error_invalid_character(const char *str, char ch)
{
  /* Convert through unsigned char: plain char may be signed, in which case
     a byte >= 0x80 would be sign-extended by the argument promotion and be
     printed as e.g. 0xffffffe9 instead of 0xe9.  %x also expects an
     unsigned int argument. */
  gedcom_error(_("Invalid character for encoding: '%s' (0x%02x)"),
               str, (unsigned int)(unsigned char)ch);
}
122
123 static void error_pointer_too_long(const char *ptr)
124 {
125   gedcom_error(_("Pointer '%s' too long, max %d characters allowed"),
126                ptr, MAXGEDCPTRLEN);
127 }
128
/* Report a single '@' inside a value (it has to be doubled there). */
static void error_at_character(void)
{
  gedcom_error(_("'@' character should be written as '@@' in values"));
}
133
/* Report input that is not allowed at the current position.
   STR is the text as matched by the lexer, CH is its first byte, which is
   shown in hexadecimal. */
static void error_unexpected_character(const char* str, char ch)
{
  /* Convert through unsigned char: plain char may be signed, in which case
     a byte >= 0x80 would be sign-extended by the argument promotion and be
     printed as e.g. 0xffffffe9 instead of 0xe9.  %x also expects an
     unsigned int argument. */
  gedcom_error(_("Unexpected character: '%s' (0x%02x)"),
               str, (unsigned int)(unsigned char)ch);
}
138
139 static void yylex_cleanup()
140 {
141   /* fix memory leak in lex */
142   yy_delete_buffer(yy_current_buffer);
143   yy_current_buffer = NULL;
144 }
145
146 #else  /* of #ifndef IN_LEX */
147
/* Pass the yyleng bytes at STR through to_internal(), with OUTBUF as the
   destination.  OUTBUF has to be a real array and not a pointer, because
   its capacity is taken with sizeof. */
#define TO_INTERNAL(STR,OUTBUF)                                               \
  to_internal((STR), yyleng, (OUTBUF), sizeof(OUTBUF))
150
/* Restart the length count for a new input line.  Note that the expansion
   already ends in a ';'. */
#define INIT_LINE_LEN line_len = 0;
153
/* Add the length of the token just matched to the running length of the
   current line.  When the total exceeds MAXGEDCLINELEN * encoding_width,
   the error is reported and the enclosing lexer function returns BADTOKEN;
   line_len is then parked at (size_t)-1, so that the remainder of the same
   line is not checked (and reported) again.
   The token length is taken from yyleng and not from strlen(yytext): in
   the multi-byte encodings the matched bytes can contain NUL bytes, which
   makes strlen() stop too early (TO_INTERNAL relies on yyleng for the same
   reason). */
#define CHECK_LINE_LEN                                                        \
  { if (line_len != (size_t)-1) {                                             \
      line_len += (size_t)yyleng;                                             \
      if (line_len > MAXGEDCLINELEN * encoding_width) {                       \
        error_line_too_long();                                                \
        line_len = (size_t)-1;                                                \
        return BADTOKEN;                                                      \
      }                                                                       \
    }                                                                         \
  }
164
/* Action for a standard tag: MKTAGACTION(HEAD) stores the converted tag
   text and the value TAG_HEAD in gedcom_lval.tag, switches the lexer to
   the NORMAL start condition and returns TAG_HEAD as token. */
#define MKTAGACTION(THETAG)                                                  \
  {                                                                          \
    CHECK_LINE_LEN;                                                          \
    gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buf);                   \
    gedcom_lval.tag.value  = TAG_##THETAG;                                   \
    BEGIN(NORMAL);                                                           \
    return TAG_##THETAG;                                                     \
  }
172
173 /* The GEDCOM level number is converted into a sequence of opening
174    and closing brackets.  Simply put, the following GEDCOM fragment:
175    
176    0 HEAD
177    1 SOUR genes
178    2 VERS 1.6
179    2 NAME Genes
180    1 DATE 07 OCT 2001
181    ...
182    0 TRLR
183    
184    is converted into:
185    
186    { HEAD                     (initial)  
187    { SOUR genes               (1 higher: no closing brackets)
188    { VERS 1.6                 (1 higher: no closing brackets)
189    } { NAME Genes             (same level: 1 closing bracket)
190    } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
191    ...
192    } { TRLR }
193    
194    or more clearly:
195    
196    { HEAD
197      { SOUR genes
198        { VERS 1.6 }
199        { NAME Genes } }
200      { DATE 07 OCT 2001
201      ... }
202    { TRLR }
203
204    But because this means that one token is converted into a series
205    of tokens, there is some initial code following immediately here
206    that returns "pending" tokens. */
207
/* Hand out the bracket tokens that are still pending for the current
   level number: one CLOSE per call as long as level_diff is below 1, then
   a single OPEN (carrying the current level) when it has reached 1.
   Nothing is returned once level_diff is above 1, and normal pattern
   matching goes on. */
#define ACTION_BEFORE_REGEXPS                                                 \
   { if (level_diff <= 1) {                                                   \
       /* post-increment: the comparison sees the value before the step */    \
       if (level_diff++ == 1) {                                               \
         gedcom_lval.number = current_level;                                  \
         return OPEN;                                                         \
       }                                                                      \
       return CLOSE;                                                          \
     }                                                                        \
     /* otherwise: out of brackets... */                                      \
   }
222
223
/* Whitespace at the start of a line: it counts for the line length, but
   produces no token. */
#define ACTION_INITIAL_WHITESPACE                                             \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    /* nothing else to do: initial whitespace is ignored */                   \
  }
228
229
/* Level number written with a leading zero: report it and return
   BADTOKEN. */
#define ACTION_0_DIGITS                                                       \
   {                                                                          \
     error_level_leading_zero();                                              \
     return BADTOKEN;                                                         \
   }
234
235
/* Level number at the start of a line.
   The number is converted and checked against [0..MAXGEDCLEVEL]; then
   level_diff and current_level are updated, the lexer switches to the
   EXPECT_TAG start condition, and the first bracket token for this line is
   returned: CLOSE if the level is not higher than the previous one, OPEN
   if it is exactly one higher.  Further pending brackets are handed out
   on the following calls by the code that runs before the regular
   expressions.  A level that is more than one higher than the previous
   one is an error (BADTOKEN).
   The conversion result is checked for NULL before it is used (the
   conversion can fail, as ACTION_ANY shows), and strtol() is used instead
   of atoi() because atoi() has undefined behaviour for values that do not
   fit in an int; strtol() saturates, so that an oversized number simply
   fails the range check. */
#define ACTION_DIGITS                                                         \
   { char* level_str = TO_INTERNAL(yytext, str_buf);                          \
     long level;                                                              \
     CHECK_LINE_LEN;                                                          \
     if (!level_str) {                                                        \
       /* Something went wrong during conversion... */                        \
       error_invalid_character(yytext, yytext[0]);                            \
       return BADTOKEN;                                                       \
     }                                                                        \
     level = strtol(level_str, NULL, 10);                                     \
     if ((level < 0) || (level > MAXGEDCLEVEL)) {                             \
       error_level_out_of_range();                                            \
       return BADTOKEN;                                                       \
     }                                                                        \
     level_diff = (int)level - current_level;                                 \
     BEGIN(EXPECT_TAG);                                                       \
     current_level = (int)level;                                              \
     if (level_diff < 1) {                                                    \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     else if (level_diff == 1) {                                              \
       level_diff++;                                                          \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     else {                                                                   \
       /* should never happen (error to GEDCOM spec) */                       \
       error_level_too_high(level_diff);                                      \
       return BADTOKEN;                                                       \
     }                                                                        \
   }
261
262
/* Tag that is not one of the standard tags: returned as USERTAG, with the
   converted tag text in gedcom_lval.tag.string.  A tag that is longer than
   MAXGEDCTAGLEN * encoding_width is reported and gives BADTOKEN.
   The length is taken from yyleng and not from strlen(yytext): in the
   multi-byte encodings the matched bytes can contain NUL bytes, which
   makes strlen() stop too early. */
#define ACTION_ALPHANUM                                                       \
   { if ((size_t)yyleng > MAXGEDCTAGLEN * encoding_width) {                   \
       error_tag_too_long(yytext);                                            \
       return BADTOKEN;                                                       \
     }                                                                        \
     CHECK_LINE_LEN;                                                          \
     gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buf);                   \
     gedcom_lval.tag.value  = USERTAG;                                        \
     BEGIN(NORMAL);                                                           \
     return USERTAG;                                                          \
   }
274
275
/* Delimiter: returned as DELIM, with the converted text in
   gedcom_lval.string. */
#define ACTION_DELIM                                                          \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
    return DELIM;                                                             \
  }
281
282
/* Any other character: converted and returned as ANYCHAR with the result
   in gedcom_lval.string.  A failed conversion is reported and gives
   BADTOKEN.  If the conversion succeeds but produces no output yet, no
   token is returned and lexing simply continues. */
#define ACTION_ANY                                                            \
  { char* converted;                                                          \
    CHECK_LINE_LEN;                                                           \
    converted = TO_INTERNAL(yytext, str_buf);                                 \
    if (converted == NULL) {                                                  \
      /* Something went wrong during conversion... */                         \
      error_invalid_character(yytext, yytext[0]);                             \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = converted;                                           \
    /* Due to character conversions, it is possible that the current          \
       character will be combined with the next, and so now we don't have a   \
       character yet...                                                       \
       In principle, this is only applicable to the 1byte case (e.g. ANSEL),  \
       but it doesn't harm the unicode case.                                  \
    */                                                                        \
    if (converted[0] != '\0')                                                 \
      return ANYCHAR;                                                         \
  }
304
305
/* Escape sequence: returned as ESCAPE, with the converted text in
   gedcom_lval.string. */
#define ACTION_ESCAPE                                                         \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
    return ESCAPE;                                                            \
  }
311
312
/* Pointer: returned as POINTER, with the converted text in
   gedcom_lval.string.  A pointer that is longer than
   MAXGEDCPTRLEN * encoding_width is reported and gives BADTOKEN.
   The length is taken from yyleng and not from strlen(yytext): in the
   multi-byte encodings the matched bytes can contain NUL bytes, which
   makes strlen() stop too early. */
#define ACTION_POINTER                                                        \
  { CHECK_LINE_LEN;                                                           \
    if ((size_t)yyleng > MAXGEDCPTRLEN * encoding_width) {                    \
      error_pointer_too_long(yytext);                                         \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = TO_INTERNAL(yytext, ptr_buf);                        \
    return POINTER;                                                           \
  }
322
323
324 /* Due to the conversion of level numbers into brackets, the
325    terminator is not important, so no token is returned here.
326    Although not strictly according to the GEDCOM spec, we'll ignore
327    whitespace just before the terminator.
328 */
329
/* Line terminator: no token is returned.  The terminator still counts for
   the length of the line it ends; after that the length count is reset,
   the line number is incremented and the lexer goes back to the INITIAL
   start condition. */
#define ACTION_TERMINATOR                                                     \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    INIT_LINE_LEN;                                                            \
    ++line_no;                                                                \
    BEGIN(INITIAL);                                                           \
  }
336
337
338 /* Eventually we have to return 1 closing bracket (for the trailer).
339    We can detect whether we have sent the closing bracket using the
340    level_diff (at eof, first it is 2, then we increment it ourselves)
341 */
342
/* End of input.  On the first call (level_diff is 2 then) the final CLOSE
   is returned; on the next call the level bookkeeping is reset to its
   start values and the scanner is terminated.
   The call to yy_flex_realloc() is never executed; it only exists to give
   that function a user.  Its dummy arguments are initialised, so that this
   call does not in turn cause "used uninitialized" warnings. */
#define ACTION_EOF                                                            \
  { if (level_diff == 2) {                                                    \
      level_diff++;                                                           \
      return CLOSE;                                                           \
    }                                                                         \
    else {                                                                    \
      char* ptr = NULL; int size = 0;                                         \
      /* Reset our state */                                                   \
      current_level = -1;                                                     \
      level_diff = MAXGEDCLEVEL;                                              \
      /* ... then terminate lex */                                            \
      yyterminate();                                                          \
      /* Get rid of the "defined but not used" compiler warning from the */   \
      /* lex generated code.                                              */  \
      /* yyterminate does return(), so program will never come here  */       \
      yy_flex_realloc(ptr, size);                                             \
    }                                                                         \
  }
360
/* Single '@' inside a value.
   Without compat_at this is an error (BADTOKEN).  With compat_at, the
   matched text is pushed back onto the input twice, so that the lexer will
   see the doubled form next.
   unput() destroys yytext, so the text is copied first.  The copy is made
   with memcpy() over exactly yyleng bytes: in the multi-byte encodings the
   matched bytes can contain NUL bytes, so a strdup() copy could be shorter
   than yyleng and the push-back loop would read past its end. */
#define ACTION_NORMAL_AT                                                      \
  { if (compat_at) {                                                          \
      int i, j;                                                               \
      int len = (int)yyleng;                                                  \
      char *yycopy = malloc((size_t)len + 1);                                 \
      if (yycopy) {                                                           \
        memcpy(yycopy, yytext, (size_t)len);                                  \
        yycopy[len] = '\0';                                                   \
        for (i = 0; i < 2; i++)                                               \
          /* unput() prepends, so push the bytes back last one first */       \
          for (j = len - 1; j >= 0; --j)                                      \
            unput(yycopy[j]);                                                 \
        free(yycopy);                                                         \
      }                                                                       \
      else {                                                                  \
        MEMORY_ERROR;                                                         \
      }                                                                       \
    }                                                                         \
    else {                                                                    \
      error_at_character();                                                   \
      return BADTOKEN;                                                        \
    }                                                                         \
  }
380
/* Input that matches no other rule: report the matched text together with
   its first byte, and return BADTOKEN. */
#define ACTION_UNEXPECTED                                                     \
  {                                                                           \
    error_unexpected_character(yytext, yytext[0]);                            \
    return BADTOKEN;                                                          \
  }
385
386 #endif /* IN_LEX */