efc4bf931eed796ff332e6dec4df9a36f20d9c28
[gedcom-parse.git] / gedcom / gedcom_lex_common.c
1 /* Common lexer code.
2    Copyright (C) 2001, 2002 The Genes Development Team
3    This file is part of the Gedcom parser library.
4    Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
5
6    The Gedcom parser library is free software; you can redistribute it
7    and/or modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10
11    The Gedcom parser library is distributed in the hope that it will be
12    useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15
16    You should have received a copy of the GNU Lesser General Public
17    License along with the Gedcom parser library; if not, write to the
18    Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20
21 /* $Id$ */
22 /* $Name$ */
23
24 #if LEX_SECTION == 1
25
26 #include "gedcom_internal.h"
27 #include "multilex.h"
28 #include "encoding.h"
29 #include "gedcom.h"
30 #include "gedcom.tabgen.h"
31 #include "compat.h"
32
33 static size_t encoding_width;
34 static int current_level = -1;
35 static int level_diff=MAXGEDCLEVEL;
36 static size_t line_len = 0;
37
38 static char ptr_buf[MAXGEDCPTRLEN * UTF_FACTOR + 1];
39 static char tag_buf[MAXGEDCTAGLEN * UTF_FACTOR + 1];
40 static char str_buf[MAXGEDCLINELEN * UTF_FACTOR + 1];
41
42 #ifdef LEXER_TEST 
43 YYSTYPE gedcom_lval;
44 int line_no = 1;
45 int compat_at = 0;
46
47 int gedcom_lex();
48
49 void message_handler(Gedcom_msg_type type, char *msg)
50 {
51   fprintf(stderr, "(%d) %s\n", type, msg);
52 }
53
54 int test_loop(ENCODING enc, const char* code)
55 {
56   int tok, res;
57   init_encodings();
58   set_encoding_width(enc);
59   gedcom_set_message_handler(message_handler);
60   res = open_conv_to_internal(code);
61   if (!res) {
62     gedcom_error("Unable to open conversion context: %s",
63                  strerror(errno));
64     return 1;
65   }
66   tok = gedcom_lex();
67   while (tok) {
68     switch(tok) {
69       case BADTOKEN: printf("BADTOKEN "); break;
70       case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
71       case CLOSE: printf("CLOSE "); break;
72       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
73       case DELIM: printf("DELIM "); break;
74       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
75       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
76       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.tag.string); break;
77       default: printf("TAG(%s) ", gedcom_lval.tag.string); break;
78     }
79     tok = gedcom_lex();
80   }
81   printf("\n");
82   close_conv_to_internal();
83   return 0;  
84 }
85  
86 #endif /* of #ifdef LEXER_TEST */
87
/* These are defined as functions here, because xgettext has trouble
   extracting the strings out of long pre-processor defines */
90
91 static void error_line_too_long()
92 {
93   gedcom_error(_("Line too long, max %d characters allowed"), MAXGEDCLINELEN); 
94 }
95
/* Report a level number that is written with a leading zero. */
static void error_level_leading_zero()
{
  gedcom_error(_("Level number with leading zero not allowed"));
}
100
101 static void error_level_out_of_range()
102 {
103   gedcom_error (_("Level number out of range [0..%d]"), MAXGEDCLEVEL); 
104 }
105
/* Report that the level number of a line is LEVEL_DIFF higher than the
   level number of the previous line. */
static void error_level_too_high(int level_diff)
{
  gedcom_error(_("GEDCOM level number is %d higher than previous"), level_diff);
}
111
112 static void error_tag_too_long(const char *tag)
113 {
114   gedcom_error(_("Tag '%s' too long, max %d characters allowed"),
115                tag, MAXGEDCTAGLEN); 
116 }
117
/* Report text STR that cannot be converted from the input encoding,
   together with the hexadecimal value of the byte CH.
   CH is converted via unsigned char before it is printed: a plain char
   with the high bit set would otherwise be sign-extended on platforms
   where char is signed, and be printed as e.g. 0xffffffe9 instead of
   0xe9. */
static void error_invalid_character(const char *str, char ch)
{
  gedcom_error(_("Invalid character for encoding: '%s' (0x%02x)"),
               str, (unsigned int)(unsigned char)ch);
}
122
123 static void error_pointer_too_long(const char *ptr)
124 {
125   gedcom_error(_("Pointer '%s' too long, max %d characters allowed"),
126                ptr, MAXGEDCPTRLEN);
127 }
128
/* Report a single '@' in a value, where '@@' is required. */
static void error_at_character()
{
  gedcom_error(
    _("'@' character should be written as '@@' in values"));
}
133
/* Report unexpected text STR, together with the hexadecimal value of the
   byte CH.
   CH is converted via unsigned char before it is printed: a plain char
   with the high bit set would otherwise be sign-extended on platforms
   where char is signed, and be printed as e.g. 0xffffffe9 instead of
   0xe9. */
static void error_unexpected_character(const char* str, char ch)
{
  gedcom_error(_("Unexpected character: '%s' (0x%02x)"),
               str, (unsigned int)(unsigned char)ch);
}
138
/* When non-zero, TO_INTERNAL() returns the lexer text unchanged instead of
   calling to_internal(); meant for input that is already UTF-8 because it
   comes from the program itself. */
static int dummy_conv = 0;
142
143 #elif LEX_SECTION == 2
144
/* Yield STR converted to the internal encoding, using OUTBUF (which must be
   an array, because its size is taken with sizeof) as output buffer.
   The length of STR is taken from yyleng.  If dummy_conv is set, STR itself
   is the result and no conversion takes place. */
#define TO_INTERNAL(STR,OUTBUF) \
  (!dummy_conv ? to_internal((STR), yyleng, (OUTBUF), sizeof(OUTBUF)) : (STR))
147
/* Start counting the length of a new line; this also re-enables the length
   check after a "line too long" error. */
#define INIT_LINE_LEN line_len = 0;
150
/* Add the length of the current token to the running length of the line,
   and report an error (returning BADTOKEN from the lexer) as soon as the
   line is longer than MAXGEDCLINELEN * encoding_width.

   The token length is taken from yyleng and not from strlen(yytext): the
   limit is scaled by encoding_width, so it is the raw length of the token
   that has to be counted, and when encoding_width is larger than 1 the raw
   token may contain zero bytes at which strlen() would stop too early.

   Once the error has been reported, line_len is set to (size_t)-1, which
   disables the check until line_len is reset (see INIT_LINE_LEN). */
#define CHECK_LINE_LEN                                                        \
  { if (line_len != (size_t)-1) {                                             \
      line_len += (size_t)yyleng;                                             \
      if (line_len > MAXGEDCLINELEN * encoding_width) {                       \
        error_line_too_long();                                                \
        line_len = (size_t)-1;                                                \
        return BADTOKEN;                                                      \
      }                                                                       \
    }                                                                         \
  }
161
/* Lexer action for the standard tag THETAG: accounts for the line length,
   stores the token value TAG_<THETAG> and the converted tag text in
   gedcom_lval.tag, switches to the NORMAL start condition and returns
   TAG_<THETAG>. */
#define MKTAGACTION(THETAG)                                                  \
  {                                                                          \
    CHECK_LINE_LEN;                                                          \
    gedcom_lval.tag.value  = TAG_##THETAG;                                   \
    gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buf);                   \
    BEGIN(NORMAL);                                                           \
    return TAG_##THETAG;                                                     \
  }
169
170 /* The GEDCOM level number is converted into a sequence of opening
171    and closing brackets.  Simply put, the following GEDCOM fragment:
172    
173    0 HEAD
174    1 SOUR genes
175    2 VERS 1.6
176    2 NAME Genes
177    1 DATE 07 OCT 2001
178    ...
179    0 TRLR
180    
181    is converted into:
182    
183    { HEAD                     (initial)  
184    { SOUR genes               (1 higher: no closing brackets)
185    { VERS 1.6                 (1 higher: no closing brackets)
186    } { NAME Genes             (same level: 1 closing bracket)
187    } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
188    ...
189    } { TRLR }
190    
191    or more clearly:
192    
193    { HEAD
194      { SOUR genes
195        { VERS 1.6 }
196        { NAME Genes } }
197      { DATE 07 OCT 2001
198      ... }
199    { TRLR }
200
201    But because this means that one token is converted into a series
202    of tokens, there is some initial code following immediately here
203    that returns "pending" tokens. */
204
/* Return a pending bracket token, if there is one:
   - level_diff == 1: return OPEN, with the current level as its value;
   - level_diff  < 1: return one CLOSE.
   In both cases level_diff is incremented first.  If level_diff is larger
   than 1, no bracket is pending and nothing is done. */
#define ACTION_BEFORE_REGEXPS                                                 \
   { if (level_diff == 1) {                                                   \
       level_diff = 2;                                                        \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     if (level_diff < 1) {                                                    \
       level_diff += 1;                                                       \
       return CLOSE;                                                          \
     }                                                                        \
     /* level_diff > 1: out of brackets... */                                 \
   }
219
220
/* Whitespace at the start of a line: it counts for the line length, but
   produces no token. */
#define ACTION_INITIAL_WHITESPACE                                             \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    /* nothing else to do: the whitespace itself is ignored */                \
  }
225
226
/* Level number with a leading zero: report the error and return
   BADTOKEN. */
#define ACTION_0_DIGITS                                                       \
   {                                                                          \
     error_level_leading_zero();                                              \
     return BADTOKEN;                                                         \
   }
231
232
/* Level number at the start of a line.

   The number is converted with strtol() instead of atoi(): the behaviour of
   atoi() is undefined if the value cannot be represented, whereas strtol()
   returns LONG_MAX on overflow, which is rejected by the range check below.
   A failed conversion to the internal encoding (NULL result) is reported as
   an invalid character instead of being passed on to the number parsing.

   After the range check, level_diff is set to the difference with the level
   of the previous line, the lexer switches to the EXPECT_TAG start
   condition, and the first bracket token for this line is returned:
   - level_diff  < 1: CLOSE (further pending tokens are returned by
                      ACTION_BEFORE_REGEXPS on the next calls);
   - level_diff == 1: OPEN, with the new level as its value;
   - level_diff  > 1: error, levels may only increase one at a time. */
#define ACTION_DIGITS                                                         \
   { const char *level_str = TO_INTERNAL(yytext, str_buf);                    \
     long level;                                                              \
     CHECK_LINE_LEN;                                                          \
     if (!level_str) {                                                        \
       error_invalid_character(yytext, yytext[0]);                            \
       return BADTOKEN;                                                       \
     }                                                                        \
     level = strtol(level_str, NULL, 10);                                     \
     if ((level < 0) || (level > MAXGEDCLEVEL)) {                             \
       error_level_out_of_range();                                            \
       return BADTOKEN;                                                       \
     }                                                                        \
     level_diff = (int)level - current_level;                                 \
     BEGIN(EXPECT_TAG);                                                       \
     current_level = (int)level;                                              \
     if (level_diff < 1) {                                                    \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     else if (level_diff == 1) {                                              \
       level_diff++;                                                          \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     else {                                                                   \
       /* should never happen (error to GEDCOM spec) */                       \
       error_level_too_high(level_diff);                                      \
       return BADTOKEN;                                                       \
     }                                                                        \
   }
258
259
/* Alphanumeric text in tag position that is not handled as a standard tag:
   returned as USERTAG, with the converted text in gedcom_lval.tag.string.

   The tag length is checked against MAXGEDCTAGLEN * encoding_width using
   yyleng and not strlen(yytext): the limit is scaled by encoding_width, so
   it is the raw length of the token that has to be compared, and when
   encoding_width is larger than 1 the raw token may contain zero bytes at
   which strlen() would stop too early. */
#define ACTION_ALPHANUM                                                       \
   { if ((size_t)yyleng > MAXGEDCTAGLEN * encoding_width) {                   \
       error_tag_too_long(yytext);                                            \
       return BADTOKEN;                                                       \
     }                                                                        \
     CHECK_LINE_LEN;                                                          \
     gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buf);                   \
     gedcom_lval.tag.value  = USERTAG;                                        \
     BEGIN(NORMAL);                                                           \
     return USERTAG;                                                          \
   }
271
272
/* Delimiter: accounts for the line length and returns DELIM, with the
   converted text as token value. */
#define ACTION_DELIM                                                          \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
    return DELIM;                                                             \
  }
278
279
/* Any other character of a line value.
   The text is converted to the internal encoding; if that fails (NULL
   result), an error is reported and BADTOKEN is returned.  Otherwise the
   converted text becomes the token value, and ANYCHAR is returned unless the
   converted text is empty. */
#define ACTION_ANY                                                            \
  { char* converted;                                                          \
    CHECK_LINE_LEN;                                                           \
    converted = TO_INTERNAL(yytext, str_buf);                                 \
    if (!converted) {                                                         \
      /* Something went wrong during conversion... */                         \
      error_invalid_character(yytext, yytext[0]);                             \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = converted;                                           \
    /* Due to character conversions, the current character may still have     \
       to be combined with the next one, in which case the conversion has     \
       not produced anything yet and no token is returned.  In principle      \
       this only applies to the 1byte case (e.g. ANSEL), but it doesn't       \
       harm the unicode case. */                                              \
    if (converted[0] != '\0')                                                 \
      return ANYCHAR;                                                         \
  }
301
302
/* Escape sequence: accounts for the line length and returns ESCAPE, with
   the converted text as token value. */
#define ACTION_ESCAPE                                                         \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
    return ESCAPE;                                                            \
  }
308
309
/* Pointer: accounts for the line length and returns POINTER, with the
   converted text (in ptr_buf) as token value.

   The pointer length is checked against MAXGEDCPTRLEN * encoding_width
   using yyleng and not strlen(yytext): the limit is scaled by
   encoding_width, so it is the raw length of the token that has to be
   compared, and when encoding_width is larger than 1 the raw token may
   contain zero bytes at which strlen() would stop too early. */
#define ACTION_POINTER                                                        \
  { CHECK_LINE_LEN;                                                           \
    if ((size_t)yyleng > MAXGEDCPTRLEN * encoding_width) {                    \
      error_pointer_too_long(yytext);                                         \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = TO_INTERNAL(yytext, ptr_buf);                        \
    return POINTER;                                                           \
  }
319
320
321 /* Due to the conversion of level numbers into brackets, the
322    terminator is not important, so no token is returned here.
323    Although not strictly according to the GEDCOM spec, we'll ignore
324    whitespace just before the terminator.
325 */
326
/* End of a line: checks the line length one last time, then resets the
   length counter, increments the line number and returns to the INITIAL
   start condition.

   The length check is written out here instead of using CHECK_LINE_LEN,
   because that macro returns from the lexer immediately when the limit is
   exceeded.  If it is the terminator itself that makes the line too long,
   this would skip the reset below: the length counter would keep its
   "error reported" marker (disabling the check for all following lines),
   the line number would not be incremented and the lexer would stay in the
   start condition of the previous line.  Therefore the error is reported
   first (while line_no still refers to the offending line), the per-line
   state is reset, and only then BADTOKEN is returned.

   The token length is taken from yyleng, which is the raw length of the
   token also if it contains zero bytes. */
#define ACTION_TERMINATOR                                                     \
  { int too_long = 0;                                                         \
    if (line_len != (size_t)-1) {                                             \
      line_len += (size_t)yyleng;                                             \
      if (line_len > MAXGEDCLINELEN * encoding_width) {                       \
        error_line_too_long();                                                \
        too_long = 1;                                                         \
      }                                                                       \
    }                                                                         \
    INIT_LINE_LEN;                                                            \
    line_no++;                                                                \
    BEGIN(INITIAL);                                                           \
    if (too_long)                                                             \
      return BADTOKEN;                                                        \
  }
333
334
335 /* Eventually we have to return 1 closing bracket (for the trailer).
336    We can detect whether we have sent the closing bracket using the
337    level_diff (at eof, first it is 2, then we increment it ourselves)
338 */
339
/* End of input: the first time (level_diff == 2) one final CLOSE is
   returned and level_diff is incremented, so that the next time the lexer
   is terminated. */
#define ACTION_EOF                                                            \
  { if (level_diff != 2) {                                                    \
      char* ptr;                                                              \
      int size;                                                               \
      /* ... terminate lex */                                                 \
      yyterminate();                                                          \
      /* yyterminate does return(), so this call is never executed; it is     \
         only here to silence a compiler warning about the lex generated      \
         code */                                                              \
      yy_flex_realloc(ptr, size);                                             \
    }                                                                         \
    else {                                                                    \
      level_diff++;                                                           \
      return CLOSE;                                                           \
    }                                                                         \
  }
354
/* A single '@' in a line value.
   Without compatibility mode (compat_at == 0) this is an error.  In
   compatibility mode, the matched text is pushed back to the input twice
   (presumably so that it is scanned again as an escaped '@@' -- confirm
   against the lexer rules).  The text is pushed back from a copy, because
   unput() does not preserve yytext. */
#define ACTION_NORMAL_AT                                                      \
  { if (!compat_at) {                                                         \
      error_at_character();                                                   \
      return BADTOKEN;                                                        \
    }                                                                         \
    else {                                                                    \
      char *saved = strdup(yytext);                                           \
      if (!saved) {                                                           \
        MEMORY_ERROR;                                                         \
      }                                                                       \
      else {                                                                  \
        int pass, pos;                                                        \
        for (pass = 0; pass < 2; pass++) {                                    \
          /* unput works backwards: last character first */                   \
          for (pos = yyleng - 1; pos >= 0; --pos) {                           \
            unput(saved[pos]);                                                \
          }                                                                   \
        }                                                                     \
        free(saved);                                                          \
      }                                                                       \
    }                                                                         \
  }
374
/* Text that is not expected at this position: report it, together with its
   first byte, and return BADTOKEN. */
#define ACTION_UNEXPECTED                                                     \
  {                                                                           \
    error_unexpected_character(yytext, yytext[0]);                            \
    return BADTOKEN;                                                          \
  }
379
380 #elif LEX_SECTION == 3
381
/* Called by the lex generated scanner at end of input.  Always returns 1,
   which tells the scanner that there is no further input to continue
   with. */
int yywrap()
{
  const int no_more_input = 1;

  return no_more_input;
}
386
387 static void yylex_cleanup()
388 {
389   /* fix memory leak in lex */
390   yy_delete_buffer(yy_current_buffer);
391   yy_current_buffer = NULL;
392 }
393
394 static int exitfuncregistered = 0;
395
396 void yymyinit(FILE *f)
397 {
398   if (! exitfuncregistered && atexit(yylex_cleanup) == 0)
399     exitfuncregistered = 1;
400   yyin = f;
401   yyrestart(f);
402   /* Reset our state */
403   current_level = -1;
404   level_diff = MAXGEDCLEVEL;
405   BEGIN(INITIAL);
406 }
407
408 #endif