Removed support for testing with dmalloc (valgrind is better...).
[gedcom-parse.git] / gedcom / gedcom_lex_common.c
1 /* Common lexer code.
2    Copyright (C) 2001, 2002 The Genes Development Team
3    This file is part of the Gedcom parser library.
4    Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
5
6    The Gedcom parser library is free software; you can redistribute it
7    and/or modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10
11    The Gedcom parser library is distributed in the hope that it will be
12    useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15
16    You should have received a copy of the GNU Lesser General Public
17    License along with the Gedcom parser library; if not, write to the
18    Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20
21 /* $Id$ */
22 /* $Name$ */
23
24 #if LEX_SECTION == 1
25
26 #include "gedcom_internal.h"
27 #include "multilex.h"
28 #include "encoding.h"
29 #include "gedcom.h"
30 #include "gedcom.tabgen.h"
31 #include "compat.h"
32
33 static size_t encoding_width;
34 static int current_level = -1;
35 static int level_diff=MAXGEDCLEVEL;
36 static size_t line_len = 0;
37
38 static char ptr_buf[MAXGEDCPTRLEN * UTF_FACTOR + 1];
39 static char tag_buf[MAXGEDCTAGLEN * UTF_FACTOR + 1];
40 static char str_buf[MAXGEDCLINELEN * UTF_FACTOR + 1];
41
42 #ifdef LEXER_TEST 
43 YYSTYPE gedcom_lval;
44 int line_no = 1;
45 int compat_at = 0;
46
47 int gedcom_lex();
48
49 void message_handler(Gedcom_msg_type type, char *msg)
50 {
51   fprintf(stderr, "(%d) %s\n", type, msg);
52 }
53
54 int test_loop(ENCODING enc, const char* code)
55 {
56   int tok, res;
57   init_encodings();
58   set_encoding_width(enc);
59   gedcom_set_message_handler(message_handler);
60   res = open_conv_to_internal(code);
61   if (!res) {
62     gedcom_error("Unable to open conversion context: %s",
63                  strerror(errno));
64     return 1;
65   }
66   tok = gedcom_lex();
67   while (tok) {
68     switch(tok) {
69       case BADTOKEN: printf("BADTOKEN "); break;
70       case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
71       case CLOSE: printf("CLOSE "); break;
72       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
73       case DELIM: printf("DELIM "); break;
74       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
75       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
76       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.tag.string); break;
77       default: printf("TAG(%s) ", gedcom_lval.tag.string); break;
78     }
79     tok = gedcom_lex();
80   }
81   printf("\n");
82   close_conv_to_internal();
83   return 0;  
84 }
85  
86 #endif /* of #ifdef LEXER_TEST */
87
/* These are defined as functions here, because xgettext has trouble
   extracting the strings out of long pre-processor defines */
90
91 static void error_line_too_long()
92 {
93   gedcom_error(_("Line too long, max %d characters allowed"), MAXGEDCLINELEN); 
94 }
95
/* Reports a level number that was written with a leading zero. */
static void error_level_leading_zero()
{
  gedcom_error(_("Level number with leading zero not allowed"));
}
100
101 static void error_level_out_of_range()
102 {
103   gedcom_error (_("Level number out of range [0..%d]"), MAXGEDCLEVEL); 
104 }
105
/* Reports a level number that jumps up by more than one step.
   level_diff is the amount by which the new level exceeds the previous
   one; it is printed in the message. */
static void error_level_too_high(int level_diff)
{
  gedcom_error(_("GEDCOM level number is %d higher than previous"), level_diff);
}
111
112 static void error_tag_too_long(const char *tag)
113 {
114   gedcom_error(_("Tag '%s' too long, max %d characters allowed"),
115                tag, MAXGEDCTAGLEN); 
116 }
117
/* Reports text that could not be converted from the input encoding.
   str is the offending text (quoted in the message); ch is the byte whose
   value is shown in hexadecimal (callers pass the first byte of str). */
static void error_invalid_character(const char *str, char ch)
{
  /* Go through unsigned char: where plain char is signed, a byte >= 0x80
     would otherwise be sign-extended and show up as 0xffffffXX. */
  gedcom_error(_("Invalid character for encoding: '%s' (0x%02x)"),
               str, (unsigned char)ch);
}
122
123 static void error_pointer_too_long(const char *ptr)
124 {
125   gedcom_error(_("Pointer '%s' too long, max %d characters allowed"),
126                ptr, MAXGEDCPTRLEN);
127 }
128
/* Reports a single '@' inside a value where '@@' is required. */
static void error_at_character()
{
  gedcom_error(
    _("'@' character should be written as '@@' in values"));
}
133
/* Reports text that no lexer rule expected at this point.
   str is the offending text (quoted in the message); ch is the byte whose
   value is shown in hexadecimal (callers pass the first byte of str). */
static void error_unexpected_character(const char* str, char ch)
{
  /* Go through unsigned char: where plain char is signed, a byte >= 0x80
     would otherwise be sign-extended and show up as 0xffffffXX. */
  gedcom_error(_("Unexpected character: '%s' (0x%02x)"),
               str, (unsigned char)ch);
}
138
/* Switch to bypass the iconv conversion: when non-zero, TO_INTERNAL hands
   back its input unchanged instead of calling to_internal().  Intended for
   input that is already UTF-8 because it comes from the program. */
static int dummy_conv = 0;
142
143 #elif LEX_SECTION == 2
144
/* Yields the token text STR in the internal encoding.

   If dummy_conv is set, STR itself is the result; otherwise STR (yyleng
   bytes long) is passed to to_internal() with OUTBUF as output buffer.
   OUTBUF has to be a real array, not a pointer, because its size is taken
   with sizeof.  The result may be a null pointer (ACTION_ANY checks for
   that).  The arguments are parenthesized so that any expression can be
   passed safely. */
#define TO_INTERNAL(STR,OUTBUF) \
  (dummy_conv ? (STR) : to_internal((STR), yyleng, (OUTBUF), sizeof(OUTBUF)))
147
/* Starts a fresh line: clears the accumulated length (and with it the
   "already reported" marker (size_t)-1).  The expansion carries its own
   terminating semicolon. */
#define INIT_LINE_LEN line_len = 0;
150
/* Adds the length of the current token to the length of the current line
   and makes the enclosing lexer function return BADTOKEN (after reporting
   the error) when the line becomes longer than
   MAXGEDCLINELEN * encoding_width.

   Once the error has been reported, line_len is set to (size_t)-1 and the
   check is skipped until line_len is reset, so the error is given only
   once per line.

   The token length is taken from yyleng instead of strlen(yytext): the
   limit is scaled by encoding_width, so bytes are being counted, and
   strlen() stops at the first NUL byte, which undercounts input in wide
   encodings. */
#define CHECK_LINE_LEN                                                        \
  { if (line_len != (size_t)-1) {                                             \
      line_len += (size_t)yyleng;                                             \
      if (line_len > MAXGEDCLINELEN * encoding_width) {                       \
        error_line_too_long();                                                \
        line_len = (size_t)-1;                                                \
        return BADTOKEN;                                                      \
      }                                                                       \
    }                                                                         \
  }
161
/* Action for a standard tag.  THETAG is the tag name without the TAG_
   prefix.  After the line length check, the tag value and its converted
   text are stored in gedcom_lval.tag, the line counter is advanced, the
   lexer switches to the NORMAL state and TAG_<THETAG> is returned. */
#define MKTAGACTION(THETAG)                                                  \
  { CHECK_LINE_LEN;                                                          \
    gedcom_lval.tag.value  = TAG_##THETAG;                                   \
    gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buf);                   \
    line_no++;                                                               \
    BEGIN(NORMAL);                                                           \
    return TAG_##THETAG;                                                     \
  }
170
171 /* The GEDCOM level number is converted into a sequence of opening
172    and closing brackets.  Simply put, the following GEDCOM fragment:
173    
174    0 HEAD
175    1 SOUR genes
176    2 VERS 1.6
177    2 NAME Genes
178    1 DATE 07 OCT 2001
179    ...
180    0 TRLR
181    
182    is converted into:
183    
184    { HEAD                     (initial)  
185    { SOUR genes               (1 higher: no closing brackets)
186    { VERS 1.6                 (1 higher: no closing brackets)
187    } { NAME Genes             (same level: 1 closing bracket)
188    } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
189    ...
190    } { TRLR }
191    
192    or more clearly:
193    
194    { HEAD
195      { SOUR genes
196        { VERS 1.6 }
197        { NAME Genes } }
198      { DATE 07 OCT 2001
199      ... }
200    { TRLR }
201
202    But because this means that one token is converted into a series
203    of tokens, there is some initial code following immediately here
204    that returns "pending" tokens. */
205
#define ACTION_BEFORE_REGEXPS                                                 \
   { /* closing brackets are still owed while level_diff is below 1 */        \
     if (level_diff < 1) {                                                    \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     /* exactly 1: the opening bracket of the new level is due */             \
     if (level_diff == 1) {                                                   \
       level_diff++;                                                          \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     /* above 1: no brackets pending, continue with the regexps */            \
   }
220
221
/* Whitespace at the start of a line: it counts towards the line length,
   but no token is produced for it. */
#define ACTION_INITIAL_WHITESPACE                                             \
  { CHECK_LINE_LEN;                                                           \
  }
226
227
/* Level number with a leading zero: report it and return BADTOKEN. */
#define ACTION_0_DIGITS                                                       \
   {                                                                          \
     error_level_leading_zero();                                              \
     return BADTOKEN;                                                         \
   }
232
233
/* Action for a level number.

   The digits are converted to the internal encoding and parsed.  A level
   outside 0..MAXGEDCLEVEL, or more than one higher than the previous
   level, is reported and BADTOKEN is returned.  Otherwise the lexer
   switches to EXPECT_TAG and returns the first bracket token for the new
   level: CLOSE if the level is not higher than the previous one, OPEN if
   it is exactly one higher (remaining brackets are handed out by
   ACTION_BEFORE_REGEXPS).

   strtol() is used instead of atoi(): atoi() has undefined behaviour on
   overflow and can turn a huge number into an apparently valid level,
   whereas strtol() saturates at LONG_MAX, which the range check rejects.
   A failed conversion (null pointer) is reported instead of being parsed. */
#define ACTION_DIGITS                                                         \
   { const char *level_str = TO_INTERNAL(yytext, str_buf);                    \
     long level_val;                                                          \
     int level;                                                               \
     CHECK_LINE_LEN;                                                          \
     if (!level_str) {                                                        \
       /* Something went wrong during conversion... */                        \
       error_invalid_character(yytext, yytext[0]);                            \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
     level_val = strtol(level_str, NULL, 10);                                 \
     if ((level_val < 0) || (level_val > MAXGEDCLEVEL)) {                     \
       error_level_out_of_range();                                            \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
     level = (int)level_val;                                                  \
     level_diff = level - current_level;                                      \
     BEGIN(EXPECT_TAG);                                                       \
     current_level = level;                                                   \
     if (level_diff < 1) {                                                    \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     else if (level_diff == 1) {                                              \
       level_diff++;                                                          \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     else {                                                                   \
       /* should never happen (error to GEDCOM spec) */                       \
       error_level_too_high(level_diff);                                      \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
   }
261
262
/* Action for a tag that is not one of the standard tags.

   A tag longer than MAXGEDCTAGLEN * encoding_width is reported and
   BADTOKEN is returned.  Otherwise, after the line length check, the
   converted tag text and the value USERTAG are stored in gedcom_lval.tag,
   the lexer switches to NORMAL and USERTAG is returned.  The line counter
   is advanced in both cases.

   The tag length is taken from yyleng instead of strlen(yytext): the
   limit is scaled by encoding_width, so bytes are being counted, and
   strlen() stops at the first NUL byte of wide-encoding input. */
#define ACTION_ALPHANUM                                                       \
   { if ((size_t)yyleng > MAXGEDCTAGLEN * encoding_width) {                   \
       error_tag_too_long(yytext);                                            \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
     CHECK_LINE_LEN;                                                          \
     gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buf);                   \
     gedcom_lval.tag.value  = USERTAG;                                        \
     BEGIN(NORMAL);                                                           \
     line_no++;                                                               \
     return USERTAG;                                                          \
   }
276
277
/* Delimiter: after the line length check, its converted text is stored in
   gedcom_lval.string and DELIM is returned. */
#define ACTION_DELIM                                                          \
  { char *converted;                                                          \
    CHECK_LINE_LEN;                                                           \
    converted = TO_INTERNAL(yytext, str_buf);                                 \
    gedcom_lval.string = converted;                                           \
    return DELIM;                                                             \
  }
283
284
/* Any other character of a value.  If the conversion fails, the error is
   reported and BADTOKEN is returned.  Otherwise the converted text is
   stored in gedcom_lval.string and ANYCHAR is returned, unless the
   converted text is empty, in which case no token is returned. */
#define ACTION_ANY                                                            \
  { char* converted;                                                          \
    CHECK_LINE_LEN;                                                           \
    converted = TO_INTERNAL(yytext, str_buf);                                 \
    if (!converted) {                                                         \
      /* Something went wrong during conversion... */                         \
      error_invalid_character(yytext, yytext[0]);                             \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = converted;                                           \
    /* Due to character conversions, it is possible that the current          \
       character will be combined with the next, and so now we don't have a   \
       character yet...                                                       \
       In principle, this is only applicable to the 1byte case (e.g. ANSEL),  \
       but it doesn't harm the unicode case.                                  \
    */                                                                        \
    if (gedcom_lval.string[0] != '\0')                                        \
      return ANYCHAR;                                                         \
  }
306
307
/* Escape sequence: after the line length check, its converted text is
   stored in gedcom_lval.string and ESCAPE is returned. */
#define ACTION_ESCAPE                                                         \
  { char *converted;                                                          \
    CHECK_LINE_LEN;                                                           \
    converted = TO_INTERNAL(yytext, str_buf);                                 \
    gedcom_lval.string = converted;                                           \
    return ESCAPE;                                                            \
  }
313
314
/* Pointer: after the line length check, a pointer longer than
   MAXGEDCPTRLEN * encoding_width is reported and BADTOKEN is returned.
   Otherwise its converted text is stored in gedcom_lval.string and
   POINTER is returned.

   The pointer length is taken from yyleng instead of strlen(yytext): the
   limit is scaled by encoding_width, so bytes are being counted, and
   strlen() stops at the first NUL byte of wide-encoding input. */
#define ACTION_POINTER                                                        \
  { CHECK_LINE_LEN;                                                           \
    if ((size_t)yyleng > MAXGEDCPTRLEN * encoding_width) {                    \
      error_pointer_too_long(yytext);                                         \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = TO_INTERNAL(yytext, ptr_buf);                        \
    return POINTER;                                                           \
  }
324
325
326 /* Due to the conversion of level numbers into brackets, the
327    terminator is not important, so no token is returned here.
328    Although not strictly according to the GEDCOM spec, we'll ignore
329    whitespace just before the terminator.
330 */
331
#define ACTION_TERMINATOR                                                     \
  { /* the terminator itself still counts towards the line length */          \
    CHECK_LINE_LEN;                                                           \
    /* next line starts from zero, in the INITIAL state */                    \
    INIT_LINE_LEN;                                                            \
    BEGIN(INITIAL);                                                           \
  }
337
338
339 /* Eventually we have to return 1 closing bracket (for the trailer).
340    We can detect whether we have sent the closing bracket using the
341    level_diff (at eof, first it is 2, then we increment it ourselves)
342 */
343
#define ACTION_EOF                                                            \
  { if (level_diff == 2) {                                                    \
      /* final closing bracket not sent yet */                                \
      level_diff++;                                                           \
      return CLOSE;                                                           \
    }                                                                         \
    else {                                                                    \
      /* Initialized so that no indeterminate value is ever passed on */      \
      char* ptr = NULL; int size = 0;                                         \
      /* ... terminate lex */                                                 \
      yyterminate();                                                          \
      /* Never reached, since yyterminate() does a return.  The call is */    \
      /* only here to get rid of a compiler warning from the lex        */    \
      /* generated code.                                                */    \
      yy_flex_realloc(ptr, size);                                             \
    }                                                                         \
  }
358
/* A single '@' inside a value.  Without compat_at this is an error and
   BADTOKEN is returned.  With compat_at the matched text is pushed back
   onto the input twice, so that the lexer reads it again doubled; no
   token is returned from this action.  The text is copied first and the
   copy is pushed back, last character first. */
#define ACTION_NORMAL_AT                                                      \
  { if (!compat_at) {                                                         \
      error_at_character();                                                   \
      return BADTOKEN;                                                        \
    }                                                                         \
    else {                                                                    \
      char *saved = strdup(yytext);                                           \
      if (!saved) {                                                           \
        MEMORY_ERROR;                                                         \
      }                                                                       \
      else {                                                                  \
        int round, pos;                                                       \
        for (round = 0; round < 2; round++)                                   \
          for (pos = yyleng - 1; pos >= 0; --pos)                             \
            unput(saved[pos]);                                                \
        free(saved);                                                          \
      }                                                                       \
    }                                                                         \
  }
378
/* Text that fits no other rule: report it, together with the value of its
   first byte, and return BADTOKEN. */
#define ACTION_UNEXPECTED                                                     \
  {                                                                           \
    error_unexpected_character(yytext, yytext[0]);                            \
    return BADTOKEN;                                                          \
  }
383
384 #elif LEX_SECTION == 3
385
/* End-of-input hook of the scanner: always returns 1, i.e. there is never
   a further input to continue with. */
int yywrap()
{
  const int no_more_input = 1;
  return no_more_input;
}
390
391 static void yylex_cleanup()
392 {
393   /* fix memory leak in lex */
394   yy_delete_buffer(yy_current_buffer);
395   yy_current_buffer = NULL;
396 }
397
398 static int exitfuncregistered = 0;
399
400 void yymyinit(FILE *f)
401 {
402   if (! exitfuncregistered && atexit(yylex_cleanup) == 0)
403     exitfuncregistered = 1;
404   yyin = f;
405   yyrestart(f);
406   /* Reset our state */
407   current_level = -1;
408   level_diff = MAXGEDCLEVEL;
409   BEGIN(INITIAL);
410 }
411
412 #endif