More thorough error checking on library calls.
[gedcom-parse.git] / gedcom / gedcom_lex_common.c
1 /* Common lexer code.
2    Copyright (C) 2001, 2002 The Genes Development Team
3    This file is part of the Gedcom parser library.
4    Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
5
6    The Gedcom parser library is free software; you can redistribute it
7    and/or modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10
11    The Gedcom parser library is distributed in the hope that it will be
12    useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15
16    You should have received a copy of the GNU Lesser General Public
17    License along with the Gedcom parser library; if not, write to the
18    Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20
21 /* $Id$ */
22 /* $Name$ */
23
24 #ifndef IN_LEX
25
26 #include "gedcom_internal.h"
27 #include "multilex.h"
28 #include "encoding.h"
29 #include "gedcom.h"
30 #include "gedcom.tabgen.h"
31 #include "compat.h"
32
33 static size_t encoding_width;
34 static int current_level = -1;
35 static int level_diff=MAXGEDCLEVEL;
36 static size_t line_len = 0;
37
38 static char ptr_buf[MAXGEDCPTRLEN * UTF_FACTOR + 1];
39 static char tag_buf[MAXGEDCTAGLEN * UTF_FACTOR + 1];
40 static char str_buf[MAXGEDCLINELEN * UTF_FACTOR + 1];
41
42 #ifdef LEXER_TEST 
43 YYSTYPE gedcom_lval;
44 int line_no = 1;
45 int compat_at = 0;
46
47 int gedcom_lex();
48
49 void message_handler(Gedcom_msg_type type, char *msg)
50 {
51   fprintf(stderr, "(%d) %s\n", type, msg);
52 }
53
54 int test_loop(ENCODING enc, char* code)
55 {
56   int tok, res;
57   init_encodings();
58   set_encoding_width(enc);
59   gedcom_set_message_handler(message_handler);
60   res = open_conv_to_internal(code);
61   if (!res) {
62     gedcom_error("Unable to open conversion context: %s",
63                  strerror(errno));
64     return 1;
65   }
66   tok = gedcom_lex();
67   while (tok) {
68     switch(tok) {
69       case BADTOKEN: printf("BADTOKEN "); break;
70       case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
71       case CLOSE: printf("CLOSE "); break;
72       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
73       case DELIM: printf("DELIM "); break;
74       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
75       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
76       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.tag.string); break;
77       default: printf("TAG(%s) ", gedcom_lval.tag.string); break;
78     }
79     tok = gedcom_lex();
80   }
81   printf("\n");
82   close_conv_to_internal();
83   return 0;  
84 }
85  
86 #endif /* of #ifdef LEXER_TEST */
87
88 #else  /* of #ifndef IN_LEX */
89
/* Convert STR (yyleng bytes long) through to_internal(), writing into
   OUTBUF.  OUTBUF must be an array, not a pointer, because its capacity is
   taken with sizeof. */
#define TO_INTERNAL(STR,OUTBUF) \
  to_internal((STR), yyleng, (OUTBUF), sizeof(OUTBUF))
92
/* Restart the running line length for a new line.  Note that the
   expansion already ends in a semicolon. */
#define INIT_LINE_LEN line_len = 0;
95
/* Add the length of the current token to the running length of the line.
   When the line exceeds MAXGEDCLINELEN * encoding_width, report it once,
   mark the line as "already reported" by setting line_len to (size_t)-1
   (which disables further checks on this line) and return BADTOKEN from
   the enclosing lexer function.

   The token length is taken from yyleng, not strlen(yytext): the limit is
   scaled by encoding_width, so the token may be in an encoding with more
   than one byte per character, where it can contain zero bytes that would
   cut strlen() short. */
#define CHECK_LINE_LEN                                                        \
  { if (line_len != (size_t)-1) {                                             \
      line_len += (size_t)yyleng;                                             \
      if (line_len > MAXGEDCLINELEN * encoding_width) {                       \
        gedcom_error(_("Line too long, max %d characters allowed"),           \
                     MAXGEDCLINELEN);                                         \
        line_len = (size_t)-1;                                                \
        return BADTOKEN;                                                      \
      }                                                                       \
    }                                                                         \
  }
107
/* Action for a recognised standard tag THETAG: count the token towards the
   line length, store the converted tag text and the TAG_<THETAG> value in
   gedcom_lval.tag, switch to the NORMAL start condition and return the
   TAG_<THETAG> token. */
#define MKTAGACTION(THETAG)                                                  \
  { CHECK_LINE_LEN;                                                          \
    gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buf);                   \
    BEGIN(NORMAL);                                                           \
    gedcom_lval.tag.value = TAG_##THETAG;                                    \
    return TAG_##THETAG;                                                     \
  }
115
116 /* The GEDCOM level number is converted into a sequence of opening
117    and closing brackets.  Simply put, the following GEDCOM fragment:
118    
119    0 HEAD
120    1 SOUR genes
121    2 VERS 1.6
122    2 NAME Genes
123    1 DATE 07 OCT 2001
124    ...
125    0 TRLR
126    
127    is converted into:
128    
129    { HEAD                     (initial)  
130    { SOUR genes               (1 higher: no closing brackets)
131    { VERS 1.6                 (1 higher: no closing brackets)
132    } { NAME Genes             (same level: 1 closing bracket)
133    } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
134    ...
135    } { TRLR }
136    
137    or more clearly:
138    
139    { HEAD
140      { SOUR genes
141        { VERS 1.6 }
142        { NAME Genes } }
143      { DATE 07 OCT 2001
144      ... }
145    { TRLR }
146
147    But because this means that one token is converted into a series
148    of tokens, there is some initial code following immediately here
149    that returns "pending" tokens. */
150
/* Runs before any pattern is matched and hands out the brackets that are
   still pending from the last level number: a CLOSE for as long as
   level_diff is below 1, then a single OPEN (carrying the current level)
   when it is exactly 1.  Each returned token advances level_diff by one;
   once it is above 1 nothing is pending and scanning continues. */
#define ACTION_BEFORE_REGEXPS                                                 \
   { if (level_diff <= 1) {                                                   \
       if (level_diff++ < 1)                                                  \
         return CLOSE;                                                        \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     /* otherwise: out of brackets... */                                      \
   }
165
166
/* Whitespace at the start of a line: it counts towards the line length
   but produces no token. */
#define ACTION_INITIAL_WHITESPACE                                             \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    /* nothing else to do: the whitespace itself is dropped */                \
  }
171
172
/* A level number written with a leading zero: report it and return
   BADTOKEN. */
#define ACTION_0_DIGITS                                                       \
   {                                                                          \
     gedcom_error (_("Level number with leading zero not allowed"));          \
     return BADTOKEN;                                                         \
   }
177
178
/* A level number was read.  Convert it, check its range, switch to the
   EXPECT_TAG start condition and translate the change in level into the
   first CLOSE or OPEN token; further pending brackets are handed out by
   the same level_diff logic in ACTION_BEFORE_REGEXPS.

   The conversion result is checked before it is used, because
   TO_INTERNAL can yield NULL (see ACTION_ANY).  strtol() replaces atoi()
   so that an over-long digit string ends up in the out-of-range error
   instead of undefined behaviour: on overflow strtol() returns LONG_MAX,
   which the range check rejects. */
#define ACTION_DIGITS                                                         \
   { long level;                                                              \
     char *level_str = TO_INTERNAL(yytext, str_buf);                          \
     CHECK_LINE_LEN;                                                          \
     if (!level_str) {                                                        \
       gedcom_error(_("Invalid character for encoding: '%s' (0x%02x)"),       \
                    yytext, (unsigned int)(unsigned char)yytext[0]);          \
       return BADTOKEN;                                                       \
     }                                                                        \
     level = strtol(level_str, NULL, 10);                                     \
     if ((level < 0) || (level > MAXGEDCLEVEL)) {                             \
       gedcom_error (_("Level number out of range [0..%d]"),                  \
                     MAXGEDCLEVEL);                                           \
       return BADTOKEN;                                                       \
     }                                                                        \
     level_diff = (int)level - current_level;                                 \
     BEGIN(EXPECT_TAG);                                                       \
     current_level = (int)level;                                              \
     if (level_diff < 1) {                                                    \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     else if (level_diff == 1) {                                              \
       level_diff++;                                                          \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     else {                                                                   \
       /* should never happen (error to GEDCOM spec) */                       \
       gedcom_error (_("GEDCOM level number is %d higher than previous"),     \
                     level_diff);                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
   }
206
207
/* An alphanumeric word that is not one of the predefined tags: it is
   returned as a USERTAG, with the converted text in gedcom_lval.tag, after
   switching to the NORMAL start condition.  A tag longer than
   MAXGEDCTAGLEN * encoding_width is rejected with BADTOKEN.

   The length is taken from yyleng instead of strlen(yytext): the limit is
   scaled by encoding_width, so the token may be in an encoding with more
   than one byte per character, where it can contain zero bytes that would
   make strlen() stop early and let over-long tags through. */
#define ACTION_ALPHANUM                                                       \
   { if ((size_t)yyleng > MAXGEDCTAGLEN * encoding_width) {                   \
       gedcom_error(_("Tag '%s' too long, max %d characters allowed"),        \
                    yytext, MAXGEDCTAGLEN);                                   \
       return BADTOKEN;                                                       \
     }                                                                        \
     CHECK_LINE_LEN;                                                          \
     gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buf);                   \
     gedcom_lval.tag.value  = USERTAG;                                        \
     BEGIN(NORMAL);                                                           \
     return USERTAG;                                                          \
   }
220
221
/* A delimiter: count it towards the line length, store its converted text
   in gedcom_lval.string and return DELIM. */
#define ACTION_DELIM                                                          \
  { char *converted;                                                          \
    CHECK_LINE_LEN;                                                           \
    converted = TO_INTERNAL(yytext, str_buf);                                 \
    gedcom_lval.string = converted;                                           \
    return DELIM;                                                             \
  }
227
228
/* Any other character in a value: convert it and return it as ANYCHAR.
   A failed conversion (NULL result) is reported and yields BADTOKEN.  When
   the conversion succeeds but produces an empty string, no token is
   returned and scanning simply continues.

   The byte shown in the error message is converted through unsigned char:
   as a plain char, a byte >= 0x80 would be sign-extended where char is
   signed and printed as e.g. 0xffffffe9 instead of 0xe9. */
#define ACTION_ANY                                                            \
  { char* tmp;                                                                \
    CHECK_LINE_LEN;                                                           \
    tmp = TO_INTERNAL(yytext, str_buf);                                       \
    if (!tmp) {                                                               \
      /* Something went wrong during conversion... */                         \
      gedcom_error(_("Invalid character for encoding: '%s' (0x%02x)"),        \
                   yytext, (unsigned int)(unsigned char)yytext[0]);           \
      return BADTOKEN;                                                        \
    }                                                                         \
    else {                                                                    \
      gedcom_lval.string = tmp;                                               \
      /* Due to character conversions, it is possible that the current        \
         character will be combined with the next, and so now we don't have a \
         character yet...                                                     \
         In principle, this is only applicable to the 1byte case (e.g. ANSEL),\
         but it doesn't harm the unicode case.                                \
      */                                                                      \
      if (strlen(gedcom_lval.string) > 0)                                     \
        return ANYCHAR;                                                       \
    }                                                                         \
  }
251
252
/* An escape sequence: count it towards the line length, store its
   converted text in gedcom_lval.string and return ESCAPE. */
#define ACTION_ESCAPE                                                         \
  { char *converted;                                                          \
    CHECK_LINE_LEN;                                                           \
    converted = TO_INTERNAL(yytext, str_buf);                                 \
    gedcom_lval.string = converted;                                           \
    return ESCAPE;                                                            \
  }
258
259
/* A pointer (cross-reference): count it towards the line length, reject it
   with BADTOKEN when it is longer than MAXGEDCPTRLEN * encoding_width, and
   otherwise return POINTER with the converted text in gedcom_lval.string.

   The length is taken from yyleng instead of strlen(yytext): the limit is
   scaled by encoding_width, so the token may be in an encoding with more
   than one byte per character, where it can contain zero bytes that would
   make strlen() stop early and let over-long pointers through. */
#define ACTION_POINTER                                                        \
  { CHECK_LINE_LEN;                                                           \
    if ((size_t)yyleng > MAXGEDCPTRLEN * encoding_width) {                    \
      gedcom_error(_("Pointer '%s' too long, max %d characters allowed"),     \
                   yytext, MAXGEDCPTRLEN);                                    \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = TO_INTERNAL(yytext, ptr_buf);                        \
    return POINTER;                                                           \
  }
270
271
272 /* Due to the conversion of level numbers into brackets, the
273    terminator is not important, so no token is returned here.
274    Although not strictly according to the GEDCOM spec, we'll ignore
275    whitespace just before the terminator.
276 */
277
/* End of line: count the terminator towards the line length, then always
   reset the per-line state (length counter, line number, start condition).

   The length check is spelled out here instead of going through
   CHECK_LINE_LEN, because that macro returns BADTOKEN immediately.  If the
   terminator itself pushed the line over the limit, the reset below would
   be skipped: line_len would stay at (size_t)-1, line_no would not
   advance and the lexer would not be back in the INITIAL start condition
   for the next line.  The error is reported before line_no is
   incremented, so it still refers to the offending line.  The token
   length is taken from yyleng so that zero bytes inside the terminator
   (multi-byte encodings) are counted. */
#define ACTION_TERMINATOR                                                     \
  { int too_long = 0;                                                         \
    if (line_len != (size_t)-1) {                                             \
      line_len += (size_t)yyleng;                                             \
      if (line_len > MAXGEDCLINELEN * encoding_width) {                       \
        gedcom_error(_("Line too long, max %d characters allowed"),           \
                     MAXGEDCLINELEN);                                         \
        too_long = 1;                                                         \
      }                                                                       \
    }                                                                         \
    INIT_LINE_LEN;                                                            \
    line_no++;                                                                \
    BEGIN(INITIAL);                                                           \
    if (too_long)                                                             \
      return BADTOKEN;                                                        \
  }
284
285
286 /* Eventually we have to return 1 closing bracket (for the trailer).
287    We can detect whether we have sent the closing bracket using the
288    level_diff (at eof, first it is 2, then we increment it ourselves)
289 */
290
/* End of input.  The first time (level_diff == 2) the closing bracket for
   the last record is returned; after that the level state is reset and
   the lexer is terminated.

   The trailing yy_flex_realloc() call only exists to silence an "unused
   function" warning from the lex-generated code; it is never reached
   because yyterminate() returns.  Its arguments are initialised so that
   the call does not read indeterminate values and does not trigger an
   "uninitialised variable" warning of its own. */
#define ACTION_EOF                                                            \
  { if (level_diff == 2) {                                                    \
      level_diff++;                                                           \
      return CLOSE;                                                           \
    }                                                                         \
    else {                                                                    \
      char* ptr = NULL; int size = 0;                                         \
      /* Reset our state */                                                   \
      current_level = -1;                                                     \
      level_diff = MAXGEDCLEVEL;                                              \
      /* ... then terminate lex */                                            \
      yyterminate();                                                          \
      /* Silences a compiler warning from the lex generated code;     */      \
      /* yyterminate does return(), so program will never come here   */      \
      yy_flex_realloc(ptr, size);                                             \
    }                                                                         \
  }
308
/* A lone '@' inside a value.  Without compat_at this is an error and
   BADTOKEN is returned.  With compat_at set, the matched text is pushed
   back onto the input twice (last character first, as unput() requires),
   so that it is scanned again in doubled form.  The text is copied first
   because it is read while unput() is being called; if the copy cannot be
   allocated, MEMORY_ERROR is invoked and nothing is pushed back. */
#define ACTION_NORMAL_AT                                                      \
  { if (!compat_at) {                                                         \
      gedcom_error(_("'@' character should be written as '@@' in values"));   \
      return BADTOKEN;                                                        \
    }                                                                         \
    else {                                                                    \
      int pass, pos;                                                          \
      char *saved = strdup(yytext);                                           \
      if (!saved) {                                                           \
        MEMORY_ERROR;                                                         \
      }                                                                       \
      else {                                                                  \
        for (pass = 0; pass < 2; pass++) {                                    \
          pos = yyleng;                                                       \
          while (pos-- > 0)                                                   \
            unput(saved[pos]);                                                \
        }                                                                     \
        free(saved);                                                          \
      }                                                                       \
    }                                                                         \
  }
328
329 #define ACTION_UNEXPECTED                                                     \
330   { gedcom_error(_("Unexpected character: '%s' (0x%02x)"),                    \
331                  yytext, yytext[0]);                                          \
332     return BADTOKEN;                                                          \
333   }
334
335 #endif /* IN_LEX */