Check maximum lengths properly.
[gedcom-parse.git] / gedcom_lex_common.c
1 /*  This program is free software; you can redistribute it and/or modify  *
2  *  it under the terms of the GNU General Public License as published by  *
3  *  the Free Software Foundation; either version 2 of the License, or     *
4  *  (at your option) any later version.                                   *
5
6  (C) 2001 by The Genes Development Team
7  Original author: Peter Verthez (Peter.Verthez@advalvas.be)
8 */
9
10 /* $Id$ */
11 /* $Name$ */
12
13 #ifndef IN_LEX
14
15 #include "gedcom.tab.h"
16 #include "gedcom.h"
17 #include "multilex.h"
18 #include "encoding.h"
19
20 #define YY_NO_UNPUT
21
22 static size_t encoding_width;
23 static int current_level = -1;
24 static int level_diff=MAXGEDCLEVEL;
25 static size_t line_len = 0;
26
27 static char ptr_buf[MAXGEDCPTRLEN * UTF_FACTOR + 1];
28 static char tag_buf[MAXGEDCTAGLEN * UTF_FACTOR + 1];
29 static char str_buf[MAXGEDCLINELEN * UTF_FACTOR + 1];
30
31 #ifdef LEXER_TEST 
32 YYSTYPE gedcom_lval;
33 int line_no = 1;
34
35 int gedcom_lex();
36
37 int test_loop(ENCODING enc, char* code)
38 {
39   int tok, res;
40   init_encodings();
41   set_encoding_width(enc);
42   res = open_conv_to_internal(code);
43   if (!res) {
44     gedcom_error("Unable to open conversion context: %s",
45                  strerror(errno));
46     return 1;
47   }
48   tok = gedcom_lex();
49   while (tok) {
50     switch(tok) {
51       case BADTOKEN: printf("BADTOKEN "); break;
52       case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
53       case CLOSE: printf("CLOSE "); break;
54       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
55       case DELIM: printf("DELIM "); break;
56       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
57       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
58       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break;
59       default: printf("TAG(%s) ", gedcom_lval.string); break;
60     }
61     tok = gedcom_lex();
62   }
63   printf("\n");
64   close_conv_to_internal();
65   return 0;  
66 }
67  
68 #endif /* of #ifdef LEXER_TEST */
69
70 #else  /* of #ifndef IN_LEX */
71
/* Hand the current token text STR (yyleng bytes long) to to_internal(),
   with OUTBUF as destination.  OUTBUF must be an array, not a pointer:
   its capacity is taken with sizeof. */
#define TO_INTERNAL(STR,OUTBUF)                                               \
  to_internal((STR), yyleng, (OUTBUF), sizeof(OUTBUF))
74
/* Restart the line length count (also clears the "too long" marker).
   Note that the terminating semicolon is part of the macro. */
#define INIT_LINE_LEN line_len = 0;
77
/* Add the length of the current token to the running length of the line
   and reject the token with BADTOKEN as soon as the line exceeds
   MAXGEDCLINELEN characters (scaled by encoding_width).

   Once the error has been reported, line_len is set to (size_t)-1.  While
   that marker is set the whole check is skipped, so the error is reported
   only once per line and the remaining tokens (in particular the line
   terminator, which resets line_len) are processed normally.

   The token length is taken from yyleng rather than strlen(yytext), so
   that NUL bytes inside the token text do not cut the count short. */
#define CHECK_LINE_LEN                                                        \
  { if (line_len != (size_t)-1) {                                             \
      line_len += (size_t)yyleng;                                             \
      if (line_len > MAXGEDCLINELEN * encoding_width) {                       \
        gedcom_error("Line too long, max %d characters",                      \
                     MAXGEDCLINELEN);                                         \
        line_len = (size_t)-1;                                                \
        return BADTOKEN;                                                      \
      }                                                                       \
    }                                                                         \
  }
88
/* Action for a predefined tag: account for the token in the line length,
   store the converted tag text in gedcom_lval.string, switch to the NORMAL
   start condition and return the token TAG_<THETAG>. */
#define MKTAGACTION(THETAG)                                                  \
  {                                                                          \
    CHECK_LINE_LEN;                                                          \
    gedcom_lval.string = TO_INTERNAL(yytext, tag_buf);                       \
    BEGIN(NORMAL);                                                           \
    return TAG_##THETAG;                                                     \
  }
95
96 /* The GEDCOM level number is converted into a sequence of opening
97    and closing brackets.  Simply put, the following GEDCOM fragment:
98    
99    0 HEAD
100    1 SOUR genes
101    2 VERS 1.6
102    2 NAME Genes
103    1 DATE 07 OCT 2001
104    ...
105    0 TRLR
106    
107    is converted into:
108    
109    { HEAD                     (initial)  
110    { SOUR genes               (1 higher: no closing brackets)
111    { VERS 1.6                 (1 higher: no closing brackets)
112    } { NAME Genes             (same level: 1 closing bracket)
113    } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
114    ...
115    } { TRLR }
116    
117    or more clearly:
118    
119    { HEAD
120      { SOUR genes
121        { VERS 1.6 }
122        { NAME Genes } }
123      { DATE 07 OCT 2001
124      ... }
125    { TRLR }
126
127    But because this means that one token is converted into a series
128    of tokens, there is some initial code following immediately here
129    that returns "pending" tokens. */
130
/* Return the brackets that are still pending from the last level number:
   one CLOSE per call while level_diff is below 1, then a single OPEN
   (carrying the current level) when it equals 1.  When level_diff is
   above 1 nothing is pending and control falls through to the rules. */
#define ACTION_BEFORE_REGEXPS                                                 \
   { if (level_diff <= 0) {                                                   \
       ++level_diff;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     if (level_diff == 1) {                                                   \
       level_diff = 2;                                                        \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     /* level_diff > 1: out of brackets, nothing to return */                 \
   }
145
146
/* Whitespace before the level number: it counts towards the line length
   but produces no token. */
#define ACTION_INITIAL_WHITESPACE                                             \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    /* nothing else to do: the whitespace itself is dropped */                \
  }
151
152
/* A level number written with a leading zero is reported and rejected. */
#define ACTION_0_DIGITS                                                       \
   {                                                                          \
     gedcom_error ("Level number with leading zero");                         \
     return BADTOKEN;                                                         \
   }
157
158
/* Action for the level number at the start of a line.

   The number is parsed with strtol() rather than atoi(): atoi() has
   undefined behaviour when the digit string does not fit in an int, while
   strtol() clamps to LONG_MAX, which the range check below rejects.

   On success the lexer switches to the EXPECT_TAG start condition, records
   the new level and returns the first bracket for this line:
     - CLOSE when the level is the same as or lower than the previous one
       (further brackets are returned by ACTION_BEFORE_REGEXPS);
     - OPEN when the level is exactly one higher.
   A level that is out of range, or more than one higher than the previous
   level, is reported and BADTOKEN is returned. */
#define ACTION_DIGITS                                                         \
   { long parsed_level = strtol(TO_INTERNAL(yytext, str_buf), NULL, 10);      \
     int level;                                                               \
     CHECK_LINE_LEN;                                                          \
     if ((parsed_level < 0) || (parsed_level > MAXGEDCLEVEL)) {               \
       gedcom_error ("Level number out of range [0..%d]",                     \
                     MAXGEDCLEVEL);                                           \
       return BADTOKEN;                                                       \
     }                                                                        \
     level = (int)parsed_level;                                               \
     level_diff = level - current_level;                                      \
     BEGIN(EXPECT_TAG);                                                       \
     current_level = level;                                                   \
     if (level_diff < 1) {                                                    \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     else if (level_diff == 1) {                                              \
       level_diff++;                                                          \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     else {                                                                   \
       /* should never happen (error to GEDCOM spec) */                       \
       gedcom_error ("GEDCOM level number is %d higher than "                 \
                     "previous",                                              \
                     level_diff);                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
   }
187
188
/* Action for a tag that is not one of the predefined ones (user tag).

   A tag longer than MAXGEDCTAGLEN characters (scaled by encoding_width) is
   reported and rejected.  The length is taken from yyleng instead of
   strlen(yytext), because strlen() stops at the first NUL byte and would
   under-count token text that contains NUL bytes (NOTE(review): expected
   for the 2-byte encodings, where encoding_width > 1 -- confirm).

   Otherwise the converted tag is stored in gedcom_lval.string, the lexer
   switches to the NORMAL start condition and USERTAG is returned. */
#define ACTION_ALPHANUM                                                       \
   { if ((size_t)yyleng > MAXGEDCTAGLEN * encoding_width) {                   \
       gedcom_error("Tag '%s' too long, max %d characters",                   \
                    yytext, MAXGEDCTAGLEN);                                   \
       return BADTOKEN;                                                       \
     }                                                                        \
     CHECK_LINE_LEN;                                                          \
     gedcom_lval.string = TO_INTERNAL(yytext, tag_buf);                       \
     BEGIN(NORMAL);                                                           \
     return USERTAG;                                                          \
   }
200
201
/* Delimiter between the parts of a line: counted towards the line length,
   converted into str_buf and returned as DELIM. */
#define ACTION_DELIM                                                          \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
    return DELIM;                                                             \
  }
207
208
/* Any other character of the line value.

   The conversion may produce an empty string: due to character
   conversions the current character can be combined with the next one, in
   which case there is no complete character yet.  In principle this only
   applies to the 1-byte case (e.g. ANSEL), but it does no harm in the
   unicode case.  ANYCHAR is therefore only returned when the converted
   text is non-empty; otherwise scanning simply continues. */
#define ACTION_ANY                                                            \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
    if (gedcom_lval.string[0] != '\0')                                        \
      return ANYCHAR;                                                         \
  }
221
222
/* Escape sequence: counted towards the line length, converted into
   str_buf and returned as ESCAPE. */
#define ACTION_ESCAPE                                                         \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
    return ESCAPE;                                                            \
  }
228
229
/* Action for a pointer (cross-reference) token.

   After accounting for the token in the line length, a pointer longer
   than MAXGEDCPTRLEN characters (scaled by encoding_width) is reported and
   rejected.  The length is taken from yyleng instead of strlen(yytext),
   because strlen() stops at the first NUL byte and would under-count token
   text that contains NUL bytes (NOTE(review): expected for the 2-byte
   encodings, where encoding_width > 1 -- confirm).

   Otherwise the converted pointer is stored in gedcom_lval.string and
   POINTER is returned. */
#define ACTION_POINTER                                                        \
  { CHECK_LINE_LEN;                                                           \
    if ((size_t)yyleng > MAXGEDCPTRLEN * encoding_width) {                    \
      gedcom_error("Pointer '%s' too long, max %d characters",                \
                   yytext, MAXGEDCPTRLEN);                                    \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = TO_INTERNAL(yytext, ptr_buf);                        \
    return POINTER;                                                           \
  }
240
241
242 /* Due to the conversion of level numbers into brackets, the
243    terminator is not important, so no token is returned here.
244    Although not strictly according to the GEDCOM spec, we'll ignore
245    whitespace just before the terminator.
246 */
247
/* Action for the line terminator.

   The end-of-line bookkeeping (resetting the line length, counting the
   line and going back to the INITIAL start condition) is always carried
   out, also for a line that is or was too long.  Otherwise a single
   over-long line would leave the lexer in the "line too long" state, with
   a stale line number and the wrong start condition, for the rest of the
   file.

   If the terminator itself makes the line too long, the error is reported
   before line_no is incremented, and BADTOKEN is returned after the
   bookkeeping.  A line already reported as too long (line_len equal to
   (size_t)-1) is not reported a second time. */
#define ACTION_TERMINATOR                                                     \
  { int line_too_long = 0;                                                    \
    if (line_len != (size_t)-1) {                                             \
      line_len += (size_t)yyleng;                                             \
      if (line_len > MAXGEDCLINELEN * encoding_width) {                       \
        gedcom_error("Line too long, max %d characters",                      \
                     MAXGEDCLINELEN);                                         \
        line_too_long = 1;                                                    \
      }                                                                       \
    }                                                                         \
    INIT_LINE_LEN;                                                            \
    line_no++;                                                                \
    BEGIN(INITIAL);                                                           \
    if (line_too_long)                                                        \
      return BADTOKEN;                                                        \
  }
254
255
256 /* Eventually we have to return 1 closing bracket (for the trailer).
257    We can detect whether we have sent the closing bracket using the
258    level_diff (at eof, first it is 2, then we increment it ourselves)
259 */
260
/* End of input: return the one closing bracket that is still owed (for
   the trailer) while level_diff is 2, bumping level_diff so that this
   happens only once; on any other value terminate the scanner. */
#define ACTION_EOF                                                            \
  { if (level_diff != 2) {                                                    \
      yyterminate();                                                          \
    }                                                                         \
    else {                                                                    \
      level_diff = 3;                                                         \
      return CLOSE;                                                           \
    }                                                                         \
  }
270
271
/* Fallback action for input that matches no other rule: report the
   offending text together with the value of its first byte and return
   BADTOKEN.

   The byte is converted through unsigned char before being printed with
   %02x; a plain (possibly signed) char would be sign-extended for bytes
   >= 0x80 and show up as e.g. 0xffffffe9 instead of 0xe9. */
#define ACTION_UNEXPECTED                                                     \
  { gedcom_error("Unexpected character: '%s' (0x%02x)",                       \
                 yytext, (unsigned int)(unsigned char)yytext[0]);             \
    return BADTOKEN;                                                          \
  }
277
278 #endif /* IN_LEX */