Fix for line numbering bug (bug 613480).
[gedcom-parse.git] / gedcom / gedcom_lex_common.c
1 /* Common lexer code.
2    Copyright (C) 2001, 2002 The Genes Development Team
3    This file is part of the Gedcom parser library.
4    Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
5
6    The Gedcom parser library is free software; you can redistribute it
7    and/or modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10
11    The Gedcom parser library is distributed in the hope that it will be
12    useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15
16    You should have received a copy of the GNU Lesser General Public
17    License along with the Gedcom parser library; if not, write to the
18    Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20
21 /* $Id$ */
22 /* $Name$ */
23
24 #if LEX_SECTION == 1
25
26 #include "gedcom_internal.h"
27 #include "multilex.h"
28 #include "encoding.h"
29 #include "gedcom.h"
30 #include "gedcom.tabgen.h"
31 #include "compat.h"
32
33 static size_t encoding_width;
34 static int current_level = -1;
35 static int level_diff=MAXGEDCLEVEL;
36 static size_t line_len = 0;
37
38 static char ptr_buf[MAXGEDCPTRLEN * UTF_FACTOR + 1];
39 static char tag_buf[MAXGEDCTAGLEN * UTF_FACTOR + 1];
40 static char str_buf[MAXGEDCLINELEN * UTF_FACTOR + 1];
41
42 #ifdef LEXER_TEST 
43 YYSTYPE gedcom_lval;
44 int line_no = 1;
45 int compat_at = 0;
46
47 int gedcom_lex();
48
49 void message_handler(Gedcom_msg_type type, char *msg)
50 {
51   fprintf(stderr, "(%d) %s\n", type, msg);
52 }
53
54 int test_loop(ENCODING enc, const char* code)
55 {
56   int tok, res;
57   init_encodings();
58   set_encoding_width(enc);
59   gedcom_set_message_handler(message_handler);
60   res = open_conv_to_internal(code);
61   if (!res) {
62     gedcom_error("Unable to open conversion context: %s",
63                  strerror(errno));
64     return 1;
65   }
66   tok = gedcom_lex();
67   while (tok) {
68     switch(tok) {
69       case BADTOKEN: printf("BADTOKEN "); break;
70       case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
71       case CLOSE: printf("CLOSE "); break;
72       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
73       case DELIM: printf("DELIM "); break;
74       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
75       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
76       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.tag.string); break;
77       default: printf("TAG(%s) ", gedcom_lval.tag.string); break;
78     }
79     tok = gedcom_lex();
80   }
81   printf("\n");
82   close_conv_to_internal();
83   return 0;  
84 }
85  
86 #endif /* of #ifdef LEXER_TEST */
87
/* These are defined as functions here, because xgettext has trouble
   extracting the strings out of long preprocessor definitions. */
90
91 static void error_line_too_long()
92 {
93   gedcom_error(_("Line too long, max %d characters allowed"), MAXGEDCLINELEN); 
94 }
95
/* Reports a level number that is written with a leading zero. */
static void error_level_leading_zero(void)
{
  gedcom_error(_("Level number with leading zero not allowed"));
}
100
101 static void error_level_out_of_range()
102 {
103   gedcom_error (_("Level number out of range [0..%d]"), MAXGEDCLEVEL); 
104 }
105
/* Reports that the level of a line is more than one higher than the level
   of the line before it; level_diff is that difference. */
static void error_level_too_high(int level_diff)
{
  gedcom_error(_("GEDCOM level number is %d higher than previous"), level_diff);
}
111
112 static void error_tag_too_long(const char *tag)
113 {
114   gedcom_error(_("Tag '%s' too long, max %d characters allowed"),
115                tag, MAXGEDCTAGLEN); 
116 }
117
/* Reports a character that could not be converted from the input encoding.
   STR is the offending text, CH its first byte.

   CH is converted to unsigned char before it is formatted: where plain char
   is signed, a byte >= 0x80 would otherwise be sign-extended and printed as
   e.g. 0xffffffe9 instead of 0xe9. */
static void error_invalid_character(const char *str, char ch)
{
  gedcom_error(_("Invalid character for encoding: '%s' (0x%02x)"), str,
               (unsigned int)(unsigned char)ch);
}
122
123 static void error_pointer_too_long(const char *ptr)
124 {
125   gedcom_error(_("Pointer '%s' too long, max %d characters allowed"),
126                ptr, MAXGEDCPTRLEN);
127 }
128
/* Reports a single '@' in a value, where '@@' is required. */
static void error_at_character(void)
{
  gedcom_error(_("'@' character should be written as '@@' in values"));
}
133
/* Reports a character that is not allowed at the current position.
   STR is the offending text, CH its first byte.

   CH is converted to unsigned char before it is formatted: where plain char
   is signed, a byte >= 0x80 would otherwise be sign-extended and printed
   with eight hex digits instead of two. */
static void error_unexpected_character(const char* str, char ch)
{
  gedcom_error(_("Unexpected character: '%s' (0x%02x)"), str,
               (unsigned int)(unsigned char)ch);
}
138
/* When non-zero, TO_INTERNAL hands back its input string unchanged instead
   of converting it; meant for input that already is UTF-8 coming from the
   program itself, so that the iconv conversion can be bypassed. */
static int dummy_conv = 0;
142
143 #elif LEX_SECTION == 2
144
/* Converts the token text STR (yyleng bytes long) to the internal encoding,
   using the array OUTBUF as output buffer; evaluates to the converted
   string.  If dummy_conv is set, STR itself is the result and no conversion
   takes place.  OUTBUF must be a real array, since its size is taken with
   sizeof.  The arguments are parenthesized so that any expression can be
   passed safely. */
#define TO_INTERNAL(STR,OUTBUF) \
  (dummy_conv ? (STR) : to_internal((STR), yyleng, (OUTBUF), sizeof(OUTBUF)))
147
/* Restarts the line length accounting (used when a line has ended).  The
   expansion already ends in a semicolon. */
#define INIT_LINE_LEN \
  line_len = 0;
150
/* Adds the length of the current token to the running length of the line
   and, once the line exceeds MAXGEDCLINELEN * encoding_width, reports an
   error and makes the lexer return BADTOKEN.  After the error line_len is
   set to (size_t)-1, so that the remainder of the same line is neither
   checked nor reported again.

   The token length is taken from yyleng instead of strlen(yytext): the
   limit is scaled by encoding_width, and in an encoding that is more than
   one byte wide the raw token text can contain zero bytes, at which strlen
   would stop counting. */
#define CHECK_LINE_LEN \
  { if (line_len != (size_t)-1) { \
      line_len += (size_t)yyleng; \
      if (line_len > MAXGEDCLINELEN * encoding_width) { \
        error_line_too_long(); \
        line_len = (size_t)-1; \
        return BADTOKEN; \
      } \
    } \
  }
161
/* Lexer action for the predefined tag THETAG: stores the converted tag text
   and the token value TAG_<THETAG> in gedcom_lval, switches the lexer to
   the NORMAL start condition, advances the line counter and returns the
   token.  Note that line_no is advanced here, when the tag is recognized,
   and not when the line terminator is seen. */
#define MKTAGACTION(THETAG) \
  { CHECK_LINE_LEN; \
    gedcom_lval.tag.value  = TAG_##THETAG; \
    gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buf); \
    line_no++; \
    BEGIN(NORMAL); \
    return TAG_##THETAG; \
  }
170
171 /* The GEDCOM level number is converted into a sequence of opening
172    and closing brackets.  Simply put, the following GEDCOM fragment:
173    
174    0 HEAD
175    1 SOUR genes
176    2 VERS 1.6
177    2 NAME Genes
178    1 DATE 07 OCT 2001
179    ...
180    0 TRLR
181    
182    is converted into:
183    
184    { HEAD                     (initial)  
185    { SOUR genes               (1 higher: no closing brackets)
186    { VERS 1.6                 (1 higher: no closing brackets)
187    } { NAME Genes             (same level: 1 closing bracket)
188    } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
189    ...
190    } { TRLR }
191    
192    or more clearly:
193    
194    { HEAD
195      { SOUR genes
196        { VERS 1.6 }
197        { NAME Genes } }
198      { DATE 07 OCT 2001
199      ... }
200    { TRLR }
201
202    But because this means that one token is converted into a series
203    of tokens, there is some initial code following immediately here
204    that returns "pending" tokens. */
205
/* Code to run on every lexer call before any pattern is matched: hands out
   the bracket tokens that are still pending for the last level number.
   While level_diff is below 1 a CLOSE is returned, at exactly 1 the OPEN
   (carrying the current level) is returned; level_diff is incremented each
   time, so that once it is above 1 normal scanning takes over. */
#define ACTION_BEFORE_REGEXPS \
   { if (level_diff <= 0) { \
       ++level_diff; \
       return CLOSE; \
     } \
     if (level_diff == 1) { \
       gedcom_lval.number = current_level; \
       ++level_diff; \
       return OPEN; \
     } \
     /* otherwise: out of brackets, go on with the regular expressions */ \
   }
220
221
/* Whitespace at the start of a line: it counts towards the line length,
   but produces no token. */
#define ACTION_INITIAL_WHITESPACE \
  { CHECK_LINE_LEN; \
    /* nothing else to do: the whitespace itself is dropped */ \
  }
226
227
/* A level number with a leading zero: reported as an error, the lexer
   returns BADTOKEN. */
#define ACTION_0_DIGITS \
   { error_level_leading_zero(); \
     return BADTOKEN; \
   }
232
233
/* Lexer action for a level number.

   The digits are converted to the internal encoding and parsed; the level
   must lie in 0..MAXGEDCLEVEL.  The difference with the previous level
   determines the bracket tokens: for a difference below 1 the first CLOSE
   is returned here (the remaining brackets are handed out on later calls,
   see ACTION_BEFORE_REGEXPS), for a difference of exactly 1 the OPEN is
   returned, and a larger difference is an error.  The lexer is switched to
   the EXPECT_TAG start condition.

   The conversion result is checked before use, because TO_INTERNAL can
   yield a null pointer (see ACTION_ANY).  strtol is used instead of atoi:
   a value that does not fit is clamped to LONG_MAX (or LONG_MIN), which
   the range check below rejects, whereas atoi has undefined behavior in
   that case. */
#define ACTION_DIGITS \
   { char *level_str = TO_INTERNAL(yytext, str_buf); \
     long level; \
     CHECK_LINE_LEN; \
     if (!level_str) { \
       /* Something went wrong during conversion... */ \
       error_invalid_character(yytext, yytext[0]); \
       return BADTOKEN; \
     } \
     level = strtol(level_str, NULL, 10); \
     if ((level < 0) || (level > MAXGEDCLEVEL)) { \
       error_level_out_of_range(); \
       return BADTOKEN; \
     } \
     level_diff = (int)level - current_level; \
     BEGIN(EXPECT_TAG); \
     current_level = (int)level; \
     if (level_diff < 1) { \
       level_diff++; \
       return CLOSE; \
     } \
     else if (level_diff == 1) { \
       level_diff++; \
       gedcom_lval.number = current_level; \
       return OPEN; \
     } \
     else { \
       /* should never happen (error to GEDCOM spec) */ \
       error_level_too_high(level_diff); \
       return BADTOKEN; \
     } \
   }
259
260
/* Lexer action for a tag that is not one of the predefined ones.

   A tag longer than MAXGEDCTAGLEN * encoding_width is reported and yields
   BADTOKEN; otherwise the converted tag text is stored in gedcom_lval, the
   lexer switches to the NORMAL start condition and USERTAG is returned.
   The line counter is advanced in both cases.

   The length is taken from yyleng instead of strlen(yytext): the limit is
   scaled by encoding_width, and in an encoding that is more than one byte
   wide the raw token text can contain zero bytes, at which strlen would
   stop counting. */
#define ACTION_ALPHANUM \
   { if ((size_t)yyleng > MAXGEDCTAGLEN * encoding_width) { \
       error_tag_too_long(yytext); \
       line_no++; \
       return BADTOKEN; \
     } \
     CHECK_LINE_LEN; \
     gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buf); \
     gedcom_lval.tag.value  = USERTAG; \
     BEGIN(NORMAL); \
     line_no++; \
     return USERTAG; \
   }
274
275
/* Lexer action for a delimiter: after the line length check the converted
   text is stored in gedcom_lval.string and DELIM is returned. */
#define ACTION_DELIM \
  { CHECK_LINE_LEN; \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buf); \
    return DELIM; \
  }
281
282
/* Lexer action for any other character in a value.

   The character is converted to the internal encoding.  A failed
   conversion (null result) is reported and yields BADTOKEN.  Otherwise the
   result is stored in gedcom_lval.string and ANYCHAR is returned, unless
   the result is the empty string, in which case no token is returned and
   scanning simply continues. */
#define ACTION_ANY \
  { char* converted; \
    CHECK_LINE_LEN; \
    converted = TO_INTERNAL(yytext, str_buf); \
    if (!converted) { \
      /* Something went wrong during conversion... */ \
      error_invalid_character(yytext, yytext[0]); \
      return BADTOKEN; \
    } \
    gedcom_lval.string = converted; \
    /* Due to character conversions, it is possible that the current \
       character will be combined with the next one, so that there is no \
       complete character yet and the result is still empty. \
       In principle this only applies to the 1-byte case (e.g. ANSEL), \
       but the check does no harm in the unicode case. */ \
    if (converted[0] != '\0') \
      return ANYCHAR; \
  }
304
305
/* Lexer action for an escape sequence: after the line length check the
   converted text is stored in gedcom_lval.string and ESCAPE is returned. */
#define ACTION_ESCAPE \
  { CHECK_LINE_LEN; \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buf); \
    return ESCAPE; \
  }
311
312
/* Lexer action for a pointer.

   A pointer longer than MAXGEDCPTRLEN * encoding_width is reported and
   yields BADTOKEN; otherwise the converted text is stored in
   gedcom_lval.string and POINTER is returned.

   The length is taken from yyleng instead of strlen(yytext): the limit is
   scaled by encoding_width, and in an encoding that is more than one byte
   wide the raw token text can contain zero bytes, at which strlen would
   stop counting. */
#define ACTION_POINTER \
  { CHECK_LINE_LEN; \
    if ((size_t)yyleng > MAXGEDCPTRLEN * encoding_width) { \
      error_pointer_too_long(yytext); \
      return BADTOKEN; \
    } \
    gedcom_lval.string = TO_INTERNAL(yytext, ptr_buf); \
    return POINTER; \
  }
322
323
324 /* Due to the conversion of level numbers into brackets, the
325    terminator is not important, so no token is returned here.
326    Although not strictly according to the GEDCOM spec, we'll ignore
327    whitespace just before the terminator.
328 */
329
/* Lexer action for the line terminator: the terminator is still counted
   towards the length of the line that ends here, after which the length
   accounting is restarted and the lexer goes back to the INITIAL start
   condition.  No token is returned. */
#define ACTION_TERMINATOR \
  { CHECK_LINE_LEN; \
    INIT_LINE_LEN; \
    BEGIN(INITIAL); \
  }
335
336
337 /* Eventually we have to return 1 closing bracket (for the trailer).
338    We can detect whether we have sent the closing bracket using the
339    level_diff (at eof, first it is 2, then we increment it ourselves)
340 */
341
/* Lexer action at end of file: the first time (level_diff is 2) the final
   CLOSE is returned and level_diff is incremented; on the next call the
   lexer is terminated. */
#define ACTION_EOF \
  { if (level_diff != 2) { \
      char* ptr; int size; \
      /* ... terminate lex */ \
      yyterminate(); \
      /* yyterminate does return(), so the program never gets here; the */ \
      /* call below is only there to get rid of a compiler warning about */ \
      /* the lex generated code */ \
      yy_flex_realloc(ptr, size); \
    } \
    else { \
      level_diff++; \
      return CLOSE; \
    } \
  }
356
/* Lexer action for a single '@' inside a value.

   Without compat_at this is an error and BADTOKEN is returned.  With
   compat_at set, the matched text is pushed back onto the input twice, so
   that it is scanned again as a doubled '@'.

   The matched text is copied first, because it is no longer usable once
   unput has been called.  The copy is made with an explicit length of
   yyleng bytes rather than with strdup: in an encoding that is more than
   one byte wide the token can contain a zero byte, so a strdup copy could
   be shorter than the yyleng bytes that are read back from it below. */
#define ACTION_NORMAL_AT \
  { if (compat_at) { \
      int i, j; \
      char *yycopy = malloc((size_t)yyleng); \
      if (yycopy) { \
        memcpy(yycopy, yytext, (size_t)yyleng); \
        for (i = 0; i < 2; i++) \
          /* unput pushes back one byte at a time: go back to front */ \
          for (j = yyleng - 1; j >= 0; --j) \
            unput(yycopy[j]); \
        free(yycopy); \
      } \
      else { \
        MEMORY_ERROR; \
      } \
    } \
    else { \
      error_at_character(); \
      return BADTOKEN; \
    } \
  }
376
/* Lexer action for input that matches no other rule: the text and its
   first byte are reported and BADTOKEN is returned. */
#define ACTION_UNEXPECTED \
  { error_unexpected_character(yytext, yytext[0]); \
    return BADTOKEN; \
  }
381
382 #elif LEX_SECTION == 3
383
/* Called by the lexer at end of input; returning 1 tells it that there is
   no further input to continue with. */
int yywrap(void)
{
  return 1;
}
388
389 static void yylex_cleanup()
390 {
391   /* fix memory leak in lex */
392   yy_delete_buffer(yy_current_buffer);
393   yy_current_buffer = NULL;
394 }
395
396 static int exitfuncregistered = 0;
397
398 void yymyinit(FILE *f)
399 {
400   if (! exitfuncregistered && atexit(yylex_cleanup) == 0)
401     exitfuncregistered = 1;
402   yyin = f;
403   yyrestart(f);
404   /* Reset our state */
405   current_level = -1;
406   level_diff = MAXGEDCLEVEL;
407   BEGIN(INITIAL);
408 }
409
410 #endif