Moved encoding state to separate source file.
[gedcom-parse.git] / gedcom / gedcom_lex_common.c
1 /* Common lexer code.
2    Copyright (C) 2001, 2002 The Genes Development Team
3    This file is part of the Gedcom parser library.
4    Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
5
6    The Gedcom parser library is free software; you can redistribute it
7    and/or modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10
11    The Gedcom parser library is distributed in the hope that it will be
12    useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15
16    You should have received a copy of the GNU Lesser General Public
17    License along with the Gedcom parser library; if not, write to the
18    Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20
21 /* $Id$ */
22 /* $Name$ */
23
24 #if LEX_SECTION == 1
25
26 #include "gedcom_internal.h"
27 #include "multilex.h"
28 #include "encoding.h"
29 #include "encoding_state.h"
30 #include "gedcom.h"
31 #include "gedcom.tabgen.h"
32 #include "compat.h"
33
34 static size_t encoding_width;
35 static int current_level = -1;
36 static int level_diff=MAXGEDCLEVEL;
37 static size_t line_len = 0;
38
39 static struct conv_buffer* ptr_buffer = NULL;
40 static struct conv_buffer* tag_buffer = NULL;
41 static struct conv_buffer* str_buffer = NULL;
42
/* Initial sizes handed to create_conv_buffer for the pointer, tag and
   string conversion buffers.  The expansions are parenthesized so that the
   macros keep their value when used inside a larger expression. */
#define INITIAL_PTR_BUFFER_LEN (MAXGEDCPTRLEN * UTF_FACTOR + 1)
#define INITIAL_TAG_BUFFER_LEN (MAXGEDCTAGLEN * UTF_FACTOR + 1)
#define INITIAL_STR_BUFFER_LEN (MAXGEDCLINELEN * UTF_FACTOR + 1)
46
47 #ifdef LEXER_TEST 
48 YYSTYPE gedcom_lval;
49 int line_no = 1;
50
51 int gedcom_lex();
52
53 void message_handler(Gedcom_msg_type type, char *msg)
54 {
55   fprintf(stderr, "(%d) %s\n", type, msg);
56 }
57
58 int test_loop(ENCODING enc, const char* code)
59 {
60   int tok, res;
61   init_encodings();
62   set_encoding_width(enc);
63   gedcom_set_message_handler(message_handler);
64   res = open_conv_to_internal(code);
65   if (!res) {
66     gedcom_error("Unable to open conversion context: %s",
67                  strerror(errno));
68     return 1;
69   }
70   tok = gedcom_lex();
71   while (tok) {
72     switch(tok) {
73       case BADTOKEN: printf("BADTOKEN "); break;
74       case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
75       case CLOSE: printf("CLOSE "); break;
76       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
77       case DELIM: printf("DELIM "); break;
78       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
79       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
80       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.tag.string); break;
81       default: printf("TAG(%s) ", gedcom_lval.tag.string); break;
82     }
83     tok = gedcom_lex();
84   }
85   printf("\n");
86   close_conv_to_internal();
87   return 0;  
88 }
89  
90 #endif /* of #ifdef LEXER_TEST */
91
/* These are defined as functions here, because xgettext has trouble
   extracting the strings out of long pre-processor defines */
94
95 static void error_line_too_long(const char *line)
96 {
97   gedcom_error(_("Line too long, max %d characters allowed: %s"),
98                MAXGEDCLINELEN, line); 
99 }
100
/* Report a level number that is written with a leading zero. */
static void error_level_leading_zero(void)
{
  gedcom_error(_("Level number with leading zero not allowed"));
}
105
106 static void error_level_out_of_range()
107 {
108   gedcom_error (_("Level number out of range [0..%d]"), MAXGEDCLEVEL); 
109 }
110
/* Report a level number that rises by more than one step; 'level_diff' is
   the increase quoted in the message. */
static void error_level_too_high(int level_diff)
{
  gedcom_error(_("GEDCOM level number is %d higher than previous"),
               level_diff);
}
116
117 static void error_tag_too_long(const char *tag)
118 {
119   gedcom_error(_("Tag '%s' too long, max %d characters allowed"),
120                tag, MAXGEDCTAGLEN); 
121 }
122
/* Report input that could not be converted from the source encoding.
   'str' is the offending text, 'ch' its first byte (shown in hex). */
static void error_invalid_character(const char *str, char ch)
{
  /* Go through unsigned char first: where plain char is signed, a byte with
     the high bit set would be sign-extended by the argument promotion and
     printed as 0xffffffXX instead of 0xXX. */
  gedcom_error(_("Invalid character for encoding: '%s' (0x%02x)"),
               str, (unsigned int)(unsigned char)ch);
}
127
128 static void error_pointer_too_long(const char *ptr)
129 {
130   gedcom_error(_("Pointer '%s' too long, max %d characters allowed"),
131                ptr, MAXGEDCPTRLEN);
132 }
133
/* Report a single '@' found where a doubled '@@' is required. */
static void error_at_character(void)
{
  gedcom_error(_("'@' character should be written as '@@' in values"));
}
138
/* Report input that matches none of the expected patterns.
   'str' is the offending text, 'ch' its first byte (shown in hex). */
static void error_unexpected_character(const char* str, char ch)
{
  /* Go through unsigned char first: where plain char is signed, a byte with
     the high bit set would be sign-extended by the argument promotion and
     printed as 0xffffffXX instead of 0xXX. */
  gedcom_error(_("Unexpected character: '%s' (0x%02x)"),
               str, (unsigned int)(unsigned char)ch);
}
143
/* When non-zero, TO_INTERNAL hands back the lexer text unchanged instead of
   calling to_internal, i.e. the iconv conversion is bypassed (for input
   that is already UTF-8 coming from the program). */
static int dummy_conv;
147
148 #elif LEX_SECTION == 2
149
/* Yield the internal form of the matched text STR: STR itself when
   dummy_conv is set, otherwise the result of to_internal() applied to the
   yyleng bytes of STR with OUTBUF as output buffer. */
#define TO_INTERNAL(STR,OUTBUF)                                               \
  (dummy_conv                                                                 \
     ? (STR)                                                                  \
     : to_internal((STR), yyleng, (OUTBUF)))
152
/* Start counting the length of a new input line (this also clears the
   "already reported" marker set by CHECK_LINE_LEN). */
#define INIT_LINE_LEN line_len = 0;
155
/* Add the size of the current match to the running line length and return
   BADTOKEN (after reporting the error once) when the line exceeds
   MAXGEDCLINELEN * encoding_width.  Once reported, line_len is set to
   (size_t)-1 so that the rest of the line is not checked again.

   The size is taken from yyleng rather than strlen(yytext): the limit is
   scaled by encoding_width, and text in a multi-byte encoding can contain
   zero bytes at which strlen would stop counting. */
#define CHECK_LINE_LEN                                                        \
  { if (line_len != (size_t)-1) {                                             \
      line_len += (size_t)yyleng;                                             \
      if (line_len > MAXGEDCLINELEN * encoding_width) {                       \
        error_line_too_long(yytext);                                          \
        line_len = (size_t)-1;                                                \
        return BADTOKEN;                                                      \
      }                                                                       \
    }                                                                         \
  }
166
/* Action for a recognized standard tag THETAG: after the line length check,
   store the converted tag text and the token value TAG_<THETAG> in
   gedcom_lval, count the line, switch to the NORMAL start condition and
   return the token. */
#define MKTAGACTION(THETAG)                                                  \
  { CHECK_LINE_LEN;                                                          \
    gedcom_lval.tag.value  = TAG_##THETAG;                                   \
    gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buffer);                \
    line_no++;                                                               \
    BEGIN(NORMAL);                                                           \
    return TAG_##THETAG;                                                     \
  }
175
176 /* The GEDCOM level number is converted into a sequence of opening
177    and closing brackets.  Simply put, the following GEDCOM fragment:
178    
179    0 HEAD
180    1 SOUR genes
181    2 VERS 1.6
182    2 NAME Genes
183    1 DATE 07 OCT 2001
184    ...
185    0 TRLR
186    
187    is converted into:
188    
189    { HEAD                     (initial)  
190    { SOUR genes               (1 higher: no closing brackets)
191    { VERS 1.6                 (1 higher: no closing brackets)
192    } { NAME Genes             (same level: 1 closing bracket)
193    } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
194    ...
195    } { TRLR }
196    
197    or more clearly:
198    
199    { HEAD
200      { SOUR genes
201        { VERS 1.6 }
202        { NAME Genes } }
203      { DATE 07 OCT 2001
204      ... }
205    { TRLR }
206
207    But because this means that one token is converted into a series
208    of tokens, there is some initial code following immediately here
209    that returns "pending" tokens. */
210
/* Return the next pending bracket token, if any: OPEN (carrying the current
   level) when level_diff is exactly 1, CLOSE while level_diff is below 1.
   level_diff is incremented for every token returned; once it is above 1
   nothing is pending and the block falls through. */
#define ACTION_BEFORE_REGEXPS                                                 \
   { if (level_diff == 1) {                                                   \
       level_diff = 2;                                                        \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     if (level_diff < 1) {                                                    \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     /* level_diff > 1: out of brackets... */                                 \
   }
225
226
/* Whitespace at the start of a line only counts towards the line length;
   no token is returned for it. */
#define ACTION_INITIAL_WHITESPACE                                             \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
  }
231
232
/* A level number with a leading zero: report it and return BADTOKEN. */
#define ACTION_0_DIGITS                                                       \
   {                                                                          \
     error_level_leading_zero();                                              \
     return BADTOKEN;                                                         \
   }
237
238
/* Action for a level number.  The number is validated against
   [0..MAXGEDCLEVEL], the difference with the previous level is stored in
   level_diff, and the first bracket token for the new line is returned:
   CLOSE when the level did not rise, OPEN when it rose by exactly one.  A
   rise of more than one level is an error and gives BADTOKEN.  Any further
   pending brackets are handed out by ACTION_BEFORE_REGEXPS.

   The converted text is checked for NULL before it is parsed, because
   TO_INTERNAL can fail (see ACTION_ANY).  strtol is used instead of atoi so
   that an overlong digit string saturates, and is then rejected by the
   range check, instead of causing undefined behaviour. */
#define ACTION_DIGITS                                                         \
   { char* level_str = TO_INTERNAL(yytext, str_buffer);                       \
     long level;                                                              \
     CHECK_LINE_LEN;                                                          \
     if (!level_str) {                                                        \
       /* Something went wrong during conversion... */                        \
       error_invalid_character(yytext, yytext[0]);                            \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
     level = strtol(level_str, NULL, 10);                                     \
     if ((level < 0) || (level > MAXGEDCLEVEL)) {                             \
       error_level_out_of_range();                                            \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
     level_diff = (int)level - current_level;                                 \
     BEGIN(EXPECT_TAG);                                                       \
     current_level = (int)level;                                              \
     if (level_diff < 1) {                                                    \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     else if (level_diff == 1) {                                              \
       level_diff++;                                                          \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     else {                                                                   \
       /* should never happen (error to GEDCOM spec) */                       \
       error_level_too_high(level_diff);                                      \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
   }
266
267
/* Action for a tag that is not one of the predefined ones: reject it when
   it is longer than MAXGEDCTAGLEN * encoding_width, otherwise return it as
   USERTAG with the converted text in gedcom_lval.tag.

   The size is taken from yyleng rather than strlen(yytext): the limit is
   scaled by encoding_width, and text in a multi-byte encoding can contain
   zero bytes at which strlen would stop counting. */
#define ACTION_ALPHANUM                                                       \
   { if ((size_t)yyleng > MAXGEDCTAGLEN * encoding_width) {                   \
       error_tag_too_long(yytext);                                            \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
     CHECK_LINE_LEN;                                                          \
     gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buffer);                \
     gedcom_lval.tag.value  = USERTAG;                                        \
     BEGIN(NORMAL);                                                           \
     line_no++;                                                               \
     return USERTAG;                                                          \
   }
281
282
/* Action for a delimiter: return DELIM with the converted text in
   gedcom_lval.string. */
#define ACTION_DELIM                                                          \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buffer);                     \
    return DELIM;                                                             \
  }
288
289
/* Action for any other character: convert it and return ANYCHAR with the
   result in gedcom_lval.string.  A failed conversion is reported and gives
   BADTOKEN.  When the conversion yields an empty string no token is
   returned at all. */
#define ACTION_ANY                                                            \
  { char* converted;                                                          \
    CHECK_LINE_LEN;                                                           \
    converted = TO_INTERNAL(yytext, str_buffer);                              \
    if (converted == NULL) {                                                  \
      /* Something went wrong during conversion... */                         \
      error_invalid_character(yytext, yytext[0]);                             \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = converted;                                           \
    /* Due to character conversions, it is possible that the current          \
       character will be combined with the next, and so now we don't have a   \
       character yet...                                                       \
       In principle, this is only applicable to the 1byte case (e.g. ANSEL),  \
       but it doesn't harm the unicode case.                                  \
    */                                                                        \
    if (converted[0] != '\0')                                                 \
      return ANYCHAR;                                                         \
  }
311
312
/* Action for an escape sequence: return ESCAPE with the converted text in
   gedcom_lval.string. */
#define ACTION_ESCAPE                                                         \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buffer);                     \
    return ESCAPE;                                                            \
  }
318
319
/* Action for a pointer: reject it when it is longer than
   MAXGEDCPTRLEN * encoding_width, otherwise return POINTER with the
   converted text in gedcom_lval.string.

   The size is taken from yyleng rather than strlen(yytext): the limit is
   scaled by encoding_width, and text in a multi-byte encoding can contain
   zero bytes at which strlen would stop counting. */
#define ACTION_POINTER                                                        \
  { CHECK_LINE_LEN;                                                           \
    if ((size_t)yyleng > MAXGEDCPTRLEN * encoding_width) {                    \
      error_pointer_too_long(yytext);                                         \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = TO_INTERNAL(yytext, ptr_buffer);                     \
    return POINTER;                                                           \
  }
329
330
331 /* Due to the conversion of level numbers into brackets, the
332    terminator is not important, so no token is returned here.
333    Although not strictly according to the GEDCOM spec, we'll ignore
334    whitespace just before the terminator.
335 */
336
/* Action for a line terminator: check and then reset the line length,
   pass the converted terminator to set_read_encoding_terminator while
   line_no is still 1, and go back to the INITIAL start condition.
   No token is returned. */
#define ACTION_TERMINATOR                                                     \
  { CHECK_LINE_LEN;                                                           \
    INIT_LINE_LEN;                                                            \
    if (line_no == 1) {                                                       \
      set_read_encoding_terminator(TO_INTERNAL(yytext, str_buffer));          \
    }                                                                         \
    BEGIN(INITIAL);                                                           \
  }
344
345
346 /* Eventually we have to return 1 closing bracket (for the trailer).
347    We can detect whether we have sent the closing bracket using the
348    level_diff (at eof, first it is 2, then we increment it ourselves)
349 */
350
/* Action at end of input: the first time (level_diff == 2) return the final
   CLOSE and bump level_diff; on the next call terminate the lexer.

   The yy_flex_realloc call after yyterminate() is never executed; it only
   references the function so the compiler does not warn about it being
   unused.  Its arguments are initialized so that this workaround does not
   itself trigger an "uninitialized variable" warning. */
#define ACTION_EOF                                                            \
  { if (level_diff == 2) {                                                    \
      level_diff++;                                                           \
      return CLOSE;                                                           \
    }                                                                         \
    else {                                                                    \
      char* ptr = NULL; int size = 0;                                         \
      /* ... terminate lex */                                                 \
      yyterminate();                                                          \
      /* Silences a compiler warning from the lex generated code;    */       \
      /* yyterminate does return(), so program will never come here  */       \
      yy_flex_realloc(ptr, size);                                             \
    }                                                                         \
  }
365
/* Action for a single '@' in a value.  Without the C_NO_DOUBLE_AT
   compatibility mode this is an error (BADTOKEN).  With it, the matched
   text is pushed back onto the input twice, so that it is read again as a
   doubled '@'.

   The match is copied first and the copy is what gets pushed back.  All
   yyleng bytes are copied with memcpy: strdup would stop at the first zero
   byte, which text in a multi-byte encoding can contain, and the loop below
   would then read past the end of the copy. */
#define ACTION_NORMAL_AT                                                      \
  { if (compat_mode(C_NO_DOUBLE_AT)) {                                        \
      int i, j;                                                               \
      char *yycopy = malloc((size_t)yyleng);                                  \
      if (yycopy) {                                                           \
        memcpy(yycopy, yytext, (size_t)yyleng);                               \
        for (i = 0; i < 2; i++)                                               \
          for (j = yyleng - 1; j >= 0; --j)                                   \
            unput(yycopy[j]);                                                 \
        free(yycopy);                                                         \
      }                                                                       \
      else {                                                                  \
        MEMORY_ERROR;                                                         \
      }                                                                       \
    }                                                                         \
    else {                                                                    \
      error_at_character();                                                   \
      return BADTOKEN;                                                        \
    }                                                                         \
  }
385
/* Action for input that fits no other rule: report the text together with
   its first byte and return BADTOKEN. */
#define ACTION_UNEXPECTED                                                     \
  {                                                                           \
    error_unexpected_character(yytext, yytext[0]);                            \
    return BADTOKEN;                                                          \
  }
390
391 #elif LEX_SECTION == 3
392
/* End-of-input hook of the lexer: always returns 1. */
int yywrap(void)
{
  const int no_more_input = 1;

  return no_more_input;
}
397
398 static void free_conv_buffers()
399 {
400   free_conv_buffer(ptr_buffer);
401   free_conv_buffer(tag_buffer);
402   free_conv_buffer(str_buffer);
403 }
404
405 static void yylex_cleanup()
406 {
407   /* fix memory leak in lex */
408   yy_delete_buffer(yy_current_buffer);
409   yy_current_buffer = NULL;
410   free_conv_buffers();
411 }
412
413 static void init_conv_buffers()
414 {
415   if (!ptr_buffer) {
416     ptr_buffer = create_conv_buffer(INITIAL_PTR_BUFFER_LEN);
417     tag_buffer = create_conv_buffer(INITIAL_TAG_BUFFER_LEN);
418     str_buffer = create_conv_buffer(INITIAL_STR_BUFFER_LEN);
419   }
420 }
421
422 static int exitfuncregistered = 0;
423
424 void yymyinit(FILE *f)
425 {
426   if (! exitfuncregistered && atexit(yylex_cleanup) == 0)
427     exitfuncregistered = 1;
428   init_conv_buffers();
429   yyin = f;
430   yyrestart(f);
431   /* Reset our state */
432   current_level = -1;
433   level_diff = MAXGEDCLEVEL;
434   BEGIN(INITIAL);
435 }
436
437 #endif