utf8.h is renamed to utf8tools.h
[gedcom-parse.git] / gedcom / gedcom_lex_common.c
1 /* Common lexer code.
2    Copyright (C) 2001, 2002 The Genes Development Team
3    This file is part of the Gedcom parser library.
4    Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
5
6    The Gedcom parser library is free software; you can redistribute it
7    and/or modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10
11    The Gedcom parser library is distributed in the hope that it will be
12    useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15
16    You should have received a copy of the GNU Lesser General Public
17    License along with the Gedcom parser library; if not, write to the
18    Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20
21 /* $Id$ */
22 /* $Name$ */
23
24 #if LEX_SECTION == 1
25
26 #include "gedcom_internal.h"
27 #include "multilex.h"
28 #include "encoding.h"
29 #include "gedcom.h"
30 #include "gedcom.tabgen.h"
31 #include "compat.h"
32
33 static size_t encoding_width;
34 static int current_level = -1;
35 static int level_diff=MAXGEDCLEVEL;
36 static size_t line_len = 0;
37
38 static struct conv_buffer* ptr_buffer = NULL;
39 static struct conv_buffer* tag_buffer = NULL;
40 static struct conv_buffer* str_buffer = NULL;
41
/* Initial sizes requested for the three conversion buffers: the maximum
   number of characters times UTF_FACTOR, plus one extra byte.
   Each expansion is fully parenthesized so that it stays a single operand
   when the macro is used inside a larger expression. */
#define INITIAL_PTR_BUFFER_LEN ((MAXGEDCPTRLEN) * (UTF_FACTOR) + 1)
#define INITIAL_TAG_BUFFER_LEN ((MAXGEDCTAGLEN) * (UTF_FACTOR) + 1)
#define INITIAL_STR_BUFFER_LEN ((MAXGEDCLINELEN) * (UTF_FACTOR) + 1)
45
46 #ifdef LEXER_TEST 
47 YYSTYPE gedcom_lval;
48 int line_no = 1;
49 int compat_at = 0;
50
51 int gedcom_lex();
52
53 void message_handler(Gedcom_msg_type type, char *msg)
54 {
55   fprintf(stderr, "(%d) %s\n", type, msg);
56 }
57
58 int test_loop(ENCODING enc, const char* code)
59 {
60   int tok, res;
61   init_encodings();
62   set_encoding_width(enc);
63   gedcom_set_message_handler(message_handler);
64   res = open_conv_to_internal(code);
65   if (!res) {
66     gedcom_error("Unable to open conversion context: %s",
67                  strerror(errno));
68     return 1;
69   }
70   tok = gedcom_lex();
71   while (tok) {
72     switch(tok) {
73       case BADTOKEN: printf("BADTOKEN "); break;
74       case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
75       case CLOSE: printf("CLOSE "); break;
76       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
77       case DELIM: printf("DELIM "); break;
78       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
79       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
80       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.tag.string); break;
81       default: printf("TAG(%s) ", gedcom_lval.tag.string); break;
82     }
83     tok = gedcom_lex();
84   }
85   printf("\n");
86   close_conv_to_internal();
87   return 0;  
88 }
89  
90 #endif /* of #ifdef LEXER_TEST */
91
/* These are defined as functions here, because xgettext has trouble
   extracting the strings out of long preprocessor defines. */
94
95 static void error_line_too_long()
96 {
97   gedcom_error(_("Line too long, max %d characters allowed"), MAXGEDCLINELEN); 
98 }
99
/* Reports through gedcom_error() that a level number was written with a
   leading zero.
   Declared with (void) so that the definition also acts as a prototype. */
static void error_level_leading_zero(void)
{
  gedcom_error(_("Level number with leading zero not allowed"));
}
104
105 static void error_level_out_of_range()
106 {
107   gedcom_error (_("Level number out of range [0..%d]"), MAXGEDCLEVEL); 
108 }
109
/* Reports through gedcom_error() that a level number jumped by more than
   one compared to the previous line.
   level_diff: the size of the jump; note that this parameter hides the
   file-level variable of the same name inside the function. */
static void error_level_too_high(int level_diff)
{
  gedcom_error(_("GEDCOM level number is %d higher than previous"), level_diff);
}
115
116 static void error_tag_too_long(const char *tag)
117 {
118   gedcom_error(_("Tag '%s' too long, max %d characters allowed"),
119                tag, MAXGEDCTAGLEN); 
120 }
121
/* Reports through gedcom_error() a character that could not be converted
   from the input encoding.
   str: the text that failed to convert, printed as a string.
   ch:  its first byte, printed in hexadecimal.
   The byte is converted via unsigned char before it is passed on: where
   plain char is signed, a byte of 0x80 or above would otherwise be
   sign-extended and show up as 0xffffffXX instead of 0xXX. */
static void error_invalid_character(const char *str, char ch)
{
  gedcom_error(_("Invalid character for encoding: '%s' (0x%02x)"),
               str, (unsigned int)(unsigned char)ch);
}
126
127 static void error_pointer_too_long(const char *ptr)
128 {
129   gedcom_error(_("Pointer '%s' too long, max %d characters allowed"),
130                ptr, MAXGEDCPTRLEN);
131 }
132
/* Reports through gedcom_error() that a value contains a single '@'
   where '@@' is required.
   Declared with (void) so that the definition also acts as a prototype. */
static void error_at_character(void)
{
  gedcom_error(_("'@' character should be written as '@@' in values"));
}
137
/* Reports through gedcom_error() a character the lexer did not expect.
   str: the unexpected text, printed as a string.
   ch:  its first byte, printed in hexadecimal.
   The byte is converted via unsigned char before it is passed on: where
   plain char is signed, a byte of 0x80 or above would otherwise be
   sign-extended and show up as 0xffffffXX instead of 0xXX. */
static void error_unexpected_character(const char* str, char ch)
{
  gedcom_error(_("Unexpected character: '%s' (0x%02x)"),
               str, (unsigned int)(unsigned char)ch);
}
142
/* A non-zero value bypasses the iconv conversion; meant for input that
   is already UTF-8 because it comes from the program itself. */
static int dummy_conv = 0;
146
147 #elif LEX_SECTION == 2
148
/* Yields STR in internal form: STR itself when dummy_conv is set,
   otherwise the result of to_internal() on the yyleng bytes of STR,
   using OUTBUF as the conversion buffer. */
#define TO_INTERNAL(STR,OUTBUF)                                               \
  (dummy_conv                                                                 \
     ? (STR)                                                                  \
     : to_internal((STR), yyleng, (OUTBUF)))
151
/* Restarts the line length count.  The expansion already ends in a
   semicolon. */
#define INIT_LINE_LEN line_len = 0;
154
/* Adds the length in bytes of the text just matched to the running line
   length, and makes the action return BADTOKEN (after reporting the
   error) once the line exceeds MAXGEDCLINELEN * encoding_width bytes.
   The length is taken from yyleng, not strlen(yytext): with an encoding
   wider than one byte (e.g. UTF-16) the matched text can contain zero
   bytes, at which strlen() would stop and under-count.
   After the error line_len is set to (size_t)-1, which switches the
   check off until the count is restarted, so that one long line is
   reported only once. */
#define CHECK_LINE_LEN                                                        \
  { if (line_len != (size_t)-1) {                                             \
      line_len += (size_t)yyleng;                                             \
      if (line_len > MAXGEDCLINELEN * encoding_width) {                       \
        error_line_too_long();                                                \
        line_len = (size_t)-1;                                                \
        return BADTOKEN;                                                      \
      }                                                                       \
    }                                                                         \
  }
165
/* Action for a standard tag: after the line length check, stores the
   converted tag text and the token value TAG_<THETAG> in gedcom_lval,
   switches to the NORMAL start condition, counts the line and returns
   TAG_<THETAG>. */
#define MKTAGACTION(THETAG)                                                   \
  { CHECK_LINE_LEN;                                                           \
    gedcom_lval.tag.value  = TAG_##THETAG;                                    \
    gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buffer);                 \
    line_no++;                                                                \
    BEGIN(NORMAL);                                                            \
    return TAG_##THETAG;                                                      \
  }
174
175 /* The GEDCOM level number is converted into a sequence of opening
176    and closing brackets.  Simply put, the following GEDCOM fragment:
177    
178    0 HEAD
179    1 SOUR genes
180    2 VERS 1.6
181    2 NAME Genes
182    1 DATE 07 OCT 2001
183    ...
184    0 TRLR
185    
186    is converted into:
187    
188    { HEAD                     (initial)  
189    { SOUR genes               (1 higher: no closing brackets)
190    { VERS 1.6                 (1 higher: no closing brackets)
191    } { NAME Genes             (same level: 1 closing bracket)
192    } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
193    ...
194    } { TRLR }
195    
196    or more clearly:
197    
198    { HEAD
199      { SOUR genes
200        { VERS 1.6 }
201        { NAME Genes } }
202      { DATE 07 OCT 2001
203      ... }
204    { TRLR }
205
206    But because this means that one token is converted into a series
207    of tokens, there is some initial code following immediately here
208    that returns "pending" tokens. */
209
/* Hands out the bracket tokens that are still pending from the last
   level number: a CLOSE for as long as level_diff is below 1, then a
   single OPEN (carrying current_level) when it is exactly 1.  Each token
   handed out moves level_diff one step up.  With level_diff above 1
   nothing is pending and control falls through. */
#define ACTION_BEFORE_REGEXPS                                                 \
   { if (level_diff < 1) {                                                    \
       ++level_diff;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     if (level_diff == 1) {                                                   \
       gedcom_lval.number = current_level;                                    \
       ++level_diff;                                                          \
       return OPEN;                                                           \
     }                                                                        \
     /* out of brackets... */                                                 \
   }
224
225
/* Whitespace at the start of a line: it only counts towards the line
   length and produces no token. */
#define ACTION_INITIAL_WHITESPACE                                             \
  { /* ignore initial whitespace further, apart from its length */            \
    CHECK_LINE_LEN;                                                           \
  }
230
231
/* A level number written with a leading zero: report it and return
   BADTOKEN. */
#define ACTION_0_DIGITS                                                       \
   {                                                                          \
     error_level_leading_zero();                                              \
     return BADTOKEN;                                                         \
   }
236
237
/* Action for a level number.
   Converts the matched digits to a number, checks it against
   [0..MAXGEDCLEVEL], records it as the current level and returns the
   first bracket token for it: CLOSE if the level did not go up, OPEN if
   it went up by exactly one.  A bigger jump is an error.
   Error cases (out of range, jump too big) report the error, count the
   line and return BADTOKEN.
   The number is parsed with strtol() instead of atoi(): atoi() has
   undefined behaviour when the digits do not fit in an int, whereas
   strtol() then yields LONG_MAX, which fails the range check.  A failed
   conversion to the internal encoding (NULL) is mapped to -1 and thus
   also reported as out of range, instead of being dereferenced. */
#define ACTION_DIGITS                                                         \
   { const char *level_text = TO_INTERNAL(yytext, str_buffer);                \
     long level = level_text ? strtol(level_text, NULL, 10) : -1L;            \
     CHECK_LINE_LEN;                                                          \
     if ((level < 0) || (level > MAXGEDCLEVEL)) {                             \
       error_level_out_of_range();                                            \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
     level_diff = (int)level - current_level;                                 \
     BEGIN(EXPECT_TAG);                                                       \
     current_level = (int)level;                                              \
     if (level_diff < 1) {                                                    \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     else if (level_diff == 1) {                                              \
       level_diff++;                                                          \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     else {                                                                   \
       /* should never happen (error to GEDCOM spec) */                       \
       error_level_too_high(level_diff);                                      \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
   }
265
266
/* Action for a tag that is not one of the standard tags.
   A tag longer than MAXGEDCTAGLEN * encoding_width bytes is reported,
   the line is counted and BADTOKEN is returned.  Otherwise the converted
   tag text is stored in gedcom_lval, the lexer switches to the NORMAL
   start condition, the line is counted and USERTAG is returned.
   The tag length is taken from yyleng, not strlen(yytext): with an
   encoding wider than one byte the matched text can contain zero bytes,
   at which strlen() would stop, so that the limit was never enforced. */
#define ACTION_ALPHANUM                                                       \
   { if ((size_t)yyleng > MAXGEDCTAGLEN * encoding_width) {                   \
       error_tag_too_long(yytext);                                            \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
     CHECK_LINE_LEN;                                                          \
     gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buffer);                \
     gedcom_lval.tag.value  = USERTAG;                                        \
     BEGIN(NORMAL);                                                           \
     line_no++;                                                               \
     return USERTAG;                                                          \
   }
280
281
/* Action for a delimiter: counts its length, stores the converted text
   in gedcom_lval.string and returns DELIM. */
#define ACTION_DELIM                                                          \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string =                                                      \
      TO_INTERNAL(yytext, str_buffer);                                        \
    return DELIM;                                                             \
  }
287
288
/* Action for any other character in a value.
   Counts its length and converts it.  A failed conversion (NULL) is
   reported and gives BADTOKEN.  Otherwise the converted text is stored in
   gedcom_lval.string and ANYCHAR is returned, unless the converted text
   is empty, in which case no token is returned and lexing goes on. */
#define ACTION_ANY                                                            \
  { char *converted;                                                          \
    CHECK_LINE_LEN;                                                           \
    converted = TO_INTERNAL(yytext, str_buffer);                              \
    if (converted == NULL) {                                                  \
      /* Something went wrong during conversion... */                         \
      error_invalid_character(yytext, yytext[0]);                             \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = converted;                                           \
    /* Due to character conversions, the current character may still be      \
       combined with the next one, so that there is no complete character    \
       yet.  In principle this only applies to the 1byte case (e.g. ANSEL),  \
       but it doesn't harm the unicode case.                                 \
    */                                                                        \
    if (gedcom_lval.string[0] != '\0')                                        \
      return ANYCHAR;                                                         \
  }
310
311
/* Action for an escape sequence: counts its length, stores the converted
   text in gedcom_lval.string and returns ESCAPE. */
#define ACTION_ESCAPE                                                         \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string =                                                      \
      TO_INTERNAL(yytext, str_buffer);                                        \
    return ESCAPE;                                                            \
  }
317
318
/* Action for a pointer.
   Counts its length; a pointer longer than MAXGEDCPTRLEN * encoding_width
   bytes is reported and gives BADTOKEN.  Otherwise the converted text is
   stored in gedcom_lval.string and POINTER is returned.
   The pointer length is taken from yyleng, not strlen(yytext): with an
   encoding wider than one byte the matched text can contain zero bytes,
   at which strlen() would stop, so that the limit was never enforced. */
#define ACTION_POINTER                                                        \
  { CHECK_LINE_LEN;                                                           \
    if ((size_t)yyleng > MAXGEDCPTRLEN * encoding_width) {                    \
      error_pointer_too_long(yytext);                                         \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = TO_INTERNAL(yytext, ptr_buffer);                     \
    return POINTER;                                                           \
  }
328
329
330 /* Due to the conversion of level numbers into brackets, the
331    terminator is not important, so no token is returned here.
332    Although not strictly according to the GEDCOM spec, we'll ignore
333    whitespace just before the terminator.
334 */
335
/* Action for the line terminator: its length is still checked against
   the line limit, then the length count is restarted and the lexer goes
   back to the INITIAL start condition.  No token is returned. */
#define ACTION_TERMINATOR                                                     \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    INIT_LINE_LEN;                                                            \
    BEGIN(INITIAL);                                                           \
  }
341
342
343 /* Eventually we have to return 1 closing bracket (for the trailer).
344    We can detect whether we have sent the closing bracket using the
345    level_diff (at eof, first it is 2, then we increment it ourselves)
346 */
347
/* Action at end of file.
   While level_diff is still 2, the final CLOSE has not been sent yet:
   bump level_diff and return CLOSE.  On the next call the lexer is
   terminated.
   The call to yy_flex_realloc() after yyterminate() is never executed;
   it only exists so that the compiler does not warn about that generated
   function being unused.  Its arguments are initialized so that no
   indeterminate values are passed, not even in dead code. */
#define ACTION_EOF                                                            \
  { if (level_diff == 2) {                                                    \
      level_diff++;                                                           \
      return CLOSE;                                                           \
    }                                                                         \
    else {                                                                    \
      char* ptr = NULL; int size = 0;                                         \
      /* ... terminate lex */                                                 \
      yyterminate();                                                          \
      /* Silences a compiler warning about lex generated code;        */      \
      /* yyterminate does return(), so program will never come here   */      \
      yy_flex_realloc(ptr, size);                                             \
    }                                                                         \
  }
362
/* Action for a single '@' inside a value.
   Without compat_at this is an error: it is reported and BADTOKEN is
   returned.  With compat_at the matched text is pushed back onto the
   input twice (so that the lexer sees it doubled) and no token is
   returned.
   unput() may modify yytext, hence the private copy.  The copy is made
   with malloc()/memcpy() over exactly yyleng bytes: strdup() stops at the
   first zero byte, which in an encoding wider than one byte can come
   before the end of the match, so that the loop below would read past
   the end of the copy. */
#define ACTION_NORMAL_AT                                                      \
  { if (compat_at) {                                                          \
      int i, j;                                                               \
      char *yycopy = malloc((size_t)yyleng);                                  \
      if (yycopy) {                                                           \
        memcpy(yycopy, yytext, (size_t)yyleng);                               \
        for (i = 0; i < 2; i++)                                               \
          for (j = yyleng - 1; j >= 0; --j)                                   \
            unput(yycopy[j]);                                                 \
        free(yycopy);                                                         \
      }                                                                       \
      else {                                                                  \
        MEMORY_ERROR;                                                         \
      }                                                                       \
    }                                                                         \
    else {                                                                    \
      error_at_character();                                                   \
      return BADTOKEN;                                                        \
    }                                                                         \
  }
382
/* Action for text that fits no other rule: reports the text together
   with its first byte and returns BADTOKEN. */
#define ACTION_UNEXPECTED                                                     \
  {                                                                           \
    error_unexpected_character(yytext,                                        \
                               yytext[0]);                                    \
    return BADTOKEN;                                                          \
  }
387
388 #elif LEX_SECTION == 3
389
/* End-of-input hook of the lexer: always returns 1.
   Declared with (void) so that the definition also acts as a prototype. */
int yywrap(void)
{
  return 1;
}
394
395 static void free_conv_buffers()
396 {
397   free_conv_buffer(ptr_buffer);
398   free_conv_buffer(tag_buffer);
399   free_conv_buffer(str_buffer);
400 }
401
402 static void yylex_cleanup()
403 {
404   /* fix memory leak in lex */
405   yy_delete_buffer(yy_current_buffer);
406   yy_current_buffer = NULL;
407   free_conv_buffers();
408 }
409
410 static void init_conv_buffers()
411 {
412   if (!ptr_buffer) {
413     ptr_buffer = create_conv_buffer(INITIAL_PTR_BUFFER_LEN);
414     tag_buffer = create_conv_buffer(INITIAL_TAG_BUFFER_LEN);
415     str_buffer = create_conv_buffer(INITIAL_STR_BUFFER_LEN);
416   }
417 }
418
419 static int exitfuncregistered = 0;
420
421 void yymyinit(FILE *f)
422 {
423   if (! exitfuncregistered && atexit(yylex_cleanup) == 0)
424     exitfuncregistered = 1;
425   init_conv_buffers();
426   yyin = f;
427   yyrestart(f);
428   /* Reset our state */
429   current_level = -1;
430   level_diff = MAXGEDCLEVEL;
431   BEGIN(INITIAL);
432 }
433
434 #endif