Recognize the UTF-8 byte order mark.
[gedcom-parse.git] / gedcom / gedcom_lex_common.c
1 /* Common lexer code.
2    Copyright (C) 2001, 2002 The Genes Development Team
3    This file is part of the Gedcom parser library.
4    Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
5
6    The Gedcom parser library is free software; you can redistribute it
7    and/or modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10
11    The Gedcom parser library is distributed in the hope that it will be
12    useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15
16    You should have received a copy of the GNU Lesser General Public
17    License along with the Gedcom parser library; if not, write to the
18    Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20
21 /* $Id$ */
22 /* $Name$ */
23
24 #if LEX_SECTION == 1
25
26 #include "gedcom_internal.h"
27 #include "multilex.h"
28 #include "encoding.h"
29 #include "gedcom.h"
30 #include "gedcom.tabgen.h"
31 #include "compat.h"
32
33 static size_t encoding_width;
34 static int current_level = -1;
35 static int level_diff=MAXGEDCLEVEL;
36 static size_t line_len = 0;
37
38 static struct conv_buffer* ptr_buffer = NULL;
39 static struct conv_buffer* tag_buffer = NULL;
40 static struct conv_buffer* str_buffer = NULL;
41
/* Initial sizes of the conversion buffers: the maximum GEDCOM length times
   UTF_FACTOR, plus one extra byte.
   The expansions are parenthesized so that the macros keep their value when
   they are used inside a larger expression (e.g. 2 * INITIAL_PTR_BUFFER_LEN). */
#define INITIAL_PTR_BUFFER_LEN ((MAXGEDCPTRLEN) * (UTF_FACTOR) + 1)
#define INITIAL_TAG_BUFFER_LEN ((MAXGEDCTAGLEN) * (UTF_FACTOR) + 1)
#define INITIAL_STR_BUFFER_LEN ((MAXGEDCLINELEN) * (UTF_FACTOR) + 1)
45
46 #ifdef LEXER_TEST 
47 YYSTYPE gedcom_lval;
48 int line_no = 1;
49
50 int gedcom_lex();
51
52 void message_handler(Gedcom_msg_type type, char *msg)
53 {
54   fprintf(stderr, "(%d) %s\n", type, msg);
55 }
56
57 int test_loop(ENCODING enc, const char* code)
58 {
59   int tok, res;
60   init_encodings();
61   set_encoding_width(enc);
62   gedcom_set_message_handler(message_handler);
63   res = open_conv_to_internal(code);
64   if (!res) {
65     gedcom_error("Unable to open conversion context: %s",
66                  strerror(errno));
67     return 1;
68   }
69   tok = gedcom_lex();
70   while (tok) {
71     switch(tok) {
72       case BADTOKEN: printf("BADTOKEN "); break;
73       case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
74       case CLOSE: printf("CLOSE "); break;
75       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
76       case DELIM: printf("DELIM "); break;
77       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
78       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
79       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.tag.string); break;
80       default: printf("TAG(%s) ", gedcom_lval.tag.string); break;
81     }
82     tok = gedcom_lex();
83   }
84   printf("\n");
85   close_conv_to_internal();
86   return 0;  
87 }
88  
89 #endif /* of #ifdef LEXER_TEST */
90
/* These are defined as functions here, because xgettext has trouble
   extracting the strings out of long preprocessor definitions. */
93
94 static void error_line_too_long()
95 {
96   gedcom_error(_("Line too long, max %d characters allowed"), MAXGEDCLINELEN); 
97 }
98
/* Report a level number that is written with a leading zero. */
static void error_level_leading_zero()
{
  gedcom_error(_("Level number with leading zero not allowed"));
}
103
104 static void error_level_out_of_range()
105 {
106   gedcom_error (_("Level number out of range [0..%d]"), MAXGEDCLEVEL); 
107 }
108
/* Report a level number that lies LEVEL_DIFF above the level of the
   previous line. */
static void error_level_too_high(int level_diff)
{
  gedcom_error(_("GEDCOM level number is %d higher than previous"), level_diff);
}
114
115 static void error_tag_too_long(const char *tag)
116 {
117   gedcom_error(_("Tag '%s' too long, max %d characters allowed"),
118                tag, MAXGEDCTAGLEN); 
119 }
120
/* Report a character that could not be converted from the input encoding.
   STR is the offending text, CH the byte whose value is shown in hex. */
static void error_invalid_character(const char *str, char ch)
{
  /* Go through unsigned char first: where plain char is signed, a byte
     >= 0x80 would otherwise be sign-extended by the default argument
     promotions and be printed as 0xffffffXX instead of 0xXX. */
  gedcom_error(_("Invalid character for encoding: '%s' (0x%02x)"),
               str, (unsigned int)(unsigned char)ch);
}
125
126 static void error_pointer_too_long(const char *ptr)
127 {
128   gedcom_error(_("Pointer '%s' too long, max %d characters allowed"),
129                ptr, MAXGEDCPTRLEN);
130 }
131
/* Report a single '@' in a value, where '@@' is required. */
static void error_at_character()
{
  gedcom_error(
    _("'@' character should be written as '@@' in values"));
}
136
/* Report a character that matches none of the lexer rules.
   STR is the offending text, CH the byte whose value is shown in hex. */
static void error_unexpected_character(const char* str, char ch)
{
  /* Go through unsigned char first: where plain char is signed, a byte
     >= 0x80 would otherwise be sign-extended by the default argument
     promotions and be printed as 0xffffffXX instead of 0xXX. */
  gedcom_error(_("Unexpected character: '%s' (0x%02x)"),
               str, (unsigned int)(unsigned char)ch);
}
141
/* When non-zero, TO_INTERNAL hands back the matched text unchanged instead
   of converting it (meant for input that is UTF-8 coming from the program
   itself, so that the iconv conversion can be bypassed). */
static int dummy_conv = 0;
145
146 #elif LEX_SECTION == 2
147
/* Convert the matched text STR (yyleng bytes) via to_internal(), using
   OUTBUF as conversion buffer.  If dummy_conv is set, no conversion is done
   and STR itself is the result. */
#define TO_INTERNAL(STR,OUTBUF) \
  (!dummy_conv ? to_internal((STR), yyleng, (OUTBUF)) : (STR))
150
/* Start counting the length of a new line (this also clears the
   (size_t)-1 "already reported" marker set by CHECK_LINE_LEN). */
#define INIT_LINE_LEN line_len = 0;
153
/* Add the length of the current token to the running line length and report
   a line that exceeds the GEDCOM maximum; in that case the enclosing lexer
   function returns BADTOKEN.

   The token length is taken from yyleng rather than from strlen(yytext):
   the limit is scaled by encoding_width, and text in an encoding wider than
   one byte can contain zero bytes, at which strlen() would stop counting.

   line_len == (size_t)-1 marks a line for which the error has been reported
   already; nothing is checked again until line_len is reset. */
#define CHECK_LINE_LEN                                                        \
  { if (line_len != (size_t)-1) {                                             \
      line_len += (size_t)yyleng;                                             \
      if (line_len > MAXGEDCLINELEN * encoding_width) {                       \
        error_line_too_long();                                                \
        line_len = (size_t)-1;                                                \
        return BADTOKEN;                                                      \
      }                                                                       \
    }                                                                         \
  }
164
/* Action for a recognised standard tag.  After the line length check it
   stores the converted tag text and the value TAG_<THETAG> in
   gedcom_lval.tag, switches to the NORMAL start condition, increments
   line_no and returns TAG_<THETAG>. */
#define MKTAGACTION(THETAG)                                                  \
  { CHECK_LINE_LEN;                                                          \
    gedcom_lval.tag.value  = TAG_##THETAG;                                   \
    gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buffer);                \
    line_no++;                                                               \
    BEGIN(NORMAL);                                                           \
    return TAG_##THETAG;                                                     \
  }
173
174 /* The GEDCOM level number is converted into a sequence of opening
175    and closing brackets.  Simply put, the following GEDCOM fragment:
176    
177    0 HEAD
178    1 SOUR genes
179    2 VERS 1.6
180    2 NAME Genes
181    1 DATE 07 OCT 2001
182    ...
183    0 TRLR
184    
185    is converted into:
186    
187    { HEAD                     (initial)  
188    { SOUR genes               (1 higher: no closing brackets)
189    { VERS 1.6                 (1 higher: no closing brackets)
190    } { NAME Genes             (same level: 1 closing bracket)
191    } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
192    ...
193    } { TRLR }
194    
195    or more clearly:
196    
197    { HEAD
198      { SOUR genes
199        { VERS 1.6 }
200        { NAME Genes } }
201      { DATE 07 OCT 2001
202      ... }
203    { TRLR }
204
205    But because this means that one token is converted into a series
206    of tokens, there is some initial code following immediately here
207    that returns "pending" tokens. */
208
/* Hand out the bracket tokens that are still pending from the last level
   number: one CLOSE per call while level_diff is below 1, then a single
   OPEN (carrying current_level) when it is exactly 1.  level_diff is
   incremented for every token returned; once it is above 1 nothing is
   pending and the macro falls through. */
#define ACTION_BEFORE_REGEXPS                                                 \
   { if (level_diff <= 1) {                                                   \
       int open_pending = (level_diff == 1);                                  \
       level_diff++;                                                          \
       if (open_pending) {                                                    \
         gedcom_lval.number = current_level;                                  \
         return OPEN;                                                         \
       }                                                                      \
       return CLOSE;                                                          \
     }                                                                        \
     /* otherwise: out of brackets... */                                      \
   }
223
224
/* Whitespace at the start of a line: it counts towards the line length,
   but no token is returned for it. */
#define ACTION_INITIAL_WHITESPACE                                             \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    /* nothing else to do: the whitespace itself is dropped */                \
  }
229
230
/* A level number written with a leading zero: report it and return
   BADTOKEN. */
#define ACTION_0_DIGITS                                                       \
   {                                                                          \
     error_level_leading_zero();                                              \
     return BADTOKEN;                                                         \
   }
235
236
/* A level number.  It is converted into bracket tokens: CLOSE if the level
   is not higher than the previous one (further pending tokens are handed
   out by ACTION_BEFORE_REGEXPS), OPEN if it is exactly one higher.

   BADTOKEN is returned (and line_no incremented) if the number is outside
   0..MAXGEDCLEVEL or more than one higher than the previous level.

   The number is parsed with strtol() instead of atoi(): atoi() has
   undefined behaviour for values that do not fit, and it must not be given
   the NULL pointer that TO_INTERNAL can yield when the conversion fails.
   A failed conversion is treated as level -1 and therefore reported as an
   out-of-range level. */
#define ACTION_DIGITS                                                         \
   { char* level_str = TO_INTERNAL(yytext, str_buffer);                       \
     long level = level_str ? strtol(level_str, NULL, 10) : -1L;              \
     CHECK_LINE_LEN;                                                          \
     if ((level < 0) || (level > MAXGEDCLEVEL)) {                             \
       error_level_out_of_range();                                            \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
     /* level is within 0..MAXGEDCLEVEL here, so it fits an int */            \
     level_diff = (int)level - current_level;                                 \
     BEGIN(EXPECT_TAG);                                                       \
     current_level = (int)level;                                              \
     if (level_diff < 1) {                                                    \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     else if (level_diff == 1) {                                              \
       level_diff++;                                                          \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     else {                                                                   \
       /* should never happen (error to GEDCOM spec) */                       \
       error_level_too_high(level_diff);                                      \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
   }
264
265
/* An alphanumeric word that is not a standard tag: returned as USERTAG with
   the converted text in gedcom_lval.tag.string.  A word longer than
   MAXGEDCTAGLEN characters is reported and yields BADTOKEN.  line_no is
   incremented in both cases.

   The length is taken from yyleng rather than from strlen(yytext): the
   limit is scaled by encoding_width, and text in an encoding wider than one
   byte can contain zero bytes, at which strlen() would stop counting. */
#define ACTION_ALPHANUM                                                       \
   { if ((size_t)yyleng > MAXGEDCTAGLEN * encoding_width) {                   \
       error_tag_too_long(yytext);                                            \
       line_no++;                                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
     CHECK_LINE_LEN;                                                          \
     gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buffer);                \
     gedcom_lval.tag.value  = USERTAG;                                        \
     BEGIN(NORMAL);                                                           \
     line_no++;                                                               \
     return USERTAG;                                                          \
   }
279
280
/* A delimiter: returned as DELIM, with the converted text in
   gedcom_lval.string. */
#define ACTION_DELIM                                                          \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buffer);                     \
    return DELIM;                                                             \
  }
286
287
/* Any other character: returned as ANYCHAR, with the converted text in
   gedcom_lval.string.  If the conversion fails, the character is reported
   and BADTOKEN is returned.

   Due to character conversions, it is possible that the current character
   will be combined with the next one, so that the conversion yields an
   empty string for now; in that case no token is returned.  In principle
   this is only applicable to the 1byte case (e.g. ANSEL), but it doesn't
   harm the unicode case. */
#define ACTION_ANY                                                            \
  { char* converted;                                                          \
    CHECK_LINE_LEN;                                                           \
    converted = TO_INTERNAL(yytext, str_buffer);                              \
    if (converted == NULL) {                                                  \
      /* Something went wrong during conversion... */                         \
      error_invalid_character(yytext, yytext[0]);                             \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = converted;                                           \
    if (converted[0] != '\0')                                                 \
      return ANYCHAR;                                                         \
  }
309
310
/* An escape sequence: returned as ESCAPE, with the converted text in
   gedcom_lval.string. */
#define ACTION_ESCAPE                                                         \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buffer);                     \
    return ESCAPE;                                                            \
  }
316
317
/* A pointer: returned as POINTER, with the converted text in
   gedcom_lval.string.  A pointer longer than MAXGEDCPTRLEN characters is
   reported and yields BADTOKEN.

   The length is taken from yyleng rather than from strlen(yytext): the
   limit is scaled by encoding_width, and text in an encoding wider than one
   byte can contain zero bytes, at which strlen() would stop counting. */
#define ACTION_POINTER                                                        \
  { CHECK_LINE_LEN;                                                           \
    if ((size_t)yyleng > MAXGEDCPTRLEN * encoding_width) {                    \
      error_pointer_too_long(yytext);                                         \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = TO_INTERNAL(yytext, ptr_buffer);                     \
    return POINTER;                                                           \
  }
327
328
/* The line terminator.  Because the level numbers are converted into
   brackets, the terminator itself carries no information, so no token is
   returned for it: the line length counter is restarted and the lexer goes
   back to the INITIAL start condition.
   Although not strictly according to the GEDCOM spec, whitespace just
   before the terminator is ignored.
*/

#define ACTION_TERMINATOR                                                     \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    INIT_LINE_LEN;                                                            \
    BEGIN(INITIAL);                                                           \
  }
340
341
/* At end of file one last closing bracket (for the trailer) has to be
   returned.  Whether it has been sent already can be seen from level_diff:
   on reaching eof it is 2, and it is incremented here when the bracket is
   returned, so that the next call terminates the lexer.
*/

#define ACTION_EOF                                                            \
  { if (level_diff != 2) {                                                    \
      char* ptr; int size;                                                    \
      /* the closing bracket has been sent: terminate lex */                  \
      yyterminate();                                                          \
      /* yyterminate does return(), so the program never comes here; the */   \
      /* call below only silences a compiler warning about lex generated */   \
      /* code */                                                              \
      yy_flex_realloc(ptr, size);                                             \
    }                                                                         \
    else {                                                                    \
      level_diff++;                                                           \
      return CLOSE;                                                           \
    }                                                                         \
  }
361
/* A single '@' in a value.  In compatibility mode C_NO_DOUBLE_AT the
   matched text is pushed back onto the input twice (so that it is read
   again as a doubled '@'); otherwise it is reported and BADTOKEN is
   returned.

   unput() is fed from a copy of the matched text.  The copy is made with
   malloc()/memcpy() over exactly yyleng bytes instead of strdup(): in an
   encoding wider than one byte the text contains zero bytes, so strdup()
   would copy fewer than the yyleng bytes that the loop reads back. */
#define ACTION_NORMAL_AT                                                      \
  { if (compat_mode(C_NO_DOUBLE_AT)) {                                        \
      int i, j;                                                               \
      char *yycopy = malloc((size_t)yyleng);                                  \
      if (yycopy) {                                                           \
        memcpy(yycopy, yytext, (size_t)yyleng);                               \
        for (i = 0; i < 2; i++)                                               \
          for (j = yyleng - 1; j >= 0; --j)                                   \
            unput(yycopy[j]);                                                 \
        free(yycopy);                                                         \
      }                                                                       \
      else {                                                                  \
        MEMORY_ERROR;                                                         \
      }                                                                       \
    }                                                                         \
    else {                                                                    \
      error_at_character();                                                   \
      return BADTOKEN;                                                        \
    }                                                                         \
  }
381
/* Text that fits no other rule: report it, together with its first byte in
   hex, and return BADTOKEN. */
#define ACTION_UNEXPECTED                                                     \
  {                                                                           \
    error_unexpected_character(yytext, yytext[0]);                            \
    return BADTOKEN;                                                          \
  }
386
387 #elif LEX_SECTION == 3
388
/* End-of-input hook of the generated scanner.  Always returns 1, i.e. there
   is never a further input to continue with. */
int yywrap()
{
  return 1;
}
393
394 static void free_conv_buffers()
395 {
396   free_conv_buffer(ptr_buffer);
397   free_conv_buffer(tag_buffer);
398   free_conv_buffer(str_buffer);
399 }
400
401 static void yylex_cleanup()
402 {
403   /* fix memory leak in lex */
404   yy_delete_buffer(yy_current_buffer);
405   yy_current_buffer = NULL;
406   free_conv_buffers();
407 }
408
409 static void init_conv_buffers()
410 {
411   if (!ptr_buffer) {
412     ptr_buffer = create_conv_buffer(INITIAL_PTR_BUFFER_LEN);
413     tag_buffer = create_conv_buffer(INITIAL_TAG_BUFFER_LEN);
414     str_buffer = create_conv_buffer(INITIAL_STR_BUFFER_LEN);
415   }
416 }
417
418 static int exitfuncregistered = 0;
419
420 void yymyinit(FILE *f)
421 {
422   if (! exitfuncregistered && atexit(yylex_cleanup) == 0)
423     exitfuncregistered = 1;
424   init_conv_buffers();
425   yyin = f;
426   yyrestart(f);
427   /* Reset our state */
428   current_level = -1;
429   level_diff = MAXGEDCLEVEL;
430   BEGIN(INITIAL);
431 }
432
433 #endif