Moved strings out of macros, because xgettext doesn't recognize them there.
[gedcom-parse.git] / gedcom / gedcom_lex_common.c
1 /* Common lexer code.
2    Copyright (C) 2001, 2002 The Genes Development Team
3    This file is part of the Gedcom parser library.
4    Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
5
6    The Gedcom parser library is free software; you can redistribute it
7    and/or modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10
11    The Gedcom parser library is distributed in the hope that it will be
12    useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15
16    You should have received a copy of the GNU Lesser General Public
17    License along with the Gedcom parser library; if not, write to the
18    Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20
21 /* $Id$ */
22 /* $Name$ */
23
24 #ifndef IN_LEX
25
26 #include "gedcom_internal.h"
27 #include "multilex.h"
28 #include "encoding.h"
29 #include "gedcom.h"
30 #include "gedcom.tabgen.h"
31 #include "compat.h"
32
33 static size_t encoding_width;
34 static int current_level = -1;
35 static int level_diff=MAXGEDCLEVEL;
36 static size_t line_len = 0;
37
38 static char ptr_buf[MAXGEDCPTRLEN * UTF_FACTOR + 1];
39 static char tag_buf[MAXGEDCTAGLEN * UTF_FACTOR + 1];
40 static char str_buf[MAXGEDCLINELEN * UTF_FACTOR + 1];
41
42 #ifdef LEXER_TEST 
43 YYSTYPE gedcom_lval;
44 int line_no = 1;
45 int compat_at = 0;
46
47 int gedcom_lex();
48
49 void message_handler(Gedcom_msg_type type, char *msg)
50 {
51   fprintf(stderr, "(%d) %s\n", type, msg);
52 }
53
54 int test_loop(ENCODING enc, char* code)
55 {
56   int tok, res;
57   init_encodings();
58   set_encoding_width(enc);
59   gedcom_set_message_handler(message_handler);
60   res = open_conv_to_internal(code);
61   if (!res) {
62     gedcom_error("Unable to open conversion context: %s",
63                  strerror(errno));
64     return 1;
65   }
66   tok = gedcom_lex();
67   while (tok) {
68     switch(tok) {
69       case BADTOKEN: printf("BADTOKEN "); break;
70       case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
71       case CLOSE: printf("CLOSE "); break;
72       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
73       case DELIM: printf("DELIM "); break;
74       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
75       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
76       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.tag.string); break;
77       default: printf("TAG(%s) ", gedcom_lval.tag.string); break;
78     }
79     tok = gedcom_lex();
80   }
81   printf("\n");
82   close_conv_to_internal();
83   return 0;  
84 }
85  
86 #endif /* of #ifdef LEXER_TEST */
87
/* These are defined as functions here, because xgettext has trouble
   extracting the strings out of long preprocessor defines (macros). */
90
91 static void error_line_too_long()
92 {
93   gedcom_error(_("Line too long, max %d characters allowed"), MAXGEDCLINELEN); 
94 }
95
/* Reports a level number that is written with a leading zero. */
static void error_level_leading_zero()
{
  gedcom_error(
    _("Level number with leading zero not allowed"));
}
100
101 static void error_level_out_of_range()
102 {
103   gedcom_error (_("Level number out of range [0..%d]"), MAXGEDCLEVEL); 
104 }
105
/* Reports a line whose level number lies more than one above the level of
   the preceding line.  LEVEL_DIFF is the difference that gets printed; the
   parameter shadows the file-scope variable of the same name. */
static void error_level_too_high(int level_diff)
{
  gedcom_error(_("GEDCOM level number is %d higher than previous"), level_diff);
}
111
112 static void error_tag_too_long(char *tag)
113 {
114   gedcom_error(_("Tag '%s' too long, max %d characters allowed"),
115                tag, MAXGEDCTAGLEN); 
116 }
117
/* Reports text that could not be converted to the internal encoding.
   STR is the offending text, CH the byte that is shown in hexadecimal.

   CH is converted to unsigned char before it is printed.  Passed as a
   plain char it is promoted to int, and on platforms where char is signed
   a byte >= 0x80 was sign-extended and printed as e.g. 0xffffffe9 instead
   of 0xe9. */
static void error_invalid_character(char *str, char ch)
{
  gedcom_error(_("Invalid character for encoding: '%s' (0x%02x)"),
               str, (unsigned int)(unsigned char)ch);
}
122
123 static void error_pointer_too_long(char *ptr)
124 {
125   gedcom_error(_("Pointer '%s' too long, max %d characters allowed"),
126                ptr, MAXGEDCPTRLEN);
127 }
128
/* Reports a single '@' found in a value, where '@@' is required. */
static void error_at_character()
{
  gedcom_error(
    _("'@' character should be written as '@@' in values"));
}
133
/* Reports input that is not expected at the current position.
   STR is the offending text, CH the byte that is shown in hexadecimal.

   CH is converted to unsigned char before it is printed.  Passed as a
   plain char it is promoted to int, and on platforms where char is signed
   a byte >= 0x80 was sign-extended and printed as e.g. 0xffffffe9 instead
   of 0xe9. */
static void error_unexpected_character(char* str, char ch)
{
  gedcom_error(_("Unexpected character: '%s' (0x%02x)"),
               str, (unsigned int)(unsigned char)ch);
}
138
139 #else  /* of #ifndef IN_LEX */
140
/* Calls to_internal() on STR, using the length of the current match
   (yyleng), and passes OUTBUF together with its size.  OUTBUF has to be
   an array rather than a pointer, because the size is taken with sizeof. */
#define TO_INTERNAL(STR, OUTBUF) \
  to_internal((STR), yyleng, (OUTBUF), sizeof(OUTBUF))
143
/* Restarts the line length count.  The expansion carries its own ';'. */
#define INIT_LINE_LEN line_len = 0;
146
/* Adds the length of the current match to line_len.  When the line grows
   beyond MAXGEDCLINELEN * encoding_width, the error is reported, line_len
   is set to (size_t)-1 so that the rest of the line is not checked again,
   and BADTOKEN is returned from the lexer.

   The length is taken from yyleng rather than strlen(yytext): strlen stops
   at the first NUL byte, so a match containing NUL bytes (presumably the
   normal case for ASCII characters in a 2-byte encoding) was undercounted.
   For matches without NUL bytes both give the same result.

   This is a plain brace block and not do { } while (0), like the other
   action macros of this file. */
#define CHECK_LINE_LEN                                                        \
  { if (line_len != (size_t)-1) {                                             \
      line_len += (size_t)yyleng;                                             \
      if (line_len > MAXGEDCLINELEN * encoding_width) {                       \
        error_line_too_long();                                                \
        line_len = (size_t)-1;                                                \
        return BADTOKEN;                                                      \
      }                                                                       \
    }                                                                         \
  }
157
/* Action for a standard tag: after the line length check, the token value
   is set to the converted tag text plus TAG_<THETAG>, the lexer switches
   to the NORMAL start condition and TAG_<THETAG> is returned. */
#define MKTAGACTION(THETAG)                                                   \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.tag.value  = TAG_##THETAG;                                    \
    gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buf);                    \
    BEGIN(NORMAL);                                                            \
    return TAG_##THETAG;                                                      \
  }
165
166 /* The GEDCOM level number is converted into a sequence of opening
167    and closing brackets.  Simply put, the following GEDCOM fragment:
168    
169    0 HEAD
170    1 SOUR genes
171    2 VERS 1.6
172    2 NAME Genes
173    1 DATE 07 OCT 2001
174    ...
175    0 TRLR
176    
177    is converted into:
178    
179    { HEAD                     (initial)  
180    { SOUR genes               (1 higher: no closing brackets)
181    { VERS 1.6                 (1 higher: no closing brackets)
182    } { NAME Genes             (same level: 1 closing bracket)
183    } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
184    ...
185    } { TRLR }
186    
187    or more clearly:
188    
189    { HEAD
190      { SOUR genes
191        { VERS 1.6 }
192        { NAME Genes } }
193      { DATE 07 OCT 2001
194      ... }
195    { TRLR }
196
197    But because this means that one token is converted into a series
198    of tokens, there is some initial code following immediately here
199    that returns "pending" tokens. */
200
/* Hands out the bracket tokens that are still pending for the latest
   level number.  While level_diff is below 1 a CLOSE is returned; when it
   is exactly 1 an OPEN carrying current_level is returned.  level_diff is
   incremented on both paths.  Above 1 nothing is pending and the macro
   falls through. */
#define ACTION_BEFORE_REGEXPS                                                 \
   {                                                                          \
     if (level_diff < 1) {                                                    \
       ++level_diff;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     if (level_diff == 1) {                                                   \
       gedcom_lval.number = current_level;                                    \
       ++level_diff;                                                          \
       return OPEN;                                                           \
     }                                                                        \
     /* level_diff > 1: out of brackets... */                                 \
   }
215
216
/* Whitespace at the start of a line: it only counts towards the line
   length and is ignored otherwise (no token is returned for it). */
#define ACTION_INITIAL_WHITESPACE                                             \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
  }
221
222
/* A level number with a leading zero: reported as an error, BADTOKEN is
   returned. */
#define ACTION_0_DIGITS                                                       \
   {                                                                          \
     error_level_leading_zero();                                              \
     return BADTOKEN;                                                         \
   }
227
228
/* Handles the level number of a line.
   The number is converted and range-checked against [0..MAXGEDCLEVEL],
   level_diff and current_level are updated and the lexer switches to the
   EXPECT_TAG start condition.  The first bracket token for the line is
   then returned: CLOSE when the level did not increase, OPEN (with the
   level in gedcom_lval.number) when it increased by exactly one.  A larger
   increase is an error and gives BADTOKEN.

   The number is parsed with strtol() instead of atoi(): for a digit string
   that does not fit the result type atoi() has undefined behaviour, while
   strtol() returns LONG_MAX, which the range check rejects.  A NULL result
   from the conversion to the internal encoding is now reported as an
   invalid character instead of being passed on to the number parser. */
#define ACTION_DIGITS                                                         \
   { char* level_str = TO_INTERNAL(yytext, str_buf);                          \
     long level;                                                              \
     CHECK_LINE_LEN;                                                          \
     if (!level_str) {                                                        \
       error_invalid_character(yytext, yytext[0]);                            \
       return BADTOKEN;                                                       \
     }                                                                        \
     level = strtol(level_str, NULL, 10);                                     \
     if ((level < 0) || (level > MAXGEDCLEVEL)) {                             \
       error_level_out_of_range();                                            \
       return BADTOKEN;                                                       \
     }                                                                        \
     level_diff = (int)level - current_level;                                 \
     BEGIN(EXPECT_TAG);                                                       \
     current_level = (int)level;                                              \
     if (level_diff < 1) {                                                    \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     else if (level_diff == 1) {                                              \
       level_diff++;                                                          \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     else {                                                                   \
       /* should never happen (error to GEDCOM spec) */                       \
       error_level_too_high(level_diff);                                      \
       return BADTOKEN;                                                       \
     }                                                                        \
   }
254
255
/* Action for a tag that is not one of the standard tags: the tag is
   length-checked, converted and returned as USERTAG, and the lexer
   switches to the NORMAL start condition.

   The tag length is taken from yyleng rather than strlen(yytext): the
   limit is scaled by encoding_width, but strlen stops at the first NUL
   byte, so for matches containing NUL bytes (presumably the normal case in
   a 2-byte encoding) the check could not trigger reliably. */
#define ACTION_ALPHANUM                                                       \
   { if ((size_t)yyleng > MAXGEDCTAGLEN * encoding_width) {                   \
       error_tag_too_long(yytext);                                            \
       return BADTOKEN;                                                       \
     }                                                                        \
     CHECK_LINE_LEN;                                                          \
     gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buf);                   \
     gedcom_lval.tag.value  = USERTAG;                                        \
     BEGIN(NORMAL);                                                           \
     return USERTAG;                                                          \
   }
267
268
/* Delimiter: after the line length check, the converted text becomes the
   token value and DELIM is returned. */
#define ACTION_DELIM                                                          \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
    return DELIM;                                                             \
  }
274
275
/* Any other character in a value.
   A failed conversion (NULL) is reported as an invalid character and gives
   BADTOKEN.  Otherwise the converted text becomes the token value and
   ANYCHAR is returned, unless the converted text is empty: due to
   character conversions it is possible that the current character will be
   combined with the next one, so that there is no character yet.  In that
   case nothing is returned and scanning continues.  In principle this only
   applies to the 1byte case (e.g. ANSEL), but it doesn't harm the unicode
   case. */
#define ACTION_ANY                                                            \
  { char* converted;                                                          \
    CHECK_LINE_LEN;                                                           \
    converted = TO_INTERNAL(yytext, str_buf);                                 \
    if (!converted) {                                                         \
      /* Something went wrong during conversion... */                         \
      error_invalid_character(yytext, yytext[0]);                             \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = converted;                                           \
    if (converted[0] != '\0')                                                 \
      return ANYCHAR;                                                         \
  }
297
298
/* Escape sequence: after the line length check, the converted text becomes
   the token value and ESCAPE is returned. */
#define ACTION_ESCAPE                                                         \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
    return ESCAPE;                                                            \
  }
304
305
/* Pointer: after the line length check, the pointer is length-checked,
   converted into ptr_buf and returned as POINTER.

   The pointer length is taken from yyleng rather than strlen(yytext): the
   limit is scaled by encoding_width, but strlen stops at the first NUL
   byte, so for matches containing NUL bytes (presumably the normal case in
   a 2-byte encoding) the check could not trigger reliably. */
#define ACTION_POINTER                                                        \
  { CHECK_LINE_LEN;                                                           \
    if ((size_t)yyleng > MAXGEDCPTRLEN * encoding_width) {                    \
      error_pointer_too_long(yytext);                                         \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = TO_INTERNAL(yytext, ptr_buf);                        \
    return POINTER;                                                           \
  }
315
316
/* Due to the conversion of level numbers into brackets, the
   terminator is not important, so no token is returned here.
   Although not strictly according to the GEDCOM spec, we'll ignore
   whitespace just before the terminator.

   The line length is checked inline instead of through CHECK_LINE_LEN.
   That macro returns BADTOKEN immediately, which at this place skipped the
   reset of line_len, the line_no increment and the switch back to INITIAL
   whenever the terminator itself pushed the line over the limit; the next
   line was then scanned in the wrong start condition and without length
   check.  Now the per-line state is always reset and BADTOKEN is returned
   afterwards.  The error is still reported before line_no is incremented.
   The length of the match is taken from yyleng (number of bytes matched).
*/

#define ACTION_TERMINATOR                                                     \
  { int line_too_long = 0;                                                    \
    if (line_len != (size_t)-1                                                \
        && line_len + (size_t)yyleng > MAXGEDCLINELEN * encoding_width) {     \
      error_line_too_long();                                                  \
      line_too_long = 1;                                                      \
    }                                                                         \
    INIT_LINE_LEN;                                                            \
    line_no++;                                                                \
    BEGIN(INITIAL);                                                           \
    if (line_too_long)                                                        \
      return BADTOKEN;                                                        \
  }
329
330
/* Eventually we have to return 1 closing bracket (for the trailer).
   Whether that bracket has been sent already can be told from level_diff:
   at end of file it is 2 at first, and it is incremented here when the
   closing bracket is returned.  On the next call the lexer state is reset
   to its initial values and lex is terminated.
*/

#define ACTION_EOF                                                            \
  { if (level_diff == 2) {                                                    \
      level_diff++;                                                           \
      return CLOSE;                                                           \
    }                                                                         \
    else {                                                                    \
      /* Initialized, so that no indeterminate values are passed below */     \
      char* ptr = 0; int size = 0;                                            \
      /* Reset our state */                                                   \
      current_level = -1;                                                     \
      level_diff = MAXGEDCLEVEL;                                              \
      /* ... then terminate lex */                                            \
      yyterminate();                                                          \
      /* Only here to get rid of a compiler warning from the lex generated */ \
      /* code: yyterminate does return(), so this call is never executed   */ \
      yy_flex_realloc(ptr, size);                                             \
    }                                                                         \
  }
353
/* A single '@' in a value.  When compat_at is set, the matched text is
   pushed back onto the input twice and scanning continues, so that it is
   read again in doubled form; otherwise it is reported as an error and
   BADTOKEN is returned.

   The match is copied before the push-back because unput() overwrites
   yytext.  The copy is made with malloc/memcpy over yyleng bytes: the
   push-back loop indexes up to yyleng - 1, whereas a strdup() copy ends at
   the first NUL byte and was read out of bounds as soon as the match
   contained one. */
#define ACTION_NORMAL_AT                                                      \
  { if (compat_at) {                                                          \
      int i, j;                                                               \
      char *yycopy = malloc((size_t)yyleng + 1);                              \
      if (yycopy) {                                                           \
        memcpy(yycopy, yytext, (size_t)yyleng);                               \
        yycopy[yyleng] = '\0';                                                \
        for (i = 0; i < 2; i++)                                               \
          for (j = yyleng - 1; j >= 0; --j)                                   \
            unput(yycopy[j]);                                                 \
        free(yycopy);                                                         \
      }                                                                       \
      else {                                                                  \
        MEMORY_ERROR;                                                         \
      }                                                                       \
    }                                                                         \
    else {                                                                    \
      error_at_character();                                                   \
      return BADTOKEN;                                                        \
    }                                                                         \
  }
373
/* Input that no other rule accepts: the matched text and its first byte
   are reported, and BADTOKEN is returned. */
#define ACTION_UNEXPECTED                                                     \
  {                                                                           \
    error_unexpected_character(yytext, yytext[0]);                            \
    return BADTOKEN;                                                          \
  }
378
379 #endif /* IN_LEX */