Move to LGPL license.
[gedcom-parse.git] / gedcom / gedcom_lex_common.c
1 /* Common lexer code.
2    Copyright (C) 2001 The Genes Development Team
3    This file is part of the Gedcom parser library.
4    Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
5
6    The Gedcom parser library is free software; you can redistribute it
7    and/or modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10
11    The Gedcom parser library is distributed in the hope that it will be
12    useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15
16    You should have received a copy of the GNU Lesser General Public
17    License along with the Gedcom parser library; if not, write to the
18    Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20
21 /* $Id$ */
22 /* $Name$ */
23
24 #ifndef IN_LEX
25
#include "gedcom_internal.h"
#include "multilex.h"
#include "encoding.h"
#include "gedcom.h"
#include "gedcom.tab.h"

#include <stdlib.h>   /* strtol */
31
32 #define YY_NO_UNPUT
33
34 static size_t encoding_width;
35 static int current_level = -1;
36 static int level_diff=MAXGEDCLEVEL;
37 static size_t line_len = 0;
38
39 static char ptr_buf[MAXGEDCPTRLEN * UTF_FACTOR + 1];
40 static char tag_buf[MAXGEDCTAGLEN * UTF_FACTOR + 1];
41 static char str_buf[MAXGEDCLINELEN * UTF_FACTOR + 1];
42
43 #ifdef LEXER_TEST 
44 YYSTYPE gedcom_lval;
45 int line_no = 1;
46
47 int gedcom_lex();
48
49 int test_loop(ENCODING enc, char* code)
50 {
51   int tok, res;
52   init_encodings();
53   set_encoding_width(enc);
54   res = open_conv_to_internal(code);
55   if (!res) {
56     gedcom_error("Unable to open conversion context: %s",
57                  strerror(errno));
58     return 1;
59   }
60   tok = gedcom_lex();
61   while (tok) {
62     switch(tok) {
63       case BADTOKEN: printf("BADTOKEN "); break;
64       case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
65       case CLOSE: printf("CLOSE "); break;
66       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
67       case DELIM: printf("DELIM "); break;
68       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
69       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
70       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break;
71       default: printf("TAG(%s) ", gedcom_lval.string); break;
72     }
73     tok = gedcom_lex();
74   }
75   printf("\n");
76   close_conv_to_internal();
77   return 0;  
78 }
79  
80 #endif /* of #ifdef LEXER_TEST */
81
82 #else  /* of #ifndef IN_LEX */
83
/* Convert the current match to the internal encoding.
   STR is the matched text (its length is taken from yyleng); OUTBUF
   receives the result and must be a real array, not a pointer, because
   its capacity is obtained with sizeof.  Evaluates to whatever
   to_internal() returns. */
#define TO_INTERNAL(STR, OUTBUF) \
  to_internal((STR), yyleng, (OUTBUF), sizeof(OUTBUF))
86
/* Restart the line length count.  The expansion already ends in a
   semicolon. */
#define INIT_LINE_LEN line_len = 0;
89
/* Add the length of the current match to the running line length and
   reject the line once it exceeds MAXGEDCLINELEN * encoding_width.

   On overflow an error is reported, line_len is set to (size_t)-1 so
   that the rest of the same line is not reported again, and the
   enclosing function returns BADTOKEN.  Because of that `return' the
   macro can only be used inside the lexer function. */
#define CHECK_LINE_LEN                                                        \
  { if (line_len != (size_t)-1) {                                             \
      /* Use yyleng rather than strlen(yytext): the limit is scaled by        \
         encoding_width, and text in a multi-byte encoding can contain        \
         NUL bytes that would end strlen() early. */                          \
      line_len += (size_t)yyleng;                                             \
      if (line_len > MAXGEDCLINELEN * encoding_width) {                       \
        gedcom_error("Line too long, max %d characters",                      \
                     MAXGEDCLINELEN);                                         \
        line_len = (size_t)-1;                                                \
        return BADTOKEN;                                                      \
      }                                                                       \
    }                                                                         \
  }
101
/* Action for a predefined tag: after the line length check, switch to
   the NORMAL start condition, store the converted tag text in
   gedcom_lval.string (backed by tag_buf) and return the token
   TAG_<THETAG>. */
#define MKTAGACTION(THETAG)                                                  \
  { CHECK_LINE_LEN;                                                          \
    BEGIN(NORMAL);                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, tag_buf);                       \
    return TAG_##THETAG;                                                     \
  }
108
109 /* The GEDCOM level number is converted into a sequence of opening
110    and closing brackets.  Simply put, the following GEDCOM fragment:
111    
112    0 HEAD
113    1 SOUR genes
114    2 VERS 1.6
115    2 NAME Genes
116    1 DATE 07 OCT 2001
117    ...
118    0 TRLR
119    
120    is converted into:
121    
122    { HEAD                     (initial)  
123    { SOUR genes               (1 higher: no closing brackets)
124    { VERS 1.6                 (1 higher: no closing brackets)
125    } { NAME Genes             (same level: 1 closing bracket)
126    } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
127    ...
128    } { TRLR }
129    
130    or more clearly:
131    
132    { HEAD
133      { SOUR genes
134        { VERS 1.6 }
135        { NAME Genes } }
136      { DATE 07 OCT 2001
137      ... }
138    { TRLR }
139
140    But because this means that one token is converted into a series
141    of tokens, there is some initial code following immediately here
142    that returns "pending" tokens. */
143
/* Runs before any pattern is matched: hands out one pending bracket
   token per call.  level_diff < 1 yields CLOSE, level_diff == 1 yields
   OPEN (with the current level as its value); in both cases level_diff
   is incremented first.  For level_diff > 1 nothing is pending and
   normal matching continues. */
#define ACTION_BEFORE_REGEXPS                                                 \
   { if (level_diff < 1) {                                                    \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     if (level_diff == 1) {                                                   \
       level_diff++;                                                          \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     /* level_diff > 1: out of brackets, fall through to the patterns */      \
   }
158
159
/* Whitespace at the start of a line produces no token; it only counts
   towards the line length. */
#define ACTION_INITIAL_WHITESPACE                                             \
  { CHECK_LINE_LEN;                                                           \
  }
164
165
/* A level number written with a leading zero is rejected: report the
   error and return BADTOKEN. */
#define ACTION_0_DIGITS                                                       \
   {                                                                          \
     gedcom_error("Level number with leading zero");                          \
     return BADTOKEN;                                                         \
   }
170
171
/* Action for the level number at the start of a line.

   The number is parsed, checked against [0..MAXGEDCLEVEL] and compared
   with the previous level.  The difference is stored in level_diff, the
   start condition becomes EXPECT_TAG and current_level is updated.  The
   first bracket token is returned right away:
     - level_diff < 1  : CLOSE (further pending tokens are delivered by
                         ACTION_BEFORE_REGEXPS on the following calls),
     - level_diff == 1 : OPEN, with the new level as its value,
     - level_diff > 1  : the level jumped by more than one; an error is
                         reported and BADTOKEN is returned.
   An out-of-range level is reported and BADTOKEN is returned without
   touching the level state.

   The number is parsed with strtol() into a long and range-checked
   before being narrowed to int, so that an absurdly long digit string
   cannot overflow or wrap around into a seemingly valid level. */
#define ACTION_DIGITS                                                         \
   { long parsed_level = strtol(TO_INTERNAL(yytext, str_buf), NULL, 10);      \
     int level;                                                               \
     CHECK_LINE_LEN;                                                          \
     if ((parsed_level < 0) || (parsed_level > MAXGEDCLEVEL)) {               \
       gedcom_error ("Level number out of range [0..%d]",                     \
                     MAXGEDCLEVEL);                                           \
       return BADTOKEN;                                                       \
     }                                                                        \
     level = (int)parsed_level;                                               \
     level_diff = level - current_level;                                      \
     BEGIN(EXPECT_TAG);                                                       \
     current_level = level;                                                   \
     if (level_diff < 1) {                                                    \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     else if (level_diff == 1) {                                              \
       level_diff++;                                                          \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     else {                                                                   \
       /* should never happen (error to GEDCOM spec) */                       \
       gedcom_error ("GEDCOM level number is %d higher than "                 \
                     "previous",                                              \
                     level_diff);                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
   }
200
201
/* Action for a tag that is not one of the predefined ones.

   A tag longer than MAXGEDCTAGLEN * encoding_width is reported and
   BADTOKEN is returned.  Otherwise, after the line length check, the
   converted tag text is stored in gedcom_lval.string (backed by
   tag_buf), the start condition becomes NORMAL and USERTAG is returned.

   The length is taken from yyleng rather than strlen(yytext): the limit
   is scaled by encoding_width, and text in a multi-byte encoding can
   contain NUL bytes that would end strlen() early. */
#define ACTION_ALPHANUM                                                       \
   { if ((size_t)yyleng > MAXGEDCTAGLEN * encoding_width) {                   \
       gedcom_error("Tag '%s' too long, max %d characters",                   \
                    yytext, MAXGEDCTAGLEN);                                   \
       return BADTOKEN;                                                       \
     }                                                                        \
     CHECK_LINE_LEN;                                                          \
     gedcom_lval.string = TO_INTERNAL(yytext, tag_buf);                       \
     BEGIN(NORMAL);                                                           \
     return USERTAG;                                                          \
   }
213
214
/* Action for a delimiter: after the line length check, store the
   converted text in gedcom_lval.string (backed by str_buf) and return
   DELIM. */
#define ACTION_DELIM                                                          \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
    return DELIM;                                                             \
  }
220
221
/* Action for any other character: after the line length check, store
   the converted text in gedcom_lval.string (backed by str_buf) and
   return ANYCHAR -- unless the conversion produced an empty string, in
   which case no token is returned for this match. */
#define ACTION_ANY                                                            \
  { CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
    /* The conversion may hold back the current character in order to         \
       combine it with the next one, leaving nothing to hand out yet.         \
       In principle this only concerns the 1byte case (e.g. ANSEL), but       \
       the check does no harm in the unicode case. */                         \
    if (gedcom_lval.string[0] != '\0') {                                      \
      return ANYCHAR;                                                         \
    }                                                                         \
  }
234
235
/* Action for an escape sequence: after the line length check, store
   the converted text in gedcom_lval.string (backed by str_buf) and
   return ESCAPE. */
#define ACTION_ESCAPE                                                         \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
    return ESCAPE;                                                            \
  }
241
242
/* Action for a pointer (cross-reference).

   After the line length check, a pointer longer than
   MAXGEDCPTRLEN * encoding_width is reported and BADTOKEN is returned.
   Otherwise the converted text is stored in gedcom_lval.string (backed
   by ptr_buf) and POINTER is returned.

   The length is taken from yyleng rather than strlen(yytext): the limit
   is scaled by encoding_width, and text in a multi-byte encoding can
   contain NUL bytes that would end strlen() early. */
#define ACTION_POINTER                                                        \
  { CHECK_LINE_LEN;                                                           \
    if ((size_t)yyleng > MAXGEDCPTRLEN * encoding_width) {                    \
      gedcom_error("Pointer '%s' too long, max %d characters",                \
                   yytext, MAXGEDCPTRLEN);                                    \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = TO_INTERNAL(yytext, ptr_buf);                        \
    return POINTER;                                                           \
  }
253
254
255 /* Due to the conversion of level numbers into brackets, the
256    terminator is not important, so no token is returned here.
257    Although not strictly according to the GEDCOM spec, we'll ignore
258    whitespace just before the terminator.
259 */
260
/* Action for the line terminator: after the line length check, go back
   to the INITIAL start condition, advance the line counter and restart
   the line length count.  No token is returned. */
#define ACTION_TERMINATOR                                                     \
  { CHECK_LINE_LEN;                                                           \
    BEGIN(INITIAL);                                                           \
    line_no++;                                                                \
    INIT_LINE_LEN;                                                            \
  }
267
268
269 /* Eventually we have to return 1 closing bracket (for the trailer).
270    We can detect whether we have sent the closing bracket using the
271    level_diff (at eof, first it is 2, then we increment it ourselves)
272 */
273
/* Action at end of input.  While level_diff is still 2, one last CLOSE
   is returned (and level_diff incremented so that this happens only
   once); on the next call the level state is put back to its initial
   values and the lexer is terminated. */
#define ACTION_EOF                                                            \
  { if (level_diff != 2) {                                                    \
      /* Reset our state ... */                                               \
      level_diff = MAXGEDCLEVEL;                                              \
      current_level = -1;                                                     \
      /* ... then terminate lex */                                            \
      yyterminate();                                                          \
    }                                                                         \
    else {                                                                    \
      level_diff++;                                                           \
      return CLOSE;                                                           \
    }                                                                         \
  }
287
288
/* Fallback action for input that no other pattern accepts: report the
   offending text together with the value of its first byte, then return
   BADTOKEN.

   The byte is converted through unsigned char before being printed with
   %02x, so that on platforms where plain char is signed a byte >= 0x80
   is shown as e.g. 0xe9 and not sign-extended to 0xffffffe9. */
#define ACTION_UNEXPECTED                                                     \
  { gedcom_error("Unexpected character: '%s' (0x%02x)",                       \
                 yytext, (unsigned int)(unsigned char)yytext[0]);             \
    return BADTOKEN;                                                          \
  }
294
295 #endif /* IN_LEX */