d528b02cbe83bf6e3086df8e4f2d3539928439c5
[gedcom-parse.git] / gedcom / gedcom_lex_common.c
1 /* Common lexer code.
2    Copyright (C) 2001 The Genes Development Team
3    This file is part of the Gedcom parser library.
4    Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
5
6    The Gedcom parser library is free software; you can redistribute it
7    and/or modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10
11    The Gedcom parser library is distributed in the hope that it will be
12    useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15
16    You should have received a copy of the GNU Lesser General Public
17    License along with the Gedcom parser library; if not, write to the
18    Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20
21 /* $Id$ */
22 /* $Name$ */
23
24 #ifndef IN_LEX
25
26 #include "gedcom_internal.h"
27 #include "multilex.h"
28 #include "encoding.h"
29 #include "gedcom.h"
30 #include "gedcom.tab.h"
31
32 #define YY_NO_UNPUT
33
34 static size_t encoding_width;
35 static int current_level = -1;
36 static int level_diff=MAXGEDCLEVEL;
37 static size_t line_len = 0;
38
39 static char ptr_buf[MAXGEDCPTRLEN * UTF_FACTOR + 1];
40 static char tag_buf[MAXGEDCTAGLEN * UTF_FACTOR + 1];
41 static char str_buf[MAXGEDCLINELEN * UTF_FACTOR + 1];
42
43 #ifdef LEXER_TEST 
44 YYSTYPE gedcom_lval;
45 int line_no = 1;
46
47 int gedcom_lex();
48
49 void message_handler(Gedcom_msg_type type, char *msg)
50 {
51   fprintf(stderr, msg);
52 }
53
54 int test_loop(ENCODING enc, char* code)
55 {
56   int tok, res;
57   init_encodings();
58   set_encoding_width(enc);
59   gedcom_set_message_handler(message_handler);
60   res = open_conv_to_internal(code);
61   if (!res) {
62     gedcom_error("Unable to open conversion context: %s",
63                  strerror(errno));
64     return 1;
65   }
66   tok = gedcom_lex();
67   while (tok) {
68     switch(tok) {
69       case BADTOKEN: printf("BADTOKEN "); break;
70       case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
71       case CLOSE: printf("CLOSE "); break;
72       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
73       case DELIM: printf("DELIM "); break;
74       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
75       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
76       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break;
77       default: printf("TAG(%s) ", gedcom_lval.string); break;
78     }
79     tok = gedcom_lex();
80   }
81   printf("\n");
82   close_conv_to_internal();
83   return 0;  
84 }
85  
86 #endif /* of #ifdef LEXER_TEST */
87
88 #else  /* of #ifndef IN_LEX */
89
/* Pass the current match through to_internal().
   STR is the matched text (its length is taken from yyleng); OUTBUF has to
   be a real array, since its capacity is obtained with sizeof.  The macro
   expands to the value returned by to_internal(). */
#define TO_INTERNAL(STR,OUTBUF)                                               \
  to_internal((STR), yyleng, (OUTBUF), sizeof (OUTBUF))
92
/* Restart the length count for a new input line.  The expansion is a
   complete statement, including its terminating semicolon. */
#define INIT_LINE_LEN line_len = 0;
95
/* Add the length of the current match to the running length of the line
   and reject lines that exceed the limit.
   The length is taken from yyleng rather than strlen(yytext): the limit is
   scaled by encoding_width, so it is expressed in raw input units, and
   strlen() stops at the first zero byte, which under-counts text in
   encodings whose code units contain zero bytes.
   When the limit is exceeded the error is reported, line_len is set to the
   sentinel (size_t)-1 so that the check is skipped for the rest of the
   line, and the enclosing function returns BADTOKEN. */
#define CHECK_LINE_LEN                                                        \
  { if (line_len != (size_t)-1) {                                             \
      line_len += (size_t)yyleng;                                             \
      if (line_len > MAXGEDCLINELEN * encoding_width) {                       \
        gedcom_error("Line too long, max %d characters",                      \
                     MAXGEDCLINELEN);                                         \
        line_len = (size_t)-1;                                                \
        return BADTOKEN;                                                      \
      }                                                                       \
    }                                                                         \
  }
107
/* Action for a predefined tag.  After the line-length check the lexer
   switches to the NORMAL start condition, stores the converted tag text in
   gedcom_lval.string (using tag_buf) and returns the token TAG_<THETAG>. */
#define MKTAGACTION(THETAG)                                                  \
  { CHECK_LINE_LEN;                                                          \
    BEGIN(NORMAL);                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, tag_buf);                       \
    return TAG_##THETAG;                                                     \
  }
114
115 /* The GEDCOM level number is converted into a sequence of opening
116    and closing brackets.  Simply put, the following GEDCOM fragment:
117    
118    0 HEAD
119    1 SOUR genes
120    2 VERS 1.6
121    2 NAME Genes
122    1 DATE 07 OCT 2001
123    ...
124    0 TRLR
125    
126    is converted into:
127    
128    { HEAD                     (initial)  
129    { SOUR genes               (1 higher: no closing brackets)
130    { VERS 1.6                 (1 higher: no closing brackets)
131    } { NAME Genes             (same level: 1 closing bracket)
132    } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
133    ...
134    } { TRLR }
135    
136    or more clearly:
137    
138    { HEAD
139      { SOUR genes
140        { VERS 1.6 }
141        { NAME Genes } }
142      { DATE 07 OCT 2001
143      ... }
144    { TRLR }
145
146    But because this means that one token is converted into a series
147    of tokens, there is some initial code following immediately here
148    that returns "pending" tokens. */
149
/* Hand out a pending bracket token, if there is one, before any pattern is
   matched: CLOSE while level_diff is below 1, then OPEN (carrying
   current_level in gedcom_lval.number) when it is exactly 1.  Each token
   handed out advances level_diff by one.  With level_diff above 1 nothing
   is pending and control simply continues after the macro. */
#define ACTION_BEFORE_REGEXPS                                                 \
   { if (level_diff < 1) {                                                    \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     if (level_diff == 1) {                                                   \
       level_diff++;                                                          \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     /* out of brackets: nothing to return here */                            \
   }
164
165
/* Action for whitespace at the start of a line: it is counted towards the
   line length and otherwise ignored (no token is returned for it). */
#define ACTION_INITIAL_WHITESPACE                                             \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
  }
170
171
/* Action for a level number written with a leading zero: report the error
   and return BADTOKEN. */
#define ACTION_0_DIGITS                                                       \
   {                                                                          \
     gedcom_error ("Level number with leading zero");                         \
     return BADTOKEN;                                                         \
   }
176
177
/* Action for the level number at the start of a line.
   The digits are parsed with strtol() instead of atoi(): atoi() has
   undefined behaviour when the value does not fit in an int, whereas
   strtol() saturates at LONG_MAX, which the range check below rejects.
   A level outside [0..MAXGEDCLEVEL] is reported and yields BADTOKEN.
   Otherwise the lexer switches to EXPECT_TAG, records the new level and
   returns the first bracket token: CLOSE when the level is not higher than
   the previous one (the remaining pending tokens are handed out by
   ACTION_BEFORE_REGEXPS), or OPEN with the level in gedcom_lval.number when
   it is exactly one higher.  A jump of more than one level is reported and
   yields BADTOKEN. */
#define ACTION_DIGITS                                                         \
   { long parsed = strtol(TO_INTERNAL(yytext, str_buf), NULL, 10);            \
     int level;                                                               \
     CHECK_LINE_LEN;                                                          \
     if ((parsed < 0) || (parsed > MAXGEDCLEVEL)) {                           \
       gedcom_error ("Level number out of range [0..%d]",                     \
                     MAXGEDCLEVEL);                                           \
       return BADTOKEN;                                                       \
     }                                                                        \
     level = (int)parsed;                                                     \
     level_diff = level - current_level;                                      \
     BEGIN(EXPECT_TAG);                                                       \
     current_level = level;                                                   \
     if (level_diff < 1) {                                                    \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     else if (level_diff == 1) {                                              \
       level_diff++;                                                          \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     else {                                                                   \
       /* should never happen (error to GEDCOM spec) */                       \
       gedcom_error ("GEDCOM level number is %d higher than "                 \
                     "previous",                                              \
                     level_diff);                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
   }
206
207
/* Action for a tag that is not one of the predefined ones.
   A tag longer than MAXGEDCTAGLEN * encoding_width is reported and yields
   BADTOKEN.  The length is taken from yyleng, not strlen(yytext): the limit
   is scaled by encoding_width, and strlen() stops at the first zero byte,
   so it under-counts text in encodings whose code units contain zero bytes.
   Otherwise, after the line-length check, the converted tag text is stored
   in gedcom_lval.string (using tag_buf), the lexer switches to NORMAL and
   USERTAG is returned. */
#define ACTION_ALPHANUM                                                       \
   { if ((size_t)yyleng > MAXGEDCTAGLEN * encoding_width) {                   \
       gedcom_error("Tag '%s' too long, max %d characters",                   \
                    yytext, MAXGEDCTAGLEN);                                   \
       return BADTOKEN;                                                       \
     }                                                                        \
     CHECK_LINE_LEN;                                                          \
     gedcom_lval.string = TO_INTERNAL(yytext, tag_buf);                       \
     BEGIN(NORMAL);                                                           \
     return USERTAG;                                                          \
   }
219
220
/* Action for a delimiter: after the line-length check the converted text
   is stored in gedcom_lval.string (using str_buf) and DELIM is returned. */
#define ACTION_DELIM                                                          \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
    return DELIM;                                                             \
  }
226
227
/* Action for any other character: after the line-length check the
   converted text is stored in gedcom_lval.string (using str_buf).  ANYCHAR
   is returned only if the conversion produced at least one byte; otherwise
   no token is returned and control continues after the macro. */
#define ACTION_ANY                                                            \
  { CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
    /* The conversion may hold the current character back in order to         \
       combine it with the next one, in which case there is no character      \
       to hand out yet.  In principle that only concerns the 1byte case       \
       (e.g. ANSEL), but the test is harmless for the unicode case.           \
    */                                                                        \
    if (gedcom_lval.string[0] != '\0')                                        \
      return ANYCHAR;                                                         \
  }
240
241
/* Action for an escape sequence: after the line-length check the converted
   text is stored in gedcom_lval.string (using str_buf) and ESCAPE is
   returned. */
#define ACTION_ESCAPE                                                         \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
    return ESCAPE;                                                            \
  }
247
248
/* Action for a pointer (cross-reference).
   After the line-length check, a pointer longer than
   MAXGEDCPTRLEN * encoding_width is reported and yields BADTOKEN.  The
   length is taken from yyleng, not strlen(yytext): the limit is scaled by
   encoding_width, and strlen() stops at the first zero byte, so it
   under-counts text in encodings whose code units contain zero bytes.
   Otherwise the converted text is stored in gedcom_lval.string (using
   ptr_buf) and POINTER is returned. */
#define ACTION_POINTER                                                        \
  { CHECK_LINE_LEN;                                                           \
    if ((size_t)yyleng > MAXGEDCPTRLEN * encoding_width) {                    \
      gedcom_error("Pointer '%s' too long, max %d characters",                \
                   yytext, MAXGEDCPTRLEN);                                    \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = TO_INTERNAL(yytext, ptr_buf);                        \
    return POINTER;                                                           \
  }
259
260
261 /* Due to the conversion of level numbers into brackets, the
262    terminator is not important, so no token is returned here.
263    Although not strictly according to the GEDCOM spec, we'll ignore
264    whitespace just before the terminator.
265 */
266
/* Action for the line terminator: the terminator still counts towards the
   length of the line it ends; afterwards the lexer goes back to the INITIAL
   start condition, line_no is advanced and the length count is restarted.
   No token is returned unless the length check fails. */
#define ACTION_TERMINATOR                                                     \
  { CHECK_LINE_LEN;                                                           \
    BEGIN(INITIAL);                                                           \
    ++line_no;                                                                \
    INIT_LINE_LEN;                                                            \
  }
273
274
275 /* Eventually we have to return 1 closing bracket (for the trailer).
276    We can detect whether we have sent the closing bracket using the
277    level_diff (at eof, first it is 2, then we increment it ourselves)
278 */
279
/* Action at end of input.  While level_diff is still 2 the final CLOSE has
   not been handed out yet: bump level_diff and return CLOSE.  On the next
   call the bracket state is put back to its initial values and the lexer
   is terminated with yyterminate(). */
#define ACTION_EOF                                                            \
  { if (level_diff != 2) {                                                    \
      /* The closing bracket was sent: reset our state ... */                 \
      current_level = -1;                                                     \
      level_diff = MAXGEDCLEVEL;                                              \
      /* ... and terminate lex */                                             \
      yyterminate();                                                          \
    }                                                                         \
    else {                                                                    \
      level_diff++;                                                           \
      return CLOSE;                                                           \
    }                                                                         \
  }
293
294
/* Action for input that no other rule accepts: report the offending text
   together with the value of its first byte, then return BADTOKEN.
   The byte is converted to unsigned char before it is printed with %02x;
   a plain char may be signed, in which case bytes of 0x80 and above would
   be sign-extended and printed as e.g. 0xffffffe9 instead of 0xe9. */
#define ACTION_UNEXPECTED                                                     \
  { gedcom_error("Unexpected character: '%s' (0x%02x)",                       \
                 yytext, (unsigned char)yytext[0]);                           \
    return BADTOKEN;                                                          \
  }
300
301 #endif /* IN_LEX */