Moved gedcom_parse_file to external header.
[gedcom-parse.git] / gedcom_lex_common.c
1 /*  This program is free software; you can redistribute it and/or modify  *
2  *  it under the terms of the GNU General Public License as published by  *
3  *  the Free Software Foundation; either version 2 of the License, or     *
4  *  (at your option) any later version.                                   *
5
6  (C) 2001 by The Genes Development Team
7  Original author: Peter Verthez (Peter.Verthez@advalvas.be)
8 */
9
10 /* $Id$ */
11 /* $Name$ */
12
13 #ifndef IN_LEX
14
15 #include "gedcom.tab.h"
16 #include "gedcom.h"
17 #include "multilex.h"
18 #include "encoding.h"
19
20 #define YY_NO_UNPUT
21
22 static size_t encoding_width;
23 static int current_level = -1;
24 static int level_diff=MAXGEDCLEVEL;
25 static size_t line_len = 0;
26
27 static char ptr_buf[MAXGEDCPTRLEN * UTF_FACTOR + 1];
28 static char tag_buf[MAXGEDCTAGLEN * UTF_FACTOR + 1];
29 static char str_buf[MAXGEDCLINELEN * UTF_FACTOR + 1];
30
31 #ifdef LEXER_TEST 
32 YYSTYPE gedcom_lval;
33 int line_no = 1;
34
35 int gedcom_lex();
36
37 int test_loop(ENCODING enc, char* code)
38 {
39   int tok, res;
40   init_encodings();
41   set_encoding_width(enc);
42   res = open_conv_to_internal(code);
43   if (!res) {
44     gedcom_error("Unable to open conversion context: %s",
45                  strerror(errno));
46     return 1;
47   }
48   tok = gedcom_lex();
49   while (tok) {
50     switch(tok) {
51       case BADTOKEN: printf("BADTOKEN "); break;
52       case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
53       case CLOSE: printf("CLOSE "); break;
54       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
55       case DELIM: printf("DELIM "); break;
56       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
57       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
58       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break;
59       default: printf("TAG(%s) ", gedcom_lval.string); break;
60     }
61     tok = gedcom_lex();
62   }
63   printf("\n");
64   close_conv_to_internal();
65   return 0;  
66 }
67  
68 #endif /* of #ifdef LEXER_TEST */
69
70 #else  /* of #ifndef IN_LEX */
71
/* Convert the current match to the internal encoding.
   STR is the text to convert; its length is always taken from yyleng.
   OUTBUF must be an actual array (not a pointer), because its capacity is
   obtained with sizeof.  Evaluates to whatever to_internal() returns. */
#define TO_INTERNAL(STR, OUTBUF) \
  to_internal((STR), yyleng, (OUTBUF), sizeof(OUTBUF))
74
/* Restart the length count for a new input line.  The expansion already
   ends in a semicolon. */
#define INIT_LINE_LEN line_len = 0;
77
/* Add the length of the current match to the running length of the line
   and report a line that exceeds MAXGEDCLINELEN * encoding_width.

   The length is taken from yyleng rather than strlen(yytext): the limit is
   scaled by encoding_width, and in multi-byte encodings the matched text
   can contain NUL bytes, which would make strlen() stop early (the same
   reason TO_INTERNAL passes yyleng).

   On overflow the error is reported once, line_len is set to (size_t)-1 so
   that the rest of the line is not checked again, and the enclosing
   function returns BADTOKEN.  Expands to a brace block. */
#define CHECK_LINE_LEN                                                        \
  { if (line_len != (size_t)-1) {                                             \
      line_len += (size_t)yyleng;                                             \
      if (line_len > MAXGEDCLINELEN * encoding_width) {                       \
        gedcom_error("Line too long, max %d characters",                      \
                     MAXGEDCLINELEN);                                         \
        line_len = (size_t)-1;                                                \
        return BADTOKEN;                                                      \
      }                                                                       \
    }                                                                         \
  }
89
/* Action for a recognised standard tag: account for the matched text in
   the line length, switch to the NORMAL start condition, store the
   converted tag text in gedcom_lval.string and return TAG_<THETAG>. */
#define MKTAGACTION(THETAG)                                                  \
  { CHECK_LINE_LEN;                                                          \
    BEGIN(NORMAL);                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, tag_buf);                       \
    return TAG_##THETAG;                                                     \
  }
96
97 /* The GEDCOM level number is converted into a sequence of opening
98    and closing brackets.  Simply put, the following GEDCOM fragment:
99    
100    0 HEAD
101    1 SOUR genes
102    2 VERS 1.6
103    2 NAME Genes
104    1 DATE 07 OCT 2001
105    ...
106    0 TRLR
107    
108    is converted into:
109    
110    { HEAD                     (initial)  
111    { SOUR genes               (1 higher: no closing brackets)
112    { VERS 1.6                 (1 higher: no closing brackets)
113    } { NAME Genes             (same level: 1 closing bracket)
114    } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
115    ...
116    } { TRLR }
117    
118    or more clearly:
119    
120    { HEAD
121      { SOUR genes
122        { VERS 1.6 }
123        { NAME Genes } }
124      { DATE 07 OCT 2001
125      ... }
126    { TRLR }
127
128    But because this means that one token is converted into a series
129    of tokens, there is some initial code following immediately here
130    that returns "pending" tokens. */
131
/* Return one pending bracket token, if any: CLOSE while level_diff is
   below 1, then a single OPEN (carrying current_level) when it equals 1.
   Each returned token increments level_diff; once it is 2 or more nothing
   is pending and control falls through. */
#define ACTION_BEFORE_REGEXPS                                                 \
   { if (level_diff <= 0) {                                                   \
       ++level_diff;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     if (level_diff == 1) {                                                   \
       gedcom_lval.number = current_level;                                    \
       level_diff = 2;                                                        \
       return OPEN;                                                           \
     }                                                                        \
     /* out of brackets: nothing pending */                                   \
   }
146
147
/* Whitespace at the start of a line: it counts towards the line length
   but produces no token. */
#define ACTION_INITIAL_WHITESPACE                                             \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    /* nothing else to do: the whitespace itself is dropped */                \
  }
152
153
/* A level number written with a leading zero: report it and return
   BADTOKEN. */
#define ACTION_0_DIGITS                                                       \
   {                                                                          \
     gedcom_error ("Level number with leading zero");                         \
     return BADTOKEN;                                                         \
   }
158
159
/* Action for the level number at the start of a line.

   The digits are converted with strtol() instead of atoi(): atoi() has
   undefined behaviour when the value does not fit in an int, whereas
   strtol() saturates at LONG_MAX, which the range check below rejects.

   A level outside [0..MAXGEDCLEVEL] is reported and BADTOKEN is returned.
   Otherwise level_diff becomes the difference with the previous level, the
   start condition changes to EXPECT_TAG, current_level is updated, and the
   first bracket token for this line is returned:
     - CLOSE when the level is not higher than the previous one (further
       pending tokens are driven by level_diff on later calls);
     - OPEN, carrying the new level, when it is exactly one higher;
     - BADTOKEN, after an error report, when it is more than one higher. */
#define ACTION_DIGITS                                                         \
   { long level = strtol(TO_INTERNAL(yytext, str_buf), NULL, 10);             \
     CHECK_LINE_LEN;                                                          \
     if ((level < 0) || (level > MAXGEDCLEVEL)) {                             \
       gedcom_error ("Level number out of range [0..%d]",                     \
                     MAXGEDCLEVEL);                                           \
       return BADTOKEN;                                                       \
     }                                                                        \
     /* the range check above guarantees that level fits in an int */         \
     level_diff = (int)level - current_level;                                 \
     BEGIN(EXPECT_TAG);                                                       \
     current_level = (int)level;                                              \
     if (level_diff < 1) {                                                    \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     else if (level_diff == 1) {                                              \
       level_diff++;                                                          \
       gedcom_lval.number = current_level;                                    \
       return OPEN;                                                           \
     }                                                                        \
     else {                                                                   \
       /* should never happen (error to GEDCOM spec) */                       \
       gedcom_error ("GEDCOM level number is %d higher than "                 \
                     "previous",                                              \
                     level_diff);                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
   }
188
189
/* Action for a tag that is not one of the standard tags (user tag).

   The tag length is checked first, against MAXGEDCTAGLEN * encoding_width.
   It is taken from yyleng rather than strlen(yytext), because in
   multi-byte encodings the matched text can contain NUL bytes that would
   cut strlen() short.  A tag that is too long is reported and BADTOKEN is
   returned.

   Otherwise the match is added to the line length, the converted tag text
   is stored in gedcom_lval.string, the start condition changes to NORMAL
   and USERTAG is returned. */
#define ACTION_ALPHANUM                                                       \
   { if ((size_t)yyleng > MAXGEDCTAGLEN * encoding_width) {                   \
       gedcom_error("Tag '%s' too long, max %d characters",                   \
                    yytext, MAXGEDCTAGLEN);                                   \
       return BADTOKEN;                                                       \
     }                                                                        \
     CHECK_LINE_LEN;                                                          \
     gedcom_lval.string = TO_INTERNAL(yytext, tag_buf);                       \
     BEGIN(NORMAL);                                                           \
     return USERTAG;                                                          \
   }
201
202
/* Action for a delimiter: count it in the line length, store its converted
   text in gedcom_lval.string and return DELIM. */
#define ACTION_DELIM                                                          \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
    return DELIM;                                                             \
  }
208
209
/* Action for any other character: count it in the line length and convert
   it.  ANYCHAR is returned only when the conversion produced output;
   otherwise control falls through without returning a token. */
#define ACTION_ANY                                                            \
  { CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
    /* The conversion may hold this character back in order to combine it     \
       with the next one, in which case the result is still empty.            \
       In principle that only happens in the 1byte case (e.g. ANSEL), but     \
       the test is harmless for the unicode case.                             \
    */                                                                        \
    if (gedcom_lval.string[0] != '\0') {                                      \
      return ANYCHAR;                                                         \
    }                                                                         \
  }
222
223
/* Action for an escape sequence: count it in the line length, store its
   converted text in gedcom_lval.string and return ESCAPE. */
#define ACTION_ESCAPE                                                         \
  {                                                                           \
    CHECK_LINE_LEN;                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
    return ESCAPE;                                                            \
  }
229
230
/* Action for a pointer.

   The match is first added to the line length.  The pointer length is then
   checked against MAXGEDCPTRLEN * encoding_width; it is taken from yyleng
   rather than strlen(yytext), because in multi-byte encodings the matched
   text can contain NUL bytes that would cut strlen() short.  A pointer
   that is too long is reported and BADTOKEN is returned.

   Otherwise the converted pointer text is stored in gedcom_lval.string and
   POINTER is returned. */
#define ACTION_POINTER                                                        \
  { CHECK_LINE_LEN;                                                           \
    if ((size_t)yyleng > MAXGEDCPTRLEN * encoding_width) {                    \
      gedcom_error("Pointer '%s' too long, max %d characters",                \
                   yytext, MAXGEDCPTRLEN);                                    \
      return BADTOKEN;                                                        \
    }                                                                         \
    gedcom_lval.string = TO_INTERNAL(yytext, ptr_buf);                        \
    return POINTER;                                                           \
  }
241
242
/* The level numbers are converted into brackets, so the line terminator
   itself carries no information and no token is returned for it.
   Although not strictly according to the GEDCOM spec, whitespace just
   before the terminator is ignored.

   The terminator still counts towards the line length.  After that the
   start condition goes back to INITIAL, the line number is advanced and
   the line length is reset for the next line.  Note that CHECK_LINE_LEN
   returns BADTOKEN before any of that when the terminator itself makes the
   line too long.
*/

#define ACTION_TERMINATOR                                                     \
  { CHECK_LINE_LEN;                                                           \
    BEGIN(INITIAL);                                                           \
    line_no++;                                                                \
    INIT_LINE_LEN;                                                            \
  }
255
256
/* At end of input one last closing bracket (for the trailer) has to be
   returned.  Whether it has been sent already is detected through
   level_diff: at eof it is 2 the first time, and it is incremented here
   when the CLOSE is returned.

   On the following call the lexer state is reset and the lexer terminates.
   Besides the level bookkeeping, the reset also covers the line length and
   the start condition: when the input does not end with a terminator,
   ACTION_TERMINATOR never ran for the last line, and both would otherwise
   be carried over into the next input that is scanned.
*/

#define ACTION_EOF                                                            \
  { if (level_diff == 2) {                                                    \
      level_diff++;                                                           \
      return CLOSE;                                                           \
    }                                                                         \
    else {                                                                    \
      /* Reset our state */                                                   \
      current_level = -1;                                                     \
      level_diff = MAXGEDCLEVEL;                                              \
      INIT_LINE_LEN;                                                          \
      BEGIN(INITIAL);                                                         \
      /* ... then terminate lex */                                            \
      yyterminate();                                                          \
    }                                                                         \
  }
275
276
/* Action for a character no other rule accepts: report it, both as text
   and as the hexadecimal value of its first byte, and return BADTOKEN.

   The byte is converted to unsigned char before it is passed to the
   format: where plain char is signed, a byte of 0x80 or above would
   otherwise be sign-extended and printed as 0xffffffXX. */
#define ACTION_UNEXPECTED                                                     \
  { gedcom_error("Unexpected character: '%s' (0x%02x)",                       \
                 yytext, (unsigned char)yytext[0]);                           \
    return BADTOKEN;                                                          \
  }
282
283 #endif /* IN_LEX */