Return level number together with OPEN token.
[gedcom-parse.git] / gedcom_lohi.lex
1 /*  This program is free software; you can redistribute it and/or modify  *
2  *  it under the terms of the GNU General Public License as published by  *
3  *  the Free Software Foundation; either version 2 of the License, or     *
4  *  (at your option) any later version.                                   *
5
6  (C) 2001 by The Genes Development Team
7  Original author: Peter Verthez (Peter.Verthez@advalvas.be)
8 */
9
10 /* $Id$ */
11 /* $Name$ */
12
13 /* In low-high order, a space is encoded as 0x20 0x00 */
14 /* i.e. this is utf-16-le */
15
16 %{
17 #include "gedcom.tab.h"
18 #include "gedcom.h"
19 #include "multilex.h"
20 #include "encoding.h"
21
22 #define YY_NO_UNPUT
23 %}
24 /* Inclusive start conditions: EXPECT_TAG is entered after a level number, NORMAL after the tag. */
25 %s NORMAL
26 %s EXPECT_TAG
27 /* Single characters: every unit is two bytes, low byte first, so ASCII is followed by 0x00. */
28 alpha        [A-Za-z_]\x00
29 digit        [0-9]\x00
30 delim        \x20\x00
31 tab          [\t]\x00
32 hash         #\x00
33 literal_at   @\x00@\x00
34 otherchar    [\x21-\x22\x24-\x2F\x3A-\x3F\x5B-\x5E\x60\x7B-\x7E\x80-\xFF]\x00|[\x00-\xFF][\x01-\xFF]
35 terminator   \x0D\x00|\x0A\x00|\x0D\x00\x0A\x00|\x0A\x00\x0D\x00
36 /* Character groups built from the units above ("@@" counts as one literal at-sign). */
37 any_char     {alpha}|{digit}|{otherchar}|{delim}|{hash}|{literal_at}
38 any_but_delim {alpha}|{digit}|{otherchar}|{hash}|{literal_at}
39 non_at       {alpha}|{digit}|{otherchar}|{delim}|{hash}
40 alphanum     {alpha}|{digit}
41 gen_delim    {delim}|{tab}
42 /* escape: "@#" ... "@"; pointer: "@", one alphanumeric, one or more non-@ characters, "@". */
43 escape       @\x00#\x00{any_char}+@\x00
44 pointer      @\x00{alphanum}{non_at}+@\x00
45
46 %{
47 static int current_level=-1;         /* level number of the last line read; -1: none yet */
48 static int level_diff=MAXGEDCLEVEL;  /* <1: CLOSE tokens pending, 1: OPEN pending, >1: nothing pending */
49 /* Only for the standalone test build; NOTE(review): presumably defined elsewhere otherwise. */
50 #ifdef LEXER_TEST 
51 YYSTYPE gedcom_lval;                 /* value (.number or .string) that goes with the returned token */
52 int line_no = 1;                     /* incremented by the line terminator rule */
53 #endif
54 %} 
55
56 %%
57
58     /* The GEDCOM level number is converted into a sequence of opening
59        and closing brackets.  Simply put, the following GEDCOM fragment:
60
61          0 HEAD
62          1 SOUR genes
63          2 VERS 1.6
64          2 NAME Genes
65          1 DATE 07 OCT 2001
66          ...
67          0 TRLR
68
69        is converted into:
70
71          { HEAD                     (initial)  
72          { SOUR genes               (1 higher: no closing brackets)
73          { VERS 1.6                 (1 higher: no closing brackets)
74          } { NAME Genes             (same level: 1 closing bracket)
75          } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
76          ...
77          } { TRLR }
78
79        or more clearly:
80
81          { HEAD
82            { SOUR genes
83              { VERS 1.6 }
84              { NAME Genes } }
85            { DATE 07 OCT 2001
86          ... }
87          { TRLR }
88
89        But because this means that one token is converted into a series
90        of tokens, there is some initial code following immediately here
91        that returns "pending" tokens. */
92
93 %{
94 char string_buf[MAXGEDCLINELEN+1];   /* scratch buffer available to the rule actions */
95 /* Runs each time the lexer is entered: hand out bracket tokens still pending from the last level number. */
96 if (level_diff < 1) {
97   level_diff++;                      /* one closing bracket fewer to go */
98   return CLOSE;
99 }
100 else if (level_diff == 1) {
101   level_diff++;                      /* 2 means: nothing pending any more */
102   gedcom_lval.number = current_level; /* OPEN carries the level number */
103   return OPEN;
104 }
105 else {
106   /* out of brackets... */
107 }
108 /* Passes str to to_internal() with the length of the current match (yyleng) -- presumably an encoding conversion. */
109 #define TO_INTERNAL(str) to_internal(str, yyleng) 
110 /* Action for a predefined tag: store the converted text, switch to NORMAL and return TAG_<tag>. */
111 #define MKTAGACTION(tag) \
112   { gedcom_lval.string = TO_INTERNAL(yytext); \
113     BEGIN(NORMAL); \
114     return TAG_##tag; }
115
116 %}
117
118 <INITIAL>{gen_delim}* /* ignore leading whitespace (also tabs) */
119     /* In low-high order the digit '0' is the byte pair 0x30 0x00, so the zero comes first and its 0x00 second. */
120 <INITIAL>0\x00{digit}+ { gedcom_error ("Level number with leading zero");
121                            return BADTOKEN;
122                          }
123
124 <INITIAL>{digit}+ { int depth = atoi(TO_INTERNAL(yytext));
125                     /* Reject anything outside the supported nesting range. */
126                     if (!((depth >= 0) && (depth <= MAXGEDCLEVEL))) {
127                       gedcom_error ("Level number out of range [0..%d]",
128                                     MAXGEDCLEVEL);
129                       return BADTOKEN;
130                     }
131                     /* Record the move relative to the previous line; a tag has to follow. */
132                     level_diff = depth - current_level;
133                     current_level = depth;
134                     BEGIN(EXPECT_TAG);
135                     if (level_diff > 1) {
136                       /* should never happen (error to GEDCOM spec) */
137                       gedcom_error ("GEDCOM level number is %d higher than "
138                                     "previous",
139                                     level_diff);
140                       return BADTOKEN;
141                     }
142                     /* Exactly one deeper: open the new bracket right away.  Otherwise start
143                        closing; the code at the top of the rules returns what is still pending. */
144                     if (level_diff++ == 1) {
145                       gedcom_lval.number = current_level;
146                       return OPEN;
147                     }
148                     return CLOSE;
149                   }
150     /* Predefined tags, spelled as low-byte-first two-byte units; each returns its own TAG_<name> token. */
151 <EXPECT_TAG>A\x00B\x00B\x00R\x00  MKTAGACTION(ABBR)
152 <EXPECT_TAG>A\x00D\x00D\x00R\x00  MKTAGACTION(ADDR)
153 <EXPECT_TAG>A\x00D\x00R\x001\x00  MKTAGACTION(ADR1)
154 <EXPECT_TAG>A\x00D\x00R\x002\x00  MKTAGACTION(ADR2)
155 <EXPECT_TAG>A\x00D\x00O\x00P\x00  MKTAGACTION(ADOP)
156 <EXPECT_TAG>A\x00F\x00N\x00   MKTAGACTION(AFN)
157 <EXPECT_TAG>A\x00G\x00E\x00   MKTAGACTION(AGE)
158 <EXPECT_TAG>A\x00G\x00N\x00C\x00  MKTAGACTION(AGNC)
159 <EXPECT_TAG>A\x00L\x00I\x00A\x00  MKTAGACTION(ALIA)
160 <EXPECT_TAG>A\x00N\x00C\x00E\x00  MKTAGACTION(ANCE)
161 <EXPECT_TAG>A\x00N\x00C\x00I\x00  MKTAGACTION(ANCI)
162 <EXPECT_TAG>A\x00N\x00U\x00L\x00  MKTAGACTION(ANUL)
163 <EXPECT_TAG>A\x00S\x00S\x00O\x00  MKTAGACTION(ASSO)
164 <EXPECT_TAG>A\x00U\x00T\x00H\x00  MKTAGACTION(AUTH)
165 <EXPECT_TAG>B\x00A\x00P\x00L\x00  MKTAGACTION(BAPL)
166 <EXPECT_TAG>B\x00A\x00P\x00M\x00  MKTAGACTION(BAPM)
167 <EXPECT_TAG>B\x00A\x00R\x00M\x00  MKTAGACTION(BARM)
168 <EXPECT_TAG>B\x00A\x00S\x00M\x00  MKTAGACTION(BASM)
169 <EXPECT_TAG>B\x00I\x00R\x00T\x00  MKTAGACTION(BIRT)
170 <EXPECT_TAG>B\x00L\x00E\x00S\x00  MKTAGACTION(BLES)
171 <EXPECT_TAG>B\x00L\x00O\x00B\x00  MKTAGACTION(BLOB)
172 <EXPECT_TAG>B\x00U\x00R\x00I\x00  MKTAGACTION(BURI)
173 <EXPECT_TAG>C\x00A\x00L\x00N\x00  MKTAGACTION(CALN)
174 <EXPECT_TAG>C\x00A\x00S\x00T\x00  MKTAGACTION(CAST)
175 <EXPECT_TAG>C\x00A\x00U\x00S\x00  MKTAGACTION(CAUS)
176 <EXPECT_TAG>C\x00E\x00N\x00S\x00  MKTAGACTION(CENS)
177 <EXPECT_TAG>C\x00H\x00A\x00N\x00  MKTAGACTION(CHAN)
178 <EXPECT_TAG>C\x00H\x00A\x00R\x00  MKTAGACTION(CHAR)
179 <EXPECT_TAG>C\x00H\x00I\x00L\x00  MKTAGACTION(CHIL)
180 <EXPECT_TAG>C\x00H\x00R\x00   MKTAGACTION(CHR)
181 <EXPECT_TAG>C\x00H\x00R\x00A\x00  MKTAGACTION(CHRA)
182 <EXPECT_TAG>C\x00I\x00T\x00Y\x00  MKTAGACTION(CITY)
183 <EXPECT_TAG>C\x00O\x00N\x00C\x00  MKTAGACTION(CONC)
184 <EXPECT_TAG>C\x00O\x00N\x00F\x00  MKTAGACTION(CONF)
185 <EXPECT_TAG>C\x00O\x00N\x00L\x00  MKTAGACTION(CONL)
186 <EXPECT_TAG>C\x00O\x00N\x00T\x00  MKTAGACTION(CONT)
187 <EXPECT_TAG>C\x00O\x00P\x00R\x00  MKTAGACTION(COPR)
188 <EXPECT_TAG>C\x00O\x00R\x00P\x00  MKTAGACTION(CORP)
189 <EXPECT_TAG>C\x00R\x00E\x00M\x00  MKTAGACTION(CREM)
190 <EXPECT_TAG>C\x00T\x00R\x00Y\x00  MKTAGACTION(CTRY)
191 <EXPECT_TAG>D\x00A\x00T\x00A\x00  MKTAGACTION(DATA)
192 <EXPECT_TAG>D\x00A\x00T\x00E\x00  MKTAGACTION(DATE)
193 <EXPECT_TAG>D\x00E\x00A\x00T\x00  MKTAGACTION(DEAT)
194 <EXPECT_TAG>D\x00E\x00S\x00C\x00  MKTAGACTION(DESC)
195 <EXPECT_TAG>D\x00E\x00S\x00I\x00  MKTAGACTION(DESI)
196 <EXPECT_TAG>D\x00E\x00S\x00T\x00  MKTAGACTION(DEST)
197 <EXPECT_TAG>D\x00I\x00V\x00   MKTAGACTION(DIV)
198 <EXPECT_TAG>D\x00I\x00V\x00F\x00  MKTAGACTION(DIVF)
199 <EXPECT_TAG>D\x00S\x00C\x00R\x00  MKTAGACTION(DSCR)
200 <EXPECT_TAG>E\x00D\x00U\x00C\x00  MKTAGACTION(EDUC)
201 <EXPECT_TAG>E\x00M\x00I\x00G\x00  MKTAGACTION(EMIG)
202 <EXPECT_TAG>E\x00N\x00D\x00L\x00  MKTAGACTION(ENDL)
203 <EXPECT_TAG>E\x00N\x00G\x00A\x00  MKTAGACTION(ENGA)
204 <EXPECT_TAG>E\x00V\x00E\x00N\x00  MKTAGACTION(EVEN)
205 <EXPECT_TAG>F\x00A\x00M\x00   MKTAGACTION(FAM)
206 <EXPECT_TAG>F\x00A\x00M\x00C\x00  MKTAGACTION(FAMC)
207 <EXPECT_TAG>F\x00A\x00M\x00F\x00  MKTAGACTION(FAMF)
208 <EXPECT_TAG>F\x00A\x00M\x00S\x00  MKTAGACTION(FAMS)
209 <EXPECT_TAG>F\x00C\x00O\x00M\x00  MKTAGACTION(FCOM)
210 <EXPECT_TAG>F\x00I\x00L\x00E\x00  MKTAGACTION(FILE)
211 <EXPECT_TAG>F\x00O\x00R\x00M\x00  MKTAGACTION(FORM)
212 <EXPECT_TAG>G\x00E\x00D\x00C\x00  MKTAGACTION(GEDC)
213 <EXPECT_TAG>G\x00I\x00V\x00N\x00  MKTAGACTION(GIVN)
214 <EXPECT_TAG>G\x00R\x00A\x00D\x00  MKTAGACTION(GRAD)
215 <EXPECT_TAG>H\x00E\x00A\x00D\x00  MKTAGACTION(HEAD)
216 <EXPECT_TAG>H\x00U\x00S\x00B\x00  MKTAGACTION(HUSB)
217 <EXPECT_TAG>I\x00D\x00N\x00O\x00  MKTAGACTION(IDNO)
218 <EXPECT_TAG>I\x00M\x00M\x00I\x00  MKTAGACTION(IMMI)
219 <EXPECT_TAG>I\x00N\x00D\x00I\x00  MKTAGACTION(INDI)
220 <EXPECT_TAG>L\x00A\x00N\x00G\x00  MKTAGACTION(LANG)
221 <EXPECT_TAG>L\x00E\x00G\x00A\x00  MKTAGACTION(LEGA)
222 <EXPECT_TAG>M\x00A\x00R\x00B\x00  MKTAGACTION(MARB)
223 <EXPECT_TAG>M\x00A\x00R\x00C\x00  MKTAGACTION(MARC)
224 <EXPECT_TAG>M\x00A\x00R\x00L\x00  MKTAGACTION(MARL)
225 <EXPECT_TAG>M\x00A\x00R\x00R\x00  MKTAGACTION(MARR)
226 <EXPECT_TAG>M\x00A\x00R\x00S\x00  MKTAGACTION(MARS)
227 <EXPECT_TAG>M\x00E\x00D\x00I\x00  MKTAGACTION(MEDI)
228 <EXPECT_TAG>N\x00A\x00M\x00E\x00  MKTAGACTION(NAME)
229 <EXPECT_TAG>N\x00A\x00T\x00I\x00  MKTAGACTION(NATI)
230 <EXPECT_TAG>N\x00A\x00T\x00U\x00  MKTAGACTION(NATU)
231 <EXPECT_TAG>N\x00C\x00H\x00I\x00  MKTAGACTION(NCHI)
232 <EXPECT_TAG>N\x00I\x00C\x00K\x00  MKTAGACTION(NICK)
233 <EXPECT_TAG>N\x00M\x00R\x00   MKTAGACTION(NMR)
234 <EXPECT_TAG>N\x00O\x00T\x00E\x00  MKTAGACTION(NOTE)
235 <EXPECT_TAG>N\x00P\x00F\x00X\x00  MKTAGACTION(NPFX)
236 <EXPECT_TAG>N\x00S\x00F\x00X\x00  MKTAGACTION(NSFX)
237 <EXPECT_TAG>O\x00B\x00J\x00E\x00  MKTAGACTION(OBJE)
238 <EXPECT_TAG>O\x00C\x00C\x00U\x00  MKTAGACTION(OCCU)
239 <EXPECT_TAG>O\x00R\x00D\x00I\x00  MKTAGACTION(ORDI)
240 <EXPECT_TAG>O\x00R\x00D\x00N\x00  MKTAGACTION(ORDN)
241 <EXPECT_TAG>P\x00A\x00G\x00E\x00  MKTAGACTION(PAGE)
242 <EXPECT_TAG>P\x00E\x00D\x00I\x00  MKTAGACTION(PEDI)
243 <EXPECT_TAG>P\x00H\x00O\x00N\x00  MKTAGACTION(PHON)
244 <EXPECT_TAG>P\x00L\x00A\x00C\x00  MKTAGACTION(PLAC)
245 <EXPECT_TAG>P\x00O\x00S\x00T\x00  MKTAGACTION(POST)
246 <EXPECT_TAG>P\x00R\x00O\x00B\x00  MKTAGACTION(PROB)
247 <EXPECT_TAG>P\x00R\x00O\x00P\x00  MKTAGACTION(PROP)
248 <EXPECT_TAG>P\x00U\x00B\x00L\x00  MKTAGACTION(PUBL)
249 <EXPECT_TAG>Q\x00U\x00A\x00Y\x00  MKTAGACTION(QUAY)
250 <EXPECT_TAG>R\x00E\x00F\x00N\x00  MKTAGACTION(REFN)
251 <EXPECT_TAG>R\x00E\x00L\x00A\x00  MKTAGACTION(RELA)
252 <EXPECT_TAG>R\x00E\x00L\x00I\x00  MKTAGACTION(RELI)
253 <EXPECT_TAG>R\x00E\x00P\x00O\x00  MKTAGACTION(REPO)
254 <EXPECT_TAG>R\x00E\x00S\x00I\x00  MKTAGACTION(RESI)
255 <EXPECT_TAG>R\x00E\x00S\x00N\x00  MKTAGACTION(RESN)
256 <EXPECT_TAG>R\x00E\x00T\x00I\x00  MKTAGACTION(RETI)
257 <EXPECT_TAG>R\x00F\x00N\x00   MKTAGACTION(RFN)
258 <EXPECT_TAG>R\x00I\x00N\x00   MKTAGACTION(RIN)
259 <EXPECT_TAG>R\x00O\x00L\x00E\x00  MKTAGACTION(ROLE)
260 <EXPECT_TAG>S\x00E\x00X\x00   MKTAGACTION(SEX)
261 <EXPECT_TAG>S\x00L\x00G\x00C\x00  MKTAGACTION(SLGC)
262 <EXPECT_TAG>S\x00L\x00G\x00S\x00  MKTAGACTION(SLGS)
263 <EXPECT_TAG>S\x00O\x00U\x00R\x00  MKTAGACTION(SOUR)
264 <EXPECT_TAG>S\x00P\x00F\x00X\x00  MKTAGACTION(SPFX)
265 <EXPECT_TAG>S\x00S\x00N\x00   MKTAGACTION(SSN)
266 <EXPECT_TAG>S\x00T\x00A\x00E\x00  MKTAGACTION(STAE)
267 <EXPECT_TAG>S\x00T\x00A\x00T\x00  MKTAGACTION(STAT)
268 <EXPECT_TAG>S\x00U\x00B\x00M\x00  MKTAGACTION(SUBM)
269 <EXPECT_TAG>S\x00U\x00B\x00N\x00  MKTAGACTION(SUBN)
270 <EXPECT_TAG>S\x00U\x00R\x00N\x00  MKTAGACTION(SURN)
271 <EXPECT_TAG>T\x00E\x00M\x00P\x00  MKTAGACTION(TEMP)
272 <EXPECT_TAG>T\x00E\x00X\x00T\x00  MKTAGACTION(TEXT)
273 <EXPECT_TAG>T\x00I\x00M\x00E\x00  MKTAGACTION(TIME)
274 <EXPECT_TAG>T\x00I\x00T\x00L\x00  MKTAGACTION(TITL)
275 <EXPECT_TAG>T\x00R\x00L\x00R\x00  MKTAGACTION(TRLR)
276 <EXPECT_TAG>T\x00Y\x00P\x00E\x00  MKTAGACTION(TYPE)
277 <EXPECT_TAG>V\x00E\x00R\x00S\x00  MKTAGACTION(VERS)
278 <EXPECT_TAG>W\x00I\x00F\x00E\x00  MKTAGACTION(WIFE)
279 <EXPECT_TAG>W\x00I\x00L\x00L\x00  MKTAGACTION(WILL)
280     /* Any other tag.  yytext is UTF-16LE, so strlen() on it stops at the first 0x00: convert first, then measure. */
281 <EXPECT_TAG>{alphanum}+ { gedcom_lval.string = TO_INTERNAL(yytext);
282                           if (strlen(gedcom_lval.string) > MAXGEDCTAGLEN) {
283                             gedcom_error("Tag '%s' too long, max %d chars",
284                                          gedcom_lval.string, MAXGEDCTAGLEN);
285                             return BADTOKEN;
286                           }
287                           BEGIN(NORMAL);
288                           return USERTAG;
289                         }
290     /* The rules below carry no start condition, so they are active in every state. */
291 {delim}      { gedcom_lval.string = TO_INTERNAL(yytext);   /* a single space */
292                return DELIM;
293              }
294     /* Any one character other than a space ("@@" is taken as one character). */
295 {any_but_delim} { gedcom_lval.string = TO_INTERNAL(yytext);
296                   return ANYCHAR;
297                 }
298     /* "@#...@" is an escape only when a non-@ character follows (trailing context, not consumed). */
299 {escape}/{non_at}  { gedcom_lval.string = TO_INTERNAL(yytext);
300                      return ESCAPE;
301                    }
302     /* "@...@" cross-reference; the text is passed on including both at-signs. */
303 {pointer}    { gedcom_lval.string = TO_INTERNAL(yytext);
304                return POINTER;
305              }
306
307    /* Due to the conversion of level numbers into brackets, the
308       terminator is not important, so no token is returned here.
309       Although not strictly according to the GEDCOM spec, we'll ignore
310       whitespace just before the terminator.  The action only counts the
311       line and goes back to INITIAL, where a level number is expected. */
312
313 {gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); }
314
315    /* At end of input one closing bracket is still owed (the one for the
316       trailer).  level_diff is 2 the first time EOF is seen; it is bumped
317       to 3 here, which records that the bracket has been handed out. */
318
319 <<EOF>> { if (level_diff != 2) {
320             /* final bracket already returned (or never owed): stop */
321             yyterminate();
322           }
323           /* first visit: hand out the last closing bracket */
324           level_diff++;
325           return CLOSE;
326         } 
327    /* Catch-all for a byte no other rule accepts. */
328 .  { gedcom_error("Unexpected character: '%s' (0x%02x)",
329                   yytext, (unsigned char)yytext[0]); /* cast: bytes >= 0x80 must not sign-extend */
330      return BADTOKEN;
331    }
332
333 %%
334 /* Called by the scanner at end of input; a non-zero result means there is no further input. */
335 int yywrap(void)
336 {
337   return 1;
338 }
339
340 #ifdef LEXER_TEST
341 /* Standalone test driver: prints one word per token returned by gedcom_lohi_lex(). */
342 int main()
343 {
344   int tok;
345   /* Set up the conversion for two-byte, low-byte-first input. */
346   init_encodings();
347   set_encoding_width(TWO_BYTE_LOHI);
348   if (!open_conv_to_internal("UNICODE")) {
349     gedcom_error("Unable to open conversion context: %s",
350                  strerror(errno));
351     return 1;
352   }
353   /* A token value of 0 ends the loop. */
354   for (tok = gedcom_lohi_lex(); tok; tok = gedcom_lohi_lex()) {
355     switch(tok) {
356       case BADTOKEN: printf("BADTOKEN "); break;
357       case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
358       case CLOSE: printf("CLOSE "); break;
359       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
360       case DELIM: printf("DELIM "); break;
361       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
362       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
363       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break;
364       default: printf("TAG(%s) ", gedcom_lval.string); break;
365     }
366   }
367   /* Finish the output line and release the conversion context. */
368   printf("\n");
369   close_conv_to_internal();
370   return 0;
371 }
372 #endif