Return level number together with OPEN token.
[gedcom-parse.git] / gedcom_1byte.lex
1 /*  This program is free software; you can redistribute it and/or modify  *
2  *  it under the terms of the GNU General Public License as published by  *
3  *  the Free Software Foundation; either version 2 of the License, or     *
4  *  (at your option) any later version.                                   *
5
6  (C) 2001 by The Genes Development Team
7  Original author: Peter Verthez (Peter.Verthez@advalvas.be)
8 */
9
10 /* $Id$ */
11 /* $Name$ */
12
13 %{
14 #include "gedcom.tab.h"
15 #include "gedcom.h"
16 #include "multilex.h"
17 #include "encoding.h"
18
19 #define YY_NO_UNPUT
20 %}
21
/* Inclusive start conditions.  EXPECT_TAG is entered right after a level
   number has been scanned; NORMAL is entered once the tag has been
   recognized.  The scanner goes back to INITIAL at every line terminator. */
22 %s NORMAL
23 %s EXPECT_TAG
24
/* Basic character classes.  'otherchar' is the set of bytes 0x21-0x7E and
   0x80-0xFE that are not letters, digits, '_', '#' or '@'.
   'terminator' accepts CR, LF, CR LF and LF CR. */
25 alpha        [A-Za-z_]
26 digit        [0-9]
27 delim        " "
28 tab          [\t]
29 hash         #
30 literal_at   @@
31 otherchar    [\x21-\x22\x24-\x2F\x3A-\x3F\x5B-\x5E\x60\x7B-\x7E\x80-\xFE]
32 terminator   \x0D|\x0A|\x0D\x0A|\x0A\x0D
33
/* Composite classes built from the basic ones.  A single '@' is in none of
   them; only the doubled form '@@' (literal_at) is part of any_char and
   any_but_delim. */
34 any_char     {alpha}|{digit}|{otherchar}|{delim}|{hash}|{literal_at}
35 any_but_delim {alpha}|{digit}|{otherchar}|{hash}|{literal_at}
36 non_at       {alpha}|{digit}|{otherchar}|{delim}|{hash}
37 alphanum     {alpha}|{digit}
38 gen_delim    {delim}|{tab}
39
/* escape:  "@#", one or more any_char, then a closing "@".
   pointer: "@", one alphanum, one or more non_at, then a closing "@". */
40 escape       @#{any_char}+@
41 pointer      @{alphanum}{non_at}+@
42
%{
/* Level number of the most recently scanned GEDCOM line; -1 as long as no
   level number has been seen yet. */
static int current_level = -1;

/* Bookkeeping for the bracket tokens that still have to be returned:
     < 1  : a CLOSE token is pending
     == 1 : the OPEN token is pending
     > 1  : nothing is pending
   It starts at MAXGEDCLEVEL (presumably larger than 2), so that nothing is
   pending before the first line has been read. */
static int level_diff = MAXGEDCLEVEL;

#ifdef LEXER_TEST
/* In the stand-alone lexer test these are defined here instead of being
   provided by the rest of the program. */
YYSTYPE gedcom_lval;
int line_no = 1;
#endif

%}
53
54 %%
55
56     /* The GEDCOM level number is converted into a sequence of opening
57        and closing brackets.  Simply put, the following GEDCOM fragment:
58
59          0 HEAD
60          1 SOUR genes
61          2 VERS 1.6
62          2 NAME Genes
63          1 DATE 07 OCT 2001
64          ...
65          0 TRLR
66
67        is converted into:
68
69          { HEAD                     (initial)  
70          { SOUR genes               (1 higher: no closing brackets)
71          { VERS 1.6                 (1 higher: no closing brackets)
72          } { NAME Genes             (same level: 1 closing bracket)
73          } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
74          ...
75          } { TRLR }
76
77        or more clearly:
78
79          { HEAD
80            { SOUR genes
81              { VERS 1.6 }
82              { NAME Genes } }
83            { DATE 07 OCT 2001
84          ... }
85          { TRLR }
86
87        But because this means that one token is converted into a series
88        of tokens, there is some initial code following immediately here
89        that returns "pending" tokens. */
90
%{
/* Scratch buffer used by the user-tag rule. */
char string_buf[MAXGEDCLINELEN+1];

/* This code runs on every entry into the scanner.  One level number is
   translated into several bracket tokens, so the tokens that are still
   pending are handed out here, one per call, before any new input is
   scanned: first all CLOSE tokens, then the single OPEN token (which
   carries the level number as its value). */
if (level_diff < 1) {
  ++level_diff;
  return CLOSE;
}
if (level_diff == 1) {
  ++level_diff;
  gedcom_lval.number = current_level;
  return OPEN;
}
/* level_diff > 1: out of brackets, continue with normal scanning */

/* Convert the given text to the internal encoding; the length passed is
   always that of the current match (yyleng). */
#define TO_INTERNAL(str) to_internal(str, yyleng) 

/* Action for a standard GEDCOM tag: hand over the converted tag text,
   switch to the NORMAL start condition and return the token TAG_<tag>. */
#define MKTAGACTION(tag) \
  { gedcom_lval.string = TO_INTERNAL(yytext); \
    BEGIN(NORMAL); \
    return TAG_##tag; }

%}
115
<INITIAL>{gen_delim}* /* ignore leading whitespace (also tabs) */

    /* A level number of more than one digit must not start with '0'. */
<INITIAL>0{digit}+ { gedcom_error ("Level number with leading zero");
                     return BADTOKEN;
                   }

    /* The level number at the start of a line.  It is compared with the
       level of the previous line to find out how many CLOSE tokens have
       to precede the OPEN token for this line; the first of those tokens
       is returned here, the others by the code at the top of the rules
       section. */
<INITIAL>{digit}+ { /* strtol is used instead of atoi: on an overlong
                       digit sequence atoi has undefined behaviour, while
                       strtol returns LONG_MAX, which is rejected by the
                       range check below. */
                    long parsed = strtol(TO_INTERNAL(yytext), NULL, 10);
                    int level;
                    if ((parsed < 0) || (parsed > MAXGEDCLEVEL)) {
                      gedcom_error ("Level number out of range [0..%d]",
                                    MAXGEDCLEVEL);
                      return BADTOKEN;
                    }
                    level = (int) parsed;
                    level_diff = level - current_level;
                    BEGIN(EXPECT_TAG);
                    current_level = level;
                    if (level_diff < 1) {
                      /* same or lower level: start closing brackets */
                      level_diff++;
                      return CLOSE;
                    }
                    else if (level_diff == 1) {
                      /* exactly one level deeper: only an opening bracket */
                      level_diff++;
                      gedcom_lval.number = current_level;
                      return OPEN;
                    }
                    else {
                      /* should never happen (error to GEDCOM spec) */
                      gedcom_error ("GEDCOM level number is %d higher than "
                                    "previous",
                                    level_diff);
                      return BADTOKEN;
                    }
                  }
148
149 <EXPECT_TAG>ABBR  MKTAGACTION(ABBR)
150 <EXPECT_TAG>ADDR  MKTAGACTION(ADDR)
151 <EXPECT_TAG>ADR1  MKTAGACTION(ADR1)
152 <EXPECT_TAG>ADR2  MKTAGACTION(ADR2)
153 <EXPECT_TAG>ADOP  MKTAGACTION(ADOP)
154 <EXPECT_TAG>AFN   MKTAGACTION(AFN)
155 <EXPECT_TAG>AGE   MKTAGACTION(AGE)
156 <EXPECT_TAG>AGNC  MKTAGACTION(AGNC)
157 <EXPECT_TAG>ALIA  MKTAGACTION(ALIA)
158 <EXPECT_TAG>ANCE  MKTAGACTION(ANCE)
159 <EXPECT_TAG>ANCI  MKTAGACTION(ANCI)
160 <EXPECT_TAG>ANUL  MKTAGACTION(ANUL)
161 <EXPECT_TAG>ASSO  MKTAGACTION(ASSO)
162 <EXPECT_TAG>AUTH  MKTAGACTION(AUTH)
163 <EXPECT_TAG>BAPL  MKTAGACTION(BAPL)
164 <EXPECT_TAG>BAPM  MKTAGACTION(BAPM)
165 <EXPECT_TAG>BARM  MKTAGACTION(BARM)
166 <EXPECT_TAG>BASM  MKTAGACTION(BASM)
167 <EXPECT_TAG>BIRT  MKTAGACTION(BIRT)
168 <EXPECT_TAG>BLES  MKTAGACTION(BLES)
169 <EXPECT_TAG>BLOB  MKTAGACTION(BLOB)
170 <EXPECT_TAG>BURI  MKTAGACTION(BURI)
171 <EXPECT_TAG>CALN  MKTAGACTION(CALN)
172 <EXPECT_TAG>CAST  MKTAGACTION(CAST)
173 <EXPECT_TAG>CAUS  MKTAGACTION(CAUS)
174 <EXPECT_TAG>CENS  MKTAGACTION(CENS)
175 <EXPECT_TAG>CHAN  MKTAGACTION(CHAN)
176 <EXPECT_TAG>CHAR  MKTAGACTION(CHAR)
177 <EXPECT_TAG>CHIL  MKTAGACTION(CHIL)
178 <EXPECT_TAG>CHR   MKTAGACTION(CHR)
179 <EXPECT_TAG>CHRA  MKTAGACTION(CHRA)
180 <EXPECT_TAG>CITY  MKTAGACTION(CITY)
181 <EXPECT_TAG>CONC  MKTAGACTION(CONC)
182 <EXPECT_TAG>CONF  MKTAGACTION(CONF)
183 <EXPECT_TAG>CONL  MKTAGACTION(CONL)
184 <EXPECT_TAG>CONT  MKTAGACTION(CONT)
185 <EXPECT_TAG>COPR  MKTAGACTION(COPR)
186 <EXPECT_TAG>CORP  MKTAGACTION(CORP)
187 <EXPECT_TAG>CREM  MKTAGACTION(CREM)
188 <EXPECT_TAG>CTRY  MKTAGACTION(CTRY)
189 <EXPECT_TAG>DATA  MKTAGACTION(DATA)
190 <EXPECT_TAG>DATE  MKTAGACTION(DATE)
191 <EXPECT_TAG>DEAT  MKTAGACTION(DEAT)
192 <EXPECT_TAG>DESC  MKTAGACTION(DESC)
193 <EXPECT_TAG>DESI  MKTAGACTION(DESI)
194 <EXPECT_TAG>DEST  MKTAGACTION(DEST)
195 <EXPECT_TAG>DIV   MKTAGACTION(DIV)
196 <EXPECT_TAG>DIVF  MKTAGACTION(DIVF)
197 <EXPECT_TAG>DSCR  MKTAGACTION(DSCR)
198 <EXPECT_TAG>EDUC  MKTAGACTION(EDUC)
199 <EXPECT_TAG>EMIG  MKTAGACTION(EMIG)
200 <EXPECT_TAG>ENDL  MKTAGACTION(ENDL)
201 <EXPECT_TAG>ENGA  MKTAGACTION(ENGA)
202 <EXPECT_TAG>EVEN  MKTAGACTION(EVEN)
203 <EXPECT_TAG>FAM   MKTAGACTION(FAM)
204 <EXPECT_TAG>FAMC  MKTAGACTION(FAMC)
205 <EXPECT_TAG>FAMF  MKTAGACTION(FAMF)
206 <EXPECT_TAG>FAMS  MKTAGACTION(FAMS)
207 <EXPECT_TAG>FCOM  MKTAGACTION(FCOM)
208 <EXPECT_TAG>FILE  MKTAGACTION(FILE)
209 <EXPECT_TAG>FORM  MKTAGACTION(FORM)
210 <EXPECT_TAG>GEDC  MKTAGACTION(GEDC)
211 <EXPECT_TAG>GIVN  MKTAGACTION(GIVN)
212 <EXPECT_TAG>GRAD  MKTAGACTION(GRAD)
213 <EXPECT_TAG>HEAD  MKTAGACTION(HEAD)
214 <EXPECT_TAG>HUSB  MKTAGACTION(HUSB)
215 <EXPECT_TAG>IDNO  MKTAGACTION(IDNO)
216 <EXPECT_TAG>IMMI  MKTAGACTION(IMMI)
217 <EXPECT_TAG>INDI  MKTAGACTION(INDI)
218 <EXPECT_TAG>LANG  MKTAGACTION(LANG)
219 <EXPECT_TAG>LEGA  MKTAGACTION(LEGA)
220 <EXPECT_TAG>MARB  MKTAGACTION(MARB)
221 <EXPECT_TAG>MARC  MKTAGACTION(MARC)
222 <EXPECT_TAG>MARL  MKTAGACTION(MARL)
223 <EXPECT_TAG>MARR  MKTAGACTION(MARR)
224 <EXPECT_TAG>MARS  MKTAGACTION(MARS)
225 <EXPECT_TAG>MEDI  MKTAGACTION(MEDI)
226 <EXPECT_TAG>NAME  MKTAGACTION(NAME)
227 <EXPECT_TAG>NATI  MKTAGACTION(NATI)
228 <EXPECT_TAG>NATU  MKTAGACTION(NATU)
229 <EXPECT_TAG>NCHI  MKTAGACTION(NCHI)
230 <EXPECT_TAG>NICK  MKTAGACTION(NICK)
231 <EXPECT_TAG>NMR   MKTAGACTION(NMR)
232 <EXPECT_TAG>NOTE  MKTAGACTION(NOTE)
233 <EXPECT_TAG>NPFX  MKTAGACTION(NPFX)
234 <EXPECT_TAG>NSFX  MKTAGACTION(NSFX)
235 <EXPECT_TAG>OBJE  MKTAGACTION(OBJE)
236 <EXPECT_TAG>OCCU  MKTAGACTION(OCCU)
237 <EXPECT_TAG>ORDI  MKTAGACTION(ORDI)
238 <EXPECT_TAG>ORDN  MKTAGACTION(ORDN)
239 <EXPECT_TAG>PAGE  MKTAGACTION(PAGE)
240 <EXPECT_TAG>PEDI  MKTAGACTION(PEDI)
241 <EXPECT_TAG>PHON  MKTAGACTION(PHON)
242 <EXPECT_TAG>PLAC  MKTAGACTION(PLAC)
243 <EXPECT_TAG>POST  MKTAGACTION(POST)
244 <EXPECT_TAG>PROB  MKTAGACTION(PROB)
245 <EXPECT_TAG>PROP  MKTAGACTION(PROP)
246 <EXPECT_TAG>PUBL  MKTAGACTION(PUBL)
247 <EXPECT_TAG>QUAY  MKTAGACTION(QUAY)
248 <EXPECT_TAG>REFN  MKTAGACTION(REFN)
249 <EXPECT_TAG>RELA  MKTAGACTION(RELA)
250 <EXPECT_TAG>RELI  MKTAGACTION(RELI)
251 <EXPECT_TAG>REPO  MKTAGACTION(REPO)
252 <EXPECT_TAG>RESI  MKTAGACTION(RESI)
253 <EXPECT_TAG>RESN  MKTAGACTION(RESN)
254 <EXPECT_TAG>RETI  MKTAGACTION(RETI)
255 <EXPECT_TAG>RFN   MKTAGACTION(RFN)
256 <EXPECT_TAG>RIN   MKTAGACTION(RIN)
257 <EXPECT_TAG>ROLE  MKTAGACTION(ROLE)
258 <EXPECT_TAG>SEX   MKTAGACTION(SEX)
259 <EXPECT_TAG>SLGC  MKTAGACTION(SLGC)
260 <EXPECT_TAG>SLGS  MKTAGACTION(SLGS)
261 <EXPECT_TAG>SOUR  MKTAGACTION(SOUR)
262 <EXPECT_TAG>SPFX  MKTAGACTION(SPFX)
263 <EXPECT_TAG>SSN   MKTAGACTION(SSN)
264 <EXPECT_TAG>STAE  MKTAGACTION(STAE)
265 <EXPECT_TAG>STAT  MKTAGACTION(STAT)
266 <EXPECT_TAG>SUBM  MKTAGACTION(SUBM)
267 <EXPECT_TAG>SUBN  MKTAGACTION(SUBN)
268 <EXPECT_TAG>SURN  MKTAGACTION(SURN)
269 <EXPECT_TAG>TEMP  MKTAGACTION(TEMP)
270 <EXPECT_TAG>TEXT  MKTAGACTION(TEXT)
271 <EXPECT_TAG>TIME  MKTAGACTION(TIME)
272 <EXPECT_TAG>TITL  MKTAGACTION(TITL)
273 <EXPECT_TAG>TRLR  MKTAGACTION(TRLR)
274 <EXPECT_TAG>TYPE  MKTAGACTION(TYPE)
275 <EXPECT_TAG>VERS  MKTAGACTION(VERS)
276 <EXPECT_TAG>WIFE  MKTAGACTION(WIFE)
277 <EXPECT_TAG>WILL  MKTAGACTION(WILL)
278      
279 <EXPECT_TAG>{alphanum}+ { if (strlen(yytext) > MAXGEDCTAGLEN) {
280                             gedcom_error("Tag '%s' too long, max %d chars");
281                             return BADTOKEN;
282                           }
283                           strncpy(string_buf, yytext, MAXGEDCTAGLEN+1);
284                           gedcom_lval.string = TO_INTERNAL(string_buf);
285                           BEGIN(NORMAL);
286                           return USERTAG;
287                         }
288
289 {delim}      { gedcom_lval.string = TO_INTERNAL(yytext);
290                return DELIM;
291              }
292
293 {any_but_delim} { gedcom_lval.string = TO_INTERNAL(yytext);
294                   /* Due to character conversions, it is possible
295                      that the current character will be combined with
296                      the next, and so now we don't have a character yet...
297                      This is only applicable to the 1byte case (e.g. ANSEL).
298                   */
299                   if (strlen(gedcom_lval.string) > 0) 
300                     return ANYCHAR;
301                 }
302
303 {escape}/{non_at}  { gedcom_lval.string = TO_INTERNAL(yytext);
304                      return ESCAPE;
305                    }
306
307 {pointer}    { gedcom_lval.string = TO_INTERNAL(yytext);
308                return POINTER;
309              }
310
311    /* Due to the conversion of level numbers into brackets, the
312       terminator is not important, so no token is returned here.
313       Although not strictly according to the GEDCOM spec, we'll ignore
314       whitespace just before the terminator.
315    */
316
317 {gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); }
318
319    /* Eventually we have to return 1 closing bracket (for the trailer).
320       We can detect whether we have sent the closing bracket using the
321       level_diff (at eof, first it is 2, then we increment it ourselves) */
322
323 <<EOF>> { if (level_diff == 2) {
324             level_diff++;
325             return CLOSE;
326           }
327           else {
328             yyterminate();
329           }
330         } 
331
332 .  { gedcom_error("Unexpected character: '%s' (0x%02x)",
333                   yytext, yytext[0]);
334      return BADTOKEN;
335    }
336
337 %%
338
/* Called by the scanner when it reaches the end of its input.
   Always returns 1: there is no further input to switch to, so the
   scanner proceeds with its end-of-file handling. */
int yywrap(void)
{
  return 1;
}
343
#ifdef LEXER_TEST
/* Stand-alone test driver for the lexer: sets up a one-byte ASCII
   conversion context, then calls the lexer until it returns 0 and prints
   every token (with its value where it has one) on a single line.
   Returns 1 if the conversion context cannot be opened, 0 otherwise. */
int main()
{
  int tok;

  init_encodings();
  set_encoding_width(ONE_BYTE);
  if (!open_conv_to_internal("ASCII")) {
    gedcom_error("Unable to open conversion context: %s",
                 strerror(errno));
    return 1;
  }

  for (tok = gedcom_1byte_lex(); tok; tok = gedcom_1byte_lex()) {
    switch (tok) {
      case BADTOKEN:
        printf("BADTOKEN ");
        break;
      case OPEN:
        printf("OPEN(%d) ", gedcom_lval.number);
        break;
      case CLOSE:
        printf("CLOSE ");
        break;
      case ESCAPE:
        printf("ESCAPE(%s) ", gedcom_lval.string);
        break;
      case DELIM:
        printf("DELIM ");
        break;
      case ANYCHAR:
        printf("%s ", gedcom_lval.string);
        break;
      case POINTER:
        printf("POINTER(%s) ", gedcom_lval.string);
        break;
      case USERTAG:
        printf("USERTAG(%s) ", gedcom_lval.string);
        break;
      default:
        /* every remaining token is one of the standard tags */
        printf("TAG(%s) ", gedcom_lval.string);
        break;
    }
  }

  printf("\n");
  close_conv_to_internal();
  return 0;
}
#endif