Added CVS tags.
[gedcom-parse.git] / gedcom_1byte.lex
1 /*  This program is free software; you can redistribute it and/or modify  *
2  *  it under the terms of the GNU General Public License as published by  *
3  *  the Free Software Foundation; either version 2 of the License, or     *
4  *  (at your option) any later version.                                   *
5
6  (C) 2001 by The Genes Development Team
7  Original author: Peter Verthez (Peter.Verthez@advalvas.be)
8 */
9
10 /* $Id$ */
11 /* $Name$ */
12
13 %{
14 #include "gedcom.tab.h"
15 #include "gedcom.h"
16 #include "multilex.h"
17 #include "encoding.h"
18
19 #define YY_NO_UNPUT
20 %}
21
/* Start conditions.  Per the rules further down: the scanner switches to
   EXPECT_TAG after a level number, to NORMAL after a tag, and back to
   INITIAL (flex's default state) at each line terminator.  Both are
   declared with %s, so rules without a start condition stay active in
   them. */
22 %s NORMAL
23 %s EXPECT_TAG
24
/* Basic character classes.  'otherchar' lists the remaining bytes in
   0x21-0x7E that are not a letter, digit, '_', '#' or '@', plus the high
   bytes 0x80-0xFE; 0x7F and 0xFF are not part of any class.
   'terminator' accepts CR, LF, CR LF and LF CR. */
25 alpha        [A-Za-z_]
26 digit        [0-9]
27 delim        " "
28 tab          [\t]
29 hash         #
30 literal_at   @@
31 otherchar    [\x21-\x22\x24-\x2F\x3A-\x3F\x5B-\x5E\x60\x7B-\x7E\x80-\xFE]
32 terminator   \x0D|\x0A|\x0D\x0A|\x0A\x0D
33
/* Composite classes.  A single '@' is in none of them; only the doubled
   form "@@" (literal_at) is.  Note that 'alphanum' also accepts '_'
   because 'alpha' does. */
34 any_char     {alpha}|{digit}|{otherchar}|{delim}|{hash}|{literal_at}
35 any_but_delim {alpha}|{digit}|{otherchar}|{hash}|{literal_at}
36 non_at       {alpha}|{digit}|{otherchar}|{delim}|{hash}
37 alphanum     {alpha}|{digit}
38 gen_delim    {delim}|{tab}
39
/* escape:  "@#" + one or more any_char + "@".
   pointer: "@" + one alphanum + one or more non_at + "@". */
40 escape       @#{any_char}+@
41 pointer      @{alphanum}{non_at}+@
42
%{
/* Stand-alone lexer test build: provide the semantic value and the line
   counter here.  In a normal build they are presumably supplied by the
   parser / an included header -- TODO confirm. */
#ifdef LEXER_TEST
YYSTYPE gedcom_lval;
int line_no = 1;
#endif

/* Level number of the most recently scanned GEDCOM line; -1 before the
   first line has been seen. */
static int current_level = -1;

/* Bracket bookkeeping shared between calls of the scanner:
     < 1  : CLOSE tokens are still pending (one per increment),
     == 1 : one OPEN token is pending,
     >= 2 : nothing is pending.
   Starts at MAXGEDCLEVEL (defined elsewhere; assumed to be >= 2 so that
   nothing is pending initially). */
static int level_diff = MAXGEDCLEVEL;
%}
53
54 %%
55
56     /* The GEDCOM level number is converted into a sequence of opening
57        and closing brackets.  Simply put, the following GEDCOM fragment:
58
59          0 HEAD
60          1 SOUR genes
61          2 VERS 1.6
62          2 NAME Genes
63          1 DATE 07 OCT 2001
64          ...
65          0 TRLR
66
67        is converted into:
68
69          { HEAD                     (initial)  
70          { SOUR genes               (1 higher: no closing brackets)
71          { VERS 1.6                 (1 higher: no closing brackets)
72          } { NAME Genes             (same level: 1 closing bracket)
73          } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
74          ...
75          } { TRLR }
76
77        or more clearly:
78
79          { HEAD
80            { SOUR genes
81              { VERS 1.6 }
82              { NAME Genes } }
83            { DATE 07 OCT 2001
84          ... }
85          { TRLR }
86
87        But because this means that one token is converted into a series
88        of tokens, there is some initial code following immediately here
89        that returns "pending" tokens. */
90
%{
  /* This code sits before the first rule, so it runs every time the
     scanning routine is entered. */

  /* Scratch copy of the matched text, used by the user-tag rule. */
  char string_buf[MAXGEDCLINELEN+1];

  /* Hand out brackets that are still owed from the previous level number:
     CLOSE while level_diff is below 1, then a single OPEN when it is 1.
     From 2 upwards nothing is pending and normal scanning continues. */
  if (level_diff <= 1) {
    int pending_token = (level_diff < 1) ? CLOSE : OPEN;
    level_diff++;
    return pending_token;
  }

/* Convert the given text to the internal encoding, passing the length of
   the current match (yyleng). */
#define TO_INTERNAL(str) to_internal(str, yyleng) 

/* Action for a standard tag: store the converted tag text as the token
   value, switch to NORMAL for the rest of the line and return the
   tag-specific token TAG_<tag>. */
#define MKTAGACTION(tag) \
  { gedcom_lval.string = TO_INTERNAL(yytext); \
    BEGIN(NORMAL); \
    return TAG_##tag; }

%}
114
<INITIAL>{gen_delim}* {
    /* At the start of a line (INITIAL state) spaces and tabs in front of
       the level number are discarded; no token is returned. */
}
116
<INITIAL>0{digit}+ {
    /* Two or more digits starting with '0'.  This rule is listed before
       the general level-number rule, so it wins when both match the same
       text; a lone "0" does not match here and stays a valid level. */
    gedcom_error ("Level number with leading zero");
    return BADTOKEN;
}
120
<INITIAL>{digit}+ {
    /* Level number at the start of a line.  It is translated into
       brackets: the difference with the previous level decides whether
       CLOSE tokens and/or one OPEN token are returned (the remaining ones
       are handed out by the code at the top of the scanner on the next
       calls).

       strtol() is used rather than atoi(): atoi() has undefined behaviour
       when the digits do not fit in an int, whereas strtol() saturates at
       LONG_MAX, so an over-long digit string is reliably rejected by the
       range check below. */
    long level = strtol(TO_INTERNAL(yytext), NULL, 10);
    if ((level < 0) || (level > MAXGEDCLEVEL)) {
      gedcom_error ("Level number out of range [0..%d]",
                    MAXGEDCLEVEL);
      return BADTOKEN;
    }
    /* level is now known to lie in [0..MAXGEDCLEVEL], so it fits an int */
    level_diff = (int)level - current_level;
    BEGIN(EXPECT_TAG);
    current_level = (int)level;
    if (level_diff < 1) {
      /* same or lower level: first of (1 - level_diff) closing brackets */
      level_diff++;
      return CLOSE;
    }
    else if (level_diff == 1) {
      /* exactly one level deeper: only an opening bracket */
      level_diff++;
      return OPEN;
    }
    else {
      /* should never happen (error to GEDCOM spec) */
      gedcom_error ("GEDCOM level number is %d higher than "
                    "previous",
                    level_diff);
      return BADTOKEN;
    }
}
146
    /* Standard GEDCOM tags.  These rules are only active in the
       EXPECT_TAG state, i.e. right after a level number.  Each one uses
       MKTAGACTION, which stores the converted tag text in
       gedcom_lval.string, switches to the NORMAL state and returns the
       token TAG_<name>.  A longer alphanumeric word that merely starts
       with one of these names is matched by the user-tag rule instead,
       because the longest match wins. */
147 <EXPECT_TAG>ABBR  MKTAGACTION(ABBR)
148 <EXPECT_TAG>ADDR  MKTAGACTION(ADDR)
149 <EXPECT_TAG>ADR1  MKTAGACTION(ADR1)
150 <EXPECT_TAG>ADR2  MKTAGACTION(ADR2)
151 <EXPECT_TAG>ADOP  MKTAGACTION(ADOP)
152 <EXPECT_TAG>AFN   MKTAGACTION(AFN)
153 <EXPECT_TAG>AGE   MKTAGACTION(AGE)
154 <EXPECT_TAG>AGNC  MKTAGACTION(AGNC)
155 <EXPECT_TAG>ALIA  MKTAGACTION(ALIA)
156 <EXPECT_TAG>ANCE  MKTAGACTION(ANCE)
157 <EXPECT_TAG>ANCI  MKTAGACTION(ANCI)
158 <EXPECT_TAG>ANUL  MKTAGACTION(ANUL)
159 <EXPECT_TAG>ASSO  MKTAGACTION(ASSO)
160 <EXPECT_TAG>AUTH  MKTAGACTION(AUTH)
161 <EXPECT_TAG>BAPL  MKTAGACTION(BAPL)
162 <EXPECT_TAG>BAPM  MKTAGACTION(BAPM)
163 <EXPECT_TAG>BARM  MKTAGACTION(BARM)
164 <EXPECT_TAG>BASM  MKTAGACTION(BASM)
165 <EXPECT_TAG>BIRT  MKTAGACTION(BIRT)
166 <EXPECT_TAG>BLES  MKTAGACTION(BLES)
167 <EXPECT_TAG>BLOB  MKTAGACTION(BLOB)
168 <EXPECT_TAG>BURI  MKTAGACTION(BURI)
169 <EXPECT_TAG>CALN  MKTAGACTION(CALN)
170 <EXPECT_TAG>CAST  MKTAGACTION(CAST)
171 <EXPECT_TAG>CAUS  MKTAGACTION(CAUS)
172 <EXPECT_TAG>CENS  MKTAGACTION(CENS)
173 <EXPECT_TAG>CHAN  MKTAGACTION(CHAN)
174 <EXPECT_TAG>CHAR  MKTAGACTION(CHAR)
175 <EXPECT_TAG>CHIL  MKTAGACTION(CHIL)
176 <EXPECT_TAG>CHR   MKTAGACTION(CHR)
177 <EXPECT_TAG>CHRA  MKTAGACTION(CHRA)
178 <EXPECT_TAG>CITY  MKTAGACTION(CITY)
179 <EXPECT_TAG>CONC  MKTAGACTION(CONC)
180 <EXPECT_TAG>CONF  MKTAGACTION(CONF)
181 <EXPECT_TAG>CONL  MKTAGACTION(CONL)
182 <EXPECT_TAG>CONT  MKTAGACTION(CONT)
183 <EXPECT_TAG>COPR  MKTAGACTION(COPR)
184 <EXPECT_TAG>CORP  MKTAGACTION(CORP)
185 <EXPECT_TAG>CREM  MKTAGACTION(CREM)
186 <EXPECT_TAG>CTRY  MKTAGACTION(CTRY)
187 <EXPECT_TAG>DATA  MKTAGACTION(DATA)
188 <EXPECT_TAG>DATE  MKTAGACTION(DATE)
189 <EXPECT_TAG>DEAT  MKTAGACTION(DEAT)
190 <EXPECT_TAG>DESC  MKTAGACTION(DESC)
191 <EXPECT_TAG>DESI  MKTAGACTION(DESI)
192 <EXPECT_TAG>DEST  MKTAGACTION(DEST)
193 <EXPECT_TAG>DIV   MKTAGACTION(DIV)
194 <EXPECT_TAG>DIVF  MKTAGACTION(DIVF)
195 <EXPECT_TAG>DSCR  MKTAGACTION(DSCR)
196 <EXPECT_TAG>EDUC  MKTAGACTION(EDUC)
197 <EXPECT_TAG>EMIG  MKTAGACTION(EMIG)
198 <EXPECT_TAG>ENDL  MKTAGACTION(ENDL)
199 <EXPECT_TAG>ENGA  MKTAGACTION(ENGA)
200 <EXPECT_TAG>EVEN  MKTAGACTION(EVEN)
201 <EXPECT_TAG>FAM   MKTAGACTION(FAM)
202 <EXPECT_TAG>FAMC  MKTAGACTION(FAMC)
203 <EXPECT_TAG>FAMF  MKTAGACTION(FAMF)
204 <EXPECT_TAG>FAMS  MKTAGACTION(FAMS)
205 <EXPECT_TAG>FCOM  MKTAGACTION(FCOM)
206 <EXPECT_TAG>FILE  MKTAGACTION(FILE)
207 <EXPECT_TAG>FORM  MKTAGACTION(FORM)
208 <EXPECT_TAG>GEDC  MKTAGACTION(GEDC)
209 <EXPECT_TAG>GIVN  MKTAGACTION(GIVN)
210 <EXPECT_TAG>GRAD  MKTAGACTION(GRAD)
211 <EXPECT_TAG>HEAD  MKTAGACTION(HEAD)
212 <EXPECT_TAG>HUSB  MKTAGACTION(HUSB)
213 <EXPECT_TAG>IDNO  MKTAGACTION(IDNO)
214 <EXPECT_TAG>IMMI  MKTAGACTION(IMMI)
215 <EXPECT_TAG>INDI  MKTAGACTION(INDI)
216 <EXPECT_TAG>LANG  MKTAGACTION(LANG)
217 <EXPECT_TAG>LEGA  MKTAGACTION(LEGA)
218 <EXPECT_TAG>MARB  MKTAGACTION(MARB)
219 <EXPECT_TAG>MARC  MKTAGACTION(MARC)
220 <EXPECT_TAG>MARL  MKTAGACTION(MARL)
221 <EXPECT_TAG>MARR  MKTAGACTION(MARR)
222 <EXPECT_TAG>MARS  MKTAGACTION(MARS)
223 <EXPECT_TAG>MEDI  MKTAGACTION(MEDI)
224 <EXPECT_TAG>NAME  MKTAGACTION(NAME)
225 <EXPECT_TAG>NATI  MKTAGACTION(NATI)
226 <EXPECT_TAG>NATU  MKTAGACTION(NATU)
227 <EXPECT_TAG>NCHI  MKTAGACTION(NCHI)
228 <EXPECT_TAG>NICK  MKTAGACTION(NICK)
229 <EXPECT_TAG>NMR   MKTAGACTION(NMR)
230 <EXPECT_TAG>NOTE  MKTAGACTION(NOTE)
231 <EXPECT_TAG>NPFX  MKTAGACTION(NPFX)
232 <EXPECT_TAG>NSFX  MKTAGACTION(NSFX)
233 <EXPECT_TAG>OBJE  MKTAGACTION(OBJE)
234 <EXPECT_TAG>OCCU  MKTAGACTION(OCCU)
235 <EXPECT_TAG>ORDI  MKTAGACTION(ORDI)
236 <EXPECT_TAG>ORDN  MKTAGACTION(ORDN)
237 <EXPECT_TAG>PAGE  MKTAGACTION(PAGE)
238 <EXPECT_TAG>PEDI  MKTAGACTION(PEDI)
239 <EXPECT_TAG>PHON  MKTAGACTION(PHON)
240 <EXPECT_TAG>PLAC  MKTAGACTION(PLAC)
241 <EXPECT_TAG>POST  MKTAGACTION(POST)
242 <EXPECT_TAG>PROB  MKTAGACTION(PROB)
243 <EXPECT_TAG>PROP  MKTAGACTION(PROP)
244 <EXPECT_TAG>PUBL  MKTAGACTION(PUBL)
245 <EXPECT_TAG>QUAY  MKTAGACTION(QUAY)
246 <EXPECT_TAG>REFN  MKTAGACTION(REFN)
247 <EXPECT_TAG>RELA  MKTAGACTION(RELA)
248 <EXPECT_TAG>RELI  MKTAGACTION(RELI)
249 <EXPECT_TAG>REPO  MKTAGACTION(REPO)
250 <EXPECT_TAG>RESI  MKTAGACTION(RESI)
251 <EXPECT_TAG>RESN  MKTAGACTION(RESN)
252 <EXPECT_TAG>RETI  MKTAGACTION(RETI)
253 <EXPECT_TAG>RFN   MKTAGACTION(RFN)
254 <EXPECT_TAG>RIN   MKTAGACTION(RIN)
255 <EXPECT_TAG>ROLE  MKTAGACTION(ROLE)
256 <EXPECT_TAG>SEX   MKTAGACTION(SEX)
257 <EXPECT_TAG>SLGC  MKTAGACTION(SLGC)
258 <EXPECT_TAG>SLGS  MKTAGACTION(SLGS)
259 <EXPECT_TAG>SOUR  MKTAGACTION(SOUR)
260 <EXPECT_TAG>SPFX  MKTAGACTION(SPFX)
261 <EXPECT_TAG>SSN   MKTAGACTION(SSN)
262 <EXPECT_TAG>STAE  MKTAGACTION(STAE)
263 <EXPECT_TAG>STAT  MKTAGACTION(STAT)
264 <EXPECT_TAG>SUBM  MKTAGACTION(SUBM)
265 <EXPECT_TAG>SUBN  MKTAGACTION(SUBN)
266 <EXPECT_TAG>SURN  MKTAGACTION(SURN)
267 <EXPECT_TAG>TEMP  MKTAGACTION(TEMP)
268 <EXPECT_TAG>TEXT  MKTAGACTION(TEXT)
269 <EXPECT_TAG>TIME  MKTAGACTION(TIME)
270 <EXPECT_TAG>TITL  MKTAGACTION(TITL)
271 <EXPECT_TAG>TRLR  MKTAGACTION(TRLR)
272 <EXPECT_TAG>TYPE  MKTAGACTION(TYPE)
273 <EXPECT_TAG>VERS  MKTAGACTION(VERS)
274 <EXPECT_TAG>WIFE  MKTAGACTION(WIFE)
275 <EXPECT_TAG>WILL  MKTAGACTION(WILL)
276      
<EXPECT_TAG>{alphanum}+ {
    /* Any other alphanumeric word in tag position is a user-defined tag.
       The standard tag rules are listed earlier, so they win when both
       match the same text. */
    if (strlen(yytext) > MAXGEDCTAGLEN) {
      /* The format string takes the offending tag and the limit; the
         original call passed neither argument. */
      gedcom_error("Tag '%s' too long, max %d chars",
                   yytext, MAXGEDCTAGLEN);
      return BADTOKEN;
    }
    /* The length check above guarantees the copy is NUL-terminated
       within MAXGEDCTAGLEN+1 bytes. */
    strncpy(string_buf, yytext, MAXGEDCTAGLEN+1);
    gedcom_lval.string = TO_INTERNAL(string_buf);
    BEGIN(NORMAL);
    return USERTAG;
}
286
{delim} {
    /* A single space is a token of its own; its converted text is the
       token value. */
    gedcom_lval.string = TO_INTERNAL(yytext);
    return DELIM;
}
290
{any_but_delim} {
    /* One character other than the space delimiter (or the doubled "@@").
       Due to character conversions, it is possible that the current
       character will be combined with the next, so the conversion may
       not have produced a character yet.  In that case nothing is
       returned and scanning simply continues.
       This is only applicable to the 1byte case (e.g. ANSEL). */
    gedcom_lval.string = TO_INTERNAL(yytext);
    if (gedcom_lval.string[0] != '\0')
      return ANYCHAR;
}
300
{escape}/{non_at} {
    /* Escape sequence "@#...@".  The trailing context requires a non-'@'
       character to follow; that character is not part of the match. */
    gedcom_lval.string = TO_INTERNAL(yytext);
    return ESCAPE;
}
304
{pointer} {
    /* Cross-reference pointer "@...@"; the converted text, including both
       '@' characters, becomes the token value. */
    gedcom_lval.string = TO_INTERNAL(yytext);
    return POINTER;
}
308
309    /* Due to the conversion of level numbers into brackets, the
310       terminator is not important, so no token is returned here.
311       Although not strictly according to the GEDCOM spec, we'll ignore
312       whitespace just before the terminator.
313    */
314
{gen_delim}*{terminator} {
    /* End of a GEDCOM line, optionally preceded by spaces/tabs: count the
       line and go back to expecting a level number.  No token is
       returned. */
    line_no++;
    BEGIN(INITIAL);
}
316
317    /* Eventually we have to return 1 closing bracket (for the trailer).
318       We can detect whether we have sent the closing bracket using the
319       level_diff (at eof, first it is 2, then we increment it ourselves) */
320
<<EOF>> {
    /* level_diff is 2 once all brackets of the last line were handed out.
       The first time EOF is seen one final CLOSE is returned and
       level_diff is bumped to 3, so the next time the scanner really
       terminates (yyterminate() returns from the scanner). */
    if (level_diff != 2) {
      yyterminate();
    }
    level_diff++;
    return CLOSE;
}
329
.  {
     /* Catch-all for any character no other rule accepts.
        yytext[0] is a plain char: on platforms where char is signed, a
        byte such as 0xFF would be sign-extended and printed as
        0xffffffff.  Going through unsigned char prints the real byte
        value and gives %x the unsigned argument it expects. */
     gedcom_error("Unexpected character: '%s' (0x%02x)",
                  yytext, (unsigned int)(unsigned char)yytext[0]);
     return BADTOKEN;
   }
334
335 %%
336
/* Called by the generated scanner when it reaches the end of its input.
   Returning 1 (non-zero) tells it that there is no further input to
   continue with.  Declared with an explicit (void) parameter list so the
   definition is a real prototype. */
int yywrap(void)
{
  return 1;
}
341
342 #ifdef LEXER_TEST
343 int main()
344 {
345   int tok, res;
346   init_encodings();
347   set_encoding_width(ONE_BYTE);
348   res = open_conv_to_internal("ASCII");
349   if (!res) {
350     gedcom_error("Unable to open conversion context: %s",
351                  strerror(errno));
352     return 1;
353   }
354   tok = gedcom_1byte_lex();
355   while (tok) {
356     switch(tok) {
357       case BADTOKEN: printf("BADTOKEN "); break;
358       case OPEN: printf("OPEN "); break;
359       case CLOSE: printf("CLOSE "); break;
360       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
361       case DELIM: printf("DELIM "); break;
362       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
363       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
364       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break;
365       default: printf("TAG(%s) ", gedcom_lval.string); break;
366     }
367     tok = gedcom_1byte_lex();
368   }
369   printf("\n");
370   close_conv_to_internal();
371   return 0;
372 }
373 #endif