Added help text; better debug options.
[gedcom-parse.git] / gedcom.lex
1 /* $Id$ */
2 /* $Name$ */
3
%{
#include <stdlib.h>
#include <string.h>

#include "gedcom.tab.h"
#include "gedcom.h"
%}
8
/* Inclusive start conditions.  EXPECT_TAG is entered once a level
   number has been scanned, NORMAL once the tag has been scanned;
   INITIAL is re-entered at every line terminator. */
%s NORMAL EXPECT_TAG

/* Elementary character classes. */
alpha         [a-zA-Z_]
digit         [0-9]
delim         " "
tab           \t
hash          #
literal_at    @@
otherchar     [\x21-\x22\x24-\x2F\x3A-\x3F\x5B-\x5E\x60\x7B-\x7E\x80-\xFE]
terminator    \x0D|\x0A|\x0D\x0A|\x0A\x0D

/* Composite classes built from the elementary ones. */
any_char      {alpha}|{digit}|{otherchar}|{delim}|{hash}|{literal_at}
any_but_delim {alpha}|{digit}|{otherchar}|{hash}|{literal_at}
non_at        {alpha}|{digit}|{otherchar}|{delim}|{hash}
alphanum      {alpha}|{digit}
gen_delim     {delim}|{tab}

/* Multi-character tokens delimited by at-signs. */
escape        @#{any_char}+@
pointer       @{alphanum}{non_at}+@
29
%{
/* Level number of the most recently scanned line; -1 until the first
   level number has been seen. */
int current_level = -1;

/* Bracket bookkeeping: a value below 1 means CLOSE tokens are still
   owed, exactly 1 means an OPEN token is owed, anything higher means
   nothing is pending. */
int level_diff = MAXGEDCLEVEL;

/* Starts at 1 and is incremented for every line terminator scanned. */
int line_no = 1;
%}
35
36 %%
37
38     /* The GEDCOM level number is converted into a sequence of opening
39        and closing brackets.  Simply put, the following GEDCOM fragment:
40
41          0 HEAD
42          1 SOUR genes
43          2 VERS 1.6
44          2 NAME Genes
45          1 DATE 07 OCT 2001
46          ...
47          0 TRLR
48
49        is converted into:
50
51          { HEAD                     (initial)  
52          { SOUR genes               (1 higher: no closing brackets)
53          { VERS 1.6                 (1 higher: no closing brackets)
54          } { NAME Genes             (same level: 1 closing bracket)
55          } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
56          ...
57          } { TRLR }
58
59        or more clearly:
60
61          { HEAD
62            { SOUR genes
63              { VERS 1.6 }
64              { NAME Genes } }
65            { DATE 07 OCT 2001
66          ... }
67          { TRLR }
68
69        But because this means that one token is converted into a series
70        of tokens, there is some initial code following immediately here
71        that returns "pending" tokens. */
72
%{
/* Backing store for the text of a USERTAG token.  This declaration is
   copied into the body of the scanning routine, so it must have static
   storage duration: the address is handed to the caller through
   gedcom_lval.string and is read after the routine has returned, when
   an automatic array would no longer exist. */
static char string_buf[MAXGEDCLINELEN+1];

/* Executed on every entry to the scanning routine: hand out brackets
   that are still owed from the last level number before looking at any
   new input.  One CLOSE is returned per call while level_diff is below
   1, followed by a single OPEN when it reaches 1. */
if (level_diff < 1) {
  level_diff++;
  return CLOSE;
}
else if (level_diff == 1) {
  level_diff++;
  return OPEN;
}
else {
  /* out of brackets: carry on with normal scanning */
}

/* Action shared by all predefined tags: expose the matched text, switch
   to the NORMAL start condition and return the tag-specific token. */
#define MKTAGACTION(tag) \
  { gedcom_lval.string = gedcom_text; \
    BEGIN(NORMAL); \
    return TAG_##tag; }

%}
94
<INITIAL>{gen_delim}*   { /* skip blanks and tabs in front of the level number */ }

<INITIAL>0{digit}+      {
                          /* A multi-digit level number must not start with 0. */
                          gedcom_error ("Level number with leading zero");
                          return BADTOKEN;
                        }
100
<INITIAL>{digit}+ { /* Convert the level number into brackets.  The first
                       bracket is returned here; any further ones are
                       returned by the code at the top of the scanning
                       routine on the following calls.
                       strtol is used instead of atoi because it saturates
                       on overflow, so an over-long digit string is
                       reliably reported as out of range. */
                    long parsed = strtol(gedcom_text, NULL, 10);
                    int level;
                    if ((parsed < 0) || (parsed > MAXGEDCLEVEL)) {
                      gedcom_error ("Level number out of range [0..%d]",
                                    MAXGEDCLEVEL);
                      return BADTOKEN;
                    }
                    level = (int) parsed;
                    level_diff = level - current_level;
                    BEGIN(EXPECT_TAG);
                    current_level = level;
                    if (level_diff < 1) {
                      /* same or lower level: close the previous record(s) */
                      level_diff++;
                      return CLOSE;
                    }
                    else if (level_diff == 1) {
                      /* exactly one level deeper: just open a new record */
                      level_diff++;
                      return OPEN;
                    }
                    else {
                      /* should never happen (error to GEDCOM spec) */
                      gedcom_error ("GEDCOM level number is %d higher than "
                                    "previous",
                                    level_diff);
                      return BADTOKEN;
                    }
                  }
126
<EXPECT_TAG>{
    /* Predefined tags.  Each one is only recognised in the EXPECT_TAG
       start condition and is handled by the MKTAGACTION macro, which
       returns the matching TAG_ token.  The order of the rules is
       unchanged. */
  ABBR  MKTAGACTION(ABBR)
  ADDR  MKTAGACTION(ADDR)
  ADR1  MKTAGACTION(ADR1)
  ADR2  MKTAGACTION(ADR2)
  ADOP  MKTAGACTION(ADOP)
  AFN   MKTAGACTION(AFN)
  AGE   MKTAGACTION(AGE)
  AGNC  MKTAGACTION(AGNC)
  ALIA  MKTAGACTION(ALIA)
  ANCE  MKTAGACTION(ANCE)
  ANCI  MKTAGACTION(ANCI)
  ANUL  MKTAGACTION(ANUL)
  ASSO  MKTAGACTION(ASSO)
  AUTH  MKTAGACTION(AUTH)
  BAPL  MKTAGACTION(BAPL)
  BAPM  MKTAGACTION(BAPM)
  BARM  MKTAGACTION(BARM)
  BASM  MKTAGACTION(BASM)
  BIRT  MKTAGACTION(BIRT)
  BLES  MKTAGACTION(BLES)
  BLOB  MKTAGACTION(BLOB)
  BURI  MKTAGACTION(BURI)
  CALN  MKTAGACTION(CALN)
  CAST  MKTAGACTION(CAST)
  CAUS  MKTAGACTION(CAUS)
  CENS  MKTAGACTION(CENS)
  CHAN  MKTAGACTION(CHAN)
  CHAR  MKTAGACTION(CHAR)
  CHIL  MKTAGACTION(CHIL)
  CHR   MKTAGACTION(CHR)
  CHRA  MKTAGACTION(CHRA)
  CITY  MKTAGACTION(CITY)
  CONC  MKTAGACTION(CONC)
  CONF  MKTAGACTION(CONF)
  CONL  MKTAGACTION(CONL)
  CONT  MKTAGACTION(CONT)
  COPR  MKTAGACTION(COPR)
  CORP  MKTAGACTION(CORP)
  CREM  MKTAGACTION(CREM)
  CTRY  MKTAGACTION(CTRY)
  DATA  MKTAGACTION(DATA)
  DATE  MKTAGACTION(DATE)
  DEAT  MKTAGACTION(DEAT)
  DESC  MKTAGACTION(DESC)
  DESI  MKTAGACTION(DESI)
  DEST  MKTAGACTION(DEST)
  DIV   MKTAGACTION(DIV)
  DIVF  MKTAGACTION(DIVF)
  DSCR  MKTAGACTION(DSCR)
  EDUC  MKTAGACTION(EDUC)
  EMIG  MKTAGACTION(EMIG)
  ENDL  MKTAGACTION(ENDL)
  ENGA  MKTAGACTION(ENGA)
  EVEN  MKTAGACTION(EVEN)
  FAM   MKTAGACTION(FAM)
  FAMC  MKTAGACTION(FAMC)
  FAMF  MKTAGACTION(FAMF)
  FAMS  MKTAGACTION(FAMS)
  FCOM  MKTAGACTION(FCOM)
  FILE  MKTAGACTION(FILE)
  FORM  MKTAGACTION(FORM)
  GEDC  MKTAGACTION(GEDC)
  GIVN  MKTAGACTION(GIVN)
  GRAD  MKTAGACTION(GRAD)
  HEAD  MKTAGACTION(HEAD)
  HUSB  MKTAGACTION(HUSB)
  IDNO  MKTAGACTION(IDNO)
  IMMI  MKTAGACTION(IMMI)
  INDI  MKTAGACTION(INDI)
  LANG  MKTAGACTION(LANG)
  LEGA  MKTAGACTION(LEGA)
  MARB  MKTAGACTION(MARB)
  MARC  MKTAGACTION(MARC)
  MARL  MKTAGACTION(MARL)
  MARR  MKTAGACTION(MARR)
  MARS  MKTAGACTION(MARS)
  MEDI  MKTAGACTION(MEDI)
  NAME  MKTAGACTION(NAME)
  NATI  MKTAGACTION(NATI)
  NATU  MKTAGACTION(NATU)
  NCHI  MKTAGACTION(NCHI)
  NICK  MKTAGACTION(NICK)
  NMR   MKTAGACTION(NMR)
  NOTE  MKTAGACTION(NOTE)
  NPFX  MKTAGACTION(NPFX)
  NSFX  MKTAGACTION(NSFX)
  OBJE  MKTAGACTION(OBJE)
  OCCU  MKTAGACTION(OCCU)
  ORDI  MKTAGACTION(ORDI)
  ORDN  MKTAGACTION(ORDN)
  PAGE  MKTAGACTION(PAGE)
  PEDI  MKTAGACTION(PEDI)
  PHON  MKTAGACTION(PHON)
  PLAC  MKTAGACTION(PLAC)
  POST  MKTAGACTION(POST)
  PROB  MKTAGACTION(PROB)
  PROP  MKTAGACTION(PROP)
  PUBL  MKTAGACTION(PUBL)
  QUAY  MKTAGACTION(QUAY)
  REFN  MKTAGACTION(REFN)
  RELA  MKTAGACTION(RELA)
  RELI  MKTAGACTION(RELI)
  REPO  MKTAGACTION(REPO)
  RESI  MKTAGACTION(RESI)
  RESN  MKTAGACTION(RESN)
  RETI  MKTAGACTION(RETI)
  RFN   MKTAGACTION(RFN)
  RIN   MKTAGACTION(RIN)
  ROLE  MKTAGACTION(ROLE)
  SEX   MKTAGACTION(SEX)
  SLGC  MKTAGACTION(SLGC)
  SLGS  MKTAGACTION(SLGS)
  SOUR  MKTAGACTION(SOUR)
  SPFX  MKTAGACTION(SPFX)
  SSN   MKTAGACTION(SSN)
  STAE  MKTAGACTION(STAE)
  STAT  MKTAGACTION(STAT)
  SUBM  MKTAGACTION(SUBM)
  SUBN  MKTAGACTION(SUBN)
  SURN  MKTAGACTION(SURN)
  TEMP  MKTAGACTION(TEMP)
  TEXT  MKTAGACTION(TEXT)
  TIME  MKTAGACTION(TIME)
  TITL  MKTAGACTION(TITL)
  TRLR  MKTAGACTION(TRLR)
  TYPE  MKTAGACTION(TYPE)
  VERS  MKTAGACTION(VERS)
  WIFE  MKTAGACTION(WIFE)
  WILL  MKTAGACTION(WILL)
}
256      
<EXPECT_TAG>{alphanum}+ { /* Any other alphanumeric word in tag position is
                             returned as a USERTAG.  Its text is copied into
                             string_buf, which is declared in the code at the
                             top of the rules section. */
                          if (strlen(gedcom_text) > MAXGEDCTAGLEN) {
                            /* the format has two conversions, so both
                               arguments must be supplied */
                            gedcom_error("Tag '%s' too long, max %d chars",
                                         gedcom_text, MAXGEDCTAGLEN);
                            return BADTOKEN;
                          }
                          /* The length check above guarantees that the
                             terminating NUL fits within the copied range. */
                          strncpy(string_buf, gedcom_text, MAXGEDCTAGLEN+1);
                          gedcom_lval.string = string_buf;
                          BEGIN(NORMAL);
                          return USERTAG;
                        }
266
{delim}            {
                     /* a single space, passed on as its own token */
                     gedcom_lval.string = gedcom_text;
                     return DELIM;
                   }

{any_but_delim}    {
                     /* one element of the any_but_delim class */
                     gedcom_lval.string = gedcom_text;
                     return ANYCHAR;
                   }

{escape}/{non_at}  {
                     /* escape sequence, matched only when the next
                        character belongs to the non_at class */
                     gedcom_lval.string = gedcom_text;
                     return ESCAPE;
                   }

{pointer}          {
                     /* cross-reference pointer enclosed in at-signs */
                     gedcom_lval.string = gedcom_text;
                     return POINTER;
                   }
282
283    /* Due to the conversion of level numbers into brackets, the
284       terminator is not important, so no token is returned here.
285       Although not strictly according to the GEDCOM spec, we'll ignore
286       whitespace just before the terminator.
287    */
288
289 {gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); }
290
291    /* Eventually we have to return 1 closing bracket (for the trailer).
292       We can detect whether we have sent the closing bracket using the
293       level_diff (at eof, first it is 2, then we increment it ourselves) */
294
295 <<EOF>> { if (level_diff == 2) {
296             level_diff++;
297             return CLOSE;
298           }
299           else {
300             yyterminate();
301           }
302         } 
303
.  {
     /* fallback for any character that no earlier rule accepted */
     gedcom_error("Unexpected character: '%s'", gedcom_text);
     return BADTOKEN;
   }
307
308 %%
309
/* End-of-input hook of the scanner.  It always returns 1; per the flex
   manual a non-zero result from the wrap function means there is no
   further input to continue with. */
int gedcom_wrap(void)
{
  return 1;
}