General cleanup.
[gedcom-parse.git] / gedcom_1byte.lex
1 /* $Id$ */
2 /* $Name$ */
3
4 %{
5 #include "gedcom.tab.h"
6 #include "gedcom.h"
7 #include "multilex.h"
8 #include "encoding.h"
9
10 #define YY_NO_UNPUT
11 %}
12
%s NORMAL EXPECT_TAG

/* Basic single-byte character classes. */
alpha        [A-Za-z_]
digit        [0-9]
delim        " "
tab          [\t]
hash         #
literal_at   @@
/* Every byte from 0x21 to 0xFE that is not a letter, a digit, an
   underscore, a hash sign, an at sign or DEL (0x7F). */
otherchar    [\x21-\x22\x24-\x2F\x3A-\x3F\x5B-\x5E\x60\x7B-\x7E\x80-\xFE]
/* A line ends with CR, LF, CR LF or LF CR. */
terminator   \x0D|\x0A|\x0D\x0A|\x0A\x0D

/* Unions of the classes above.  A doubled at sign counts as one
   ordinary character; a single at sign is never part of these. */
any_char     {alpha}|{digit}|{otherchar}|{delim}|{hash}|{literal_at}
any_but_delim {alpha}|{digit}|{otherchar}|{hash}|{literal_at}
non_at       {alpha}|{digit}|{otherchar}|{delim}|{hash}
alphanum     {alpha}|{digit}
gen_delim    {delim}|{tab}

/* Both composite tokens are bracketed by single at signs: an escape
   starts with an at sign followed by a hash sign, a pointer starts with
   an at sign followed by a letter, digit or underscore. */
escape       @#{any_char}+@
pointer      @{alphanum}{non_at}+@
33
%{
/* Level number of the GEDCOM line scanned most recently; -1 until the
   first level number has been seen. */
static int current_level = -1;

/* Bracket bookkeeping.  The level-number rule sets this to
   (new level - previous level); it is then incremented once for every
   bracket token returned.  Below 1 a CLOSE is still owed, at exactly 1
   an OPEN is owed, and from 2 upwards nothing is pending.  The EOF rule
   relies on the value 2 to know the final CLOSE has not been sent yet. */
static int level_diff = MAXGEDCLEVEL;

#ifdef LEXER_TEST
/* Defined here only for the stand-alone test driver at the end of this
   file (presumably provided by other modules in a normal build). */
YYSTYPE gedcom_lval;
int line_no = 1;
#endif

%}
44
45 %%
46
47     /* The GEDCOM level number is converted into a sequence of opening
48        and closing brackets.  Simply put, the following GEDCOM fragment:
49
50          0 HEAD
51          1 SOUR genes
52          2 VERS 1.6
53          2 NAME Genes
54          1 DATE 07 OCT 2001
55          ...
56          0 TRLR
57
58        is converted into:
59
60          { HEAD                     (initial)  
61          { SOUR genes               (1 higher: no closing brackets)
62          { VERS 1.6                 (1 higher: no closing brackets)
63          } { NAME Genes             (same level: 1 closing bracket)
64          } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
65          ...
66          } { TRLR }
67
68        or more clearly:
69
70          { HEAD
71            { SOUR genes
72              { VERS 1.6 }
73              { NAME Genes } }
74            { DATE 07 OCT 2001
75          ... }
76          { TRLR }
77
78        But because this means that one token is converted into a series
79        of tokens, there is some initial code following immediately here
80        that returns "pending" tokens. */
81
%{
/* This block is placed at the top of the scanning routine, so it runs on
   every call before any input is matched. */

/* Scratch buffer used by the user-defined tag rule. */
char string_buf[MAXGEDCLINELEN+1];

/* Hand out the brackets still owed for the last level number, one per
   call: first every pending CLOSE, then the single OPEN. */
if (level_diff < 1) {
  ++level_diff;
  return CLOSE;
}
if (level_diff == 1) {
  ++level_diff;
  return OPEN;
}
/* level_diff > 1: out of brackets, carry on with normal scanning */

/* Passes str together with the length of the current match to
   to_internal() (presumably a conversion to the internal encoding --
   see encoding.h to confirm). */
#define TO_INTERNAL(str) to_internal((str), yyleng)

/* Action for a standard tag keyword: store the converted tag text,
   switch to the NORMAL start condition and return the TAG_ token named
   after the tag.  Must stay a brace block: it is used as a complete
   rule action without a trailing semicolon. */
#define MKTAGACTION(tag) \
  { gedcom_lval.string = TO_INTERNAL(yytext); \
    BEGIN(NORMAL); \
    return TAG_##tag; }

%}
105
<INITIAL>{gen_delim}* { /* blanks and tabs before the level number are skipped */ }

<INITIAL>0{digit}+ {
                     /* a zero followed by more digits: level numbers may
                        not be zero-padded */
                     gedcom_error ("Level number with leading zero");
                     return BADTOKEN;
                   }
111
<INITIAL>{digit}+ { /* Level number at the start of a line.  The step from the
                       previous level is turned into bracket tokens: the first
                       one is returned here, the remaining ones are handed out
                       by the pending-token code at the top of the scanner. */
                    /* strtol rather than atoi: an overlong digit string
                       saturates at LONG_MAX and is then rejected by the range
                       check, whereas atoi has undefined behaviour when the
                       value does not fit. */
                    long level = strtol(TO_INTERNAL(yytext), NULL, 10);
                    if ((level < 0) || (level > MAXGEDCLEVEL)) {
                      gedcom_error ("Level number out of range [0..%d]",
                                    MAXGEDCLEVEL);
                      return BADTOKEN;
                    }
                    /* level is now known to fit in an int */
                    level_diff = (int) level - current_level;
                    BEGIN(EXPECT_TAG);
                    current_level = (int) level;
                    if (level_diff < 1) {
                      /* same or lower level: close the previous record(s) */
                      level_diff++;
                      return CLOSE;
                    }
                    else if (level_diff == 1) {
                      /* exactly one level deeper: only an OPEN is needed */
                      level_diff++;
                      return OPEN;
                    }
                    else {
                      /* should never happen (error to GEDCOM spec) */
                      gedcom_error ("GEDCOM level number is %d higher than "
                                    "previous",
                                    level_diff);
                      return BADTOKEN;
                    }
                  }
137
<EXPECT_TAG>ABBR    MKTAGACTION(ABBR)
<EXPECT_TAG>ADDR    MKTAGACTION(ADDR)
<EXPECT_TAG>ADOP    MKTAGACTION(ADOP)
<EXPECT_TAG>ADR1    MKTAGACTION(ADR1)
<EXPECT_TAG>ADR2    MKTAGACTION(ADR2)
<EXPECT_TAG>AFN     MKTAGACTION(AFN)
<EXPECT_TAG>AGE     MKTAGACTION(AGE)
<EXPECT_TAG>AGNC    MKTAGACTION(AGNC)
<EXPECT_TAG>ALIA    MKTAGACTION(ALIA)
<EXPECT_TAG>ANCE    MKTAGACTION(ANCE)
<EXPECT_TAG>ANCI    MKTAGACTION(ANCI)
<EXPECT_TAG>ANUL    MKTAGACTION(ANUL)
<EXPECT_TAG>ASSO    MKTAGACTION(ASSO)
<EXPECT_TAG>AUTH    MKTAGACTION(AUTH)
<EXPECT_TAG>BAPL    MKTAGACTION(BAPL)
<EXPECT_TAG>BAPM    MKTAGACTION(BAPM)
<EXPECT_TAG>BARM    MKTAGACTION(BARM)
<EXPECT_TAG>BASM    MKTAGACTION(BASM)
<EXPECT_TAG>BIRT    MKTAGACTION(BIRT)
<EXPECT_TAG>BLES    MKTAGACTION(BLES)
<EXPECT_TAG>BLOB    MKTAGACTION(BLOB)
<EXPECT_TAG>BURI    MKTAGACTION(BURI)
<EXPECT_TAG>CALN    MKTAGACTION(CALN)
<EXPECT_TAG>CAST    MKTAGACTION(CAST)
<EXPECT_TAG>CAUS    MKTAGACTION(CAUS)
<EXPECT_TAG>CENS    MKTAGACTION(CENS)
<EXPECT_TAG>CHAN    MKTAGACTION(CHAN)
<EXPECT_TAG>CHAR    MKTAGACTION(CHAR)
<EXPECT_TAG>CHIL    MKTAGACTION(CHIL)
<EXPECT_TAG>CHR     MKTAGACTION(CHR)
<EXPECT_TAG>CHRA    MKTAGACTION(CHRA)
<EXPECT_TAG>CITY    MKTAGACTION(CITY)
<EXPECT_TAG>CONC    MKTAGACTION(CONC)
<EXPECT_TAG>CONF    MKTAGACTION(CONF)
<EXPECT_TAG>CONL    MKTAGACTION(CONL)
<EXPECT_TAG>CONT    MKTAGACTION(CONT)
<EXPECT_TAG>COPR    MKTAGACTION(COPR)
<EXPECT_TAG>CORP    MKTAGACTION(CORP)
<EXPECT_TAG>CREM    MKTAGACTION(CREM)
<EXPECT_TAG>CTRY    MKTAGACTION(CTRY)
<EXPECT_TAG>DATA    MKTAGACTION(DATA)
<EXPECT_TAG>DATE    MKTAGACTION(DATE)
<EXPECT_TAG>DEAT    MKTAGACTION(DEAT)
<EXPECT_TAG>DESC    MKTAGACTION(DESC)
<EXPECT_TAG>DESI    MKTAGACTION(DESI)
<EXPECT_TAG>DEST    MKTAGACTION(DEST)
<EXPECT_TAG>DIV     MKTAGACTION(DIV)
<EXPECT_TAG>DIVF    MKTAGACTION(DIVF)
<EXPECT_TAG>DSCR    MKTAGACTION(DSCR)
<EXPECT_TAG>EDUC    MKTAGACTION(EDUC)
<EXPECT_TAG>EMIG    MKTAGACTION(EMIG)
<EXPECT_TAG>ENDL    MKTAGACTION(ENDL)
<EXPECT_TAG>ENGA    MKTAGACTION(ENGA)
<EXPECT_TAG>EVEN    MKTAGACTION(EVEN)
<EXPECT_TAG>FAM     MKTAGACTION(FAM)
<EXPECT_TAG>FAMC    MKTAGACTION(FAMC)
<EXPECT_TAG>FAMF    MKTAGACTION(FAMF)
<EXPECT_TAG>FAMS    MKTAGACTION(FAMS)
<EXPECT_TAG>FCOM    MKTAGACTION(FCOM)
<EXPECT_TAG>FILE    MKTAGACTION(FILE)
<EXPECT_TAG>FORM    MKTAGACTION(FORM)
<EXPECT_TAG>GEDC    MKTAGACTION(GEDC)
<EXPECT_TAG>GIVN    MKTAGACTION(GIVN)
<EXPECT_TAG>GRAD    MKTAGACTION(GRAD)
<EXPECT_TAG>HEAD    MKTAGACTION(HEAD)
<EXPECT_TAG>HUSB    MKTAGACTION(HUSB)
<EXPECT_TAG>IDNO    MKTAGACTION(IDNO)
<EXPECT_TAG>IMMI    MKTAGACTION(IMMI)
<EXPECT_TAG>INDI    MKTAGACTION(INDI)
<EXPECT_TAG>LANG    MKTAGACTION(LANG)
<EXPECT_TAG>LEGA    MKTAGACTION(LEGA)
<EXPECT_TAG>MARB    MKTAGACTION(MARB)
<EXPECT_TAG>MARC    MKTAGACTION(MARC)
<EXPECT_TAG>MARL    MKTAGACTION(MARL)
<EXPECT_TAG>MARR    MKTAGACTION(MARR)
<EXPECT_TAG>MARS    MKTAGACTION(MARS)
<EXPECT_TAG>MEDI    MKTAGACTION(MEDI)
<EXPECT_TAG>NAME    MKTAGACTION(NAME)
<EXPECT_TAG>NATI    MKTAGACTION(NATI)
<EXPECT_TAG>NATU    MKTAGACTION(NATU)
<EXPECT_TAG>NCHI    MKTAGACTION(NCHI)
<EXPECT_TAG>NICK    MKTAGACTION(NICK)
<EXPECT_TAG>NMR     MKTAGACTION(NMR)
<EXPECT_TAG>NOTE    MKTAGACTION(NOTE)
<EXPECT_TAG>NPFX    MKTAGACTION(NPFX)
<EXPECT_TAG>NSFX    MKTAGACTION(NSFX)
<EXPECT_TAG>OBJE    MKTAGACTION(OBJE)
<EXPECT_TAG>OCCU    MKTAGACTION(OCCU)
<EXPECT_TAG>ORDI    MKTAGACTION(ORDI)
<EXPECT_TAG>ORDN    MKTAGACTION(ORDN)
<EXPECT_TAG>PAGE    MKTAGACTION(PAGE)
<EXPECT_TAG>PEDI    MKTAGACTION(PEDI)
<EXPECT_TAG>PHON    MKTAGACTION(PHON)
<EXPECT_TAG>PLAC    MKTAGACTION(PLAC)
<EXPECT_TAG>POST    MKTAGACTION(POST)
<EXPECT_TAG>PROB    MKTAGACTION(PROB)
<EXPECT_TAG>PROP    MKTAGACTION(PROP)
<EXPECT_TAG>PUBL    MKTAGACTION(PUBL)
<EXPECT_TAG>QUAY    MKTAGACTION(QUAY)
<EXPECT_TAG>REFN    MKTAGACTION(REFN)
<EXPECT_TAG>RELA    MKTAGACTION(RELA)
<EXPECT_TAG>RELI    MKTAGACTION(RELI)
<EXPECT_TAG>REPO    MKTAGACTION(REPO)
<EXPECT_TAG>RESI    MKTAGACTION(RESI)
<EXPECT_TAG>RESN    MKTAGACTION(RESN)
<EXPECT_TAG>RETI    MKTAGACTION(RETI)
<EXPECT_TAG>RFN     MKTAGACTION(RFN)
<EXPECT_TAG>RIN     MKTAGACTION(RIN)
<EXPECT_TAG>ROLE    MKTAGACTION(ROLE)
<EXPECT_TAG>SEX     MKTAGACTION(SEX)
<EXPECT_TAG>SLGC    MKTAGACTION(SLGC)
<EXPECT_TAG>SLGS    MKTAGACTION(SLGS)
<EXPECT_TAG>SOUR    MKTAGACTION(SOUR)
<EXPECT_TAG>SPFX    MKTAGACTION(SPFX)
<EXPECT_TAG>SSN     MKTAGACTION(SSN)
<EXPECT_TAG>STAE    MKTAGACTION(STAE)
<EXPECT_TAG>STAT    MKTAGACTION(STAT)
<EXPECT_TAG>SUBM    MKTAGACTION(SUBM)
<EXPECT_TAG>SUBN    MKTAGACTION(SUBN)
<EXPECT_TAG>SURN    MKTAGACTION(SURN)
<EXPECT_TAG>TEMP    MKTAGACTION(TEMP)
<EXPECT_TAG>TEXT    MKTAGACTION(TEXT)
<EXPECT_TAG>TIME    MKTAGACTION(TIME)
<EXPECT_TAG>TITL    MKTAGACTION(TITL)
<EXPECT_TAG>TRLR    MKTAGACTION(TRLR)
<EXPECT_TAG>TYPE    MKTAGACTION(TYPE)
<EXPECT_TAG>VERS    MKTAGACTION(VERS)
<EXPECT_TAG>WIFE    MKTAGACTION(WIFE)
<EXPECT_TAG>WILL    MKTAGACTION(WILL)
    /* Each keyword rule above is only active directly after a level
       number.  Through MKTAGACTION it stores the tag text, returns the
       TAG_ token of the same name and moves the scanner to NORMAL. */
267      
<EXPECT_TAG>{alphanum}+ { /* Any other run of letters, digits and underscores after
                             the level number is a user-defined tag.  A standard
                             keyword of the same length is matched by its own
                             rule, because those rules come first. */
                          if (strlen(yytext) > MAXGEDCTAGLEN) {
                            /* supply the two values the format string asks
                               for: the offending tag and the limit */
                            gedcom_error("Tag '%s' too long, max %d chars",
                                         yytext, MAXGEDCTAGLEN);
                            return BADTOKEN;
                          }
                          /* the length check above guarantees that the copy
                             includes the terminating NUL */
                          strncpy(string_buf, yytext, MAXGEDCTAGLEN+1);
                          gedcom_lval.string = TO_INTERNAL(string_buf);
                          BEGIN(NORMAL);
                          return USERTAG;
                        }
277
{delim}      {
               /* a single space is reported as a token of its own */
               gedcom_lval.string = TO_INTERNAL(yytext);
               return DELIM;
             }

{any_but_delim} {
                  /* Character conversion may merge this byte with the one
                     that follows (1byte encodings such as ANSEL only).  In
                     that case the converted text is still empty, so no
                     token is returned and scanning simply continues. */
                  gedcom_lval.string = TO_INTERNAL(yytext);
                  if (gedcom_lval.string[0] != '\0')
                    return ANYCHAR;
                }

{escape}/{non_at}  {
                     /* an escape sequence only counts when the character
                        after it is not an at sign (trailing context, not
                        consumed) */
                     gedcom_lval.string = TO_INTERNAL(yytext);
                     return ESCAPE;
                   }

{pointer}    {
               /* cross-reference identifier enclosed in at signs */
               gedcom_lval.string = TO_INTERNAL(yytext);
               return POINTER;
             }
299
300    /* Due to the conversion of level numbers into brackets, the
301       terminator is not important, so no token is returned here.
302       Although not strictly according to the GEDCOM spec, we'll ignore
303       whitespace just before the terminator.
304    */
305
{gen_delim}*{terminator} {
                           /* end of line: count it and expect a level
                              number again; no token is returned */
                           line_no++;
                           BEGIN(INITIAL);
                         }
307
308    /* Eventually we have to return 1 closing bracket (for the trailer).
309       We can detect whether we have sent the closing bracket using the
310       level_diff (at eof, first it is 2, then we increment it ourselves) */
311
<<EOF>> { /* level_diff is 2 when end of input is first reached after a
             complete line; bumping it past 2 marks the final CLOSE as
             sent, so the next call really terminates. */
          if (level_diff != 2) {
            yyterminate();
          }
          else {
            level_diff++;
            return CLOSE;
          }
        }
320
.  { /* No other rule matched this byte.  It is cast through unsigned
        char so that a value of 0x80 or above is not sign-extended in
        the hex output on platforms where plain char is signed. */
     gedcom_error("Unexpected character: '%s' (0x%02x)",
                  yytext, (unsigned char) yytext[0]);
     return BADTOKEN;
   }
325
326 %%
327
/* End-of-input hook of the scanner: returning 1 reports that there is no
   further input to continue with. */
int yywrap(void)
{
  return 1;
}
332
#ifdef LEXER_TEST
/* Stand-alone test driver, built only when LEXER_TEST is defined.
   Opens an ASCII conversion context, runs the one-byte scanner until it
   returns 0 and prints one word per token.  Returns 1 when the
   conversion context cannot be opened, 0 otherwise. */
int main(void)
{
  int tok, res;
  init_encodings();
  set_encoding_width(ONE_BYTE);
  res = open_conv_to_internal("ASCII");
  if (!res) {
    gedcom_error("Unable to open conversion context: %s",
                 strerror(errno));
    return 1;
  }
  for (tok = gedcom_1byte_lex(); tok; tok = gedcom_1byte_lex()) {
    switch(tok) {
      case BADTOKEN: printf("BADTOKEN "); break;
      case OPEN: printf("OPEN "); break;
      case CLOSE: printf("CLOSE "); break;
      case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
      case DELIM: printf("DELIM "); break;
      case ANYCHAR: printf("%s ", gedcom_lval.string); break;
      case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
      case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break;
      /* every remaining token is one of the standard TAG_ keywords */
      default: printf("TAG(%s) ", gedcom_lval.string); break;
    }
  }
  printf("\n");
  close_conv_to_internal();
  return 0;
}
#endif