6d88b435ed979f3daa309bc10e96417fd0cf354a
[gedcom-parse.git] / gedcom_lohi.lex
1 /* $Id$ */
2 /* $Name$ */
3
4 /* In low-high order, a space is encoded as 0x20 0x00 */
5 /* i.e. this is utf-16-le */
6
7 %{
8 #include "gedcom.tab.h"
9 #include "gedcom.h"
10 #include "multilex.h"
11 #include "encoding.h"
12 %}
13
/* Start conditions.  Both are declared with %s (inclusive), so rules
   written without a <...> prefix remain active in them.  The rules below
   switch to EXPECT_TAG after a level number, to NORMAL after a tag, and
   back to INITIAL at the end of a line. */
14 %s NORMAL
15 %s EXPECT_TAG
16
/* Every pattern below spells out two-byte units in low-high order: the
   low byte comes first, followed by the high byte (\x00 for the ASCII
   characters). */
17 alpha        [A-Za-z_]\x00
18 digit        [0-9]\x00
19 delim        \x20\x00
20 tab          [\t]\x00
21 hash         #\x00
/* literal_at is a doubled '@', i.e. two units (four bytes). */
22 literal_at   @\x00@\x00
/* otherchar, first alternative: a unit with a zero high byte whose low
   byte is in one of the listed ranges (these ranges leave out the space,
   '#', '@', '_', the digits, the letters, the control codes and 0x7F).
   Second alternative: any unit whose high byte is non-zero. */
23 otherchar    [\x21-\x22\x24-\x2F\x3A-\x3F\x5B-\x5E\x60\x7B-\x7E\x80-\xFF]\x00|[\x00-\xFF][\x01-\xFF]
/* terminator accepts CR, LF, CR LF and LF CR, each as two-byte units. */
24 terminator   \x0D\x00|\x0A\x00|\x0D\x00\x0A\x00|\x0A\x00\x0D\x00
25
/* Character classes built from the units above:
     any_char       every unit class defined above except tab
     any_but_delim  any_char without the space
     non_at         any_char without the doubled '@'
     alphanum       letters, '_' and digits
     gen_delim      space or tab */
26 any_char     {alpha}|{digit}|{otherchar}|{delim}|{hash}|{literal_at}
27 any_but_delim {alpha}|{digit}|{otherchar}|{hash}|{literal_at}
28 non_at       {alpha}|{digit}|{otherchar}|{delim}|{hash}
29 alphanum     {alpha}|{digit}
30 gen_delim    {delim}|{tab}
31
/* escape:  '@' '#' followed by one or more any_char units and a closing '@'.
   pointer: '@', one alphanum unit, one or more non_at units, closing '@'. */
32 escape       @\x00#\x00{any_char}+@\x00
33 pointer      @\x00{alphanum}{non_at}+@\x00
34
35 %{
   /* Level of the most recently scanned line; -1 until a level number has
      been scanned. */
36 static int current_level=-1;
   /* Difference between the newest level and the previous one, used as a
      counter of bracket tokens still to be returned: below 1 a CLOSE is
      pending, exactly 1 an OPEN is pending, 2 or more nothing is pending.
      NOTE(review): the initial value relies on MAXGEDCLEVEL being at
      least 2 so that nothing is pending at start-up -- confirm in
      gedcom.h. */
37 static int level_diff=MAXGEDCLEVEL;
38  
   /* The stand-alone test build defines these two objects itself.
      NOTE(review): in the normal build they are presumably provided by
      the parser / library -- confirm. */
39 #ifdef LEXER_TEST 
40 YYSTYPE gedcom_lval;
41 int line_no = 1; 
42 #endif
43 %} 
44
45 %%
46
47     /* The GEDCOM level number is converted into a sequence of opening
48        and closing brackets.  Simply put, the following GEDCOM fragment:
49
50          0 HEAD
51          1 SOUR genes
52          2 VERS 1.6
53          2 NAME Genes
54          1 DATE 07 OCT 2001
55          ...
56          0 TRLR
57
58        is converted into:
59
60          { HEAD                     (initial)  
61          { SOUR genes               (1 higher: no closing brackets)
62          { VERS 1.6                 (1 higher: no closing brackets)
63          } { NAME Genes             (same level: 1 closing bracket)
64          } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
65          ...
66          } { TRLR }
67
68        or more clearly:
69
70          { HEAD
71            { SOUR genes
72              { VERS 1.6 }
73              { NAME Genes } }
74            { DATE 07 OCT 2001
75          ... }
76          { TRLR }
77
78        But because this means that one token is converted into a series
79        of tokens, there is some initial code following immediately here
80        that returns "pending" tokens. */
81
82 %{
   /* This block precedes the first rule, so flex copies it to the top of
      the scanning routine: the declaration is local to that routine and
      the statements run each time the routine is entered. */

   /* Scratch buffer used by the user-defined tag rule. */
83 char string_buf[MAXGEDCLINELEN+1];
84  
   /* Hand out bracket tokens that are still pending from the last level
      number before scanning any new input: CLOSE tokens while level_diff
      is below 1, then a single OPEN when it reaches 1.  Each returned
      token moves level_diff one step up. */
85 if (level_diff < 1) {
86   level_diff++;
87   return CLOSE;
88 }
89 else if (level_diff == 1) {
90   level_diff++;
91   return OPEN;
92 }
93 else {
94   /* out of brackets... */
95 }
96
   /* Passes the text together with the length of the current match
      (yyleng) to to_internal().
      NOTE(review): to_internal() is declared elsewhere; presumably it
      converts the UTF-16 bytes to the internal encoding -- confirm. */
97 #define TO_INTERNAL(str) to_internal(str, yyleng) 
98
   /* Action shared by all standard tag rules: store the converted tag
      text in gedcom_lval.string, switch to the NORMAL start condition and
      return the token TAG_<tag>. */
99 #define MKTAGACTION(tag) \
100   { gedcom_lval.string = TO_INTERNAL(yytext); \
101     BEGIN(NORMAL); \
102     return TAG_##tag; }
103
104 %}
105
106 <INITIAL>{gen_delim}* /* no action: spaces and tabs at the start of a line are skipped */
107
108 <INITIAL>[0]\x00{digit}+ { gedcom_error ("Level number with leading zero");
                           /* Low-high byte order: the character '0' is the
                              byte pair 0x30 0x00, so the literal zero has
                              to come before its \x00 high byte (the former
                              pattern had the two bytes swapped and could
                              never match a level such as "01").
                              This rule matches exactly as many bytes as
                              the plain level-number rule does, and wins
                              the tie because it is listed first. */
109                            return BADTOKEN;
110                          }
111
112 <INITIAL>{digit}+ { int level = atoi(TO_INTERNAL(yytext));
                    /* Level number at the start of a line.  The digits
                       are passed through TO_INTERNAL before atoi() reads
                       them.  The rule returns the first bracket token for
                       this line; further pending brackets are returned by
                       the code at the top of the scanning routine on the
                       following calls. */
113                     if ((level < 0) || (level > MAXGEDCLEVEL)) {
114                       gedcom_error ("Level number out of range [0..%d]",
115                                     MAXGEDCLEVEL);
116                       return BADTOKEN;
117                     }
                    /* level_diff <= 0: same or lower level than the
                       previous line, closing brackets come first.
                       level_diff == 1: exactly one level deeper.
                       level_diff  > 1: more than one level deeper. */
118                     level_diff = level - current_level;
119                     BEGIN(EXPECT_TAG);
                    /* Note that current_level is updated before the
                       "too deep" check below, i.e. also on the error
                       path. */
120                     current_level = level;
121                     if (level_diff < 1) {
122                       level_diff++;
123                       return CLOSE;
124                     }
125                     else if (level_diff == 1) {
126                       level_diff++;
127                       return OPEN;
128                     }
129                     else {
130                       /* should never happen (error to GEDCOM spec) */
131                       gedcom_error ("GEDCOM level number is %d higher than "
132                                     "previous",
133                                     level_diff);
134                       return BADTOKEN;
135                     }
136                   }
137
    /* Standard tags.  Each pattern spells the tag with every ASCII
       letter or digit followed by its \x00 high byte, and is only active
       in the EXPECT_TAG start condition.  MKTAGACTION(X) stores the
       converted text in gedcom_lval.string, switches to NORMAL and
       returns TAG_X.
       flex prefers the longest match, so "CHRA" is not cut short by the
       "CHR" rule, and a longer alphanumeric word that merely starts with
       one of these tags is left to the {alphanum}+ rule that follows this
       table. */
138 <EXPECT_TAG>A\x00B\x00B\x00R\x00  MKTAGACTION(ABBR)
139 <EXPECT_TAG>A\x00D\x00D\x00R\x00  MKTAGACTION(ADDR)
140 <EXPECT_TAG>A\x00D\x00R\x001\x00  MKTAGACTION(ADR1)
141 <EXPECT_TAG>A\x00D\x00R\x002\x00  MKTAGACTION(ADR2)
142 <EXPECT_TAG>A\x00D\x00O\x00P\x00  MKTAGACTION(ADOP)
143 <EXPECT_TAG>A\x00F\x00N\x00   MKTAGACTION(AFN)
144 <EXPECT_TAG>A\x00G\x00E\x00   MKTAGACTION(AGE)
145 <EXPECT_TAG>A\x00G\x00N\x00C\x00  MKTAGACTION(AGNC)
146 <EXPECT_TAG>A\x00L\x00I\x00A\x00  MKTAGACTION(ALIA)
147 <EXPECT_TAG>A\x00N\x00C\x00E\x00  MKTAGACTION(ANCE)
148 <EXPECT_TAG>A\x00N\x00C\x00I\x00  MKTAGACTION(ANCI)
149 <EXPECT_TAG>A\x00N\x00U\x00L\x00  MKTAGACTION(ANUL)
150 <EXPECT_TAG>A\x00S\x00S\x00O\x00  MKTAGACTION(ASSO)
151 <EXPECT_TAG>A\x00U\x00T\x00H\x00  MKTAGACTION(AUTH)
152 <EXPECT_TAG>B\x00A\x00P\x00L\x00  MKTAGACTION(BAPL)
153 <EXPECT_TAG>B\x00A\x00P\x00M\x00  MKTAGACTION(BAPM)
154 <EXPECT_TAG>B\x00A\x00R\x00M\x00  MKTAGACTION(BARM)
155 <EXPECT_TAG>B\x00A\x00S\x00M\x00  MKTAGACTION(BASM)
156 <EXPECT_TAG>B\x00I\x00R\x00T\x00  MKTAGACTION(BIRT)
157 <EXPECT_TAG>B\x00L\x00E\x00S\x00  MKTAGACTION(BLES)
158 <EXPECT_TAG>B\x00L\x00O\x00B\x00  MKTAGACTION(BLOB)
159 <EXPECT_TAG>B\x00U\x00R\x00I\x00  MKTAGACTION(BURI)
160 <EXPECT_TAG>C\x00A\x00L\x00N\x00  MKTAGACTION(CALN)
161 <EXPECT_TAG>C\x00A\x00S\x00T\x00  MKTAGACTION(CAST)
162 <EXPECT_TAG>C\x00A\x00U\x00S\x00  MKTAGACTION(CAUS)
163 <EXPECT_TAG>C\x00E\x00N\x00S\x00  MKTAGACTION(CENS)
164 <EXPECT_TAG>C\x00H\x00A\x00N\x00  MKTAGACTION(CHAN)
165 <EXPECT_TAG>C\x00H\x00A\x00R\x00  MKTAGACTION(CHAR)
166 <EXPECT_TAG>C\x00H\x00I\x00L\x00  MKTAGACTION(CHIL)
167 <EXPECT_TAG>C\x00H\x00R\x00   MKTAGACTION(CHR)
168 <EXPECT_TAG>C\x00H\x00R\x00A\x00  MKTAGACTION(CHRA)
169 <EXPECT_TAG>C\x00I\x00T\x00Y\x00  MKTAGACTION(CITY)
170 <EXPECT_TAG>C\x00O\x00N\x00C\x00  MKTAGACTION(CONC)
171 <EXPECT_TAG>C\x00O\x00N\x00F\x00  MKTAGACTION(CONF)
172 <EXPECT_TAG>C\x00O\x00N\x00L\x00  MKTAGACTION(CONL)
173 <EXPECT_TAG>C\x00O\x00N\x00T\x00  MKTAGACTION(CONT)
174 <EXPECT_TAG>C\x00O\x00P\x00R\x00  MKTAGACTION(COPR)
175 <EXPECT_TAG>C\x00O\x00R\x00P\x00  MKTAGACTION(CORP)
176 <EXPECT_TAG>C\x00R\x00E\x00M\x00  MKTAGACTION(CREM)
177 <EXPECT_TAG>C\x00T\x00R\x00Y\x00  MKTAGACTION(CTRY)
178 <EXPECT_TAG>D\x00A\x00T\x00A\x00  MKTAGACTION(DATA)
179 <EXPECT_TAG>D\x00A\x00T\x00E\x00  MKTAGACTION(DATE)
180 <EXPECT_TAG>D\x00E\x00A\x00T\x00  MKTAGACTION(DEAT)
181 <EXPECT_TAG>D\x00E\x00S\x00C\x00  MKTAGACTION(DESC)
182 <EXPECT_TAG>D\x00E\x00S\x00I\x00  MKTAGACTION(DESI)
183 <EXPECT_TAG>D\x00E\x00S\x00T\x00  MKTAGACTION(DEST)
184 <EXPECT_TAG>D\x00I\x00V\x00   MKTAGACTION(DIV)
185 <EXPECT_TAG>D\x00I\x00V\x00F\x00  MKTAGACTION(DIVF)
186 <EXPECT_TAG>D\x00S\x00C\x00R\x00  MKTAGACTION(DSCR)
187 <EXPECT_TAG>E\x00D\x00U\x00C\x00  MKTAGACTION(EDUC)
188 <EXPECT_TAG>E\x00M\x00I\x00G\x00  MKTAGACTION(EMIG)
189 <EXPECT_TAG>E\x00N\x00D\x00L\x00  MKTAGACTION(ENDL)
190 <EXPECT_TAG>E\x00N\x00G\x00A\x00  MKTAGACTION(ENGA)
191 <EXPECT_TAG>E\x00V\x00E\x00N\x00  MKTAGACTION(EVEN)
192 <EXPECT_TAG>F\x00A\x00M\x00   MKTAGACTION(FAM)
193 <EXPECT_TAG>F\x00A\x00M\x00C\x00  MKTAGACTION(FAMC)
194 <EXPECT_TAG>F\x00A\x00M\x00F\x00  MKTAGACTION(FAMF)
195 <EXPECT_TAG>F\x00A\x00M\x00S\x00  MKTAGACTION(FAMS)
196 <EXPECT_TAG>F\x00C\x00O\x00M\x00  MKTAGACTION(FCOM)
197 <EXPECT_TAG>F\x00I\x00L\x00E\x00  MKTAGACTION(FILE)
198 <EXPECT_TAG>F\x00O\x00R\x00M\x00  MKTAGACTION(FORM)
199 <EXPECT_TAG>G\x00E\x00D\x00C\x00  MKTAGACTION(GEDC)
200 <EXPECT_TAG>G\x00I\x00V\x00N\x00  MKTAGACTION(GIVN)
201 <EXPECT_TAG>G\x00R\x00A\x00D\x00  MKTAGACTION(GRAD)
202 <EXPECT_TAG>H\x00E\x00A\x00D\x00  MKTAGACTION(HEAD)
203 <EXPECT_TAG>H\x00U\x00S\x00B\x00  MKTAGACTION(HUSB)
204 <EXPECT_TAG>I\x00D\x00N\x00O\x00  MKTAGACTION(IDNO)
205 <EXPECT_TAG>I\x00M\x00M\x00I\x00  MKTAGACTION(IMMI)
206 <EXPECT_TAG>I\x00N\x00D\x00I\x00  MKTAGACTION(INDI)
207 <EXPECT_TAG>L\x00A\x00N\x00G\x00  MKTAGACTION(LANG)
208 <EXPECT_TAG>L\x00E\x00G\x00A\x00  MKTAGACTION(LEGA)
209 <EXPECT_TAG>M\x00A\x00R\x00B\x00  MKTAGACTION(MARB)
210 <EXPECT_TAG>M\x00A\x00R\x00C\x00  MKTAGACTION(MARC)
211 <EXPECT_TAG>M\x00A\x00R\x00L\x00  MKTAGACTION(MARL)
212 <EXPECT_TAG>M\x00A\x00R\x00R\x00  MKTAGACTION(MARR)
213 <EXPECT_TAG>M\x00A\x00R\x00S\x00  MKTAGACTION(MARS)
214 <EXPECT_TAG>M\x00E\x00D\x00I\x00  MKTAGACTION(MEDI)
215 <EXPECT_TAG>N\x00A\x00M\x00E\x00  MKTAGACTION(NAME)
216 <EXPECT_TAG>N\x00A\x00T\x00I\x00  MKTAGACTION(NATI)
217 <EXPECT_TAG>N\x00A\x00T\x00U\x00  MKTAGACTION(NATU)
218 <EXPECT_TAG>N\x00C\x00H\x00I\x00  MKTAGACTION(NCHI)
219 <EXPECT_TAG>N\x00I\x00C\x00K\x00  MKTAGACTION(NICK)
220 <EXPECT_TAG>N\x00M\x00R\x00   MKTAGACTION(NMR)
221 <EXPECT_TAG>N\x00O\x00T\x00E\x00  MKTAGACTION(NOTE)
222 <EXPECT_TAG>N\x00P\x00F\x00X\x00  MKTAGACTION(NPFX)
223 <EXPECT_TAG>N\x00S\x00F\x00X\x00  MKTAGACTION(NSFX)
224 <EXPECT_TAG>O\x00B\x00J\x00E\x00  MKTAGACTION(OBJE)
225 <EXPECT_TAG>O\x00C\x00C\x00U\x00  MKTAGACTION(OCCU)
226 <EXPECT_TAG>O\x00R\x00D\x00I\x00  MKTAGACTION(ORDI)
227 <EXPECT_TAG>O\x00R\x00D\x00N\x00  MKTAGACTION(ORDN)
228 <EXPECT_TAG>P\x00A\x00G\x00E\x00  MKTAGACTION(PAGE)
229 <EXPECT_TAG>P\x00E\x00D\x00I\x00  MKTAGACTION(PEDI)
230 <EXPECT_TAG>P\x00H\x00O\x00N\x00  MKTAGACTION(PHON)
231 <EXPECT_TAG>P\x00L\x00A\x00C\x00  MKTAGACTION(PLAC)
232 <EXPECT_TAG>P\x00O\x00S\x00T\x00  MKTAGACTION(POST)
233 <EXPECT_TAG>P\x00R\x00O\x00B\x00  MKTAGACTION(PROB)
234 <EXPECT_TAG>P\x00R\x00O\x00P\x00  MKTAGACTION(PROP)
235 <EXPECT_TAG>P\x00U\x00B\x00L\x00  MKTAGACTION(PUBL)
236 <EXPECT_TAG>Q\x00U\x00A\x00Y\x00  MKTAGACTION(QUAY)
237 <EXPECT_TAG>R\x00E\x00F\x00N\x00  MKTAGACTION(REFN)
238 <EXPECT_TAG>R\x00E\x00L\x00A\x00  MKTAGACTION(RELA)
239 <EXPECT_TAG>R\x00E\x00L\x00I\x00  MKTAGACTION(RELI)
240 <EXPECT_TAG>R\x00E\x00P\x00O\x00  MKTAGACTION(REPO)
241 <EXPECT_TAG>R\x00E\x00S\x00I\x00  MKTAGACTION(RESI)
242 <EXPECT_TAG>R\x00E\x00S\x00N\x00  MKTAGACTION(RESN)
243 <EXPECT_TAG>R\x00E\x00T\x00I\x00  MKTAGACTION(RETI)
244 <EXPECT_TAG>R\x00F\x00N\x00   MKTAGACTION(RFN)
245 <EXPECT_TAG>R\x00I\x00N\x00   MKTAGACTION(RIN)
246 <EXPECT_TAG>R\x00O\x00L\x00E\x00  MKTAGACTION(ROLE)
247 <EXPECT_TAG>S\x00E\x00X\x00   MKTAGACTION(SEX)
248 <EXPECT_TAG>S\x00L\x00G\x00C\x00  MKTAGACTION(SLGC)
249 <EXPECT_TAG>S\x00L\x00G\x00S\x00  MKTAGACTION(SLGS)
250 <EXPECT_TAG>S\x00O\x00U\x00R\x00  MKTAGACTION(SOUR)
251 <EXPECT_TAG>S\x00P\x00F\x00X\x00  MKTAGACTION(SPFX)
252 <EXPECT_TAG>S\x00S\x00N\x00   MKTAGACTION(SSN)
253 <EXPECT_TAG>S\x00T\x00A\x00E\x00  MKTAGACTION(STAE)
254 <EXPECT_TAG>S\x00T\x00A\x00T\x00  MKTAGACTION(STAT)
255 <EXPECT_TAG>S\x00U\x00B\x00M\x00  MKTAGACTION(SUBM)
256 <EXPECT_TAG>S\x00U\x00B\x00N\x00  MKTAGACTION(SUBN)
257 <EXPECT_TAG>S\x00U\x00R\x00N\x00  MKTAGACTION(SURN)
258 <EXPECT_TAG>T\x00E\x00M\x00P\x00  MKTAGACTION(TEMP)
259 <EXPECT_TAG>T\x00E\x00X\x00T\x00  MKTAGACTION(TEXT)
260 <EXPECT_TAG>T\x00I\x00M\x00E\x00  MKTAGACTION(TIME)
261 <EXPECT_TAG>T\x00I\x00T\x00L\x00  MKTAGACTION(TITL)
262 <EXPECT_TAG>T\x00R\x00L\x00R\x00  MKTAGACTION(TRLR)
263 <EXPECT_TAG>T\x00Y\x00P\x00E\x00  MKTAGACTION(TYPE)
264 <EXPECT_TAG>V\x00E\x00R\x00S\x00  MKTAGACTION(VERS)
265 <EXPECT_TAG>W\x00I\x00F\x00E\x00  MKTAGACTION(WIFE)
266 <EXPECT_TAG>W\x00I\x00L\x00L\x00  MKTAGACTION(WILL)
267      
268 <EXPECT_TAG>{alphanum}+ { if (yyleng > 2 * MAXGEDCTAGLEN) {
                            /* Any other alphanumeric word in tag position is
                               a user-defined tag.  The matched text is
                               UTF-16: every character takes two bytes and
                               the high byte of these characters is \x00, so
                               string functions such as strlen() stop after
                               the first byte.  The length is therefore taken
                               from yyleng (a byte count). */
269                             gedcom_error("Tag '%s' too long, max %d chars",
270                                          TO_INTERNAL(yytext), MAXGEDCTAGLEN);
271                             return BADTOKEN;
272                           }
                          /* Convert the matched bytes directly, as the other
                             rules do; a string copy would be cut off at the
                             first \x00 byte. */
273                           gedcom_lval.string = TO_INTERNAL(yytext);
274                           BEGIN(NORMAL);
275                           return USERTAG;
276                         }
277
278 {delim}      { gedcom_lval.string = TO_INTERNAL(yytext);
               /* A single space.  This rule and the ones below carry no
                  <...> prefix, so they apply in every start condition of
                  this scanner. */
279                return DELIM;
280              }
281
282 {any_but_delim} { gedcom_lval.string = TO_INTERNAL(yytext);
                  /* One character of line content other than a space (a
                     doubled '@' counts as one match). */
283                   return ANYCHAR;
284                 }
285
286 {escape}/{non_at}  { gedcom_lval.string = TO_INTERNAL(yytext);
                     /* The part after '/' is trailing context: it must
                        follow the escape for this rule to apply, but it is
                        not part of yytext and is scanned again afterwards. */
287                      return ESCAPE;
288                    }
289
290 {pointer}    { gedcom_lval.string = TO_INTERNAL(yytext);
               /* The text includes the enclosing '@' characters, since
                  they are part of the pattern. */
291                return POINTER;
292              }
293
294    /* Due to the conversion of level numbers into brackets, the
295       terminator is not important, so no token is returned here.
296       Although not strictly according to the GEDCOM spec, we'll ignore
297       whitespace just before the terminator.
298    */
       /* The action only counts the line and switches back to the INITIAL
          start condition, so that the next line is scanned for a level
          number again.  CR LF and LF CR are alternatives of the terminator
          pattern, so such a pair is counted as one line. */
299
300 {gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); }
301
302    /* Eventually we have to return 1 closing bracket (for the trailer).
303       We can detect whether we have sent the closing bracket using the
304       level_diff (at eof, first it is 2, then we increment it ourselves) */
305
306 <<EOF>> { if (level_diff == 2) {
            /* level_diff is 2 once the OPEN of the last line has been
               returned.  Incrementing it here makes the test fail the next
               time this rule runs, so exactly one final CLOSE is returned. */
307             level_diff++;
308             return CLOSE;
309           }
310           else {
            /* Final CLOSE already returned (or level_diff never reached 2):
               end the scan. */
311             yyterminate();
312           }
313         } 
314
315 .  { gedcom_error("Unexpected character: '%s' (0x%02x)",
       /* Catch-all for a byte that no other rule accepts.  The byte is
          converted through unsigned char before it is passed to %02x:
          where plain char is signed, a byte of 0x80 or above would
          otherwise be sign-extended and show up as ffffffXX. */
316                   yytext, (unsigned int) (unsigned char) yytext[0]);
317      return BADTOKEN;
318    }
319
320 %%
321
322 int yywrap()
323 {
  /* Called by the generated scanner when it reaches the end of its input.
     Always returns 1, which tells the scanner that there is no further
     input to continue with. */
324   return 1;
325 }
326
327 #ifdef LEXER_TEST
328
329 int main()
330 {
331   /* Stand-alone token dump, compiled only when LEXER_TEST is defined. */
332   int token;
333   if (!open_conv_to_internal("UTF16LE")) {
334     gedcom_error("Unable to open conversion context: %s",
335                  strerror(errno));
336     return 1;
337   }
338   /* Print one word per token until the lexer returns 0. */
339   for (token = gedcom_lohi_lex(); token; token = gedcom_lohi_lex()) {
340     switch (token) {
341       case OPEN: printf("OPEN "); break;
342       case CLOSE: printf("CLOSE "); break;
343       case DELIM: printf("DELIM "); break;
344       case BADTOKEN: printf("BADTOKEN "); break;
345       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
346       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
347       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
348       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break;
349       default: printf("TAG(%s) ", gedcom_lval.string); break;
350     }
351   }
352   /* Finish the output line and release the conversion context. */
353   printf("\n");
354   close_conv_to_internal();
355   return 0;
356 }
357 #endif