bc8546851b26851c6d2ee1f3aa2e7523187adae2
[gedcom-parse.git] / gedcom_lohi.lex
1 /* $Id$ */
2 /* $Name$ */
3
4 /* In low-high order, a space is encoded as 0x20 0x00 */
5 /* i.e. this is utf-16-le */
6
7 %{
8 #include "gedcom.tab.h"
9 #include "gedcom.h"
10 #include "multilex.h"
11 #include "encoding.h"
12
13 #define YY_NO_UNPUT
14 %}
15
16 %s NORMAL
17 %s EXPECT_TAG
18
19 alpha        [A-Za-z_]\x00
20 digit        [0-9]\x00
21 delim        \x20\x00
22 tab          [\t]\x00
23 hash         #\x00
24 literal_at   @\x00@\x00
25 otherchar    [\x21-\x22\x24-\x2F\x3A-\x3F\x5B-\x5E\x60\x7B-\x7E\x80-\xFF]\x00|[\x00-\xFF][\x01-\xFF]
26 terminator   \x0D\x00|\x0A\x00|\x0D\x00\x0A\x00|\x0A\x00\x0D\x00
27
28 any_char     {alpha}|{digit}|{otherchar}|{delim}|{hash}|{literal_at}
29 any_but_delim {alpha}|{digit}|{otherchar}|{hash}|{literal_at}
30 non_at       {alpha}|{digit}|{otherchar}|{delim}|{hash}
31 alphanum     {alpha}|{digit}
32 gen_delim    {delim}|{tab}
33
34 escape       @\x00#\x00{any_char}+@\x00
35 pointer      @\x00{alphanum}{non_at}+@\x00
36
37 %{
38 static int current_level=-1; /* level number of the most recent line; -1 until the first one is read */
39 static int level_diff=MAXGEDCLEVEL; /* bracket state: <1 = CLOSE tokens pending, 1 = OPEN pending, >1 = nothing pending */
40  
41 #ifdef LEXER_TEST 
42 YYSTYPE gedcom_lval;
43 int line_no = 1; 
44 #endif
45 %} 
46
47 %%
48
49     /* The GEDCOM level number is converted into a sequence of opening
50        and closing brackets.  Simply put, the following GEDCOM fragment:
51
52          0 HEAD
53          1 SOUR genes
54          2 VERS 1.6
55          2 NAME Genes
56          1 DATE 07 OCT 2001
57          ...
58          0 TRLR
59
60        is converted into:
61
62          { HEAD                     (initial)  
63          { SOUR genes               (1 higher: no closing brackets)
64          { VERS 1.6                 (1 higher: no closing brackets)
65          } { NAME Genes             (same level: 1 closing bracket)
66          } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
67          ...
68          } { TRLR }
69
70        or more clearly:
71
72          { HEAD
73            { SOUR genes
74              { VERS 1.6 }
75              { NAME Genes } }
76            { DATE 07 OCT 2001
77          ... }
78          { TRLR }
79
80        But because this means that one token is converted into a series
81        of tokens, there is some initial code following immediately here
82        that returns "pending" tokens. */
83
84 %{
85 char string_buf[MAXGEDCLINELEN+1];
86  
87 if (level_diff < 1) { /* runs on every scanner call: closing brackets still owed */
88   level_diff++; /* one fewer CLOSE to emit */
89   return CLOSE;
90 }
91 else if (level_diff == 1) { /* all CLOSEs sent; the OPEN for the new line is still owed */
92   level_diff++; /* becomes 2: nothing pending any more */
93   return OPEN;
94 }
95 else {
96   /* out of brackets... */
97 }
98
99 #define TO_INTERNAL(str) to_internal(str, yyleng) /* always passes yyleng (byte length of the current match), whatever str is; to_internal is defined elsewhere, presumably converting to the internal encoding */
100 /* MKTAGACTION(tag): action for a known tag -- store the converted match text, switch to NORMAL, return TAG_<tag> */
101 #define MKTAGACTION(tag) \
102   { gedcom_lval.string = TO_INTERNAL(yytext); \
103     BEGIN(NORMAL); \
104     return TAG_##tag; }
105
106 %}
107
108 <INITIAL>{gen_delim}* /* ignore leading whitespace (also tabs) */
109
110 <INITIAL>0\x00{digit}+ { gedcom_error ("Level number with leading zero"); /* low-high order: '0' is 0x30 0x00 */
111                            return BADTOKEN;
112                          }
113
114 <INITIAL>{digit}+ { int level = atoi(TO_INTERNAL(yytext)); /* convert the UTF-16 digits, then parse as decimal */
115                     if ((level < 0) || (level > MAXGEDCLEVEL)) {
116                       gedcom_error ("Level number out of range [0..%d]",
117                                     MAXGEDCLEVEL);
118                       return BADTOKEN;
119                     }
120                     level_diff = level - current_level; /* 1 = one deeper, 0 = same level, negative = back up */
121                     BEGIN(EXPECT_TAG); /* the next token on the line is matched against the tag rules */
122                     current_level = level;
123                     if (level_diff < 1) { /* same or lower level: emit the first CLOSE now */
124                       level_diff++; /* remaining CLOSEs and the OPEN come from the pending-token code on later calls */
125                       return CLOSE;
126                     }
127                     else if (level_diff == 1) { /* exactly one deeper: only an OPEN is needed */
128                       level_diff++;
129                       return OPEN;
130                     }
131                     else {
132                       /* should never happen (error to GEDCOM spec) */
133                       gedcom_error ("GEDCOM level number is %d higher than "
134                                     "previous",
135                                     level_diff);
136                       return BADTOKEN;
137                     }
138                   }
139
140 <EXPECT_TAG>A\x00B\x00B\x00R\x00  MKTAGACTION(ABBR)
141 <EXPECT_TAG>A\x00D\x00D\x00R\x00  MKTAGACTION(ADDR)
142 <EXPECT_TAG>A\x00D\x00R\x001\x00  MKTAGACTION(ADR1)
143 <EXPECT_TAG>A\x00D\x00R\x002\x00  MKTAGACTION(ADR2)
144 <EXPECT_TAG>A\x00D\x00O\x00P\x00  MKTAGACTION(ADOP)
145 <EXPECT_TAG>A\x00F\x00N\x00   MKTAGACTION(AFN)
146 <EXPECT_TAG>A\x00G\x00E\x00   MKTAGACTION(AGE)
147 <EXPECT_TAG>A\x00G\x00N\x00C\x00  MKTAGACTION(AGNC)
148 <EXPECT_TAG>A\x00L\x00I\x00A\x00  MKTAGACTION(ALIA)
149 <EXPECT_TAG>A\x00N\x00C\x00E\x00  MKTAGACTION(ANCE)
150 <EXPECT_TAG>A\x00N\x00C\x00I\x00  MKTAGACTION(ANCI)
151 <EXPECT_TAG>A\x00N\x00U\x00L\x00  MKTAGACTION(ANUL)
152 <EXPECT_TAG>A\x00S\x00S\x00O\x00  MKTAGACTION(ASSO)
153 <EXPECT_TAG>A\x00U\x00T\x00H\x00  MKTAGACTION(AUTH)
154 <EXPECT_TAG>B\x00A\x00P\x00L\x00  MKTAGACTION(BAPL)
155 <EXPECT_TAG>B\x00A\x00P\x00M\x00  MKTAGACTION(BAPM)
156 <EXPECT_TAG>B\x00A\x00R\x00M\x00  MKTAGACTION(BARM)
157 <EXPECT_TAG>B\x00A\x00S\x00M\x00  MKTAGACTION(BASM)
158 <EXPECT_TAG>B\x00I\x00R\x00T\x00  MKTAGACTION(BIRT)
159 <EXPECT_TAG>B\x00L\x00E\x00S\x00  MKTAGACTION(BLES)
160 <EXPECT_TAG>B\x00L\x00O\x00B\x00  MKTAGACTION(BLOB)
161 <EXPECT_TAG>B\x00U\x00R\x00I\x00  MKTAGACTION(BURI)
162 <EXPECT_TAG>C\x00A\x00L\x00N\x00  MKTAGACTION(CALN)
163 <EXPECT_TAG>C\x00A\x00S\x00T\x00  MKTAGACTION(CAST)
164 <EXPECT_TAG>C\x00A\x00U\x00S\x00  MKTAGACTION(CAUS)
165 <EXPECT_TAG>C\x00E\x00N\x00S\x00  MKTAGACTION(CENS)
166 <EXPECT_TAG>C\x00H\x00A\x00N\x00  MKTAGACTION(CHAN)
167 <EXPECT_TAG>C\x00H\x00A\x00R\x00  MKTAGACTION(CHAR)
168 <EXPECT_TAG>C\x00H\x00I\x00L\x00  MKTAGACTION(CHIL)
169 <EXPECT_TAG>C\x00H\x00R\x00   MKTAGACTION(CHR)
170 <EXPECT_TAG>C\x00H\x00R\x00A\x00  MKTAGACTION(CHRA)
171 <EXPECT_TAG>C\x00I\x00T\x00Y\x00  MKTAGACTION(CITY)
172 <EXPECT_TAG>C\x00O\x00N\x00C\x00  MKTAGACTION(CONC)
173 <EXPECT_TAG>C\x00O\x00N\x00F\x00  MKTAGACTION(CONF)
174 <EXPECT_TAG>C\x00O\x00N\x00L\x00  MKTAGACTION(CONL)
175 <EXPECT_TAG>C\x00O\x00N\x00T\x00  MKTAGACTION(CONT)
176 <EXPECT_TAG>C\x00O\x00P\x00R\x00  MKTAGACTION(COPR)
177 <EXPECT_TAG>C\x00O\x00R\x00P\x00  MKTAGACTION(CORP)
178 <EXPECT_TAG>C\x00R\x00E\x00M\x00  MKTAGACTION(CREM)
179 <EXPECT_TAG>C\x00T\x00R\x00Y\x00  MKTAGACTION(CTRY)
180 <EXPECT_TAG>D\x00A\x00T\x00A\x00  MKTAGACTION(DATA)
181 <EXPECT_TAG>D\x00A\x00T\x00E\x00  MKTAGACTION(DATE)
182 <EXPECT_TAG>D\x00E\x00A\x00T\x00  MKTAGACTION(DEAT)
183 <EXPECT_TAG>D\x00E\x00S\x00C\x00  MKTAGACTION(DESC)
184 <EXPECT_TAG>D\x00E\x00S\x00I\x00  MKTAGACTION(DESI)
185 <EXPECT_TAG>D\x00E\x00S\x00T\x00  MKTAGACTION(DEST)
186 <EXPECT_TAG>D\x00I\x00V\x00   MKTAGACTION(DIV)
187 <EXPECT_TAG>D\x00I\x00V\x00F\x00  MKTAGACTION(DIVF)
188 <EXPECT_TAG>D\x00S\x00C\x00R\x00  MKTAGACTION(DSCR)
189 <EXPECT_TAG>E\x00D\x00U\x00C\x00  MKTAGACTION(EDUC)
190 <EXPECT_TAG>E\x00M\x00I\x00G\x00  MKTAGACTION(EMIG)
191 <EXPECT_TAG>E\x00N\x00D\x00L\x00  MKTAGACTION(ENDL)
192 <EXPECT_TAG>E\x00N\x00G\x00A\x00  MKTAGACTION(ENGA)
193 <EXPECT_TAG>E\x00V\x00E\x00N\x00  MKTAGACTION(EVEN)
194 <EXPECT_TAG>F\x00A\x00M\x00   MKTAGACTION(FAM)
195 <EXPECT_TAG>F\x00A\x00M\x00C\x00  MKTAGACTION(FAMC)
196 <EXPECT_TAG>F\x00A\x00M\x00F\x00  MKTAGACTION(FAMF)
197 <EXPECT_TAG>F\x00A\x00M\x00S\x00  MKTAGACTION(FAMS)
198 <EXPECT_TAG>F\x00C\x00O\x00M\x00  MKTAGACTION(FCOM)
199 <EXPECT_TAG>F\x00I\x00L\x00E\x00  MKTAGACTION(FILE)
200 <EXPECT_TAG>F\x00O\x00R\x00M\x00  MKTAGACTION(FORM)
201 <EXPECT_TAG>G\x00E\x00D\x00C\x00  MKTAGACTION(GEDC)
202 <EXPECT_TAG>G\x00I\x00V\x00N\x00  MKTAGACTION(GIVN)
203 <EXPECT_TAG>G\x00R\x00A\x00D\x00  MKTAGACTION(GRAD)
204 <EXPECT_TAG>H\x00E\x00A\x00D\x00  MKTAGACTION(HEAD)
205 <EXPECT_TAG>H\x00U\x00S\x00B\x00  MKTAGACTION(HUSB)
206 <EXPECT_TAG>I\x00D\x00N\x00O\x00  MKTAGACTION(IDNO)
207 <EXPECT_TAG>I\x00M\x00M\x00I\x00  MKTAGACTION(IMMI)
208 <EXPECT_TAG>I\x00N\x00D\x00I\x00  MKTAGACTION(INDI)
209 <EXPECT_TAG>L\x00A\x00N\x00G\x00  MKTAGACTION(LANG)
210 <EXPECT_TAG>L\x00E\x00G\x00A\x00  MKTAGACTION(LEGA)
211 <EXPECT_TAG>M\x00A\x00R\x00B\x00  MKTAGACTION(MARB)
212 <EXPECT_TAG>M\x00A\x00R\x00C\x00  MKTAGACTION(MARC)
213 <EXPECT_TAG>M\x00A\x00R\x00L\x00  MKTAGACTION(MARL)
214 <EXPECT_TAG>M\x00A\x00R\x00R\x00  MKTAGACTION(MARR)
215 <EXPECT_TAG>M\x00A\x00R\x00S\x00  MKTAGACTION(MARS)
216 <EXPECT_TAG>M\x00E\x00D\x00I\x00  MKTAGACTION(MEDI)
217 <EXPECT_TAG>N\x00A\x00M\x00E\x00  MKTAGACTION(NAME)
218 <EXPECT_TAG>N\x00A\x00T\x00I\x00  MKTAGACTION(NATI)
219 <EXPECT_TAG>N\x00A\x00T\x00U\x00  MKTAGACTION(NATU)
220 <EXPECT_TAG>N\x00C\x00H\x00I\x00  MKTAGACTION(NCHI)
221 <EXPECT_TAG>N\x00I\x00C\x00K\x00  MKTAGACTION(NICK)
222 <EXPECT_TAG>N\x00M\x00R\x00   MKTAGACTION(NMR)
223 <EXPECT_TAG>N\x00O\x00T\x00E\x00  MKTAGACTION(NOTE)
224 <EXPECT_TAG>N\x00P\x00F\x00X\x00  MKTAGACTION(NPFX)
225 <EXPECT_TAG>N\x00S\x00F\x00X\x00  MKTAGACTION(NSFX)
226 <EXPECT_TAG>O\x00B\x00J\x00E\x00  MKTAGACTION(OBJE)
227 <EXPECT_TAG>O\x00C\x00C\x00U\x00  MKTAGACTION(OCCU)
228 <EXPECT_TAG>O\x00R\x00D\x00I\x00  MKTAGACTION(ORDI)
229 <EXPECT_TAG>O\x00R\x00D\x00N\x00  MKTAGACTION(ORDN)
230 <EXPECT_TAG>P\x00A\x00G\x00E\x00  MKTAGACTION(PAGE)
231 <EXPECT_TAG>P\x00E\x00D\x00I\x00  MKTAGACTION(PEDI)
232 <EXPECT_TAG>P\x00H\x00O\x00N\x00  MKTAGACTION(PHON)
233 <EXPECT_TAG>P\x00L\x00A\x00C\x00  MKTAGACTION(PLAC)
234 <EXPECT_TAG>P\x00O\x00S\x00T\x00  MKTAGACTION(POST)
235 <EXPECT_TAG>P\x00R\x00O\x00B\x00  MKTAGACTION(PROB)
236 <EXPECT_TAG>P\x00R\x00O\x00P\x00  MKTAGACTION(PROP)
237 <EXPECT_TAG>P\x00U\x00B\x00L\x00  MKTAGACTION(PUBL)
238 <EXPECT_TAG>Q\x00U\x00A\x00Y\x00  MKTAGACTION(QUAY)
239 <EXPECT_TAG>R\x00E\x00F\x00N\x00  MKTAGACTION(REFN)
240 <EXPECT_TAG>R\x00E\x00L\x00A\x00  MKTAGACTION(RELA)
241 <EXPECT_TAG>R\x00E\x00L\x00I\x00  MKTAGACTION(RELI)
242 <EXPECT_TAG>R\x00E\x00P\x00O\x00  MKTAGACTION(REPO)
243 <EXPECT_TAG>R\x00E\x00S\x00I\x00  MKTAGACTION(RESI)
244 <EXPECT_TAG>R\x00E\x00S\x00N\x00  MKTAGACTION(RESN)
245 <EXPECT_TAG>R\x00E\x00T\x00I\x00  MKTAGACTION(RETI)
246 <EXPECT_TAG>R\x00F\x00N\x00   MKTAGACTION(RFN)
247 <EXPECT_TAG>R\x00I\x00N\x00   MKTAGACTION(RIN)
248 <EXPECT_TAG>R\x00O\x00L\x00E\x00  MKTAGACTION(ROLE)
249 <EXPECT_TAG>S\x00E\x00X\x00   MKTAGACTION(SEX)
250 <EXPECT_TAG>S\x00L\x00G\x00C\x00  MKTAGACTION(SLGC)
251 <EXPECT_TAG>S\x00L\x00G\x00S\x00  MKTAGACTION(SLGS)
252 <EXPECT_TAG>S\x00O\x00U\x00R\x00  MKTAGACTION(SOUR)
253 <EXPECT_TAG>S\x00P\x00F\x00X\x00  MKTAGACTION(SPFX)
254 <EXPECT_TAG>S\x00S\x00N\x00   MKTAGACTION(SSN)
255 <EXPECT_TAG>S\x00T\x00A\x00E\x00  MKTAGACTION(STAE)
256 <EXPECT_TAG>S\x00T\x00A\x00T\x00  MKTAGACTION(STAT)
257 <EXPECT_TAG>S\x00U\x00B\x00M\x00  MKTAGACTION(SUBM)
258 <EXPECT_TAG>S\x00U\x00B\x00N\x00  MKTAGACTION(SUBN)
259 <EXPECT_TAG>S\x00U\x00R\x00N\x00  MKTAGACTION(SURN)
260 <EXPECT_TAG>T\x00E\x00M\x00P\x00  MKTAGACTION(TEMP)
261 <EXPECT_TAG>T\x00E\x00X\x00T\x00  MKTAGACTION(TEXT)
262 <EXPECT_TAG>T\x00I\x00M\x00E\x00  MKTAGACTION(TIME)
263 <EXPECT_TAG>T\x00I\x00T\x00L\x00  MKTAGACTION(TITL)
264 <EXPECT_TAG>T\x00R\x00L\x00R\x00  MKTAGACTION(TRLR)
265 <EXPECT_TAG>T\x00Y\x00P\x00E\x00  MKTAGACTION(TYPE)
266 <EXPECT_TAG>V\x00E\x00R\x00S\x00  MKTAGACTION(VERS)
267 <EXPECT_TAG>W\x00I\x00F\x00E\x00  MKTAGACTION(WIFE)
268 <EXPECT_TAG>W\x00I\x00L\x00L\x00  MKTAGACTION(WILL)
269      
270 <EXPECT_TAG>{alphanum}+ { if (yyleng > 2 * MAXGEDCTAGLEN) { /* UTF-16: 2 bytes per tag character; strlen would stop at the first 0x00 */
271                             gedcom_error("Tag '%s' too long, max %d chars",
272                                          TO_INTERNAL(yytext), MAXGEDCTAGLEN);
273                             return BADTOKEN;
274                           }
275                           gedcom_lval.string = TO_INTERNAL(yytext); /* convert the match directly, as the known-tag rules do */
276                           BEGIN(NORMAL);
277                           return USERTAG;
278                         }
279
280 {delim}      { gedcom_lval.string = TO_INTERNAL(yytext);
281                return DELIM;
282              }
283
284 {any_but_delim} { gedcom_lval.string = TO_INTERNAL(yytext);
285                   return ANYCHAR;
286                 }
287
288 {escape}/{non_at}  { gedcom_lval.string = TO_INTERNAL(yytext);
289                      return ESCAPE;
290                    }
291
292 {pointer}    { gedcom_lval.string = TO_INTERNAL(yytext);
293                return POINTER;
294              }
295
296    /* Due to the conversion of level numbers into brackets, the
297       terminator is not important, so no token is returned here.
298       Although not strictly according to the GEDCOM spec, we'll ignore
299       whitespace just before the terminator.
300    */
301
302 {gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); }
303
304    /* Eventually we have to return 1 closing bracket (for the trailer).
305       We can detect whether we have sent the closing bracket using the
306       level_diff (at eof, first it is 2, then we increment it ourselves) */
307
308 <<EOF>> { if (level_diff == 2) { /* 2 = no brackets pending and the final CLOSE not yet sent */
309             level_diff++; /* becomes 3, so the next EOF call takes the else branch */
310             return CLOSE;
311           }
312           else {
313             yyterminate();
314           }
315         } 
316
317 .  { gedcom_error("Unexpected character: '%s' (0x%02x)", /* catch-all: a single byte no other rule accepted */
318                   yytext, (unsigned char)yytext[0]); /* cast so bytes >= 0x80 are not sign-extended */
319      return BADTOKEN;
320    }
321
322 %%
323
324 int yywrap() /* called by the scanner when it reaches end of input */
325 {
326   return 1; /* non-zero: there is no further input file to continue with */
327 }
328
329 #ifdef LEXER_TEST
330
331 int main()
332 {
333   int token;
334   int conv_ok;
335   /* Test driver: print every token the scanner returns, then a newline. */
336   init_encodings();
337   conv_ok = open_conv_to_internal("UNICODE(LOHI)");
338   if (!conv_ok) {
339     gedcom_error("Unable to open conversion context: %s",
340                  strerror(errno));
341     return 1;
342   }
343   for (token = gedcom_lohi_lex(); token; token = gedcom_lohi_lex()) {
344     switch(token) {
345       case BADTOKEN: printf("BADTOKEN "); break;
346       case OPEN: printf("OPEN "); break;
347       case CLOSE: printf("CLOSE "); break;
348       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
349       case DELIM: printf("DELIM "); break;
350       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
351       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
352       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break;
353       default: printf("TAG(%s) ", gedcom_lval.string); break; /* any known TAG_* token */
354     }
355   }
356   printf("\n");
357   close_conv_to_internal();
358   return 0;
359 }
360 #endif