Additional common code factored out.
[gedcom-parse.git] / gedcom_lex_common.c
1 /*  This program is free software; you can redistribute it and/or modify  *
2  *  it under the terms of the GNU General Public License as published by  *
3  *  the Free Software Foundation; either version 2 of the License, or     *
4  *  (at your option) any later version.                                   *
5
6  (C) 2001 by The Genes Development Team
7  Original author: Peter Verthez (Peter.Verthez@advalvas.be)
8 */
9
10 /* $Id$ */
11 /* $Name$ */
12
13 #ifndef IN_LEX
14
#include <stdlib.h>

#include "gedcom.tab.h"
#include "gedcom.h"
#include "multilex.h"
#include "encoding.h"
19
20 #define YY_NO_UNPUT
21   
22 static int current_level=-1;
23 static int level_diff=MAXGEDCLEVEL;
24  
25 #ifdef LEXER_TEST 
26 YYSTYPE gedcom_lval;
27 int line_no = 1;
28
29 int gedcom_lex();
30
31 int test_loop(ENCODING enc, char* code)
32 {
33   int tok, res;
34   init_encodings();
35   set_encoding_width(enc);
36   res = open_conv_to_internal(code);
37   if (!res) {
38     gedcom_error("Unable to open conversion context: %s",
39                  strerror(errno));
40     return 1;
41   }
42   tok = gedcom_lex();
43   while (tok) {
44     switch(tok) {
45       case BADTOKEN: printf("BADTOKEN "); break;
46       case OPEN: printf("OPEN(%d) ", gedcom_lval.level); break;
47       case CLOSE: printf("CLOSE "); break;
48       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
49       case DELIM: printf("DELIM "); break;
50       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
51       case POINTER: printf("POINTER(%s) ", gedcom_lval.pointer); break;
52       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.tag); break;
53       default: printf("TAG(%s) ", gedcom_lval.tag); break;
54     }
55     tok = gedcom_lex();
56   }
57   printf("\n");
58   close_conv_to_internal();
59   return 0;  
60 }
61  
62 #endif /* of #ifdef LEXER_TEST */
63
64 #else  /* of #ifndef IN_LEX */
65
66 char string_buf[MAXGEDCLINELEN+1];
67  
68 #define TO_INTERNAL(str) to_internal(str, yyleng) 
69
/* Action body for a predefined tag: stores the converted text of the
   match as the token's tag value, switches the scanner to the NORMAL
   start condition and returns the token named TAG_ followed by the_tag.
   The expansion is deliberately a bare brace block: it is presumably
   used as a complete flex action without a trailing semicolon, so it
   must not be wrapped in do/while(0) -- confirm in the .lex files. */
#define MKTAGACTION(the_tag)                                                 \
  {                                                                          \
    gedcom_lval.tag = TO_INTERNAL(yytext);                                   \
    BEGIN(NORMAL);                                                           \
    return TAG_##the_tag;                                                    \
  }
75
76
77 /* The GEDCOM level number is converted into a sequence of opening
78    and closing brackets.  Simply put, the following GEDCOM fragment:
79    
80    0 HEAD
81    1 SOUR genes
82    2 VERS 1.6
83    2 NAME Genes
84    1 DATE 07 OCT 2001
85    ...
86    0 TRLR
87    
88    is converted into:
89    
90    { HEAD                     (initial)  
91    { SOUR genes               (1 higher: no closing brackets)
92    { VERS 1.6                 (1 higher: no closing brackets)
93    } { NAME Genes             (same level: 1 closing bracket)
94    } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
95    ...
96    } { TRLR }
97    
98    or more clearly:
99    
100    { HEAD
101      { SOUR genes
102        { VERS 1.6 }
103        { NAME Genes } }
104      { DATE 07 OCT 2001
105      ... }
106    { TRLR }
107
108    But because this means that one token is converted into a series
109    of tokens, there is some initial code following immediately here
110    that returns "pending" tokens. */
111
/* Hands out the bracket tokens still owed from the last level number.
   While level_diff is below 1, one CLOSE is returned per invocation;
   when it is exactly 1, the single OPEN (carrying current_level) is
   returned; for larger values nothing is pending and control simply
   falls out of the block.  Each returned token advances level_diff. */
#define ACTION_BEFORE_REGEXPS                                                 \
   { if (level_diff < 1) {                                                    \
       ++level_diff;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     if (level_diff == 1) {                                                   \
       ++level_diff;                                                          \
       gedcom_lval.level = current_level;                                     \
       return OPEN;                                                           \
     }                                                                        \
     /* level_diff > 1: out of brackets, nothing to return */                 \
   }
126
127
/* Action for a level number written with a leading zero: reports the
   problem through gedcom_error() and returns BADTOKEN. */
#define ACTION_0_DIGITS                                                       \
   {                                                                          \
     gedcom_error ("Level number with leading zero");                         \
     return BADTOKEN;                                                         \
   }
132
133
/* Action for a level number.
   Parses the matched digits, rejects values outside [0..MAXGEDCLEVEL]
   with BADTOKEN, and otherwise records the new level, switches the
   scanner to EXPECT_TAG and returns the first bracket token:
     - new level not deeper than the previous one: CLOSE (the remaining
       brackets are handed out later by ACTION_BEFORE_REGEXPS);
     - exactly one level deeper: OPEN, carrying the new level;
     - more than one level deeper: an error is reported and BADTOKEN
       returned.
   The number is parsed with strtol() and range-checked as a long before
   being narrowed.  atoi() has undefined behaviour when the value does
   not fit in an int, and an oversized level could wrap around into the
   accepted range instead of being rejected. */
#define ACTION_DIGITS                                                         \
   { long level = strtol(TO_INTERNAL(yytext), NULL, 10);                      \
     if ((level < 0) || (level > MAXGEDCLEVEL)) {                             \
       gedcom_error ("Level number out of range [0..%d]",                     \
                     MAXGEDCLEVEL);                                           \
       return BADTOKEN;                                                       \
     }                                                                        \
     /* level is known to fit in an int from here on */                       \
     level_diff = (int) level - current_level;                                \
     BEGIN(EXPECT_TAG);                                                       \
     current_level = (int) level;                                             \
     if (level_diff < 1) {                                                    \
       level_diff++;                                                          \
       return CLOSE;                                                          \
     }                                                                        \
     else if (level_diff == 1) {                                              \
       level_diff++;                                                          \
       gedcom_lval.level = current_level;                                     \
       return OPEN;                                                           \
     }                                                                        \
     else {                                                                   \
       /* should never happen (error to GEDCOM spec) */                       \
       gedcom_error ("GEDCOM level number is %d higher than "                 \
                     "previous",                                              \
                     level_diff);                                             \
       return BADTOKEN;                                                       \
     }                                                                        \
   }
161
162
/* Action for an alphanumeric match that is reported as a USERTAG.
   A match longer than MAXGEDCTAGLEN is reported through gedcom_error()
   and yields BADTOKEN.  Otherwise the text is copied to string_buf,
   converted with TO_INTERNAL and stored as the token's tag value, the
   scanner switches to the NORMAL start condition, and USERTAG is
   returned.
   The "too long" message contains a %s and a %d conversion; both
   arguments are now supplied (calling it without them is undefined
   behaviour). */
#define ACTION_ALPHANUM                                                       \
   { if (strlen(yytext) > MAXGEDCTAGLEN) {                                    \
       gedcom_error("Tag '%s' too long, max %d chars",                        \
                    yytext, MAXGEDCTAGLEN);                                   \
       return BADTOKEN;                                                       \
     }                                                                        \
     /* the length check above guarantees the copy includes the NUL */        \
     strncpy(string_buf, yytext, MAXGEDCTAGLEN+1);                            \
     gedcom_lval.tag = TO_INTERNAL(string_buf);                               \
     BEGIN(NORMAL);                                                           \
     return USERTAG;                                                          \
   }
173
174
/* Action for a delimiter: stores the converted text of the match as the
   token's string value and returns DELIM. */
#define ACTION_DELIM                                                          \
  {                                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext);                                 \
    return DELIM;                                                             \
  }
179
180
/* Action for any other character: converts the match and returns ANYCHAR
   only when the converted string is non-empty.
   Due to character conversions, the current character may be combined
   with the next one, in which case there is no complete character yet,
   the converted string is empty, and no token is returned for this
   match.  In principle this only applies to the 1-byte case (e.g.
   ANSEL), but it does no harm in the unicode case. */
#define ACTION_ANY                                                            \
  { gedcom_lval.string = TO_INTERNAL(yytext);                                 \
    if (gedcom_lval.string[0] != '\0')                                        \
      return ANYCHAR;                                                         \
  }
192
193
/* Action for an escape sequence: stores the converted text of the match
   as the token's string value and returns ESCAPE. */
#define ACTION_ESCAPE                                                         \
  {                                                                           \
    gedcom_lval.string = TO_INTERNAL(yytext);                                 \
    return ESCAPE;                                                            \
  }
198
199
/* Action for a pointer: stores the converted text of the match as the
   token's pointer value and returns POINTER. */
#define ACTION_POINTER                                                        \
  {                                                                           \
    gedcom_lval.pointer = TO_INTERNAL(yytext);                                \
    return POINTER;                                                           \
  }
204
205
206 /* Due to the conversion of level numbers into brackets, the
207    terminator is not important, so no token is returned here.
208    Although not strictly according to the GEDCOM spec, we'll ignore
209    whitespace just before the terminator.
210 */
211
/* Action for a line terminator: counts the line and puts the scanner
   back in the INITIAL start condition.  No token is returned. */
#define ACTION_TERMINATOR                                                     \
  {                                                                           \
    line_no += 1;                                                             \
    BEGIN(INITIAL);                                                           \
  }
216
217
218 /* Eventually we have to return 1 closing bracket (for the trailer).
219    We can detect whether we have sent the closing bracket using the
220    level_diff (at eof, first it is 2, then we increment it ourselves)
221 */
222
/* Action at end of input.  While level_diff is still 2, the final CLOSE
   has not been sent yet: level_diff is bumped to 3 and CLOSE is
   returned.  For any other value the scanner terminates. */
#define ACTION_EOF                                                            \
  { if (level_diff != 2) {                                                    \
      yyterminate();                                                          \
    }                                                                         \
    else {                                                                    \
      level_diff++;                                                           \
      return CLOSE;                                                           \
    }                                                                         \
  }
232
233
/* Action for input that no other rule accepts: reports the matched text
   together with the hexadecimal value of its first byte, and returns
   BADTOKEN.
   The byte is converted through unsigned char before being printed with
   %02x.  Where plain char is signed, a byte of 0x80 or above would
   otherwise be sign-extended and show up as e.g. 0xffffffe9 instead of
   0xe9. */
#define ACTION_UNEXPECTED                                                     \
  { gedcom_error("Unexpected character: '%s' (0x%02x)",                       \
                 yytext, (unsigned int) (unsigned char) yytext[0]);           \
    return BADTOKEN;                                                          \
  }
239
240 #endif /* IN_LEX */