Moved to gedcom subdirectory.
[gedcom-parse.git] / gedcom / gedcom_lex_common.c
1 /*  This program is free software; you can redistribute it and/or modify  *
2  *  it under the terms of the GNU General Public License as published by  *
3  *  the Free Software Foundation; either version 2 of the License, or     *
4  *  (at your option) any later version.                                   *
5
6  (C) 2001 by The Genes Development Team
7  Original author: Peter Verthez (Peter.Verthez@advalvas.be)
8 */
9
10 /* $Id$ */
11 /* $Name$ */
12
13 #ifndef IN_LEX
14
#include "gedcom_internal.h"
#include "multilex.h"
#include "encoding.h"
#include "gedcom.h"
#include "gedcom.tab.h"

#include <stdlib.h>
20
21 #define YY_NO_UNPUT
22
23 static size_t encoding_width;
24 static int current_level = -1;
25 static int level_diff=MAXGEDCLEVEL;
26 static size_t line_len = 0;
27
28 static char ptr_buf[MAXGEDCPTRLEN * UTF_FACTOR + 1];
29 static char tag_buf[MAXGEDCTAGLEN * UTF_FACTOR + 1];
30 static char str_buf[MAXGEDCLINELEN * UTF_FACTOR + 1];
31
32 #ifdef LEXER_TEST 
33 YYSTYPE gedcom_lval;
34 int line_no = 1;
35
36 int gedcom_lex();
37
38 int test_loop(ENCODING enc, char* code)
39 {
40   int tok, res;
41   init_encodings();
42   set_encoding_width(enc);
43   res = open_conv_to_internal(code);
44   if (!res) {
45     gedcom_error("Unable to open conversion context: %s",
46                  strerror(errno));
47     return 1;
48   }
49   tok = gedcom_lex();
50   while (tok) {
51     switch(tok) {
52       case BADTOKEN: printf("BADTOKEN "); break;
53       case OPEN: printf("OPEN(%d) ", gedcom_lval.number); break;
54       case CLOSE: printf("CLOSE "); break;
55       case ESCAPE: printf("ESCAPE(%s) ", gedcom_lval.string); break;
56       case DELIM: printf("DELIM "); break;
57       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
58       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
59       case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break;
60       default: printf("TAG(%s) ", gedcom_lval.string); break;
61     }
62     tok = gedcom_lex();
63   }
64   printf("\n");
65   close_conv_to_internal();
66   return 0;  
67 }
68  
69 #endif /* of #ifdef LEXER_TEST */
70
71 #else  /* of #ifndef IN_LEX */
72
73 #define TO_INTERNAL(STR,OUTBUF) \
74   to_internal(STR, yyleng, OUTBUF, sizeof(OUTBUF))
75
76 #define INIT_LINE_LEN \
77   line_len = 0;
78
79 #define CHECK_LINE_LEN                                                        \
80   { if (line_len != (size_t)-1) {                                             \
81       line_len += strlen(yytext);                                             \
82       if (line_len > MAXGEDCLINELEN * encoding_width) {                       \
83         gedcom_error("Line too long, max %d characters",                      \
84                      MAXGEDCLINELEN);                                         \
85         line_len = (size_t)-1;                                                \
86         return BADTOKEN;                                                      \
87       }                                                                       \
88     }                                                                         \
89   }
90
91 #define MKTAGACTION(THETAG)                                                  \
92   { CHECK_LINE_LEN;                                                          \
93     gedcom_lval.string = TO_INTERNAL(yytext, tag_buf);                       \
94     BEGIN(NORMAL);                                                           \
95     return TAG_##THETAG;                                                     \
96   }
97
98 /* The GEDCOM level number is converted into a sequence of opening
99    and closing brackets.  Simply put, the following GEDCOM fragment:
100    
101    0 HEAD
102    1 SOUR genes
103    2 VERS 1.6
104    2 NAME Genes
105    1 DATE 07 OCT 2001
106    ...
107    0 TRLR
108    
109    is converted into:
110    
111    { HEAD                     (initial)  
112    { SOUR genes               (1 higher: no closing brackets)
113    { VERS 1.6                 (1 higher: no closing brackets)
114    } { NAME Genes             (same level: 1 closing bracket)
115    } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
116    ...
117    } { TRLR }
118    
119    or more clearly:
120    
121    { HEAD
122      { SOUR genes
123        { VERS 1.6 }
124        { NAME Genes } }
125      { DATE 07 OCT 2001
126      ... }
127    { TRLR }
128
129    But because this means that one token is converted into a series
130    of tokens, there is some initial code following immediately here
131    that returns "pending" tokens. */
132
133 #define ACTION_BEFORE_REGEXPS                                                 \
134    { if (level_diff < 1) {                                                    \
135        level_diff++;                                                          \
136        return CLOSE;                                                          \
137      }                                                                        \
138      else if (level_diff == 1) {                                              \
139        level_diff++;                                                          \
140        gedcom_lval.number = current_level;                                    \
141        return OPEN;                                                           \
142      }                                                                        \
143      else {                                                                   \
144        /* out of brackets... */                                               \
145      }                                                                        \
146    }
147
148
149 #define ACTION_INITIAL_WHITESPACE                                             \
150   { CHECK_LINE_LEN;                                                           \
151     /* ignore initial whitespace further */                                   \
152   }
153
154
155 #define ACTION_0_DIGITS                                                       \
156    { gedcom_error ("Level number with leading zero");                         \
157      return BADTOKEN;                                                         \
158    } 
159
160
161 #define ACTION_DIGITS                                                         \
162    { int level = atoi(TO_INTERNAL(yytext, str_buf));                          \
163      CHECK_LINE_LEN;                                                          \
164      if ((level < 0) || (level > MAXGEDCLEVEL)) {                             \
165        gedcom_error ("Level number out of range [0..%d]",                     \
166                      MAXGEDCLEVEL);                                           \
167        return BADTOKEN;                                                       \
168      }                                                                        \
169      level_diff = level - current_level;                                      \
170      BEGIN(EXPECT_TAG);                                                       \
171      current_level = level;                                                   \
172      if (level_diff < 1) {                                                    \
173        level_diff++;                                                          \
174        return CLOSE;                                                          \
175      }                                                                        \
176      else if (level_diff == 1) {                                              \
177        level_diff++;                                                          \
178        gedcom_lval.number = current_level;                                    \
179        return OPEN;                                                           \
180      }                                                                        \
181      else {                                                                   \
182        /* should never happen (error to GEDCOM spec) */                       \
183        gedcom_error ("GEDCOM level number is %d higher than "                 \
184                      "previous",                                              \
185                      level_diff);                                             \
186        return BADTOKEN;                                                       \
187      }                                                                        \
188    } 
189
190
191 #define ACTION_ALPHANUM                                                       \
192    { if (strlen(yytext) > MAXGEDCTAGLEN * encoding_width) {                   \
193        gedcom_error("Tag '%s' too long, max %d characters",                   \
194                     yytext, MAXGEDCTAGLEN);                                   \
195        return BADTOKEN;                                                       \
196      }                                                                        \
197      CHECK_LINE_LEN;                                                          \
198      gedcom_lval.string = TO_INTERNAL(yytext, tag_buf);                       \
199      BEGIN(NORMAL);                                                           \
200      return USERTAG;                                                          \
201    }
202
203
204 #define ACTION_DELIM                                                          \
205   { CHECK_LINE_LEN;                                                           \
206     gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
207     return DELIM;                                                             \
208   }
209
210
211 #define ACTION_ANY                                                            \
212   { CHECK_LINE_LEN;                                                           \
213     gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
214     /* Due to character conversions, it is possible that the current          \
215        character will be combined with the next, and so now we don't have a   \
216        character yet...                                                       \
217        In principle, this is only applicable to the 1byte case (e.g. ANSEL),  \
218        but it doesn't harm the unicode case.                                  \
219     */                                                                        \
220     if (strlen(gedcom_lval.string) > 0)                                       \
221       return ANYCHAR;                                                         \
222   }
223
224
225 #define ACTION_ESCAPE                                                         \
226   { CHECK_LINE_LEN;                                                           \
227     gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
228     return ESCAPE;                                                            \
229   }
230
231
232 #define ACTION_POINTER                                                        \
233   { CHECK_LINE_LEN;                                                           \
234     if (strlen(yytext) > MAXGEDCPTRLEN * encoding_width) {                    \
235       gedcom_error("Pointer '%s' too long, max %d characters",                \
236                    yytext, MAXGEDCPTRLEN);                                    \
237       return BADTOKEN;                                                        \
238     }                                                                         \
239     gedcom_lval.string = TO_INTERNAL(yytext, ptr_buf);                        \
240     return POINTER;                                                           \
241   }
242
243
244 /* Due to the conversion of level numbers into brackets, the
245    terminator is not important, so no token is returned here.
246    Although not strictly according to the GEDCOM spec, we'll ignore
247    whitespace just before the terminator.
248 */
249
250 #define ACTION_TERMINATOR                                                     \
251   { CHECK_LINE_LEN;                                                           \
252     INIT_LINE_LEN;                                                            \
253     line_no++;                                                                \
254     BEGIN(INITIAL);                                                           \
255   }
256
257
258 /* Eventually we have to return 1 closing bracket (for the trailer).
259    We can detect whether we have sent the closing bracket using the
260    level_diff (at eof, first it is 2, then we increment it ourselves)
261 */
262
263 #define ACTION_EOF                                                            \
264   { if (level_diff == 2) {                                                    \
265       level_diff++;                                                           \
266       return CLOSE;                                                           \
267     }                                                                         \
268     else {                                                                    \
269       /* Reset our state */                                                   \
270       current_level = -1;                                                     \
271       level_diff = MAXGEDCLEVEL;                                              \
272       /* ... then terminate lex */                                            \
273       yyterminate();                                                          \
274     }                                                                         \
275   } 
276
277
278 #define ACTION_UNEXPECTED                                                     \
279   { gedcom_error("Unexpected character: '%s' (0x%02x)",                       \
280                  yytext, yytext[0]);                                          \
281     return BADTOKEN;                                                          \
282   }
283
284 #endif /* IN_LEX */