# $Id$
# $Name$
-CFLAGS=-Wall -pedantic
+YACC=bison
+LEX=flex
-gedcom_parse: standalone.o lex.gedcom_.o gedcom.tab.o
- cc standalone.o lex.gedcom_.o gedcom.tab.o -o gedcom_parse
+CFLAGS=-g -Wall -pedantic
+YFLAGS=--debug --defines
+LFLAGS=-8
-lex.gedcom_.c: gedcom.lex gedcom.tab.h gedcom.h
- flex -8 -Pgedcom_ gedcom.lex
+gedcom_parse: standalone.o lex.gedcom_1byte_.o lex.gedcom_hilo_.o \
+ lex.gedcom_lohi_.o gedcom.tab.o message.o multilex.o \
+ encoding.o
+ $(CC) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@
+
+lex.gedcom_1byte_.c: gedcom_1byte.lex gedcom.tab.h gedcom.h multilex.h
+ $(LEX) $(LFLAGS) -Pgedcom_1byte_ gedcom_1byte.lex
+
+lex.gedcom_hilo_.c: gedcom_hilo.lex gedcom.tab.h gedcom.h multilex.h
+ $(LEX) $(LFLAGS) -Pgedcom_hilo_ gedcom_hilo.lex
+
+lex.gedcom_lohi_.c: gedcom_lohi.lex gedcom.tab.h gedcom.h multilex.h
+ $(LEX) $(LFLAGS) -Pgedcom_lohi_ gedcom_lohi.lex
gedcom.tab.c gedcom.tab.h: gedcom.y gedcom.h
- bison --debug --defines --name-prefix=gedcom_ gedcom.y
+ $(YACC) $(YFLAGS) --name-prefix=gedcom_ gedcom.y
+# "clean" names a command, not a file: declare it phony so that a stray
+# file called "clean" can never make this rule look up to date.
+.PHONY: clean
clean:
- rm -f core gedcom_parse *.o lex.gedcom_.c gedcom.tab.* gedcom.output
+ rm -f core gedcom_parse test_* *.o lex.gedcom_* \
+ gedcom.tab.* gedcom.output
+
+# Test programs
+
+test_1byte: lex.gedcom_1byte_.test.o message.o encoding.o
+ $(CC) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@
+
+lex.gedcom_1byte_.test.o: lex.gedcom_1byte_.c
+ $(CC) -DLEXER_TEST -c $(CPPFLAGS) $(CFLAGS) $^ -o $@
+
+test_hilo: lex.gedcom_hilo_.test.o message.o encoding.o
+ $(CC) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@
+
+lex.gedcom_hilo_.test.o: lex.gedcom_hilo_.c
+ $(CC) -DLEXER_TEST -c $(CPPFLAGS) $(CFLAGS) $^ -o $@
+
+test_lohi: lex.gedcom_lohi_.test.o message.o encoding.o
+ $(CC) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@
+
+lex.gedcom_lohi_.test.o: lex.gedcom_lohi_.c
+ $(CC) -DLEXER_TEST -c $(CPPFLAGS) $(CFLAGS) $^ -o $@
--- /dev/null
+#include <string.h>
+#include <iconv.h>
+#include "gedcom.h"
+#include "encoding.h"
+
+#define INTERNAL_ENCODING "UTF8"
+
+static iconv_t cd_to_internal = (iconv_t) -1;
+static char int_buf[MAXGEDCLINELEN*2];
+
+/* (Re)open the conversion descriptor that translates text from the
+   character set `fromcode` into INTERNAL_ENCODING.  A descriptor left
+   open by an earlier call is closed first.
+   Returns 1 on success, 0 when iconv_open() fails. */
+int open_conv_to_internal(char* fromcode)
+{
+  const iconv_t no_conv = (iconv_t) -1;
+
+  if (cd_to_internal != no_conv) {
+    iconv_close(cd_to_internal);
+  }
+  cd_to_internal = iconv_open(INTERNAL_ENCODING, fromcode);
+  return (cd_to_internal == no_conv) ? 0 : 1;
+}
+
+/* Release the conversion descriptor, if one is open.
+   The descriptor is reset to the "not open" value afterwards, so that a
+   repeated close or a later open_conv_to_internal() never hands an
+   already-released descriptor to iconv_close(). */
+void close_conv_to_internal()
+{
+  if (cd_to_internal != (iconv_t) -1) {
+    iconv_close(cd_to_internal);
+    cd_to_internal = (iconv_t) -1;
+  }
+}
+
+/* Convert the first `len` bytes of `str` to the internal encoding using
+   the descriptor opened by open_conv_to_internal().
+   Returns a pointer to a static, NUL-terminated buffer that is overwritten
+   by the next call.  The conversion is best effort: if iconv() stops early
+   (invalid input, or output that does not fit), whatever was converted up
+   to that point is returned; with no descriptor open the result is the
+   empty string. */
+char* to_internal(char* str, size_t len)
+{
+  size_t insize = len;
+  /* Keep the last byte out of iconv's reach so the zero-filled buffer
+     stays NUL-terminated even when the output fills it completely. */
+  size_t outsize = sizeof(int_buf) - 1;
+  char *wrptr = int_buf;
+  char *rdptr = str;
+  memset(int_buf, 0, sizeof(int_buf));
+  if (cd_to_internal != (iconv_t) -1)
+    (void) iconv(cd_to_internal, &rdptr, &insize, &wrptr, &outsize);
+  return int_buf;
+}
+
--- /dev/null
+int open_conv_to_internal(char* fromcode);
+void close_conv_to_internal();
+char* to_internal(char* str, size_t len);
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
+#include <errno.h>
#define MAXGEDCLEVEL 99
#define MAXGEDCLINELEN 256
IGNORE_ERRORS
} MECHANISM;
-/* Basic file encoding */
-typedef enum _ENC {
- ONE_BYTE,
- TWO_BYTE_HILO,
- TWO_BYTE_LOHI
-} ENCODING;
int gedcom_error(char* s, ...);
int gedcom_warning(char* s, ...);
+int gedcom_message(char* s, ...);
int gedcom_debug_print(char* s, ...);
void gedcom_set_debug_level(int level);
void gedcom_set_error_handling(MECHANISM mechanism);
void gedcom_set_compat_handling(int enable_compat);
+
int gedcom_parse();
+
int gedcom_lex();
+
extern int line_no;
-extern FILE *gedcom_in;
+++ /dev/null
-/* $Id$ */
-/* $Name$ */
-
-%{
-#include "gedcom.tab.h"
-#include "gedcom.h"
-%}
-
-%s NORMAL
-%s EXPECT_TAG
-
-alpha [A-Za-z_]
-digit [0-9]
-delim " "
-tab [\t]
-hash #
-literal_at @@
-otherchar [\x21-\x22\x24-\x2F\x3A-\x3F\x5B-\x5E\x60\x7B-\x7E\x80-\xFE]
-terminator \x0D|\x0A|\x0D\x0A|\x0A\x0D
-
-any_char {alpha}|{digit}|{otherchar}|{delim}|{hash}|{literal_at}
-any_but_delim {alpha}|{digit}|{otherchar}|{hash}|{literal_at}
-non_at {alpha}|{digit}|{otherchar}|{delim}|{hash}
-alphanum {alpha}|{digit}
-gen_delim {delim}|{tab}
-
-escape @#{any_char}+@
-pointer @{alphanum}{non_at}+@
-
-%{
-int current_level=-1;
-int level_diff=MAXGEDCLEVEL;
-int line_no=1;
-%}
-
-%%
-
- /* The GEDCOM level number is converted into a sequence of opening
- and closing brackets. Simply put, the following GEDCOM fragment:
-
- 0 HEAD
- 1 SOUR genes
- 2 VERS 1.6
- 2 NAME Genes
- 1 DATE 07 OCT 2001
- ...
- 0 TRLR
-
- is converted into:
-
- { HEAD (initial)
- { SOUR genes (1 higher: no closing brackets)
- { VERS 1.6 (1 higher: no closing brackets)
- } { NAME Genes (same level: 1 closing bracket)
- } } { DATE 07 OCT 2001 (1 lower: 2 closing brackets)
- ...
- } { TRLR }
-
- or more clearly:
-
- { HEAD
- { SOUR genes
- { VERS 1.6 }
- { NAME Genes } }
- { DATE 07 OCT 2001
- ... }
- { TRLR }
-
- But because this means that one token is converted into a series
- of tokens, there is some initial code following immediately here
- that returns "pending" tokens. */
-
-%{
-char string_buf[MAXGEDCLINELEN+1];
-
-if (level_diff < 1) {
- level_diff++;
- return CLOSE;
-}
-else if (level_diff == 1) {
- level_diff++;
- return OPEN;
-}
-else {
- /* out of brackets... */
-}
-
-#define MKTAGACTION(tag) \
- { gedcom_lval.string = gedcom_text; \
- BEGIN(NORMAL); \
- return TAG_##tag; }
-
-%}
-
-<INITIAL>{gen_delim}* /* ignore leading whitespace (also tabs) */
-
-<INITIAL>0{digit}+ { gedcom_error ("Level number with leading zero");
- return BADTOKEN;
- }
-
-<INITIAL>{digit}+ { int level = atoi(gedcom_text);
- if ((level < 0) || (level > MAXGEDCLEVEL)) {
- gedcom_error ("Level number out of range [0..%d]",
- MAXGEDCLEVEL);
- return BADTOKEN;
- }
- level_diff = level - current_level;
- BEGIN(EXPECT_TAG);
- current_level = level;
- if (level_diff < 1) {
- level_diff++;
- return CLOSE;
- }
- else if (level_diff == 1) {
- level_diff++;
- return OPEN;
- }
- else {
- /* should never happen (error to GEDCOM spec) */
- gedcom_error ("GEDCOM level number is %d higher than "
- "previous",
- level_diff);
- return BADTOKEN;
- }
- }
-
-<EXPECT_TAG>ABBR MKTAGACTION(ABBR)
-<EXPECT_TAG>ADDR MKTAGACTION(ADDR)
-<EXPECT_TAG>ADR1 MKTAGACTION(ADR1)
-<EXPECT_TAG>ADR2 MKTAGACTION(ADR2)
-<EXPECT_TAG>ADOP MKTAGACTION(ADOP)
-<EXPECT_TAG>AFN MKTAGACTION(AFN)
-<EXPECT_TAG>AGE MKTAGACTION(AGE)
-<EXPECT_TAG>AGNC MKTAGACTION(AGNC)
-<EXPECT_TAG>ALIA MKTAGACTION(ALIA)
-<EXPECT_TAG>ANCE MKTAGACTION(ANCE)
-<EXPECT_TAG>ANCI MKTAGACTION(ANCI)
-<EXPECT_TAG>ANUL MKTAGACTION(ANUL)
-<EXPECT_TAG>ASSO MKTAGACTION(ASSO)
-<EXPECT_TAG>AUTH MKTAGACTION(AUTH)
-<EXPECT_TAG>BAPL MKTAGACTION(BAPL)
-<EXPECT_TAG>BAPM MKTAGACTION(BAPM)
-<EXPECT_TAG>BARM MKTAGACTION(BARM)
-<EXPECT_TAG>BASM MKTAGACTION(BASM)
-<EXPECT_TAG>BIRT MKTAGACTION(BIRT)
-<EXPECT_TAG>BLES MKTAGACTION(BLES)
-<EXPECT_TAG>BLOB MKTAGACTION(BLOB)
-<EXPECT_TAG>BURI MKTAGACTION(BURI)
-<EXPECT_TAG>CALN MKTAGACTION(CALN)
-<EXPECT_TAG>CAST MKTAGACTION(CAST)
-<EXPECT_TAG>CAUS MKTAGACTION(CAUS)
-<EXPECT_TAG>CENS MKTAGACTION(CENS)
-<EXPECT_TAG>CHAN MKTAGACTION(CHAN)
-<EXPECT_TAG>CHAR MKTAGACTION(CHAR)
-<EXPECT_TAG>CHIL MKTAGACTION(CHIL)
-<EXPECT_TAG>CHR MKTAGACTION(CHR)
-<EXPECT_TAG>CHRA MKTAGACTION(CHRA)
-<EXPECT_TAG>CITY MKTAGACTION(CITY)
-<EXPECT_TAG>CONC MKTAGACTION(CONC)
-<EXPECT_TAG>CONF MKTAGACTION(CONF)
-<EXPECT_TAG>CONL MKTAGACTION(CONL)
-<EXPECT_TAG>CONT MKTAGACTION(CONT)
-<EXPECT_TAG>COPR MKTAGACTION(COPR)
-<EXPECT_TAG>CORP MKTAGACTION(CORP)
-<EXPECT_TAG>CREM MKTAGACTION(CREM)
-<EXPECT_TAG>CTRY MKTAGACTION(CTRY)
-<EXPECT_TAG>DATA MKTAGACTION(DATA)
-<EXPECT_TAG>DATE MKTAGACTION(DATE)
-<EXPECT_TAG>DEAT MKTAGACTION(DEAT)
-<EXPECT_TAG>DESC MKTAGACTION(DESC)
-<EXPECT_TAG>DESI MKTAGACTION(DESI)
-<EXPECT_TAG>DEST MKTAGACTION(DEST)
-<EXPECT_TAG>DIV MKTAGACTION(DIV)
-<EXPECT_TAG>DIVF MKTAGACTION(DIVF)
-<EXPECT_TAG>DSCR MKTAGACTION(DSCR)
-<EXPECT_TAG>EDUC MKTAGACTION(EDUC)
-<EXPECT_TAG>EMIG MKTAGACTION(EMIG)
-<EXPECT_TAG>ENDL MKTAGACTION(ENDL)
-<EXPECT_TAG>ENGA MKTAGACTION(ENGA)
-<EXPECT_TAG>EVEN MKTAGACTION(EVEN)
-<EXPECT_TAG>FAM MKTAGACTION(FAM)
-<EXPECT_TAG>FAMC MKTAGACTION(FAMC)
-<EXPECT_TAG>FAMF MKTAGACTION(FAMF)
-<EXPECT_TAG>FAMS MKTAGACTION(FAMS)
-<EXPECT_TAG>FCOM MKTAGACTION(FCOM)
-<EXPECT_TAG>FILE MKTAGACTION(FILE)
-<EXPECT_TAG>FORM MKTAGACTION(FORM)
-<EXPECT_TAG>GEDC MKTAGACTION(GEDC)
-<EXPECT_TAG>GIVN MKTAGACTION(GIVN)
-<EXPECT_TAG>GRAD MKTAGACTION(GRAD)
-<EXPECT_TAG>HEAD MKTAGACTION(HEAD)
-<EXPECT_TAG>HUSB MKTAGACTION(HUSB)
-<EXPECT_TAG>IDNO MKTAGACTION(IDNO)
-<EXPECT_TAG>IMMI MKTAGACTION(IMMI)
-<EXPECT_TAG>INDI MKTAGACTION(INDI)
-<EXPECT_TAG>LANG MKTAGACTION(LANG)
-<EXPECT_TAG>LEGA MKTAGACTION(LEGA)
-<EXPECT_TAG>MARB MKTAGACTION(MARB)
-<EXPECT_TAG>MARC MKTAGACTION(MARC)
-<EXPECT_TAG>MARL MKTAGACTION(MARL)
-<EXPECT_TAG>MARR MKTAGACTION(MARR)
-<EXPECT_TAG>MARS MKTAGACTION(MARS)
-<EXPECT_TAG>MEDI MKTAGACTION(MEDI)
-<EXPECT_TAG>NAME MKTAGACTION(NAME)
-<EXPECT_TAG>NATI MKTAGACTION(NATI)
-<EXPECT_TAG>NATU MKTAGACTION(NATU)
-<EXPECT_TAG>NCHI MKTAGACTION(NCHI)
-<EXPECT_TAG>NICK MKTAGACTION(NICK)
-<EXPECT_TAG>NMR MKTAGACTION(NMR)
-<EXPECT_TAG>NOTE MKTAGACTION(NOTE)
-<EXPECT_TAG>NPFX MKTAGACTION(NPFX)
-<EXPECT_TAG>NSFX MKTAGACTION(NSFX)
-<EXPECT_TAG>OBJE MKTAGACTION(OBJE)
-<EXPECT_TAG>OCCU MKTAGACTION(OCCU)
-<EXPECT_TAG>ORDI MKTAGACTION(ORDI)
-<EXPECT_TAG>ORDN MKTAGACTION(ORDN)
-<EXPECT_TAG>PAGE MKTAGACTION(PAGE)
-<EXPECT_TAG>PEDI MKTAGACTION(PEDI)
-<EXPECT_TAG>PHON MKTAGACTION(PHON)
-<EXPECT_TAG>PLAC MKTAGACTION(PLAC)
-<EXPECT_TAG>POST MKTAGACTION(POST)
-<EXPECT_TAG>PROB MKTAGACTION(PROB)
-<EXPECT_TAG>PROP MKTAGACTION(PROP)
-<EXPECT_TAG>PUBL MKTAGACTION(PUBL)
-<EXPECT_TAG>QUAY MKTAGACTION(QUAY)
-<EXPECT_TAG>REFN MKTAGACTION(REFN)
-<EXPECT_TAG>RELA MKTAGACTION(RELA)
-<EXPECT_TAG>RELI MKTAGACTION(RELI)
-<EXPECT_TAG>REPO MKTAGACTION(REPO)
-<EXPECT_TAG>RESI MKTAGACTION(RESI)
-<EXPECT_TAG>RESN MKTAGACTION(RESN)
-<EXPECT_TAG>RETI MKTAGACTION(RETI)
-<EXPECT_TAG>RFN MKTAGACTION(RFN)
-<EXPECT_TAG>RIN MKTAGACTION(RIN)
-<EXPECT_TAG>ROLE MKTAGACTION(ROLE)
-<EXPECT_TAG>SEX MKTAGACTION(SEX)
-<EXPECT_TAG>SLGC MKTAGACTION(SLGC)
-<EXPECT_TAG>SLGS MKTAGACTION(SLGS)
-<EXPECT_TAG>SOUR MKTAGACTION(SOUR)
-<EXPECT_TAG>SPFX MKTAGACTION(SPFX)
-<EXPECT_TAG>SSN MKTAGACTION(SSN)
-<EXPECT_TAG>STAE MKTAGACTION(STAE)
-<EXPECT_TAG>STAT MKTAGACTION(STAT)
-<EXPECT_TAG>SUBM MKTAGACTION(SUBM)
-<EXPECT_TAG>SUBN MKTAGACTION(SUBN)
-<EXPECT_TAG>SURN MKTAGACTION(SURN)
-<EXPECT_TAG>TEMP MKTAGACTION(TEMP)
-<EXPECT_TAG>TEXT MKTAGACTION(TEXT)
-<EXPECT_TAG>TIME MKTAGACTION(TIME)
-<EXPECT_TAG>TITL MKTAGACTION(TITL)
-<EXPECT_TAG>TRLR MKTAGACTION(TRLR)
-<EXPECT_TAG>TYPE MKTAGACTION(TYPE)
-<EXPECT_TAG>VERS MKTAGACTION(VERS)
-<EXPECT_TAG>WIFE MKTAGACTION(WIFE)
-<EXPECT_TAG>WILL MKTAGACTION(WILL)
-
-<EXPECT_TAG>{alphanum}+ { if (strlen(gedcom_text) > MAXGEDCTAGLEN) {
- gedcom_error("Tag '%s' too long, max %d chars");
- return BADTOKEN;
- }
- strncpy(string_buf, gedcom_text, MAXGEDCTAGLEN+1);
- gedcom_lval.string = string_buf;
- BEGIN(NORMAL);
- return USERTAG;
- }
-
-{delim} { gedcom_lval.string = gedcom_text;
- return DELIM;
- }
-
-{any_but_delim} { gedcom_lval.string = gedcom_text;
- return ANYCHAR;
- }
-
-{escape}/{non_at} { gedcom_lval.string = gedcom_text;
- return ESCAPE;
- }
-
-{pointer} { gedcom_lval.string = gedcom_text;
- return POINTER;
- }
-
- /* Due to the conversion of level numbers into brackets, the
- terminator is not important, so no token is returned here.
- Although not strictly according to the GEDCOM spec, we'll ignore
- whitespace just before the terminator.
- */
-
-{gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); }
-
- /* Eventually we have to return 1 closing bracket (for the trailer).
- We can detect whether we have sent the closing bracket using the
- level_diff (at eof, first it is 2, then we increment it ourselves) */
-
-<<EOF>> { if (level_diff == 2) {
- level_diff++;
- return CLOSE;
- }
- else {
- yyterminate();
- }
- }
-
-. { gedcom_error("Unexpected character: '%s' (0x%02x)",
- gedcom_text, gedcom_text[0]);
- return BADTOKEN;
- }
-
-%%
-
-int gedcom_wrap()
-{
- return 1;
-}
%{
#include "gedcom.h"
+#include "multilex.h"
int count_level = 0;
int fail = 0;
int gedcom_high_level_debug = 0;
int compatibility = 0;
MECHANISM error_mechanism=IMMED_FAIL;
-char string_buf[MAXGEDCLINELEN+1];
+char string_buf[MAXGEDCLINELEN*4+1];
char *string_buf_ptr;
enum _COMPAT {
| DELIM line_item { }
;
-line_item : anychar { CLEAR_BUFFER(string_buf);
+line_item : anychar { int i;
+ CLEAR_BUFFER(string_buf);
string_buf_ptr = string_buf;
/* The following also takes care of '@@' */
- *string_buf_ptr++ = $1[0];
+ if (!strncmp($1, "@@", 3))
+ *string_buf_ptr++ = '@';
+ else
+ for (i=0; i < strlen($1); i++)
+ *string_buf_ptr++ = $1[i];
$$ = string_buf;
}
| ESCAPE { CLEAR_BUFFER(string_buf);
YYERROR;
}
else {
+ int i;
/* The following also takes care of '@@' */
- *string_buf_ptr++ = $2[0];
+ if (!strncmp($2, "@@", 3))
+ *string_buf_ptr++ = '@';
+ else
+ for (i=0; i < strlen($2); i++)
+ *string_buf_ptr++ = $2[i];
$$ = string_buf;
}
}
{
return (compat_flags & compatibility);
}
+
--- /dev/null
+/* $Id$ */
+/* $Name$ */
+
+/* In low-high order, a space is encoded as 0x20 0x00 */
+/* i.e. this is utf-16-le */
+
+%{
+#include "gedcom.tab.h"
+#include "gedcom.h"
+#include "multilex.h"
+#include "encoding.h"
+%}
+
+%s NORMAL
+%s EXPECT_TAG
+
+alpha [A-Za-z_]\x00
+digit [0-9]\x00
+delim \x20\x00
+tab [\t]\x00
+hash #\x00
+literal_at @\x00@\x00
+otherchar [\x21-\x22\x24-\x2F\x3A-\x3F\x5B-\x5E\x60\x7B-\x7E\x80-\xFF]\x00|[\x00-\xFF][\x01-\xFF]
+terminator \x0D\x00|\x0A\x00|\x0D\x00\x0A\x00|\x0A\x00\x0D\x00
+
+any_char {alpha}|{digit}|{otherchar}|{delim}|{hash}|{literal_at}
+any_but_delim {alpha}|{digit}|{otherchar}|{hash}|{literal_at}
+non_at {alpha}|{digit}|{otherchar}|{delim}|{hash}
+alphanum {alpha}|{digit}
+gen_delim {delim}|{tab}
+
+escape @\x00#\x00{any_char}+@\x00
+pointer @\x00{alphanum}{non_at}+@\x00
+
+%{
+static int current_level=-1;
+static int level_diff=MAXGEDCLEVEL;
+
+#ifdef LEXER_TEST
+YYSTYPE gedcom_lval;
+int line_no = 1;
+#endif
+%}
+
+%%
+
+ /* The GEDCOM level number is converted into a sequence of opening
+ and closing brackets. Simply put, the following GEDCOM fragment:
+
+ 0 HEAD
+ 1 SOUR genes
+ 2 VERS 1.6
+ 2 NAME Genes
+ 1 DATE 07 OCT 2001
+ ...
+ 0 TRLR
+
+ is converted into:
+
+ { HEAD (initial)
+ { SOUR genes (1 higher: no closing brackets)
+ { VERS 1.6 (1 higher: no closing brackets)
+ } { NAME Genes (same level: 1 closing bracket)
+ } } { DATE 07 OCT 2001 (1 lower: 2 closing brackets)
+ ...
+ } { TRLR }
+
+ or more clearly:
+
+ { HEAD
+ { SOUR genes
+ { VERS 1.6 }
+ { NAME Genes } }
+ { DATE 07 OCT 2001
+ ... }
+ { TRLR }
+
+ But because this means that one token is converted into a series
+ of tokens, there is some initial code following immediately here
+ that returns "pending" tokens. */
+
+%{
+char string_buf[MAXGEDCLINELEN+1];
+
+if (level_diff < 1) {
+ level_diff++;
+ return CLOSE;
+}
+else if (level_diff == 1) {
+ level_diff++;
+ return OPEN;
+}
+else {
+ /* out of brackets... */
+}
+
+#define TO_INTERNAL(str) to_internal(str, yyleng)
+
+#define MKTAGACTION(tag) \
+ { gedcom_lval.string = TO_INTERNAL(yytext); \
+ BEGIN(NORMAL); \
+ return TAG_##tag; }
+
+%}
+
+<INITIAL>{gen_delim}* /* ignore leading whitespace (also tabs) */
+
+  /* In low-high order the character byte comes first and its 0x00 high
+     byte second, so a leading zero is '0' 0x00 (not 0x00 '0').
+     This rule has to stay ahead of the generic level-number rule: both
+     match the same text and flex resolves equal-length matches in favour
+     of the rule listed first. */
+
+<INITIAL>[0]\x00{digit}+ { gedcom_error ("Level number with leading zero");
+                           return BADTOKEN;
+                         }
+
+  /* Level number at the start of a line.  The matched text is converted
+     to the internal encoding before atoi() sees it.  The difference with
+     the previous line's level selects the first bracket token: CLOSE when
+     the level is the same or lower, OPEN when it is exactly one higher.
+     level_diff is incremented on the way out so that the pending-token
+     code at the top of the scanner returns the remaining brackets on the
+     following calls.  A jump of more than one level is reported as an
+     error. */
+<INITIAL>{digit}+ { int level = atoi(TO_INTERNAL(yytext));
+ if ((level < 0) || (level > MAXGEDCLEVEL)) {
+ gedcom_error ("Level number out of range [0..%d]",
+ MAXGEDCLEVEL);
+ return BADTOKEN;
+ }
+ level_diff = level - current_level;
+ BEGIN(EXPECT_TAG);
+ current_level = level;
+ if (level_diff < 1) {
+ level_diff++;
+ return CLOSE;
+ }
+ else if (level_diff == 1) {
+ level_diff++;
+ return OPEN;
+ }
+ else {
+ /* should never happen (error to GEDCOM spec) */
+ gedcom_error ("GEDCOM level number is %d higher than "
+ "previous",
+ level_diff);
+ return BADTOKEN;
+ }
+ }
+
+<EXPECT_TAG>A\x00B\x00B\x00R\x00 MKTAGACTION(ABBR)
+<EXPECT_TAG>A\x00D\x00D\x00R\x00 MKTAGACTION(ADDR)
+<EXPECT_TAG>A\x00D\x00R\x001\x00 MKTAGACTION(ADR1)
+<EXPECT_TAG>A\x00D\x00R\x002\x00 MKTAGACTION(ADR2)
+<EXPECT_TAG>A\x00D\x00O\x00P\x00 MKTAGACTION(ADOP)
+<EXPECT_TAG>A\x00F\x00N\x00 MKTAGACTION(AFN)
+<EXPECT_TAG>A\x00G\x00E\x00 MKTAGACTION(AGE)
+<EXPECT_TAG>A\x00G\x00N\x00C\x00 MKTAGACTION(AGNC)
+<EXPECT_TAG>A\x00L\x00I\x00A\x00 MKTAGACTION(ALIA)
+<EXPECT_TAG>A\x00N\x00C\x00E\x00 MKTAGACTION(ANCE)
+<EXPECT_TAG>A\x00N\x00C\x00I\x00 MKTAGACTION(ANCI)
+<EXPECT_TAG>A\x00N\x00U\x00L\x00 MKTAGACTION(ANUL)
+<EXPECT_TAG>A\x00S\x00S\x00O\x00 MKTAGACTION(ASSO)
+<EXPECT_TAG>A\x00U\x00T\x00H\x00 MKTAGACTION(AUTH)
+<EXPECT_TAG>B\x00A\x00P\x00L\x00 MKTAGACTION(BAPL)
+<EXPECT_TAG>B\x00A\x00P\x00M\x00 MKTAGACTION(BAPM)
+<EXPECT_TAG>B\x00A\x00R\x00M\x00 MKTAGACTION(BARM)
+<EXPECT_TAG>B\x00A\x00S\x00M\x00 MKTAGACTION(BASM)
+<EXPECT_TAG>B\x00I\x00R\x00T\x00 MKTAGACTION(BIRT)
+<EXPECT_TAG>B\x00L\x00E\x00S\x00 MKTAGACTION(BLES)
+<EXPECT_TAG>B\x00L\x00O\x00B\x00 MKTAGACTION(BLOB)
+<EXPECT_TAG>B\x00U\x00R\x00I\x00 MKTAGACTION(BURI)
+<EXPECT_TAG>C\x00A\x00L\x00N\x00 MKTAGACTION(CALN)
+<EXPECT_TAG>C\x00A\x00S\x00T\x00 MKTAGACTION(CAST)
+<EXPECT_TAG>C\x00A\x00U\x00S\x00 MKTAGACTION(CAUS)
+<EXPECT_TAG>C\x00E\x00N\x00S\x00 MKTAGACTION(CENS)
+<EXPECT_TAG>C\x00H\x00A\x00N\x00 MKTAGACTION(CHAN)
+<EXPECT_TAG>C\x00H\x00A\x00R\x00 MKTAGACTION(CHAR)
+<EXPECT_TAG>C\x00H\x00I\x00L\x00 MKTAGACTION(CHIL)
+<EXPECT_TAG>C\x00H\x00R\x00 MKTAGACTION(CHR)
+<EXPECT_TAG>C\x00H\x00R\x00A\x00 MKTAGACTION(CHRA)
+<EXPECT_TAG>C\x00I\x00T\x00Y\x00 MKTAGACTION(CITY)
+<EXPECT_TAG>C\x00O\x00N\x00C\x00 MKTAGACTION(CONC)
+<EXPECT_TAG>C\x00O\x00N\x00F\x00 MKTAGACTION(CONF)
+<EXPECT_TAG>C\x00O\x00N\x00L\x00 MKTAGACTION(CONL)
+<EXPECT_TAG>C\x00O\x00N\x00T\x00 MKTAGACTION(CONT)
+<EXPECT_TAG>C\x00O\x00P\x00R\x00 MKTAGACTION(COPR)
+<EXPECT_TAG>C\x00O\x00R\x00P\x00 MKTAGACTION(CORP)
+<EXPECT_TAG>C\x00R\x00E\x00M\x00 MKTAGACTION(CREM)
+<EXPECT_TAG>C\x00T\x00R\x00Y\x00 MKTAGACTION(CTRY)
+<EXPECT_TAG>D\x00A\x00T\x00A\x00 MKTAGACTION(DATA)
+<EXPECT_TAG>D\x00A\x00T\x00E\x00 MKTAGACTION(DATE)
+<EXPECT_TAG>D\x00E\x00A\x00T\x00 MKTAGACTION(DEAT)
+<EXPECT_TAG>D\x00E\x00S\x00C\x00 MKTAGACTION(DESC)
+<EXPECT_TAG>D\x00E\x00S\x00I\x00 MKTAGACTION(DESI)
+<EXPECT_TAG>D\x00E\x00S\x00T\x00 MKTAGACTION(DEST)
+<EXPECT_TAG>D\x00I\x00V\x00 MKTAGACTION(DIV)
+<EXPECT_TAG>D\x00I\x00V\x00F\x00 MKTAGACTION(DIVF)
+<EXPECT_TAG>D\x00S\x00C\x00R\x00 MKTAGACTION(DSCR)
+<EXPECT_TAG>E\x00D\x00U\x00C\x00 MKTAGACTION(EDUC)
+<EXPECT_TAG>E\x00M\x00I\x00G\x00 MKTAGACTION(EMIG)
+<EXPECT_TAG>E\x00N\x00D\x00L\x00 MKTAGACTION(ENDL)
+<EXPECT_TAG>E\x00N\x00G\x00A\x00 MKTAGACTION(ENGA)
+<EXPECT_TAG>E\x00V\x00E\x00N\x00 MKTAGACTION(EVEN)
+<EXPECT_TAG>F\x00A\x00M\x00 MKTAGACTION(FAM)
+<EXPECT_TAG>F\x00A\x00M\x00C\x00 MKTAGACTION(FAMC)
+<EXPECT_TAG>F\x00A\x00M\x00F\x00 MKTAGACTION(FAMF)
+<EXPECT_TAG>F\x00A\x00M\x00S\x00 MKTAGACTION(FAMS)
+<EXPECT_TAG>F\x00C\x00O\x00M\x00 MKTAGACTION(FCOM)
+<EXPECT_TAG>F\x00I\x00L\x00E\x00 MKTAGACTION(FILE)
+<EXPECT_TAG>F\x00O\x00R\x00M\x00 MKTAGACTION(FORM)
+<EXPECT_TAG>G\x00E\x00D\x00C\x00 MKTAGACTION(GEDC)
+<EXPECT_TAG>G\x00I\x00V\x00N\x00 MKTAGACTION(GIVN)
+<EXPECT_TAG>G\x00R\x00A\x00D\x00 MKTAGACTION(GRAD)
+<EXPECT_TAG>H\x00E\x00A\x00D\x00 MKTAGACTION(HEAD)
+<EXPECT_TAG>H\x00U\x00S\x00B\x00 MKTAGACTION(HUSB)
+<EXPECT_TAG>I\x00D\x00N\x00O\x00 MKTAGACTION(IDNO)
+<EXPECT_TAG>I\x00M\x00M\x00I\x00 MKTAGACTION(IMMI)
+<EXPECT_TAG>I\x00N\x00D\x00I\x00 MKTAGACTION(INDI)
+<EXPECT_TAG>L\x00A\x00N\x00G\x00 MKTAGACTION(LANG)
+<EXPECT_TAG>L\x00E\x00G\x00A\x00 MKTAGACTION(LEGA)
+<EXPECT_TAG>M\x00A\x00R\x00B\x00 MKTAGACTION(MARB)
+<EXPECT_TAG>M\x00A\x00R\x00C\x00 MKTAGACTION(MARC)
+<EXPECT_TAG>M\x00A\x00R\x00L\x00 MKTAGACTION(MARL)
+<EXPECT_TAG>M\x00A\x00R\x00R\x00 MKTAGACTION(MARR)
+<EXPECT_TAG>M\x00A\x00R\x00S\x00 MKTAGACTION(MARS)
+<EXPECT_TAG>M\x00E\x00D\x00I\x00 MKTAGACTION(MEDI)
+<EXPECT_TAG>N\x00A\x00M\x00E\x00 MKTAGACTION(NAME)
+<EXPECT_TAG>N\x00A\x00T\x00I\x00 MKTAGACTION(NATI)
+<EXPECT_TAG>N\x00A\x00T\x00U\x00 MKTAGACTION(NATU)
+<EXPECT_TAG>N\x00C\x00H\x00I\x00 MKTAGACTION(NCHI)
+<EXPECT_TAG>N\x00I\x00C\x00K\x00 MKTAGACTION(NICK)
+<EXPECT_TAG>N\x00M\x00R\x00 MKTAGACTION(NMR)
+<EXPECT_TAG>N\x00O\x00T\x00E\x00 MKTAGACTION(NOTE)
+<EXPECT_TAG>N\x00P\x00F\x00X\x00 MKTAGACTION(NPFX)
+<EXPECT_TAG>N\x00S\x00F\x00X\x00 MKTAGACTION(NSFX)
+<EXPECT_TAG>O\x00B\x00J\x00E\x00 MKTAGACTION(OBJE)
+<EXPECT_TAG>O\x00C\x00C\x00U\x00 MKTAGACTION(OCCU)
+<EXPECT_TAG>O\x00R\x00D\x00I\x00 MKTAGACTION(ORDI)
+<EXPECT_TAG>O\x00R\x00D\x00N\x00 MKTAGACTION(ORDN)
+<EXPECT_TAG>P\x00A\x00G\x00E\x00 MKTAGACTION(PAGE)
+<EXPECT_TAG>P\x00E\x00D\x00I\x00 MKTAGACTION(PEDI)
+<EXPECT_TAG>P\x00H\x00O\x00N\x00 MKTAGACTION(PHON)
+<EXPECT_TAG>P\x00L\x00A\x00C\x00 MKTAGACTION(PLAC)
+<EXPECT_TAG>P\x00O\x00S\x00T\x00 MKTAGACTION(POST)
+<EXPECT_TAG>P\x00R\x00O\x00B\x00 MKTAGACTION(PROB)
+<EXPECT_TAG>P\x00R\x00O\x00P\x00 MKTAGACTION(PROP)
+<EXPECT_TAG>P\x00U\x00B\x00L\x00 MKTAGACTION(PUBL)
+<EXPECT_TAG>Q\x00U\x00A\x00Y\x00 MKTAGACTION(QUAY)
+<EXPECT_TAG>R\x00E\x00F\x00N\x00 MKTAGACTION(REFN)
+<EXPECT_TAG>R\x00E\x00L\x00A\x00 MKTAGACTION(RELA)
+<EXPECT_TAG>R\x00E\x00L\x00I\x00 MKTAGACTION(RELI)
+<EXPECT_TAG>R\x00E\x00P\x00O\x00 MKTAGACTION(REPO)
+<EXPECT_TAG>R\x00E\x00S\x00I\x00 MKTAGACTION(RESI)
+<EXPECT_TAG>R\x00E\x00S\x00N\x00 MKTAGACTION(RESN)
+<EXPECT_TAG>R\x00E\x00T\x00I\x00 MKTAGACTION(RETI)
+<EXPECT_TAG>R\x00F\x00N\x00 MKTAGACTION(RFN)
+<EXPECT_TAG>R\x00I\x00N\x00 MKTAGACTION(RIN)
+<EXPECT_TAG>R\x00O\x00L\x00E\x00 MKTAGACTION(ROLE)
+<EXPECT_TAG>S\x00E\x00X\x00 MKTAGACTION(SEX)
+<EXPECT_TAG>S\x00L\x00G\x00C\x00 MKTAGACTION(SLGC)
+<EXPECT_TAG>S\x00L\x00G\x00S\x00 MKTAGACTION(SLGS)
+<EXPECT_TAG>S\x00O\x00U\x00R\x00 MKTAGACTION(SOUR)
+<EXPECT_TAG>S\x00P\x00F\x00X\x00 MKTAGACTION(SPFX)
+<EXPECT_TAG>S\x00S\x00N\x00 MKTAGACTION(SSN)
+<EXPECT_TAG>S\x00T\x00A\x00E\x00 MKTAGACTION(STAE)
+<EXPECT_TAG>S\x00T\x00A\x00T\x00 MKTAGACTION(STAT)
+<EXPECT_TAG>S\x00U\x00B\x00M\x00 MKTAGACTION(SUBM)
+<EXPECT_TAG>S\x00U\x00B\x00N\x00 MKTAGACTION(SUBN)
+<EXPECT_TAG>S\x00U\x00R\x00N\x00 MKTAGACTION(SURN)
+<EXPECT_TAG>T\x00E\x00M\x00P\x00 MKTAGACTION(TEMP)
+<EXPECT_TAG>T\x00E\x00X\x00T\x00 MKTAGACTION(TEXT)
+<EXPECT_TAG>T\x00I\x00M\x00E\x00 MKTAGACTION(TIME)
+<EXPECT_TAG>T\x00I\x00T\x00L\x00 MKTAGACTION(TITL)
+<EXPECT_TAG>T\x00R\x00L\x00R\x00 MKTAGACTION(TRLR)
+<EXPECT_TAG>T\x00Y\x00P\x00E\x00 MKTAGACTION(TYPE)
+<EXPECT_TAG>V\x00E\x00R\x00S\x00 MKTAGACTION(VERS)
+<EXPECT_TAG>W\x00I\x00F\x00E\x00 MKTAGACTION(WIFE)
+<EXPECT_TAG>W\x00I\x00L\x00L\x00 MKTAGACTION(WILL)
+
+  /* User-defined tag.  The matched text is raw two-byte low-high data, so
+     every character carries an embedded 0x00: strlen()/strncpy() would
+     stop after the first character.  The length in characters is
+     yyleng / 2, and the text is converted straight from yytext (the
+     conversion copies it into its own buffer). */
+
+<EXPECT_TAG>{alphanum}+ { if (yyleng / 2 > MAXGEDCTAGLEN) {
+                            gedcom_error("Tag '%s' too long, max %d chars",
+                                         TO_INTERNAL(yytext), MAXGEDCTAGLEN);
+                            return BADTOKEN;
+                          }
+                          gedcom_lval.string = TO_INTERNAL(yytext);
+                          BEGIN(NORMAL);
+                          return USERTAG;
+                        }
+
+{delim} { gedcom_lval.string = TO_INTERNAL(yytext);
+ return DELIM;
+ }
+
+{any_but_delim} { gedcom_lval.string = TO_INTERNAL(yytext);
+ return ANYCHAR;
+ }
+
+{escape}/{non_at} { gedcom_lval.string = TO_INTERNAL(yytext);
+ return ESCAPE;
+ }
+
+{pointer} { gedcom_lval.string = TO_INTERNAL(yytext);
+ return POINTER;
+ }
+
+ /* Due to the conversion of level numbers into brackets, the
+ terminator is not important, so no token is returned here.
+ Although not strictly according to the GEDCOM spec, we'll ignore
+ whitespace just before the terminator.
+ */
+
+{gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); }
+
+ /* Eventually we have to return 1 closing bracket (for the trailer).
+ We can detect whether we have sent the closing bracket using the
+ level_diff (at eof, first it is 2, then we increment it ourselves) */
+
+<<EOF>> { if (level_diff == 2) {
+ level_diff++;
+ return CLOSE;
+ }
+ else {
+ yyterminate();
+ }
+ }
+
+  /* Catch-all for a byte no other rule accepts.  The byte is cast to
+     unsigned char before printing: a plain (signed) char >= 0x80 would be
+     sign-extended and show up as ffffffXX. */
+
+. { gedcom_error("Unexpected character: '%s' (0x%02x)",
+                 yytext, (unsigned char) yytext[0]);
+    return BADTOKEN;
+  }
+
+%%
+
+/* End-of-input hook called by the generated scanner.  Always returns 1,
+   which per the flex manual means there is no further input to switch
+   to, so scanning ends at the first end of file. */
+int yywrap()
+{
+ return 1;
+}
+
+#ifdef LEXER_TEST
+
+/* Test driver, compiled only when LEXER_TEST is defined: opens the
+   UTF16LE conversion, then prints one word per token returned by
+   gedcom_lohi_lex() until it returns 0.
+   Exit status is 1 when the conversion context cannot be opened, else 0. */
+int main()
+{
+  int token;
+
+  if (!open_conv_to_internal("UTF16LE")) {
+    gedcom_error("Unable to open conversion context: %s",
+                 strerror(errno));
+    return 1;
+  }
+
+  for (token = gedcom_lohi_lex(); token != 0; token = gedcom_lohi_lex()) {
+    if (token == BADTOKEN)
+      printf("BADTOKEN ");
+    else if (token == OPEN)
+      printf("OPEN ");
+    else if (token == CLOSE)
+      printf("CLOSE ");
+    else if (token == ESCAPE)
+      printf("ESCAPE(%s) ", gedcom_lval.string);
+    else if (token == DELIM)
+      printf("DELIM ");
+    else if (token == ANYCHAR)
+      printf("%s ", gedcom_lval.string);
+    else if (token == POINTER)
+      printf("POINTER(%s) ", gedcom_lval.string);
+    else if (token == USERTAG)
+      printf("USERTAG(%s) ", gedcom_lval.string);
+    else
+      printf("TAG(%s) ", gedcom_lval.string);
+  }
+
+  printf("\n");
+  close_conv_to_internal();
+  return 0;
+}
+#endif
/* $Name$ */
#include "gedcom.h"
+#include "multilex.h"
void show_help ()
{
printf(" -da Debug setting: libgedcom + yacc debug messages\n");
}
-int determine_encoding(FILE* f)
-{
- char first[2];
-
- fread(first, 1, 2, f);
- if ((first[0] == '0') && (first[1] == ' ')) {
- gedcom_warning("One-byte encoding");
- fseek(f, 0, 0);
- return ONE_BYTE;
- }
- else if ((first[0] == '\0') && (first[1] == '0'))
- {
- gedcom_warning("Two-byte encoding, high-low");
- fseek(f, 0, 0);
- return TWO_BYTE_HILO;
- }
- else if ((first[0] == '\xFE') && (first[1] == '\xFF'))
- {
- gedcom_warning("Two-byte encoding, high-low, with BOM");
- return TWO_BYTE_HILO;
- }
- else if ((first[0] == '0') && (first[1] == '\0'))
- {
- gedcom_warning("Two-byte encoding, low-high");
- fseek(f, 0, 0);
- return TWO_BYTE_LOHI;
- }
- else if ((first[0] == '\xFF') && (first[1] == '\xFE'))
- {
- gedcom_warning("Two-byte encoding, low-high, with BOM");
- return TWO_BYTE_LOHI;
- }
- else {
- gedcom_warning("Unknown encoding, falling back to one-byte");
- fseek(f, 0, 0);
- return ONE_BYTE;
- }
-}
-
-int gedcom_xxx_parse(char* file_name)
-{
- ENCODING enc;
- FILE* file = fopen (file_name, "r");
- if (!file) {
- printf("Could not open file '%s'\n", file_name);
- exit(1);
- }
- enc = determine_encoding(file);
-
- if (enc == ONE_BYTE) {
- gedcom_in = file;
- return gedcom_parse();
- }
- else {
- printf("No parser yet for encoding\n");
- exit(1);
- }
-}
-
int main(int argc, char* argv[])
{
MECHANISM mech = IMMED_FAIL;
return 1;
}
}
-
-int gedcom_warning(char* s, ...)
-{
- int res;
- va_list ap;
-
- va_start(ap, s);
- fprintf(stderr, "Warning on line %d: ", line_no);
- res = vfprintf(stderr, s, ap);
- fprintf(stderr, "\n");
- va_end(ap);
-
- return res;
-}
-
-int gedcom_error(char* s, ...)
-{
- int res;
- va_list ap;
-
- va_start(ap, s);
- fprintf(stderr, "Error on line %d: ", line_no);
- res = vfprintf(stderr, s, ap);
- fprintf(stderr, "\n");
- va_end(ap);
-
- return res;
-}