YFLAGS=--debug --defines
LFLAGS=-8
+all: ansel_module gedcom_parse
+
gedcom_parse: standalone.o lex.gedcom_1byte_.o lex.gedcom_hilo_.o \
lex.gedcom_lohi_.o gedcom.tab.o message.o multilex.o \
encoding.o
$(CC) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@
-libgedcom.so:
+ansel_module:
+ cd ansel && $(MAKE)
lex.gedcom_1byte_.c: gedcom_1byte.lex gedcom.tab.h gedcom.h multilex.h
$(LEX) $(LFLAGS) -Pgedcom_1byte_ gedcom_1byte.lex
clean:
rm -f core gedcom_parse test_* *.o lex.gedcom_* \
gedcom.tab.* gedcom.output
+ cd ansel && $(MAKE) clean
# Lexer test programs
# Test of parser
-test: gedcom_parse
+test: all
@export GCONV_PATH=./ansel; \
for file in t/*.ged; do \
echo "=== testing $$file"; \
+/* $Id$ */
+/* $Name$ */
+
/* Generic conversion to and from ANSI Z39.47 (also known as ANSEL)
Based on the ansi_x3.110.c file from the glibc sources
Data coming from:
#include <gconv.h>
#include <stdint.h>
#include <string.h>
-#include <stdio.h>
static const uint32_t to_ucs4[256] =
{
+# $Id$
+# $Name$
+
LIBTOOL=libtool
MODPATH=/usr/local/lib
MODULES=ANSI_Z39.47.so
+CFLAGS=-g
all: $(MODULES)
%.so: %.lo
- $(LIBTOOL) $(CC) -module -avoid-version -o $*.la $^ -rpath $(MODPATH)
+ $(LIBTOOL) $(CC) -module -avoid-version $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $*.la -rpath $(MODPATH)
mv .libs/$@ $@
%.lo: %.c
- $(LIBTOOL) $(CC) -c $^
+ $(LIBTOOL) $(CC) -c $(CPPFLAGS) $(CFLAGS) $^
clean:
rm -rf .libs
+# $Id$
+# $Name$
# All lines contain the following information:
# If the lines start with `module'
+/* $Id$ */
+/* $Name$ */
+
#include <string.h>
#include <iconv.h>
#include <search.h>
ENCODING_CONF_FILE);
return;
}
- else if (buffer[0] != '#') {
+ else if ((buffer[0] != '#') && (strcmp(buffer, "\n") != 0)) {
if (sscanf(buffer, "%s %s %s", gedcom_n, charwidth, iconv_n) == 3) {
add_encoding(gedcom_n, charwidth, iconv_n);
}
memset(conv_buf, 0, sizeof(conv_buf));
conv_buf_size = 0;
cd_to_internal = iconv_open(INTERNAL_ENCODING, encoding);
+ if (cd_to_internal == (iconv_t) -1) {
+ gedcom_error("Error opening conversion context for encoding %s: %s",
+ encoding, strerror(errno));
+ }
}
return (cd_to_internal != (iconv_t) -1);
}
+/* $Id$ */
+/* $Name$ */
+
/* Basic file encoding */
#ifndef __ENCODING_H
#define __ENCODING_H
+# $Id$
+# $Name$
+
# Mapping of charsets for gedcom parsing
# Each line contains (separated by whitespace):
# - the gedcom name
# - a token identifying the width of characters and the ordering;
# currently supported values: 1, 2_LOHI, 2_HILO
# - the iconv name of the charset
+
+# First the encodings supported by the GEDCOM standard
UNICODE 2_LOHI UTF16LE
UNICODE 2_HILO UTF16BE
ASCII 1 ASCII
-ANSI 1 CP1252
ANSEL 1 ANSEL
+
+# Then some very frequently used non-standard encodings:
+ANSI 1 CP1252
/* $Id$ */
/* $Name$ */
+
#ifndef __GEDCOM_H
#define __GEDCOM_H
#include <stdarg.h>
--- /dev/null
+/* $Id$ */
+/* $Name$ */
+
+%{
+#include "gedcom.tab.h"
+#include "gedcom.h"
+#include "multilex.h"
+#include "encoding.h"
+
+#define YY_NO_UNPUT
+%}
+
+%s NORMAL
+%s EXPECT_TAG
+
+alpha [A-Za-z_]
+digit [0-9]
+delim " "
+tab [\t]
+hash #
+literal_at @@
+otherchar [\x21-\x22\x24-\x2F\x3A-\x3F\x5B-\x5E\x60\x7B-\x7E\x80-\xFE]
+terminator \x0D|\x0A|\x0D\x0A|\x0A\x0D
+
+any_char {alpha}|{digit}|{otherchar}|{delim}|{hash}|{literal_at}
+any_but_delim {alpha}|{digit}|{otherchar}|{hash}|{literal_at}
+non_at {alpha}|{digit}|{otherchar}|{delim}|{hash}
+alphanum {alpha}|{digit}
+gen_delim {delim}|{tab}
+
+escape @#{any_char}+@
+pointer @{alphanum}{non_at}+@
+
+%{
+static int current_level=-1;
+static int level_diff=MAXGEDCLEVEL;
+
+#ifdef LEXER_TEST
+YYSTYPE gedcom_lval;
+int line_no = 1;
+#endif
+
+%}
+
+%%
+
+ /* The GEDCOM level number is converted into a sequence of opening
+ and closing brackets. Simply put, the following GEDCOM fragment:
+
+ 0 HEAD
+ 1 SOUR genes
+ 2 VERS 1.6
+ 2 NAME Genes
+ 1 DATE 07 OCT 2001
+ ...
+ 0 TRLR
+
+ is converted into:
+
+ { HEAD (initial)
+ { SOUR genes (1 higher: no closing brackets)
+ { VERS 1.6 (1 higher: no closing brackets)
+ } { NAME Genes (same level: 1 closing bracket)
+ } } { DATE 07 OCT 2001 (1 lower: 2 closing brackets)
+ ...
+ } { TRLR }
+
+ or more clearly:
+
+ { HEAD
+ { SOUR genes
+ { VERS 1.6 }
+ { NAME Genes } }
+ { DATE 07 OCT 2001
+ ... }
+ { TRLR }
+
+ But because this means that one token is converted into a series
+ of tokens, there is some initial code following immediately here
+ that returns "pending" tokens. */
+
+%{
+char string_buf[MAXGEDCLINELEN+1];
+
+if (level_diff < 1) {
+ level_diff++;
+ return CLOSE;
+}
+else if (level_diff == 1) {
+ level_diff++;
+ return OPEN;
+}
+else {
+ /* out of brackets... */
+}
+
+#define TO_INTERNAL(str) to_internal(str, yyleng)
+
+#define MKTAGACTION(tag) \
+ { gedcom_lval.string = TO_INTERNAL(yytext); \
+ BEGIN(NORMAL); \
+ return TAG_##tag; }
+
+%}
+
+<INITIAL>{gen_delim}* /* ignore leading whitespace (also tabs) */
+
+<INITIAL>0{digit}+ { gedcom_error ("Level number with leading zero");
+ return BADTOKEN;
+ }
+
+<INITIAL>{digit}+ { int level = atoi(TO_INTERNAL(yytext));
+ if ((level < 0) || (level > MAXGEDCLEVEL)) {
+ gedcom_error ("Level number out of range [0..%d]",
+ MAXGEDCLEVEL);
+ return BADTOKEN;
+ }
+ level_diff = level - current_level;
+ BEGIN(EXPECT_TAG);
+ current_level = level;
+ if (level_diff < 1) {
+ level_diff++;
+ return CLOSE;
+ }
+ else if (level_diff == 1) {
+ level_diff++;
+ return OPEN;
+ }
+ else {
+ /* should never happen (error to GEDCOM spec) */
+ gedcom_error ("GEDCOM level number is %d higher than "
+ "previous",
+ level_diff);
+ return BADTOKEN;
+ }
+ }
+
+<EXPECT_TAG>ABBR MKTAGACTION(ABBR)
+<EXPECT_TAG>ADDR MKTAGACTION(ADDR)
+<EXPECT_TAG>ADR1 MKTAGACTION(ADR1)
+<EXPECT_TAG>ADR2 MKTAGACTION(ADR2)
+<EXPECT_TAG>ADOP MKTAGACTION(ADOP)
+<EXPECT_TAG>AFN MKTAGACTION(AFN)
+<EXPECT_TAG>AGE MKTAGACTION(AGE)
+<EXPECT_TAG>AGNC MKTAGACTION(AGNC)
+<EXPECT_TAG>ALIA MKTAGACTION(ALIA)
+<EXPECT_TAG>ANCE MKTAGACTION(ANCE)
+<EXPECT_TAG>ANCI MKTAGACTION(ANCI)
+<EXPECT_TAG>ANUL MKTAGACTION(ANUL)
+<EXPECT_TAG>ASSO MKTAGACTION(ASSO)
+<EXPECT_TAG>AUTH MKTAGACTION(AUTH)
+<EXPECT_TAG>BAPL MKTAGACTION(BAPL)
+<EXPECT_TAG>BAPM MKTAGACTION(BAPM)
+<EXPECT_TAG>BARM MKTAGACTION(BARM)
+<EXPECT_TAG>BASM MKTAGACTION(BASM)
+<EXPECT_TAG>BIRT MKTAGACTION(BIRT)
+<EXPECT_TAG>BLES MKTAGACTION(BLES)
+<EXPECT_TAG>BLOB MKTAGACTION(BLOB)
+<EXPECT_TAG>BURI MKTAGACTION(BURI)
+<EXPECT_TAG>CALN MKTAGACTION(CALN)
+<EXPECT_TAG>CAST MKTAGACTION(CAST)
+<EXPECT_TAG>CAUS MKTAGACTION(CAUS)
+<EXPECT_TAG>CENS MKTAGACTION(CENS)
+<EXPECT_TAG>CHAN MKTAGACTION(CHAN)
+<EXPECT_TAG>CHAR MKTAGACTION(CHAR)
+<EXPECT_TAG>CHIL MKTAGACTION(CHIL)
+<EXPECT_TAG>CHR MKTAGACTION(CHR)
+<EXPECT_TAG>CHRA MKTAGACTION(CHRA)
+<EXPECT_TAG>CITY MKTAGACTION(CITY)
+<EXPECT_TAG>CONC MKTAGACTION(CONC)
+<EXPECT_TAG>CONF MKTAGACTION(CONF)
+<EXPECT_TAG>CONL MKTAGACTION(CONL)
+<EXPECT_TAG>CONT MKTAGACTION(CONT)
+<EXPECT_TAG>COPR MKTAGACTION(COPR)
+<EXPECT_TAG>CORP MKTAGACTION(CORP)
+<EXPECT_TAG>CREM MKTAGACTION(CREM)
+<EXPECT_TAG>CTRY MKTAGACTION(CTRY)
+<EXPECT_TAG>DATA MKTAGACTION(DATA)
+<EXPECT_TAG>DATE MKTAGACTION(DATE)
+<EXPECT_TAG>DEAT MKTAGACTION(DEAT)
+<EXPECT_TAG>DESC MKTAGACTION(DESC)
+<EXPECT_TAG>DESI MKTAGACTION(DESI)
+<EXPECT_TAG>DEST MKTAGACTION(DEST)
+<EXPECT_TAG>DIV MKTAGACTION(DIV)
+<EXPECT_TAG>DIVF MKTAGACTION(DIVF)
+<EXPECT_TAG>DSCR MKTAGACTION(DSCR)
+<EXPECT_TAG>EDUC MKTAGACTION(EDUC)
+<EXPECT_TAG>EMIG MKTAGACTION(EMIG)
+<EXPECT_TAG>ENDL MKTAGACTION(ENDL)
+<EXPECT_TAG>ENGA MKTAGACTION(ENGA)
+<EXPECT_TAG>EVEN MKTAGACTION(EVEN)
+<EXPECT_TAG>FAM MKTAGACTION(FAM)
+<EXPECT_TAG>FAMC MKTAGACTION(FAMC)
+<EXPECT_TAG>FAMF MKTAGACTION(FAMF)
+<EXPECT_TAG>FAMS MKTAGACTION(FAMS)
+<EXPECT_TAG>FCOM MKTAGACTION(FCOM)
+<EXPECT_TAG>FILE MKTAGACTION(FILE)
+<EXPECT_TAG>FORM MKTAGACTION(FORM)
+<EXPECT_TAG>GEDC MKTAGACTION(GEDC)
+<EXPECT_TAG>GIVN MKTAGACTION(GIVN)
+<EXPECT_TAG>GRAD MKTAGACTION(GRAD)
+<EXPECT_TAG>HEAD MKTAGACTION(HEAD)
+<EXPECT_TAG>HUSB MKTAGACTION(HUSB)
+<EXPECT_TAG>IDNO MKTAGACTION(IDNO)
+<EXPECT_TAG>IMMI MKTAGACTION(IMMI)
+<EXPECT_TAG>INDI MKTAGACTION(INDI)
+<EXPECT_TAG>LANG MKTAGACTION(LANG)
+<EXPECT_TAG>LEGA MKTAGACTION(LEGA)
+<EXPECT_TAG>MARB MKTAGACTION(MARB)
+<EXPECT_TAG>MARC MKTAGACTION(MARC)
+<EXPECT_TAG>MARL MKTAGACTION(MARL)
+<EXPECT_TAG>MARR MKTAGACTION(MARR)
+<EXPECT_TAG>MARS MKTAGACTION(MARS)
+<EXPECT_TAG>MEDI MKTAGACTION(MEDI)
+<EXPECT_TAG>NAME MKTAGACTION(NAME)
+<EXPECT_TAG>NATI MKTAGACTION(NATI)
+<EXPECT_TAG>NATU MKTAGACTION(NATU)
+<EXPECT_TAG>NCHI MKTAGACTION(NCHI)
+<EXPECT_TAG>NICK MKTAGACTION(NICK)
+<EXPECT_TAG>NMR MKTAGACTION(NMR)
+<EXPECT_TAG>NOTE MKTAGACTION(NOTE)
+<EXPECT_TAG>NPFX MKTAGACTION(NPFX)
+<EXPECT_TAG>NSFX MKTAGACTION(NSFX)
+<EXPECT_TAG>OBJE MKTAGACTION(OBJE)
+<EXPECT_TAG>OCCU MKTAGACTION(OCCU)
+<EXPECT_TAG>ORDI MKTAGACTION(ORDI)
+<EXPECT_TAG>ORDN MKTAGACTION(ORDN)
+<EXPECT_TAG>PAGE MKTAGACTION(PAGE)
+<EXPECT_TAG>PEDI MKTAGACTION(PEDI)
+<EXPECT_TAG>PHON MKTAGACTION(PHON)
+<EXPECT_TAG>PLAC MKTAGACTION(PLAC)
+<EXPECT_TAG>POST MKTAGACTION(POST)
+<EXPECT_TAG>PROB MKTAGACTION(PROB)
+<EXPECT_TAG>PROP MKTAGACTION(PROP)
+<EXPECT_TAG>PUBL MKTAGACTION(PUBL)
+<EXPECT_TAG>QUAY MKTAGACTION(QUAY)
+<EXPECT_TAG>REFN MKTAGACTION(REFN)
+<EXPECT_TAG>RELA MKTAGACTION(RELA)
+<EXPECT_TAG>RELI MKTAGACTION(RELI)
+<EXPECT_TAG>REPO MKTAGACTION(REPO)
+<EXPECT_TAG>RESI MKTAGACTION(RESI)
+<EXPECT_TAG>RESN MKTAGACTION(RESN)
+<EXPECT_TAG>RETI MKTAGACTION(RETI)
+<EXPECT_TAG>RFN MKTAGACTION(RFN)
+<EXPECT_TAG>RIN MKTAGACTION(RIN)
+<EXPECT_TAG>ROLE MKTAGACTION(ROLE)
+<EXPECT_TAG>SEX MKTAGACTION(SEX)
+<EXPECT_TAG>SLGC MKTAGACTION(SLGC)
+<EXPECT_TAG>SLGS MKTAGACTION(SLGS)
+<EXPECT_TAG>SOUR MKTAGACTION(SOUR)
+<EXPECT_TAG>SPFX MKTAGACTION(SPFX)
+<EXPECT_TAG>SSN MKTAGACTION(SSN)
+<EXPECT_TAG>STAE MKTAGACTION(STAE)
+<EXPECT_TAG>STAT MKTAGACTION(STAT)
+<EXPECT_TAG>SUBM MKTAGACTION(SUBM)
+<EXPECT_TAG>SUBN MKTAGACTION(SUBN)
+<EXPECT_TAG>SURN MKTAGACTION(SURN)
+<EXPECT_TAG>TEMP MKTAGACTION(TEMP)
+<EXPECT_TAG>TEXT MKTAGACTION(TEXT)
+<EXPECT_TAG>TIME MKTAGACTION(TIME)
+<EXPECT_TAG>TITL MKTAGACTION(TITL)
+<EXPECT_TAG>TRLR MKTAGACTION(TRLR)
+<EXPECT_TAG>TYPE MKTAGACTION(TYPE)
+<EXPECT_TAG>VERS MKTAGACTION(VERS)
+<EXPECT_TAG>WIFE MKTAGACTION(WIFE)
+<EXPECT_TAG>WILL MKTAGACTION(WILL)
+
+<EXPECT_TAG>{alphanum}+  { /* Any alphanumeric tag that did not match one of the
+                              fixed tag rules above is reported as USERTAG,
+                              provided it is not longer than MAXGEDCTAGLEN. */
+                           if (strlen(yytext) > MAXGEDCTAGLEN) {
+                             /* The format consumes a string and an int: pass
+                                both, otherwise the call is undefined. */
+                             gedcom_error("Tag '%s' too long, max %d chars",
+                                          yytext, MAXGEDCTAGLEN);
+                             return BADTOKEN;
+                           }
+                           /* Length was checked above, so this copy includes
+                              the terminating NUL. */
+                           strncpy(string_buf, yytext, MAXGEDCTAGLEN+1);
+                           gedcom_lval.string = TO_INTERNAL(string_buf);
+                           BEGIN(NORMAL);
+                           return USERTAG;
+                         }
+
+{delim} { gedcom_lval.string = TO_INTERNAL(yytext);
+ return DELIM;
+ }
+
+{any_but_delim} { gedcom_lval.string = TO_INTERNAL(yytext);
+ /* Due to character conversions, it is possible
+ that the current character will be combined with
+ the next, and so now we don't have a character yet...
+ This is only applicable to the 1byte case (e.g. ANSEL).
+ */
+ if (strlen(gedcom_lval.string) > 0)
+ return ANYCHAR;
+ }
+
+{escape}/{non_at} { gedcom_lval.string = TO_INTERNAL(yytext);
+ return ESCAPE;
+ }
+
+{pointer} { gedcom_lval.string = TO_INTERNAL(yytext);
+ return POINTER;
+ }
+
+ /* Due to the conversion of level numbers into brackets, the
+ terminator is not important, so no token is returned here.
+ Although not strictly according to the GEDCOM spec, we'll ignore
+ whitespace just before the terminator.
+ */
+
+{gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); }
+
+ /* Eventually we have to return 1 closing bracket (for the trailer).
+ We can detect whether we have sent the closing bracket using the
+ level_diff (at eof, first it is 2, then we increment it ourselves) */
+
+<<EOF>> { if (level_diff == 2) {
+ level_diff++;
+ return CLOSE;
+ }
+ else {
+ yyterminate();
+ }
+ }
+
+.                        { /* Fallback for any byte no other rule accepts.
+                              The byte is cast through unsigned char so that
+                              values of 0x80 and above are not sign-extended
+                              when promoted for the hexadecimal conversion. */
+                           gedcom_error("Unexpected character: '%s' (0x%02x)",
+                                        yytext, (unsigned char) yytext[0]);
+                           return BADTOKEN;
+                         }
+
+%%
+
+int yywrap() /* end-of-input hook called by the flex-generated scanner */
+{
+ return 1; /* non-zero tells the scanner there is no further input file */
+}
+
+#ifdef LEXER_TEST
+/* Test driver for the one-byte scanner (built only with LEXER_TEST).
+   Sets up the ASCII conversion context, then pulls tokens from
+   gedcom_1byte_lex() until it returns 0, printing each one on stdout.
+   Returns 1 if the conversion context cannot be opened, 0 otherwise. */
+int main()
+{
+  int tok;
+
+  init_encodings();
+  set_encoding_width(ONE_BYTE);
+  if (!open_conv_to_internal("ASCII")) {
+    gedcom_error("Unable to open conversion context: %s",
+                 strerror(errno));
+    return 1;
+  }
+
+  for (tok = gedcom_1byte_lex(); tok; tok = gedcom_1byte_lex()) {
+    switch (tok) {
+      /* tokens printed by name only */
+      case BADTOKEN: printf("BADTOKEN "); break;
+      case OPEN:     printf("OPEN "); break;
+      case CLOSE:    printf("CLOSE "); break;
+      case DELIM:    printf("DELIM "); break;
+      /* tokens printed together with their converted text */
+      case ESCAPE:   printf("ESCAPE(%s) ", gedcom_lval.string); break;
+      case ANYCHAR:  printf("%s ", gedcom_lval.string); break;
+      case POINTER:  printf("POINTER(%s) ", gedcom_lval.string); break;
+      case USERTAG:  printf("USERTAG(%s) ", gedcom_lval.string); break;
+      /* everything else is one of the fixed tags */
+      default:       printf("TAG(%s) ", gedcom_lval.string); break;
+    }
+  }
+
+  printf("\n");
+  close_conv_to_internal();
+  return 0;
+}
+#endif
--- /dev/null
+/* $Id$ */
+/* $Name$ */
+
+/* In high-low order, a space is encoded as 0x00 0x20 */
+/* i.e. this is utf-16-be */
+
+%{
+#include "gedcom.tab.h"
+#include "gedcom.h"
+#include "multilex.h"
+#include "encoding.h"
+
+#define YY_NO_UNPUT
+%}
+
+%s NORMAL
+%s EXPECT_TAG
+
+alpha \x00[A-Za-z_]
+digit \x00[0-9]
+delim \x00\x20
+tab \x00[\t]
+hash \x00#
+literal_at \x00@\x00@
+otherchar \x00[\x21-\x22\x24-\x2F\x3A-\x3F\x5B-\x5E\x60\x7B-\x7E\x80-\xFF]|[\x01-\xFF][\x00-\xFF]
+terminator \x00\x0D|\x00\x0A|\x00\x0D\x00\x0A|\x00\x0A\x00\x0D
+
+any_char {alpha}|{digit}|{otherchar}|{delim}|{hash}|{literal_at}
+any_but_delim {alpha}|{digit}|{otherchar}|{hash}|{literal_at}
+non_at {alpha}|{digit}|{otherchar}|{delim}|{hash}
+alphanum {alpha}|{digit}
+gen_delim {delim}|{tab}
+
+escape \x00@\x00#{any_char}+\x00@
+pointer \x00@{alphanum}{non_at}+\x00@
+
+%{
+static int current_level=-1;
+static int level_diff=MAXGEDCLEVEL;
+
+#ifdef LEXER_TEST
+YYSTYPE gedcom_lval;
+int line_no = 1;
+#endif
+%}
+
+%%
+
+ /* The GEDCOM level number is converted into a sequence of opening
+ and closing brackets. Simply put, the following GEDCOM fragment:
+
+ 0 HEAD
+ 1 SOUR genes
+ 2 VERS 1.6
+ 2 NAME Genes
+ 1 DATE 07 OCT 2001
+ ...
+ 0 TRLR
+
+ is converted into:
+
+ { HEAD (initial)
+ { SOUR genes (1 higher: no closing brackets)
+ { VERS 1.6 (1 higher: no closing brackets)
+ } { NAME Genes (same level: 1 closing bracket)
+ } } { DATE 07 OCT 2001 (1 lower: 2 closing brackets)
+ ...
+ } { TRLR }
+
+ or more clearly:
+
+ { HEAD
+ { SOUR genes
+ { VERS 1.6 }
+ { NAME Genes } }
+ { DATE 07 OCT 2001
+ ... }
+ { TRLR }
+
+ But because this means that one token is converted into a series
+ of tokens, there is some initial code following immediately here
+ that returns "pending" tokens. */
+
+%{
+char string_buf[MAXGEDCLINELEN+1];
+
+if (level_diff < 1) {
+ level_diff++;
+ return CLOSE;
+}
+else if (level_diff == 1) {
+ level_diff++;
+ return OPEN;
+}
+else {
+ /* out of brackets... */
+}
+
+#define TO_INTERNAL(str) to_internal(str, yyleng)
+
+#define MKTAGACTION(tag) \
+ { gedcom_lval.string = TO_INTERNAL(yytext); \
+ BEGIN(NORMAL); \
+ return TAG_##tag; }
+
+%}
+
+<INITIAL>{gen_delim}* /* ignore leading whitespace (also tabs) */
+
+<INITIAL>\x00[0]{digit}+ { gedcom_error ("Level number with leading zero");
+ return BADTOKEN;
+ }
+
+<INITIAL>{digit}+ { int level = atoi(TO_INTERNAL(yytext));
+ if ((level < 0) || (level > MAXGEDCLEVEL)) {
+ gedcom_error ("Level number out of range [0..%d]",
+ MAXGEDCLEVEL);
+ return BADTOKEN;
+ }
+ level_diff = level - current_level;
+ BEGIN(EXPECT_TAG);
+ current_level = level;
+ if (level_diff < 1) {
+ level_diff++;
+ return CLOSE;
+ }
+ else if (level_diff == 1) {
+ level_diff++;
+ return OPEN;
+ }
+ else {
+ /* should never happen (error to GEDCOM spec) */
+ gedcom_error ("GEDCOM level number is %d higher than "
+ "previous",
+ level_diff);
+ return BADTOKEN;
+ }
+ }
+
+<EXPECT_TAG>\x00A\x00B\x00B\x00R MKTAGACTION(ABBR)
+<EXPECT_TAG>\x00A\x00D\x00D\x00R MKTAGACTION(ADDR)
+<EXPECT_TAG>\x00A\x00D\x00R\x001 MKTAGACTION(ADR1)
+<EXPECT_TAG>\x00A\x00D\x00R\x002 MKTAGACTION(ADR2)
+<EXPECT_TAG>\x00A\x00D\x00O\x00P MKTAGACTION(ADOP)
+<EXPECT_TAG>\x00A\x00F\x00N MKTAGACTION(AFN)
+<EXPECT_TAG>\x00A\x00G\x00E MKTAGACTION(AGE)
+<EXPECT_TAG>\x00A\x00G\x00N\x00C MKTAGACTION(AGNC)
+<EXPECT_TAG>\x00A\x00L\x00I\x00A MKTAGACTION(ALIA)
+<EXPECT_TAG>\x00A\x00N\x00C\x00E MKTAGACTION(ANCE)
+<EXPECT_TAG>\x00A\x00N\x00C\x00I MKTAGACTION(ANCI)
+<EXPECT_TAG>\x00A\x00N\x00U\x00L MKTAGACTION(ANUL)
+<EXPECT_TAG>\x00A\x00S\x00S\x00O MKTAGACTION(ASSO)
+<EXPECT_TAG>\x00A\x00U\x00T\x00H MKTAGACTION(AUTH)
+<EXPECT_TAG>\x00B\x00A\x00P\x00L MKTAGACTION(BAPL)
+<EXPECT_TAG>\x00B\x00A\x00P\x00M MKTAGACTION(BAPM)
+<EXPECT_TAG>\x00B\x00A\x00R\x00M MKTAGACTION(BARM)
+<EXPECT_TAG>\x00B\x00A\x00S\x00M MKTAGACTION(BASM)
+<EXPECT_TAG>\x00B\x00I\x00R\x00T MKTAGACTION(BIRT)
+<EXPECT_TAG>\x00B\x00L\x00E\x00S MKTAGACTION(BLES)
+<EXPECT_TAG>\x00B\x00L\x00O\x00B MKTAGACTION(BLOB)
+<EXPECT_TAG>\x00B\x00U\x00R\x00I MKTAGACTION(BURI)
+<EXPECT_TAG>\x00C\x00A\x00L\x00N MKTAGACTION(CALN)
+<EXPECT_TAG>\x00C\x00A\x00S\x00T MKTAGACTION(CAST)
+<EXPECT_TAG>\x00C\x00A\x00U\x00S MKTAGACTION(CAUS)
+<EXPECT_TAG>\x00C\x00E\x00N\x00S MKTAGACTION(CENS)
+<EXPECT_TAG>\x00C\x00H\x00A\x00N MKTAGACTION(CHAN)
+<EXPECT_TAG>\x00C\x00H\x00A\x00R MKTAGACTION(CHAR)
+<EXPECT_TAG>\x00C\x00H\x00I\x00L MKTAGACTION(CHIL)
+<EXPECT_TAG>\x00C\x00H\x00R MKTAGACTION(CHR)
+<EXPECT_TAG>\x00C\x00H\x00R\x00A MKTAGACTION(CHRA)
+<EXPECT_TAG>\x00C\x00I\x00T\x00Y MKTAGACTION(CITY)
+<EXPECT_TAG>\x00C\x00O\x00N\x00C MKTAGACTION(CONC)
+<EXPECT_TAG>\x00C\x00O\x00N\x00F MKTAGACTION(CONF)
+<EXPECT_TAG>\x00C\x00O\x00N\x00L MKTAGACTION(CONL)
+<EXPECT_TAG>\x00C\x00O\x00N\x00T MKTAGACTION(CONT)
+<EXPECT_TAG>\x00C\x00O\x00P\x00R MKTAGACTION(COPR)
+<EXPECT_TAG>\x00C\x00O\x00R\x00P MKTAGACTION(CORP)
+<EXPECT_TAG>\x00C\x00R\x00E\x00M MKTAGACTION(CREM)
+<EXPECT_TAG>\x00C\x00T\x00R\x00Y MKTAGACTION(CTRY)
+<EXPECT_TAG>\x00D\x00A\x00T\x00A MKTAGACTION(DATA)
+<EXPECT_TAG>\x00D\x00A\x00T\x00E MKTAGACTION(DATE)
+<EXPECT_TAG>\x00D\x00E\x00A\x00T MKTAGACTION(DEAT)
+<EXPECT_TAG>\x00D\x00E\x00S\x00C MKTAGACTION(DESC)
+<EXPECT_TAG>\x00D\x00E\x00S\x00I MKTAGACTION(DESI)
+<EXPECT_TAG>\x00D\x00E\x00S\x00T MKTAGACTION(DEST)
+<EXPECT_TAG>\x00D\x00I\x00V MKTAGACTION(DIV)
+<EXPECT_TAG>\x00D\x00I\x00V\x00F MKTAGACTION(DIVF)
+<EXPECT_TAG>\x00D\x00S\x00C\x00R MKTAGACTION(DSCR)
+<EXPECT_TAG>\x00E\x00D\x00U\x00C MKTAGACTION(EDUC)
+<EXPECT_TAG>\x00E\x00M\x00I\x00G MKTAGACTION(EMIG)
+<EXPECT_TAG>\x00E\x00N\x00D\x00L MKTAGACTION(ENDL)
+<EXPECT_TAG>\x00E\x00N\x00G\x00A MKTAGACTION(ENGA)
+<EXPECT_TAG>\x00E\x00V\x00E\x00N MKTAGACTION(EVEN)
+<EXPECT_TAG>\x00F\x00A\x00M MKTAGACTION(FAM)
+<EXPECT_TAG>\x00F\x00A\x00M\x00C MKTAGACTION(FAMC)
+<EXPECT_TAG>\x00F\x00A\x00M\x00F MKTAGACTION(FAMF)
+<EXPECT_TAG>\x00F\x00A\x00M\x00S MKTAGACTION(FAMS)
+<EXPECT_TAG>\x00F\x00C\x00O\x00M MKTAGACTION(FCOM)
+<EXPECT_TAG>\x00F\x00I\x00L\x00E MKTAGACTION(FILE)
+<EXPECT_TAG>\x00F\x00O\x00R\x00M MKTAGACTION(FORM)
+<EXPECT_TAG>\x00G\x00E\x00D\x00C MKTAGACTION(GEDC)
+<EXPECT_TAG>\x00G\x00I\x00V\x00N MKTAGACTION(GIVN)
+<EXPECT_TAG>\x00G\x00R\x00A\x00D MKTAGACTION(GRAD)
+<EXPECT_TAG>\x00H\x00E\x00A\x00D MKTAGACTION(HEAD)
+<EXPECT_TAG>\x00H\x00U\x00S\x00B MKTAGACTION(HUSB)
+<EXPECT_TAG>\x00I\x00D\x00N\x00O MKTAGACTION(IDNO)
+<EXPECT_TAG>\x00I\x00M\x00M\x00I MKTAGACTION(IMMI)
+<EXPECT_TAG>\x00I\x00N\x00D\x00I MKTAGACTION(INDI)
+<EXPECT_TAG>\x00L\x00A\x00N\x00G MKTAGACTION(LANG)
+<EXPECT_TAG>\x00L\x00E\x00G\x00A MKTAGACTION(LEGA)
+<EXPECT_TAG>\x00M\x00A\x00R\x00B MKTAGACTION(MARB)
+<EXPECT_TAG>\x00M\x00A\x00R\x00C MKTAGACTION(MARC)
+<EXPECT_TAG>\x00M\x00A\x00R\x00L MKTAGACTION(MARL)
+<EXPECT_TAG>\x00M\x00A\x00R\x00R MKTAGACTION(MARR)
+<EXPECT_TAG>\x00M\x00A\x00R\x00S MKTAGACTION(MARS)
+<EXPECT_TAG>\x00M\x00E\x00D\x00I MKTAGACTION(MEDI)
+<EXPECT_TAG>\x00N\x00A\x00M\x00E MKTAGACTION(NAME)
+<EXPECT_TAG>\x00N\x00A\x00T\x00I MKTAGACTION(NATI)
+<EXPECT_TAG>\x00N\x00A\x00T\x00U MKTAGACTION(NATU)
+<EXPECT_TAG>\x00N\x00C\x00H\x00I MKTAGACTION(NCHI)
+<EXPECT_TAG>\x00N\x00I\x00C\x00K MKTAGACTION(NICK)
+<EXPECT_TAG>\x00N\x00M\x00R MKTAGACTION(NMR)
+<EXPECT_TAG>\x00N\x00O\x00T\x00E MKTAGACTION(NOTE)
+<EXPECT_TAG>\x00N\x00P\x00F\x00X MKTAGACTION(NPFX)
+<EXPECT_TAG>\x00N\x00S\x00F\x00X MKTAGACTION(NSFX)
+<EXPECT_TAG>\x00O\x00B\x00J\x00E MKTAGACTION(OBJE)
+<EXPECT_TAG>\x00O\x00C\x00C\x00U MKTAGACTION(OCCU)
+<EXPECT_TAG>\x00O\x00R\x00D\x00I MKTAGACTION(ORDI)
+<EXPECT_TAG>\x00O\x00R\x00D\x00N MKTAGACTION(ORDN)
+<EXPECT_TAG>\x00P\x00A\x00G\x00E MKTAGACTION(PAGE)
+<EXPECT_TAG>\x00P\x00E\x00D\x00I MKTAGACTION(PEDI)
+<EXPECT_TAG>\x00P\x00H\x00O\x00N MKTAGACTION(PHON)
+<EXPECT_TAG>\x00P\x00L\x00A\x00C MKTAGACTION(PLAC)
+<EXPECT_TAG>\x00P\x00O\x00S\x00T MKTAGACTION(POST)
+<EXPECT_TAG>\x00P\x00R\x00O\x00B MKTAGACTION(PROB)
+<EXPECT_TAG>\x00P\x00R\x00O\x00P MKTAGACTION(PROP)
+<EXPECT_TAG>\x00P\x00U\x00B\x00L MKTAGACTION(PUBL)
+<EXPECT_TAG>\x00Q\x00U\x00A\x00Y MKTAGACTION(QUAY)
+<EXPECT_TAG>\x00R\x00E\x00F\x00N MKTAGACTION(REFN)
+<EXPECT_TAG>\x00R\x00E\x00L\x00A MKTAGACTION(RELA)
+<EXPECT_TAG>\x00R\x00E\x00L\x00I MKTAGACTION(RELI)
+<EXPECT_TAG>\x00R\x00E\x00P\x00O MKTAGACTION(REPO)
+<EXPECT_TAG>\x00R\x00E\x00S\x00I MKTAGACTION(RESI)
+<EXPECT_TAG>\x00R\x00E\x00S\x00N MKTAGACTION(RESN)
+<EXPECT_TAG>\x00R\x00E\x00T\x00I MKTAGACTION(RETI)
+<EXPECT_TAG>\x00R\x00F\x00N MKTAGACTION(RFN)
+<EXPECT_TAG>\x00R\x00I\x00N MKTAGACTION(RIN)
+<EXPECT_TAG>\x00R\x00O\x00L\x00E MKTAGACTION(ROLE)
+<EXPECT_TAG>\x00S\x00E\x00X MKTAGACTION(SEX)
+<EXPECT_TAG>\x00S\x00L\x00G\x00C MKTAGACTION(SLGC)
+<EXPECT_TAG>\x00S\x00L\x00G\x00S MKTAGACTION(SLGS)
+<EXPECT_TAG>\x00S\x00O\x00U\x00R MKTAGACTION(SOUR)
+<EXPECT_TAG>\x00S\x00P\x00F\x00X MKTAGACTION(SPFX)
+<EXPECT_TAG>\x00S\x00S\x00N MKTAGACTION(SSN)
+<EXPECT_TAG>\x00S\x00T\x00A\x00E MKTAGACTION(STAE)
+<EXPECT_TAG>\x00S\x00T\x00A\x00T MKTAGACTION(STAT)
+<EXPECT_TAG>\x00S\x00U\x00B\x00M MKTAGACTION(SUBM)
+<EXPECT_TAG>\x00S\x00U\x00B\x00N MKTAGACTION(SUBN)
+<EXPECT_TAG>\x00S\x00U\x00R\x00N MKTAGACTION(SURN)
+<EXPECT_TAG>\x00T\x00E\x00M\x00P MKTAGACTION(TEMP)
+<EXPECT_TAG>\x00T\x00E\x00X\x00T MKTAGACTION(TEXT)
+<EXPECT_TAG>\x00T\x00I\x00M\x00E MKTAGACTION(TIME)
+<EXPECT_TAG>\x00T\x00I\x00T\x00L MKTAGACTION(TITL)
+<EXPECT_TAG>\x00T\x00R\x00L\x00R MKTAGACTION(TRLR)
+<EXPECT_TAG>\x00T\x00Y\x00P\x00E MKTAGACTION(TYPE)
+<EXPECT_TAG>\x00V\x00E\x00R\x00S MKTAGACTION(VERS)
+<EXPECT_TAG>\x00W\x00I\x00F\x00E MKTAGACTION(WIFE)
+<EXPECT_TAG>\x00W\x00I\x00L\x00L MKTAGACTION(WILL)
+
+<EXPECT_TAG>{alphanum}+  { /* Any alphanumeric tag that did not match one of the
+                              fixed tag rules above is reported as USERTAG.
+                              In this scanner every character is two bytes and
+                              each alphanumeric starts with a 0x00 byte, so
+                              strlen/strncpy would see an empty string; the
+                              length check and the copy therefore work from
+                              yyleng, which counts bytes. */
+                           if (yyleng > 2 * MAXGEDCTAGLEN) {
+                             /* The format consumes a string and an int; the
+                                tag is converted first so that it is printable
+                                as a C string. */
+                             gedcom_error("Tag '%s' too long, max %d chars",
+                                          TO_INTERNAL(yytext), MAXGEDCTAGLEN);
+                             return BADTOKEN;
+                           }
+                           /* NOTE(review): assumes 2*MAXGEDCTAGLEN bytes fit in
+                              string_buf (MAXGEDCLINELEN+1 bytes) - confirm
+                              against the definitions in gedcom.h. */
+                           memcpy(string_buf, yytext, yyleng);
+                           gedcom_lval.string = TO_INTERNAL(string_buf);
+                           BEGIN(NORMAL);
+                           return USERTAG;
+                         }
+
+{delim} { gedcom_lval.string = TO_INTERNAL(yytext);
+ return DELIM;
+ }
+
+{any_but_delim} { gedcom_lval.string = TO_INTERNAL(yytext);
+ return ANYCHAR;
+ }
+
+{escape}/{non_at} { gedcom_lval.string = TO_INTERNAL(yytext);
+ return ESCAPE;
+ }
+
+{pointer} { gedcom_lval.string = TO_INTERNAL(yytext);
+ return POINTER;
+ }
+
+ /* Due to the conversion of level numbers into brackets, the
+ terminator is not important, so no token is returned here.
+ Although not strictly according to the GEDCOM spec, we'll ignore
+ whitespace just before the terminator.
+ */
+
+{gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); }
+
+ /* Eventually we have to return 1 closing bracket (for the trailer).
+ We can detect whether we have sent the closing bracket using the
+ level_diff (at eof, first it is 2, then we increment it ourselves) */
+
+<<EOF>> { if (level_diff == 2) {
+ level_diff++;
+ return CLOSE;
+ }
+ else {
+ yyterminate();
+ }
+ }
+
+.                        { /* Fallback for any byte no other rule accepts.
+                              The byte is cast through unsigned char so that
+                              values of 0x80 and above are not sign-extended
+                              when promoted for the hexadecimal conversion. */
+                           gedcom_error("Unexpected character: '%s' (0x%02x)",
+                                        yytext, (unsigned char) yytext[0]);
+                           return BADTOKEN;
+                         }
+
+%%
+
+int yywrap() /* end-of-input hook called by the flex-generated scanner */
+{
+ return 1; /* non-zero tells the scanner there is no further input file */
+}
+
+#ifdef LEXER_TEST
+
+/* Test driver for the high-low two-byte scanner (built only with
+   LEXER_TEST).  Sets up the UNICODE conversion context, then pulls tokens
+   from gedcom_hilo_lex() until it returns 0, printing each one on stdout.
+   Returns 1 if the conversion context cannot be opened, 0 otherwise. */
+int main()
+{
+  int tok;
+
+  init_encodings();
+  set_encoding_width(TWO_BYTE_HILO);
+  if (!open_conv_to_internal("UNICODE")) {
+    gedcom_error("Unable to open conversion context: %s",
+                 strerror(errno));
+    return 1;
+  }
+
+  for (tok = gedcom_hilo_lex(); tok; tok = gedcom_hilo_lex()) {
+    switch (tok) {
+      /* tokens printed by name only */
+      case BADTOKEN: printf("BADTOKEN "); break;
+      case OPEN:     printf("OPEN "); break;
+      case CLOSE:    printf("CLOSE "); break;
+      case DELIM:    printf("DELIM "); break;
+      /* tokens printed together with their converted text */
+      case ESCAPE:   printf("ESCAPE(%s) ", gedcom_lval.string); break;
+      case ANYCHAR:  printf("%s ", gedcom_lval.string); break;
+      case POINTER:  printf("POINTER(%s) ", gedcom_lval.string); break;
+      case USERTAG:  printf("USERTAG(%s) ", gedcom_lval.string); break;
+      /* everything else is one of the fixed tags */
+      default:       printf("TAG(%s) ", gedcom_lval.string); break;
+    }
+  }
+
+  printf("\n");
+  close_conv_to_internal();
+  return 0;
+}
+#endif
--- /dev/null
+/* $Id$ */
+/* $Name$ */
+
+#include "gedcom.h"
+
+/* Write a printf-style message to stderr, followed by a newline.
+   Returns the value vfprintf produced for the message text itself
+   (the trailing newline is not included in that value). */
+int gedcom_message(char* s, ...)
+{
+  va_list args;
+  int written;
+
+  va_start(args, s);
+  written = vfprintf(stderr, s, args);
+  va_end(args);
+
+  fputs("\n", stderr);
+  return written;
+}
+
+/* Write a printf-style warning to stderr, prefixed with the current
+   line_no and followed by a newline.
+   Returns the value vfprintf produced for the message text itself
+   (prefix and newline are not included in that value). */
+int gedcom_warning(char* s, ...)
+{
+  va_list args;
+  int written;
+
+  fprintf(stderr, "Warning on line %d: ", line_no);
+
+  va_start(args, s);
+  written = vfprintf(stderr, s, args);
+  va_end(args);
+
+  fputs("\n", stderr);
+  return written;
+}
+
+int gedcom_error(char* s, ...)
+{
+ int res;
+ va_list ap;
+
+ va_start(ap, s);
+ fprintf(stderr, "Error on line %d: ", line_no);
+ res = vfprintf(stderr, s, ap);
+ fprintf(stderr, "\n");
+ va_end(ap);
+
+ return res;
+}
--- /dev/null
+/* $Id$ */
+/* $Name$ */
+
+#include "gedcom.h"
+#include "multilex.h"
+#include "encoding.h"
+
+int line_no = 1;
+
+typedef int (*lex_func)(void);
+lex_func lf;
+
+/* Select the scanner matching the detected encoding width.
+   Points that scanner's input stream at f, remembers its lex function
+   in lf, records the width and opens the conversion context for the
+   default charset of that width ("ASCII" or "UNICODE").
+   Returns the result of open_conv_to_internal(), or 0 when enc is not
+   one of the three known widths. */
+int lexer_init(ENCODING enc, FILE* f)
+{
+  char* charset;
+
+  if (enc == ONE_BYTE) {
+    gedcom_1byte_in = f;
+    lf = gedcom_1byte_lex;
+    charset = "ASCII";
+  }
+  else if (enc == TWO_BYTE_HILO) {
+    gedcom_hilo_in = f;
+    lf = gedcom_hilo_lex;
+    charset = "UNICODE";
+  }
+  else if (enc == TWO_BYTE_LOHI) {
+    gedcom_lohi_in = f;
+    lf = gedcom_lohi_lex;
+    charset = "UNICODE";
+  }
+  else {
+    /* unknown width: nothing is set up */
+    return 0;
+  }
+
+  set_encoding_width(enc);
+  return open_conv_to_internal(charset);
+}
+
+void lexer_close() /* counterpart of lexer_init(); called after parsing in gedcom_parse_file() */
+{
+ close_conv_to_internal(); /* presumably releases the context opened by open_conv_to_internal() - confirm in encoding.c */
+}
+
+/* Return the next token by forwarding to whichever scanner function
+   lexer_init() stored in lf. */
+int gedcom_lex()
+{
+  int token = lf();
+  return token;
+}
+
+/* Guess the character width and byte order of a GEDCOM file from its
+   first two bytes.
+
+   Recognised openings:
+     '0' ' '      -> ONE_BYTE
+     0x00 '0'     -> TWO_BYTE_HILO
+     0xFE 0xFF    -> TWO_BYTE_HILO (byte order mark, left consumed)
+     '0' 0x00     -> TWO_BYTE_LOHI
+     0xFF 0xFE    -> TWO_BYTE_LOHI (byte order mark, left consumed)
+   Anything else, including a file shorter than two bytes, falls back
+   to ONE_BYTE.
+
+   Except in the two byte-order-mark cases the stream is repositioned
+   to its start, so the scanner sees the complete first line.  The
+   outcome is reported through gedcom_message(). */
+int determine_encoding(FILE* f)
+{
+  char first[2];
+
+  /* A short read would leave first[] (partly) uninitialised, so it is
+     handled explicitly instead of comparing indeterminate bytes. */
+  if (fread(first, 1, sizeof(first), f) != sizeof(first)) {
+    gedcom_message("Unknown encoding, falling back to one-byte");
+    fseek(f, 0, SEEK_SET);
+    return ONE_BYTE;
+  }
+
+  if ((first[0] == '0') && (first[1] == ' ')) {
+    gedcom_message("One-byte encoding");
+    fseek(f, 0, SEEK_SET);
+    return ONE_BYTE;
+  }
+  else if ((first[0] == '\0') && (first[1] == '0')) {
+    gedcom_message("Two-byte encoding, high-low");
+    fseek(f, 0, SEEK_SET);
+    return TWO_BYTE_HILO;
+  }
+  else if ((first[0] == '\xFE') && (first[1] == '\xFF')) {
+    /* no rewind: the byte order mark must not reach the scanner */
+    gedcom_message("Two-byte encoding, high-low, with BOM");
+    return TWO_BYTE_HILO;
+  }
+  else if ((first[0] == '0') && (first[1] == '\0')) {
+    gedcom_message("Two-byte encoding, low-high");
+    fseek(f, 0, SEEK_SET);
+    return TWO_BYTE_LOHI;
+  }
+  else if ((first[0] == '\xFF') && (first[1] == '\xFE')) {
+    /* no rewind: the byte order mark must not reach the scanner */
+    gedcom_message("Two-byte encoding, low-high, with BOM");
+    return TWO_BYTE_LOHI;
+  }
+  else {
+    gedcom_message("Unknown encoding, falling back to one-byte");
+    fseek(f, 0, SEEK_SET);
+    return ONE_BYTE;
+  }
+}
+
+/* Parse the GEDCOM file with the given name.
+
+   Opens the file, initialises the encoding tables, detects the
+   character width from the first bytes, sets up the matching scanner
+   and runs gedcom_parse().
+
+   Returns 1 if the file cannot be opened or the scanner cannot be
+   initialised; otherwise returns whatever gedcom_parse() returned. */
+int gedcom_parse_file(char* file_name)
+{
+  ENCODING enc;
+  int result = 1;
+  FILE* file = fopen (file_name, "r");
+  if (!file) {
+    /* gedcom_error() appends the newline itself */
+    gedcom_error("Could not open file '%s'", file_name);
+    return 1;
+  }
+
+  init_encodings();
+  enc = determine_encoding(file);
+
+  if (lexer_init(enc, file)) {
+    result = gedcom_parse();
+  }
+  lexer_close();
+
+  /* The stream was opened here and is not needed once parsing is over;
+     close it so the descriptor is not leaked on every call. */
+  fclose(file);
+
+  return result;
+}
+
--- /dev/null
+/* $Id$ */
+/* $Name$ */
+
+#ifndef __MULTILEX_H
+#define __MULTILEX_H
+#include <stdio.h> /* FILE, used by the scanner input streams below */
+
+int gedcom_parse_file(char* file_name); /* parse the named file; returns 1 if it cannot be opened */
+
+int gedcom_1byte_lex(); /* scanner used for ONE_BYTE input */
+extern FILE *gedcom_1byte_in; /* its input stream, set by lexer_init() */
+
+int gedcom_hilo_lex(); /* scanner used for TWO_BYTE_HILO input */
+extern FILE *gedcom_hilo_in; /* its input stream, set by lexer_init() */
+
+int gedcom_lohi_lex(); /* scanner used for TWO_BYTE_LOHI input */
+extern FILE *gedcom_lohi_in; /* its input stream, set by lexer_init() */
+#endif /* __MULTILEX_H */