Full unicode support.
author Peter Verthez <Peter.Verthez@advalvas.be>
Fri, 16 Nov 2001 10:27:29 +0000 (10:27 +0000)
committer Peter Verthez <Peter.Verthez@advalvas.be>
Fri, 16 Nov 2001 10:27:29 +0000 (10:27 +0000)
Makefile
encoding.c [new file with mode: 0644]
encoding.h [new file with mode: 0644]
gedcom.h
gedcom.lex [deleted file]
gedcom.y
gedcom_lohi.lex [new file with mode: 0644]
standalone.c

index a0ca68a43b05f52a477724d01a50b41ddb55e4a0..5b3e9b149afe395319d9623d52c29314ccc30164 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,16 +1,50 @@
 # $Id$
 # $Name$
 
-CFLAGS=-Wall -pedantic
+YACC=bison
+LEX=flex
 
-gedcom_parse:  standalone.o lex.gedcom_.o gedcom.tab.o
-       cc standalone.o lex.gedcom_.o gedcom.tab.o -o gedcom_parse
+CFLAGS=-g -Wall -pedantic
+YFLAGS=--debug --defines
+LFLAGS=-8
 
-lex.gedcom_.c: gedcom.lex gedcom.tab.h gedcom.h
-       flex -8 -Pgedcom_ gedcom.lex
+gedcom_parse:  standalone.o lex.gedcom_1byte_.o lex.gedcom_hilo_.o \
+                lex.gedcom_lohi_.o gedcom.tab.o message.o multilex.o \
+               encoding.o
+       $(CC) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@
+
+lex.gedcom_1byte_.c:   gedcom_1byte.lex gedcom.tab.h gedcom.h multilex.h
+       $(LEX) $(LFLAGS) -Pgedcom_1byte_ gedcom_1byte.lex
+
+lex.gedcom_hilo_.c:    gedcom_hilo.lex gedcom.tab.h gedcom.h multilex.h
+       $(LEX) $(LFLAGS) -Pgedcom_hilo_ gedcom_hilo.lex
+
+lex.gedcom_lohi_.c:    gedcom_lohi.lex gedcom.tab.h gedcom.h multilex.h
+       $(LEX) $(LFLAGS) -Pgedcom_lohi_ gedcom_lohi.lex
 
 gedcom.tab.c gedcom.tab.h:     gedcom.y gedcom.h
-       bison --debug --defines --name-prefix=gedcom_ gedcom.y
+       $(YACC) $(YFLAGS) --name-prefix=gedcom_ gedcom.y
 
 clean:
-       rm -f core gedcom_parse *.o lex.gedcom_.c gedcom.tab.* gedcom.output
+       rm -f core gedcom_parse test_* *.o lex.gedcom_* \
+        gedcom.tab.* gedcom.output
+
+# Test programs
+
+test_1byte:    lex.gedcom_1byte_.test.o message.o encoding.o
+       $(CC) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@
+
+lex.gedcom_1byte_.test.o:      lex.gedcom_1byte_.c
+       $(CC) -DLEXER_TEST -c $(CPPFLAGS) $(CFLAGS) $^ -o $@
+
+test_hilo:     lex.gedcom_hilo_.test.o message.o encoding.o
+       $(CC) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@
+
+lex.gedcom_hilo_.test.o:       lex.gedcom_hilo_.c
+       $(CC) -DLEXER_TEST -c $(CPPFLAGS) $(CFLAGS) $^ -o $@
+
+test_lohi:     lex.gedcom_lohi_.test.o message.o encoding.o
+       $(CC) $(LDFLAGS) $^ $(LOADLIBES) $(LDLIBS) -o $@
+
+lex.gedcom_lohi_.test.o:       lex.gedcom_lohi_.c
+       $(CC) -DLEXER_TEST -c $(CPPFLAGS) $(CFLAGS) $^ -o $@
diff --git a/encoding.c b/encoding.c
new file mode 100644 (file)
index 0000000..08f69db
--- /dev/null
@@ -0,0 +1,34 @@
+#include <string.h>
+#include <iconv.h>
+#include "gedcom.h"
+#include "encoding.h"
+
+#define INTERNAL_ENCODING "UTF8"
+
+static iconv_t cd_to_internal = (iconv_t) -1;
+static char int_buf[MAXGEDCLINELEN*2];
+
+/* Open (or re-open) the conversion descriptor that translates text from
+   'fromcode' into INTERNAL_ENCODING.  A descriptor that is still open from
+   an earlier call is closed first.
+   Returns 1 when iconv_open() succeeded, 0 otherwise. */
+int open_conv_to_internal(char* fromcode)
+{
+  const iconv_t not_open = (iconv_t) -1;
+
+  if (cd_to_internal != not_open) {
+    iconv_close(cd_to_internal);
+  }
+  cd_to_internal = iconv_open(INTERNAL_ENCODING, fromcode);
+
+  return (cd_to_internal == not_open) ? 0 : 1;
+}
+
+/* Release the conversion descriptor, if one is open.
+   The descriptor is put back to its "not open" value afterwards, so that
+   calling this function twice, or calling open_conv_to_internal() after it,
+   never hands an already-closed descriptor to iconv_close(). */
+void close_conv_to_internal()
+{
+  if (cd_to_internal != (iconv_t) -1) {
+    iconv_close(cd_to_internal);
+    cd_to_internal = (iconv_t) -1;
+  }
+}
+
+/* Convert the first 'len' bytes of 'str' into INTERNAL_ENCODING, using the
+   descriptor set up by open_conv_to_internal().
+   Returns a pointer to a static buffer that is overwritten by every call.
+   The result is always NUL-terminated.  If no conversion is open the result
+   is the empty string; if iconv() stops early (invalid or incomplete input,
+   or output buffer full) the result holds what was converted up to that
+   point. */
+char* to_internal(char* str, size_t len)
+{
+  size_t insize = len;
+  /* Keep the last byte out of iconv's reach: it stays 0 after the memset
+     below and so terminates the string even when the output fills up. */
+  size_t outsize = sizeof(int_buf) - 1;
+  char *wrptr = int_buf;
+  char *rdptr = str;
+  memset(int_buf, 0, sizeof(int_buf));
+  if (cd_to_internal == (iconv_t) -1)
+    return int_buf;
+  iconv(cd_to_internal, &rdptr, &insize, &wrptr, &outsize);
+  return int_buf;
+}
+
diff --git a/encoding.h b/encoding.h
new file mode 100644 (file)
index 0000000..c4b70ee
--- /dev/null
@@ -0,0 +1,3 @@
+int open_conv_to_internal(char* fromcode);
+void close_conv_to_internal();
+char* to_internal(char* str, size_t len);
index 6f35d4eca12285f91fddcc9e112b7653b812b276..5fdf0181e6b4f7d26e3e74c31eb1beaf5da6f75a 100644 (file)
--- a/gedcom.h
+++ b/gedcom.h
@@ -4,6 +4,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
+#include <errno.h>
 
 #define MAXGEDCLEVEL 99
 #define MAXGEDCLINELEN 256
@@ -18,20 +19,17 @@ typedef enum _MECH {
   IGNORE_ERRORS
 } MECHANISM;
 
-/* Basic file encoding */
-typedef enum _ENC {
-  ONE_BYTE,
-  TWO_BYTE_HILO,
-  TWO_BYTE_LOHI
-} ENCODING;
 
 int        gedcom_error(char* s, ...);
 int        gedcom_warning(char* s, ...);
+int        gedcom_message(char* s, ...);
 int        gedcom_debug_print(char* s, ...);
 void       gedcom_set_debug_level(int level);
 void       gedcom_set_error_handling(MECHANISM mechanism);
 void       gedcom_set_compat_handling(int enable_compat);
+
 int        gedcom_parse();
+
 int        gedcom_lex();
+
 extern int line_no;
-extern FILE *gedcom_in;
diff --git a/gedcom.lex b/gedcom.lex
deleted file mode 100644 (file)
index 62fd3f6..0000000
+++ /dev/null
@@ -1,314 +0,0 @@
-/* $Id$ */
-/* $Name$ */
-
-%{
-#include "gedcom.tab.h"
-#include "gedcom.h"
-%}
-
-%s NORMAL
-%s EXPECT_TAG
-
-alpha        [A-Za-z_]
-digit        [0-9]
-delim        " "
-tab          [\t]
-hash         #
-literal_at   @@
-otherchar    [\x21-\x22\x24-\x2F\x3A-\x3F\x5B-\x5E\x60\x7B-\x7E\x80-\xFE]
-terminator   \x0D|\x0A|\x0D\x0A|\x0A\x0D
-
-any_char     {alpha}|{digit}|{otherchar}|{delim}|{hash}|{literal_at}
-any_but_delim {alpha}|{digit}|{otherchar}|{hash}|{literal_at}
-non_at       {alpha}|{digit}|{otherchar}|{delim}|{hash}
-alphanum     {alpha}|{digit}
-gen_delim    {delim}|{tab}
-
-escape       @#{any_char}+@
-pointer      @{alphanum}{non_at}+@
-
-%{
-int current_level=-1;
-int level_diff=MAXGEDCLEVEL;
-int line_no=1;
-%} 
-
-%%
-
-    /* The GEDCOM level number is converted into a sequence of opening
-       and closing brackets.  Simply put, the following GEDCOM fragment:
-
-         0 HEAD
-        1 SOUR genes
-        2 VERS 1.6
-        2 NAME Genes
-        1 DATE 07 OCT 2001
-        ...
-        0 TRLR
-
-       is converted into:
-
-         { HEAD                     (initial)  
-        { SOUR genes               (1 higher: no closing brackets)
-        { VERS 1.6                 (1 higher: no closing brackets)
-        } { NAME Genes             (same level: 1 closing bracket)
-        } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
-        ...
-        } { TRLR }
-
-       or more clearly:
-
-         { HEAD
-          { SOUR genes
-            { VERS 1.6 }
-            { NAME Genes } }
-          { DATE 07 OCT 2001
-        ... }
-        { TRLR }
-
-       But because this means that one token is converted into a series
-       of tokens, there is some initial code following immediately here
-       that returns "pending" tokens. */
-
-%{
-char string_buf[MAXGEDCLINELEN+1];
-if (level_diff < 1) {
-  level_diff++;
-  return CLOSE;
-}
-else if (level_diff == 1) {
-  level_diff++;
-  return OPEN;
-}
-else {
-  /* out of brackets... */
-}
-
-#define MKTAGACTION(tag) \
-  { gedcom_lval.string = gedcom_text; \
-    BEGIN(NORMAL); \
-    return TAG_##tag; }
-
-%}
-
-<INITIAL>{gen_delim}* /* ignore leading whitespace (also tabs) */
-
-<INITIAL>0{digit}+ { gedcom_error ("Level number with leading zero");
-                     return BADTOKEN;
-                   }
-
-<INITIAL>{digit}+ { int level = atoi(gedcom_text);
-                    if ((level < 0) || (level > MAXGEDCLEVEL)) {
-                     gedcom_error ("Level number out of range [0..%d]",
-                                   MAXGEDCLEVEL);
-                     return BADTOKEN;
-                   }
-                    level_diff = level - current_level;
-                   BEGIN(EXPECT_TAG);
-                   current_level = level;
-                   if (level_diff < 1) {
-                     level_diff++;
-                     return CLOSE;
-                   }
-                   else if (level_diff == 1) {
-                     level_diff++;
-                     return OPEN;
-                   }
-                   else {
-                     /* should never happen (error to GEDCOM spec) */
-                     gedcom_error ("GEDCOM level number is %d higher than "
-                                   "previous",
-                                   level_diff);
-                     return BADTOKEN;
-                   }
-                  }
-
-<EXPECT_TAG>ABBR  MKTAGACTION(ABBR)
-<EXPECT_TAG>ADDR  MKTAGACTION(ADDR)
-<EXPECT_TAG>ADR1  MKTAGACTION(ADR1)
-<EXPECT_TAG>ADR2  MKTAGACTION(ADR2)
-<EXPECT_TAG>ADOP  MKTAGACTION(ADOP)
-<EXPECT_TAG>AFN   MKTAGACTION(AFN)
-<EXPECT_TAG>AGE   MKTAGACTION(AGE)
-<EXPECT_TAG>AGNC  MKTAGACTION(AGNC)
-<EXPECT_TAG>ALIA  MKTAGACTION(ALIA)
-<EXPECT_TAG>ANCE  MKTAGACTION(ANCE)
-<EXPECT_TAG>ANCI  MKTAGACTION(ANCI)
-<EXPECT_TAG>ANUL  MKTAGACTION(ANUL)
-<EXPECT_TAG>ASSO  MKTAGACTION(ASSO)
-<EXPECT_TAG>AUTH  MKTAGACTION(AUTH)
-<EXPECT_TAG>BAPL  MKTAGACTION(BAPL)
-<EXPECT_TAG>BAPM  MKTAGACTION(BAPM)
-<EXPECT_TAG>BARM  MKTAGACTION(BARM)
-<EXPECT_TAG>BASM  MKTAGACTION(BASM)
-<EXPECT_TAG>BIRT  MKTAGACTION(BIRT)
-<EXPECT_TAG>BLES  MKTAGACTION(BLES)
-<EXPECT_TAG>BLOB  MKTAGACTION(BLOB)
-<EXPECT_TAG>BURI  MKTAGACTION(BURI)
-<EXPECT_TAG>CALN  MKTAGACTION(CALN)
-<EXPECT_TAG>CAST  MKTAGACTION(CAST)
-<EXPECT_TAG>CAUS  MKTAGACTION(CAUS)
-<EXPECT_TAG>CENS  MKTAGACTION(CENS)
-<EXPECT_TAG>CHAN  MKTAGACTION(CHAN)
-<EXPECT_TAG>CHAR  MKTAGACTION(CHAR)
-<EXPECT_TAG>CHIL  MKTAGACTION(CHIL)
-<EXPECT_TAG>CHR   MKTAGACTION(CHR)
-<EXPECT_TAG>CHRA  MKTAGACTION(CHRA)
-<EXPECT_TAG>CITY  MKTAGACTION(CITY)
-<EXPECT_TAG>CONC  MKTAGACTION(CONC)
-<EXPECT_TAG>CONF  MKTAGACTION(CONF)
-<EXPECT_TAG>CONL  MKTAGACTION(CONL)
-<EXPECT_TAG>CONT  MKTAGACTION(CONT)
-<EXPECT_TAG>COPR  MKTAGACTION(COPR)
-<EXPECT_TAG>CORP  MKTAGACTION(CORP)
-<EXPECT_TAG>CREM  MKTAGACTION(CREM)
-<EXPECT_TAG>CTRY  MKTAGACTION(CTRY)
-<EXPECT_TAG>DATA  MKTAGACTION(DATA)
-<EXPECT_TAG>DATE  MKTAGACTION(DATE)
-<EXPECT_TAG>DEAT  MKTAGACTION(DEAT)
-<EXPECT_TAG>DESC  MKTAGACTION(DESC)
-<EXPECT_TAG>DESI  MKTAGACTION(DESI)
-<EXPECT_TAG>DEST  MKTAGACTION(DEST)
-<EXPECT_TAG>DIV   MKTAGACTION(DIV)
-<EXPECT_TAG>DIVF  MKTAGACTION(DIVF)
-<EXPECT_TAG>DSCR  MKTAGACTION(DSCR)
-<EXPECT_TAG>EDUC  MKTAGACTION(EDUC)
-<EXPECT_TAG>EMIG  MKTAGACTION(EMIG)
-<EXPECT_TAG>ENDL  MKTAGACTION(ENDL)
-<EXPECT_TAG>ENGA  MKTAGACTION(ENGA)
-<EXPECT_TAG>EVEN  MKTAGACTION(EVEN)
-<EXPECT_TAG>FAM   MKTAGACTION(FAM)
-<EXPECT_TAG>FAMC  MKTAGACTION(FAMC)
-<EXPECT_TAG>FAMF  MKTAGACTION(FAMF)
-<EXPECT_TAG>FAMS  MKTAGACTION(FAMS)
-<EXPECT_TAG>FCOM  MKTAGACTION(FCOM)
-<EXPECT_TAG>FILE  MKTAGACTION(FILE)
-<EXPECT_TAG>FORM  MKTAGACTION(FORM)
-<EXPECT_TAG>GEDC  MKTAGACTION(GEDC)
-<EXPECT_TAG>GIVN  MKTAGACTION(GIVN)
-<EXPECT_TAG>GRAD  MKTAGACTION(GRAD)
-<EXPECT_TAG>HEAD  MKTAGACTION(HEAD)
-<EXPECT_TAG>HUSB  MKTAGACTION(HUSB)
-<EXPECT_TAG>IDNO  MKTAGACTION(IDNO)
-<EXPECT_TAG>IMMI  MKTAGACTION(IMMI)
-<EXPECT_TAG>INDI  MKTAGACTION(INDI)
-<EXPECT_TAG>LANG  MKTAGACTION(LANG)
-<EXPECT_TAG>LEGA  MKTAGACTION(LEGA)
-<EXPECT_TAG>MARB  MKTAGACTION(MARB)
-<EXPECT_TAG>MARC  MKTAGACTION(MARC)
-<EXPECT_TAG>MARL  MKTAGACTION(MARL)
-<EXPECT_TAG>MARR  MKTAGACTION(MARR)
-<EXPECT_TAG>MARS  MKTAGACTION(MARS)
-<EXPECT_TAG>MEDI  MKTAGACTION(MEDI)
-<EXPECT_TAG>NAME  MKTAGACTION(NAME)
-<EXPECT_TAG>NATI  MKTAGACTION(NATI)
-<EXPECT_TAG>NATU  MKTAGACTION(NATU)
-<EXPECT_TAG>NCHI  MKTAGACTION(NCHI)
-<EXPECT_TAG>NICK  MKTAGACTION(NICK)
-<EXPECT_TAG>NMR   MKTAGACTION(NMR)
-<EXPECT_TAG>NOTE  MKTAGACTION(NOTE)
-<EXPECT_TAG>NPFX  MKTAGACTION(NPFX)
-<EXPECT_TAG>NSFX  MKTAGACTION(NSFX)
-<EXPECT_TAG>OBJE  MKTAGACTION(OBJE)
-<EXPECT_TAG>OCCU  MKTAGACTION(OCCU)
-<EXPECT_TAG>ORDI  MKTAGACTION(ORDI)
-<EXPECT_TAG>ORDN  MKTAGACTION(ORDN)
-<EXPECT_TAG>PAGE  MKTAGACTION(PAGE)
-<EXPECT_TAG>PEDI  MKTAGACTION(PEDI)
-<EXPECT_TAG>PHON  MKTAGACTION(PHON)
-<EXPECT_TAG>PLAC  MKTAGACTION(PLAC)
-<EXPECT_TAG>POST  MKTAGACTION(POST)
-<EXPECT_TAG>PROB  MKTAGACTION(PROB)
-<EXPECT_TAG>PROP  MKTAGACTION(PROP)
-<EXPECT_TAG>PUBL  MKTAGACTION(PUBL)
-<EXPECT_TAG>QUAY  MKTAGACTION(QUAY)
-<EXPECT_TAG>REFN  MKTAGACTION(REFN)
-<EXPECT_TAG>RELA  MKTAGACTION(RELA)
-<EXPECT_TAG>RELI  MKTAGACTION(RELI)
-<EXPECT_TAG>REPO  MKTAGACTION(REPO)
-<EXPECT_TAG>RESI  MKTAGACTION(RESI)
-<EXPECT_TAG>RESN  MKTAGACTION(RESN)
-<EXPECT_TAG>RETI  MKTAGACTION(RETI)
-<EXPECT_TAG>RFN   MKTAGACTION(RFN)
-<EXPECT_TAG>RIN   MKTAGACTION(RIN)
-<EXPECT_TAG>ROLE  MKTAGACTION(ROLE)
-<EXPECT_TAG>SEX   MKTAGACTION(SEX)
-<EXPECT_TAG>SLGC  MKTAGACTION(SLGC)
-<EXPECT_TAG>SLGS  MKTAGACTION(SLGS)
-<EXPECT_TAG>SOUR  MKTAGACTION(SOUR)
-<EXPECT_TAG>SPFX  MKTAGACTION(SPFX)
-<EXPECT_TAG>SSN   MKTAGACTION(SSN)
-<EXPECT_TAG>STAE  MKTAGACTION(STAE)
-<EXPECT_TAG>STAT  MKTAGACTION(STAT)
-<EXPECT_TAG>SUBM  MKTAGACTION(SUBM)
-<EXPECT_TAG>SUBN  MKTAGACTION(SUBN)
-<EXPECT_TAG>SURN  MKTAGACTION(SURN)
-<EXPECT_TAG>TEMP  MKTAGACTION(TEMP)
-<EXPECT_TAG>TEXT  MKTAGACTION(TEXT)
-<EXPECT_TAG>TIME  MKTAGACTION(TIME)
-<EXPECT_TAG>TITL  MKTAGACTION(TITL)
-<EXPECT_TAG>TRLR  MKTAGACTION(TRLR)
-<EXPECT_TAG>TYPE  MKTAGACTION(TYPE)
-<EXPECT_TAG>VERS  MKTAGACTION(VERS)
-<EXPECT_TAG>WIFE  MKTAGACTION(WIFE)
-<EXPECT_TAG>WILL  MKTAGACTION(WILL)
-     
-<EXPECT_TAG>{alphanum}+ { if (strlen(gedcom_text) > MAXGEDCTAGLEN) {
-                            gedcom_error("Tag '%s' too long, max %d chars");
-                            return BADTOKEN;
-                          }
-                          strncpy(string_buf, gedcom_text, MAXGEDCTAGLEN+1);
-                         gedcom_lval.string = string_buf;
-                         BEGIN(NORMAL);
-                         return USERTAG;
-                        }
-
-{delim}      { gedcom_lval.string = gedcom_text;
-               return DELIM;
-             }
-
-{any_but_delim} { gedcom_lval.string = gedcom_text;
-                  return ANYCHAR;
-                }
-
-{escape}/{non_at}  { gedcom_lval.string = gedcom_text;
-                     return ESCAPE;
-                   }
-
-{pointer}    { gedcom_lval.string = gedcom_text;
-               return POINTER;
-             }
-
-   /* Due to the conversion of level numbers into brackets, the
-      terminator is not important, so no token is returned here.
-      Although not strictly according to the GEDCOM spec, we'll ignore
-      whitespace just before the terminator.
-   */
-
-{gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); }
-
-   /* Eventually we have to return 1 closing bracket (for the trailer).
-      We can detect whether we have sent the closing bracket using the
-      level_diff (at eof, first it is 2, then we increment it ourselves) */
-
-<<EOF>> { if (level_diff == 2) {
-           level_diff++;
-            return CLOSE;
-          }
-          else {
-           yyterminate();
-         }
-        } 
-
-.  { gedcom_error("Unexpected character: '%s' (0x%02x)",
-                 gedcom_text, gedcom_text[0]);
-     return BADTOKEN;
-   }
-
-%%
-
-int gedcom_wrap()
-{
-  return 1;
-}
index b455587196502464fb3ac9da71f9b519f2099b7f..7b8b2ef421440c6e4c36d4f0ea44dd76b6788729 100644 (file)
--- a/gedcom.y
+++ b/gedcom.y
 
 %{
 #include "gedcom.h"
+#include "multilex.h"
 
 int  count_level    = 0;
 int  fail           = 0;
@@ -129,7 +130,7 @@ int  compat_enabled = 1;
 int  gedcom_high_level_debug = 0; 
 int  compatibility  = 0; 
 MECHANISM error_mechanism=IMMED_FAIL;
-char string_buf[MAXGEDCLINELEN+1];
+char string_buf[MAXGEDCLINELEN*4+1];
 char *string_buf_ptr;
 
 enum _COMPAT {
@@ -2109,10 +2110,15 @@ opt_line_item : /* empty */ { }
               | DELIM line_item { }
               ;
 
-line_item   : anychar  { CLEAR_BUFFER(string_buf);
+line_item   : anychar  { int i;
+                        CLEAR_BUFFER(string_buf);
                          string_buf_ptr = string_buf;
                         /* The following also takes care of '@@' */
-                        *string_buf_ptr++ = $1[0];
+                        if (!strncmp($1, "@@", 3))
+                          *string_buf_ptr++ = '@';
+                        else
+                          for (i=0; i < strlen($1); i++)
+                            *string_buf_ptr++ = $1[i];
                         $$ = string_buf;
                        }
             | ESCAPE   { CLEAR_BUFFER(string_buf);
@@ -2126,8 +2132,13 @@ line_item   : anychar  { CLEAR_BUFFER(string_buf);
                      YYERROR;
                    }
                    else {
+                     int i;
                      /* The following also takes care of '@@' */
-                     *string_buf_ptr++ = $2[0];
+                     if (!strncmp($2, "@@", 3))
+                       *string_buf_ptr++ = '@';
+                     else
+                       for (i=0; i < strlen($2); i++)
+                         *string_buf_ptr++ = $2[i];
                      $$ = string_buf;
                    }
                  }
@@ -2436,3 +2447,4 @@ int compat_mode(int compat_flags)
 {
   return (compat_flags & compatibility);
 }
+
diff --git a/gedcom_lohi.lex b/gedcom_lohi.lex
new file mode 100644 (file)
index 0000000..6d88b43
--- /dev/null
@@ -0,0 +1,357 @@
+/* $Id$ */
+/* $Name$ */
+
+/* In low-high order, a space is encoded as 0x20 0x00 */
+/* i.e. this is utf-16-le */
+
+%{
+#include "gedcom.tab.h"
+#include "gedcom.h"
+#include "multilex.h"
+#include "encoding.h"
+%}
+
+%s NORMAL
+%s EXPECT_TAG
+
+alpha        [A-Za-z_]\x00
+digit        [0-9]\x00
+delim        \x20\x00
+tab          [\t]\x00
+hash         #\x00
+literal_at   @\x00@\x00
+otherchar    [\x21-\x22\x24-\x2F\x3A-\x3F\x5B-\x5E\x60\x7B-\x7E\x80-\xFF]\x00|[\x00-\xFF][\x01-\xFF]
+terminator   \x0D\x00|\x0A\x00|\x0D\x00\x0A\x00|\x0A\x00\x0D\x00
+
+any_char     {alpha}|{digit}|{otherchar}|{delim}|{hash}|{literal_at}
+any_but_delim {alpha}|{digit}|{otherchar}|{hash}|{literal_at}
+non_at       {alpha}|{digit}|{otherchar}|{delim}|{hash}
+alphanum     {alpha}|{digit}
+gen_delim    {delim}|{tab}
+
+escape       @\x00#\x00{any_char}+@\x00
+pointer      @\x00{alphanum}{non_at}+@\x00
+
+%{
+static int current_level=-1;
+static int level_diff=MAXGEDCLEVEL;
+#ifdef LEXER_TEST 
+YYSTYPE gedcom_lval;
+int line_no = 1; 
+#endif
+%} 
+
+%%
+
+    /* The GEDCOM level number is converted into a sequence of opening
+       and closing brackets.  Simply put, the following GEDCOM fragment:
+
+         0 HEAD
+        1 SOUR genes
+        2 VERS 1.6
+        2 NAME Genes
+        1 DATE 07 OCT 2001
+        ...
+        0 TRLR
+
+       is converted into:
+
+         { HEAD                     (initial)  
+        { SOUR genes               (1 higher: no closing brackets)
+        { VERS 1.6                 (1 higher: no closing brackets)
+        } { NAME Genes             (same level: 1 closing bracket)
+        } } { DATE 07 OCT 2001     (1 lower: 2 closing brackets)
+        ...
+        } { TRLR }
+
+       or more clearly:
+
+         { HEAD
+          { SOUR genes
+            { VERS 1.6 }
+            { NAME Genes } }
+          { DATE 07 OCT 2001
+        ... }
+        { TRLR }
+
+       But because this means that one token is converted into a series
+       of tokens, there is some initial code following immediately here
+       that returns "pending" tokens. */
+
+%{
+char string_buf[MAXGEDCLINELEN+1];
+/* Hand out brackets that are still owed from the previous level number
+   before scanning any further input:
+     level_diff < 1   one more CLOSE is pending
+     level_diff == 1  the OPEN for the current line is pending
+     otherwise        nothing pending, continue with normal scanning */
+if (level_diff < 1) {
+  level_diff++;
+  return CLOSE;
+}
+else if (level_diff == 1) {
+  level_diff++;
+  return OPEN;
+}
+else {
+  /* out of brackets... */
+}
+
+/* Convert the current match to the internal encoding.  The length passed
+   is always yyleng, whatever 'str' is, so 'str' has to hold the bytes of
+   the current match. */
+#define TO_INTERNAL(str) to_internal(str, yyleng) 
+
+/* Action shared by all standard-tag rules: pass the converted tag text to
+   the parser, switch to the NORMAL start condition and return the token
+   TAG_<tag>. */
+#define MKTAGACTION(tag) \
+  { gedcom_lval.string = TO_INTERNAL(yytext); \
+    BEGIN(NORMAL); \
+    return TAG_##tag; }
+
+%}
+
+<INITIAL>{gen_delim}* /* ignore leading whitespace (also tabs) */
+
+    /* A level number may not start with a zero.  In low-high order the
+       character '0' is the byte pair 0x30 0x00, so the '0' byte comes
+       first and its 0x00 second. */
+
+<INITIAL>0\x00{digit}+ { gedcom_error ("Level number with leading zero");
+                         return BADTOKEN;
+                       }
+
+<INITIAL>{digit}+ { /* Turn the level number into the first of the brackets
+                       it stands for; any further brackets are returned by
+                       the code at the top of the rules section. */
+                    int new_level = atoi(TO_INTERNAL(yytext));
+                    if (new_level < 0 || new_level > MAXGEDCLEVEL) {
+                      gedcom_error ("Level number out of range [0..%d]",
+                                    MAXGEDCLEVEL);
+                      return BADTOKEN;
+                    }
+                    level_diff = new_level - current_level;
+                    current_level = new_level;
+                    BEGIN(EXPECT_TAG);
+                    if (level_diff > 1) {
+                      /* should never happen (error to GEDCOM spec) */
+                      gedcom_error ("GEDCOM level number is %d higher than "
+                                    "previous",
+                                    level_diff);
+                      return BADTOKEN;
+                    }
+                    if (level_diff == 1) {
+                      /* one level deeper: only an opening bracket */
+                      level_diff++;
+                      return OPEN;
+                    }
+                    /* same level or lower: start with a closing bracket */
+                    level_diff++;
+                    return CLOSE;
+                  }
+
+<EXPECT_TAG>A\x00B\x00B\x00R\x00  MKTAGACTION(ABBR)
+<EXPECT_TAG>A\x00D\x00D\x00R\x00  MKTAGACTION(ADDR)
+<EXPECT_TAG>A\x00D\x00R\x001\x00  MKTAGACTION(ADR1)
+<EXPECT_TAG>A\x00D\x00R\x002\x00  MKTAGACTION(ADR2)
+<EXPECT_TAG>A\x00D\x00O\x00P\x00  MKTAGACTION(ADOP)
+<EXPECT_TAG>A\x00F\x00N\x00   MKTAGACTION(AFN)
+<EXPECT_TAG>A\x00G\x00E\x00   MKTAGACTION(AGE)
+<EXPECT_TAG>A\x00G\x00N\x00C\x00  MKTAGACTION(AGNC)
+<EXPECT_TAG>A\x00L\x00I\x00A\x00  MKTAGACTION(ALIA)
+<EXPECT_TAG>A\x00N\x00C\x00E\x00  MKTAGACTION(ANCE)
+<EXPECT_TAG>A\x00N\x00C\x00I\x00  MKTAGACTION(ANCI)
+<EXPECT_TAG>A\x00N\x00U\x00L\x00  MKTAGACTION(ANUL)
+<EXPECT_TAG>A\x00S\x00S\x00O\x00  MKTAGACTION(ASSO)
+<EXPECT_TAG>A\x00U\x00T\x00H\x00  MKTAGACTION(AUTH)
+<EXPECT_TAG>B\x00A\x00P\x00L\x00  MKTAGACTION(BAPL)
+<EXPECT_TAG>B\x00A\x00P\x00M\x00  MKTAGACTION(BAPM)
+<EXPECT_TAG>B\x00A\x00R\x00M\x00  MKTAGACTION(BARM)
+<EXPECT_TAG>B\x00A\x00S\x00M\x00  MKTAGACTION(BASM)
+<EXPECT_TAG>B\x00I\x00R\x00T\x00  MKTAGACTION(BIRT)
+<EXPECT_TAG>B\x00L\x00E\x00S\x00  MKTAGACTION(BLES)
+<EXPECT_TAG>B\x00L\x00O\x00B\x00  MKTAGACTION(BLOB)
+<EXPECT_TAG>B\x00U\x00R\x00I\x00  MKTAGACTION(BURI)
+<EXPECT_TAG>C\x00A\x00L\x00N\x00  MKTAGACTION(CALN)
+<EXPECT_TAG>C\x00A\x00S\x00T\x00  MKTAGACTION(CAST)
+<EXPECT_TAG>C\x00A\x00U\x00S\x00  MKTAGACTION(CAUS)
+<EXPECT_TAG>C\x00E\x00N\x00S\x00  MKTAGACTION(CENS)
+<EXPECT_TAG>C\x00H\x00A\x00N\x00  MKTAGACTION(CHAN)
+<EXPECT_TAG>C\x00H\x00A\x00R\x00  MKTAGACTION(CHAR)
+<EXPECT_TAG>C\x00H\x00I\x00L\x00  MKTAGACTION(CHIL)
+<EXPECT_TAG>C\x00H\x00R\x00   MKTAGACTION(CHR)
+<EXPECT_TAG>C\x00H\x00R\x00A\x00  MKTAGACTION(CHRA)
+<EXPECT_TAG>C\x00I\x00T\x00Y\x00  MKTAGACTION(CITY)
+<EXPECT_TAG>C\x00O\x00N\x00C\x00  MKTAGACTION(CONC)
+<EXPECT_TAG>C\x00O\x00N\x00F\x00  MKTAGACTION(CONF)
+<EXPECT_TAG>C\x00O\x00N\x00L\x00  MKTAGACTION(CONL)
+<EXPECT_TAG>C\x00O\x00N\x00T\x00  MKTAGACTION(CONT)
+<EXPECT_TAG>C\x00O\x00P\x00R\x00  MKTAGACTION(COPR)
+<EXPECT_TAG>C\x00O\x00R\x00P\x00  MKTAGACTION(CORP)
+<EXPECT_TAG>C\x00R\x00E\x00M\x00  MKTAGACTION(CREM)
+<EXPECT_TAG>C\x00T\x00R\x00Y\x00  MKTAGACTION(CTRY)
+<EXPECT_TAG>D\x00A\x00T\x00A\x00  MKTAGACTION(DATA)
+<EXPECT_TAG>D\x00A\x00T\x00E\x00  MKTAGACTION(DATE)
+<EXPECT_TAG>D\x00E\x00A\x00T\x00  MKTAGACTION(DEAT)
+<EXPECT_TAG>D\x00E\x00S\x00C\x00  MKTAGACTION(DESC)
+<EXPECT_TAG>D\x00E\x00S\x00I\x00  MKTAGACTION(DESI)
+<EXPECT_TAG>D\x00E\x00S\x00T\x00  MKTAGACTION(DEST)
+<EXPECT_TAG>D\x00I\x00V\x00   MKTAGACTION(DIV)
+<EXPECT_TAG>D\x00I\x00V\x00F\x00  MKTAGACTION(DIVF)
+<EXPECT_TAG>D\x00S\x00C\x00R\x00  MKTAGACTION(DSCR)
+<EXPECT_TAG>E\x00D\x00U\x00C\x00  MKTAGACTION(EDUC)
+<EXPECT_TAG>E\x00M\x00I\x00G\x00  MKTAGACTION(EMIG)
+<EXPECT_TAG>E\x00N\x00D\x00L\x00  MKTAGACTION(ENDL)
+<EXPECT_TAG>E\x00N\x00G\x00A\x00  MKTAGACTION(ENGA)
+<EXPECT_TAG>E\x00V\x00E\x00N\x00  MKTAGACTION(EVEN)
+<EXPECT_TAG>F\x00A\x00M\x00   MKTAGACTION(FAM)
+<EXPECT_TAG>F\x00A\x00M\x00C\x00  MKTAGACTION(FAMC)
+<EXPECT_TAG>F\x00A\x00M\x00F\x00  MKTAGACTION(FAMF)
+<EXPECT_TAG>F\x00A\x00M\x00S\x00  MKTAGACTION(FAMS)
+<EXPECT_TAG>F\x00C\x00O\x00M\x00  MKTAGACTION(FCOM)
+<EXPECT_TAG>F\x00I\x00L\x00E\x00  MKTAGACTION(FILE)
+<EXPECT_TAG>F\x00O\x00R\x00M\x00  MKTAGACTION(FORM)
+<EXPECT_TAG>G\x00E\x00D\x00C\x00  MKTAGACTION(GEDC)
+<EXPECT_TAG>G\x00I\x00V\x00N\x00  MKTAGACTION(GIVN)
+<EXPECT_TAG>G\x00R\x00A\x00D\x00  MKTAGACTION(GRAD)
+<EXPECT_TAG>H\x00E\x00A\x00D\x00  MKTAGACTION(HEAD)
+<EXPECT_TAG>H\x00U\x00S\x00B\x00  MKTAGACTION(HUSB)
+<EXPECT_TAG>I\x00D\x00N\x00O\x00  MKTAGACTION(IDNO)
+<EXPECT_TAG>I\x00M\x00M\x00I\x00  MKTAGACTION(IMMI)
+<EXPECT_TAG>I\x00N\x00D\x00I\x00  MKTAGACTION(INDI)
+<EXPECT_TAG>L\x00A\x00N\x00G\x00  MKTAGACTION(LANG)
+<EXPECT_TAG>L\x00E\x00G\x00A\x00  MKTAGACTION(LEGA)
+<EXPECT_TAG>M\x00A\x00R\x00B\x00  MKTAGACTION(MARB)
+<EXPECT_TAG>M\x00A\x00R\x00C\x00  MKTAGACTION(MARC)
+<EXPECT_TAG>M\x00A\x00R\x00L\x00  MKTAGACTION(MARL)
+<EXPECT_TAG>M\x00A\x00R\x00R\x00  MKTAGACTION(MARR)
+<EXPECT_TAG>M\x00A\x00R\x00S\x00  MKTAGACTION(MARS)
+<EXPECT_TAG>M\x00E\x00D\x00I\x00  MKTAGACTION(MEDI)
+<EXPECT_TAG>N\x00A\x00M\x00E\x00  MKTAGACTION(NAME)
+<EXPECT_TAG>N\x00A\x00T\x00I\x00  MKTAGACTION(NATI)
+<EXPECT_TAG>N\x00A\x00T\x00U\x00  MKTAGACTION(NATU)
+<EXPECT_TAG>N\x00C\x00H\x00I\x00  MKTAGACTION(NCHI)
+<EXPECT_TAG>N\x00I\x00C\x00K\x00  MKTAGACTION(NICK)
+<EXPECT_TAG>N\x00M\x00R\x00   MKTAGACTION(NMR)
+<EXPECT_TAG>N\x00O\x00T\x00E\x00  MKTAGACTION(NOTE)
+<EXPECT_TAG>N\x00P\x00F\x00X\x00  MKTAGACTION(NPFX)
+<EXPECT_TAG>N\x00S\x00F\x00X\x00  MKTAGACTION(NSFX)
+<EXPECT_TAG>O\x00B\x00J\x00E\x00  MKTAGACTION(OBJE)
+<EXPECT_TAG>O\x00C\x00C\x00U\x00  MKTAGACTION(OCCU)
+<EXPECT_TAG>O\x00R\x00D\x00I\x00  MKTAGACTION(ORDI)
+<EXPECT_TAG>O\x00R\x00D\x00N\x00  MKTAGACTION(ORDN)
+<EXPECT_TAG>P\x00A\x00G\x00E\x00  MKTAGACTION(PAGE)
+<EXPECT_TAG>P\x00E\x00D\x00I\x00  MKTAGACTION(PEDI)
+<EXPECT_TAG>P\x00H\x00O\x00N\x00  MKTAGACTION(PHON)
+<EXPECT_TAG>P\x00L\x00A\x00C\x00  MKTAGACTION(PLAC)
+<EXPECT_TAG>P\x00O\x00S\x00T\x00  MKTAGACTION(POST)
+<EXPECT_TAG>P\x00R\x00O\x00B\x00  MKTAGACTION(PROB)
+<EXPECT_TAG>P\x00R\x00O\x00P\x00  MKTAGACTION(PROP)
+<EXPECT_TAG>P\x00U\x00B\x00L\x00  MKTAGACTION(PUBL)
+<EXPECT_TAG>Q\x00U\x00A\x00Y\x00  MKTAGACTION(QUAY)
+<EXPECT_TAG>R\x00E\x00F\x00N\x00  MKTAGACTION(REFN)
+<EXPECT_TAG>R\x00E\x00L\x00A\x00  MKTAGACTION(RELA)
+<EXPECT_TAG>R\x00E\x00L\x00I\x00  MKTAGACTION(RELI)
+<EXPECT_TAG>R\x00E\x00P\x00O\x00  MKTAGACTION(REPO)
+<EXPECT_TAG>R\x00E\x00S\x00I\x00  MKTAGACTION(RESI)
+<EXPECT_TAG>R\x00E\x00S\x00N\x00  MKTAGACTION(RESN)
+<EXPECT_TAG>R\x00E\x00T\x00I\x00  MKTAGACTION(RETI)
+<EXPECT_TAG>R\x00F\x00N\x00   MKTAGACTION(RFN)
+<EXPECT_TAG>R\x00I\x00N\x00   MKTAGACTION(RIN)
+<EXPECT_TAG>R\x00O\x00L\x00E\x00  MKTAGACTION(ROLE)
+<EXPECT_TAG>S\x00E\x00X\x00   MKTAGACTION(SEX)
+<EXPECT_TAG>S\x00L\x00G\x00C\x00  MKTAGACTION(SLGC)
+<EXPECT_TAG>S\x00L\x00G\x00S\x00  MKTAGACTION(SLGS)
+<EXPECT_TAG>S\x00O\x00U\x00R\x00  MKTAGACTION(SOUR)
+<EXPECT_TAG>S\x00P\x00F\x00X\x00  MKTAGACTION(SPFX)
+<EXPECT_TAG>S\x00S\x00N\x00   MKTAGACTION(SSN)
+<EXPECT_TAG>S\x00T\x00A\x00E\x00  MKTAGACTION(STAE)
+<EXPECT_TAG>S\x00T\x00A\x00T\x00  MKTAGACTION(STAT)
+<EXPECT_TAG>S\x00U\x00B\x00M\x00  MKTAGACTION(SUBM)
+<EXPECT_TAG>S\x00U\x00B\x00N\x00  MKTAGACTION(SUBN)
+<EXPECT_TAG>S\x00U\x00R\x00N\x00  MKTAGACTION(SURN)
+<EXPECT_TAG>T\x00E\x00M\x00P\x00  MKTAGACTION(TEMP)
+<EXPECT_TAG>T\x00E\x00X\x00T\x00  MKTAGACTION(TEXT)
+<EXPECT_TAG>T\x00I\x00M\x00E\x00  MKTAGACTION(TIME)
+<EXPECT_TAG>T\x00I\x00T\x00L\x00  MKTAGACTION(TITL)
+<EXPECT_TAG>T\x00R\x00L\x00R\x00  MKTAGACTION(TRLR)
+<EXPECT_TAG>T\x00Y\x00P\x00E\x00  MKTAGACTION(TYPE)
+<EXPECT_TAG>V\x00E\x00R\x00S\x00  MKTAGACTION(VERS)
+<EXPECT_TAG>W\x00I\x00F\x00E\x00  MKTAGACTION(WIFE)
+<EXPECT_TAG>W\x00I\x00L\x00L\x00  MKTAGACTION(WILL)
+     
+<EXPECT_TAG>{alphanum}+ { /* yytext holds two bytes per character, every
+                             second one 0x00, so the C string functions
+                             cannot be used on it: the length comes from
+                             yyleng and the text is converted directly. */
+                          if (yyleng / 2 > MAXGEDCTAGLEN) {
+                            gedcom_error("Tag '%s' too long, max %d chars",
+                                         TO_INTERNAL(yytext), MAXGEDCTAGLEN);
+                            return BADTOKEN;
+                          }
+                          gedcom_lval.string = TO_INTERNAL(yytext);
+                          BEGIN(NORMAL);
+                          return USERTAG;
+                        }
+
+{delim}      { gedcom_lval.string = TO_INTERNAL(yytext);
+               return DELIM;
+             }
+
+{any_but_delim} { gedcom_lval.string = TO_INTERNAL(yytext);
+                  return ANYCHAR;
+                }
+
+{escape}/{non_at}  { gedcom_lval.string = TO_INTERNAL(yytext);
+                     return ESCAPE;
+                   }
+
+{pointer}    { gedcom_lval.string = TO_INTERNAL(yytext);
+               return POINTER;
+             }
+
+   /* Due to the conversion of level numbers into brackets, the
+      terminator is not important, so no token is returned here.
+      Although not strictly according to the GEDCOM spec, we'll ignore
+      whitespace just before the terminator.
+   */
+
+{gen_delim}*{terminator} { line_no++; BEGIN(INITIAL); }
+
+   /* Eventually we have to return 1 closing bracket (for the trailer).
+      We can detect whether we have sent the closing bracket using the
+      level_diff (at eof, first it is 2, then we increment it ourselves) */
+
+<<EOF>> { if (level_diff == 2) {
+           level_diff++;
+            return CLOSE;
+          }
+          else {
+           yyterminate();
+         }
+        } 
+
+   /* Catch-all for a byte that no other rule accepts.  The byte is cast to
+      unsigned char before printing, so that values of 0x80 and above are
+      shown as two hex digits even where plain char is signed. */
+
+.  { gedcom_error("Unexpected character: '%s' (0x%02x)",
+                 yytext, (unsigned char) yytext[0]);
+     return BADTOKEN;
+   }
+
+%%
+
+/* Called by the scanner when it reaches the end of its input; returning 1
+   tells it that there is no further input to continue with. */
+int yywrap()
+{
+  return 1;
+}
+
+#ifdef LEXER_TEST
+
+/* Test driver, compiled only when LEXER_TEST is defined: opens the
+   conversion from UTF16LE, runs the low-high scanner until it returns 0
+   and prints one word per token.
+   Returns 1 if the conversion could not be opened, 0 otherwise. */
+int main()
+{
+  int tok;
+
+  if (!open_conv_to_internal("UTF16LE")) {
+    gedcom_error("Unable to open conversion context: %s",
+                strerror(errno));
+    return 1;
+  }
+
+  for (tok = gedcom_lohi_lex(); tok != 0; tok = gedcom_lohi_lex()) {
+    if (tok == BADTOKEN)
+      printf("BADTOKEN ");
+    else if (tok == OPEN)
+      printf("OPEN ");
+    else if (tok == CLOSE)
+      printf("CLOSE ");
+    else if (tok == ESCAPE)
+      printf("ESCAPE(%s) ", gedcom_lval.string);
+    else if (tok == DELIM)
+      printf("DELIM ");
+    else if (tok == ANYCHAR)
+      printf("%s ", gedcom_lval.string);
+    else if (tok == POINTER)
+      printf("POINTER(%s) ", gedcom_lval.string);
+    else if (tok == USERTAG)
+      printf("USERTAG(%s) ", gedcom_lval.string);
+    else
+      /* every remaining token is one of the standard tags */
+      printf("TAG(%s) ", gedcom_lval.string);
+  }
+
+  printf("\n");
+  close_conv_to_internal();
+  return 0;
+}
+#endif
index 41c77524b3a99f9af453bd48f1c9250d401d02e9..452c1db394e1d88bc2996edbd2b3ae26313c36c2 100644 (file)
@@ -2,6 +2,7 @@
 /* $Name$ */
 
 #include "gedcom.h"
+#include "multilex.h"
 
 void show_help ()
 {
@@ -17,65 +18,6 @@ void show_help ()
   printf("  -da   Debug setting: libgedcom + yacc debug messages\n");
 }
 
-int determine_encoding(FILE* f)
-{
-  char first[2];
-
-  fread(first, 1, 2, f);
-  if ((first[0] == '0') && (first[1] == ' ')) {
-    gedcom_warning("One-byte encoding");
-    fseek(f, 0, 0);
-    return ONE_BYTE;
-  }
-  else if ((first[0] == '\0') && (first[1] == '0'))
-  {
-    gedcom_warning("Two-byte encoding, high-low");
-    fseek(f, 0, 0);
-    return TWO_BYTE_HILO;
-  }
-  else if ((first[0] == '\xFE') && (first[1] == '\xFF'))
-  {
-    gedcom_warning("Two-byte encoding, high-low, with BOM");
-    return TWO_BYTE_HILO;
-  }
-  else if ((first[0] == '0') && (first[1] == '\0'))
-  {
-    gedcom_warning("Two-byte encoding, low-high");
-    fseek(f, 0, 0);
-    return TWO_BYTE_LOHI;
-  }
-  else if ((first[0] == '\xFF') && (first[1] == '\xFE'))
-  {
-    gedcom_warning("Two-byte encoding, low-high, with BOM");
-    return TWO_BYTE_LOHI;
-  }
-  else {
-    gedcom_warning("Unknown encoding, falling back to one-byte");
-    fseek(f, 0, 0);
-    return ONE_BYTE;
-  }
-}
-
-int gedcom_xxx_parse(char* file_name)
-{
-  ENCODING enc;
-  FILE* file = fopen (file_name, "r");
-  if (!file) {
-    printf("Could not open file '%s'\n", file_name);
-    exit(1);
-  }
-  enc = determine_encoding(file);
-
-  if (enc == ONE_BYTE) {
-    gedcom_in = file;
-    return gedcom_parse();
-  }
-  else {
-    printf("No parser yet for encoding\n");
-    exit(1);
-  }
-}
-
 int main(int argc, char* argv[])
 {
   MECHANISM mech = IMMED_FAIL;
@@ -133,31 +75,3 @@ int main(int argc, char* argv[])
     return 1;
   }  
 }
-
-int gedcom_warning(char* s, ...)
-{
-  int res;
-  va_list ap;
-
-  va_start(ap, s);
-  fprintf(stderr, "Warning on line %d: ", line_no);
-  res = vfprintf(stderr, s, ap);
-  fprintf(stderr, "\n");
-  va_end(ap);
-  
-  return res;
-}
-
-int gedcom_error(char* s, ...)
-{
-  int res;
-  va_list ap;
-
-  va_start(ap, s);
-  fprintf(stderr, "Error on line %d: ", line_no);
-  res = vfprintf(stderr, s, ap);
-  fprintf(stderr, "\n");
-  va_end(ap);
-  
-  return res;
-}