Copied from old documentation. Removed all Gedcom_val details.
[gedcom-parse.git] / gedcom / gedcom_lex_common.c
index d528b02cbe83bf6e3086df8e4f2d3539928439c5..06c1f77a8a5fefb51696d6fb7ac63e9b1c7f917c 100644 (file)
@@ -1,5 +1,5 @@
 /* Common lexer code.
-   Copyright (C) 2001 The Genes Development Team
+   Copyright (C) 2001, 2002 The Genes Development Team
    This file is part of the Gedcom parser library.
    Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
 
 /* $Id$ */
 /* $Name$ */
 
-#ifndef IN_LEX
+#if LEX_SECTION == 1
 
 #include "gedcom_internal.h"
 #include "multilex.h"
 #include "encoding.h"
+#include "encoding_state.h"
 #include "gedcom.h"
-#include "gedcom.tab.h"
-
-#define YY_NO_UNPUT
+#include "gedcom.tabgen.h"
+#include "compat.h"
 
 static size_t encoding_width;
 static int current_level = -1;
-static int level_diff=MAXGEDCLEVEL;
+static int level_diff = MAXGEDCLEVEL;
 static size_t line_len = 0;
+static int tab_space = 0;
+static int current_tag = -1;
+
+static struct conv_buffer* ptr_buffer = NULL;
+static struct conv_buffer* tag_buffer = NULL;
+static struct conv_buffer* str_buffer = NULL;
 
-static char ptr_buf[MAXGEDCPTRLEN * UTF_FACTOR + 1];
-static char tag_buf[MAXGEDCTAGLEN * UTF_FACTOR + 1];
-static char str_buf[MAXGEDCLINELEN * UTF_FACTOR + 1];
+/* Initial lengths handed to create_conv_buffer() for the pointer, tag and
+   string conversion buffers.  The expansions are parenthesized so that each
+   macro stays a single operand wherever it is used in an expression. */
+#define INITIAL_PTR_BUFFER_LEN (MAXGEDCPTRLEN * UTF_FACTOR + 1)
+#define INITIAL_TAG_BUFFER_LEN (MAXGEDCTAGLEN * UTF_FACTOR + 1)
+#define INITIAL_STR_BUFFER_LEN (MAXGEDCLINELEN * UTF_FACTOR + 1)
 
 #ifdef LEXER_TEST 
 YYSTYPE gedcom_lval;
@@ -48,10 +54,10 @@ int gedcom_lex();
 
 void message_handler(Gedcom_msg_type type, char *msg)
 {
-  fprintf(stderr, msg);
+  fprintf(stderr, "(%d) %s\n", type, msg);
 }
 
-int test_loop(ENCODING enc, char* code)
+int test_loop(ENCODING enc, const char* code)
 {
   int tok, res;
   init_encodings();
@@ -73,8 +79,8 @@ int test_loop(ENCODING enc, char* code)
       case DELIM: printf("DELIM "); break;
       case ANYCHAR: printf("%s ", gedcom_lval.string); break;
       case POINTER: printf("POINTER(%s) ", gedcom_lval.string); break;
-      case USERTAG: printf("USERTAG(%s) ", gedcom_lval.string); break;
-      default: printf("TAG(%s) ", gedcom_lval.string); break;
+      case USERTAG: printf("USERTAG(%s) ", gedcom_lval.tag.string); break;
+      default: printf("TAG(%s) ", gedcom_lval.tag.string); break;
     }
     tok = gedcom_lex();
   }
@@ -85,10 +91,71 @@ int test_loop(ENCODING enc, char* code)
  
 #endif /* of #ifdef LEXER_TEST */
 
-#else  /* of #ifndef IN_LEX */
+/* These are defined as functions here, because xgettext has trouble
+   extracting the strings out of long pre-processor defines */
+
+/* Emit the "line too long" message through gedcom_error(), quoting
+   MAXGEDCLINELEN as the allowed maximum.  Takes no arguments. */
+static void error_line_too_long(void)
+{
+  gedcom_error(_("Line too long, max %d characters allowed"),
+              MAXGEDCLINELEN);
+}
+
+/* Emit the "level number with leading zero" message through
+   gedcom_error().  Takes no arguments. */
+static void error_level_leading_zero(void)
+{
+  gedcom_error (_("Level number with leading zero not allowed"));
+}
+
+/* Emit the "level number out of range" message through gedcom_error(),
+   quoting MAXGEDCLEVEL as the upper bound.  Takes no arguments. */
+static void error_level_out_of_range(void)
+{
+  gedcom_error (_("Level number out of range [0..%d]"), MAXGEDCLEVEL);
+}
+
+/* Emit the "level number is N higher than previous" message through
+   gedcom_error(); N is the level_diff argument.  Inside this function the
+   parameter hides the file-level static variable of the same name. */
+static void error_level_too_high(int level_diff)
+{
+  gedcom_error (_("GEDCOM level number is %d higher than previous"), level_diff);
+}
+
+/* Emit the "tag too long" message through gedcom_error().
+   tag is printed with %s, followed by MAXGEDCTAGLEN as the allowed maximum. */
+static void error_tag_too_long(const char *tag)
+{
+  gedcom_error(_("Tag '%s' too long, max %d characters allowed"), tag, MAXGEDCTAGLEN);
+}
+
+/* Emit the "invalid character for encoding" message through gedcom_error().
+   str is printed with %s, ch as a two-digit hexadecimal value.
+   ch is converted through unsigned char first: where plain char is signed,
+   a byte >= 0x80 would otherwise be sign-extended by the default argument
+   promotion and show up as e.g. 0xffffffe9 instead of 0xe9.  The final cast
+   to unsigned int matches the type the %x conversion expects. */
+static void error_invalid_character(const char *str, char ch)
+{
+  gedcom_error(_("Invalid character for encoding: '%s' (0x%02x)"),
+              str, (unsigned int)(unsigned char)ch);
+}
+
+/* Emit the "pointer too long" message through gedcom_error().
+   ptr is printed with %s, followed by MAXGEDCPTRLEN as the allowed maximum. */
+static void error_pointer_too_long(const char *ptr)
+{
+  gedcom_error(_("Pointer '%s' too long, max %d characters allowed"), ptr, MAXGEDCPTRLEN);
+}
+
+/* Emit the message that '@' must be written as '@@' in values, through
+   gedcom_error().  Takes no arguments. */
+static void error_at_character(void)
+{
+  gedcom_error(_("'@' character should be written as '@@' in values"));
+}
+
+/* Emit the "tab character is not allowed in values" message through
+   gedcom_error().  Takes no arguments. */
+static void error_tab_character(void)
+{
+  gedcom_error(_("Tab character is not allowed in values"));
+}
+
+/* Emit the "unexpected character" message through gedcom_error().
+   str is printed with %s, ch as a two-digit hexadecimal value.
+   ch is converted through unsigned char first: where plain char is signed,
+   a byte >= 0x80 would otherwise be sign-extended by the default argument
+   promotion and show up as e.g. 0xffffffe9 instead of 0xe9.  The final cast
+   to unsigned int matches the type the %x conversion expects. */
+static void error_unexpected_character(const char* str, char ch)
+{
+  gedcom_error(_("Unexpected character: '%s' (0x%02x)"),
+              str, (unsigned int)(unsigned char)ch);
+}
+
+/* This is to bypass the iconv conversion (if the input is UTF-8 coming
+   from the program) */
+static int dummy_conv = 0;
+
+#elif LEX_SECTION == 2
 
 #define TO_INTERNAL(STR,OUTBUF) \
-  to_internal(STR, yyleng, OUTBUF, sizeof(OUTBUF))
+  (dummy_conv ? STR : to_internal(STR, yyleng, OUTBUF))
 
 #define INIT_LINE_LEN \
   line_len = 0;
@@ -96,20 +163,29 @@ int test_loop(ENCODING enc, char* code)
 #define CHECK_LINE_LEN                                                        \
   { if (line_len != (size_t)-1) {                                             \
       line_len += strlen(yytext);                                             \
-      if (line_len > MAXGEDCLINELEN * encoding_width) {                       \
-        gedcom_error("Line too long, max %d characters",                      \
-                    MAXGEDCLINELEN);                                         \
+      if (line_len > MAXGEDCLINELEN * encoding_width                          \
+         && ! compat_long_line(current_level, current_tag)) {                \
+        error_line_too_long();                                                \
         line_len = (size_t)-1;                                                \
         return BADTOKEN;                                                      \
       }                                                                       \
     }                                                                         \
   }
 
+/* Hand a single space (" ") to the parser as a DELIM token and decrement
+   tab_space.  The expansion ends in a return, so nothing after it in the
+   enclosing action is executed.
+   NOTE(review): ACTION_BEFORE_REGEXPS also post-decrements tab_space in its
+   condition before invoking this macro, so each generated space appears to
+   consume two counts, and ACTION_TAB's initial value of 8 would then yield
+   fewer than 8 spaces -- confirm whether this is intended. */
+#define GENERATE_TAB_SPACE                                                    \
+  { gedcom_lval.string = " ";                                                 \
+    tab_space--;                                                              \
+    return DELIM;                                                             \
+  }
+
 #define MKTAGACTION(THETAG)                                                  \
   { CHECK_LINE_LEN;                                                          \
-    gedcom_lval.string = TO_INTERNAL(yytext, tag_buf);                       \
+    gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buffer);                \
+    current_tag            = TAG_##THETAG;                                   \
+    gedcom_lval.tag.value  = current_tag;                                    \
     BEGIN(NORMAL);                                                           \
-    return TAG_##THETAG;                                                     \
+    line_no++;                                                               \
+    return current_tag;                                                      \
   }
 
 /* The GEDCOM level number is converted into a sequence of opening
@@ -145,10 +221,16 @@ int test_loop(ENCODING enc, char* code)
 
    But because this means that one token is converted into a series
    of tokens, there is some initial code following immediately here
-   that returns "pending" tokens. */
+   that returns "pending" tokens.
+
+   Also, for compatibility tabs are converted into spaces, which is
+   also handled here */
 
 #define ACTION_BEFORE_REGEXPS                                                 \
-   { if (level_diff < 1) {                                                    \
+   { if (compat_mode(C_TAB_CHARACTER) && tab_space-- > 0) {                   \
+       GENERATE_TAB_SPACE;                                                    \
+     }                                                                        \
+     else if (level_diff < 1) {                                               \
        level_diff++;                                                          \
        return CLOSE;                                                          \
      }                                                                        \
@@ -170,17 +252,17 @@ int test_loop(ENCODING enc, char* code)
 
 
 #define ACTION_0_DIGITS                                                       \
-   { gedcom_error ("Level number with leading zero");                         \
+   { error_level_leading_zero();                                              \
      return BADTOKEN;                                                         \
    } 
 
 
 #define ACTION_DIGITS                                                         \
-   { int level = atoi(TO_INTERNAL(yytext, str_buf));                          \
+   { int level = atoi(TO_INTERNAL(yytext, str_buffer));                       \
      CHECK_LINE_LEN;                                                          \
      if ((level < 0) || (level > MAXGEDCLEVEL)) {                             \
-       gedcom_error ("Level number out of range [0..%d]",                     \
-                    MAXGEDCLEVEL);                                           \
+       error_level_out_of_range();                                            \
+       line_no++;                                                             \
        return BADTOKEN;                                                       \
      }                                                                        \
      level_diff = level - current_level;                                      \
@@ -197,9 +279,8 @@ int test_loop(ENCODING enc, char* code)
      }                                                                        \
      else {                                                                   \
        /* should never happen (error to GEDCOM spec) */                       \
-       gedcom_error ("GEDCOM level number is %d higher than "                 \
-                    "previous",                                              \
-                    level_diff);                                             \
+       error_level_too_high(level_diff);                                      \
+       line_no++;                                                             \
        return BADTOKEN;                                                       \
      }                                                                        \
    } 
@@ -207,41 +288,52 @@ int test_loop(ENCODING enc, char* code)
 
 #define ACTION_ALPHANUM                                                       \
    { if (strlen(yytext) > MAXGEDCTAGLEN * encoding_width) {                   \
-       gedcom_error("Tag '%s' too long, max %d characters",                   \
-                   yytext, MAXGEDCTAGLEN);                                   \
+       error_tag_too_long(yytext);                                            \
+       line_no++;                                                             \
        return BADTOKEN;                                                       \
      }                                                                        \
      CHECK_LINE_LEN;                                                          \
-     gedcom_lval.string = TO_INTERNAL(yytext, tag_buf);                       \
+     gedcom_lval.tag.string = TO_INTERNAL(yytext, tag_buffer);                \
+     gedcom_lval.tag.value  = USERTAG;                                        \
      BEGIN(NORMAL);                                                           \
+     line_no++;                                                               \
      return USERTAG;                                                          \
    }
 
 
 #define ACTION_DELIM                                                          \
   { CHECK_LINE_LEN;                                                           \
-    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
+    gedcom_lval.string = TO_INTERNAL(yytext, str_buffer);                     \
     return DELIM;                                                             \
   }
 
 
 #define ACTION_ANY                                                            \
-  { CHECK_LINE_LEN;                                                           \
-    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
-    /* Due to character conversions, it is possible that the current          \
-       character will be combined with the next, and so now we don't have a   \
-       character yet...                                                       \
-       In principle, this is only applicable to the 1byte case (e.g. ANSEL),  \
-       but it doesn't harm the unicode case.                                  \
-    */                                                                        \
-    if (strlen(gedcom_lval.string) > 0)                                       \
-      return ANYCHAR;                                                         \
+  { char* tmp;                                                                \
+    CHECK_LINE_LEN;                                                           \
+    tmp = TO_INTERNAL(yytext, str_buffer);                                    \
+    if (!tmp) {                                                               \
+      /* Something went wrong during conversion... */                         \
+          error_invalid_character(yytext, yytext[0]);                         \
+          return BADTOKEN;                                                    \
+    }                                                                         \
+    else {                                                                    \
+      gedcom_lval.string = tmp;                                               \
+      /* Due to character conversions, it is possible that the current        \
+         character will be combined with the next, and so now we don't have a \
+         character yet...                                                     \
+         In principle, this is only applicable to the 1byte case (e.g. ANSEL),\
+         but it doesn't harm the unicode case.                                \
+      */                                                                      \
+      if (strlen(gedcom_lval.string) > 0)                                     \
+        return ANYCHAR;                                                       \
+    }                                                                         \
   }
 
 
 #define ACTION_ESCAPE                                                         \
   { CHECK_LINE_LEN;                                                           \
-    gedcom_lval.string = TO_INTERNAL(yytext, str_buf);                        \
+    gedcom_lval.string = TO_INTERNAL(yytext, str_buffer);                     \
     return ESCAPE;                                                            \
   }
 
@@ -249,11 +341,10 @@ int test_loop(ENCODING enc, char* code)
 #define ACTION_POINTER                                                        \
   { CHECK_LINE_LEN;                                                           \
     if (strlen(yytext) > MAXGEDCPTRLEN * encoding_width) {                    \
-      gedcom_error("Pointer '%s' too long, max %d characters",                \
-                  yytext, MAXGEDCPTRLEN);                                    \
+      error_pointer_too_long(yytext);                                         \
       return BADTOKEN;                                                        \
     }                                                                         \
-    gedcom_lval.string = TO_INTERNAL(yytext, ptr_buf);                        \
+    gedcom_lval.string = TO_INTERNAL(yytext, ptr_buffer);                     \
     return POINTER;                                                           \
   }
 
@@ -267,7 +358,8 @@ int test_loop(ENCODING enc, char* code)
 #define ACTION_TERMINATOR                                                     \
   { CHECK_LINE_LEN;                                                           \
     INIT_LINE_LEN;                                                            \
-    line_no++;                                                                \
+    if (line_no == 1)                                                         \
+      set_read_encoding_terminator(TO_INTERNAL(yytext, str_buffer));          \
     BEGIN(INITIAL);                                                           \
   }
 
@@ -283,19 +375,95 @@ int test_loop(ENCODING enc, char* code)
       return CLOSE;                                                           \
     }                                                                         \
     else {                                                                    \
-      /* Reset our state */                                                   \
-      current_level = -1;                                                     \
-      level_diff = MAXGEDCLEVEL;                                              \
-      /* ... then terminate lex */                                            \
+      char* ptr; int size;                                                    \
+      /* ... terminate lex */                                                 \
       yyterminate();                                                          \
+      /* Get rid of f*cking compiler warning from lex generated code */       \
+      /* yyterminate does return(), so program will never come here  */       \
+      yy_flex_realloc(ptr, size);                                             \
     }                                                                         \
   } 
 
+/* Lexer action named for a single '@' in the NORMAL state (the matching rule
+   itself is outside this section).
+   - In C_NO_DOUBLE_AT compatibility mode the matched text is pushed back onto
+     the input twice with unput(), last character first, so that the input
+     then holds the text two times in its original order.  The text is taken
+     from a strdup() copy, presumably because unput() may overwrite yytext
+     (see the flex manual -- confirm); the copy is freed afterwards.  If the
+     copy cannot be allocated, MEMORY_ERROR is invoked instead.
+   - Otherwise the "'@' should be written as '@@'" error is reported and
+     BADTOKEN is returned. */
+#define ACTION_NORMAL_AT                                                      \
+  { if (compat_mode(C_NO_DOUBLE_AT)) {                                        \
+      int i, j;                                                               \
+      char *yycopy = strdup(yytext);                                          \
+      if (yycopy) {                                                           \
+        for (i = 0; i < 2; i++)                                               \
+          for (j = yyleng - 1; j >= 0; --j)                                   \
+            unput(yycopy[j]);                                                 \
+        free(yycopy);                                                         \
+      }                                                                       \
+      else {                                                                  \
+        MEMORY_ERROR;                                                         \
+      }                                                                       \
+    }                                                                         \
+    else {                                                                    \
+      error_at_character();                                                   \
+      return BADTOKEN;                                                        \
+    }                                                                         \
+  }
+
+/* Lexer action for a tab character (the matching rule is outside this
+   section).  Unless C_TAB_CHARACTER compatibility mode is active, the tab
+   error is reported and BADTOKEN is returned.  In compatibility mode
+   tab_space is set to 8 and GENERATE_TAB_SPACE returns the first space. */
+#define ACTION_TAB                                                            \
+  { if (! compat_mode(C_TAB_CHARACTER)) {                                     \
+      error_tab_character();                                                  \
+      return BADTOKEN;                                                        \
+    }                                                                         \
+    tab_space = 8;                                                            \
+    GENERATE_TAB_SPACE;                                                       \
+  }
 
 #define ACTION_UNEXPECTED                                                     \
-  { gedcom_error("Unexpected character: '%s' (0x%02x)",                       \
-                yytext, yytext[0]);                                          \
+  { error_unexpected_character(yytext, yytext[0]);                            \
     return BADTOKEN;                                                          \
   }
 
-#endif /* IN_LEX */
+#elif LEX_SECTION == 3
+
+/* End-of-input hook called by the generated scanner.  Always returns 1;
+   per the flex manual a non-zero result means there is no further input
+   to continue with. */
+int yywrap(void)
+{
+  return 1;
+}
+
+/* Release the pointer, tag and string conversion buffers with
+   free_conv_buffer() and reset the static pointers to NULL, so that they do
+   not dangle and the NULL checks in init_conv_buffers() see the buffers as
+   absent if the lexer is initialized again. */
+static void free_conv_buffers(void)
+{
+  free_conv_buffer(ptr_buffer);
+  ptr_buffer = NULL;
+  free_conv_buffer(tag_buffer);
+  tag_buffer = NULL;
+  free_conv_buffer(str_buffer);
+  str_buffer = NULL;
+}
+
+/* Cleanup handler registered with atexit() by yymyinit().  Deletes the
+   scanner's current buffer, clears yy_current_buffer, and frees the
+   conversion buffers.  Declared with a (void) prototype to match the
+   void (*)(void) handler type that atexit() takes. */
+static void yylex_cleanup(void)
+{
+  /* fix memory leak in lex */
+  yy_delete_buffer(yy_current_buffer);
+  yy_current_buffer = NULL;
+  free_conv_buffers();
+}
+
+/* Create the pointer, tag and string conversion buffers if they do not exist
+   yet.  Each buffer is checked on its own, so an already existing buffer is
+   never overwritten and a buffer that is still NULL is created on the next
+   call even when the others already exist. */
+static void init_conv_buffers(void)
+{
+  if (!ptr_buffer)
+    ptr_buffer = create_conv_buffer(INITIAL_PTR_BUFFER_LEN);
+  if (!tag_buffer)
+    tag_buffer = create_conv_buffer(INITIAL_TAG_BUFFER_LEN);
+  if (!str_buffer)
+    str_buffer = create_conv_buffer(INITIAL_STR_BUFFER_LEN);
+}
+
+static int exitfuncregistered = 0;
+
+/* Prepare the lexer for reading from f.
+   - Registers yylex_cleanup() with atexit() once; the flag is only set when
+     the registration succeeded, so a failed registration is retried on the
+     next call.
+   - Makes sure the conversion buffers exist.
+   - Points the scanner at f and restarts it.
+   - Resets the lexer state kept in this file to its initial values and
+     switches the scanner to the INITIAL start condition. */
+void yymyinit(FILE *f)
+{
+  if (! exitfuncregistered && atexit(yylex_cleanup) == 0)
+    exitfuncregistered = 1;
+  init_conv_buffers();
+  yyin = f;
+  yyrestart(f);
+  /* Reset our state; line_len, tab_space and current_tag are included so
+     that nothing left over from a previous (possibly aborted) parse leaks
+     into this one */
+  current_level = -1;
+  level_diff = MAXGEDCLEVEL;
+  line_len = 0;
+  tab_space = 0;
+  current_tag = -1;
+  BEGIN(INITIAL);
+}
+
+#endif