iconv/glibc/ANSI_Z39.47.c

   1 /* Conversion for ANSI_Z39.47 aka ANSEL.
   2    Copyright (C) 2001 The Genes Development Team
   3    This file is part of the Gedcom parser library.
   4    Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
   5
   6    The Gedcom parser library is free software; you can redistribute it
   7    and/or modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The Gedcom parser library is distributed in the hope that it will be
  12    useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the Gedcom parser library; if not, write to the
  18    Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 /* $Id$ */
  22 /* $Name$ */
  23
  24 /* Generic conversion to and from ANSI Z39.47 (also known as ANSEL)
  25    Based on the ansi_x3.110.c file from the glibc sources
  26    Data coming from:
  27    http://lcweb.loc.gov/marc/specifications/speccharlatin.html
  28
  29    Note: in ANSEL, diacritical marks come *before* the base character;
  30    in Unicode, they come *after*...
  31 */
  32
  33 #include <dlfcn.h>
  34 #include <gconv.h>
  35 #include <stdint.h>
  36 #include <string.h>
  37 #include "ANSI_Z39.47-tables.h"
  38
  39 /* Omit first half of table: assume identity mapping (ASCII) */
  40 static const uint32_t to_ucs4[128] = TABLE_TO_UCS4_BASIC;
  41
  42 /* The outer array range runs from 0xe0 to 0xfe, the inner range from 0x20
  43    to 0x7f.  */
  44 static const uint32_t to_ucs4_comb[31][96] =
  45 {
  46   /* 0xe0 (hook above) */                           TABLE_TO_UCS4_COMBINING_E0,
  47   /* 0xe1 (grave) */                                TABLE_TO_UCS4_COMBINING_E1,
  48   /* 0xe2 (acute) */                                TABLE_TO_UCS4_COMBINING_E2,
  49   /* 0xe3 (circumflex) */                           TABLE_TO_UCS4_COMBINING_E3,
  50   /* 0xe4 (tilde) */                                TABLE_TO_UCS4_COMBINING_E4,
  51   /* 0xe5 (macron) */                               TABLE_TO_UCS4_COMBINING_E5,
  52   /* 0xe6 (breve) */                                TABLE_TO_UCS4_COMBINING_E6,
  53   /* 0xe7 (dot above) */                            TABLE_TO_UCS4_COMBINING_E7,
  54   /* 0xe8 (umlaut, diaeresis) */                    TABLE_TO_UCS4_COMBINING_E8,
  55   /* 0xe9 (caron, hacek) */                         TABLE_TO_UCS4_COMBINING_E9,
  56   /* 0xea (ring above) */                           TABLE_TO_UCS4_COMBINING_EA,
  57   /* 0xeb (ligature, left half) */                  TABLE_TO_UCS4_COMBINING_EB,
  58   /* 0xec (ligature, right half) */                 TABLE_TO_UCS4_COMBINING_EC,
  59   /* 0xed (comma above right) */                    TABLE_TO_UCS4_COMBINING_ED,
  60   /* 0xee (double acute) */                         TABLE_TO_UCS4_COMBINING_EE,
  61   /* 0xef (candrabindu) */                          TABLE_TO_UCS4_COMBINING_EF,
  62   /* 0xf0 (cedilla) */                              TABLE_TO_UCS4_COMBINING_F0,
  63   /* 0xf1 (ogonek, right hook) */                   TABLE_TO_UCS4_COMBINING_F1,
  64   /* 0xf2 (dot below) */                            TABLE_TO_UCS4_COMBINING_F2,
  65   /* 0xf3 (double dot below) */                     TABLE_TO_UCS4_COMBINING_F3,
  66   /* 0xf4 (ring below) */                           TABLE_TO_UCS4_COMBINING_F4,
  67   /* 0xf5 (double low line) */                      TABLE_TO_UCS4_COMBINING_F5,
  68   /* 0xf6 (line below) */                           TABLE_TO_UCS4_COMBINING_F6,
  69   /* 0xf7 (comma below, left hook) */               TABLE_TO_UCS4_COMBINING_F7,
  70   /* 0xf8 (left half ring below, right cedilla) */  TABLE_TO_UCS4_COMBINING_F8,
  71   /* 0xf9 (breve below, half circle below) */       TABLE_TO_UCS4_COMBINING_F9,
  72   /* 0xfa (double tilde, left half) */              TABLE_TO_UCS4_COMBINING_FA,
  73   /* 0xfb (double tilde, right half) */             TABLE_TO_UCS4_COMBINING_FB,
  74   /* 0xfc */                                        TABLE_TO_UCS4_COMBINING_FC,
  75   /* 0xfd */                                        TABLE_TO_UCS4_COMBINING_FD,
  76   /* 0xfe (comma above, high centered comma) */     TABLE_TO_UCS4_COMBINING_FE,
  77 };
  78
  79 /* Omit first part of table: assume identity mapping (ASCII) */
  80 static const char from_ucs4[][2] =      TABLE_FROM_UCS4_BASIC;
  81 static const char from_ucs4_p01a[][2] = TABLE_FROM_UCS4_PAGE_01A;
  82 static const char from_ucs4_p022[][2] = TABLE_FROM_UCS4_PAGE_022;
  83 static const char from_ucs4_p02b[][2] = TABLE_FROM_UCS4_PAGE_02B;
  84 static const char from_ucs4_p030[][2] = TABLE_FROM_UCS4_PAGE_030;
  85 static const char from_ucs4_p1ea[][2] = TABLE_FROM_UCS4_PAGE_1EA;
  86 static const char from_ucs4_p200[][2] = TABLE_FROM_UCS4_PAGE_200;
  87 static const char from_ucs4_p211[][2] = TABLE_FROM_UCS4_PAGE_211;
  88 static const char from_ucs4_p266[][2] = TABLE_FROM_UCS4_PAGE_266;
  89 static const char from_ucs4_pfe2[][2] = TABLE_FROM_UCS4_PAGE_FE2;
  90
  91 /* Definitions used in the body of the `gconv' function.  */
  92 #define CHARSET_NAME            "ANSI_Z39.47//"
  93 #define FROM_LOOP               from_ansi_z39_47
  94 #define TO_LOOP                 to_ansi_z39_47
  95 #define DEFINE_INIT             1
  96 #define DEFINE_FINI             1
  97 #define MIN_NEEDED_FROM         1
  98 #define MAX_NEEDED_FROM         2
  99 #define MIN_NEEDED_TO           4
 100
 101 /* First define the conversion function from ANSI_Z39.47 to UCS4.  */
 102 #define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
 103 #define MAX_NEEDED_INPUT        MAX_NEEDED_FROM
 104 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
 105 #define LOOPFCT                 FROM_LOOP
 106 #define BODY \
 107   {                                                                           \
 108     uint32_t ch = *inptr;                                                     \
 109     int incr;                                                                 \
 110                                                                               \
 111     if (__builtin_expect (ch >= 0xe0, 0) && ch <= 0xfe)                       \
 112       {                                                                       \
 113         /* Composed character.  First test whether the next character         \
 114            is also available.  */                                             \
 115         uint32_t ch2;                                                         \
 116                                                                               \
 117         if (inptr + 1 >= inend)                                               \
 118           {                                                                   \
 119             /* The second character is not available.  */                     \
 120             result = __GCONV_INCOMPLETE_INPUT;                                \
 121             break;                                                            \
 122           }                                                                   \
 123                                                                               \
 124         ch2 = inptr[1];                                                       \
 125                                                                               \
 126         if (__builtin_expect (ch2 < 0x20, 0)                                  \
 127             || __builtin_expect (ch2 >= 0x80, 0))                             \
 128           {                                                                   \
 129             /* This is illegal.  */                                           \
 130             if (! ignore_errors_p ())                                         \
 131               {                                                               \
 132                 result = __GCONV_ILLEGAL_INPUT;                               \
 133                 break;                                                        \
 134               }                                                               \
 135                                                                               \
 136             ++*irreversible;                                                  \
 137             incr = 1;                                                         \
 138           }                                                                   \
 139         else                                                                  \
 140           {                                                                   \
 141             uint32_t ch3 = to_ucs4_comb[ch - 0xe0][ch2 - 0x20];               \
 142             if (ch3 != 0) {                                                   \
 143               ch = ch3;                                                       \
 144               incr = 2;                                                       \
 145             }                                                                 \
 146             else {                                                            \
 147               /* mapping for ch2 is an identity, because is ASCII here */     \
 148               put32 (outptr, ch2);                                            \
 149               outptr += 4;                                                    \
 150               ch = to_ucs4[ch - 0x80];                                        \
 151               incr = 2;                                                       \
 152             }                                                                 \
 153           }                                                                   \
 154       }                                                                       \
 155     else                                                                      \
 156       {                                                                       \
 157         if (__builtin_expect (ch >= 0x80, 0))                                 \
 158           ch = to_ucs4[ch - 0x80];                                            \
 159         incr = 1;                                                             \
 160       }                                                                       \
 161                                                                               \
 162     if (__builtin_expect (ch, 1) == 0 && *inptr != '\0')                      \
 163       {                                                                       \
 164         /* This is an illegal character.  */                                  \
 165         if (! ignore_errors_p ())                                             \
 166           {                                                                   \
 167             result = __GCONV_ILLEGAL_INPUT;                                   \
 168             break;                                                            \
 169           }                                                                   \
 170       }                                                                       \
 171     else                                                                      \
 172       {                                                                       \
 173         put32 (outptr, ch);                                                   \
 174         outptr += 4;                                                          \
 175       }                                                                       \
 176                                                                               \
 177     inptr += incr;                                                            \
 178   }
 179 #define LOOP_NEED_FLAGS
 180 #include "loop.c"
 181
 182
 183 /* Next, define the other direction.  */
 184 #define MIN_NEEDED_INPUT        MIN_NEEDED_TO
 185 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_FROM
 186 #define MAX_NEEDED_OUTPUT       MAX_NEEDED_FROM
 187 #define LOOPFCT                 TO_LOOP
 188 #define BODY \
 189   {                                                                           \
 190     char tmp[2];                                                              \
 191     uint32_t ch = get32 (inptr);                                              \
 192     const char *cp;                                                           \
 193                                                                               \
 194     if (__builtin_expect (ch > 0x017e, 0))                                    \
 195       {                                                                       \
 196         if (ch >= 0x1a0 && ch < 0x1b4)                                        \
 197           cp = from_ucs4_p01a[ch - 0x1a0];                                    \
 198         else if (ch >= 0x220 && ch < 0x234)                                   \
 199           cp = from_ucs4_p022[ch - 0x220];                                    \
 200         else if (ch >= 0x2b0 && ch < 0x2e2)                                   \
 201           cp = from_ucs4_p02b[ch - 0x2b0];                                    \
 202         else if (ch >= 0x300 && ch < 0x337)                                   \
 203           cp = from_ucs4_p030[ch - 0x300];                                    \
 204         else if (ch >= 0x1ea0 && ch < 0x1efa)                                 \
 205           cp = from_ucs4_p1ea[ch - 0x1ea0];                                   \
 206         else if (ch >= 0x2000 && ch < 0x200f)                                 \
 207           cp = from_ucs4_p200[ch - 0x2000];                                   \
 208         else if (ch >= 0x2110 && ch < 0x211a)                                 \
 209           cp = from_ucs4_p211[ch - 0x2110];                                   \
 210         else if (ch >= 0x2660 && ch < 0x2674)                                 \
 211           cp = from_ucs4_p266[ch - 0x2660];                                   \
 212         else if (ch >= 0xfe20 && ch < 0xfe25)                                 \
 213           cp = from_ucs4_pfe2[ch - 0xfe20];                                   \
 214         else                                                                  \
 215           {                                                                   \
 216             UNICODE_TAG_HANDLER (ch, 4);                                      \
 217                                                                               \
 218             /* Illegal characters.  */                                        \
 219             STANDARD_ERR_HANDLER (4);                                         \
 220           }                                                                   \
 221       }                                                                       \
 222     else                                                                      \
 223       {                                                                       \
 224         if (__builtin_expect (ch < 0x80, 1)) {                                \
 225           tmp[0] = ch;                                                        \
 226           tmp[1] = '\0';                                                      \
 227           cp = tmp;                                                           \
 228         }                                                                     \
 229         else                                                                  \
 230           cp = from_ucs4[ch-0x80];                                            \
 231         if (__builtin_expect (ch >= 0x20, 1)                                  \
 232             && __builtin_expect (ch < 0x80, 1))                               \
 233         {                                                                     \
 234           /* Check whether the next character is an accent, if so, then */    \
 235           /* output it first */                                               \
 236           uint32_t ch2;                                                       \
 237           inptr += 4;                                                         \
 238           ch2 = get32 (inptr);                                                \
 239           if (ch2 >= 0x300 && ch2 < 0x337) {                                  \
 240             const char* cp2 = from_ucs4_p030[ch2 - 0x300];                    \
 241             if (cp2[0] != '\0') {                                             \
 242               *outptr++ = cp2[0];                                             \
 243             }                                                                 \
 244             else                                                              \
 245               inptr -= 4;                                                     \
 246           }                                                                   \
 247           else if (ch2 >= 0xfe20 && ch2 < 0xfe25) {                           \
 248             const char* cp2 = from_ucs4_pfe2[ch2 - 0xfe20];                   \
 249             if (cp2[0] != '\0') {                                             \
 250               *outptr++ = cp2[0];                                             \
 251             }                                                                 \
 252             else                                                              \
 253               inptr -= 4;                                                     \
 254           }                                                                   \
 255           else                                                                \
 256             inptr -= 4;                                                       \
 257         }                                                                     \
 258       }                                                                       \
 259                                                                               \
 260     if (__builtin_expect (cp[0], '\1') == '\0' && ch != 0)                    \
 261       {                                                                       \
 262         /* Illegal characters.  */                                            \
 263         STANDARD_ERR_HANDLER (4);                                             \
 264       }                                                                       \
 265                                                                               \
 266     *outptr++ = cp[0];                                                        \
 267     /* Now test for a possible second byte and write this if possible.  */    \
 268     if (cp[1] != '\0')                                                        \
 269       {                                                                       \
 270         if (__builtin_expect (outptr >= outend, 0))                           \
 271           {                                                                   \
 272             /* The result does not fit into the buffer.  */                   \
 273             --outptr;                                                         \
 274             result = __GCONV_FULL_OUTPUT;                                     \
 275             break;                                                            \
 276           }                                                                   \
 277                                                                               \
 278         *outptr++ = cp[1];                                                    \
 279       }                                                                       \
 280                                                                               \
 281     inptr += 4;                                                               \
 282   }
 283 #define LOOP_NEED_FLAGS
 284 #include "loop.c"
 285
 286
 287 /* Now define the toplevel functions.  */
 288 #include "skeleton.c"