iconv/glibc/ANSI_Z39.47.c

   1 /* Conversion for ANSI_Z39.47 aka ANSEL.
   2    Copyright (C) 2001 The Genes Development Team
   3    This file is part of the Gedcom parser library.
   4    Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
   5
   6    The Gedcom parser library is free software; you can redistribute it
   7    and/or modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The Gedcom parser library is distributed in the hope that it will be
  12    useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the Gedcom parser library; if not, write to the
  18    Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 /* $Id$ */
  22 /* $Name$ */
  23
  24 /* Generic conversion to and from ANSI Z39.47 (also known as ANSEL)
  25    Based on the ansi_x3.110.c file from the glibc sources
  26    Data coming from:
  27    http://lcweb.loc.gov/marc/specifications/speccharlatin.html
  28
  29    Note: in ANSEL, diacritical marks come *before* the base character;
  30    in Unicode, they come *after*...
  31 */
  32
  33 #include <dlfcn.h>
  34 #include <gconv.h>
  35 #include <stdint.h>
  36 #include <string.h>
  37 #include "ANSI_Z39.47-tables.h"
  38
  39 /* From /usr/include/linux/compiler.h out of GCC 2.96+: */
  40 /* Somewhere in the middle of the GCC 2.96 development cycle, we implemented
  41    a mechanism by which the user can annotate likely branch directions and
  42    expect the blocks to be reordered appropriately.  Define __builtin_expect
  43    to nothing for earlier compilers.  */
  44
  45 #if __GNUC__ == 2 && __GNUC_MINOR__ < 96
  46 #define __builtin_expect(x, expected_value) (x)
  47 #endif
  48
  49 /* Omit first half of table: assume identity mapping (ASCII) */
  50 static const uint32_t to_ucs4[128] = TABLE_TO_UCS4_BASIC;
  51
  52 /* The outer array range runs from 0xe0 to 0xfe, the inner range from 0x20
  53    to 0x7f.  */
  54 static const uint32_t to_ucs4_comb[31][96] =
  55 {
  56   /* 0xe0 (hook above) */                           TABLE_TO_UCS4_COMBINING_E0,
  57   /* 0xe1 (grave) */                                TABLE_TO_UCS4_COMBINING_E1,
  58   /* 0xe2 (acute) */                                TABLE_TO_UCS4_COMBINING_E2,
  59   /* 0xe3 (circumflex) */                           TABLE_TO_UCS4_COMBINING_E3,
  60   /* 0xe4 (tilde) */                                TABLE_TO_UCS4_COMBINING_E4,
  61   /* 0xe5 (macron) */                               TABLE_TO_UCS4_COMBINING_E5,
  62   /* 0xe6 (breve) */                                TABLE_TO_UCS4_COMBINING_E6,
  63   /* 0xe7 (dot above) */                            TABLE_TO_UCS4_COMBINING_E7,
  64   /* 0xe8 (umlaut, diaeresis) */                    TABLE_TO_UCS4_COMBINING_E8,
  65   /* 0xe9 (caron, hacek) */                         TABLE_TO_UCS4_COMBINING_E9,
  66   /* 0xea (ring above) */                           TABLE_TO_UCS4_COMBINING_EA,
  67   /* 0xeb (ligature, left half) */                  TABLE_TO_UCS4_COMBINING_EB,
  68   /* 0xec (ligature, right half) */                 TABLE_TO_UCS4_COMBINING_EC,
  69   /* 0xed (comma above right) */                    TABLE_TO_UCS4_COMBINING_ED,
  70   /* 0xee (double acute) */                         TABLE_TO_UCS4_COMBINING_EE,
  71   /* 0xef (candrabindu) */                          TABLE_TO_UCS4_COMBINING_EF,
  72   /* 0xf0 (cedilla) */                              TABLE_TO_UCS4_COMBINING_F0,
  73   /* 0xf1 (ogonek, right hook) */                   TABLE_TO_UCS4_COMBINING_F1,
  74   /* 0xf2 (dot below) */                            TABLE_TO_UCS4_COMBINING_F2,
  75   /* 0xf3 (double dot below) */                     TABLE_TO_UCS4_COMBINING_F3,
  76   /* 0xf4 (ring below) */                           TABLE_TO_UCS4_COMBINING_F4,
  77   /* 0xf5 (double low line) */                      TABLE_TO_UCS4_COMBINING_F5,
  78   /* 0xf6 (line below) */                           TABLE_TO_UCS4_COMBINING_F6,
  79   /* 0xf7 (comma below, left hook) */               TABLE_TO_UCS4_COMBINING_F7,
  80   /* 0xf8 (left half ring below, right cedilla) */  TABLE_TO_UCS4_COMBINING_F8,
  81   /* 0xf9 (breve below, half circle below) */       TABLE_TO_UCS4_COMBINING_F9,
  82   /* 0xfa (double tilde, left half) */              TABLE_TO_UCS4_COMBINING_FA,
  83   /* 0xfb (double tilde, right half) */             TABLE_TO_UCS4_COMBINING_FB,
  84   /* 0xfc */                                        TABLE_TO_UCS4_COMBINING_FC,
  85   /* 0xfd */                                        TABLE_TO_UCS4_COMBINING_FD,
  86   /* 0xfe (comma above, high centered comma) */     TABLE_TO_UCS4_COMBINING_FE,
  87 };
  88
  89 /* Omit first part of table: assume identity mapping (ASCII) */
  90 static const char from_ucs4[][2] =      TABLE_FROM_UCS4_BASIC;
  91 static const char from_ucs4_p01a[][2] = TABLE_FROM_UCS4_PAGE_01A;
  92 static const char from_ucs4_p022[][2] = TABLE_FROM_UCS4_PAGE_022;
  93 static const char from_ucs4_p02b[][2] = TABLE_FROM_UCS4_PAGE_02B;
  94 static const char from_ucs4_p030[][2] = TABLE_FROM_UCS4_PAGE_030;
  95 static const char from_ucs4_p1ea[][2] = TABLE_FROM_UCS4_PAGE_1EA;
  96 static const char from_ucs4_p200[][2] = TABLE_FROM_UCS4_PAGE_200;
  97 static const char from_ucs4_p211[][2] = TABLE_FROM_UCS4_PAGE_211;
  98 static const char from_ucs4_p266[][2] = TABLE_FROM_UCS4_PAGE_266;
  99 static const char from_ucs4_pfe2[][2] = TABLE_FROM_UCS4_PAGE_FE2;
 100
 101 /* Definitions used in the body of the `gconv' function.  */
 102 #define CHARSET_NAME            "ANSI_Z39.47//"
 103 #define FROM_LOOP               from_ansi_z39_47
 104 #define TO_LOOP                 to_ansi_z39_47
 105 #define DEFINE_INIT             1
 106 #define DEFINE_FINI             1
 107 #define MIN_NEEDED_FROM         1
 108 #define MAX_NEEDED_FROM         2
 109 #define MIN_NEEDED_TO           4
 110
 111 /* First define the conversion function from ANSI_Z39.47 to UCS4.  */
 112 #define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
 113 #define MAX_NEEDED_INPUT        MAX_NEEDED_FROM
 114 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
 115 #define LOOPFCT                 FROM_LOOP
 116 #define BODY \
 117   {                                                                           \
 118     uint32_t ch = *inptr;                                                     \
 119     int incr;                                                                 \
 120                                                                               \
 121     if (__builtin_expect (ch >= 0xe0, 0) && ch <= 0xfe)                       \
 122       {                                                                       \
 123         /* Composed character.  First test whether the next character         \
 124            is also available.  */                                             \
 125         uint32_t ch2;                                                         \
 126                                                                               \
 127         if (inptr + 1 >= inend)                                               \
 128           {                                                                   \
 129             /* The second character is not available.  */                     \
 130             result = __GCONV_INCOMPLETE_INPUT;                                \
 131             break;                                                            \
 132           }                                                                   \
 133                                                                               \
 134         ch2 = inptr[1];                                                       \
 135                                                                               \
 136         if (__builtin_expect (ch2 < 0x20, 0)                                  \
 137             || __builtin_expect (ch2 >= 0x80, 0))                             \
 138           {                                                                   \
 139             /* This is illegal.  */                                           \
 140             if (! ignore_errors_p ())                                         \
 141               {                                                               \
 142                 result = __GCONV_ILLEGAL_INPUT;                               \
 143                 break;                                                        \
 144               }                                                               \
 145                                                                               \
 146             ++*irreversible;                                                  \
 147             incr = 1;                                                         \
 148           }                                                                   \
 149         else                                                                  \
 150           {                                                                   \
 151             uint32_t ch3 = to_ucs4_comb[ch - 0xe0][ch2 - 0x20];               \
 152             if (ch3 != 0) {                                                   \
 153               ch = ch3;                                                       \
 154               incr = 2;                                                       \
 155             }                                                                 \
 156             else {                                                            \
 157               /* mapping for ch2 is an identity, because is ASCII here */     \
 158               put32 (outptr, ch2);                                            \
 159               outptr += 4;                                                    \
 160               ch = to_ucs4[ch - 0x80];                                        \
 161               incr = 2;                                                       \
 162             }                                                                 \
 163           }                                                                   \
 164       }                                                                       \
 165     else                                                                      \
 166       {                                                                       \
 167         if (__builtin_expect (ch >= 0x80, 0))                                 \
 168           ch = to_ucs4[ch - 0x80];                                            \
 169         incr = 1;                                                             \
 170       }                                                                       \
 171                                                                               \
 172     if (__builtin_expect (ch, 1) == 0 && *inptr != '\0')                      \
 173       {                                                                       \
 174         /* This is an illegal character.  */                                  \
 175         if (! ignore_errors_p ())                                             \
 176           {                                                                   \
 177             result = __GCONV_ILLEGAL_INPUT;                                   \
 178             break;                                                            \
 179           }                                                                   \
 180       }                                                                       \
 181     else                                                                      \
 182       {                                                                       \
 183         put32 (outptr, ch);                                                   \
 184         outptr += 4;                                                          \
 185       }                                                                       \
 186                                                                               \
 187     inptr += incr;                                                            \
 188   }
 189 #define LOOP_NEED_FLAGS
 190 #include "loop.c"
 191
 192
 193 /* Next, define the other direction.  */
 194 #define MIN_NEEDED_INPUT        MIN_NEEDED_TO
 195 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_FROM
 196 #define MAX_NEEDED_OUTPUT       MAX_NEEDED_FROM
 197 #define LOOPFCT                 TO_LOOP
 198 #define BODY \
 199   {                                                                           \
 200     char tmp[2];                                                              \
 201     uint32_t ch = get32 (inptr);                                              \
 202     const char *cp;                                                           \
 203                                                                               \
 204     if (__builtin_expect (ch > 0x017e, 0))                                    \
 205       {                                                                       \
 206         if (ch >= 0x1a0 && ch < 0x1b4)                                        \
 207           cp = from_ucs4_p01a[ch - 0x1a0];                                    \
 208         else if (ch >= 0x220 && ch < 0x234)                                   \
 209           cp = from_ucs4_p022[ch - 0x220];                                    \
 210         else if (ch >= 0x2b0 && ch < 0x2e2)                                   \
 211           cp = from_ucs4_p02b[ch - 0x2b0];                                    \
 212         else if (ch >= 0x300 && ch < 0x337)                                   \
 213           cp = from_ucs4_p030[ch - 0x300];                                    \
 214         else if (ch >= 0x1ea0 && ch < 0x1efa)                                 \
 215           cp = from_ucs4_p1ea[ch - 0x1ea0];                                   \
 216         else if (ch >= 0x2000 && ch < 0x200f)                                 \
 217           cp = from_ucs4_p200[ch - 0x2000];                                   \
 218         else if (ch >= 0x2110 && ch < 0x211a)                                 \
 219           cp = from_ucs4_p211[ch - 0x2110];                                   \
 220         else if (ch >= 0x2660 && ch < 0x2674)                                 \
 221           cp = from_ucs4_p266[ch - 0x2660];                                   \
 222         else if (ch >= 0xfe20 && ch < 0xfe25)                                 \
 223           cp = from_ucs4_pfe2[ch - 0xfe20];                                   \
 224         else                                                                  \
 225           {                                                                   \
 226             UNICODE_TAG_HANDLER (ch, 4);                                      \
 227                                                                               \
 228             /* Illegal characters.  */                                        \
 229             STANDARD_ERR_HANDLER (4);                                         \
 230           }                                                                   \
 231       }                                                                       \
 232     else                                                                      \
 233       {                                                                       \
 234         if (__builtin_expect (ch < 0x80, 1)) {                                \
 235           tmp[0] = ch;                                                        \
 236           tmp[1] = '\0';                                                      \
 237           cp = tmp;                                                           \
 238         }                                                                     \
 239         else                                                                  \
 240           cp = from_ucs4[ch-0x80];                                            \
 241         if (__builtin_expect (ch >= 0x20, 1)                                  \
 242             && __builtin_expect (ch < 0x80, 1))                               \
 243         {                                                                     \
 244           /* Check whether the next character is an accent, if so, then */    \
 245           /* output it first */                                               \
 246           uint32_t ch2;                                                       \
 247           inptr += 4;                                                         \
 248           ch2 = get32 (inptr);                                                \
 249           if (ch2 >= 0x300 && ch2 < 0x337) {                                  \
 250             const char* cp2 = from_ucs4_p030[ch2 - 0x300];                    \
 251             if (cp2[0] != '\0') {                                             \
 252               *outptr++ = cp2[0];                                             \
 253             }                                                                 \
 254             else                                                              \
 255               inptr -= 4;                                                     \
 256           }                                                                   \
 257           else if (ch2 >= 0xfe20 && ch2 < 0xfe25) {                           \
 258             const char* cp2 = from_ucs4_pfe2[ch2 - 0xfe20];                   \
 259             if (cp2[0] != '\0') {                                             \
 260               *outptr++ = cp2[0];                                             \
 261             }                                                                 \
 262             else                                                              \
 263               inptr -= 4;                                                     \
 264           }                                                                   \
 265           else                                                                \
 266             inptr -= 4;                                                       \
 267         }                                                                     \
 268       }                                                                       \
 269                                                                               \
 270     if (__builtin_expect (cp[0], '\1') == '\0' && ch != 0)                    \
 271       {                                                                       \
 272         /* Illegal characters.  */                                            \
 273         STANDARD_ERR_HANDLER (4);                                             \
 274       }                                                                       \
 275                                                                               \
 276     *outptr++ = cp[0];                                                        \
 277     /* Now test for a possible second byte and write this if possible.  */    \
 278     if (cp[1] != '\0')                                                        \
 279       {                                                                       \
 280         if (__builtin_expect (outptr >= outend, 0))                           \
 281           {                                                                   \
 282             /* The result does not fit into the buffer.  */                   \
 283             --outptr;                                                         \
 284             result = __GCONV_FULL_OUTPUT;                                     \
 285             break;                                                            \
 286           }                                                                   \
 287                                                                               \
 288         *outptr++ = cp[1];                                                    \
 289       }                                                                       \
 290                                                                               \
 291     inptr += 4;                                                               \
 292   }
 293 #define LOOP_NEED_FLAGS
 294 #include "loop.c"
 295
 296
 297 /* Now define the toplevel functions.  */
 298 #include "skeleton.c"