Moved from ansel directory.
[gedcom-parse.git] / iconv / glibc / ANSI_Z39.47.c
1 /* Conversion for ANSI_Z39.47 aka ANSEL.
2    Copyright (C) 2001 The Genes Development Team
3    This file is part of the Gedcom parser library.
4    Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
5
6    The Gedcom parser library is free software; you can redistribute it
7    and/or modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10
11    The Gedcom parser library is distributed in the hope that it will be
12    useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15
16    You should have received a copy of the GNU Lesser General Public
17    License along with the Gedcom parser library; if not, write to the
18    Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20
21 /* $Id$ */
22 /* $Name$ */
23
24 /* Generic conversion to and from ANSI Z39.47 (also known as ANSEL)
25    Based on the ansi_x3.110.c file from the glibc sources
26    Data coming from:
27    http://lcweb.loc.gov/marc/specifications/speccharlatin.html
28
29    Note: in ANSEL, diacritical marks come *before* the base character;
30    in Unicode, they come *after*...
31 */
32
33 #include <dlfcn.h>
34 #include <gconv.h>
35 #include <stdint.h>
36 #include <string.h>
37 #include "ANSI_Z39.47-tables.h"
38
39 /* Omit first half of table: assume identity mapping (ASCII) */
40 static const uint32_t to_ucs4[128] = TABLE_TO_UCS4_BASIC;
41
42 /* The outer array range runs from 0xe0 to 0xfe, the inner range from 0x20
43    to 0x7f.  */
44 static const uint32_t to_ucs4_comb[31][96] =
45 {
46   /* 0xe0 (hook above) */                           TABLE_TO_UCS4_COMBINING_E0,
47   /* 0xe1 (grave) */                                TABLE_TO_UCS4_COMBINING_E1,
48   /* 0xe2 (acute) */                                TABLE_TO_UCS4_COMBINING_E2,
49   /* 0xe3 (circumflex) */                           TABLE_TO_UCS4_COMBINING_E3,
50   /* 0xe4 (tilde) */                                TABLE_TO_UCS4_COMBINING_E4,
51   /* 0xe5 (macron) */                               TABLE_TO_UCS4_COMBINING_E5,
52   /* 0xe6 (breve) */                                TABLE_TO_UCS4_COMBINING_E6,
53   /* 0xe7 (dot above) */                            TABLE_TO_UCS4_COMBINING_E7,
54   /* 0xe8 (umlaut, diaeresis) */                    TABLE_TO_UCS4_COMBINING_E8,
55   /* 0xe9 (caron, hacek) */                         TABLE_TO_UCS4_COMBINING_E9,
56   /* 0xea (ring above) */                           TABLE_TO_UCS4_COMBINING_EA,
57   /* 0xeb (ligature, left half) */                  TABLE_TO_UCS4_COMBINING_EB,
58   /* 0xec (ligature, right half) */                 TABLE_TO_UCS4_COMBINING_EC,
59   /* 0xed (comma above right) */                    TABLE_TO_UCS4_COMBINING_ED,
60   /* 0xee (double acute) */                         TABLE_TO_UCS4_COMBINING_EE,
61   /* 0xef (candrabindu) */                          TABLE_TO_UCS4_COMBINING_EF,
62   /* 0xf0 (cedilla) */                              TABLE_TO_UCS4_COMBINING_F0,
63   /* 0xf1 (ogonek, right hook) */                   TABLE_TO_UCS4_COMBINING_F1,
64   /* 0xf2 (dot below) */                            TABLE_TO_UCS4_COMBINING_F2,
65   /* 0xf3 (double dot below) */                     TABLE_TO_UCS4_COMBINING_F3,
66   /* 0xf4 (ring below) */                           TABLE_TO_UCS4_COMBINING_F4,
67   /* 0xf5 (double low line) */                      TABLE_TO_UCS4_COMBINING_F5,
68   /* 0xf6 (line below) */                           TABLE_TO_UCS4_COMBINING_F6,
69   /* 0xf7 (comma below, left hook) */               TABLE_TO_UCS4_COMBINING_F7,
70   /* 0xf8 (left half ring below, right cedilla) */  TABLE_TO_UCS4_COMBINING_F8,
71   /* 0xf9 (breve below, half circle below) */       TABLE_TO_UCS4_COMBINING_F9,
72   /* 0xfa (double tilde, left half) */              TABLE_TO_UCS4_COMBINING_FA,
73   /* 0xfb (double tilde, right half) */             TABLE_TO_UCS4_COMBINING_FB,
74   /* 0xfc */                                        TABLE_TO_UCS4_COMBINING_FC,
75   /* 0xfd */                                        TABLE_TO_UCS4_COMBINING_FD,
76   /* 0xfe (comma above, high centered comma) */     TABLE_TO_UCS4_COMBINING_FE,
77 };
78
79 /* Omit first part of table: assume identity mapping (ASCII) */
80 static const char from_ucs4[][2] =      TABLE_FROM_UCS4_BASIC;
81 static const char from_ucs4_p01a[][2] = TABLE_FROM_UCS4_PAGE_01A;
82 static const char from_ucs4_p022[][2] = TABLE_FROM_UCS4_PAGE_022;
83 static const char from_ucs4_p02b[][2] = TABLE_FROM_UCS4_PAGE_02B;
84 static const char from_ucs4_p030[][2] = TABLE_FROM_UCS4_PAGE_030;
85 static const char from_ucs4_p1ea[][2] = TABLE_FROM_UCS4_PAGE_1EA;
86 static const char from_ucs4_p200[][2] = TABLE_FROM_UCS4_PAGE_200;
87 static const char from_ucs4_p211[][2] = TABLE_FROM_UCS4_PAGE_211;
88 static const char from_ucs4_p266[][2] = TABLE_FROM_UCS4_PAGE_266;
89 static const char from_ucs4_pfe2[][2] = TABLE_FROM_UCS4_PAGE_FE2;
90
/* Parameters picked up by the generic `gconv' function body.  */

/* Character sizes.  An ANSEL character is one byte, or two when a
   combining byte precedes its base byte; the other side is UCS4, four
   bytes per character.  */
#define MIN_NEEDED_FROM         1
#define MAX_NEEDED_FROM         2
#define MIN_NEEDED_TO           4

/* Names of the charset and of the two conversion loops defined below.  */
#define CHARSET_NAME            "ANSI_Z39.47//"
#define FROM_LOOP               from_ansi_z39_47
#define TO_LOOP                 to_ansi_z39_47

#define DEFINE_INIT             1
#define DEFINE_FINI             1

100
101 /* First define the conversion function from ANSI_Z39.47 to UCS4.  */
102 #define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
103 #define MAX_NEEDED_INPUT        MAX_NEEDED_FROM
104 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
105 #define LOOPFCT                 FROM_LOOP
106 #define BODY \
107   {                                                                           \
108     uint32_t ch = *inptr;                                                     \
109     int incr;                                                                 \
110                                                                               \
111     if (__builtin_expect (ch >= 0xe0, 0) && ch <= 0xfe)                       \
112       {                                                                       \
113         /* Composed character.  First test whether the next character         \
114            is also available.  */                                             \
115         uint32_t ch2;                                                         \
116                                                                               \
117         if (inptr + 1 >= inend)                                               \
118           {                                                                   \
119             /* The second character is not available.  */                     \
120             result = __GCONV_INCOMPLETE_INPUT;                                \
121             break;                                                            \
122           }                                                                   \
123                                                                               \
124         ch2 = inptr[1];                                                       \
125                                                                               \
126         if (__builtin_expect (ch2 < 0x20, 0)                                  \
127             || __builtin_expect (ch2 >= 0x80, 0))                             \
128           {                                                                   \
129             /* This is illegal.  */                                           \
130             if (! ignore_errors_p ())                                         \
131               {                                                               \
132                 result = __GCONV_ILLEGAL_INPUT;                               \
133                 break;                                                        \
134               }                                                               \
135                                                                               \
136             ++*irreversible;                                                  \
137             incr = 1;                                                         \
138           }                                                                   \
139         else                                                                  \
140           {                                                                   \
141             uint32_t ch3 = to_ucs4_comb[ch - 0xe0][ch2 - 0x20];               \
142             if (ch3 != 0) {                                                   \
143               ch = ch3;                                                       \
144               incr = 2;                                                       \
145             }                                                                 \
146             else {                                                            \
147               /* mapping for ch2 is an identity, because is ASCII here */     \
148               put32 (outptr, ch2);                                            \
149               outptr += 4;                                                    \
150               ch = to_ucs4[ch - 0x80];                                        \
151               incr = 2;                                                       \
152             }                                                                 \
153           }                                                                   \
154       }                                                                       \
155     else                                                                      \
156       {                                                                       \
157         if (__builtin_expect (ch >= 0x80, 0))                                 \
158           ch = to_ucs4[ch - 0x80];                                            \
159         incr = 1;                                                             \
160       }                                                                       \
161                                                                               \
162     if (__builtin_expect (ch, 1) == 0 && *inptr != '\0')                      \
163       {                                                                       \
164         /* This is an illegal character.  */                                  \
165         if (! ignore_errors_p ())                                             \
166           {                                                                   \
167             result = __GCONV_ILLEGAL_INPUT;                                   \
168             break;                                                            \
169           }                                                                   \
170       }                                                                       \
171     else                                                                      \
172       {                                                                       \
173         put32 (outptr, ch);                                                   \
174         outptr += 4;                                                          \
175       }                                                                       \
176                                                                               \
177     inptr += incr;                                                            \
178   }
179 #define LOOP_NEED_FLAGS
180 #include "loop.c"
181
182
183 /* Next, define the other direction.  */
184 #define MIN_NEEDED_INPUT        MIN_NEEDED_TO
185 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_FROM
186 #define MAX_NEEDED_OUTPUT       MAX_NEEDED_FROM
187 #define LOOPFCT                 TO_LOOP
188 #define BODY \
189   {                                                                           \
190     char tmp[2];                                                              \
191     uint32_t ch = get32 (inptr);                                              \
192     const char *cp;                                                           \
193                                                                               \
194     if (__builtin_expect (ch > 0x017e, 0))                                    \
195       {                                                                       \
196         if (ch >= 0x1a0 && ch < 0x1b4)                                        \
197           cp = from_ucs4_p01a[ch - 0x1a0];                                    \
198         else if (ch >= 0x220 && ch < 0x234)                                   \
199           cp = from_ucs4_p022[ch - 0x220];                                    \
200         else if (ch >= 0x2b0 && ch < 0x2e2)                                   \
201           cp = from_ucs4_p02b[ch - 0x2b0];                                    \
202         else if (ch >= 0x300 && ch < 0x337)                                   \
203           cp = from_ucs4_p030[ch - 0x300];                                    \
204         else if (ch >= 0x1ea0 && ch < 0x1efa)                                 \
205           cp = from_ucs4_p1ea[ch - 0x1ea0];                                   \
206         else if (ch >= 0x2000 && ch < 0x200f)                                 \
207           cp = from_ucs4_p200[ch - 0x2000];                                   \
208         else if (ch >= 0x2110 && ch < 0x211a)                                 \
209           cp = from_ucs4_p211[ch - 0x2110];                                   \
210         else if (ch >= 0x2660 && ch < 0x2674)                                 \
211           cp = from_ucs4_p266[ch - 0x2660];                                   \
212         else if (ch >= 0xfe20 && ch < 0xfe25)                                 \
213           cp = from_ucs4_pfe2[ch - 0xfe20];                                   \
214         else                                                                  \
215           {                                                                   \
216             UNICODE_TAG_HANDLER (ch, 4);                                      \
217                                                                               \
218             /* Illegal characters.  */                                        \
219             STANDARD_ERR_HANDLER (4);                                         \
220           }                                                                   \
221       }                                                                       \
222     else                                                                      \
223       {                                                                       \
224         if (__builtin_expect (ch < 0x80, 1)) {                                \
225           tmp[0] = ch;                                                        \
226           tmp[1] = '\0';                                                      \
227           cp = tmp;                                                           \
228         }                                                                     \
229         else                                                                  \
230           cp = from_ucs4[ch-0x80];                                            \
231         if (__builtin_expect (ch >= 0x20, 1)                                  \
232             && __builtin_expect (ch < 0x80, 1))                               \
233         {                                                                     \
234           /* Check whether the next character is an accent, if so, then */    \
235           /* output it first */                                               \
236           uint32_t ch2;                                                       \
237           inptr += 4;                                                         \
238           ch2 = get32 (inptr);                                                \
239           if (ch2 >= 0x300 && ch2 < 0x337) {                                  \
240             const char* cp2 = from_ucs4_p030[ch2 - 0x300];                    \
241             if (cp2[0] != '\0') {                                             \
242               *outptr++ = cp2[0];                                             \
243             }                                                                 \
244             else                                                              \
245               inptr -= 4;                                                     \
246           }                                                                   \
247           else if (ch2 >= 0xfe20 && ch2 < 0xfe25) {                           \
248             const char* cp2 = from_ucs4_pfe2[ch2 - 0xfe20];                   \
249             if (cp2[0] != '\0') {                                             \
250               *outptr++ = cp2[0];                                             \
251             }                                                                 \
252             else                                                              \
253               inptr -= 4;                                                     \
254           }                                                                   \
255           else                                                                \
256             inptr -= 4;                                                       \
257         }                                                                     \
258       }                                                                       \
259                                                                               \
260     if (__builtin_expect (cp[0], '\1') == '\0' && ch != 0)                    \
261       {                                                                       \
262         /* Illegal characters.  */                                            \
263         STANDARD_ERR_HANDLER (4);                                             \
264       }                                                                       \
265                                                                               \
266     *outptr++ = cp[0];                                                        \
267     /* Now test for a possible second byte and write this if possible.  */    \
268     if (cp[1] != '\0')                                                        \
269       {                                                                       \
270         if (__builtin_expect (outptr >= outend, 0))                           \
271           {                                                                   \
272             /* The result does not fit into the buffer.  */                   \
273             --outptr;                                                         \
274             result = __GCONV_FULL_OUTPUT;                                     \
275             break;                                                            \
276           }                                                                   \
277                                                                               \
278         *outptr++ = cp[1];                                                    \
279       }                                                                       \
280                                                                               \
281     inptr += 4;                                                               \
282   }
283 #define LOOP_NEED_FLAGS
284 #include "loop.c"
285
286
287 /* Now define the toplevel functions.  */
288 #include "skeleton.c"