1 /* Conversion for ANSI_Z39.47 aka ANSEL.
2 Copyright (C) 2001 The Genes Development Team
3 This file is part of the Gedcom parser library.
4 Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
6 The Gedcom parser library is free software; you can redistribute it
7 and/or modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The Gedcom parser library is distributed in the hope that it will be
12 useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the Gedcom parser library; if not, write to the
18 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA. */
24 /* Generic conversion to and from ANSI Z39.47 (also known as ANSEL).
25 Based on the ansi_x3.110.c file from the glibc sources.
27 Character set reference: http://lcweb.loc.gov/marc/specifications/speccharlatin.html
29 Note: in ANSEL, diacritical marks come *before* the base character;
30 in Unicode, they come *after* it. */
37 #include "ANSI_Z39.47-tables.h"
39 /* Omit first half of table: assume identity mapping (ASCII).
   Only the upper 128 byte values are stored, so the table is indexed with
   the input byte minus 0x80 (see the to_ucs4[ch - 0x80] lookups in the
   conversion loop).  The loop treats an entry of 0 as "no mapping" unless
   the input byte itself is NUL. */
40 static const uint32_t to_ucs4[128] = TABLE_TO_UCS4_BASIC;
42 /* The outer array range runs from 0xe0 to 0xfe, the inner range from 0x20
   to 0x7f. */
/* Lookup is to_ucs4_comb[combining byte - 0xe0][following byte - 0x20]:
   one row per combining byte listed below, 96 columns per row. */
44 static const uint32_t to_ucs4_comb[31][96] =
46 /* 0xe0 (hook above) */ TABLE_TO_UCS4_COMBINING_E0,
47 /* 0xe1 (grave) */ TABLE_TO_UCS4_COMBINING_E1,
48 /* 0xe2 (acute) */ TABLE_TO_UCS4_COMBINING_E2,
49 /* 0xe3 (circumflex) */ TABLE_TO_UCS4_COMBINING_E3,
50 /* 0xe4 (tilde) */ TABLE_TO_UCS4_COMBINING_E4,
51 /* 0xe5 (macron) */ TABLE_TO_UCS4_COMBINING_E5,
52 /* 0xe6 (breve) */ TABLE_TO_UCS4_COMBINING_E6,
53 /* 0xe7 (dot above) */ TABLE_TO_UCS4_COMBINING_E7,
54 /* 0xe8 (umlaut, diaeresis) */ TABLE_TO_UCS4_COMBINING_E8,
55 /* 0xe9 (caron, hacek) */ TABLE_TO_UCS4_COMBINING_E9,
56 /* 0xea (ring above) */ TABLE_TO_UCS4_COMBINING_EA,
57 /* 0xeb (ligature, left half) */ TABLE_TO_UCS4_COMBINING_EB,
58 /* 0xec (ligature, right half) */ TABLE_TO_UCS4_COMBINING_EC,
59 /* 0xed (comma above right) */ TABLE_TO_UCS4_COMBINING_ED,
60 /* 0xee (double acute) */ TABLE_TO_UCS4_COMBINING_EE,
61 /* 0xef (candrabindu) */ TABLE_TO_UCS4_COMBINING_EF,
62 /* 0xf0 (cedilla) */ TABLE_TO_UCS4_COMBINING_F0,
63 /* 0xf1 (ogonek, right hook) */ TABLE_TO_UCS4_COMBINING_F1,
64 /* 0xf2 (dot below) */ TABLE_TO_UCS4_COMBINING_F2,
65 /* 0xf3 (double dot below) */ TABLE_TO_UCS4_COMBINING_F3,
66 /* 0xf4 (ring below) */ TABLE_TO_UCS4_COMBINING_F4,
67 /* 0xf5 (double low line) */ TABLE_TO_UCS4_COMBINING_F5,
68 /* 0xf6 (line below) */ TABLE_TO_UCS4_COMBINING_F6,
69 /* 0xf7 (comma below, left hook) */ TABLE_TO_UCS4_COMBINING_F7,
70 /* 0xf8 (left half ring below, right cedilla) */ TABLE_TO_UCS4_COMBINING_F8,
71 /* 0xf9 (breve below, half circle below) */ TABLE_TO_UCS4_COMBINING_F9,
72 /* 0xfa (double tilde, left half) */ TABLE_TO_UCS4_COMBINING_FA,
73 /* 0xfb (double tilde, right half) */ TABLE_TO_UCS4_COMBINING_FB,
74 /* 0xfc (NOTE(review): no name given; presumably unassigned in ANSEL -- confirm) */ TABLE_TO_UCS4_COMBINING_FC,
75 /* 0xfd (NOTE(review): no name given; presumably unassigned in ANSEL -- confirm) */ TABLE_TO_UCS4_COMBINING_FD,
76 /* 0xfe (comma above, high centered comma) */ TABLE_TO_UCS4_COMBINING_FE,
79 /* Omit first part of table: assume identity mapping (ASCII).
   from_ucs4 is indexed with (ch - 0x80).  Each page table below is indexed
   with (ch - first code point of the range it is used for), e.g.
   from_ucs4_p01a with (ch - 0x1a0) and from_ucs4_pfe2 with (ch - 0xfe20).
   Every entry holds up to two output bytes; the conversion loop treats a
   first byte of '\0' as "no ANSEL equivalent" for any character other
   than U+0000. */
80 static const char from_ucs4[][2] = TABLE_FROM_UCS4_BASIC;
81 static const char from_ucs4_p01a[][2] = TABLE_FROM_UCS4_PAGE_01A;
82 static const char from_ucs4_p022[][2] = TABLE_FROM_UCS4_PAGE_022;
83 static const char from_ucs4_p02b[][2] = TABLE_FROM_UCS4_PAGE_02B;
84 static const char from_ucs4_p030[][2] = TABLE_FROM_UCS4_PAGE_030;
85 static const char from_ucs4_p1ea[][2] = TABLE_FROM_UCS4_PAGE_1EA;
86 static const char from_ucs4_p200[][2] = TABLE_FROM_UCS4_PAGE_200;
87 static const char from_ucs4_p211[][2] = TABLE_FROM_UCS4_PAGE_211;
88 static const char from_ucs4_p266[][2] = TABLE_FROM_UCS4_PAGE_266;
89 static const char from_ucs4_pfe2[][2] = TABLE_FROM_UCS4_PAGE_FE2;
91 /* Definitions used in the body of the `gconv' function. */
92 #define CHARSET_NAME "ANSI_Z39.47//"
93 #define FROM_LOOP from_ansi_z39_47
94 #define TO_LOOP to_ansi_z39_47
/* On the ANSEL side a character takes one byte, or two when a combining
   byte (0xe0..0xfe) precedes the byte it applies to. */
97 #define MIN_NEEDED_FROM 1
98 #define MAX_NEEDED_FROM 2
/* On the UCS4 side characters are read with get32 and written with put32;
   the size used for them throughout this file is 4 bytes. */
99 #define MIN_NEEDED_TO 4
101 /* First define the conversion function from ANSI_Z39.47 to UCS4.
   Each step looks at one ANSEL byte.  A byte in 0xe0..0xfe is a combining
   byte and needs a second byte in 0x20..0x7f after it; the pair is looked
   up in to_ucs4_comb.  Any other byte >= 0x80 is mapped through to_ucs4,
   and bytes below 0x80 are passed through unchanged.  Results are written
   with put32.
   NOTE(review): the test made on ch3 and the pointer updates are not shown
   in the lines below; confirm them before relying on the comments here. */
102 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
103 #define MAX_NEEDED_INPUT MAX_NEEDED_FROM
104 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
105 #define LOOPFCT FROM_LOOP
108 uint32_t ch = *inptr; /* current ANSEL input byte */ \
111 if (__builtin_expect (ch >= 0xe0, 0) && ch <= 0xfe) /* combining byte range */ \
113 /* Composed character. First test whether the next character \
114 is also available. */ \
117 if (inptr + 1 >= inend) \
119 /* The second character is not available. */ \
120 result = __GCONV_INCOMPLETE_INPUT; \
126 if (__builtin_expect (ch2 < 0x20, 0) /* ch2 must fit the 0x20..0x7f column range */ \
127 || __builtin_expect (ch2 >= 0x80, 0)) \
129 /* This is illegal. */ \
130 if (! ignore_errors_p ()) \
132 result = __GCONV_ILLEGAL_INPUT; \
141 uint32_t ch3 = to_ucs4_comb[ch - 0xe0][ch2 - 0x20]; /* row: combining byte, column: ch2 */ \
147 /* The mapping for ch2 is an identity, because it is ASCII here. */ \
148 put32 (outptr, ch2); /* ch2 is written before the mapped combining byte */ \
150 ch = to_ucs4[ch - 0x80]; /* UCS4 value for the combining byte itself */ \
157 if (__builtin_expect (ch >= 0x80, 0)) /* non-ASCII byte outside the combining range */ \
158 ch = to_ucs4[ch - 0x80]; \
162 if (__builtin_expect (ch, 1) == 0 && *inptr != '\0') /* zero is only valid for a real NUL byte */ \
164 /* This is an illegal character. */ \
165 if (! ignore_errors_p ()) \
167 result = __GCONV_ILLEGAL_INPUT; \
173 put32 (outptr, ch); /* write the UCS4 value held in ch */ \
179 #define LOOP_NEED_FLAGS
183 /* Next, define the other direction (UCS4 to ANSI_Z39.47).
   Each step reads one UCS4 character with get32 and selects a two-byte
   table entry cp for it: from_ucs4 for the low range, or one of the page
   tables for characters above U+017E.  For a character in 0x20..0x7f the
   following UCS4 character is also read; if it lies in one of the two
   combining ranges and has a mapping, that byte is written before the
   base character.  An entry whose first byte is '\0' is reported as an
   illegal character unless the input was U+0000.
   NOTE(review): declarations, braces and pointer updates are not shown in
   the lines below; confirm them before relying on the comments here. */
184 #define MIN_NEEDED_INPUT MIN_NEEDED_TO
185 #define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
186 #define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
187 #define LOOPFCT TO_LOOP
191 uint32_t ch = get32 (inptr); /* current UCS4 input character */ \
194 if (__builtin_expect (ch > 0x017e, 0)) /* above U+017E: try the page tables below */ \
196 if (ch >= 0x1a0 && ch < 0x1b4) \
197 cp = from_ucs4_p01a[ch - 0x1a0]; \
198 else if (ch >= 0x220 && ch < 0x234) \
199 cp = from_ucs4_p022[ch - 0x220]; \
200 else if (ch >= 0x2b0 && ch < 0x2e2) \
201 cp = from_ucs4_p02b[ch - 0x2b0]; \
202 else if (ch >= 0x300 && ch < 0x337) \
203 cp = from_ucs4_p030[ch - 0x300]; \
204 else if (ch >= 0x1ea0 && ch < 0x1efa) \
205 cp = from_ucs4_p1ea[ch - 0x1ea0]; \
206 else if (ch >= 0x2000 && ch < 0x200f) \
207 cp = from_ucs4_p200[ch - 0x2000]; \
208 else if (ch >= 0x2110 && ch < 0x211a) \
209 cp = from_ucs4_p211[ch - 0x2110]; \
210 else if (ch >= 0x2660 && ch < 0x2674) \
211 cp = from_ucs4_p266[ch - 0x2660]; \
212 else if (ch >= 0xfe20 && ch < 0xfe25) \
213 cp = from_ucs4_pfe2[ch - 0xfe20]; \
216 UNICODE_TAG_HANDLER (ch, 4); /* NOTE(review): macro defined outside this file -- confirm what it handles */ \
218 /* Illegal characters. */ \
219 STANDARD_ERR_HANDLER (4); \
224 if (__builtin_expect (ch < 0x80, 1)) { \
230 cp = from_ucs4[ch-0x80]; /* from_ucs4 omits the ASCII part, hence the 0x80 offset */ \
231 if (__builtin_expect (ch >= 0x20, 1) \
232 && __builtin_expect (ch < 0x80, 1)) \
234 /* Check whether the next character is an accent, if so, then */ \
235 /* output it first */ \
238 ch2 = get32 (inptr); /* NOTE(review): confirm inptr was advanced and bounds-checked before this read */ \
239 if (ch2 >= 0x300 && ch2 < 0x337) { /* range served by from_ucs4_p030 */ \
240 const char* cp2 = from_ucs4_p030[ch2 - 0x300]; \
241 if (cp2[0] != '\0') { \
242 *outptr++ = cp2[0]; /* mapped accent byte goes out first */ \
247 else if (ch2 >= 0xfe20 && ch2 < 0xfe25) { /* range served by from_ucs4_pfe2 */ \
248 const char* cp2 = from_ucs4_pfe2[ch2 - 0xfe20]; \
249 if (cp2[0] != '\0') { \
250 *outptr++ = cp2[0]; /* mapped accent byte goes out first */ \
260 if (__builtin_expect (cp[0], '\1') == '\0' && ch != 0) /* empty entry, and the input was not U+0000 */ \
262 /* Illegal characters. */ \
263 STANDARD_ERR_HANDLER (4); \
267 /* Now test for a possible second byte and write this if possible. */ \
270 if (__builtin_expect (outptr >= outend, 0)) \
272 /* The result does not fit into the buffer. */ \
274 result = __GCONV_FULL_OUTPUT; \
283 #define LOOP_NEED_FLAGS
287 /* Now define the toplevel functions. */
288 #include "skeleton.c"