1 /* Conversion for ANSI_Z39.47 aka ANSEL.
2 Copyright (C) 2001 The Genes Development Team
3 This file is part of the Gedcom parser library.
4 Contributed by Peter Verthez <Peter.Verthez@advalvas.be>, 2001.
6 The Gedcom parser library is free software; you can redistribute it
7 and/or modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The Gedcom parser library is distributed in the hope that it will be
12 useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the Gedcom parser library; if not, write to the
18 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */
24 /* Generic conversion to and from ANSI Z39.47 (also known as ANSEL).
25 Based on the ansi_x3.110.c file from the glibc sources.
27 http://lcweb.loc.gov/marc/specifications/speccharlatin.html
29 Note: in ANSEL, diacritical marks come *before* the base character;
30 in Unicode, they come *after* it. */
37 #include "ANSI_Z39.47-tables.h"
39 /* From /usr/include/linux/compiler.h out of GCC 2.96+: */
40 /* Somewhere in the middle of the GCC 2.96 development cycle, we implemented
41 a mechanism by which the user can annotate likely branch directions and
42 expect the blocks to be reordered appropriately. Define __builtin_expect
43 to nothing for earlier compilers. */
45 #if __GNUC__ == 2 && __GNUC_MINOR__ < 96
46 #define __builtin_expect(x, expected_value) (x)
49 /* Omit first half of table: assume identity mapping (ASCII) */
/* Maps the upper half of the ANSEL byte range to UCS4.  The decoder indexes
   it with `byte - 0x80' for bytes >= 0x80.  A zero entry marks a byte that
   has no mapping; the decoder treats it as illegal input.  */
50 static const uint32_t to_ucs4[128] = TABLE_TO_UCS4_BASIC;
52 /* The outer array range runs from 0xe0 to 0xfe, the inner range from 0x20 to 0x7f. */
54 static const uint32_t to_ucs4_comb[31][96] =
56 /* 0xe0 (hook above) */ TABLE_TO_UCS4_COMBINING_E0,
57 /* 0xe1 (grave) */ TABLE_TO_UCS4_COMBINING_E1,
58 /* 0xe2 (acute) */ TABLE_TO_UCS4_COMBINING_E2,
59 /* 0xe3 (circumflex) */ TABLE_TO_UCS4_COMBINING_E3,
60 /* 0xe4 (tilde) */ TABLE_TO_UCS4_COMBINING_E4,
61 /* 0xe5 (macron) */ TABLE_TO_UCS4_COMBINING_E5,
62 /* 0xe6 (breve) */ TABLE_TO_UCS4_COMBINING_E6,
63 /* 0xe7 (dot above) */ TABLE_TO_UCS4_COMBINING_E7,
64 /* 0xe8 (umlaut, diaeresis) */ TABLE_TO_UCS4_COMBINING_E8,
65 /* 0xe9 (caron, hacek) */ TABLE_TO_UCS4_COMBINING_E9,
66 /* 0xea (ring above) */ TABLE_TO_UCS4_COMBINING_EA,
67 /* 0xeb (ligature, left half) */ TABLE_TO_UCS4_COMBINING_EB,
68 /* 0xec (ligature, right half) */ TABLE_TO_UCS4_COMBINING_EC,
69 /* 0xed (comma above right) */ TABLE_TO_UCS4_COMBINING_ED,
70 /* 0xee (double acute) */ TABLE_TO_UCS4_COMBINING_EE,
71 /* 0xef (candrabindu) */ TABLE_TO_UCS4_COMBINING_EF,
72 /* 0xf0 (cedilla) */ TABLE_TO_UCS4_COMBINING_F0,
73 /* 0xf1 (ogonek, right hook) */ TABLE_TO_UCS4_COMBINING_F1,
74 /* 0xf2 (dot below) */ TABLE_TO_UCS4_COMBINING_F2,
75 /* 0xf3 (double dot below) */ TABLE_TO_UCS4_COMBINING_F3,
76 /* 0xf4 (ring below) */ TABLE_TO_UCS4_COMBINING_F4,
77 /* 0xf5 (double low line) */ TABLE_TO_UCS4_COMBINING_F5,
78 /* 0xf6 (line below) */ TABLE_TO_UCS4_COMBINING_F6,
79 /* 0xf7 (comma below, left hook) */ TABLE_TO_UCS4_COMBINING_F7,
80 /* 0xf8 (left half ring below, right cedilla) */ TABLE_TO_UCS4_COMBINING_F8,
81 /* 0xf9 (breve below, half circle below) */ TABLE_TO_UCS4_COMBINING_F9,
82 /* 0xfa (double tilde, left half) */ TABLE_TO_UCS4_COMBINING_FA,
83 /* 0xfb (double tilde, right half) */ TABLE_TO_UCS4_COMBINING_FB,
84 /* 0xfc */ TABLE_TO_UCS4_COMBINING_FC,
85 /* 0xfd */ TABLE_TO_UCS4_COMBINING_FD,
86 /* 0xfe (comma above, high centered comma) */ TABLE_TO_UCS4_COMBINING_FE,
89 /* Omit first part of table: assume identity mapping (ASCII) */
/* Reverse tables, UCS4 to ANSEL.  Every entry holds up to two output bytes;
   an entry whose first byte is '\0' means that the code point has no ANSEL
   equivalent.  `from_ucs4' is indexed with `ch - 0x80' and serves code
   points from 0x80 up to 0x017e.  Each `from_ucs4_pXXX' table covers one
   sparse range and is indexed relative to the start of that range:
     p01a: 0x01a0..0x01b3   p022: 0x0220..0x0233   p02b: 0x02b0..0x02e1
     p030: 0x0300..0x0336   p1ea: 0x1ea0..0x1ef9   p200: 0x2000..0x200e
     p211: 0x2110..0x2119   p266: 0x2660..0x2673   pfe2: 0xfe20..0xfe24  */
90 static const char from_ucs4[][2] = TABLE_FROM_UCS4_BASIC;
91 static const char from_ucs4_p01a[][2] = TABLE_FROM_UCS4_PAGE_01A;
92 static const char from_ucs4_p022[][2] = TABLE_FROM_UCS4_PAGE_022;
93 static const char from_ucs4_p02b[][2] = TABLE_FROM_UCS4_PAGE_02B;
/* The 0x0300 and 0xfe20 pages hold the combining marks; the encoder also
   consults them to detect an accent that follows a base character.  */
94 static const char from_ucs4_p030[][2] = TABLE_FROM_UCS4_PAGE_030;
95 static const char from_ucs4_p1ea[][2] = TABLE_FROM_UCS4_PAGE_1EA;
96 static const char from_ucs4_p200[][2] = TABLE_FROM_UCS4_PAGE_200;
97 static const char from_ucs4_p211[][2] = TABLE_FROM_UCS4_PAGE_211;
98 static const char from_ucs4_p266[][2] = TABLE_FROM_UCS4_PAGE_266;
99 static const char from_ucs4_pfe2[][2] = TABLE_FROM_UCS4_PAGE_FE2;
101 /* Definitions used in the body of the `gconv' function. */
/* NOTE(review): CHARSET_NAME, DEFINE_INIT and DEFINE_FINI are not used in
   this file itself; presumably they are consumed by skeleton.c, which is
   included at the end -- confirm there.  */
102 #define CHARSET_NAME "ANSI_Z39.47//"
/* Names of the two conversion loops; they are plugged in as LOOPFCT when
   each direction is defined below.  */
103 #define FROM_LOOP from_ansi_z39_47
104 #define TO_LOOP to_ansi_z39_47
105 #define DEFINE_INIT 1
106 #define DEFINE_FINI 1
/* An ANSEL character occupies one byte, or two when a combining byte
   (0xe0..0xfe) precedes its base character.  */
107 #define MIN_NEEDED_FROM 1
108 #define MAX_NEEDED_FROM 2
/* The UCS4 side is read and written in 4-byte units (get32 / put32).  */
109 #define MIN_NEEDED_TO 4
111 /* First define the conversion function from ANSI_Z39.47 to UCS4. */
/* For this direction the input is ANSEL (1 or 2 bytes per character), the
   output is UCS4 (4 bytes per character), and the loop function takes the
   name given by FROM_LOOP.  */
112 #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
113 #define MAX_NEEDED_INPUT MAX_NEEDED_FROM
114 #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
115 #define LOOPFCT FROM_LOOP
118 uint32_t ch = *inptr; \
121 if (__builtin_expect (ch >= 0xe0, 0) && ch <= 0xfe) \
123 /* Composed character. First test whether the next character \
124 is also available. */ \
127 if (inptr + 1 >= inend) \
129 /* The second character is not available. */ \
130 result = __GCONV_INCOMPLETE_INPUT; \
136 if (__builtin_expect (ch2 < 0x20, 0) \
137 || __builtin_expect (ch2 >= 0x80, 0)) \
139 /* This is illegal. */ \
140 if (! ignore_errors_p ()) \
142 result = __GCONV_ILLEGAL_INPUT; \
151 uint32_t ch3 = to_ucs4_comb[ch - 0xe0][ch2 - 0x20]; \
157 /* The mapping for ch2 is the identity, because ch2 is ASCII here. */ \
158 put32 (outptr, ch2); \
160 ch = to_ucs4[ch - 0x80]; \
167 if (__builtin_expect (ch >= 0x80, 0)) \
168 ch = to_ucs4[ch - 0x80]; \
172 if (__builtin_expect (ch, 1) == 0 && *inptr != '\0') \
174 /* This is an illegal character. */ \
175 if (! ignore_errors_p ()) \
177 result = __GCONV_ILLEGAL_INPUT; \
183 put32 (outptr, ch); \
189 #define LOOP_NEED_FLAGS
193 /* Next, define the other direction. */
/* For this direction the input is UCS4 (4 bytes per character), the output
   is ANSEL (1 or 2 bytes per character), and the loop function takes the
   name given by TO_LOOP.
   NOTE(review): MIN_NEEDED_INPUT, MIN_NEEDED_OUTPUT and LOOPFCT were already
   defined for the first direction; presumably the loop template #undefs
   them before this point -- confirm.  */
194 #define MIN_NEEDED_INPUT MIN_NEEDED_TO
195 #define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM
196 #define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM
197 #define LOOPFCT TO_LOOP
201 uint32_t ch = get32 (inptr); \
204 if (__builtin_expect (ch > 0x017e, 0)) \
206 if (ch >= 0x1a0 && ch < 0x1b4) \
207 cp = from_ucs4_p01a[ch - 0x1a0]; \
208 else if (ch >= 0x220 && ch < 0x234) \
209 cp = from_ucs4_p022[ch - 0x220]; \
210 else if (ch >= 0x2b0 && ch < 0x2e2) \
211 cp = from_ucs4_p02b[ch - 0x2b0]; \
212 else if (ch >= 0x300 && ch < 0x337) \
213 cp = from_ucs4_p030[ch - 0x300]; \
214 else if (ch >= 0x1ea0 && ch < 0x1efa) \
215 cp = from_ucs4_p1ea[ch - 0x1ea0]; \
216 else if (ch >= 0x2000 && ch < 0x200f) \
217 cp = from_ucs4_p200[ch - 0x2000]; \
218 else if (ch >= 0x2110 && ch < 0x211a) \
219 cp = from_ucs4_p211[ch - 0x2110]; \
220 else if (ch >= 0x2660 && ch < 0x2674) \
221 cp = from_ucs4_p266[ch - 0x2660]; \
222 else if (ch >= 0xfe20 && ch < 0xfe25) \
223 cp = from_ucs4_pfe2[ch - 0xfe20]; \
226 UNICODE_TAG_HANDLER (ch, 4); \
228 /* Illegal characters. */ \
229 STANDARD_ERR_HANDLER (4); \
234 if (__builtin_expect (ch < 0x80, 1)) { \
240 cp = from_ucs4[ch-0x80]; \
241 if (__builtin_expect (ch >= 0x20, 1) \
242 && __builtin_expect (ch < 0x80, 1)) \
244 /* Check whether the next character is an accent, if so, then */ \
245 /* output it first */ \
248 ch2 = get32 (inptr); \
249 if (ch2 >= 0x300 && ch2 < 0x337) { \
250 const char* cp2 = from_ucs4_p030[ch2 - 0x300]; \
251 if (cp2[0] != '\0') { \
252 *outptr++ = cp2[0]; \
257 else if (ch2 >= 0xfe20 && ch2 < 0xfe25) { \
258 const char* cp2 = from_ucs4_pfe2[ch2 - 0xfe20]; \
259 if (cp2[0] != '\0') { \
260 *outptr++ = cp2[0]; \
270 if (__builtin_expect (cp[0], '\1') == '\0' && ch != 0) \
272 /* Illegal characters. */ \
273 STANDARD_ERR_HANDLER (4); \
277 /* Now test for a possible second byte and write this if possible. */ \
280 if (__builtin_expect (outptr >= outend, 0)) \
282 /* The result does not fit into the buffer. */ \
284 result = __GCONV_FULL_OUTPUT; \
293 #define LOOP_NEED_FLAGS
297 /* Now define the toplevel functions. */
298 #include "skeleton.c"