From 0a5db0b39f18118b8e51744a1d024b3738a46979 Mon Sep 17 00:00:00 2001 From: Peter Verthez Date: Wed, 2 Oct 2002 18:10:00 +0000 Subject: [PATCH] Moved from ansel directory. Extracted tables to separate header file. --- iconv/glibc/ANSI_Z39.47.c | 288 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 288 insertions(+) create mode 100644 iconv/glibc/ANSI_Z39.47.c diff --git a/iconv/glibc/ANSI_Z39.47.c b/iconv/glibc/ANSI_Z39.47.c new file mode 100644 index 0000000..761bf04 --- /dev/null +++ b/iconv/glibc/ANSI_Z39.47.c @@ -0,0 +1,288 @@ +/* Conversion for ANSI_Z39.47 aka ANSEL. + Copyright (C) 2001 The Genes Development Team + This file is part of the Gedcom parser library. + Contributed by Peter Verthez , 2001. + + The Gedcom parser library is free software; you can redistribute it + and/or modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The Gedcom parser library is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the Gedcom parser library; if not, write to the + Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +/* $Id$ */ +/* $Name$ */ + +/* Generic conversion to and from ANSI Z39.47 (also known as ANSEL) + Based on the ansi_x3.110.c file from the glibc sources + Data coming from: + http://lcweb.loc.gov/marc/specifications/speccharlatin.html + + Note: in ANSEL, diacritical marks come *before* the base character; + in Unicode, they come *after*... +*/ + +#include +#include +#include +#include +#include "ANSI_Z39.47-tables.h" + +/* Omit first half of table: assume identity mapping (ASCII) */ +static const uint32_t to_ucs4[128] = TABLE_TO_UCS4_BASIC; + +/* The outer array range runs from 0xe0 to 0xfe, the inner range from 0x20 + to 0x7f. */ +static const uint32_t to_ucs4_comb[31][96] = +{ + /* 0xe0 (hook above) */ TABLE_TO_UCS4_COMBINING_E0, + /* 0xe1 (grave) */ TABLE_TO_UCS4_COMBINING_E1, + /* 0xe2 (acute) */ TABLE_TO_UCS4_COMBINING_E2, + /* 0xe3 (circumflex) */ TABLE_TO_UCS4_COMBINING_E3, + /* 0xe4 (tilde) */ TABLE_TO_UCS4_COMBINING_E4, + /* 0xe5 (macron) */ TABLE_TO_UCS4_COMBINING_E5, + /* 0xe6 (breve) */ TABLE_TO_UCS4_COMBINING_E6, + /* 0xe7 (dot above) */ TABLE_TO_UCS4_COMBINING_E7, + /* 0xe8 (umlaut, diaeresis) */ TABLE_TO_UCS4_COMBINING_E8, + /* 0xe9 (caron, hacek) */ TABLE_TO_UCS4_COMBINING_E9, + /* 0xea (ring above) */ TABLE_TO_UCS4_COMBINING_EA, + /* 0xeb (ligature, left half) */ TABLE_TO_UCS4_COMBINING_EB, + /* 0xec (ligature, right half) */ TABLE_TO_UCS4_COMBINING_EC, + /* 0xed (comma above right) */ TABLE_TO_UCS4_COMBINING_ED, + /* 0xee (double acute) */ TABLE_TO_UCS4_COMBINING_EE, + /* 0xef (candrabindu) */ TABLE_TO_UCS4_COMBINING_EF, + /* 0xf0 (cedilla) */ TABLE_TO_UCS4_COMBINING_F0, + /* 0xf1 (ogonek, right hook) */ TABLE_TO_UCS4_COMBINING_F1, + /* 0xf2 (dot below) */ TABLE_TO_UCS4_COMBINING_F2, + /* 0xf3 (double dot below) */ TABLE_TO_UCS4_COMBINING_F3, + /* 0xf4 (ring below) */ TABLE_TO_UCS4_COMBINING_F4, + /* 0xf5 (double low line) */ TABLE_TO_UCS4_COMBINING_F5, + /* 0xf6 (line below) */ TABLE_TO_UCS4_COMBINING_F6, + /* 0xf7 (comma below, left hook) */ TABLE_TO_UCS4_COMBINING_F7, + /* 0xf8 (left half ring below, right cedilla) */ TABLE_TO_UCS4_COMBINING_F8, + /* 0xf9 (breve below, half circle below) */ TABLE_TO_UCS4_COMBINING_F9, + /* 0xfa (double tilde, left half) */ TABLE_TO_UCS4_COMBINING_FA, + /* 0xfb (double tilde, right half) */ TABLE_TO_UCS4_COMBINING_FB, + /* 0xfc */ TABLE_TO_UCS4_COMBINING_FC, + /* 0xfd */ TABLE_TO_UCS4_COMBINING_FD, + /* 0xfe (comma above, high centered comma) */ TABLE_TO_UCS4_COMBINING_FE, +}; + +/* Omit first part of table: assume identity mapping (ASCII) */ +static const char from_ucs4[][2] = TABLE_FROM_UCS4_BASIC; +static const char from_ucs4_p01a[][2] = TABLE_FROM_UCS4_PAGE_01A; +static const char from_ucs4_p022[][2] = TABLE_FROM_UCS4_PAGE_022; +static const char from_ucs4_p02b[][2] = TABLE_FROM_UCS4_PAGE_02B; +static const char from_ucs4_p030[][2] = TABLE_FROM_UCS4_PAGE_030; +static const char from_ucs4_p1ea[][2] = TABLE_FROM_UCS4_PAGE_1EA; +static const char from_ucs4_p200[][2] = TABLE_FROM_UCS4_PAGE_200; +static const char from_ucs4_p211[][2] = TABLE_FROM_UCS4_PAGE_211; +static const char from_ucs4_p266[][2] = TABLE_FROM_UCS4_PAGE_266; +static const char from_ucs4_pfe2[][2] = TABLE_FROM_UCS4_PAGE_FE2; + +/* Definitions used in the body of the `gconv' function. */ +#define CHARSET_NAME "ANSI_Z39.47//" +#define FROM_LOOP from_ansi_z39_47 +#define TO_LOOP to_ansi_z39_47 +#define DEFINE_INIT 1 +#define DEFINE_FINI 1 +#define MIN_NEEDED_FROM 1 +#define MAX_NEEDED_FROM 2 +#define MIN_NEEDED_TO 4 + +/* First define the conversion function from ANSI_Z39.47 to UCS4. */ +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MAX_NEEDED_INPUT MAX_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define LOOPFCT FROM_LOOP +#define BODY \ + { \ + uint32_t ch = *inptr; \ + int incr; \ + \ + if (__builtin_expect (ch >= 0xe0, 0) && ch <= 0xfe) \ + { \ + /* Composed character. First test whether the next character \ + is also available. */ \ + uint32_t ch2; \ + \ + if (inptr + 1 >= inend) \ + { \ + /* The second character is not available. */ \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + \ + ch2 = inptr[1]; \ + \ + if (__builtin_expect (ch2 < 0x20, 0) \ + || __builtin_expect (ch2 >= 0x80, 0)) \ + { \ + /* This is illegal. */ \ + if (! ignore_errors_p ()) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + \ + ++*irreversible; \ + incr = 1; \ + } \ + else \ + { \ + uint32_t ch3 = to_ucs4_comb[ch - 0xe0][ch2 - 0x20]; \ + if (ch3 != 0) { \ + ch = ch3; \ + incr = 2; \ + } \ + else { \ + /* mapping for ch2 is an identity, because is ASCII here */ \ + put32 (outptr, ch2); \ + outptr += 4; \ + ch = to_ucs4[ch - 0x80]; \ + incr = 2; \ + } \ + } \ + } \ + else \ + { \ + if (__builtin_expect (ch >= 0x80, 0)) \ + ch = to_ucs4[ch - 0x80]; \ + incr = 1; \ + } \ + \ + if (__builtin_expect (ch, 1) == 0 && *inptr != '\0') \ + { \ + /* This is an illegal character. */ \ + if (! ignore_errors_p ()) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + } \ + else \ + { \ + put32 (outptr, ch); \ + outptr += 4; \ + } \ + \ + inptr += incr; \ + } +#define LOOP_NEED_FLAGS +#include "loop.c" + + +/* Next, define the other direction. */ +#define MIN_NEEDED_INPUT MIN_NEEDED_TO +#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM +#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM +#define LOOPFCT TO_LOOP +#define BODY \ + { \ + char tmp[2]; \ + uint32_t ch = get32 (inptr); \ + const char *cp; \ + \ + if (__builtin_expect (ch > 0x017e, 0)) \ + { \ + if (ch >= 0x1a0 && ch < 0x1b4) \ + cp = from_ucs4_p01a[ch - 0x1a0]; \ + else if (ch >= 0x220 && ch < 0x234) \ + cp = from_ucs4_p022[ch - 0x220]; \ + else if (ch >= 0x2b0 && ch < 0x2e2) \ + cp = from_ucs4_p02b[ch - 0x2b0]; \ + else if (ch >= 0x300 && ch < 0x337) \ + cp = from_ucs4_p030[ch - 0x300]; \ + else if (ch >= 0x1ea0 && ch < 0x1efa) \ + cp = from_ucs4_p1ea[ch - 0x1ea0]; \ + else if (ch >= 0x2000 && ch < 0x200f) \ + cp = from_ucs4_p200[ch - 0x2000]; \ + else if (ch >= 0x2110 && ch < 0x211a) \ + cp = from_ucs4_p211[ch - 0x2110]; \ + else if (ch >= 0x2660 && ch < 0x2674) \ + cp = from_ucs4_p266[ch - 0x2660]; \ + else if (ch >= 0xfe20 && ch < 0xfe25) \ + cp = from_ucs4_pfe2[ch - 0xfe20]; \ + else \ + { \ + UNICODE_TAG_HANDLER (ch, 4); \ + \ + /* Illegal characters. */ \ + STANDARD_ERR_HANDLER (4); \ + } \ + } \ + else \ + { \ + if (__builtin_expect (ch < 0x80, 1)) { \ + tmp[0] = ch; \ + tmp[1] = '\0'; \ + cp = tmp; \ + } \ + else \ + cp = from_ucs4[ch-0x80]; \ + if (__builtin_expect (ch >= 0x20, 1) \ + && __builtin_expect (ch < 0x80, 1)) \ + { \ + /* Check whether the next character is an accent, if so, then */ \ + /* output it first */ \ + uint32_t ch2; \ + inptr += 4; \ + ch2 = get32 (inptr); \ + if (ch2 >= 0x300 && ch2 < 0x337) { \ + const char* cp2 = from_ucs4_p030[ch2 - 0x300]; \ + if (cp2[0] != '\0') { \ + *outptr++ = cp2[0]; \ + } \ + else \ + inptr -= 4; \ + } \ + else if (ch2 >= 0xfe20 && ch2 < 0xfe25) { \ + const char* cp2 = from_ucs4_pfe2[ch2 - 0xfe20]; \ + if (cp2[0] != '\0') { \ + *outptr++ = cp2[0]; \ + } \ + else \ + inptr -= 4; \ + } \ + else \ + inptr -= 4; \ + } \ + } \ + \ + if (__builtin_expect (cp[0], '\1') == '\0' && ch != 0) \ + { \ + /* Illegal characters. */ \ + STANDARD_ERR_HANDLER (4); \ + } \ + \ + *outptr++ = cp[0]; \ + /* Now test for a possible second byte and write this if possible. */ \ + if (cp[1] != '\0') \ + { \ + if (__builtin_expect (outptr >= outend, 0)) \ + { \ + /* The result does not fit into the buffer. */ \ + --outptr; \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + *outptr++ = cp[1]; \ + } \ + \ + inptr += 4; \ + } +#define LOOP_NEED_FLAGS +#include "loop.c" + + +/* Now define the toplevel functions. */ +#include "skeleton.c" -- 2.30.2