iconv/libiconv/ansi_z39_47.h

   1 /*
   2  * Copyright (C) 1999-2002 Free Software Foundation, Inc.
   3  * This file is part of the GNU LIBICONV Library.
   4  *
   5  * The GNU LIBICONV Library is free software; you can redistribute it
   6  * and/or modify it under the terms of the GNU Library General Public
   7  * License as published by the Free Software Foundation; either version 2
   8  * of the License, or (at your option) any later version.
   9  *
  10  * The GNU LIBICONV Library is distributed in the hope that it will be
  11  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Library General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Library General Public
  16  * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
  17  * If not, write to the Free Software Foundation, Inc., 59 Temple Place -
  18  * Suite 330, Boston, MA 02111-1307, USA.
  19  */
  20
  21 /*
  22  * ANSI_Z39.47
  23  */
  24
  25 #include "ANSI_Z39.47-tables.h"
  26
  27 /* Omit first half of table: assume identity mapping (ASCII) */
  28 static const unsigned short ansi_z39_47_2uni[128] = TABLE_TO_UCS4_BASIC;
  29
  30 /* The outer array range runs from 0xe0 to 0xfe, the inner range from 0x20
  31    to 0x7f.  */
  32 static const unsigned short ansi_z39_47_2uni_comb[31][96] =
  33 {
  34   /* 0xe0 (hook above) */                           TABLE_TO_UCS4_COMBINING_E0,
  35   /* 0xe1 (grave) */                                TABLE_TO_UCS4_COMBINING_E1,
  36   /* 0xe2 (acute) */                                TABLE_TO_UCS4_COMBINING_E2,
  37   /* 0xe3 (circumflex) */                           TABLE_TO_UCS4_COMBINING_E3,
  38   /* 0xe4 (tilde) */                                TABLE_TO_UCS4_COMBINING_E4,
  39   /* 0xe5 (macron) */                               TABLE_TO_UCS4_COMBINING_E5,
  40   /* 0xe6 (breve) */                                TABLE_TO_UCS4_COMBINING_E6,
  41   /* 0xe7 (dot above) */                            TABLE_TO_UCS4_COMBINING_E7,
  42   /* 0xe8 (umlaut, diaeresis) */                    TABLE_TO_UCS4_COMBINING_E8,
  43   /* 0xe9 (caron, hacek) */                         TABLE_TO_UCS4_COMBINING_E9,
  44   /* 0xea (ring above) */                           TABLE_TO_UCS4_COMBINING_EA,
  45   /* 0xeb (ligature, left half) */                  TABLE_TO_UCS4_COMBINING_EB,
  46   /* 0xec (ligature, right half) */                 TABLE_TO_UCS4_COMBINING_EC,
  47   /* 0xed (comma above right) */                    TABLE_TO_UCS4_COMBINING_ED,
  48   /* 0xee (double acute) */                         TABLE_TO_UCS4_COMBINING_EE,
  49   /* 0xef (candrabindu) */                          TABLE_TO_UCS4_COMBINING_EF,
  50   /* 0xf0 (cedilla) */                              TABLE_TO_UCS4_COMBINING_F0,
  51   /* 0xf1 (ogonek, right hook) */                   TABLE_TO_UCS4_COMBINING_F1,
  52   /* 0xf2 (dot below) */                            TABLE_TO_UCS4_COMBINING_F2,
  53   /* 0xf3 (double dot below) */                     TABLE_TO_UCS4_COMBINING_F3,
  54   /* 0xf4 (ring below) */                           TABLE_TO_UCS4_COMBINING_F4,
  55   /* 0xf5 (double low line) */                      TABLE_TO_UCS4_COMBINING_F5,
  56   /* 0xf6 (line below) */                           TABLE_TO_UCS4_COMBINING_F6,
  57   /* 0xf7 (comma below, left hook) */               TABLE_TO_UCS4_COMBINING_F7,
  58   /* 0xf8 (left half ring below, right cedilla) */  TABLE_TO_UCS4_COMBINING_F8,
  59   /* 0xf9 (breve below, half circle below) */       TABLE_TO_UCS4_COMBINING_F9,
  60   /* 0xfa (double tilde, left half) */              TABLE_TO_UCS4_COMBINING_FA,
  61   /* 0xfb (double tilde, right half) */             TABLE_TO_UCS4_COMBINING_FB,
  62   /* 0xfc */                                        TABLE_TO_UCS4_COMBINING_FC,
  63   /* 0xfd */                                        TABLE_TO_UCS4_COMBINING_FD,
  64   /* 0xfe (comma above, high centered comma) */     TABLE_TO_UCS4_COMBINING_FE,
  65 };
  66
  67 #define BASE_PASSED 0x10000
  68
  69 static int
  70 ansi_z39_47_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
  71 {
  72   unsigned char c = *s;
  73   ucs4_t last_wc = conv->istate;
  74   int retval = 1;
  75   if (last_wc & BASE_PASSED) {
  76     /* base character was already output, reset the state and output the
  77        diacritical mark */
  78     unsigned char dc = (unsigned char)(last_wc & ~BASE_PASSED);
  79     *pwc = ansi_z39_47_2uni[dc-0x80];
  80     conv->istate = 0;
  81     return 1;
  82   }
  83   if (last_wc) {
  84     conv->istate |= BASE_PASSED;
  85     retval = 0;
  86   }
  87   if (c < 0x80) {
  88     if (last_wc && c >= 0x20) {
  89       /* Check if we can combine the character with the diacritical mark */
  90       unsigned char dc = (unsigned char)(last_wc & ~BASE_PASSED);
  91       unsigned short wc = ansi_z39_47_2uni_comb[dc-0xe0][c-0x20];
  92       if (wc != 0x0000) {
  93         *pwc = (ucs4_t) wc;
  94         conv->istate = 0;
  95         return 1;
  96       }
  97     }
  98     *pwc = (ucs4_t) c;
  99     return retval;
 100   }
 101   else if (c < 0xe0) {
 102     unsigned short wc = ansi_z39_47_2uni[c-0x80];
 103     if (wc != 0x0000) {
 104       *pwc = (ucs4_t) wc;
 105       return retval;
 106     }
 107   }
 108   else {
 109     /* The range from 0xe0 to 0xfe are diacritical marks.
 110        Note that in ANSEL they come *before* the base characters, in Unicode,
 111        they come *after*, so we have to buffer them ... */
 112     conv->istate = (state_t)c;
 113     return RET_TOOFEW(1);
 114   }
 115   return RET_ILSEQ;
 116 }
 117
 118 static const unsigned char ansi_z39_47_page080[][2] = TABLE_FROM_UCS4_BASIC;
 119 static const unsigned char ansi_z39_47_page01a[][2] = TABLE_FROM_UCS4_PAGE_01A;
 120 static const unsigned char ansi_z39_47_page022[][2] = TABLE_FROM_UCS4_PAGE_022;
 121 static const unsigned char ansi_z39_47_page02b[][2] = TABLE_FROM_UCS4_PAGE_02B;
 122 static const unsigned char ansi_z39_47_page030[][2] = TABLE_FROM_UCS4_PAGE_030;
 123 static const unsigned char ansi_z39_47_page1ea[][2] = TABLE_FROM_UCS4_PAGE_1EA;
 124 static const unsigned char ansi_z39_47_page200[][2] = TABLE_FROM_UCS4_PAGE_200;
 125 static const unsigned char ansi_z39_47_page211[][2] = TABLE_FROM_UCS4_PAGE_211;
 126 static const unsigned char ansi_z39_47_page266[][2] = TABLE_FROM_UCS4_PAGE_266;
 127 static const unsigned char ansi_z39_47_pagefe2[][2] = TABLE_FROM_UCS4_PAGE_FE2;
 128
 129 static int
 130 ansi_z39_47_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
 131 {
 132   const unsigned char* ch = NULL;
 133   int output = 0;
 134
 135 #define OUTPUT(c)  ++output; if (n < output) return RET_TOOSMALL; *(r++) = (c);
 136
 137   /* Since in UTF-8 diacritical marks come after the base character and in
 138      ANSEL before, we need to buffer possible base characters (0x20 to 0x7f)
 139      to put the diacritical mark before it if there is one following */
 140   if (wc < 0x0080) {
 141     if (conv->ostate) {
 142       OUTPUT(conv->ostate);
 143       conv->ostate = 0;
 144     }
 145     if (wc >= 0x0020) {
 146       conv->ostate = (state_t) wc;
 147     }
 148     else {
 149       OUTPUT(wc);
 150     }
 151     return output;
 152   }
 153   else if (wc >= 0x0080 && wc < 0x017f)
 154     ch = ansi_z39_47_page080[wc-0x0080];
 155   else if (wc >= 0x01a0 && wc < 0x01b4)
 156     ch = ansi_z39_47_page01a[wc-0x01a0];
 157   else if (wc >= 0x0220 && wc < 0x0234)
 158     ch = ansi_z39_47_page022[wc-0x0220];
 159   else if (wc >= 0x02b0 && wc < 0x02e2)
 160     ch = ansi_z39_47_page02b[wc-0x02b0];
 161   else if (wc >= 0x0300 && wc < 0x0337)
 162     ch = ansi_z39_47_page030[wc-0x0300];
 163   else if (wc >= 0x1ea0 && wc < 0x1efa)
 164     ch = ansi_z39_47_page1ea[wc-0x1ea0];
 165   else if (wc >= 0x2000 && wc < 0x200f)
 166     ch = ansi_z39_47_page200[wc-0x2000];
 167   else if (wc >= 0x2110 && wc < 0x211a)
 168     ch = ansi_z39_47_page211[wc-0x2110];
 169   else if (wc >= 0x2660 && wc < 0x2674)
 170     ch = ansi_z39_47_page266[wc-0x2660];
 171   else if (wc >= 0xfe20 && wc < 0xfe25)
 172     ch = ansi_z39_47_pagefe2[wc-0xfe20];
 173   if (ch && ch[0] != 0) {
 174     if (ch[1] == 0 && ch[0] >= 0xe0 && ch[0] <= 0xfe) {
 175       /* Diacritical mark following a base character, buffered in ostate */
 176       /* Output diacritical mark, then base character */
 177       if (conv->ostate) {
 178         OUTPUT(ch[0]);
 179         OUTPUT(conv->ostate);
 180         conv->ostate = 0;
 181       }
 182       else
 183         return RET_ILUNI;
 184     }
 185     else {
 186       if (conv->ostate) {
 187         OUTPUT(conv->ostate);
 188         conv->ostate = 0;
 189       }
 190       OUTPUT(ch[0]);
 191     }
 192     if (ch[1] != 0) {
 193       OUTPUT(ch[1]);
 194     }
 195     return output;
 196   }
 197   return RET_ILUNI;
 198 }