iconv/libiconv/ansi_z39_47.h

   1 /*
   2  * Copyright (C) 1999-2002 Free Software Foundation, Inc.
   3  * This file is part of the GNU LIBICONV Library.
   4  *
   5  * The GNU LIBICONV Library is free software; you can redistribute it
   6  * and/or modify it under the terms of the GNU Library General Public
   7  * License as published by the Free Software Foundation; either version 2
   8  * of the License, or (at your option) any later version.
   9  *
  10  * The GNU LIBICONV Library is distributed in the hope that it will be
  11  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Library General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Library General Public
  16  * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
  17  * If not, write to the Free Software Foundation, Inc., 59 Temple Place -
  18  * Suite 330, Boston, MA 02111-1307, USA.
  19  */
  20
  21 /*
  22  * ANSI_Z39.47
  23  */
  24
  25 #include "ANSI_Z39.47-tables.h"
  26 #include <stdio.h>
  27
  28 /* Byte -> Unicode table for bytes 0x80..0xff, indexed by (byte - 0x80); an
        entry of 0x0000 marks a byte without a mapping.  The first half of the
        code page (0x00..0x7f) is omitted: the decoder treats it as an identity
        mapping (ASCII).  */
  29 static const unsigned short ansi_z39_47_2uni[128] = TABLE_TO_UCS4_BASIC;
  30
  31 /* Precomposed characters, indexed [diacritic - 0xe0][base - 0x20].  The outer
  32    array range runs from 0xe0 to 0xfe, the inner range from 0x20 to 0x7f.
        An entry of 0x0000 means that the pair has no precomposed form.  */
  33 static const unsigned short ansi_z39_47_2uni_comb[31][96] =
  34 {
  35   /* 0xe0 (hook above) */                           TABLE_TO_UCS4_COMBINING_E0,
  36   /* 0xe1 (grave) */                                TABLE_TO_UCS4_COMBINING_E1,
  37   /* 0xe2 (acute) */                                TABLE_TO_UCS4_COMBINING_E2,
  38   /* 0xe3 (circumflex) */                           TABLE_TO_UCS4_COMBINING_E3,
  39   /* 0xe4 (tilde) */                                TABLE_TO_UCS4_COMBINING_E4,
  40   /* 0xe5 (macron) */                               TABLE_TO_UCS4_COMBINING_E5,
  41   /* 0xe6 (breve) */                                TABLE_TO_UCS4_COMBINING_E6,
  42   /* 0xe7 (dot above) */                            TABLE_TO_UCS4_COMBINING_E7,
  43   /* 0xe8 (umlaut, diaeresis) */                    TABLE_TO_UCS4_COMBINING_E8,
  44   /* 0xe9 (caron, hacek) */                         TABLE_TO_UCS4_COMBINING_E9,
  45   /* 0xea (ring above) */                           TABLE_TO_UCS4_COMBINING_EA,
  46   /* 0xeb (ligature, left half) */                  TABLE_TO_UCS4_COMBINING_EB,
  47   /* 0xec (ligature, right half) */                 TABLE_TO_UCS4_COMBINING_EC,
  48   /* 0xed (comma above right) */                    TABLE_TO_UCS4_COMBINING_ED,
  49   /* 0xee (double acute) */                         TABLE_TO_UCS4_COMBINING_EE,
  50   /* 0xef (candrabindu) */                          TABLE_TO_UCS4_COMBINING_EF,
  51   /* 0xf0 (cedilla) */                              TABLE_TO_UCS4_COMBINING_F0,
  52   /* 0xf1 (ogonek, right hook) */                   TABLE_TO_UCS4_COMBINING_F1,
  53   /* 0xf2 (dot below) */                            TABLE_TO_UCS4_COMBINING_F2,
  54   /* 0xf3 (double dot below) */                     TABLE_TO_UCS4_COMBINING_F3,
  55   /* 0xf4 (ring below) */                           TABLE_TO_UCS4_COMBINING_F4,
  56   /* 0xf5 (double low line) */                      TABLE_TO_UCS4_COMBINING_F5,
  57   /* 0xf6 (line below) */                           TABLE_TO_UCS4_COMBINING_F6,
  58   /* 0xf7 (comma below, left hook) */               TABLE_TO_UCS4_COMBINING_F7,
  59   /* 0xf8 (left half ring below, right cedilla) */  TABLE_TO_UCS4_COMBINING_F8,
  60   /* 0xf9 (breve below, half circle below) */       TABLE_TO_UCS4_COMBINING_F9,
  61   /* 0xfa (double tilde, left half) */              TABLE_TO_UCS4_COMBINING_FA,
  62   /* 0xfb (double tilde, right half) */             TABLE_TO_UCS4_COMBINING_FB,
  63   /* 0xfc */                                        TABLE_TO_UCS4_COMBINING_FC,
  64   /* 0xfd */                                        TABLE_TO_UCS4_COMBINING_FD,
  65   /* 0xfe (comma above, high centered comma) */     TABLE_TO_UCS4_COMBINING_FE,
  66 };
  67
  68 #define BASE_PASSED 0x10000
  69
  70 static int
  71 ansi_z39_47_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
  72 {
  73   unsigned char c = *s;
  74   ucs4_t last_wc = conv->istate;
  75   int retval = 1;
  76   if (last_wc & BASE_PASSED) {
  77     /* base character was already output, reset the state and output the
  78        diacritical mark */
  79     unsigned char dc = (unsigned char)(last_wc & ~BASE_PASSED);
  80     *pwc = ansi_z39_47_2uni[dc-0x80];
  81     conv->istate = 0;
  82     return 1;
  83   }
  84   if (last_wc) {
  85     conv->istate |= BASE_PASSED;
  86     retval = 0;
  87   }
  88   if (c < 0x80) {
  89     if (last_wc && c >= 0x20) {
  90       /* Check if we can combine the character with the diacritical mark */
  91       unsigned char dc = (unsigned char)(last_wc & ~BASE_PASSED);
  92       unsigned short wc = ansi_z39_47_2uni_comb[dc-0xe0][c-0x20];
  93       if (wc != 0x0000) {
  94         *pwc = (ucs4_t) wc;
  95         conv->istate = 0;
  96         return 1;
  97       }
  98     }
  99     *pwc = (ucs4_t) c;
 100     return retval;
 101   }
 102   else if (c < 0xe0) {
 103     unsigned short wc = ansi_z39_47_2uni[c-0x80];
 104     if (wc != 0x0000) {
 105       *pwc = (ucs4_t) wc;
 106       return retval;
 107     }
 108   }
 109   else {
 110     /* The range from 0xe0 to 0xfe are diacritical marks.
 111        Note that in ANSEL they come *before* the base characters, in Unicode,
 112        they come *after*, so we have to buffer them ... */
 113     conv->istate = (state_t)c;
 114     return RET_TOOFEW(1);
 115   }
 116   return RET_ILSEQ;
 117 }
 118
 /* Unicode -> ANSI_Z39.47 lookup pages used by ansi_z39_47_wctomb.  Each page
    covers one contiguous code point range and is indexed by (wc - first code
    point of that range).  An entry holds up to two output bytes: entry[0] == 0
    means the code point has no mapping, entry[1] == 0 means the mapping
    consists of a single byte.  */
 119 static const unsigned char ansi_z39_47_page080[][2] = TABLE_FROM_UCS4_BASIC;
 120 static const unsigned char ansi_z39_47_page01a[][2] = TABLE_FROM_UCS4_PAGE_01A;
 121 static const unsigned char ansi_z39_47_page022[][2] = TABLE_FROM_UCS4_PAGE_022;
 122 static const unsigned char ansi_z39_47_page02b[][2] = TABLE_FROM_UCS4_PAGE_02B;
 123 static const unsigned char ansi_z39_47_page030[][2] = TABLE_FROM_UCS4_PAGE_030;
 124 static const unsigned char ansi_z39_47_page1ea[][2] = TABLE_FROM_UCS4_PAGE_1EA;
 125 static const unsigned char ansi_z39_47_page200[][2] = TABLE_FROM_UCS4_PAGE_200;
 126 static const unsigned char ansi_z39_47_page211[][2] = TABLE_FROM_UCS4_PAGE_211;
 127 static const unsigned char ansi_z39_47_page266[][2] = TABLE_FROM_UCS4_PAGE_266;
 128 static const unsigned char ansi_z39_47_pagefe2[][2] = TABLE_FROM_UCS4_PAGE_FE2;
 129
 130 static int
 131 ansi_z39_47_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
 132 {
 133   const unsigned char* ch = NULL;
 134   int output = 0;
 135
 136 #define OUTPUT(c)  ++output; if (n < output) return RET_TOOSMALL; *(r++) = (c);
 137
 138   /* Since in UTF-8 diacritical marks come after the base character and in
 139      ANSEL before, we need to buffer possible base characters (0x20 to 0x7f)
 140      to put the diacritical mark before it if there is one following */
 141   if (wc < 0x0080) {
 142     if (conv->ostate) {
 143       OUTPUT(conv->ostate);
 144       conv->ostate = 0;
 145     }
 146     if (wc >= 0x0020) {
 147       conv->ostate = (state_t) wc;
 148     }
 149     else {
 150       OUTPUT(wc);
 151     }
 152     return output;
 153   }
 154   else if (wc >= 0x0080 && wc < 0x017f)
 155     ch = ansi_z39_47_page080[wc-0x0080];
 156   else if (wc >= 0x01a0 && wc < 0x01b4)
 157     ch = ansi_z39_47_page01a[wc-0x01a0];
 158   else if (wc >= 0x0220 && wc < 0x0234)
 159     ch = ansi_z39_47_page022[wc-0x0220];
 160   else if (wc >= 0x02b0 && wc < 0x02e2)
 161     ch = ansi_z39_47_page02b[wc-0x02b0];
 162   else if (wc >= 0x0300 && wc < 0x0337)
 163     ch = ansi_z39_47_page030[wc-0x0300];
 164   else if (wc >= 0x1ea0 && wc < 0x1efa)
 165     ch = ansi_z39_47_page1ea[wc-0x1ea0];
 166   else if (wc >= 0x2000 && wc < 0x200f)
 167     ch = ansi_z39_47_page200[wc-0x2000];
 168   else if (wc >= 0x2110 && wc < 0x211a)
 169     ch = ansi_z39_47_page211[wc-0x2110];
 170   else if (wc >= 0x2660 && wc < 0x2674)
 171     ch = ansi_z39_47_page266[wc-0x2660];
 172   else if (wc >= 0xfe20 && wc < 0xfe25)
 173     ch = ansi_z39_47_pagefe2[wc-0xfe20];
 174   if (ch && ch[0] != 0) {
 175     if (ch[1] == 0 && ch[0] >= 0xe0 && ch[0] <= 0xfe) {
 176       /* Diacritical mark following a base character, buffered in ostate */
 177       /* Output diacritical mark, then base character */
 178       if (conv->ostate) {
 179         OUTPUT(ch[0]);
 180         OUTPUT(conv->ostate);
 181         conv->ostate = 0;
 182       }
 183       else
 184         return RET_ILUNI;
 185     }
 186     else {
 187       if (conv->ostate) {
 188         OUTPUT(conv->ostate);
 189         conv->ostate = 0;
 190       }
 191       OUTPUT(ch[0]);
 192     }
 193     if (ch[1] != 0) {
 194       OUTPUT(ch[1]);
 195     }
 196     return output;
 197   }
 198   return RET_ILUNI;
 199 }