utf8/utf8.c

   1 /* Utility functions for UTF-8
   2    Copyright (C) 2001, 2002 Peter Verthez
   3
   4    The UTF8 tools library is free software; you can redistribute it
   5    and/or modify it under the terms of the GNU Lesser General Public
   6    License as published by the Free Software Foundation; either
   7    version 2.1 of the License, or (at your option) any later version.
   8
   9    The Gedcom parser library is distributed in the hope that it will be
  10    useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12    Lesser General Public License for more details.
  13
  14    You should have received a copy of the GNU Lesser General Public
  15    License along with the Gedcom parser library; if not, write to the
  16    Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  17    02111-1307 USA.
  18 */
  19
  20 /* $Id$ */
  21 /* $Name$ */
  22
  23 #include "utf8tools.h"
  24 #include <string.h>
  25
  26 int is_utf8_string(const char* str)
  27 {
  28   int expect_bytes = 0;
  29
  30   if (!str) return 0;
  31
  32   while (*str) {
  33     if ((*str & 0x80) == 0) {
  34       /* Looks like an ASCII character */
  35       if (expect_bytes)
  36         /* byte of UTF-8 character expected */
  37         return 0;
  38       else {
  39         /* OK, ASCII character expected */
  40         str++;
  41       }
  42     }
  43     else {
  44       /* Looks like byte of an UTF-8 character */
  45       if (expect_bytes) {
  46         /* expect_bytes already set: first byte of UTF-8 char already seen */
  47         if ((*str & 0xC0) == 0x80) {
  48           /* OK, next byte of UTF-8 character */
  49           /* Decrement number of expected bytes */
  50           expect_bytes--;
  51           str++;
  52         }
  53         else {
  54           /* again first byte ?!?! */
  55           return 0;
  56         }
  57       }
  58       else {
  59         /* First byte of the UTF-8 character */
  60         /* count initial one bits and set expect_bytes to 1 less */
  61         char ch = *str;
  62         while (ch & 0x80) {
  63           expect_bytes++;
  64           ch = (ch & 0x7f) << 1;
  65         }
  66         expect_bytes--;
  67         str++;
  68       }
  69     }
  70   }
  71
  72   return (expect_bytes == 0);
  73 }
  74
  75 int utf8_strlen(const char* str)
  76 {
  77   int num_char = 0;
  78
  79   if (!str) return 0;
  80
  81   while (*str) {
  82     if ((*str & 0xC0) != 0xC0) num_char++;
  83     str++;
  84   }
  85
  86   return num_char;
  87 }
  88
  89 char* next_utf8_char(char* str)
  90 {
  91   if (!str) return NULL;
  92
  93   if (*str) {
  94     str++;
  95     while (*str && (*str & 0xC0) == 0x80)
  96       str++;
  97   }
  98   return str;
  99 }
 100
 101 char* nth_utf8_char(char* str, int n)
 102 {
 103   int num_char = 0;
 104   if (!str) return NULL;
 105
 106   if (*str) {
 107     str++;
 108     while (*str) {
 109       if ((*str & 0xC0) != 0x80) num_char++;
 110       if (num_char == n) break;
 111       str++;
 112     }
 113   }
 114   return str;
 115 }