/* Utility functions for UTF-8
   Copyright (C) 2001, 2002 Peter Verthez

   Permission granted to do anything with this file that you want, as long
   as the above copyright is retained in all copies.
   THERE IS NO WARRANTY - USE AT YOUR OWN RISK
*/

/* $Id$ */
/* $Name$ */

/* Provenance: utf8/utf8.c as introduced by commit
   b254663a41a4da7bc4c9039f89411f4b3bd0b8a7 (Peter Verthez, 22 Nov 2002),
   "Renamed utf8-locale.h to utf8.h.  Added functions is_utf8_string and
   utf8_strlen." */

/* Public interface.  These prototypes mirror the declarations that the
   same commit adds to utf8.h, so that this translation unit is
   self-contained. */

/* Returns 1 if string is valid UTF-8 string, 0 otherwise */
int is_utf8_string(const char* str);

/* Returns -1 if the string is not a valid UTF-8 string, returns its
   string length (in characters, not bytes) otherwise */
int utf8_strlen(const char* str);

/* Returns the length in bytes (1..4) of the well-formed UTF-8 sequence
   that starts at p, or 0 if the bytes at p do not form one.

   p must point into a NUL-terminated buffer.  The function never reads
   past the terminator: a NUL byte is not a valid continuation byte, so
   checking stops at the first one encountered.

   The accepted sequences are exactly those of RFC 3629:
     - stray continuation bytes (0x80..0xBF) in lead position are rejected
     - lead bytes 0xC0, 0xC1 and 0xF5..0xFF are rejected
     - overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and code
       points above U+10FFFF are rejected by narrowing the allowed range
       of the second byte for the lead bytes 0xE0, 0xED, 0xF0 and 0xF4 */
static int utf8_sequence_length(const unsigned char *p)
{
  const unsigned char lead = p[0];
  unsigned char second_min = 0x80;
  unsigned char second_max = 0xBF;
  int len;
  int i;

  if (lead < 0x80) {
    /* Plain ASCII */
    return 1;
  }
  else if (lead >= 0xC2 && lead <= 0xDF) {
    len = 2;
  }
  else if (lead >= 0xE0 && lead <= 0xEF) {
    len = 3;
    if (lead == 0xE0) {
      second_min = 0xA0;   /* below this: overlong 3-byte form */
    }
    else if (lead == 0xED) {
      second_max = 0x9F;   /* above this: UTF-16 surrogate */
    }
  }
  else if (lead >= 0xF0 && lead <= 0xF4) {
    len = 4;
    if (lead == 0xF0) {
      second_min = 0x90;   /* below this: overlong 4-byte form */
    }
    else if (lead == 0xF4) {
      second_max = 0x8F;   /* above this: beyond U+10FFFF */
    }
  }
  else {
    /* Continuation byte in lead position, or a byte that can never
       appear in UTF-8 */
    return 0;
  }

  if (p[1] < second_min || p[1] > second_max) {
    return 0;
  }

  for (i = 2; i < len; i++) {
    if ((p[i] & 0xC0) != 0x80) {
      return 0;
    }
  }

  return len;
}

/* Checks whether str is a well-formed UTF-8 string.

   str: NUL-terminated byte string, may be NULL.

   Returns 1 if every byte of str belongs to a well-formed UTF-8 sequence
   (the empty string is valid), 0 if str is NULL or contains a malformed
   or truncated sequence. */
int is_utf8_string(const char* str)
{
  /* Work on unsigned bytes: the signedness of plain char is
     implementation-defined */
  const unsigned char *p = (const unsigned char *)str;

  if (!str) return 0;

  while (*p) {
    int len = utf8_sequence_length(p);
    if (len == 0) {
      return 0;
    }
    p += len;
  }

  return 1;
}

/* Counts the characters (code points) in the UTF-8 string str.

   str: NUL-terminated byte string, may be NULL.

   Returns 0 if str is NULL, -1 if str is not a well-formed UTF-8 string,
   and the number of characters otherwise. */
int utf8_strlen(const char* str)
{
  const unsigned char *p = (const unsigned char *)str;
  int num_char = 0;

  if (!str) return 0;

  while (*p) {
    int len = utf8_sequence_length(p);
    if (len == 0) {
      return -1;
    }
    num_char++;
    p += len;
  }

  return num_char;
}