changeset 5683:8101787cdd1c HEAD

Rewrote some code and cleaned up the API
author Timo Sirainen <tss@iki.fi>
date Mon, 11 Jun 2007 04:37:29 +0300
parents ff5ba9cb6cd0
children d7302155b97f
files src/lib/unichar.c src/lib/unichar.h
diffstat 2 files changed, 122 insertions(+), 189 deletions(-) [+]
line wrap: on
line diff
--- a/src/lib/unichar.c	Mon Jun 11 02:27:55 2007 +0300
+++ b/src/lib/unichar.c	Mon Jun 11 04:37:29 2007 +0300
@@ -1,49 +1,15 @@
-/* Copyright (C) 2005 Timo Sirainen */
-
-/* Contains code from GLIB:
- *
- * Copyright (C) 1999 Tom Tromey
- * Copyright (C) 2000 Red Hat, Inc.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307, USA.
- */
+/* Copyright (C) 2005-2007 Timo Sirainen */
 
 #include "lib.h"
 #include "buffer.h"
 #include "unichar.h"
 
-#define UTF8_LENGTH(Char)              \
-  ((Char) < 0x80 ? 1 :                 \
-   ((Char) < 0x800 ? 2 :               \
-    ((Char) < 0x10000 ? 3 :            \
-     ((Char) < 0x200000 ? 4 :          \
-      ((Char) < 0x4000000 ? 5 : 6)))))
-
-static const char utf8_skip_data[256] = {
-  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
-  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
+static const uint8_t utf8_non1_bytes[256 - 192 - 2] = {
+	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
 };
 
-const char *const uni_utf8_skip = utf8_skip_data;
+const uint8_t *const uni_utf8_non1_bytes = utf8_non1_bytes;
 
 unsigned int uni_strlen(const unichar_t *str)
 {
@@ -54,146 +20,71 @@
 	return len;
 }
 
-unichar_t uni_utf8_get_char(const char *input)
+int uni_utf8_get_char(const char *input, unichar_t *chr_r)
 {
-	return uni_utf8_get_char_len((const unsigned char *)input, (size_t)-1);
+	return uni_utf8_get_char_n((const unsigned char *)input, (size_t)-1,
+				   chr_r);
 }
 
-unichar_t uni_utf8_get_char_len(const unsigned char *input, size_t max_len)
+int uni_utf8_get_char_n(const void *_input, size_t max_len, unichar_t *chr_r)
 {
-  unsigned int i, len;
-  unichar_t wc = *input;
-
-  i_assert(max_len > 0);
+	const unsigned char *input = _input;
+	unichar_t chr;
+	unsigned int i, len;
+	int ret;
 
-  if (wc < 0x80)
-    {
-      return wc;
-    }
-  else if (wc < 0xc0)
-    {
-      return (unichar_t)-1;
-    }
-  else if (wc < 0xe0)
-    {
-      len = 2;
-      wc &= 0x1f;
-    }
-  else if (wc < 0xf0)
-    {
-      len = 3;
-      wc &= 0x0f;
-    }
-  else if (wc < 0xf8)
-    {
-      len = 4;
-      wc &= 0x07;
-    }
-  else if (wc < 0xfc)
-    {
-      len = 5;
-      wc &= 0x03;
-    }
-  else if (wc < 0xfe)
-    {
-      len = 6;
-      wc &= 0x01;
-    }
-  else
-    {
-      return (unichar_t)-1;
-    }
+	i_assert(max_len > 0);
 
-  if (max_len != (size_t)-1 && len > max_len)
-    {
-      for (i = 1; i < max_len; i++)
-	{
-	  if ((input[i] & 0xc0) != 0x80)
-	    return (unichar_t)-1;
-	}
-      return (unichar_t)-2;
-    }
-
-  for (i = 1; i < len; ++i)
-    {
-      if ((input[i] & 0xc0) != 0x80)
-	{
-	  if (input[i] != '\0')
-	    return (unichar_t)-1;
-	  else
-	    return (unichar_t)-2;
+	if (*input < 0x80) {
+		*chr_r = *input;
+		return 1;
 	}
 
-      wc <<= 6;
-      wc |= (input[i] & 0x3f);
-    }
-
-  if (UTF8_LENGTH(wc) != len)
-    return (unichar_t)-1;
-  
-  return wc;
-}
-
-/**
- * g_unichar_to_utf8:
- * @c: a ISO10646 character code
- * @outbuf: output buffer, must have at least 6 bytes of space.
- *       If %NULL, the length will be computed and returned
- *       and nothing will be written to @outbuf.
- * 
- * Converts a single character to UTF-8.
- * 
- * Return value: number of bytes written
- **/
-static int
-g_unichar_to_utf8(unichar_t c, char *outbuf)
-{
-  unsigned int len = 0;
-  int first;
-  int i;
+	/* first byte has len highest bits set, followed by zero bit.
+	   the rest of the bits are used as the highest bits of the value. */
+	chr = *input;
+	len = uni_utf8_char_bytes(*input);
+	switch (len) {
+	case 2:
+		chr &= 0x1f;
+		break;
+	case 3:
+		chr &= 0x0f;
+		break;
+	case 4:
+		chr &= 0x07;
+		break;
+	case 5:
+		chr &= 0x03;
+		break;
+	case 6:
+		chr &= 0x01;
+		break;
+	default:
+		/* 7bit chars returned above; len==1 here is invalid */
+		i_assert(len == 1);
+		return -1;
+	}
 
-  if (c < 0x80)
-    {
-      first = 0;
-      len = 1;
-    }
-  else if (c < 0x800)
-    {
-      first = 0xc0;
-      len = 2;
-    }
-  else if (c < 0x10000)
-    {
-      first = 0xe0;
-      len = 3;
-    }
-   else if (c < 0x200000)
-    {
-      first = 0xf0;
-      len = 4;
-    }
-  else if (c < 0x4000000)
-    {
-      first = 0xf8;
-      len = 5;
-    }
-  else
-    {
-      first = 0xfc;
-      len = 6;
-    }
+	if (len <= max_len)
+		ret = 1;
+	else {
+		/* check first if the input is invalid before returning 0 */
+		ret = 0;
+		len = max_len;
+	}
 
-  if (outbuf)
-    {
-      for (i = len - 1; i > 0; --i)
-	{
-	  outbuf[i] = (c & 0x3f) | 0x80;
-	  c >>= 6;
+	/* the following bytes must all be 10xxxxxx */
+	for (i = 1; i < len; i++) {
+		if ((input[i] & 0xc0) != 0x80)
+			return input[i] == '\0' ? 0 : -1;
+
+		chr <<= 6;
+		chr |= input[i] & 0x3f;
 	}
-      outbuf[0] = c | first;
-    }
 
-  return len;
+	*chr_r = chr;
+	return ret;
 }
 
 int uni_utf8_to_ucs4(const char *input, buffer_t *output)
@@ -201,12 +92,11 @@
 	unichar_t chr;
 
 	while (*input != '\0') {
-		chr = uni_utf8_get_char(input);
-		if (chr & 0x80000000) {
+		if (uni_utf8_get_char(input, &chr) <= 0) {
 			/* invalid input */
 			return -1;
 		}
-                input = uni_utf8_next_char(input);
+                input += uni_utf8_char_bytes(*input);
 
 		buffer_append(output, &chr, sizeof(chr));
 	}
@@ -215,24 +105,59 @@
 
 void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output)
 {
-	void *buf;
-	int char_len;
-
-	for (; *input != '\0' && len > 0; input++, len--) {
-		buf = buffer_append_space_unsafe(output, 6);
-		char_len = g_unichar_to_utf8(*input, buf);
-		buffer_set_used_size(output, output->used - 6 + char_len);
-	}
+	for (; *input != '\0' && len > 0; input++, len--)
+		uni_ucs4_to_utf8_c(*input, output);
 }
 
-unsigned int uni_utf8_strlen_n(const void *input, size_t size)
+void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output)
 {
-	const uint8_t *data = (const uint8_t *)input;
+	unsigned char first;
+	int bitpos;
+
+	if (chr < 0x80) {
+		buffer_append_c(output, chr);
+		return;
+	}
+
+	i_assert(chr <= 0x40000000); /* 1 << (5 * 6) */
+
+	if (chr < (1 << (6 + 5))) {
+		/* 110xxxxx */
+		bitpos = 6;
+		first = 0x80 | 0x40;
+	} else if (chr < (1 << ((2*6) + 4))) {
+		/* 1110xxxx */
+		bitpos = 2*6;
+		first = 0x80 | 0x40 | 0x20;
+	} else if (chr < (1 << ((3*6) + 3))) {
+		/* 11110xxx */
+		bitpos = 3*6;
+		first = 0x80 | 0x40 | 0x20 | 0x10;
+	} else if (chr < (1 << ((4*6) + 2))) {
+		/* 111110xx */
+		bitpos = 4*6;
+		first = 0x80 | 0x40 | 0x20 | 0x10 | 0x08;
+	} else {
+		/* 1111110x */
+		bitpos = 5*6;
+		first = 0x80 | 0x40 | 0x20 | 0x10 | 0x08 | 0x04;
+	}
+	buffer_append_c(output, first | (chr >> bitpos));
+
+	do {
+		bitpos -= 6;
+		buffer_append_c(output, 0x80 | ((chr >> bitpos) & 0x3f));
+	} while (bitpos > 0);
+}
+
+unsigned int uni_utf8_strlen_n(const void *_input, size_t size)
+{
+	const unsigned char *input = _input;
 	unsigned int len = 0;
 	size_t i;
 
-	for (i = 0; i < size && data[i] != '\0'; ) {
-		i += uni_utf8_skip[data[i]];
+	for (i = 0; i < size && input[i] != '\0'; ) {
+		i += uni_utf8_char_bytes(input[i]);
 		if (i > size)
 			break;
 		len++;
--- a/src/lib/unichar.h	Mon Jun 11 02:27:55 2007 +0300
+++ b/src/lib/unichar.h	Mon Jun 11 04:37:29 2007 +0300
@@ -3,7 +3,7 @@
 
 typedef uint32_t unichar_t;
 
-extern const char *const uni_utf8_skip;
+extern const uint8_t *const uni_utf8_non1_bytes;
 
 /* Returns number of characters in a NUL-terminated unicode string */
 unsigned int uni_strlen(const unichar_t *str);
@@ -12,15 +12,23 @@
 int uni_utf8_to_ucs4(const char *input, buffer_t *output);
 /* Translates UCS-4 input to UTF-8 output. */
 void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output);
+void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output);
 
-/* Returns the next UTF-8 character, or (unichar_t)-1 for invalid input and
-   (unichar_t)-2 for incomplete trailing character. */
-unichar_t uni_utf8_get_char(const char *input);
-unichar_t uni_utf8_get_char_len(const unsigned char *input, size_t max_len);
+/* Returns 1 if *chr_r is set, 0 for incomplete trailing character,
+   -1 for invalid input. */
+int uni_utf8_get_char(const char *input, unichar_t *chr_r);
+int uni_utf8_get_char_n(const void *input, size_t max_len, unichar_t *chr_r);
 /* Returns UTF-8 string length with maximum input size. */
 unsigned int uni_utf8_strlen_n(const void *input, size_t size);
 
-#define uni_utf8_next_char(p) \
-	((p) + uni_utf8_skip[*(const uint8_t *)(p)])
+/* Returns the number of bytes in the UTF-8 character starting with the
+   given first byte. An invalid first byte is returned with length 1. */
+static inline unsigned int uni_utf8_char_bytes(char chr)
+{
+	/* 0x00 .. 0x7f are ASCII. 0x80 .. 0xC1 are invalid as a first byte. */
+	if ((uint8_t)chr < (192 + 2))
+		return 1;
+	return uni_utf8_non1_bytes[(uint8_t)chr - (192 + 2)];
+}
 
 #endif