changeset 6129:04b9eb27283c HEAD

Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They use a unicodemap.c file generated from UnicodeData.txt.
author Timo Sirainen <tss@iki.fi>
date Fri, 20 Jul 2007 17:25:16 +0300
parents 6d2bee707053
children 9afe3fa4858d
files .hgignore src/lib/Makefile.am src/lib/unichar.c src/lib/unichar.h src/lib/unicodemap.pl
diffstat 5 files changed, 273 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/.hgignore	Fri Jul 20 17:21:53 2007 +0300
+++ b/.hgignore	Fri Jul 20 17:25:16 2007 +0300
@@ -53,6 +53,7 @@
 src/dict/dict
 src/imap-login/imap-login
 src/imap/imap
+src/lib/unicodemap.c
 src/lib-dict/dict-drivers-register.c
 src/lib-sql/sql-drivers-register.c
 src/lib-storage/register/mail-storage-register.c
--- a/src/lib/Makefile.am	Fri Jul 20 17:21:53 2007 +0300
+++ b/src/lib/Makefile.am	Fri Jul 20 17:25:16 2007 +0300
@@ -1,5 +1,13 @@
 noinst_LIBRARIES = liblib.a
 
+BUILT_SOURCES = unicodemap.c
+
+EXTRA_DIST = unicodemap.c
+
+# Generate the Unicode lookup tables. The generator writes to a temporary
+# file first, so that a failed or interrupted run doesn't leave behind a
+# truncated unicodemap.c that make would consider up to date.
+unicodemap.c:
+	test -f UnicodeData.txt || wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
+	perl unicodemap.pl < UnicodeData.txt > $@.tmp
+	mv $@.tmp $@
+
 liblib_a_SOURCES = \
 	backtrace-string.c \
 	base64.c \
--- a/src/lib/unichar.c	Fri Jul 20 17:21:53 2007 +0300
+++ b/src/lib/unichar.c	Fri Jul 20 17:25:16 2007 +0300
@@ -2,8 +2,14 @@
 
 #include "lib.h"
 #include "buffer.h"
+#include "bsearch-insert-pos.h"
 #include "unichar.h"
 
+#include "unicodemap.c"
+
+/* Code point range that uni_utf8_to_decomposed_titlecase() hands over to
+   uni_ucs4_decompose_hangul_utf8() instead of the lookup tables. */
+#define HANGUL_FIRST 0xac00
+#define HANGUL_LAST 0xd7a3
+
 static const uint8_t utf8_non1_bytes[256 - 192 - 2] = {
 	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
 	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
@@ -164,3 +170,118 @@
 	}
 	return len;
 }
+
+/* Search value from the data array of count 16bit numbers. The whole body,
+   including the return statement, comes from the BINARY_NUMBER_SEARCH()
+   macro. NOTE(review): presumably it binary searches an ascending array,
+   stores the position to *idx_r and returns TRUE on an exact match - confirm
+   against the macro definition. The callers in this file use *idx_r only
+   after TRUE was returned. */
+static bool uint16_find(const uint16_t *data, unsigned int count,
+			uint16_t value, unsigned int *idx_r)
+{
+	BINARY_NUMBER_SEARCH(data, count, value, idx_r);
+}
+
+/* Search value from the data array of count 32bit numbers. The whole body,
+   including the return statement, comes from the BINARY_NUMBER_SEARCH()
+   macro. NOTE(review): presumably it binary searches an ascending array,
+   stores the position to *idx_r and returns TRUE on an exact match - confirm
+   against the macro definition. The callers in this file use *idx_r only
+   after TRUE was returned. */
+static bool uint32_find(const uint32_t *data, unsigned int count,
+			uint32_t value, unsigned int *idx_r)
+{
+	BINARY_NUMBER_SEARCH(data, count, value, idx_r);
+}
+
+/* Map a UCS-4 character to its titlecase form using the generated
+   titlecase16_* and titlecase32_* tables. A character that has no entry in
+   the tables is returned unchanged. */
+unichar_t uni_ucs4_to_titlecase(unichar_t chr)
+{
+	unsigned int pos;
+	bool found;
+
+	if (chr > 0xffff) {
+		/* doesn't fit into 16 bits, look it up from the 32bit table */
+		found = uint32_find(titlecase32_keys,
+				    N_ELEMENTS(titlecase32_keys), chr, &pos);
+		return found ? titlecase32_values[pos] : chr;
+	}
+
+	found = uint16_find(titlecase16_keys, N_ELEMENTS(titlecase16_keys),
+			    chr, &pos);
+	return found ? titlecase16_values[pos] : chr;
+}
+
+/* Replace *chr with its single-character decomposition, if the generated
+   uni16_decomp_* or uni32_decomp_* tables have one for it. Returns TRUE if
+   *chr was replaced, FALSE if it was left untouched. */
+static bool uni_ucs4_decompose_uni(unichar_t *chr)
+{
+	unichar_t key = *chr;
+	unsigned int pos;
+
+	if (key > 0xffff) {
+		/* doesn't fit into 16 bits, look it up from the 32bit table */
+		if (!uint32_find(uni32_decomp_keys,
+				 N_ELEMENTS(uni32_decomp_keys), key, &pos))
+			return FALSE;
+		*chr = uni32_decomp_values[pos];
+		return TRUE;
+	}
+
+	if (!uint16_find(uni16_decomp_keys, N_ELEMENTS(uni16_decomp_keys),
+			 key, &pos))
+		return FALSE;
+	*chr = uni16_decomp_values[pos];
+	return TRUE;
+}
+
+/* Decompose the precomposed Hangul syllable chr arithmetically into its
+   leading consonant (L), vowel (V) and optional trailing consonant (T) and
+   pass each of them to uni_ucs4_to_utf8_c() together with the output buffer.
+   The caller must make sure that chr is within HANGUL_FIRST..HANGUL_LAST,
+   otherwise the syllable index below would wrap around. */
+static void uni_ucs4_decompose_hangul_utf8(unichar_t chr, buffer_t *output)
+{
+	/* These are enum constants rather than #defines, so that the names
+	   stay local to this function instead of leaking to the rest of the
+	   file. The names follow the ones used by the Unicode standard. */
+	enum {
+		SBase = HANGUL_FIRST,
+		LBase = 0x1100,
+		VBase = 0x1161,
+		TBase = 0x11A7,
+		VCount = 21,
+		TCount = 28,
+		NCount = VCount * TCount
+	};
+	unsigned int SIndex = chr - SBase;
+	unichar_t L = LBase + SIndex / NCount;
+	unichar_t V = VBase + (SIndex % NCount) / TCount;
+	unichar_t T = TBase + SIndex % TCount;
+
+	uni_ucs4_to_utf8_c(L, output);
+	uni_ucs4_to_utf8_c(V, output);
+	/* T == TBase means that the syllable has no trailing consonant */
+	if (T != TBase)
+		uni_ucs4_to_utf8_c(T, output);
+}
+
+/* If chr has an entry in the generated multidecomp_* tables, pass each
+   character of its zero-terminated decomposition to uni_ucs4_to_utf8_c()
+   together with the output buffer and return TRUE. Otherwise return FALSE
+   without touching output. */
+static bool uni_ucs4_decompose_multi_utf8(unichar_t chr, buffer_t *output)
+{
+	const uint16_t *codep;
+	unsigned int pos;
+
+	/* the key table has only 16bit characters */
+	if (chr > 0xffff ||
+	    !uint16_find(multidecomp_keys, N_ELEMENTS(multidecomp_keys),
+			 chr, &pos))
+		return FALSE;
+
+	/* the offset points to the first character of the decomposition */
+	codep = multidecomp_values + multidecomp_offsets[pos];
+	while (*codep != 0) {
+		uni_ucs4_to_utf8_c(*codep, output);
+		codep++;
+	}
+	return TRUE;
+}
+
+/* Read UTF-8 characters from _input until max_len bytes have been consumed
+   or a NUL byte is reached. Each character is first converted to titlecase
+   and then written to output either as a Hangul decomposition, as a
+   multi-character decomposition, or as a single (possibly decomposed)
+   character. Returns 0 if ok, -1 as soon as uni_utf8_get_char_n() fails. */
+int uni_utf8_to_decomposed_titlecase(const void *_input, size_t max_len,
+				     buffer_t *output)
+{
+	const unsigned char *pos = _input;
+	unsigned int char_len;
+	unichar_t chr;
+
+	while (max_len > 0 && *pos != '\0') {
+		if (uni_utf8_get_char_n(pos, max_len, &chr) <= 0) {
+			/* invalid input */
+			return -1;
+		}
+		char_len = uni_utf8_char_bytes(*pos);
+		pos += char_len;
+		max_len -= char_len;
+
+		chr = uni_ucs4_to_titlecase(chr);
+		if (chr >= HANGUL_FIRST && chr <= HANGUL_LAST) {
+			uni_ucs4_decompose_hangul_utf8(chr, output);
+			continue;
+		}
+		if (!uni_ucs4_decompose_uni(&chr) &&
+		    uni_ucs4_decompose_multi_utf8(chr, output)) {
+			/* the multi-character decomposition was handled */
+			continue;
+		}
+		/* either replaced by a single character decomposition or
+		   there was no decomposition at all */
+		uni_ucs4_to_utf8_c(chr, output);
+	}
+	return 0;
+}
--- a/src/lib/unichar.h	Fri Jul 20 17:21:53 2007 +0300
+++ b/src/lib/unichar.h	Fri Jul 20 17:25:16 2007 +0300
@@ -31,4 +31,13 @@
 	return uni_utf8_non1_bytes[(uint8_t)chr - (192 + 2)];
 }
 
+/* Return given character in titlecase. Characters that have no titlecase
+   mapping are returned as is. */
+unichar_t uni_ucs4_to_titlecase(unichar_t chr);
+
+/* Convert UTF-8 input to titlecase and decompose the titlecase characters to
+   output buffer. At most max_len bytes of input are read, and reading also
+   stops at the first NUL byte. Returns 0 if ok, -1 if input was invalid; in
+   that case the characters preceding the invalid one may already have been
+   written to output. This generates output that's compatible with
+   i;unicode-casemap comparator. */
+int uni_utf8_to_decomposed_titlecase(const void *input, size_t max_len,
+				     buffer_t *output);
+
 #endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/unicodemap.pl	Fri Jul 20 17:25:16 2007 +0300
@@ -0,0 +1,134 @@
+#!/usr/bin/env perl
+use strict;
+
+# Lookup tables collected from UnicodeData.txt (read from stdin or from the
+# files given as arguments). Entries are stored in input order.
+my (@titlecase16_keys, @titlecase16_values);
+my (@titlecase32_keys, @titlecase32_values);
+my (@uni16_decomp_keys, @uni16_decomp_values);
+my (@uni32_decomp_keys, @uni32_decomp_values);
+my (@multidecomp_keys, @multidecomp_offsets, @multidecomp_values);
+while (<>) {
+  chomp $_;
+  my @arr = split(";");
+  # skip blank or short lines, which can't have a decomposition or titlecase
+  next if (scalar(@arr) < 6);
+
+  # hex() is used instead of eval(), so input is never executed as code
+  my $code = hex($arr[0]);
+  my $decomp = $arr[5];
+  # split() drops trailing empty fields, so the titlecase field may be missing
+  my $titlecode = defined($arr[14]) ? $arr[14] : "";
+  my $titlevalue = ($titlecode ne "") ? hex($titlecode) : $code;
+
+  if ($titlevalue != $code) {
+    # titlecase mapping. decomposition of this character isn't needed,
+    # because it's always translated to another character first.
+    if ($code <= 0xffff && $titlevalue <= 0xffff) {
+      push @titlecase16_keys, $code;
+      push @titlecase16_values, $titlevalue;
+    } else {
+      push @titlecase32_keys, $code;
+      push @titlecase32_values, $titlevalue;
+    }
+  } elsif ($decomp =~ /\<[^>]*> (.+)/) {
+    # decompositions. this branch is reached also by characters whose
+    # titlecase is the character itself: they're what the titlecase
+    # translation produces, so their decomposition must be in the tables.
+    my $decomp_codes = $1;
+    if ($decomp_codes =~ /^([0-9A-F]+)$/i) {
+      # unicharacter decomposition. use separate lists for this
+      my $value = hex($1);
+      if ($value > 0xffff) {
+        # die exits with non-zero status, so the build notices the failure
+        die "We've assumed decomposition codes are max. 16bit\n";
+      }
+      if ($code <= 0xffff) {
+        push @uni16_decomp_keys, $code;
+        push @uni16_decomp_values, $value;
+      } else {
+        push @uni32_decomp_keys, $code;
+        push @uni32_decomp_values, $value;
+      }
+    } else {
+      # multicharacter decomposition.
+      if ($code > 0xffff) {
+        die "We've assumed multi-decomposition key codes are max. 16bit\n";
+      }
+
+      # offset of this key's first value in the zero-terminated value runs
+      push @multidecomp_keys, $code;
+      push @multidecomp_offsets, scalar(@multidecomp_values);
+
+      foreach my $dcode (split(" ", $decomp_codes)) {
+        my $value = hex($dcode);
+        if ($value > 0xffff) {
+          die "We've assumed decomposition codes are max. 16bit\n";
+        }
+        push @multidecomp_values, $value;
+      }
+      push @multidecomp_values, 0;
+    }
+  }
+}
+
+# Print the numbers in the referenced list as 0x%04x hex constants separated
+# by commas, starting a new tab-indented line after every 8th number. Nothing
+# is printed after the last number.
+sub print_list {
+  my ($list) = @_;
+
+  my $count = scalar(@$list);
+  for (my $i = 0; $i < $count; $i++) {
+    printf("0x%04x", $list->[$i]);
+    # no separator after the last number
+    next if ($i == $count - 1);
+
+    print ",";
+    print ((($i + 1) % 8) == 0 ? "\n\t" : " ");
+  }
+}
+
+print "/* This file is automatically generated by unicodemap.pl from UnicodeData.txt
+
+   NOTE: decompositions for characters having titlecase characters
+   are not included, because we first translate everything to titlecase */\n";
+
+# The generated tables, in output order: C element type, array name and the
+# list holding the contents. They're emitted as const, since unichar.c only
+# ever reads them.
+my @tables = (
+  [ "uint16_t", "titlecase16_keys", \@titlecase16_keys ],
+  [ "uint16_t", "titlecase16_values", \@titlecase16_values ],
+  [ "uint32_t", "titlecase32_keys", \@titlecase32_keys ],
+  [ "uint32_t", "titlecase32_values", \@titlecase32_values ],
+  [ "uint16_t", "uni16_decomp_keys", \@uni16_decomp_keys ],
+  [ "uint16_t", "uni16_decomp_values", \@uni16_decomp_values ],
+  [ "uint32_t", "uni32_decomp_keys", \@uni32_decomp_keys ],
+  # the generator guarantees that decomposition values fit into 16 bits
+  [ "uint16_t", "uni32_decomp_values", \@uni32_decomp_values ],
+  [ "uint16_t", "multidecomp_keys", \@multidecomp_keys ],
+  [ "uint16_t", "multidecomp_offsets", \@multidecomp_offsets ],
+  [ "uint16_t", "multidecomp_values", \@multidecomp_values ]
+);
+
+foreach my $table (@tables) {
+  my ($type, $name, $list) = @$table;
+
+  print "static const " . $type . " " . $name . "[] = {\n\t";
+  print_list($list);
+  print "\n};\n";
+}