Mercurial > dovecot > core-2.2
changeset 6129:04b9eb27283c HEAD
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
use a unicodemap.c file generated from UnicodeData.txt.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Fri, 20 Jul 2007 17:25:16 +0300 |
parents | 6d2bee707053 |
children | 9afe3fa4858d |
files | .hgignore src/lib/Makefile.am src/lib/unichar.c src/lib/unichar.h src/lib/unicodemap.pl |
diffstat | 5 files changed, 273 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/.hgignore Fri Jul 20 17:21:53 2007 +0300 +++ b/.hgignore Fri Jul 20 17:25:16 2007 +0300 @@ -53,6 +53,7 @@ src/dict/dict src/imap-login/imap-login src/imap/imap +src/lib/unicodemap.c src/lib-dict/dict-drivers-register.c src/lib-sql/sql-drivers-register.c src/lib-storage/register/mail-storage-register.c
--- a/src/lib/Makefile.am	Fri Jul 20 17:21:53 2007 +0300
+++ b/src/lib/Makefile.am	Fri Jul 20 17:25:16 2007 +0300
@@ -1,5 +1,17 @@
 noinst_LIBRARIES = liblib.a
 
+BUILT_SOURCES = unicodemap.c
+
+EXTRA_DIST = unicodemap.c
+
+# Generate the Unicode tables. The output goes first to a temporary file, so
+# that a failed run can't leave a truncated unicodemap.c behind that make
+# would then treat as up to date (the rule has no prerequisites).
+unicodemap.c:
+	test -f UnicodeData.txt || wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
+	perl unicodemap.pl < UnicodeData.txt > $@.tmp
+	mv $@.tmp $@
+
 liblib_a_SOURCES = \
 	backtrace-string.c \
 	base64.c \
--- a/src/lib/unichar.c	Fri Jul 20 17:21:53 2007 +0300
+++ b/src/lib/unichar.c	Fri Jul 20 17:25:16 2007 +0300
@@ -2,8 +2,14 @@
 
 #include "lib.h"
 #include "buffer.h"
+#include "bsearch-insert-pos.h"
 #include "unichar.h"
 
+#include "unicodemap.c"
+
+#define HANGUL_FIRST 0xac00
+#define HANGUL_LAST 0xd7a3
+
 static const uint8_t utf8_non1_bytes[256 - 192 - 2] = {
 	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
 	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
@@ -164,3 +170,141 @@
 	}
 	return len;
 }
+
+/* Look up value from a 16bit key table with BINARY_NUMBER_SEARCH(). The
+   macro is expected to provide the return statement: TRUE with *idx_r set
+   to the matching index, or FALSE if the value isn't in the table. */
+static bool uint16_find(const uint16_t *data, unsigned int count,
+			uint16_t value, unsigned int *idx_r)
+{
+	BINARY_NUMBER_SEARCH(data, count, value, idx_r);
+}
+
+/* Same as uint16_find(), but for 32bit key tables. */
+static bool uint32_find(const uint32_t *data, unsigned int count,
+			uint32_t value, unsigned int *idx_r)
+{
+	BINARY_NUMBER_SEARCH(data, count, value, idx_r);
+}
+
+unichar_t uni_ucs4_to_titlecase(unichar_t chr)
+{
+	unsigned int idx;
+
+	/* characters that fit into 16 bits have their own, smaller tables */
+	if (chr <= 0xffff) {
+		if (!uint16_find(titlecase16_keys, N_ELEMENTS(titlecase16_keys),
+				 chr, &idx))
+			return chr;
+		return titlecase16_values[idx];
+	}
+
+	if (!uint32_find(titlecase32_keys, N_ELEMENTS(titlecase32_keys),
+			 chr, &idx))
+		return chr;
+	return titlecase32_values[idx];
+}
+
+/* Replace *chr with its single-character decomposition. Returns TRUE if
+   the character was replaced, FALSE if it has no such decomposition
+   (*chr is left untouched). */
+static bool uni_ucs4_decompose_uni(unichar_t *chr)
+{
+	unsigned int idx;
+
+	if (*chr <= 0xffff) {
+		if (!uint16_find(uni16_decomp_keys,
+				 N_ELEMENTS(uni16_decomp_keys),
+				 *chr, &idx))
+			return FALSE;
+		*chr = uni16_decomp_values[idx];
+	} else {
+		if (!uint32_find(uni32_decomp_keys,
+				 N_ELEMENTS(uni32_decomp_keys),
+				 *chr, &idx))
+			return FALSE;
+		*chr = uni32_decomp_values[idx];
+	}
+	return TRUE;
+}
+
+/* Decompose a precomposed Hangul syllable (HANGUL_FIRST..HANGUL_LAST)
+   arithmetically into its L, V and optional T parts (the names follow the
+   Unicode standard's algorithm) and append them to output as UTF-8. */
+static void uni_ucs4_decompose_hangul_utf8(unichar_t chr, buffer_t *output)
+{
+	/* Block-scoped constants instead of #defines, so that these short
+	   names don't leak into the rest of the file. */
+	enum {
+		SBase = HANGUL_FIRST,
+		LBase = 0x1100,
+		VBase = 0x1161,
+		TBase = 0x11A7,
+		VCount = 21,
+		TCount = 28,
+		NCount = VCount * TCount
+	};
+	unsigned int SIndex = chr - SBase;
+	unichar_t L = LBase + SIndex / NCount;
+	unichar_t V = VBase + (SIndex % NCount) / TCount;
+	unichar_t T = TBase + SIndex % TCount;
+
+	uni_ucs4_to_utf8_c(L, output);
+	uni_ucs4_to_utf8_c(V, output);
+	/* T == TBase means that the syllable has no T part */
+	if (T != TBase)
+		uni_ucs4_to_utf8_c(T, output);
+}
+
+/* Append the multi-character decomposition of chr to output as UTF-8.
+   Returns FALSE, writing nothing, if chr has no such decomposition. */
+static bool uni_ucs4_decompose_multi_utf8(unichar_t chr, buffer_t *output)
+{
+	const uint16_t *value;
+	unsigned int idx;
+
+	/* the generated key table contains only 16bit characters */
+	if (chr > 0xffff)
+		return FALSE;
+
+	if (!uint16_find(multidecomp_keys, N_ELEMENTS(multidecomp_keys),
+			 chr, &idx))
+		return FALSE;
+
+	/* each decomposition is a 0-terminated run in multidecomp_values */
+	value = &multidecomp_values[multidecomp_offsets[idx]];
+	for (; *value != 0; value++)
+		uni_ucs4_to_utf8_c(*value, output);
+	return TRUE;
+}
+
+int uni_utf8_to_decomposed_titlecase(const void *_input, size_t max_len,
+				     buffer_t *output)
+{
+	const unsigned char *input = _input;
+	unsigned int bytes;
+	unichar_t chr;
+
+	while (max_len > 0 && *input != '\0') {
+		if (uni_utf8_get_char_n(input, max_len, &chr) <= 0) {
+			/* invalid input */
+			return -1;
+		}
+		bytes = uni_utf8_char_bytes(*input);
+		input += bytes;
+		max_len -= bytes;
+
+		chr = uni_ucs4_to_titlecase(chr);
+		if (chr >= HANGUL_FIRST && chr <= HANGUL_LAST) {
+			uni_ucs4_decompose_hangul_utf8(chr, output);
+		} else if (uni_ucs4_decompose_uni(&chr) ||
+			   !uni_ucs4_decompose_multi_utf8(chr, output)) {
+			/* chr is now either its single-character
+			   decomposition or a character without any
+			   decomposition. Multi-character decompositions
+			   were already written by the call above. */
+			uni_ucs4_to_utf8_c(chr, output);
+		}
+	}
+	return 0;
+}
--- a/src/lib/unichar.h Fri Jul 20 17:21:53 2007 +0300 +++ b/src/lib/unichar.h Fri Jul 20 17:25:16 2007 +0300 @@ -31,4 +31,13 @@ return uni_utf8_non1_bytes[(uint8_t)chr - (192 + 2)]; } +/* Return given character in titlecase. */ +unichar_t uni_ucs4_to_titlecase(unichar_t chr); + +/* Convert UTF-8 input to titlecase and decompose the titlecase characters to + output buffer. Returns 0 if ok, -1 if input was invalid. This generates + output that's compatible with i;unicode-casemap comparator. */ +int uni_utf8_to_decomposed_titlecase(const void *input, size_t max_len, + buffer_t *output); + #endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/unicodemap.pl	Fri Jul 20 17:25:16 2007 +0300
@@ -0,0 +1,120 @@
+#!/usr/bin/env perl
+# Generates the Unicode lookup tables for unichar.c: reads UnicodeData.txt
+# from stdin and prints the titlecase and decomposition tables as C arrays
+# to stdout.
+use strict;
+use warnings;
+
+my (@titlecase16_keys, @titlecase16_values);
+my (@titlecase32_keys, @titlecase32_values);
+my (@uni16_decomp_keys, @uni16_decomp_values);
+my (@uni32_decomp_keys, @uni32_decomp_values);
+my (@multidecomp_keys, @multidecomp_offsets, @multidecomp_values);
+while (<>) {
+	chomp $_;
+	next if ($_ eq "");
+	# limit -1 keeps the trailing empty fields
+	my @arr = split(/;/, $_, -1);
+	if (scalar(@arr) < 15) {
+		die "Invalid line $. (expected 15 ';'-separated fields)\n";
+	}
+	# hex() instead of eval("0x..."): the data file may have just been
+	# downloaded, so its contents must never be run as Perl code.
+	my $code = hex($arr[0]);
+	my $decomp = $arr[5];
+	my $titlecode = $arr[14];
+
+	if ($titlecode ne "" && hex($titlecode) != $code) {
+		# titlecase mapping to a different character
+		my $value = hex($titlecode);
+		if ($code <= 0xffff && $value <= 0xffff) {
+			push @titlecase16_keys, $code;
+			push @titlecase16_values, $value;
+		} else {
+			push @titlecase32_keys, $code;
+			push @titlecase32_values, $value;
+		}
+	} elsif ($decomp =~ /\<[^>]*> (.+)/) {
+		# decompositions. characters that are their own titlecase
+		# also end up here, because titlecasing leaves them as they
+		# are and so they still have to be decomposed.
+		my $decomp_codes = $1;
+		if ($decomp_codes =~ /^([0-9A-F]+)$/i) {
+			# unicharacter decomposition. use separate lists for this
+			my $value = hex($1);
+			if ($value > 0xffff) {
+				die "We've assumed decomposition codes are max. 16bit\n";
+			}
+			if ($code <= 0xffff) {
+				push @uni16_decomp_keys, $code;
+				push @uni16_decomp_values, $value;
+			} else {
+				push @uni32_decomp_keys, $code;
+				push @uni32_decomp_values, $value;
+			}
+		} else {
+			# multicharacter decomposition.
+			if ($code > 0xffff) {
+				die "We've assumed multi-decomposition key codes are max. 16bit\n";
+			}
+			if (scalar(@multidecomp_values) > 0xffff) {
+				die "We've assumed multi-decomposition offsets are max. 16bit\n";
+			}
+
+			push @multidecomp_keys, $code;
+			push @multidecomp_offsets, scalar(@multidecomp_values);
+
+			foreach my $dcode (split(" ", $decomp_codes)) {
+				my $value = hex($dcode);
+				if ($value > 0xffff) {
+					die "We've assumed decomposition codes are max. 16bit\n";
+				}
+				push @multidecomp_values, $value;
+			}
+			# 0 terminates the decomposition
+			push @multidecomp_values, 0;
+		}
+	}
+}
+
+# Print the list as comma-separated hex numbers, eight per line.
+sub print_list {
+	my ($list) = @_;
+
+	my $n = 0;
+	foreach my $key (@$list) {
+		if ($n > 0) {
+			print((($n % 8) == 0) ? ",\n\t" : ", ");
+		}
+		printf("0x%04x", $key);
+		$n++;
+	}
+}
+
+# Print the list as a C array. The arrays are const, because they're only
+# ever read.
+sub print_table {
+	my ($type, $name, $list) = @_;
+
+	print "static const ", $type, " ", $name, "[] = {\n\t";
+	print_list($list);
+	print "\n};\n";
+}
+
+print "/* This file is automatically generated by unicodemap.pl from UnicodeData.txt
+
+   NOTE: decompositions for characters having a different titlecase
+   character are not included, because we first translate everything to
+   titlecase */\n";
+
+print_table("uint16_t", "titlecase16_keys", \@titlecase16_keys);
+print_table("uint16_t", "titlecase16_values", \@titlecase16_values);
+print_table("uint32_t", "titlecase32_keys", \@titlecase32_keys);
+print_table("uint32_t", "titlecase32_values", \@titlecase32_values);
+print_table("uint16_t", "uni16_decomp_keys", \@uni16_decomp_keys);
+print_table("uint16_t", "uni16_decomp_values", \@uni16_decomp_values);
+print_table("uint32_t", "uni32_decomp_keys", \@uni32_decomp_keys);
+print_table("uint16_t", "uni32_decomp_values", \@uni32_decomp_values);
+print_table("uint16_t", "multidecomp_keys", \@multidecomp_keys);
+print_table("uint16_t", "multidecomp_offsets", \@multidecomp_offsets);
+print_table("uint16_t", "multidecomp_values", \@multidecomp_values);