Mercurial > dovecot > core-2.2
changeset 18720:309863bb69cc
lib-fts: Rewrite ICU handling functions.
Some of the changes:
- Use buffers instead of allocating everything from data stack.
- Optimistically attempt to write the data directly to the buffers without
first calculating their size. If the data doesn't fit, grow the buffer and retry.
- Use u_strFromUTF8Lenient() instead of u_strFromUTF8(). Our input is
already supposed to be valid UTF-8, although we don't check if all code
points are valid, while u_strFromUTF8() does check them and returns failures.
We don't really care whether the code points are valid, and
u_strFromUTF8Lenient() passes through everything.
Added unit tests to make sure all the functions work as intended and all the
UTF-8 input passes through them successfully.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Sat, 16 May 2015 18:47:20 +0300 |
parents | 3deb3fd654c6 |
children | 9809f68aaa36 |
files | src/lib-fts/Makefile.am src/lib-fts/fts-filter-normalizer-icu.c src/lib-fts/fts-icu.c src/lib-fts/fts-icu.h src/lib-fts/test-fts-filter.c src/lib-fts/test-fts-icu.c |
diffstat | 6 files changed, 352 insertions(+), 139 deletions(-) [+] |
line wrap: on
line diff
--- a/src/lib-fts/Makefile.am Sat May 16 18:41:44 2015 +0300 +++ b/src/lib-fts/Makefile.am Sat May 16 18:47:20 2015 +0300 @@ -50,7 +50,9 @@ endif if BUILD_LIBICU +ICU_SOURCES = fts-icu.c NORMALIZER_LIBS = $(LIBICU_LIBS) +ICU_TESTS = test-fts-icu endif libfts_la_LIBADD = \ @@ -67,19 +69,22 @@ fts-language.c \ fts-tokenizer.c \ fts-tokenizer-address.c \ - fts-tokenizer-generic.c + fts-tokenizer-generic.c \ + $(ICU_SOURCES) noinst_HEADERS = \ fts-filter.h \ fts-filter-private.h \ + fts-icu.h \ fts-language.h \ fts-tokenizer.h \ fts-tokenizer-private.h \ fts-tokenizer-generic-private.h test_programs = \ + $(ICU_TESTS) \ + $(TEST_FTS_LANGUAGE) \ test-fts-filter \ - $(TEST_FTS_LANGUAGE) \ test-fts-tokenizer noinst_PROGRAMS = $(test_programs) @@ -89,6 +94,10 @@ ../lib/liblib.la test_deps = $(noinst_LTLIBRARIES) $(test_libs) +test_fts_icu_SOURCES = test-fts-icu.c +test_fts_icu_LDADD = fts-icu.lo $(LIBICU_LIBS) $(test_libs) +test_fts_icu_DEPENDENCIES = fts-icu.lo $(test_deps) + test_fts_filter_SOURCES = test-fts-filter.c test_fts_filter_LDADD = libfts.la $(test_libs) test_fts_filter_DEPENDENCIES = libfts.la $(test_deps)
--- a/src/lib-fts/fts-filter-normalizer-icu.c Sat May 16 18:41:44 2015 +0300 +++ b/src/lib-fts/fts-filter-normalizer-icu.c Sat May 16 18:47:20 2015 +0300 @@ -8,111 +8,19 @@ #include "fts-language.h" #ifdef HAVE_LIBICU - -#include <unicode/utrans.h> -#include <unicode/uenum.h> -#include <unicode/ustring.h> -#include <unicode/ucnv.h> -#include <stdlib.h> +#include "fts-icu.h" struct fts_filter_normalizer_icu { struct fts_filter filter; pool_t pool; const char *transliterator_id; - UTransliterator *transliterator; -}; - -/* Helper to create UTF16, which libicu wants as input. - - On input, if *dst_uchars_r > 0, it indicates the number of UChar - sized units that should be allocated for the text. However, the - function will not use the number, if the text will not fit in that - amount. - - On return *dst_uchars_r will contain the number of UChar sized units - allocated for the dst. NOT the number of bytes nor the length of the - text. */ -static void make_uchar(const char *src, UChar **dst, int32_t *dst_uchars_r) -{ - UErrorCode err = U_ZERO_ERROR; - int32_t len = strlen(src); - int32_t ustr_len = 0; - int32_t ustr_len_actual = 0; - UChar *retp = NULL; - int32_t alloc_uchars = 0; - - i_assert(dst_uchars_r != NULL); - - /* Check length required for encoded dst. */ - retp = u_strFromUTF8(NULL, 0, &ustr_len, src, len, &err); - - /* When preflighting a successful call returns a buffer overflow - error. 
*/ - if (U_BUFFER_OVERFLOW_ERROR != err && U_FAILURE(err)) { - i_panic("Failed to estimate allocation size with lib ICU" - " u_strFromUTF8(): %s",u_errorName(err)); - } - i_assert(NULL == retp); - - err = U_ZERO_ERROR; - if (*dst_uchars_r > 0 && *dst_uchars_r > ustr_len) - alloc_uchars = *dst_uchars_r; - else - alloc_uchars = ustr_len; - alloc_uchars++; /* room for null bytes(2) */ - *dst = t_malloc(alloc_uchars * sizeof(UChar)); - *dst_uchars_r = alloc_uchars; - retp = u_strFromUTF8(*dst, alloc_uchars, &ustr_len_actual, - src, len, &err); + const UChar *transliterator_id_utf16; + unsigned int transliterator_id_utf16_len; - if (U_FAILURE(err)) - i_panic("Lib ICU u_strFromUTF8 failed: %s", u_errorName(err)); - i_assert(retp == *dst); - i_assert(ustr_len == ustr_len_actual); -} - -static void make_utf8(const UChar *src, const char **_dst) -{ - char *dst; - char *retp = NULL; - int32_t dsize = 0; - int32_t dsize_actual = 0; - int32_t sub_num = 0; - UErrorCode err = U_ZERO_ERROR; - int32_t usrc_len = u_strlen(src); /* libicu selects different codepaths - depending if srclen -1 or not */ - - retp = u_strToUTF8WithSub(NULL, 0, &dsize, src, usrc_len, - UNICODE_REPLACEMENT_CHAR, &sub_num, &err); - - /* Preflighting can cause buffer overflow to be reported */ - if (U_BUFFER_OVERFLOW_ERROR != err && U_FAILURE(err)) { - i_panic("Failed to estimate allocation size with lib ICU" - " u_strToUTF8(): %s",u_errorName(err)); - } - i_assert(0 == sub_num); - i_assert(NULL == retp); - - dsize++; /* room for '\0' byte */ - dst = t_malloc(dsize); - err = U_ZERO_ERROR; - retp = u_strToUTF8WithSub(dst, dsize, &dsize_actual, src, usrc_len, - UNICODE_REPLACEMENT_CHAR, &sub_num, &err); - if (U_FAILURE(err)) - i_panic("Lib ICU u_strToUTF8WithSub() failed: %s", - u_errorName(err)); - if (dsize_actual >= dsize) { - i_panic("Produced UTF8 string length (%d) does not fit in " - "preflighted(%d). 
Buffer overflow?", - dsize_actual, dsize); - } - if (0 != sub_num) { - i_panic("UTF8 string not well formed. " - "Substitutions (%d) were made.", sub_num); - } - i_assert(retp == dst); - *_dst = dst; -} + UTransliterator *transliterator; + buffer_t *utf16_token, *trans_token; + string_t *utf8_token; +}; static void fts_filter_normalizer_icu_destroy(struct fts_filter *filter) { @@ -152,6 +60,13 @@ np->pool = pp; np->filter = *fts_filter_normalizer_icu; np->transliterator_id = p_strdup(pp, id); + np->utf16_token = buffer_create_dynamic(pp, 128); + np->trans_token = buffer_create_dynamic(pp, 128); + np->utf8_token = buffer_create_dynamic(pp, 128); + fts_icu_utf8_to_utf16(np->utf16_token, id); + np->transliterator_id_utf16 = + p_memdup(pp, np->utf16_token->data, np->utf16_token->used); + np->transliterator_id_utf16_len = np->utf16_token->used / sizeof(UChar); *filter_r = &np->filter; return 0; } @@ -162,14 +77,11 @@ { UErrorCode err = U_ZERO_ERROR; UParseError perr; - UChar *id_uchar = NULL; - int32_t id_len_uchar = 0; memset(&perr, 0, sizeof(perr)); - make_uchar(np->transliterator_id, &id_uchar, &id_len_uchar); - - np->transliterator = utrans_openU(id_uchar, u_strlen(id_uchar), + np->transliterator = utrans_openU(np->transliterator_id_utf16, + np->transliterator_id_utf16_len, UTRANS_FORWARD, NULL, 0, &perr, &err); if (U_FAILURE(err)) { string_t *str = t_str_new(128); @@ -193,48 +105,27 @@ { struct fts_filter_normalizer_icu *np = (struct fts_filter_normalizer_icu *)filter; - UErrorCode err = U_ZERO_ERROR; - UChar *utext = NULL; - int32_t utext_cap = 0; - int32_t utext_len = -1; - int32_t utext_limit; if (np->transliterator == NULL) { if (fts_filter_normalizer_icu_create_trans(np, error_r) < 0) return -1; } - make_uchar(*token, &utext, &utext_cap); - utext_limit = u_strlen(utext); - utrans_transUChars(np->transliterator, utext, &utext_len, - utext_cap, 0, &utext_limit, &err); + fts_icu_utf8_to_utf16(np->utf16_token, *token); + buffer_append_zero(np->utf16_token, 2); + 
buffer_set_used_size(np->utf16_token, np->utf16_token->used-2); + buffer_set_used_size(np->trans_token, 0); + if (fts_icu_translate(np->trans_token, np->utf16_token->data, + np->utf16_token->used / sizeof(UChar), + np->transliterator, error_r) < 0) + return -1; - /* Data did not fit into utext. */ - if (utext_len > utext_cap || err == U_BUFFER_OVERFLOW_ERROR) { - /* This is a crude retry fix... Make a new utext of the - size utrans_transUChars indicated */ - utext_len++; /* room for '\0' bytes(2) */ - utext_cap = utext_len; - make_uchar(*token, &utext, &utext_cap); - i_assert(utext_cap == utext_len); - utext_limit = u_strlen(utext); - utext_len = -1; - err = U_ZERO_ERROR; - utrans_transUChars(np->transliterator, utext, - &utext_len, utext_cap, 0, - &utext_limit, &err); - } - - if (U_FAILURE(err)) { - *error_r = t_strdup_printf("utrans_transUChars() failed: %s\n", - u_errorName(err)); - return -1; - } - - if (utext_len == 0) + if (np->trans_token->used == 0) return 0; - make_utf8(utext, token); + fts_icu_utf16_to_utf8(np->utf8_token, np->trans_token->data, + np->trans_token->used / sizeof(UChar)); + *token = str_c(np->utf8_token); return 1; }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lib-fts/fts-icu.c Sat May 16 18:47:20 2015 +0300 @@ -0,0 +1,110 @@ +/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "buffer.h" +#include "unichar.h" +#include "fts-icu.h" + +void fts_icu_utf8_to_utf16(buffer_t *dest_utf16, const char *src_utf8) +{ + UErrorCode err = U_ZERO_ERROR; + unsigned int src_bytes = strlen(src_utf8); + int32_t utf16_len; + UChar *dest_data, *retp = NULL; + int32_t avail_uchars = 0; + + /* try to encode with the current buffer size */ + avail_uchars = buffer_get_writable_size(dest_utf16) / sizeof(UChar); + dest_data = buffer_get_space_unsafe(dest_utf16, 0, + buffer_get_writable_size(dest_utf16)); + retp = u_strFromUTF8Lenient(dest_data, avail_uchars, + &utf16_len, src_utf8, src_bytes, &err); + if (err == U_BUFFER_OVERFLOW_ERROR) { + /* try again with a larger buffer */ + dest_data = buffer_get_space_unsafe(dest_utf16, 0, + utf16_len * sizeof(UChar)); + err = U_ZERO_ERROR; + retp = u_strFromUTF8Lenient(dest_data, utf16_len, + &utf16_len, src_utf8, + src_bytes, &err); + } + if (U_FAILURE(err)) { + i_panic("LibICU u_strFromUTF8Lenient() failed: %s", + u_errorName(err)); + } + buffer_set_used_size(dest_utf16, utf16_len * sizeof(UChar)); + i_assert(retp == dest_data); +} + +void fts_icu_utf16_to_utf8(string_t *dest_utf8, const UChar *src_utf16, + unsigned int src_len) +{ + int32_t dest_len = 0; + int32_t sub_num = 0; + char *dest_data, *retp = NULL; + UErrorCode err = U_ZERO_ERROR; + + /* try to encode with the current buffer size */ + dest_data = buffer_get_space_unsafe(dest_utf8, 0, + buffer_get_writable_size(dest_utf8)); + retp = u_strToUTF8WithSub(dest_data, buffer_get_writable_size(dest_utf8), + &dest_len, src_utf16, src_len, + UNICODE_REPLACEMENT_CHAR, &sub_num, &err); + if (err == U_BUFFER_OVERFLOW_ERROR) { + /* try again with a larger buffer */ + dest_data = buffer_get_space_unsafe(dest_utf8, 0, dest_len); + err = U_ZERO_ERROR; 
+ retp = u_strToUTF8WithSub(dest_data, buffer_get_writable_size(dest_utf8), &dest_len, + src_utf16, src_len, + UNICODE_REPLACEMENT_CHAR, + &sub_num, &err); + } + if (U_FAILURE(err)) { + i_panic("LibICU u_strToUTF8WithSub() failed: %s", + u_errorName(err)); + } + buffer_set_used_size(dest_utf8, dest_len); + i_assert(retp == dest_data); +} + +int fts_icu_translate(buffer_t *dest_utf16, const UChar *src_utf16, + unsigned int src_len, UTransliterator *transliterator, + const char **error_r) +{ + UErrorCode err = U_ZERO_ERROR; + int32_t utf16_len = src_len; + UChar *dest_data; + int32_t avail_uchars, limit = src_len; + size_t dest_pos = dest_utf16->used; + + /* translation is done in-place in the buffer. try first with the + current buffer size. */ + buffer_append(dest_utf16, src_utf16, src_len*sizeof(UChar)); + + avail_uchars = (buffer_get_writable_size(dest_utf16)-dest_pos) / sizeof(UChar); + dest_data = buffer_get_space_unsafe(dest_utf16, dest_pos, + buffer_get_writable_size(dest_utf16)-dest_pos); + utrans_transUChars(transliterator, dest_data, &utf16_len, + avail_uchars, 0, &limit, &err); + if (err == U_BUFFER_OVERFLOW_ERROR) { + /* try again with a larger buffer */ + err = U_ZERO_ERROR; + avail_uchars = utf16_len; + limit = utf16_len = src_len; + buffer_write(dest_utf16, dest_pos, + src_utf16, src_len*sizeof(UChar)); + dest_data = buffer_get_space_unsafe(dest_utf16, dest_pos, + avail_uchars * sizeof(UChar)); + utrans_transUChars(transliterator, dest_data, &utf16_len, + avail_uchars, 0, &limit, &err); + i_assert(err != U_BUFFER_OVERFLOW_ERROR); + } + if (U_FAILURE(err)) { + *error_r = t_strdup_printf("LibICU utrans_transUChars() failed: %s", + u_errorName(err)); + buffer_set_used_size(dest_utf16, dest_pos); + return -1; + } + buffer_set_used_size(dest_utf16, utf16_len * sizeof(UChar)); + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lib-fts/fts-icu.h Sat May 16 18:47:20 2015 +0300 @@ -0,0 +1,17 @@ +#ifndef HAVE_FTS_ICU_H +#define HAVE_FTS_ICU_H + +#include <unicode/ustring.h> +#include <unicode/utrans.h> + +/* Convert UTF-8 input to UTF-16 output. The dest_utf16 contains UChars. */ +void fts_icu_utf8_to_utf16(buffer_t *dest_utf16, const char *src_utf8); +/* Convert UTF-16 input to UTF-8 output. */ +void fts_icu_utf16_to_utf8(string_t *dest_utf8, const UChar *src_utf16, + unsigned int src_len); +/* Run ICU translation for the string. Returns 0 on success, -1 on error. */ +int fts_icu_translate(buffer_t *dest_utf16, const UChar *src_utf16, + unsigned int src_len, UTransliterator *transliterator, + const char **error_r); + +#endif
--- a/src/lib-fts/test-fts-filter.c Sat May 16 18:41:44 2015 +0300 +++ b/src/lib-fts/test-fts-filter.c Sat May 16 18:47:20 2015 +0300 @@ -2,6 +2,8 @@ #include "lib.h" #include "sha2.h" +#include "str.h" +#include "unichar.h" #include "test-common.h" #include "fts-language.h" #include "fts-filter.h" @@ -463,6 +465,37 @@ test_end(); } +static void test_fts_filter_normalizer_baddata(void) +{ + const char * const settings[] = + {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove", NULL}; + struct fts_filter *norm; + const char *token, *error; + string_t *str; + unsigned int i; + + test_begin("fts filter normalizer bad data"); + + test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0); + str = t_str_new(128); + for (i = 1; i < 0x1ffff; i++) { + str_truncate(str, 0); + uni_ucs4_to_utf8_c(i, str); + token = str_c(str); + T_BEGIN { + test_assert_idx(fts_filter_filter(norm, &token, &error) >= 0, i); + } T_END; + } + + str_truncate(str, 0); + uni_ucs4_to_utf8_c(0x7fffffff, str); + token = str_c(str); + test_assert(fts_filter_filter(norm, &token, &error) >= 0); + + fts_filter_unref(&norm); + test_end(); +} + static void test_fts_filter_normalizer_invalid_id(void) { struct fts_filter *norm = NULL; @@ -558,6 +591,7 @@ test_fts_filter_normalizer_swedish_short_default_id, test_fts_filter_normalizer_french, test_fts_filter_normalizer_empty, + test_fts_filter_normalizer_baddata, test_fts_filter_normalizer_invalid_id, #ifdef HAVE_FTS_STEMMER test_fts_filter_normalizer_stopwords_stemmer_eng,
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lib-fts/test-fts-icu.c Sat May 16 18:47:20 2015 +0300 @@ -0,0 +1,152 @@ +/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "buffer.h" +#include "str.h" +#include "unichar.h" +#include "test-common.h" +#include "fts-icu.h" + +static void test_fts_icu_utf8_to_utf16_ascii_resize(void) +{ + buffer_t *dest = buffer_create_dynamic(pool_datastack_create(), 5); + + test_begin("fts_icu_utf8_to_utf16 ascii resize"); + /* dynamic buffers reserve +1 for str_c()'s NUL, so 5 -> 4 */ + test_assert(buffer_get_size(dest) == 5); + fts_icu_utf8_to_utf16(dest, "12"); + test_assert(dest->used == 4); + test_assert(buffer_get_size(dest) == 5); + + fts_icu_utf8_to_utf16(dest, "123"); + test_assert(dest->used == 6); + test_assert(buffer_get_size(dest) == 8); + + fts_icu_utf8_to_utf16(dest, "12345"); + test_assert(dest->used == 10); + + test_end(); +} + +static void test_fts_icu_utf8_to_utf16_32bit_resize(void) +{ + buffer_t *dest; + unsigned int i; + + test_begin("fts_icu_utf8_to_utf16 32bit resize"); + for (i = 2; i <= 5; i++) { + dest = buffer_create_dynamic(pool_datastack_create(), i); + test_assert(buffer_get_size(dest) == i); + fts_icu_utf8_to_utf16(dest, "\xF0\x90\x90\x80"); /* 0x10400 */ + test_assert(dest->used == 4); + } + + test_end(); +} + +static void test_fts_icu_utf16_to_utf8(void) +{ + string_t *dest = t_str_new(64); + const UChar src[] = { 0xbd, 'b', 'c' }; + unsigned int i; + + test_begin("fts_icu_utf16_to_utf8"); + for (i = N_ELEMENTS(src); i > 0; i--) { + fts_icu_utf16_to_utf8(dest, src, i); + test_assert(dest->used == i+1); + } + test_end(); +} + +static void test_fts_icu_utf16_to_utf8_resize(void) +{ + string_t *dest; + const UChar src = UNICODE_REPLACEMENT_CHAR; + unsigned int i; + + test_begin("fts_icu_utf16_to_utf8 resize"); + for (i = 2; i <= 6; i++) { + dest = t_str_new(i); + test_assert(buffer_get_size(dest) == i); + fts_icu_utf16_to_utf8(dest, &src, 1); + 
test_assert(dest->used == 3); + test_assert(strcmp(str_c(dest), UNICODE_REPLACEMENT_CHAR_UTF8) == 0); + } + + test_end(); +} + +static UTransliterator *get_translit(const char *id) +{ + UTransliterator *translit; + buffer_t *id_utf16; + UErrorCode err = U_ZERO_ERROR; + UParseError perr; + + id_utf16 = buffer_create_dynamic(pool_datastack_create(), 16); + fts_icu_utf8_to_utf16(id_utf16, id); + translit = utrans_openU(id_utf16->data, id_utf16->used/sizeof(UChar), + UTRANS_FORWARD, NULL, 0, &perr, &err); + test_assert(!U_FAILURE(err)); + return translit; +} + +static void test_fts_icu_translate(void) +{ + const char *translit_id = "Any-Lower"; + UTransliterator *translit; + buffer_t *dest = buffer_create_dynamic(pool_datastack_create(), 64); + const UChar src[] = { 0xbd, 'B', 'C' }; + const char *error; + unsigned int i; + + test_begin("fts_icu_translate"); + translit = get_translit(translit_id); + for (i = N_ELEMENTS(src); i > 0; i--) { + buffer_set_used_size(dest, 0); + test_assert(fts_icu_translate(dest, src, i, + translit, &error) == 0); + test_assert(dest->used == i * sizeof(UChar)); + } + test_end(); +} + +static void test_fts_icu_translate_resize(void) +{ + const char *translit_id = "Any-Hex"; + const char *src_utf8 = "FOO"; + buffer_t *dest, *src_utf16; + UTransliterator *translit; + const char *error; + unsigned int i; + + test_begin("fts_icu_translate_resize resize"); + + src_utf16 = buffer_create_dynamic(pool_datastack_create(), 16); + translit = get_translit(translit_id); + for (i = 2; i <= 20; i++) { + buffer_set_used_size(src_utf16, 0); + fts_icu_utf8_to_utf16(src_utf16, src_utf8); + dest = buffer_create_dynamic(pool_datastack_create(), i); + test_assert(buffer_get_size(dest) == i); + test_assert(fts_icu_translate(dest, src_utf16->data, + src_utf16->used/sizeof(UChar), + translit, &error) == 0); + } + + test_end(); +} + +int main(void) +{ + static void (*test_functions[])(void) = { + test_fts_icu_utf8_to_utf16_ascii_resize, + 
test_fts_icu_utf8_to_utf16_32bit_resize, + test_fts_icu_utf16_to_utf8, + test_fts_icu_utf16_to_utf8_resize, + test_fts_icu_translate, + test_fts_icu_translate_resize, + NULL + }; + return test_run(test_functions); +}