# HG changeset patch
# User Timo Sirainen
# Date 1196684625 -7200
# Node ID a215deb3de8d3c82d3c19d60ea56fa9ee67350ef
# Parent  c68564884bae2f6a32314885a868ef539ca7c685
Fixed non-ASCII searches.

fts: the search key is converted to UTF-8 with CHARSET_FLAG_DECOMP_TITLECASE
before it is handed to the backend; if the conversion fails the function
returns 0 without calling the backend.

fts-squat: words are split by UTF-8 character lengths instead of single
bytes, both when building the trie and when looking up partial matches.
The lookup code counts the key's characters (str_charlen) while it fills
char_lengths, and passes char_lengths offset by the word start so that it
stays aligned with the data pointer given to squat_trie_lookup_partial().

diff -r c68564884bae -r a215deb3de8d src/plugins/fts-squat/squat-trie.c
--- a/src/plugins/fts-squat/squat-trie.c	Mon Dec 03 14:23:19 2007 +0200
+++ b/src/plugins/fts-squat/squat-trie.c	Mon Dec 03 14:23:45 2007 +0200
@@ -5,6 +5,7 @@
 #include "str.h"
 #include "istream.h"
 #include "ostream.h"
+#include "unichar.h"
 #include "seq-range-array.h"
 #include "squat-uidlist.h"
 #include "squat-trie-private.h"
@@ -702,18 +703,65 @@
 }
 
 static int
-squat_build_word(struct squat_trie *trie, uint32_t uid,
-		 const unsigned char *data, unsigned int size)
+squat_build_word_bytes(struct squat_trie *trie, uint32_t uid,
+		       const unsigned char *data, unsigned int size)
 {
 	unsigned int i;
 
-	for (i = size - 1; i > 0; i--) {
+	if (trie->hdr.full_len <= trie->hdr.partial_len)
+		i = 0;
+	else {
+		/* the first word is longer than others */
+		if (squat_build_add(trie, uid, data,
+				    I_MIN(size, trie->hdr.full_len)) < 0)
+			return -1;
+		i = 1;
+	}
+
+	for (; i < size; i++) {
 		if (squat_build_add(trie, uid, data + i,
-				    I_MIN(trie->hdr.partial_len, size - i)) < 0)
+				    I_MIN(trie->hdr.partial_len, size-i)) < 0)
 			return -1;
 	}
-	return squat_build_add(trie, uid, data,
-			       I_MIN(size, trie->hdr.full_len));
+	return 0;
+}
+
+static int
+squat_build_word(struct squat_trie *trie, uint32_t uid,
+		 const unsigned char *data, const uint8_t *char_lengths,
+		 unsigned int size)
+{
+	unsigned int i, j, bytelen;
+
+	if (char_lengths == NULL) {
+		/* optimization path: all characters are bytes */
+		return squat_build_word_bytes(trie, uid, data, size);
+	}
+
+	if (trie->hdr.full_len <= trie->hdr.partial_len)
+		i = 0;
+	else {
+		/* the first word is longer than others */
+		bytelen = 0;
+		for (j = 0; j < trie->hdr.full_len && bytelen < size; j++)
+			bytelen += char_lengths[bytelen];
+		i_assert(bytelen <= size);
+
+		if (squat_build_add(trie, uid, data, bytelen) < 0)
+			return -1;
+		i = char_lengths[0];
+	}
+
+	for (; i < size; i += char_lengths[i]) {
+		bytelen = 0;
+		for (j = 0; j < trie->hdr.partial_len && i+bytelen < size; j++)
+			bytelen += char_lengths[i + bytelen];
+		i_assert(i + bytelen <= size);
+
+		if (squat_build_add(trie, uid, data + i, bytelen) < 0)
+			return -1;
+	}
+	return 0;
 }
 
 static unsigned char *
@@ -731,17 +779,24 @@
 
 int squat_trie_build_more(struct squat_trie_build_context *ctx,
 			  uint32_t uid, enum squat_index_type type,
-			  const unsigned char *data, unsigned int size)
+			  const unsigned char *input, unsigned int size)
 {
 	struct squat_trie *trie = ctx->trie;
+	const unsigned char *data;
+	uint8_t *char_lengths;
 	unsigned int i, start = 0;
+	bool multibyte_chars = FALSE;
 	int ret = 0;
 
 	uid = uid * 2 + (type == SQUAT_INDEX_TYPE_HEADER ? 0 : 1);
 
 	t_push();
-	data = squat_data_normalize(trie, data, size);
+	char_lengths = t_malloc(size);
+	data = squat_data_normalize(trie, input, size);
 	for (i = 0; i < size; i++) {
+		char_lengths[i] = uni_utf8_char_bytes(input[i]);
+		if (char_lengths[i] != 1)
+			multibyte_chars = TRUE;
 		if (data[i] != '\0')
 			continue;
 
@@ -749,6 +804,8 @@
 			start++;
 		if (i != start) {
 			if (squat_build_word(trie, uid, data + start,
+					     !multibyte_chars ? NULL :
+					     char_lengths + start,
 					     i - start) < 0) {
 				ret = -1;
 				start = i;
@@ -760,7 +817,9 @@
 	while (start < i && data[start] == '\0')
 		start++;
 	if (i != start) {
-		if (squat_build_word(trie, uid, data + start, i - start) < 0)
+		if (squat_build_word(trie, uid, data + start,
+				     !multibyte_chars ? NULL :
+				     char_lengths + start, i - start) < 0)
 			ret = -1;
 	}
 	t_pop();
@@ -1355,20 +1414,23 @@
 
 static int
 squat_trie_lookup_partial(struct squat_trie_lookup_context *ctx,
-			  const unsigned char *data, unsigned int size)
+			  const unsigned char *data, uint8_t *char_lengths,
+			  unsigned int size)
 {
-	const unsigned char *block;
-	unsigned int block_len;
+	const unsigned int partial_len = ctx->trie->hdr.partial_len;
+	unsigned int char_idx, max_chars, i, j, bytelen;
 	int ret;
 
-	do {
-		if (size <= ctx->trie->hdr.partial_len)
-			block_len = size;
-		else
-			block_len = ctx->trie->hdr.partial_len;
-		block = data + size - block_len;
+	max_chars = uni_utf8_strlen_n(data, size);
+	if (max_chars > partial_len)
+		max_chars = partial_len;
 
-		ret = squat_trie_lookup_data(ctx->trie, block, block_len,
+	for (i = 0, char_idx = 0; char_idx < max_chars; char_idx++) {
+		bytelen = 0;
+		for (j = 0; j < partial_len && i+bytelen < size; j++)
+			bytelen += char_lengths[i + bytelen];
+
+		ret = squat_trie_lookup_data(ctx->trie, data + i, bytelen,
 					     &ctx->tmp_uids);
 		if (ret <= 0) {
 			array_clear(ctx->maybe_uids);
@@ -1385,7 +1447,9 @@
 			seq_range_array_remove_invert_range(ctx->maybe_uids,
 							    &ctx->tmp_uids2);
 		}
-	} while (--size >= ctx->trie->hdr.partial_len);
+
+		i += char_lengths[i];
+	}
 	return 1;
 }
 
@@ -1416,7 +1480,8 @@
 {
 	struct squat_trie_lookup_context ctx;
 	unsigned char *data;
-	unsigned int i, start, size;
+	uint8_t *char_lengths;
+	unsigned int i, start, bytes, str_bytelen, str_charlen;
 	int ret = 0;
 
 	t_push();
@@ -1429,12 +1494,17 @@
 	t_array_init(&ctx.tmp_uids2, 128);
 	ctx.first = TRUE;
 
-	size = strlen(str);
-	data = t_malloc(size);
-	memcpy(data, str, size);
-	data = squat_data_normalize(trie, data, size);
+	str_bytelen = strlen(str);
+	char_lengths = t_malloc0(str_bytelen);
+	for (i = 0, str_charlen = 0; i < str_bytelen; str_charlen++) {
+		bytes = uni_utf8_char_bytes(str[i]);
+		char_lengths[i] = bytes;
+		i += bytes;
+	}
+	data = squat_data_normalize(trie, (const unsigned char *)str,
+				    str_bytelen);
 
-	for (i = start = 0; i < size && ret >= 0; i++) {
+	for (i = start = 0; i < str_bytelen && ret >= 0; i += char_lengths[i]) {
 		if (data[i] != '\0')
 			continue;
 
@@ -1442,9 +1512,10 @@
 		   search it in parts. */
 		if (i != start) {
 			ret = squat_trie_lookup_partial(&ctx, data + start,
+							char_lengths + start,
 							i - start);
 		}
-		start = i + 1;
+		start = i + char_lengths[i];
 	}
 
 	if (start != 0) {
@@ -1452,6 +1523,7 @@
 		array_clear(definite_uids);
 		if (i != start && ret >= 0) {
 			ret = squat_trie_lookup_partial(&ctx, data + start,
+							char_lengths + start,
 							i - start);
 		}
 		t_pop();
@@ -1459,9 +1531,10 @@
 		return ret < 0 ? -1 : 0;
 	}
 
-	if (size <= trie->hdr.partial_len ||
+	if (str_charlen <= trie->hdr.partial_len ||
 	    trie->hdr.full_len > trie->hdr.partial_len) {
-		ret = squat_trie_lookup_data(trie, data, size, &ctx.tmp_uids);
+		ret = squat_trie_lookup_data(trie, data, str_bytelen,
+					     &ctx.tmp_uids);
 		if (ret > 0) {
 			squat_trie_filter_type(type, &ctx.tmp_uids,
 					       definite_uids);
@@ -1470,12 +1543,13 @@
 		array_clear(definite_uids);
 	}
 
-	if (size <= trie->hdr.partial_len || trie->hdr.partial_len == 0) {
+	if (str_charlen <= trie->hdr.partial_len ||
+	    trie->hdr.partial_len == 0) {
 		/* we have the result */
 		array_clear(maybe_uids);
 	} else {
 		ret = squat_trie_lookup_partial(&ctx, data + start,
-						i - start);
+						char_lengths, i - start);
 	}
 	t_pop();
 	squat_trie_add_unknown(trie, maybe_uids);
diff -r c68564884bae -r a215deb3de8d src/plugins/fts/Makefile.am
--- a/src/plugins/fts/Makefile.am	Mon Dec 03 14:23:19 2007 +0200
+++ b/src/plugins/fts/Makefile.am	Mon Dec 03 14:23:45 2007 +0200
@@ -1,5 +1,6 @@
 AM_CPPFLAGS = \
 	-I$(top_srcdir)/src/lib \
+	-I$(top_srcdir)/src/lib-charset \
 	-I$(top_srcdir)/src/lib-mail \
 	-I$(top_srcdir)/src/lib-index \
 	-I$(top_srcdir)/src/lib-storage
diff -r c68564884bae -r a215deb3de8d src/plugins/fts/fts-search.c
--- a/src/plugins/fts/fts-search.c	Mon Dec 03 14:23:19 2007 +0200
+++ b/src/plugins/fts/fts-search.c	Mon Dec 03 14:23:45 2007 +0200
@@ -2,7 +2,9 @@
 
 #include "lib.h"
 #include "array.h"
+#include "str.h"
 #include "seq-range-array.h"
+#include "charset-utf8.h"
 #include "mail-search.h"
 #include "mail-storage-private.h"
 #include "fts-api-private.h"
@@ -47,6 +49,9 @@
 	struct fts_backend *backend;
 	enum fts_lookup_flags flags = 0;
 	const char *key;
+	string_t *key_utf8;
+	enum charset_result result;
+	int ret;
 
 	switch (arg->type) {
 	case SEARCH_HEADER:
@@ -81,20 +86,29 @@
 	if (arg->not)
 		flags |= FTS_LOOKUP_FLAG_INVERT;
 
-	if (!backend->locked) {
-		if (fts_backend_lock(backend) <= 0)
-			return -1;
+	/* convert key to UTF-8 in titlecase form */
+	t_push();
+	key_utf8 = t_str_new(128);
+	if (charset_to_utf8_str(fctx->charset, CHARSET_FLAG_DECOMP_TITLECASE,
+				key, key_utf8, &result) < 0) {
+		/* unknown charset, can't handle this */
+		ret = 0;
+	} else if (result != CHARSET_RET_OK) {
+		/* let the core code handle this error */
+		ret = 0;
+	} else if (!backend->locked && fts_backend_lock(backend) <= 0)
+		ret = -1;
+	else if (!filter) {
+		ret = fts_backend_lookup(backend, str_c(key_utf8), flags,
+					 &fctx->definite_seqs,
+					 &fctx->maybe_seqs);
+	} else {
+		ret = fts_backend_filter(backend, str_c(key_utf8), flags,
+					 &fctx->definite_seqs,
+					 &fctx->maybe_seqs);
 	}
-
-	if (!filter) {
-		return fts_backend_lookup(backend, key, flags,
-					  &fctx->definite_seqs,
-					  &fctx->maybe_seqs);
-	} else {
-		return fts_backend_filter(backend, key, flags,
-					  &fctx->definite_seqs,
-					  &fctx->maybe_seqs);
-	}
+	t_pop();
+	return ret;
 }
 
 void fts_search_lookup(struct fts_search_context *fctx)
diff -r c68564884bae -r a215deb3de8d src/plugins/fts/fts-storage.c
--- a/src/plugins/fts/fts-storage.c	Mon Dec 03 14:23:19 2007 +0200
+++ b/src/plugins/fts/fts-storage.c	Mon Dec 03 14:23:45 2007 +0200
@@ -340,6 +340,7 @@
 	fctx->fbox = fbox;
 	fctx->t = t;
 	fctx->args = args;
+	fctx->charset = ctx->charset;
 	MODULE_CONTEXT_SET(ctx, fts_storage_module, fctx);
 
 	if (fbox->backend_substr == NULL && fbox->backend_fast == NULL)
diff -r c68564884bae -r a215deb3de8d src/plugins/fts/fts-storage.h
--- a/src/plugins/fts/fts-storage.h	Mon Dec 03 14:23:19 2007 +0200
+++ b/src/plugins/fts/fts-storage.h	Mon Dec 03 14:23:45 2007 +0200
@@ -17,6 +17,7 @@
 	struct mailbox_transaction_context *t;
 	struct mail_search_arg *args;
 	struct mail_search_arg *best_arg;
+	const char *charset;
 
 	ARRAY_TYPE(seq_range) definite_seqs, maybe_seqs;
 	unsigned int definite_idx, maybe_idx;