Mercurial > dovecot > core-2.2
changeset 13646:6d483a22134e
fts-lucene: Added a whitespace_chars sub-setting to the fts_lucene setting.
Every character listed in it is replaced with a space before the text is
indexed or searched. For example, a value of "@." makes user@domain.tld
searchable as user, domain and tld separately instead of requiring the whole
string to match.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Fri, 04 Nov 2011 19:35:30 +0200 |
parents | b6e5cf112b3e |
children | 9f739df97593 |
files | src/plugins/fts-lucene/fts-lucene-plugin.c src/plugins/fts-lucene/fts-lucene-plugin.h src/plugins/fts-lucene/lucene-wrapper.cc |
diffstat | 3 files changed, 39 insertions(+), 9 deletions(-) [+] |
line wrap: on
line diff
--- a/src/plugins/fts-lucene/fts-lucene-plugin.c Fri Nov 04 18:50:24 2011 +0200 +++ b/src/plugins/fts-lucene/fts-lucene-plugin.c Fri Nov 04 19:35:30 2011 +0200 @@ -26,6 +26,8 @@ set->textcat_conf = p_strdup(user->pool, *tmp + 13); } else if (strncmp(*tmp, "textcat_dir=", 12) == 0) { set->textcat_dir = p_strdup(user->pool, *tmp + 12); + } else if (strncmp(*tmp, "whitespace_chars=", 17) == 0) { + set->whitespace_chars = p_strdup(user->pool, *tmp + 17); } else { i_error("fts_lucene: Invalid setting: %s", *tmp); return -1; @@ -39,6 +41,8 @@ i_error("fts_lucene: textcat_dir set, but textcat_conf unset"); return -1; } + if (set->whitespace_chars == NULL) + set->whitespace_chars = ""; #ifndef HAVE_LUCENE_STEMMER if (set->default_language != NULL) { i_error("fts_lucene: default_language set, " @@ -61,9 +65,11 @@ uint32_t fts_lucene_settings_checksum(const struct fts_lucene_settings *set) { - /* only the default language change matters */ - return set->default_language == NULL ? 0 : - crc32_str(set->default_language); + uint32_t crc; + + crc = crc32_str(set->default_language); + crc = crc32_str_more(crc, set->whitespace_chars); + return crc; } static void fts_lucene_mail_user_created(struct mail_user *user)
--- a/src/plugins/fts-lucene/fts-lucene-plugin.h Fri Nov 04 18:50:24 2011 +0200 +++ b/src/plugins/fts-lucene/fts-lucene-plugin.h Fri Nov 04 19:35:30 2011 +0200 @@ -11,6 +11,7 @@ struct fts_lucene_settings { const char *default_language; const char *textcat_conf, *textcat_dir; + const char *whitespace_chars; }; struct fts_lucene_user {
--- a/src/plugins/fts-lucene/lucene-wrapper.cc Fri Nov 04 18:50:24 2011 +0200 +++ b/src/plugins/fts-lucene/lucene-wrapper.cc Fri Nov 04 19:35:30 2011 +0200 @@ -143,6 +143,21 @@ i_free(index); } +static void lucene_data_translate(struct lucene_index *index, + wchar_t *data, unsigned int len) +{ + const char *whitespace_chars = index->set.whitespace_chars; + unsigned int i; + + if (*whitespace_chars == '\0') + return; + + for (i = 0; i < len; i++) { + if (strchr(whitespace_chars, data[i]) != NULL) + data[i] = ' '; + } +} + void lucene_utf8_n_to_tchar(const unsigned char *src, size_t srcsize, wchar_t *dest, size_t destsize) { @@ -159,10 +174,14 @@ dest[destsize-1] = 0; } -static const wchar_t *t_lucene_utf8_to_tchar(const char *str) +static const wchar_t * +t_lucene_utf8_to_tchar(struct lucene_index *index, + const char *str, bool translate) { ARRAY_TYPE(unichars) dest_arr; - const unichar_t *ret; + const unichar_t *chars; + wchar_t *ret; + unsigned int len; i_assert(sizeof(wchar_t) == sizeof(unichar_t)); @@ -170,8 +189,11 @@ if (uni_utf8_to_ucs4(str, &dest_arr) < 0) i_unreached(); (void)array_append_space(&dest_arr); - ret = array_idx(&dest_arr, 0); - return (const wchar_t *)ret; + + chars = array_get_modifiable(&dest_arr, &len); + ret = (wchar_t *)chars; + lucene_data_translate(index, ret, len - 1); + return ret; } void lucene_index_select_mailbox(struct lucene_index *index, @@ -478,6 +500,7 @@ datasize = uni_utf8_strlen_n(data, size) + 1; wchar_t dest[datasize]; lucene_utf8_n_to_tchar(data, size, dest, datasize); + lucene_data_translate(index, dest, datasize); if (hdr_name != NULL) { /* hdr_name should be ASCII, but don't break in case it isn't */ @@ -1010,7 +1033,7 @@ lucene_get_query_str(struct lucene_index *index, const TCHAR *key, const char *str, bool fuzzy) { - const TCHAR *wvalue = t_lucene_utf8_to_tchar(str); + const TCHAR *wvalue = t_lucene_utf8_to_tchar(index, str, TRUE); Analyzer *analyzer = guess_analyzer(index, str, strlen(str)); if (analyzer == NULL) 
analyzer = index->default_analyzer; @@ -1067,7 +1090,7 @@ } q = lucene_get_query(index, - t_lucene_utf8_to_tchar(arg->hdr_field_name), + t_lucene_utf8_to_tchar(index, arg->hdr_field_name, FALSE), arg); break; default: