Mercurial > dovecot > core-2.2
changeset 18585:fcc20dce3c83
fts: Lowercase non-human language input while indexing.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Sat, 09 May 2015 14:41:05 +0300 |
parents | 75b4b312ea09 |
children | 307fe289f1b7 |
files | src/plugins/fts/fts-build-mail.c src/plugins/fts/fts-user.c src/plugins/fts/fts-user.h |
diffstat | 3 files changed, 45 insertions(+), 8 deletions(-) [+] |
line wrap: on
line diff
--- a/src/plugins/fts/fts-build-mail.c Sat May 09 14:26:42 2015 +0300 +++ b/src/plugins/fts/fts-build-mail.c Sat May 09 14:41:05 2015 +0300 @@ -35,11 +35,6 @@ struct fts_user_language *cur_user_lang; }; -static struct fts_user_language fts_user_language_data = { - .lang = &fts_language_data, - .filter = NULL -}; - static int fts_build_data(struct fts_mail_build_context *ctx, const unsigned char *data, size_t size, bool last); @@ -127,6 +122,17 @@ i_free(buf); } +static bool data_has_8bit(const unsigned char *data, size_t size) +{ + size_t i; + + for (i = 0; i < size; i++) { + if ((data[i] & 0x80) != 0) + return TRUE; + } + return FALSE; +} + static void fts_build_mail_header(struct fts_mail_build_context *ctx, const struct message_block *block) { @@ -145,10 +151,17 @@ key.part = block->part; key.hdr_name = hdr->name; - if (!header_has_language(key.hdr_name)) - ctx->cur_user_lang = &fts_user_language_data; + /* Headers that don't contain any human language will only be + translated to lowercase - no stemming or other filtering. There's + unfortunately no perfect way of detecting which headers contain + human languages, so we have a list of some hardcoded header names + and we'll also assume that if there's any 8bit content it's a human + language. */ + if (header_has_language(key.hdr_name) || + data_has_8bit(hdr->full_value, hdr->full_value_len)) + ctx->cur_user_lang = NULL; else - ctx->cur_user_lang = NULL; + ctx->cur_user_lang = fts_user_get_data_lang(ctx->update_ctx->backend->ns->user); if (!fts_backend_update_set_build_key(ctx->update_ctx, &key)) return;
--- a/src/plugins/fts/fts-user.c Sat May 09 14:26:42 2015 +0300 +++ b/src/plugins/fts/fts-user.c Sat May 09 14:41:05 2015 +0300 @@ -16,6 +16,7 @@ struct fts_language_list *lang_list; struct fts_tokenizer *index_tokenizer, *search_tokenizer; + struct fts_user_language *data_lang; ARRAY_TYPE(fts_user_language) languages; }; @@ -269,6 +270,26 @@ return &fuser->languages; } +struct fts_user_language *fts_user_get_data_lang(struct mail_user *user) +{ + struct fts_user *fuser = FTS_USER_CONTEXT(user); + struct fts_user_language *lang; + const char *error; + + if (fuser->data_lang != NULL) + return fuser->data_lang; + + lang = p_new(user->pool, struct fts_user_language, 1); + lang->lang = &fts_language_data; + + if (fts_filter_create(fts_filter_lowercase, NULL, lang->lang, NULL, + &lang->filter, &error) < 0) + i_unreached(); + i_assert(lang->filter != NULL); + fuser->data_lang = lang; + return fuser->data_lang; +} + static void fts_user_free(struct fts_user *fuser) { struct fts_user_language *const *user_langp; @@ -280,6 +301,8 @@ if ((*user_langp)->filter != NULL) fts_filter_unref(&(*user_langp)->filter); } + if (fuser->data_lang != NULL && fuser->data_lang->filter != NULL) + fts_filter_unref(&fuser->data_lang->filter); if (fuser->index_tokenizer != NULL) fts_tokenizer_unref(&fuser->index_tokenizer);
--- a/src/plugins/fts/fts-user.h Sat May 09 14:26:42 2015 +0300 +++ b/src/plugins/fts/fts-user.h Sat May 09 14:41:05 2015 +0300 @@ -15,6 +15,7 @@ struct fts_language_list *fts_user_get_language_list(struct mail_user *user); const ARRAY_TYPE(fts_user_language) * fts_user_get_all_languages(struct mail_user *user); +struct fts_user_language *fts_user_get_data_lang(struct mail_user *user); int fts_mail_user_init(struct mail_user *user, const char **error_r); void fts_mail_user_deinit(struct mail_user *user);