Mercurial > dovecot > core-2.2
changeset 18549:ae0458c63761
fts: Create tokenizers differently
Create tokenizers earlier. Create separate tokenizers for search
and indexing. Enable configuration of tokenizers. Add some helpers
to the fts-tokenizer.h API. Change tokenizer unit tests to match
those changes.
lib-fts: Refactor lib-fts settings a bit
Turn address tokenizer settings into "boolean" values. Change
have_parent to "no_parent" and add a "search" setting. Add
documentation to fts-tokenizer.h. Change unit tests accordingly.
author | Teemu Huovila <teemu.huovila@dovecot.fi> |
---|---|
date | Sat, 09 May 2015 11:02:22 +0300 |
parents | abbd71252175 |
children | cebe8be92034 |
files | src/lib-fts/fts-tokenizer-address.c src/lib-fts/fts-tokenizer.c src/lib-fts/fts-tokenizer.h src/lib-fts/test-fts-tokenizer.c src/plugins/fts/fts-api-private.h src/plugins/fts/fts-build-mail.c src/plugins/fts/fts-plugin.c src/plugins/fts/fts-search-args.c src/plugins/fts/fts-storage.c src/plugins/fts/fts-user.c src/plugins/fts/fts-user.h |
diffstat | 11 files changed, 188 insertions(+), 61 deletions(-) [+] |
line wrap: on
line diff
--- a/src/lib-fts/fts-tokenizer-address.c Sat May 09 10:53:25 2015 +0300 +++ b/src/lib-fts/fts-tokenizer-address.c Sat May 09 11:02:22 2015 +0300 @@ -5,8 +5,8 @@ #include "buffer.h" #include "fts-tokenizer-private.h" -/* Return not only our tokens, but also data for parent to process.*/ -#define FTS_DEFAULT_HAVE_PARENT 1 +#define FTS_DEFAULT_NO_PARENT FALSE +#define FTS_DEFAULT_SEARCH FALSE enum email_address_parser_state { EMAIL_ADDRESS_PARSER_STATE_NONE = 0, @@ -21,8 +21,8 @@ string_t *last_word; string_t *parent_data; /* Copy of input data between tokens. TODO: could be buffer_t maybe */ - unsigned int have_parent; /* Setting for stand-alone usage. - Might be superfluous. */ + bool no_parent; + bool search; }; /* @@ -85,18 +85,17 @@ const char **error_r) { struct email_address_fts_tokenizer *tok; - unsigned int have_parent = FTS_DEFAULT_HAVE_PARENT; + bool no_parent = FTS_DEFAULT_NO_PARENT; + bool search = FTS_DEFAULT_SEARCH; unsigned int i; for (i = 0; settings[i] != NULL; i += 2) { - const char *key = settings[i], *value = settings[i+1]; + const char *key = settings[i]; - if (strcmp(key, "have_parent") == 0) { - if (str_to_uint(value, &have_parent) < 0 ) { - *error_r = t_strdup_printf( - "Invalid parent setting: %s", value); - return -1; - } + if (strcmp(key, "no_parent") == 0) { + no_parent = TRUE; + }else if (strcmp(key, "search") == 0) { + search = TRUE; } else { *error_r = t_strdup_printf("Unknown setting: %s", key); return -1; @@ -107,7 +106,8 @@ tok->tokenizer = *fts_tokenizer_email_address; tok->last_word = str_new(default_pool, 128); tok->parent_data = str_new(default_pool, 128); - tok->have_parent = have_parent; + tok->no_parent = no_parent; + tok->search = search; *tokenizer_r = &tok->tokenizer; return 0; } @@ -134,6 +134,9 @@ fts_tokenizer_address_parent_data(struct email_address_fts_tokenizer *tok) { const char *ret; + /* TODO: search option removes address from data here. 
*/ + if (tok->search && tok->state >= EMAIL_ADDRESS_PARSER_STATE_DOMAIN) + i_debug("Would remove current token"); ret = t_strdup(str_c(tok->parent_data)); str_truncate(tok->parent_data, 0); @@ -250,7 +253,7 @@ fts_tokenizer_address_update_parent(struct email_address_fts_tokenizer *tok, const unsigned char *data, size_t size) { - if (tok->have_parent > 0) + if (!tok->no_parent) str_append_n(tok->parent_data, data, size); } static const char * @@ -273,7 +276,7 @@ /* end of data, output lingering tokens. first the parents data, then possibly our token, if complete enough */ if (size == 0) { - if (tok->have_parent > 0 && str_len(tok->parent_data) > 0) + if (!tok->no_parent && str_len(tok->parent_data) > 0) return fts_tokenizer_address_parent_data(tok); if (tok->state == EMAIL_ADDRESS_PARSER_STATE_DOMAIN @@ -328,7 +331,7 @@ *skip_r = pos + local_skip; fts_tokenizer_address_update_parent(tok, data+pos, local_skip); - if (tok->have_parent > 0) + if (!tok->no_parent) return fts_tokenizer_address_parent_data(tok); else { return fts_tokenizer_address_current_token(tok);
--- a/src/lib-fts/fts-tokenizer.c Sat May 09 10:53:25 2015 +0300 +++ b/src/lib-fts/fts-tokenizer.c Sat May 09 11:02:22 2015 +0300 @@ -10,6 +10,20 @@ ARRAY(struct fts_tokenizer) fts_tokenizer_classes; +void fts_tokenizers_init(void) +{ + if (!array_is_created(&fts_tokenizer_classes)) { + fts_tokenizer_register(fts_tokenizer_generic); + fts_tokenizer_register(fts_tokenizer_email_address); + } +} + +void fts_tokenizers_deinit(void) +{ + if (array_is_created(&fts_tokenizer_classes)) + array_free(&fts_tokenizer_classes); +} + /* private */ void fts_tokenizer_register(const struct fts_tokenizer *tok_class) { @@ -47,6 +61,11 @@ return NULL; } +const char *fts_tokenizer_name(const struct fts_tokenizer *tok) +{ + return tok->name; +} + int fts_tokenizer_create(const struct fts_tokenizer *tok_class, struct fts_tokenizer *parent, const char *const *settings,
--- a/src/lib-fts/fts-tokenizer.h Sat May 09 10:53:25 2015 +0300 +++ b/src/lib-fts/fts-tokenizer.h Sat May 09 11:02:22 2015 +0300 @@ -3,7 +3,9 @@ /* Settings are given in the form of a const char * const *settings = - {"key, "value", "key2", "value2", NULL} array of string pairs. + {"key, "value", "key2", "value2", NULL} array of string pairs. Some + keys, like "no_parent" and "search" are a sort of boolean and the + value does not matter, just mentioning the key enables the functionality. The array has to be NULL terminated. */ /* Email address header tokenizer that returns "user@domain.org" input as @@ -13,15 +15,21 @@ allows doing an explicit "user@domain" search, which returns only mails matching that exact address (instead of e.g. a mail with both user@domain2 and user2@domain words). */ -/* Settings: "have_parent", Return not only our tokens, but also data - for parent to process. Defaults to 1. Should normally not need to - be changed. */ +/* Settings: + "no_parent", Return only our tokens, no data for parent to process. + Defaults to disabled. Should normally not be needed. + + "search" Remove addresses from parent data stream, so they are not processed + further. Defaults to disabled. Enable by defining the keyword (and any + value). */ extern const struct fts_tokenizer *fts_tokenizer_email_address; #define FTS_TOKENIZER_EMAIL_ADDRESS_NAME "email-address" /* Generic email content tokenizer. Cuts text into tokens. */ -/* Settings: "maxlen" Maximum length of token, before an arbitary cut - off is made. Defaults to FTS_DEFAULT_TOKEN_MAX_LENGTH. +/* Settings: + "maxlen" Maximum length of token, before an arbitary cut off is made. + Defaults to FTS_DEFAULT_TOKEN_MAX_LENGTH. + "algorithm", accepted values are "simple" or "tr29". Defines the method for looking for word boundaries. 
Simple is faster and will work for many texts, especially those using latin alphabets, but @@ -35,9 +43,18 @@ extern const struct fts_tokenizer *fts_tokenizer_generic; #define FTS_TOKENIZER_GENERIC_NAME "generic" +/* + Tokenizing workflow, find --> create --> filter --> destroy. + Do init before first use and deinit after all done. + */ + +/* Register all built-in tokenizers. */ +void fts_tokenizers_init(void); +void fts_tokenizers_deinit(void); + const struct fts_tokenizer *fts_tokenizer_find(const char *name); -/* Create a new tokenizer. The settings is an array of key,value pairs. */ +/* Create a new tokenizer. The settings are described above. */ int fts_tokenizer_create(const struct fts_tokenizer *tok_class, struct fts_tokenizer *parent, const char *const *settings, @@ -57,4 +74,5 @@ fts_tokenizer_next(struct fts_tokenizer *tok, const unsigned char *data, size_t size); +const char *fts_tokenizer_name(const struct fts_tokenizer *tok); #endif
--- a/src/lib-fts/test-fts-tokenizer.c Sat May 09 10:53:25 2015 +0300 +++ b/src/lib-fts/test-fts-tokenizer.c Sat May 09 11:02:22 2015 +0300 @@ -25,7 +25,7 @@ const char *token, *error; test_begin("fts tokenizer generic simple"); - fts_tokenizer_register(fts_tokenizer_generic); + fts_tokenizers_init(); tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME); test_assert(fts_tokenizer_create(tok_class, NULL, NULL, &tok, &error) == 0); while ((token = fts_tokenizer_next(tok, input, sizeof(input)-1)) != NULL) { @@ -38,7 +38,7 @@ } test_assert(*eopp == NULL); fts_tokenizer_unref(&tok); - fts_tokenizer_unregister(fts_tokenizer_generic); + fts_tokenizers_deinit(); test_end(); } @@ -267,7 +267,7 @@ "abc@example.com", "bar@example.org", "foo@domain", "foo@domain", "bar@example.org", NULL }; - const char *const settings[] = {"have_parent", "0", NULL}; + const char *const settings[] = {"no_parent", "foo", NULL}; struct fts_tokenizer *tok; const char * const *eopp = expected_output; const char *token, *error; @@ -305,7 +305,7 @@ "abc@example.com", "bar@example.org", "foo@domain", NULL }; - const char *const settings[] = {"have_parent", "0", NULL}; + const char *const settings[] = {"no_parent", "0", NULL}; struct fts_tokenizer *tok; const char * const *eopp = expected_output; const char *token, *error; @@ -346,7 +346,7 @@ struct fts_tokenizer *tok; const char * const *eopp = expected_output; const char *token, *error; - const char *const settings[] = {"have_parent", "0", NULL}; + const char *const settings[] = {"no_parent", "abc", NULL}; unsigned int i, step, step_max = 10; test_begin("fts tokenizer email address, input random length"); @@ -390,8 +390,7 @@ unsigned int i; test_begin("fts tokenizer email address + parent, input one character at a time"); - fts_tokenizer_register(fts_tokenizer_generic); - fts_tokenizer_register(fts_tokenizer_email_address); + fts_tokenizers_init(); test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0); 
test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0); @@ -411,8 +410,7 @@ test_assert(*eopp == NULL); fts_tokenizer_unref(&tok); fts_tokenizer_unref(&gen_tok); - fts_tokenizer_unregister(fts_tokenizer_generic); - fts_tokenizer_unregister(fts_tokenizer_email_address); + fts_tokenizers_deinit(); test_end(); } @@ -437,8 +435,7 @@ unsigned int i; test_begin("fts tokenizer email address + parent, input one line at a time"); - fts_tokenizer_register(fts_tokenizer_generic); - fts_tokenizer_register(fts_tokenizer_email_address); + fts_tokenizers_init(); test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0); test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0); @@ -457,8 +454,7 @@ test_assert(*eopp == NULL); fts_tokenizer_unref(&tok); fts_tokenizer_unref(&gen_tok); - fts_tokenizer_unregister(fts_tokenizer_generic); - fts_tokenizer_unregister(fts_tokenizer_email_address); + fts_tokenizers_deinit(); test_end(); }
--- a/src/plugins/fts/fts-api-private.h Sat May 09 10:53:25 2015 +0300 +++ b/src/plugins/fts/fts-api-private.h Sat May 09 11:02:22 2015 +0300 @@ -76,8 +76,6 @@ struct fts_backend_vfuncs v; struct mail_namespace *ns; - struct fts_tokenizer *tokenizer; - unsigned int updating:1; };
--- a/src/plugins/fts/fts-build-mail.c Sat May 09 10:53:25 2015 +0300 +++ b/src/plugins/fts/fts-build-mail.c Sat May 09 11:02:22 2015 +0300 @@ -241,9 +241,11 @@ fts_build_add_tokens_with_filter(struct fts_mail_build_context *ctx, const unsigned char *data, size_t size) { - struct fts_tokenizer *tokenizer = ctx->update_ctx->backend->tokenizer; + struct fts_tokenizer *tokenizer; struct fts_filter *filter = ctx->cur_user_lang->filter; const char *token; + + tokenizer = fts_user_get_index_tokenizer(ctx->update_ctx->backend->ns->user); while ((token = fts_tokenizer_next(tokenizer, data, size)) != NULL) { if (filter != NULL) { token = fts_filter_filter(filter, token); @@ -311,7 +313,7 @@ } else { ctx->cur_user_lang = fts_user_language_find(user, lang); i_assert(ctx->cur_user_lang != NULL); - + if (ctx->pending_input->used > 0) { if (fts_build_add_tokens_with_filter(ctx, ctx->pending_input->data,
--- a/src/plugins/fts/fts-plugin.c Sat May 09 10:53:25 2015 +0300 +++ b/src/plugins/fts/fts-plugin.c Sat May 09 11:02:22 2015 +0300 @@ -3,6 +3,7 @@ #include "lib.h" #include "mail-storage-hooks.h" #include "fts-filter.h" +#include "fts-tokenizer.h" #include "fts-parser.h" #include "fts-storage.h" #include "fts-user.h" @@ -21,12 +22,14 @@ void fts_plugin_init(struct module *module) { fts_filters_init(); + fts_tokenizers_init(); mail_storage_hooks_add(module, &fts_mail_storage_hooks); } void fts_plugin_deinit(void) { fts_filters_deinit(); + fts_tokenizers_deinit(); fts_parsers_unload(); mail_storage_hooks_remove(&fts_mail_storage_hooks); }
--- a/src/plugins/fts/fts-search-args.c Sat May 09 10:53:25 2015 +0300 +++ b/src/plugins/fts/fts-search-args.c Sat May 09 11:02:22 2015 +0300 @@ -95,8 +95,10 @@ struct mail_search_arg *and_arg, *orig_arg = *argp; const char *token, *orig_token = orig_arg->value.str; unsigned int orig_token_len = strlen(orig_token); + struct fts_tokenizer *tokenizer; languages = fts_user_get_all_languages(backend->ns->user); + tokenizer = fts_user_get_search_tokenizer(backend->ns->user); /* we want all the tokens found from the string to be found, so create a parent AND and place all the filtered token alternatives under @@ -107,14 +109,14 @@ and_arg->next = orig_arg->next; *argp = and_arg; - while ((token = fts_tokenizer_next(backend->tokenizer, + while ((token = fts_tokenizer_next(tokenizer, (const void *)orig_token, orig_token_len)) != NULL) { fts_backend_dovecot_expand_lang_tokens(languages, pool, and_arg, orig_arg, orig_token, token); } - while ((token = fts_tokenizer_next(backend->tokenizer, NULL, 0)) != NULL) { + while ((token = fts_tokenizer_next(tokenizer, NULL, 0)) != NULL) { fts_backend_dovecot_expand_lang_tokens(languages, pool, and_arg, orig_arg, orig_token, token); @@ -151,7 +153,6 @@ int fts_search_args_expand(struct fts_backend *backend, struct mail_search_args *args) { - fts_search_args_expand_tree(backend, args->pool, &args->args); /* we'll need to re-simplify the args if we changed anything */
--- a/src/plugins/fts/fts-storage.c Sat May 09 10:53:25 2015 +0300 +++ b/src/plugins/fts/fts-storage.c Sat May 09 11:02:22 2015 +0300 @@ -738,23 +738,11 @@ { struct fts_mailbox_list *flist = FTS_LIST_CONTEXT(list); - if (flist->backend->tokenizer != NULL) - fts_tokenizer_unref(&flist->backend->tokenizer); fts_backend_deinit(&flist->backend); flist->module_ctx.super.deinit(list); } -static int fts_backend_init_libfts(struct fts_backend *backend) -{ - const char *error; - if (fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, - &backend->tokenizer, &error) < 0) { - i_error("Failed to initialize fts tokenizer: %s", error); - return -1; - } - return 0; -} static void fts_mailbox_list_init(struct mailbox_list *list, const char *name) @@ -773,8 +761,6 @@ if (fts_backend_init(name, list->ns, &error, &backend) < 0) { i_error("fts: Failed to initialize backend '%s': %s", name, error); - } else if (fts_backend_init_libfts(backend) < 0) { - fts_backend_deinit(&backend); } else { struct fts_mailbox_list *flist; struct mailbox_list_vfuncs *v = list->vlast;
--- a/src/plugins/fts/fts-user.c Sat May 09 10:53:25 2015 +0300 +++ b/src/plugins/fts/fts-user.c Sat May 09 11:02:22 2015 +0300 @@ -5,6 +5,7 @@ #include "mail-user.h" #include "fts-language.h" #include "fts-filter.h" +#include "fts-tokenizer.h" #include "fts-user.h" #define FTS_USER_CONTEXT(obj) \ @@ -14,6 +15,7 @@ union mail_user_module_context module_ctx; struct fts_language_list *lang_list; + struct fts_tokenizer *index_tokenizer, *search_tokenizer; ARRAY_TYPE(fts_user_language) languages; }; @@ -114,6 +116,85 @@ return 0; } +static int +fts_user_create_tokenizer(struct mail_user *user, + struct fts_tokenizer **tokenizer_r, bool search, + const char **error_r) +{ + const struct fts_tokenizer *tokenizer_class; + struct fts_tokenizer *tokenizer = NULL, *parent = NULL; + const char *tokenizers_key, *const *tokenizers; + const char *str, *error, *set_key, *const *settings; + unsigned int i; + int ret = 0; + + tokenizers_key = "fts_tokenizers"; + str = mail_user_plugin_getenv(user, tokenizers_key); + if (str == NULL) + str = "generic email-address"; /* default tokenizers */ + + tokenizers = t_strsplit_spaces(str, " "); + + for (i = 0; tokenizers[i] != NULL; i++) { + tokenizer_class = fts_tokenizer_find(tokenizers[i]); + if (tokenizer_class == NULL) { + *error_r = t_strdup_printf("%s: Unknown tokenizer '%s'", + tokenizers_key, tokenizers[i]); + ret = -1; + break; + } + + set_key = t_strdup_printf("fts_tokenizers_%s", tokenizers[i]); + str = mail_user_plugin_getenv(user, set_key); + + /* If the email-address tokenizer is included in the search + tokenizer, add a setting. */ + if (search && strcmp(fts_tokenizer_name(tokenizer_class), + FTS_TOKENIZER_EMAIL_ADDRESS_NAME) == 0) { + if (str == NULL) + str = "search yes"; + else + str = t_strconcat(str, " search yes", NULL); + } + + settings = str == NULL ? 
NULL : t_strsplit_spaces(str, " "); + + if (fts_tokenizer_create(tokenizer_class, parent, settings, + &tokenizer, &error) < 0) { + *error_r = t_strdup_printf( + "Tokenizer '%s' init via settings '%s' failed: %s", + tokenizers[i], set_key, error); + ret = -1; + break; + } + if (parent != NULL) + fts_tokenizer_unref(&parent); + parent = tokenizer; + } + if (ret < 0) { + if (parent != NULL) + fts_tokenizer_unref(&parent); + return -1; + } + *tokenizer_r = tokenizer; + return 0; +} + +static int fts_user_init_tokenizers(struct mail_user *user, + struct fts_user *fuser, + const char **error_r) +{ + if (fts_user_create_tokenizer(user, &fuser->index_tokenizer, FALSE, + error_r) < 0) + return -1; + + if (fts_user_create_tokenizer(user, &fuser->search_tokenizer, TRUE, + error_r) < 0) + return -1; + + return 0; +} + struct fts_user_language * fts_user_language_find(struct mail_user *user, const struct fts_language *lang) @@ -128,6 +209,20 @@ return NULL; } +struct fts_tokenizer *fts_user_get_index_tokenizer(struct mail_user *user) +{ + struct fts_user *fuser = FTS_USER_CONTEXT(user); + + return fuser->index_tokenizer; +} + +struct fts_tokenizer *fts_user_get_search_tokenizer(struct mail_user *user) +{ + struct fts_user *fuser = FTS_USER_CONTEXT(user); + + return fuser->search_tokenizer; +} + static int fts_user_language_create(struct mail_user *user, struct fts_user *fuser, const struct fts_language *lang, @@ -185,12 +280,16 @@ if ((*user_langp)->filter != NULL) fts_filter_unref(&(*user_langp)->filter); } + + if (fuser->index_tokenizer != NULL) + fts_tokenizer_unref(&fuser->index_tokenizer); + if (fuser->search_tokenizer != NULL) + fts_tokenizer_unref(&fuser->search_tokenizer); } int fts_mail_user_init(struct mail_user *user, const char **error_r) { struct fts_user *fuser; - const char *error; fuser = p_new(user->pool, struct fts_user, 1); p_array_init(&fuser->languages, user->pool, 4); @@ -199,11 +298,12 @@ fts_user_free(fuser); return -1; } - if 
(fts_user_languages_fill_all(user, fuser, &error) < 0) { - i_error("fts_dovecot: Failed to initialize languages: %s", error); + if (fts_user_languages_fill_all(user, fuser, error_r) < 0 || + fts_user_init_tokenizers(user, fuser, error_r) < 0) { fts_user_free(fuser); return -1; } + MODULE_CONTEXT_SET(user, fts_user_module, fuser); return 0; }
--- a/src/plugins/fts/fts-user.h Sat May 09 10:53:25 2015 +0300 +++ b/src/plugins/fts/fts-user.h Sat May 09 11:02:22 2015 +0300 @@ -10,7 +10,8 @@ struct fts_user_language * fts_user_language_find(struct mail_user *user, const struct fts_language *lang); - +struct fts_tokenizer *fts_user_get_index_tokenizer(struct mail_user *user); +struct fts_tokenizer *fts_user_get_search_tokenizer(struct mail_user *user); struct fts_language_list *fts_user_get_language_list(struct mail_user *user); const ARRAY_TYPE(fts_user_language) * fts_user_get_all_languages(struct mail_user *user);