changeset 19919:126a81a431d2

fts: Added "doveadm fts tokenize" command.
author Timo Sirainen <timo.sirainen@dovecot.fi>
date Thu, 03 Mar 2016 16:10:51 +0200
parents e7fb4e09d051
children c1b70b76f9bb
files src/plugins/fts/doveadm-fts.c
diffstat 1 files changed, 157 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/src/plugins/fts/doveadm-fts.c	Thu Mar 03 11:53:05 2016 +0200
+++ b/src/plugins/fts/doveadm-fts.c	Thu Mar 03 16:10:51 2016 +0200
@@ -6,14 +6,25 @@
 #include "mail-namespace.h"
 #include "mail-search.h"
 #include "mailbox-list-iter.h"
+#include "fts-tokenizer.h"
+#include "fts-filter.h"
+#include "fts-language.h"
 #include "fts-storage.h"
 #include "fts-search-args.h"
+#include "fts-user.h"
+#include "doveadm-print.h"
 #include "doveadm-mail.h"
 #include "doveadm-mailbox-list-iter.h"
 #include "doveadm-fts.h"
 
 const char *doveadm_fts_plugin_version = DOVECOT_ABI_VERSION;
 
+struct fts_tokenize_cmd_context {
+	struct doveadm_mail_cmd_context ctx;
+	const char *language;
+	const char *tokens;
+};
+
 static int
 cmd_search_box(struct doveadm_mail_cmd_context *ctx,
 	       const struct mailbox_info *info)
@@ -156,6 +167,142 @@
 }
 
 static int
+cmd_fts_tokenize_run(struct doveadm_mail_cmd_context *_ctx,
+		     struct mail_user *user)
+{
+	struct fts_tokenize_cmd_context *ctx =
+		(struct fts_tokenize_cmd_context *)_ctx;
+	struct mail_namespace *ns = mail_namespace_find_inbox(user->namespaces);
+	struct fts_backend *backend;
+	struct fts_user_language *user_lang;
+	const struct fts_language *lang = NULL;
+	int ret, ret2;
+	bool final = FALSE;
+
+	backend = fts_list_backend(ns->list);
+	if (backend == NULL) {
+		i_error("fts not enabled for INBOX");
+		_ctx->exit_code = EX_CONFIG;
+		return -1;
+	}
+
+	if (ctx->language == NULL) {
+		struct fts_language_list *lang_list =
+			fts_user_get_language_list(user);
+		enum fts_language_result result;
+
+		result = fts_language_detect(lang_list,
+		    (const unsigned char *)ctx->tokens, strlen(ctx->tokens),
+                    &lang);
+		if (lang == NULL)
+			lang = fts_language_list_get_first(lang_list);
+		switch (result) {
+		case FTS_LANGUAGE_RESULT_SHORT:
+			i_warning("Text too short, can't detect its language - assuming %s", lang->name);
+			break;
+		case FTS_LANGUAGE_RESULT_UNKNOWN:
+			i_warning("Can't detect its language - assuming %s", lang->name);
+			break;
+		case FTS_LANGUAGE_RESULT_OK:
+			break;
+		case FTS_LANGUAGE_RESULT_ERROR:
+			i_error("Language detection library initialization failed");
+			_ctx->exit_code = EX_CONFIG;
+			return -1;
+		default:
+			i_unreached();
+		}
+	} else {
+		lang = fts_language_find(ctx->language);
+		if (lang == NULL) {
+			i_error("Unknown language: %s", ctx->language);
+			_ctx->exit_code = EX_USAGE;
+			return -1;
+		}
+	}
+	user_lang = fts_user_language_find(user, lang);
+	if (user_lang == NULL) {
+		i_error("Language not enabled for user: %s", ctx->language);
+		_ctx->exit_code = EX_USAGE;
+		return -1;
+	}
+
+	fts_tokenizer_reset(user_lang->index_tokenizer);
+	for (;;) {
+		const char *token, *error;
+
+		if (!final) {
+			ret = fts_tokenizer_next(user_lang->index_tokenizer,
+				(const unsigned char *)ctx->tokens, strlen(ctx->tokens),
+				&token, &error);
+		} else {
+			ret = fts_tokenizer_final(user_lang->index_tokenizer,
+						  &token, &error);
+		}
+		if (ret < 0)
+			break;
+		if (ret > 0 && user_lang->filter != NULL) {
+			ret2 = fts_filter_filter(user_lang->filter, &token, &error);
+			if (ret2 > 0)
+				doveadm_print(token);
+			else if (ret2 < 0)
+				i_error("Couldn't create indexable tokens: %s", error);
+		}
+		if (ret == 0) {
+			if (final)
+				break;
+			final = TRUE;
+		}
+	}
+	return 0;
+}
+
+static void
+cmd_fts_tokenize_init(struct doveadm_mail_cmd_context *_ctx,
+		      const char *const args[])
+{
+	struct fts_tokenize_cmd_context *ctx =
+		(struct fts_tokenize_cmd_context *)_ctx;
+
+	if (args[0] == NULL)
+		doveadm_mail_help_name("fts tokenize");
+
+	ctx->tokens = p_strdup(_ctx->pool, t_strarray_join(args, " "));
+
+	doveadm_print_init(DOVEADM_PRINT_TYPE_FLOW);
+	doveadm_print_header("token", "token", DOVEADM_PRINT_HEADER_FLAG_HIDE_TITLE);
+}
+
+static bool
+cmd_fts_tokenize_parse_arg(struct doveadm_mail_cmd_context *_ctx, int c)
+{
+	struct fts_tokenize_cmd_context *ctx =
+		(struct fts_tokenize_cmd_context *)_ctx;
+
+	switch (c) {
+	case 'l':
+		ctx->language = p_strdup(_ctx->pool, optarg);
+		break;
+	default:
+		return FALSE;
+	}
+	return TRUE;
+}
+
+static struct doveadm_mail_cmd_context *
+cmd_fts_tokenize_alloc(void)
+{
+	struct fts_tokenize_cmd_context *ctx;
+
+	ctx = doveadm_mail_cmd_alloc(struct fts_tokenize_cmd_context);
+	ctx->ctx.v.run = cmd_fts_tokenize_run;
+	ctx->ctx.v.init = cmd_fts_tokenize_init;
+	ctx->ctx.v.parse_arg = cmd_fts_tokenize_parse_arg;
+	ctx->ctx.getopt_args = "l";
+	return &ctx->ctx;
+}
+
+static int
 fts_namespace_find(struct mail_user *user, const char *ns_prefix,
 		   struct mail_namespace **ns_r)
 {
@@ -279,6 +426,16 @@
 DOVEADM_CMD_PARAMS_END
 },
 {
+	.name = "fts tokenize",
+	.mail_cmd = cmd_fts_tokenize_alloc,
+	.usage = DOVEADM_CMD_MAIL_USAGE_PREFIX "<text>",
+DOVEADM_CMD_PARAMS_START
+DOVEADM_CMD_MAIL_COMMON
+DOVEADM_CMD_PARAM('l', "language", CMD_PARAM_STR, 0)
+DOVEADM_CMD_PARAM('\0', "text", CMD_PARAM_ARRAY, CMD_PARAM_FLAG_POSITIONAL)
+DOVEADM_CMD_PARAMS_END
+},
+{
 	.name = "fts optimize",
 	.mail_cmd = cmd_fts_optimize_alloc,
 	.usage = DOVEADM_CMD_MAIL_USAGE_PREFIX "[<namespace>]",