changeset 13122:bbb023aa1efd

fts: Added attachment decoding via an external script.
author Timo Sirainen <tss@iki.fi>
date Thu, 28 Jul 2011 17:23:01 +0300
parents 15082db3225a
children 6f43ee658efb
files src/plugins/fts/Makefile.am src/plugins/fts/decode2text.sh src/plugins/fts/fts-build.c src/plugins/fts/fts-parser-html.c src/plugins/fts/fts-parser-script.c src/plugins/fts/fts-parser.c src/plugins/fts/fts-parser.h
diffstat 7 files changed, 413 insertions(+), 21 deletions(-) [+]
line wrap: on
line diff
--- a/src/plugins/fts/Makefile.am	Thu Jul 28 17:01:26 2011 +0300
+++ b/src/plugins/fts/Makefile.am	Thu Jul 28 17:23:01 2011 +0300
@@ -19,6 +19,7 @@
 	fts-build-virtual.c \
 	fts-parser.c \
 	fts-parser-html.c \
+	fts-parser-script.c \
 	fts-plugin.c \
 	fts-search.c \
 	fts-search-serialize.c \
@@ -34,3 +35,5 @@
 	fts-plugin.h \
 	fts-search-serialize.h \
 	fts-storage.h
+
+EXTRA_DIST = decode2text.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugins/fts/decode2text.sh	Thu Jul 28 17:23:01 2011 +0300
@@ -0,0 +1,69 @@
+#!/bin/sh
+
+# Example attachment decoder script. The attachment comes from stdin, and
+# the script is expected to output UTF-8 data to stdout. (If the output isn't
+# UTF-8, everything except valid UTF-8 sequences is dropped from it.)
+
+# The attachment decoding is enabled by setting:
+#
+# plugin {
+#   fts_decoder = decode2text
+# }
+# service decode2text {
+#   executable = script /usr/local/bin/decode2text.sh
+#   user = dovecot
+#   unix_listener decode2text {
+#     mode = 0666
+#   }
+# }
+
+content_type=$1
+
+# One supported format per line: "<content-type> <extension> [<extension> ...]".
+# The extensions are used to recognize the format from the attachment's
+# filename when its content type is application/octet-stream.
+formats='application/pdf pdf
+application/x-pdf pdf
+application/msword doc
+application/mspowerpoint ppt
+application/vnd.ms-powerpoint ppt
+application/ms-excel xls
+application/x-msexcel xls
+application/vnd.ms-excel xls
+'
+
+# Called without a content type: list the supported formats. echo adds an
+# empty line after the list, which marks its end for the caller.
+if [ "$content_type" = "" ]; then
+  echo "$formats"
+  exit 0
+fi
+
+# Compare the content type as a plain string against the first column. It
+# originates from the message, so it must not be used as a regular expression.
+fmt=`echo "$formats" | awk -v ct="$content_type" '$1 == ct { print $2; exit }'`
+if [ "$fmt" = "" ]; then
+  echo "Content-Type: $content_type not supported" >&2
+  exit 1
+fi
+
+# most decoders can't handle stdin directly, so write the attachment
+# to a temp file
+path=`mktemp` || exit 1
+# remove the temp file on every exit; signals exit so that the exit trap runs
+trap 'rm -f "$path"' 0
+trap 'exit 1' 1 2 3 15
+cat > "$path"
+
+LANG=en_US.UTF-8
+export LANG
+case "$fmt" in
+  pdf) /usr/bin/pdftotext "$path" - ;;
+  doc) /usr/bin/catdoc "$path" ;;
+  ppt) /usr/bin/catppt "$path" ;;
+  xls) /usr/bin/xls2csv "$path" ;;
+  *)
+    echo "Buggy decoder script: $fmt not handled" >&2
+    exit 1
+    ;;
+esac
--- a/src/plugins/fts/fts-build.c	Thu Jul 28 17:01:26 2011 +0300
+++ b/src/plugins/fts/fts-build.c	Thu Jul 28 17:23:01 2011 +0300
@@ -29,7 +29,8 @@
 		content_type = t_str_new(64);
 		if (rfc822_parse_content_type(&parser, content_type) >= 0) {
 			i_free(ctx->content_type);
-			ctx->content_type = i_strdup(str_c(content_type));
+			ctx->content_type =
+				str_lcase(i_strdup(str_c(content_type)));
 		}
 	} T_END;
 }
@@ -99,21 +100,30 @@
 	} T_END;
 }
 
-static bool fts_build_body_begin(struct fts_storage_build_context *ctx)
+static bool
+fts_build_body_begin(struct fts_storage_build_context *ctx, bool *binary_body_r)
 {
 	const char *content_type;
 	struct fts_backend_build_key key;
 
 	i_assert(ctx->body_parser == NULL);
 
+	*binary_body_r = FALSE;
 	memset(&key, 0, sizeof(key));
 	key.uid = ctx->uid;
 
 	content_type = ctx->content_type != NULL ?
 		ctx->content_type : "text/plain";
-	if (fts_parser_init(content_type, ctx->content_disposition,
+	if (strncmp(content_type, "multipart/", 10) == 0) {
+		/* multiparts are never indexed, only their contents */
+		return FALSE;
+	}
+
+	if (fts_parser_init(ctx->box->storage->user,
+			    content_type, ctx->content_disposition,
 			    &ctx->body_parser)) {
 		/* extract text using the the returned parser */
+		*binary_body_r = TRUE;
 		key.type = FTS_BACKEND_BUILD_KEY_BODY_PART;
 	} else if (strncmp(content_type, "text/", 5) == 0 ||
 		   strncmp(content_type, "message/", 8) == 0) {
@@ -123,6 +133,7 @@
 		/* possibly binary */
 		if (!ctx->binary_mime_parts)
 			return FALSE;
+		*binary_body_r = TRUE;
 		key.type = FTS_BACKEND_BUILD_KEY_BODY_PART_BINARY;
 	}
 	key.body_content_type = content_type;
@@ -130,6 +141,26 @@
 	return fts_backend_update_set_build_key(ctx->update_ctx, &key);
 }
 
+/* Fetch the output the body parser still holds after the last body block,
+   pass it to the backend and free the parser. Returns 0 on success, -1 if
+   the backend failed to take the data. */
+static int fts_body_parser_finish(struct fts_storage_build_context *ctx)
+{
+	struct message_block out;
+	bool failed;
+
+	for (;;) {
+		/* an empty input block asks the parser for pending output */
+		memset(&out, 0, sizeof(out));
+		fts_parser_more(ctx->body_parser, &out);
+		failed = fts_backend_update_build_more(ctx->update_ctx,
+						       out.data, out.size) < 0;
+		if (failed || out.size == 0)
+			break;
+	}
+
+	fts_parser_deinit(&ctx->body_parser);
+	return failed ? -1 : 0;
+}
+
 int fts_build_mail(struct fts_storage_build_context *ctx, struct mail *mail)
 {
 	enum message_decoder_flags decoder_flags = 0;
@@ -139,6 +170,7 @@
 	struct message_block raw_block, block;
 	struct message_part *prev_part, *parts;
 	bool skip_body = FALSE, body_part = FALSE, body_added = FALSE;
+	bool binary_body;
 	int ret;
 
 	ctx->uid = mail->uid;
@@ -153,8 +185,6 @@
 
 	if (ctx->dtcase)
 		decoder_flags |= MESSAGE_DECODER_FLAG_DTCASE;
-	if (ctx->binary_mime_parts)
-		decoder_flags |= MESSAGE_DECODER_FLAG_RETURN_BINARY;
 	decoder = message_decoder_init(decoder_flags);
 	for (;;) {
 		ret = message_parser_parse_next_block(parser, &raw_block);
@@ -168,8 +198,13 @@
 		if (raw_block.part != prev_part) {
 			/* body part changed. we're now parsing the end of
 			   boundary, possibly followed by message epilogue */
-			if (ctx->body_parser != NULL)
-				fts_parser_deinit(&ctx->body_parser);
+			if (ctx->body_parser != NULL) {
+				if (fts_body_parser_finish(ctx) < 0) {
+					ret = -1;
+					break;
+				}
+			}
+			message_decoder_set_return_binary(decoder, FALSE);
 			fts_backend_update_unset_build_key(ctx->update_ctx);
 			prev_part = raw_block.part;
 			i_free_and_null(ctx->content_type);
@@ -186,7 +221,9 @@
 			/* always handle headers */
 		} else if (raw_block.size == 0) {
 			/* end of headers */
-			skip_body = !fts_build_body_begin(ctx);
+			skip_body = !fts_build_body_begin(ctx, &binary_body);
+			if (binary_body)
+				message_decoder_set_return_binary(decoder, TRUE);
 			body_part = TRUE;
 		} else {
 			if (skip_body)
@@ -215,6 +252,8 @@
 			body_added = TRUE;
 		}
 	}
+	if (ret == 0 && ctx->body_parser != NULL)
+		ret = fts_body_parser_finish(ctx);
 	if (ret == 0 && body_part && !skip_body && !body_added) {
 		/* make sure body is added even when it doesn't exist */
 		ret = fts_backend_update_build_more(ctx->update_ctx, NULL, 0);
--- a/src/plugins/fts/fts-parser-html.c	Thu Jul 28 17:01:26 2011 +0300
+++ b/src/plugins/fts/fts-parser-html.c	Thu Jul 28 17:23:01 2011 +0300
@@ -44,7 +44,8 @@
 };
 
 static struct fts_parser *
-fts_parser_html_try_init(const char *content_type ATTR_UNUSED,
+fts_parser_html_try_init(struct mail_user *user ATTR_UNUSED,
+			 const char *content_type,
 			 const char *content_disposition ATTR_UNUSED)
 {
 	struct html_fts_parser *parser;
@@ -53,7 +54,7 @@
 		return NULL;
 
 	parser = i_new(struct html_fts_parser, 1);
-	parser->parser = fts_parser_html;
+	parser->parser.v = fts_parser_html;
 	parser->input = buffer_create_dynamic(default_pool, 512);
 	parser->output = buffer_create_dynamic(default_pool, 4096);
 	return &parser->parser;
@@ -234,7 +235,7 @@
 	i_free(parser);
 }
 
-struct fts_parser fts_parser_html = {
+struct fts_parser_vfuncs fts_parser_html = {
 	fts_parser_html_try_init,
 	fts_parser_html_more,
 	fts_parser_html_deinit
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugins/fts/fts-parser-script.c	Thu Jul 28 17:23:01 2011 +0300
@@ -0,0 +1,250 @@
+/* Copyright (c) 2011 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "network.h"
+#include "istream.h"
+#include "write-full.h"
+#include "module-context.h"
+#include "rfc822-parser.h"
+#include "rfc2231-parser.h"
+#include "message-parser.h"
+#include "mail-user.h"
+#include "fts-parser.h"
+
+#define SCRIPT_USER_CONTEXT(obj) \
+	MODULE_CONTEXT(obj, fts_parser_script_user_module)
+
+#define SCRIPT_HANDSHAKE "VERSION\tscript\t3\t0\nnoreply\n"
+
+struct content {	/* one format that the decoder script reported as supported */
+	const char *content_type;	/* first word of the line sent by the script */
+	const char *const *extensions;	/* remaining words of that line, NULL-terminated */
+};
+
+struct fts_parser_script_user {	/* per-user state, looked up with SCRIPT_USER_CONTEXT() */
+	union mail_user_module_context module_ctx;
+
+	ARRAY_DEFINE(content, struct content);	/* formats appended by script_contents_read() */
+};
+
+struct script_fts_parser {	/* parser that passes one body part through the decoder script */
+	struct fts_parser parser;	/* kept first: struct fts_parser pointers are cast back to this struct */
+
+	int fd;	/* connection returned by script_connect() */
+	char *path;	/* socket path, used in error messages */
+
+	unsigned char outbuf[IO_BLOCK_SIZE];	/* latest chunk read back from the script */
+	bool failed;	/* a write to the script failed; later input is not sent */
+	bool shutdown;	/* shutdown(SHUT_WR) was already called on fd */
+};
+
+static MODULE_CONTEXT_DEFINE_INIT(fts_parser_script_user_module,
+				  &mail_user_module_register);
+
+/* Connect to the socket named by the fts_decoder plugin setting. Paths that
+   don't begin with '/' are taken relative to base_dir. Returns the fd, with
+   non-blocking mode turned off, or -1 on failure. path_r is set to the
+   socket path unless fts_decoder is unset, in which case -1 is returned
+   without logging anything. */
+static int script_connect(struct mail_user *user, const char **path_r)
+{
+	const char *socket_path = mail_user_plugin_getenv(user, "fts_decoder");
+	int fd;
+
+	if (socket_path == NULL)
+		return -1;
+	if (socket_path[0] != '/') {
+		socket_path = t_strconcat(user->set->base_dir, "/",
+					  socket_path, NULL);
+	}
+	*path_r = socket_path;
+
+	fd = net_connect_unix_with_retries(socket_path, 1000);
+	if (fd != -1)
+		net_set_nonblock(fd, FALSE);
+	else
+		i_error("net_connect_unix(%s) failed: %m", socket_path);
+	return fd;
+}
+
+/* Ask the decoder script which formats it supports and append them to the
+   user's content list. Only the handshake is sent, without a content type;
+   the reply is read as "<content-type> <extension> [...]" lines up to an
+   empty line. Returns -1 if connecting or writing fails. Otherwise returns 0;
+   invalid lines and a missing end-of-list line are only logged. */
+static int script_contents_read(struct mail_user *user)
+{
+	struct fts_parser_script_user *suser = SCRIPT_USER_CONTEXT(user);
+	const char *socket_path, *request, *line;
+	char **words;
+	struct istream *reply;
+	struct content *entry;
+	bool terminated = FALSE;
+	int fd;
+
+	fd = script_connect(user, &socket_path);
+	if (fd == -1)
+		return -1;
+
+	request = t_strdup_printf(SCRIPT_HANDSHAKE"\n");
+	if (write_full(fd, request, strlen(request)) < 0) {
+		i_error("write(%s) failed: %m", socket_path);
+		(void)close(fd);
+		return -1;
+	}
+
+	reply = i_stream_create_fd(fd, 1024, TRUE);
+	for (;;) {
+		line = i_stream_read_next_line(reply);
+		if (line == NULL)
+			break;
+
+		/* <content-type> <extension> [<extension> ...] */
+		words = p_strsplit_spaces(user->pool, line, " ");
+		if (words[0] == NULL) {
+			/* a line without any words ends the list */
+			terminated = TRUE;
+			break;
+		}
+		if (words[0][0] != '\0' && words[1] != NULL) {
+			entry = array_append_space(&suser->content);
+			entry->content_type = words[0];
+			entry->extensions = (const void *)(words+1);
+		} else {
+			i_error("parser script sent invalid input: %s", line);
+		}
+	}
+	i_stream_destroy(&reply);
+
+	if (!terminated)
+		i_error("parser script didn't send empty EOF line");
+	return 0;
+}
+
+/* Returns TRUE if the decoder script can handle the given content.
+
+   The list of supported formats is read from the script the first time it
+   is needed and kept in the user's module context. For any content type
+   other than application/octet-stream the type itself must be in the list.
+   For application/octet-stream the format is looked up by the attachment's
+   filename extension (may be NULL if unknown); on a match *content_type is
+   replaced with the matching format's content type. */
+static bool script_support_content(struct mail_user *user,
+				   const char **content_type,
+				   const char *extension)
+{
+	struct fts_parser_script_user *suser = SCRIPT_USER_CONTEXT(user);
+	const struct content *content;
+
+	if (suser == NULL) {
+		/* allocate from the user's pool, like the array below, so
+		   that the context isn't leaked when the user is freed */
+		suser = p_new(user->pool, struct fts_parser_script_user, 1);
+		p_array_init(&suser->content, user->pool, 32);
+		MODULE_CONTEXT_SET(user, fts_parser_script_user_module, suser);
+	}
+	if (array_count(&suser->content) == 0) {
+		/* script_contents_read() fails before appending anything, so
+		   the array is left as it is (empty but initialized): the
+		   context stays attached to the user and the next call uses
+		   the array again to retry the read */
+		if (script_contents_read(user) < 0)
+			return FALSE;
+	}
+
+	if (strcmp(*content_type, "application/octet-stream") != 0) {
+		array_foreach(&suser->content, content) {
+			if (strcmp(content->content_type, *content_type) == 0)
+				return TRUE;
+		}
+		return FALSE;
+	}
+
+	/* generic binary data: the format can be recognized only from the
+	   filename extension, which doesn't exist if the part had no
+	   usable Content-Disposition */
+	if (extension == NULL)
+		return FALSE;
+	array_foreach(&suser->content, content) {
+		if (content->extensions != NULL &&
+		    str_array_icase_find(content->extensions, extension)) {
+			*content_type = content->content_type;
+			return TRUE;
+		}
+	}
+	return FALSE;
+}
+
+/* Parse a Content-Disposition header value and return the extension of its
+   "filename" parameter in extension_r, i.e. the text following the last
+   '.' of the filename. extension_r is set to NULL if the header is missing
+   or can't be parsed, if it has no filename parameter, or if the filename
+   has no extension. */
+static void parse_content_disposition(const char *content_disposition,
+				      const char **extension_r)
+{
+	struct rfc822_parser_context parser;
+	const char *const *results;
+	const char *dot;
+	string_t *str;
+
+	*extension_r = NULL;
+
+	if (content_disposition == NULL)
+		return;
+
+	rfc822_parser_init(&parser, (const unsigned char *)content_disposition,
+			   strlen(content_disposition), NULL);
+	(void)rfc822_skip_lwsp(&parser);
+
+	/* type; param; param; .. */
+	str = t_str_new(32);
+	if (rfc822_parse_mime_token(&parser, str) < 0)
+		return;
+
+	(void)rfc2231_parse(&parser, &results);
+	for (; *results != NULL; results += 2) {
+		if (strcasecmp(results[0], "filename") != 0)
+			continue;
+
+		/* the value is the whole filename, but it is compared against
+		   bare extensions such as "pdf", so return only the part
+		   after the last dot */
+		dot = strrchr(results[1], '.');
+		if (dot != NULL && dot[1] != '\0')
+			*extension_r = dot + 1;
+		break;
+	}
+}
+
+/* try_init callback of the script parser. Returns NULL if the decoder
+   script doesn't support the content, or if connecting to the script or
+   sending the request fails. Otherwise returns a new parser whose
+   connection has already been sent the handshake and the content type. */
+static struct fts_parser *
+fts_parser_script_try_init(struct mail_user *user,
+			   const char *content_type,
+			   const char *content_disposition)
+{
+	struct script_fts_parser *sparser;
+	const char *extension, *socket_path, *request;
+	int fd;
+
+	parse_content_disposition(content_disposition, &extension);
+	/* may replace content_type with the one found by the extension */
+	if (!script_support_content(user, &content_type, extension))
+		return NULL;
+
+	fd = script_connect(user, &socket_path);
+	if (fd == -1)
+		return NULL;
+
+	request = t_strdup_printf(SCRIPT_HANDSHAKE"%s\n\n", content_type);
+	if (write_full(fd, request, strlen(request)) < 0) {
+		i_error("write(%s) failed: %m", socket_path);
+		(void)close(fd);
+		return NULL;
+	}
+
+	sparser = i_new(struct script_fts_parser, 1);
+	sparser->fd = fd;
+	sparser->path = i_strdup(socket_path);
+	sparser->parser.v = fts_parser_script;
+	return &sparser->parser;
+}
+
+/* more callback of the script parser. A non-empty block is written to the
+   script and block->size is set to 0, so nothing is returned yet. An empty
+   block shuts down the write side on the first such call, then reads the
+   next chunk of the script's output into block. On read failure the block
+   is left untouched (still empty). */
+static void fts_parser_script_more(struct fts_parser *_parser,
+				   struct message_block *block)
+{
+	struct script_fts_parser *parser = (struct script_fts_parser *)_parser;
+	ssize_t nread;
+
+	if (block->size > 0) {
+		/* first we'll send everything to the script. after a failed
+		   write the rest of the input is silently dropped. */
+		if (!parser->failed &&
+		    write_full(parser->fd, block->data, block->size) < 0) {
+			i_error("write(%s) failed: %m", parser->path);
+			parser->failed = TRUE;
+		}
+		block->size = 0;
+		return;
+	}
+
+	if (!parser->shutdown) {
+		if (shutdown(parser->fd, SHUT_WR) < 0)
+			i_error("shutdown(%s) failed: %m", parser->path);
+		parser->shutdown = TRUE;
+	}
+
+	/* read the result from the script */
+	nread = read(parser->fd, parser->outbuf, sizeof(parser->outbuf));
+	if (nread < 0) {
+		i_error("read(%s) failed: %m", parser->path);
+		return;
+	}
+	block->data = parser->outbuf;
+	block->size = nread;
+}
+
+/* deinit callback of the script parser: closes the connection to the
+   script and frees the parser. A failing close() is only logged. */
+static void fts_parser_script_deinit(struct fts_parser *_parser)
+{
+	struct script_fts_parser *sparser =
+		(struct script_fts_parser *)_parser;
+	int ret = close(sparser->fd);
+
+	if (ret < 0)
+		i_error("close(%s) failed: %m", sparser->path);
+	i_free(sparser->path);
+	i_free(sparser);
+}
+
+/* Callbacks of the parser that decodes attachments via an external script. */
+struct fts_parser_vfuncs fts_parser_script = {
+	.try_init = fts_parser_script_try_init,
+	.more = fts_parser_script_more,
+	.deinit = fts_parser_script_deinit
+};
--- a/src/plugins/fts/fts-parser.c	Thu Jul 28 17:01:26 2011 +0300
+++ b/src/plugins/fts/fts-parser.c	Thu Jul 28 17:23:01 2011 +0300
@@ -1,19 +1,24 @@
 /* Copyright (c) 2011 Dovecot authors, see the included COPYING file */
 
 #include "lib.h"
+#include "buffer.h"
+#include "unichar.h"
+#include "message-parser.h"
 #include "fts-parser.h"
 
-const struct fts_parser *parsers[] = {
-	&fts_parser_html
+const struct fts_parser_vfuncs *parsers[] = {
+	&fts_parser_html,
+	&fts_parser_script
 };
 
-bool fts_parser_init(const char *content_type, const char *content_disposition,
+bool fts_parser_init(struct mail_user *user,
+		     const char *content_type, const char *content_disposition,
 		     struct fts_parser **parser_r)
 {
 	unsigned int i;
 
 	for (i = 0; i < N_ELEMENTS(parsers); i++) {
-		*parser_r = parsers[i]->try_init(content_type,
+		*parser_r = parsers[i]->try_init(user, content_type,
 						 content_disposition);
 		if (*parser_r != NULL)
 			return TRUE;
@@ -23,7 +28,21 @@
 
 void fts_parser_more(struct fts_parser *parser, struct message_block *block)
 {
-	parser->more(parser, block);
+	parser->v.more(parser, block);
+
+	if (!uni_utf8_data_is_valid(block->data, block->size)) {
+		/* output isn't valid UTF-8. make it. */
+		if (parser->utf8_output == NULL) {
+			parser->utf8_output =
+				buffer_create_dynamic(default_pool, 4096);
+		} else {
+			buffer_set_used_size(parser->utf8_output, 0);
+		}
+		(void)uni_utf8_get_valid_data(block->data, block->size,
+					      parser->utf8_output);
+		block->data = parser->utf8_output->data;
+		block->size = parser->utf8_output->used;
+	}
 }
 
 void fts_parser_deinit(struct fts_parser **_parser)
@@ -31,5 +50,8 @@
 	struct fts_parser *parser = *_parser;
 
 	*_parser = NULL;
-	parser->deinit(parser);
+
+	if (parser->utf8_output != NULL)
+		buffer_free(&parser->utf8_output);
+	parser->v.deinit(parser);
 }
--- a/src/plugins/fts/fts-parser.h	Thu Jul 28 17:01:26 2011 +0300
+++ b/src/plugins/fts/fts-parser.h	Thu Jul 28 17:23:01 2011 +0300
@@ -2,18 +2,30 @@
 #define FTS_PARSER_H
 
 struct message_block;
+struct mail_user;
 
-struct fts_parser {
-	struct fts_parser *(*try_init)(const char *content_type,
+struct fts_parser_vfuncs {
+	struct fts_parser *(*try_init)(struct mail_user *user,
+				       const char *content_type,
 				       const char *content_disposition);
 	void (*more)(struct fts_parser *parser, struct message_block *block);
 	void (*deinit)(struct fts_parser *parser);
 };
 
-extern struct fts_parser fts_parser_html;
+struct fts_parser {
+	struct fts_parser_vfuncs v;
+	buffer_t *utf8_output;
+};
 
-bool fts_parser_init(const char *content_type, const char *content_disposition,
+extern struct fts_parser_vfuncs fts_parser_html;
+extern struct fts_parser_vfuncs fts_parser_script;
+
+bool fts_parser_init(struct mail_user *user,
+		     const char *content_type, const char *content_disposition,
 		     struct fts_parser **parser_r);
+/* The parser is first called once for each message body block. After the last
+   block it is called repeatedly with block->size=0 for as long as it keeps
+   returning more output, i.e. until it leaves the size at zero. */
 void fts_parser_more(struct fts_parser *parser, struct message_block *block);
 void fts_parser_deinit(struct fts_parser **parser);