Mercurial > dovecot > core-2.2
changeset 13122:bbb023aa1efd
fts: Added attachment decoding via an external script.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Thu, 28 Jul 2011 17:23:01 +0300 |
parents | 15082db3225a |
children | 6f43ee658efb |
files | src/plugins/fts/Makefile.am src/plugins/fts/decode2text.sh src/plugins/fts/fts-build.c src/plugins/fts/fts-parser-html.c src/plugins/fts/fts-parser-script.c src/plugins/fts/fts-parser.c src/plugins/fts/fts-parser.h |
diffstat | 7 files changed, 413 insertions(+), 21 deletions(-) [+] |
line wrap: on
line diff
--- a/src/plugins/fts/Makefile.am Thu Jul 28 17:01:26 2011 +0300 +++ b/src/plugins/fts/Makefile.am Thu Jul 28 17:23:01 2011 +0300 @@ -19,6 +19,7 @@ fts-build-virtual.c \ fts-parser.c \ fts-parser-html.c \ + fts-parser-script.c \ fts-plugin.c \ fts-search.c \ fts-search-serialize.c \ @@ -34,3 +35,5 @@ fts-plugin.h \ fts-search-serialize.h \ fts-storage.h + +EXTRA_DIST = decode2text.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/plugins/fts/decode2text.sh Thu Jul 28 17:23:01 2011 +0300 @@ -0,0 +1,65 @@ +#!/bin/sh + +# Example attachment decoder script. The attachment comes from stdin, and +# the script is expected to output UTF-8 data to stdout. (If the output isn't +# UTF-8, everything except valid UTF-8 sequences are dropped from it.) + +# The attachment decoding is enabled by setting: +# +# plugin { +# fts_decoder = decode2text +# } +# service decode2text { +# executable = script /usr/local/bin/decode2text.sh +# user = dovecot +# unix_listener decode2text { +# mode = 0666 +# } +# } + +content_type=$1 + +# The second parameter is the format's filename extension, which is used when +# found from a filename of application/octet-stream. You can also add more +# extensions by giving more parameters. +formats='application/pdf pdf +application/x-pdf pdf +application/msword doc +application/mspowerpoint ppt +application/vnd.ms-powerpoint ppt +application/ms-excel xls +application/x-msexcel xls +application/vnd.ms-excel xls +' + +if [ "$content_type" = "" ]; then + echo "$formats" + exit 0 +fi + +fmt=`echo "$formats" | grep -w "^$content_type" | cut -d ' ' -f 2` +if [ "$fmt" = "" ]; then + echo "Content-Type: $content_type not supported" >&2 + exit 1 +fi + +# most decoders can't handle stdin directly, so write the attachment +# to a temp file +path=`mktemp` +trap "rm -f $path" 0 1 2 3 15 +cat > $path + +LANG=en_US.UTF-8 +export LANG +if [ $fmt = "pdf" ]; then + /usr/bin/pdftotext $path - +elif [ $fmt = "doc" ]; then + /usr/bin/catdoc $path +elif [ $fmt = "ppt" ]; then + /usr/bin/catppt $path +elif [ $fmt = "xls" ]; then + /usr/bin/xls2csv $path +else + echo "Buggy decoder script: $fmt not handled" >&2 + exit 1 +fi
--- a/src/plugins/fts/fts-build.c Thu Jul 28 17:01:26 2011 +0300 +++ b/src/plugins/fts/fts-build.c Thu Jul 28 17:23:01 2011 +0300 @@ -29,7 +29,8 @@ content_type = t_str_new(64); if (rfc822_parse_content_type(&parser, content_type) >= 0) { i_free(ctx->content_type); - ctx->content_type = i_strdup(str_c(content_type)); + ctx->content_type = + str_lcase(i_strdup(str_c(content_type))); } } T_END; } @@ -99,21 +100,30 @@ } T_END; } -static bool fts_build_body_begin(struct fts_storage_build_context *ctx) +static bool +fts_build_body_begin(struct fts_storage_build_context *ctx, bool *binary_body_r) { const char *content_type; struct fts_backend_build_key key; i_assert(ctx->body_parser == NULL); + *binary_body_r = FALSE; memset(&key, 0, sizeof(key)); key.uid = ctx->uid; content_type = ctx->content_type != NULL ? ctx->content_type : "text/plain"; - if (fts_parser_init(content_type, ctx->content_disposition, + if (strncmp(content_type, "multipart/", 10) == 0) { + /* multiparts are never indexed, only their contents */ + return FALSE; + } + + if (fts_parser_init(ctx->box->storage->user, + content_type, ctx->content_disposition, &ctx->body_parser)) { /* extract text using the the returned parser */ + *binary_body_r = TRUE; key.type = FTS_BACKEND_BUILD_KEY_BODY_PART; } else if (strncmp(content_type, "text/", 5) == 0 || strncmp(content_type, "message/", 8) == 0) { @@ -123,6 +133,7 @@ /* possibly binary */ if (!ctx->binary_mime_parts) return FALSE; + *binary_body_r = TRUE; key.type = FTS_BACKEND_BUILD_KEY_BODY_PART_BINARY; } key.body_content_type = content_type; @@ -130,6 +141,26 @@ return fts_backend_update_set_build_key(ctx->update_ctx, &key); } +static int fts_body_parser_finish(struct fts_storage_build_context *ctx) +{ + struct message_block block; + int ret = 0; + + do { + memset(&block, 0, sizeof(block)); + fts_parser_more(ctx->body_parser, &block); + if (fts_backend_update_build_more(ctx->update_ctx, + block.data, + block.size) < 0) { + ret = -1; + break; + } + } while (block.size > 0); + + fts_parser_deinit(&ctx->body_parser); + return ret; +} + int fts_build_mail(struct fts_storage_build_context *ctx, struct mail *mail) { enum message_decoder_flags decoder_flags = 0; @@ -139,6 +170,7 @@ struct message_block raw_block, block; struct message_part *prev_part, *parts; bool skip_body = FALSE, body_part = FALSE, body_added = FALSE; + bool binary_body; int ret; ctx->uid = mail->uid; @@ -153,8 +185,6 @@ if (ctx->dtcase) decoder_flags |= MESSAGE_DECODER_FLAG_DTCASE; - if (ctx->binary_mime_parts) - decoder_flags |= MESSAGE_DECODER_FLAG_RETURN_BINARY; decoder = message_decoder_init(decoder_flags); for (;;) { ret = message_parser_parse_next_block(parser, &raw_block); @@ -168,8 +198,13 @@ if (raw_block.part != prev_part) { /* body part changed. we're now parsing the end of boundary, possibly followed by message epilogue */ - if (ctx->body_parser != NULL) - fts_parser_deinit(&ctx->body_parser); + if (ctx->body_parser != NULL) { + if (fts_body_parser_finish(ctx) < 0) { + ret = -1; + break; + } + } + message_decoder_set_return_binary(decoder, FALSE); fts_backend_update_unset_build_key(ctx->update_ctx); prev_part = raw_block.part; i_free_and_null(ctx->content_type); @@ -186,7 +221,9 @@ /* always handle headers */ } else if (raw_block.size == 0) { /* end of headers */ - skip_body = !fts_build_body_begin(ctx); + skip_body = !fts_build_body_begin(ctx, &binary_body); + if (binary_body) + message_decoder_set_return_binary(decoder, TRUE); body_part = TRUE; } else { if (skip_body) @@ -215,6 +252,8 @@ body_added = TRUE; } } + if (ret == 0 && ctx->body_parser != NULL) + ret = fts_body_parser_finish(ctx); if (ret == 0 && body_part && !skip_body && !body_added) { /* make sure body is added even when it doesn't exist */ ret = fts_backend_update_build_more(ctx->update_ctx, NULL, 0);
--- a/src/plugins/fts/fts-parser-html.c Thu Jul 28 17:01:26 2011 +0300 +++ b/src/plugins/fts/fts-parser-html.c Thu Jul 28 17:23:01 2011 +0300 @@ -44,7 +44,8 @@ }; static struct fts_parser * -fts_parser_html_try_init(const char *content_type ATTR_UNUSED, +fts_parser_html_try_init(struct mail_user *user ATTR_UNUSED, + const char *content_type, const char *content_disposition ATTR_UNUSED) { struct html_fts_parser *parser; @@ -53,7 +54,7 @@ return NULL; parser = i_new(struct html_fts_parser, 1); - parser->parser = fts_parser_html; + parser->parser.v = fts_parser_html; parser->input = buffer_create_dynamic(default_pool, 512); parser->output = buffer_create_dynamic(default_pool, 4096); return &parser->parser; @@ -234,7 +235,7 @@ i_free(parser); } -struct fts_parser fts_parser_html = { +struct fts_parser_vfuncs fts_parser_html = { fts_parser_html_try_init, fts_parser_html_more, fts_parser_html_deinit
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/plugins/fts/fts-parser-script.c Thu Jul 28 17:23:01 2011 +0300 @@ -0,0 +1,250 @@ +/* Copyright (c) 2011 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "str.h" +#include "network.h" +#include "istream.h" +#include "write-full.h" +#include "module-context.h" +#include "rfc822-parser.h" +#include "rfc2231-parser.h" +#include "message-parser.h" +#include "mail-user.h" +#include "fts-parser.h" + +#define SCRIPT_USER_CONTEXT(obj) \ + MODULE_CONTEXT(obj, fts_parser_script_user_module) + +#define SCRIPT_HANDSHAKE "VERSION\tscript\t3\t0\nnoreply\n" + +struct content { + const char *content_type; + const char *const *extensions; +}; + +struct fts_parser_script_user { + union mail_user_module_context module_ctx; + + ARRAY_DEFINE(content, struct content); +}; + +struct script_fts_parser { + struct fts_parser parser; + + int fd; + char *path; + + unsigned char outbuf[IO_BLOCK_SIZE]; + bool failed; + bool shutdown; +}; + +static MODULE_CONTEXT_DEFINE_INIT(fts_parser_script_user_module, + &mail_user_module_register); + +static int script_connect(struct mail_user *user, const char **path_r) +{ + const char *path; + int fd; + + path = mail_user_plugin_getenv(user, "fts_decoder"); + if (path == NULL) + return -1; + + if (*path != '/') + path = t_strconcat(user->set->base_dir, "/", path, NULL); + fd = net_connect_unix_with_retries(path, 1000); + if (fd == -1) + i_error("net_connect_unix(%s) failed: %m", path); + else + net_set_nonblock(fd, FALSE); + *path_r = path; + return fd; +} + +static int script_contents_read(struct mail_user *user) +{ + struct fts_parser_script_user *suser = SCRIPT_USER_CONTEXT(user); + const char *path, *cmd, *line; + char **args; + struct istream *input; + struct content *content; + bool eof_seen = FALSE; + int fd; + + fd = script_connect(user, &path); + if (fd == -1) + return -1; + + cmd = t_strdup_printf(SCRIPT_HANDSHAKE"\n"); + if (write_full(fd, cmd, strlen(cmd)) < 0) { + i_error("write(%s) failed: %m", path); + (void)close(fd); + return -1; + } + input = i_stream_create_fd(fd, 1024, TRUE); + while ((line = i_stream_read_next_line(input)) != NULL) { + /* <content-type> <extension> [<extension> ...] */ + args = p_strsplit_spaces(user->pool, line, " "); + if (args[0] == NULL) { + eof_seen = TRUE; + break; + } + if (args[0][0] == '\0' || args[1] == NULL) { + i_error("parser script sent invalid input: %s", line); + continue; + } + + content = array_append_space(&suser->content); + content->content_type = args[0]; + content->extensions = (const void *)(args+1); + } + i_stream_destroy(&input); + + if (!eof_seen) + i_error("parser script didn't send empty EOF line"); + return 0; +} + +static bool script_support_content(struct mail_user *user, + const char **content_type, + const char *extension) +{ + struct fts_parser_script_user *suser = SCRIPT_USER_CONTEXT(user); + const struct content *content; + + if (suser == NULL) { + suser = i_new(struct fts_parser_script_user, 1); + p_array_init(&suser->content, user->pool, 32); + MODULE_CONTEXT_SET(user, fts_parser_script_user_module, suser); + } + if (array_count(&suser->content) == 0) { + if (script_contents_read(user) < 0) { + array_free(&suser->content); + return FALSE; + } + } + + if (strcmp(*content_type, "application/octet-stream") == 0) { + array_foreach(&suser->content, content) { + if (content->extensions != NULL && + str_array_icase_find(content->extensions, extension)) { + *content_type = content->content_type; + return TRUE; + } + } + } else { + array_foreach(&suser->content, content) { + if (strcmp(content->content_type, *content_type) == 0) + return TRUE; + } + } + return FALSE; +} + +static void parse_content_disposition(const char *content_disposition, + const char **extension_r) +{ + struct rfc822_parser_context parser; + const char *const *results; + string_t *str; + + *extension_r = NULL; + + if (content_disposition == NULL) + return; + + rfc822_parser_init(&parser, (const unsigned char *)content_disposition, + strlen(content_disposition), NULL); + (void)rfc822_skip_lwsp(&parser); + + /* type; param; param; .. */ + str = t_str_new(32); + if (rfc822_parse_mime_token(&parser, str) < 0) + return; + + (void)rfc2231_parse(&parser, &results); + for (; *results != NULL; results += 2) { + if (strcasecmp(results[0], "filename") == 0) { + *extension_r = results[1]; + break; + } + } +} + +static struct fts_parser * +fts_parser_script_try_init(struct mail_user *user, + const char *content_type, + const char *content_disposition) +{ + struct script_fts_parser *parser; + const char *extension, *path, *cmd; + int fd; + + parse_content_disposition(content_disposition, &extension); + if (script_support_content(user, &content_type, extension) <= 0) + return NULL; + + fd = script_connect(user, &path); + if (fd == -1) + return NULL; + cmd = t_strdup_printf(SCRIPT_HANDSHAKE"%s\n\n", content_type); + if (write_full(fd, cmd, strlen(cmd)) < 0) { + i_error("write(%s) failed: %m", path); + (void)close(fd); + return NULL; + } + + parser = i_new(struct script_fts_parser, 1); + parser->parser.v = fts_parser_script; + parser->path = i_strdup(path); + parser->fd = fd; + return &parser->parser; +} + +static void fts_parser_script_more(struct fts_parser *_parser, + struct message_block *block) +{ + struct script_fts_parser *parser = (struct script_fts_parser *)_parser; + ssize_t ret; + + if (block->size > 0) { + /* first we'll send everything to the script */ + if (!parser->failed && + write_full(parser->fd, block->data, block->size) < 0) { + i_error("write(%s) failed: %m", parser->path); + parser->failed = TRUE; + } + block->size = 0; + } else { + if (!parser->shutdown) { + if (shutdown(parser->fd, SHUT_WR) < 0) + i_error("shutdown(%s) failed: %m", parser->path); + parser->shutdown = TRUE; + } + /* read the result from the script */ + ret = read(parser->fd, parser->outbuf, sizeof(parser->outbuf)); + if (ret < 0) + i_error("read(%s) failed: %m", parser->path); + else { + block->data = parser->outbuf; + block->size = ret; + } + } +} + +static void fts_parser_script_deinit(struct fts_parser *_parser) +{ + struct script_fts_parser *parser = (struct script_fts_parser *)_parser; + + if (close(parser->fd) < 0) + i_error("close(%s) failed: %m", parser->path); + i_free(parser->path); + i_free(parser); +} + +struct fts_parser_vfuncs fts_parser_script = { + fts_parser_script_try_init, + fts_parser_script_more, + fts_parser_script_deinit +};
--- a/src/plugins/fts/fts-parser.c Thu Jul 28 17:01:26 2011 +0300 +++ b/src/plugins/fts/fts-parser.c Thu Jul 28 17:23:01 2011 +0300 @@ -1,19 +1,24 @@ /* Copyright (c) 2011 Dovecot authors, see the included COPYING file */ #include "lib.h" +#include "buffer.h" +#include "unichar.h" +#include "message-parser.h" #include "fts-parser.h" -const struct fts_parser *parsers[] = { - &fts_parser_html +const struct fts_parser_vfuncs *parsers[] = { + &fts_parser_html, + &fts_parser_script }; -bool fts_parser_init(const char *content_type, const char *content_disposition, +bool fts_parser_init(struct mail_user *user, + const char *content_type, const char *content_disposition, struct fts_parser **parser_r) { unsigned int i; for (i = 0; i < N_ELEMENTS(parsers); i++) { - *parser_r = parsers[i]->try_init(content_type, + *parser_r = parsers[i]->try_init(user, content_type, content_disposition); if (*parser_r != NULL) return TRUE; @@ -23,7 +28,21 @@ void fts_parser_more(struct fts_parser *parser, struct message_block *block) { - parser->more(parser, block); + parser->v.more(parser, block); + + if (!uni_utf8_data_is_valid(block->data, block->size)) { + /* output isn't valid UTF-8. make it. */ + if (parser->utf8_output == NULL) { + parser->utf8_output = + buffer_create_dynamic(default_pool, 4096); + } else { + buffer_set_used_size(parser->utf8_output, 0); + } + (void)uni_utf8_get_valid_data(block->data, block->size, + parser->utf8_output); + block->data = parser->utf8_output->data; + block->size = parser->utf8_output->used; + } } void fts_parser_deinit(struct fts_parser **_parser) @@ -31,5 +50,8 @@ struct fts_parser *parser = *_parser; *_parser = NULL; - parser->deinit(parser); + + if (parser->utf8_output != NULL) + buffer_free(&parser->utf8_output); + parser->v.deinit(parser); }
--- a/src/plugins/fts/fts-parser.h Thu Jul 28 17:01:26 2011 +0300 +++ b/src/plugins/fts/fts-parser.h Thu Jul 28 17:23:01 2011 +0300 @@ -2,18 +2,30 @@ #define FTS_PARSER_H struct message_block; +struct mail_user; -struct fts_parser { - struct fts_parser *(*try_init)(const char *content_type, +struct fts_parser_vfuncs { + struct fts_parser *(*try_init)(struct mail_user *user, + const char *content_type, const char *content_disposition); void (*more)(struct fts_parser *parser, struct message_block *block); void (*deinit)(struct fts_parser *parser); }; -extern struct fts_parser fts_parser_html; +struct fts_parser { + struct fts_parser_vfuncs v; + buffer_t *utf8_output; +}; -bool fts_parser_init(const char *content_type, const char *content_disposition, +extern struct fts_parser_vfuncs fts_parser_html; +extern struct fts_parser_vfuncs fts_parser_script; + +bool fts_parser_init(struct mail_user *user, + const char *content_type, const char *content_disposition, struct fts_parser **parser_r); +/* The parser is initially called with message body blocks. Once message is + finished, it's still called with incoming size=0 while the parser increases + it to non-zero. */ void fts_parser_more(struct fts_parser *parser, struct message_block *block); void fts_parser_deinit(struct fts_parser **parser);