changeset 13116:f4a5e66be05b

fts: Strip text/html mails to plaintext before sending them to FTS backend.
author Timo Sirainen <tss@iki.fi>
date Wed, 27 Jul 2011 17:58:27 +0300
parents fa852748e601
children 3156e6616e83
files src/plugins/fts/Makefile.am src/plugins/fts/fts-build-private.h src/plugins/fts/fts-build.c src/plugins/fts/fts-parser-html.c src/plugins/fts/fts-parser.c src/plugins/fts/fts-parser.h src/plugins/fts/html-entities.h
diffstat 7 files changed, 567 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/src/plugins/fts/Makefile.am	Wed Jul 27 14:53:52 2011 +0300
+++ b/src/plugins/fts/Makefile.am	Wed Jul 27 17:58:27 2011 +0300
@@ -17,16 +17,20 @@
 	fts-build-indexer.c \
 	fts-build-mailbox.c \
 	fts-build-virtual.c \
+	fts-parser.c \
+	fts-parser-html.c \
 	fts-plugin.c \
 	fts-search.c \
 	fts-search-serialize.c \
 	fts-storage.c
 
 noinst_HEADERS = \
+	html-entities.h \
 	fts-api.h \
 	fts-api-private.h \
 	fts-build.h \
 	fts-build-private.h \
+	fts-parser.h \
 	fts-plugin.h \
 	fts-search-serialize.h \
 	fts-storage.h
--- a/src/plugins/fts/fts-build-private.h	Wed Jul 27 14:53:52 2011 +0300
+++ b/src/plugins/fts/fts-build-private.h	Wed Jul 27 17:58:27 2011 +0300
@@ -27,6 +27,7 @@
 
 	uint32_t uid;
 	char *content_type, *content_disposition;
+	struct fts_parser *body_parser;
 
 	unsigned int binary_mime_parts:1;
 	unsigned int dtcase:1;
--- a/src/plugins/fts/fts-build.c	Wed Jul 27 14:53:52 2011 +0300
+++ b/src/plugins/fts/fts-build.c	Wed Jul 27 17:58:27 2011 +0300
@@ -11,6 +11,7 @@
 #include "message-decoder.h"
 #include "../virtual/virtual-storage.h"
 #include "fts-api-private.h"
+#include "fts-parser.h"
 #include "fts-build-private.h"
 
 #define FTS_BUILD_NOTIFY_INTERVAL_SECS 10
@@ -103,13 +104,19 @@
 	const char *content_type;
 	struct fts_backend_build_key key;
 
+	i_assert(ctx->body_parser == NULL);
+
 	memset(&key, 0, sizeof(key));
 	key.uid = ctx->uid;
 
 	content_type = ctx->content_type != NULL ?
 		ctx->content_type : "text/plain";
-	if (strncmp(content_type, "text/", 5) == 0 ||
-	    strncmp(content_type, "message/", 8) == 0) {
+	if (fts_parser_init(content_type, ctx->content_disposition,
+			    &ctx->body_parser)) {
+		/* extract text using the returned parser */
+		key.type = FTS_BACKEND_BUILD_KEY_BODY_PART;
+	} else if (strncmp(content_type, "text/", 5) == 0 ||
+		   strncmp(content_type, "message/", 8) == 0) {
 		/* text body parts */
 		key.type = FTS_BACKEND_BUILD_KEY_BODY_PART;
 	} else {
@@ -161,6 +168,8 @@
 		if (raw_block.part != prev_part) {
 			/* body part changed. we're now parsing the end of
 			   boundary, possibly followed by message epilogue */
+			if (ctx->body_parser != NULL)
+				fts_parser_deinit(&ctx->body_parser);
 			fts_backend_update_unset_build_key(ctx->update_ctx);
 			prev_part = raw_block.part;
 			i_free_and_null(ctx->content_type);
@@ -195,6 +204,8 @@
 			/* end of headers */
 		} else {
 			i_assert(body_part);
+			if (ctx->body_parser != NULL)
+				fts_parser_more(ctx->body_parser, &block);
 			if (fts_backend_update_build_more(ctx->update_ctx,
 							  block.data,
 							  block.size) < 0) {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugins/fts/fts-parser-html.c	Wed Jul 27 17:58:27 2011 +0300
@@ -0,0 +1,241 @@
+/* Copyright (c) 2011 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "buffer.h"
+#include "unichar.h"
+#include "message-parser.h"
+#include "fts-parser.h"
+
+/* Non-zero if c is one of the ASCII whitespace characters this parser treats
+   as a separator: space, tab, CR or LF. The argument is evaluated up to four
+   times, so it must not have side effects.
+
+   Zero-width space (&#x200B;) apparently also belongs here, but that gets a
+   bit tricky to handle.. is it actually used anywhere? */
+#define HTML_WHITESPACE(c) \
+	((c) == ' ' || (c) == '\t' || (c) == '\r' || (c) == '\n')
+
+/* States of the HTML-to-plaintext state machine driven by parse_data().
+   HTML_STATE_TEXT must stay the first (zero) value only if callers rely on
+   zero-initialized parsers starting in text mode - NOTE(review): confirm
+   that i_new() zero-fills. */
+enum html_state {
+	/* regular text */
+	HTML_STATE_TEXT,
+	/* tag outside "quoted string" */
+	HTML_STATE_TAG,
+	/* tag inside "quoted string" */
+	HTML_STATE_TAG_QUOTED,
+	/* tag -> "escape\ */
+	HTML_STATE_TAG_QUOTED_ESCAPE,
+	/* script/style content */
+	HTML_STATE_IGNORE,
+	/* comment */
+	HTML_STATE_COMMENT,
+	/* comment is ending, we've seen "--" and now just waiting for ">" */
+	HTML_STATE_COMMENT_END
+};
+
+/* HTML implementation of struct fts_parser. */
+struct html_fts_parser {
+	/* Must stay the first member: the more()/deinit() callbacks cast the
+	   struct fts_parser pointer back to struct html_fts_parser. */
+	struct fts_parser parser;
+
+	/* current state of the tag/comment state machine */
+	enum html_state state;
+	/* input: trailing bytes of the previous block that couldn't be parsed
+	   yet because more data was needed.
+	   output: plaintext produced by the latest more() call. */
+	buffer_t *input, *output;
+	/* Set when a <style> or <script> tag name was seen: once that tag's
+	   closing '>' is reached the following text is skipped. */
+	bool ignore_next_text;
+};
+
+/* Lookup table of named HTML character entities: entity name (without the
+   surrounding '&' and ';') and the Unicode code point it stands for. The
+   entries come from html-entities.h. The table is read-only and private to
+   this file, hence static const. */
+static const struct {
+	const char *name;
+	unichar_t chr;
+} html_entities[] = {
+#include "html-entities.h"
+};
+
+/* try_init() callback of the HTML parser.
+
+   Returns a newly allocated parser if content_type is "text/html" (compared
+   case-insensitively), or NULL for any other content type so that the caller
+   can try another parser. content_disposition isn't looked at. The returned
+   parser is released with its deinit() callback. */
+static struct fts_parser *
+fts_parser_html_try_init(const char *content_type,
+			 const char *content_disposition ATTR_UNUSED)
+{
+	struct html_fts_parser *parser;
+
+	if (strcasecmp(content_type, "text/html") != 0)
+		return NULL;
+
+	parser = i_new(struct html_fts_parser, 1);
+	/* copy the callback table into the embedded base struct */
+	parser->parser = fts_parser_html;
+	parser->input = buffer_create_dynamic(default_pool, 512);
+	parser->output = buffer_create_dynamic(default_pool, 4096);
+	return &parser->parser;
+}
+
+/* Look at the bytes following a '<' and switch the parser to the matching
+   state.
+
+   data/size is the input right after the '<'. Returns 0 if there isn't
+   enough data yet to decide what kind of tag this is (the parser state is
+   left untouched). Otherwise returns the number of bytes consumed, counting
+   the '<' itself: 4 for a comment start ("<!--"), 1 for any other tag, in
+   which case the rest of the tag is handled by the caller's state machine.
+
+   For <style> and <script> tags ignore_next_text is set so that the text
+   following the tag gets skipped.
+
+   The return type is size_t rather than bool because the value is a byte
+   count, not a truth value. */
+static size_t
+parse_tag_name(struct html_fts_parser *parser,
+	       const unsigned char *data, size_t size)
+{
+	size_t name_len;
+
+	if (size >= 3 && memcmp(data, "!--", 3) == 0) {
+		parser->state = HTML_STATE_COMMENT;
+		/* Skip the whole "<!--". If the opening dashes were left in
+		   the input they'd be mistaken for the beginning of the
+		   "-->" terminator (e.g. "<!---->" would never end). */
+		return 3 + 1;
+	}
+
+	if (size > 5 && i_memcasecmp(data, "style", 5) == 0) {
+		name_len = 5;
+	} else if (size > 6 && i_memcasecmp(data, "script", 6) == 0) {
+		name_len = 6;
+	} else if (size <= 6) {
+		/* need more data */
+		return 0;
+	} else {
+		parser->state = HTML_STATE_TAG;
+		return 1;
+	}
+	parser->state = HTML_STATE_TAG;
+	/* only an exact "style"/"script" name counts, not e.g. <styles> */
+	if (HTML_WHITESPACE(data[name_len]) || data[name_len] == '>')
+		parser->ignore_next_text = TRUE;
+	return 1;
+}
+
+/* Find the Unicode code point for a named HTML entity.
+
+   name is the entity name without the leading '&' and trailing ';'.
+   Returns TRUE and stores the code point to chr_r if the name is known,
+   FALSE otherwise (chr_r is then left untouched).
+
+   Entity names are case-sensitive: the table contains pairs such as
+   "Agrave"/"agrave", "Dagger"/"dagger" and "lArr"/"larr" that map to
+   different characters. An exact match is therefore tried first. A
+   case-insensitive match is used only as a fallback so that sloppily written
+   entities like "&NBSP;" are still translated. */
+static bool html_entity_get_unichar(const char *name, unichar_t *chr_r)
+{
+	unsigned int i;
+
+	for (i = 0; i < N_ELEMENTS(html_entities); i++) {
+		if (strcmp(html_entities[i].name, name) == 0) {
+			*chr_r = html_entities[i].chr;
+			return TRUE;
+		}
+	}
+	for (i = 0; i < N_ELEMENTS(html_entities); i++) {
+		if (strcasecmp(html_entities[i].name, name) == 0) {
+			*chr_r = html_entities[i].chr;
+			return TRUE;
+		}
+	}
+	return FALSE;
+}
+
+/* Parse a named character entity and append its UTF-8 form to the output.
+
+   data/size is the input right after the '&'. Returns 0 if the terminating
+   ';' hasn't been seen yet and more data is needed. Otherwise returns the
+   number of bytes consumed, counting the '&' itself:
+    - 1 for a broken entity (whitespace before ';' or a name too long to be
+      any known entity): only the '&' is dropped and the rest is handled as
+      normal text by the caller.
+    - name length + 2 for a terminated entity, i.e. '&', the name and ';'.
+      Unknown names produce no output. */
+static size_t parse_entity(struct html_fts_parser *parser,
+			   const unsigned char *data, size_t size)
+{
+	char entity[10];
+	unichar_t chr;
+	size_t i;
+
+	for (i = 0; i < size; i++) {
+		if (data[i] == ';')
+			break;
+		/* The name plus its NUL terminator must fit into entity[].
+		   Give up as soon as one more byte would overflow it, so
+		   that a ';' found later can never leave i out of bounds. */
+		if (HTML_WHITESPACE(data[i]) || i + 1 >= sizeof(entity)) {
+			/* broken entity */
+			return 1;
+		}
+	}
+	if (i == size)
+		return 0;
+
+	i_assert(i < sizeof(entity));
+	memcpy(entity, data, i); entity[i] = '\0';
+
+	if (html_entity_get_unichar(entity, &chr))
+		uni_ucs4_to_utf8_c(chr, parser->output);
+	/* '&' + name + ';' - without the last +1 the ';' would be left in
+	   the input and end up in the output as text */
+	return i + 1 + 1;
+}
+
+/* Run the HTML state machine over data[0..size-1], appending the plaintext
+   that is found to parser->output.
+
+   Returns the number of bytes that were handled. The value is smaller than
+   size when parsing stopped at a '<', '&' or '-' whose meaning can't be
+   decided without seeing more input; the caller is expected to feed the
+   unhandled tail again together with the following data. */
+static size_t
+parse_data(struct html_fts_parser *parser,
+	   const unsigned char *data, size_t size)
+{
+	size_t i, ret;
+
+	for (i = 0; i < size; i++) {
+		char c = data[i];
+
+		switch (parser->state) {
+		case HTML_STATE_TEXT:
+			if (c == '<') {
+				ret = parse_tag_name(parser, data+i+1, size-i-1);
+				if (ret == 0)
+					return i;
+				/* ret is treated as the number of bytes
+				   consumed starting from the '<' itself; the
+				   loop's i++ accounts for one of them */
+				i += ret - 1;
+			} else if (c == '&') {
+				ret = parse_entity(parser, data+i+1, size-i-1);
+				if (ret == 0)
+					return i;
+				/* same convention as for tags above */
+				i += ret - 1;
+			} else {
+				buffer_append_c(parser->output, c);
+			}
+			break;
+		case HTML_STATE_TAG:
+			/* only double quotes start a quoted attribute value */
+			if (c == '"')
+				parser->state = HTML_STATE_TAG_QUOTED;
+			else if (c == '>') {
+				/* after <style>/<script> skip the content */
+				parser->state = parser->ignore_next_text ?
+					HTML_STATE_IGNORE : HTML_STATE_TEXT;
+			}
+			break;
+		case HTML_STATE_TAG_QUOTED:
+			if (c == '"')
+				parser->state = HTML_STATE_TAG;
+			else if (c == '\\')
+				parser->state = HTML_STATE_TAG_QUOTED_ESCAPE;
+			break;
+		case HTML_STATE_TAG_QUOTED_ESCAPE:
+			/* the escaped character itself is skipped */
+			parser->state = HTML_STATE_TAG_QUOTED;
+			break;
+		case HTML_STATE_IGNORE:
+			/* the first '<' ends the ignored content, whatever
+			   tag it belongs to */
+			if (c == '<') {
+				parser->state = HTML_STATE_TAG;
+				parser->ignore_next_text = FALSE;
+			}
+			break;
+		case HTML_STATE_COMMENT:
+			if (c == '-') {
+				/* need the next byte to know if this is "--" */
+				if (i+1 == size)
+					return i;
+				if (data[i+1] == '-') {
+					parser->state = HTML_STATE_COMMENT_END;
+					i++;
+				}
+			}
+			break;
+		case HTML_STATE_COMMENT_END:
+			/* whitespace is allowed between "--" and ">";
+			   anything else means the comment continues */
+			if (c == '>')
+				parser->state = HTML_STATE_TEXT;
+			else if (!HTML_WHITESPACE(c))
+				parser->state = HTML_STATE_COMMENT;
+			break;
+		}
+	}
+	return i;
+}
+
+/* more() callback of the HTML parser.
+
+   Converts the HTML in block to plaintext. On return block->data and
+   block->size point to the parser's output buffer, which stays valid until
+   the next more() or deinit() call. The output may be empty.
+
+   Input that ends in the middle of a tag name, entity or possible comment
+   terminator can't be handled immediately. Such a tail is saved to
+   parser->input and parsed together with the beginning of the next block. */
+static void fts_parser_html_more(struct fts_parser *_parser,
+				 struct message_block *block)
+{
+	struct html_fts_parser *parser = (struct html_fts_parser *)_parser;
+	size_t size, buf_orig_size, appended;
+
+	buffer_set_used_size(parser->output, 0);
+
+	while (parser->input->used > 0 && block->size > 0) {
+		/* we didn't get enough input the last time to know
+		   what to do. */
+		buf_orig_size = parser->input->used;
+
+		appended = I_MIN(block->size, 128);
+		buffer_append(parser->input, block->data, appended);
+		size = parse_data(parser, parser->input->data,
+				  parser->input->used);
+
+		if (size >= buf_orig_size) {
+			/* The saved tail is fully handled. Continue from
+			   the first unhandled byte of the block itself. */
+			block->data += size - buf_orig_size;
+			block->size -= size - buf_orig_size;
+			buffer_set_used_size(parser->input, 0);
+		} else {
+			/* Still undecided. This happens when the new block
+			   is very small. Asserting here would crash on
+			   valid input, so drop only the handled prefix and
+			   keep buffering: the appended bytes are now owned
+			   by the input buffer. */
+			buffer_delete(parser->input, 0, size);
+			block->data += appended;
+			block->size -= appended;
+		}
+	}
+	if (block->size > 0) {
+		/* the input buffer is empty here, parse the block directly
+		   and save whatever couldn't be handled yet */
+		size = parse_data(parser, block->data, block->size);
+		buffer_append(parser->input, block->data + size,
+			      block->size - size);
+	}
+
+	block->data = parser->output->data;
+	block->size = parser->output->used;
+}
+
+/* deinit() callback of the HTML parser: releases both work buffers and the
+   parser structure itself. */
+static void fts_parser_html_deinit(struct fts_parser *_parser)
+{
+	struct html_fts_parser *html = (struct html_fts_parser *)_parser;
+
+	buffer_free(&html->output);
+	buffer_free(&html->input);
+	i_free(html);
+}
+
+/* Callback table of the HTML parser. The initializers are positional, so
+   their order has to match the member order of struct fts_parser
+   (try_init, more, deinit). */
+struct fts_parser fts_parser_html = {
+	fts_parser_html_try_init,
+	fts_parser_html_more,
+	fts_parser_html_deinit
+};
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugins/fts/fts-parser.c	Wed Jul 27 17:58:27 2011 +0300
@@ -0,0 +1,35 @@
+/* Copyright (c) 2011 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "fts-parser.h"
+
+/* All known body parsers, in the order fts_parser_init() tries them.
+   The table is only used within this file and is never modified, so it is
+   static and fully const to avoid exporting a generic global symbol. */
+static const struct fts_parser *const parsers[] = {
+	&fts_parser_html
+};
+
+/* Offer the content type and disposition to each registered parser in turn.
+   The result of every attempt is stored to *parser_r. Returns TRUE as soon
+   as one parser accepts, FALSE if none of them did (leaving *parser_r as
+   NULL from the last attempt). */
+bool fts_parser_init(const char *content_type, const char *content_disposition,
+		     struct fts_parser **parser_r)
+{
+	unsigned int idx = 0;
+
+	while (idx < N_ELEMENTS(parsers)) {
+		struct fts_parser *candidate;
+
+		candidate = parsers[idx]->try_init(content_type,
+						   content_disposition);
+		*parser_r = candidate;
+		if (candidate != NULL)
+			return TRUE;
+		idx++;
+	}
+	return FALSE;
+}
+
+/* Pass block through the parser's more() callback. The callback may change
+   block->data and block->size; the HTML parser points them to its own
+   plaintext output. */
+void fts_parser_more(struct fts_parser *parser, struct message_block *block)
+{
+	parser->more(parser, block);
+}
+
+/* Destroy the parser via its deinit() callback. The caller's pointer is
+   reset to NULL before the callback runs. *_parser must not be NULL. */
+void fts_parser_deinit(struct fts_parser **_parser)
+{
+	struct fts_parser *doomed;
+
+	doomed = *_parser;
+	*_parser = NULL;
+	doomed->deinit(doomed);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugins/fts/fts-parser.h	Wed Jul 27 17:58:27 2011 +0300
@@ -0,0 +1,20 @@
+#ifndef FTS_PARSER_H
+#define FTS_PARSER_H
+
+struct message_block;
+
+/* Callback table of a body part parser. Implementations embed this struct
+   at the beginning of their own parser struct. */
+struct fts_parser {
+	/* Returns a new parser instance if this parser wants to handle the
+	   given content, NULL if not. */
+	struct fts_parser *(*try_init)(const char *content_type,
+				       const char *content_disposition);
+	/* Processes the next block of the body part. The callback may
+	   replace block's data/size with its own output. */
+	void (*more)(struct fts_parser *parser, struct message_block *block);
+	/* Frees the parser instance. */
+	void (*deinit)(struct fts_parser *parser);
+};
+
+/* Parser that accepts text/html parts (see fts-parser-html.c). */
+extern struct fts_parser fts_parser_html;
+
+/* Tries the known parsers for the given content. Returns TRUE and sets
+   *parser_r if one of them accepted it, FALSE otherwise. */
+bool fts_parser_init(const char *content_type, const char *content_disposition,
+		     struct fts_parser **parser_r);
+/* Feeds block to the parser through its more() callback. */
+void fts_parser_more(struct fts_parser *parser, struct message_block *block);
+/* Frees the parser and sets *parser to NULL. */
+void fts_parser_deinit(struct fts_parser **parser);
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugins/fts/html-entities.h	Wed Jul 27 17:58:27 2011 +0300
@@ -0,0 +1,253 @@
+/* Named HTML character entities: { entity name without '&' and ';',
+   Unicode code point }. This file is #included as the initializer list of
+   html_entities[] in fts-parser-html.c, so it must contain nothing but
+   comma-separated entries. */
+{ "quot",	0x0022 },
+{ "amp",	0x0026 },
+{ "apos",	0x0027 },
+{ "lt",		0x003C },
+{ "gt",		0x003E },
+{ "nbsp",	0x00A0 },
+{ "iexcl",	0x00A1 },
+{ "cent",	0x00A2 },
+{ "pound",	0x00A3 },
+{ "curren",	0x00A4 },
+{ "yen",	0x00A5 },
+{ "brvbar",	0x00A6 },
+{ "sect",	0x00A7 },
+{ "uml",	0x00A8 },
+{ "copy",	0x00A9 },
+{ "ordf",	0x00AA },
+{ "laquo",	0x00AB },
+{ "not",	0x00AC },
+{ "shy",	0x00AD },
+{ "reg",	0x00AE },
+{ "macr",	0x00AF },
+{ "deg",	0x00B0 },
+{ "plusmn",	0x00B1 },
+{ "sup2",	0x00B2 },
+{ "sup3",	0x00B3 },
+{ "acute",	0x00B4 },
+{ "micro",	0x00B5 },
+{ "para",	0x00B6 },
+{ "middot",	0x00B7 },
+{ "cedil",	0x00B8 },
+{ "sup1",	0x00B9 },
+{ "ordm",	0x00BA },
+{ "raquo",	0x00BB },
+{ "frac14",	0x00BC },
+{ "frac12",	0x00BD },
+{ "frac34",	0x00BE },
+{ "iquest",	0x00BF },
+{ "Agrave",	0x00C0 },
+{ "Aacute",	0x00C1 },
+{ "Acirc",	0x00C2 },
+{ "Atilde",	0x00C3 },
+{ "Auml",	0x00C4 },
+{ "Aring",	0x00C5 },
+{ "AElig",	0x00C6 },
+{ "Ccedil",	0x00C7 },
+{ "Egrave",	0x00C8 },
+{ "Eacute",	0x00C9 },
+{ "Ecirc",	0x00CA },
+{ "Euml",	0x00CB },
+{ "Igrave",	0x00CC },
+{ "Iacute",	0x00CD },
+{ "Icirc",	0x00CE },
+{ "Iuml",	0x00CF },
+{ "ETH",	0x00D0 },
+{ "Ntilde",	0x00D1 },
+{ "Ograve",	0x00D2 },
+{ "Oacute",	0x00D3 },
+{ "Ocirc",	0x00D4 },
+{ "Otilde",	0x00D5 },
+{ "Ouml",	0x00D6 },
+{ "times",	0x00D7 },
+{ "Oslash",	0x00D8 },
+{ "Ugrave",	0x00D9 },
+{ "Uacute",	0x00DA },
+{ "Ucirc",	0x00DB },
+{ "Uuml",	0x00DC },
+{ "Yacute",	0x00DD },
+{ "THORN",	0x00DE },
+{ "szlig",	0x00DF },
+{ "agrave",	0x00E0 },
+{ "aacute",	0x00E1 },
+{ "acirc",	0x00E2 },
+{ "atilde",	0x00E3 },
+{ "auml",	0x00E4 },
+{ "aring",	0x00E5 },
+{ "aelig",	0x00E6 },
+{ "ccedil",	0x00E7 },
+{ "egrave",	0x00E8 },
+{ "eacute",	0x00E9 },
+{ "ecirc",	0x00EA },
+{ "euml",	0x00EB },
+{ "igrave",	0x00EC },
+{ "iacute",	0x00ED },
+{ "icirc",	0x00EE },
+{ "iuml",	0x00EF },
+{ "eth",	0x00F0 },
+{ "ntilde",	0x00F1 },
+{ "ograve",	0x00F2 },
+{ "oacute",	0x00F3 },
+{ "ocirc",	0x00F4 },
+{ "otilde",	0x00F5 },
+{ "ouml",	0x00F6 },
+{ "divide",	0x00F7 },
+{ "oslash",	0x00F8 },
+{ "ugrave",	0x00F9 },
+{ "uacute",	0x00FA },
+{ "ucirc",	0x00FB },
+{ "uuml",	0x00FC },
+{ "yacute",	0x00FD },
+{ "thorn",	0x00FE },
+{ "yuml",	0x00FF },
+{ "OElig",	0x0152 },
+{ "oelig",	0x0153 },
+{ "Scaron",	0x0160 },
+{ "scaron",	0x0161 },
+{ "Yuml",	0x0178 },
+{ "fnof",	0x0192 },
+{ "circ",	0x02C6 },
+{ "tilde",	0x02DC },
+{ "Alpha",	0x0391 },
+{ "Beta",	0x0392 },
+{ "Gamma",	0x0393 },
+{ "Delta",	0x0394 },
+{ "Epsilon",	0x0395 },
+{ "Zeta",	0x0396 },
+{ "Eta",	0x0397 },
+{ "Theta",	0x0398 },
+{ "Iota",	0x0399 },
+{ "Kappa",	0x039A },
+{ "Lambda",	0x039B },
+{ "Mu",		0x039C },
+{ "Nu",		0x039D },
+{ "Xi",		0x039E },
+{ "Omicron",	0x039F },
+{ "Pi",		0x03A0 },
+{ "Rho",	0x03A1 },
+{ "Sigma",	0x03A3 },
+{ "Tau",	0x03A4 },
+{ "Upsilon",	0x03A5 },
+{ "Phi",	0x03A6 },
+{ "Chi",	0x03A7 },
+{ "Psi",	0x03A8 },
+{ "Omega",	0x03A9 },
+{ "alpha",	0x03B1 },
+{ "beta",	0x03B2 },
+{ "gamma",	0x03B3 },
+{ "delta",	0x03B4 },
+{ "epsilon",	0x03B5 },
+{ "zeta",	0x03B6 },
+{ "eta",	0x03B7 },
+{ "theta",	0x03B8 },
+{ "iota",	0x03B9 },
+{ "kappa",	0x03BA },
+{ "lambda",	0x03BB },
+{ "mu",		0x03BC },
+{ "nu",		0x03BD },
+{ "xi",		0x03BE },
+{ "omicron",	0x03BF },
+{ "pi",		0x03C0 },
+{ "rho",	0x03C1 },
+{ "sigmaf",	0x03C2 },
+{ "sigma",	0x03C3 },
+{ "tau",	0x03C4 },
+{ "upsilon",	0x03C5 },
+{ "phi",	0x03C6 },
+{ "chi",	0x03C7 },
+{ "psi",	0x03C8 },
+{ "omega",	0x03C9 },
+{ "thetasym",	0x03D1 },
+{ "upsih",	0x03D2 },
+{ "piv",	0x03D6 },
+{ "ensp",	0x2002 },
+{ "emsp",	0x2003 },
+{ "thinsp",	0x2009 },
+{ "zwnj",	0x200C },
+{ "zwj",	0x200D },
+{ "lrm",	0x200E },
+{ "rlm",	0x200F },
+{ "ndash",	0x2013 },
+{ "mdash",	0x2014 },
+{ "lsquo",	0x2018 },
+{ "rsquo",	0x2019 },
+{ "sbquo",	0x201A },
+{ "ldquo",	0x201C },
+{ "rdquo",	0x201D },
+{ "bdquo",	0x201E },
+{ "dagger",	0x2020 },
+{ "Dagger",	0x2021 },
+{ "bull",	0x2022 },
+{ "hellip",	0x2026 },
+{ "permil",	0x2030 },
+{ "prime",	0x2032 },
+{ "Prime",	0x2033 },
+{ "lsaquo",	0x2039 },
+{ "rsaquo",	0x203A },
+{ "oline",	0x203E },
+{ "frasl",	0x2044 },
+{ "euro",	0x20AC },
+{ "image",	0x2111 },
+{ "weierp",	0x2118 },
+{ "real",	0x211C },
+{ "trade",	0x2122 },
+{ "alefsym",	0x2135 },
+{ "larr",	0x2190 },
+{ "uarr",	0x2191 },
+{ "rarr",	0x2192 },
+{ "darr",	0x2193 },
+{ "harr",	0x2194 },
+{ "crarr",	0x21B5 },
+{ "lArr",	0x21D0 },
+{ "uArr",	0x21D1 },
+{ "rArr",	0x21D2 },
+{ "dArr",	0x21D3 },
+{ "hArr",	0x21D4 },
+{ "forall",	0x2200 },
+{ "part",	0x2202 },
+{ "exist",	0x2203 },
+{ "empty",	0x2205 },
+{ "nabla",	0x2207 },
+{ "isin",	0x2208 },
+{ "notin",	0x2209 },
+{ "ni",		0x220B },
+{ "prod",	0x220F },
+{ "sum",	0x2211 },
+{ "minus",	0x2212 },
+{ "lowast",	0x2217 },
+{ "radic",	0x221A },
+{ "prop",	0x221D },
+{ "infin",	0x221E },
+{ "ang",	0x2220 },
+{ "and",	0x2227 },
+{ "or",		0x2228 },
+{ "cap",	0x2229 },
+{ "cup",	0x222A },
+{ "int",	0x222B },
+{ "there4",	0x2234 },
+{ "sim",	0x223C },
+{ "cong",	0x2245 },
+{ "asymp",	0x2248 },
+{ "ne",		0x2260 },
+{ "equiv",	0x2261 },
+{ "le",		0x2264 },
+{ "ge",		0x2265 },
+{ "sub",	0x2282 },
+{ "sup",	0x2283 },
+{ "nsub",	0x2284 },
+{ "sube",	0x2286 },
+{ "supe",	0x2287 },
+{ "oplus",	0x2295 },
+{ "otimes",	0x2297 },
+{ "perp",	0x22A5 },
+{ "sdot",	0x22C5 },
+{ "lceil",	0x2308 },
+{ "rceil",	0x2309 },
+{ "lfloor",	0x230A },
+{ "rfloor",	0x230B },
+{ "lang",	0x27E8 },
+{ "rang",	0x27E9 },
+{ "loz",	0x25CA },
+{ "spades",	0x2660 },
+{ "clubs",	0x2663 },
+{ "hearts",	0x2665 },
+{ "diams",	0x2666 }