Mercurial > dovecot > core-2.2
changeset 13116:f4a5e66be05b
fts: Strip text/html mails to plaintext before sending them to FTS backend.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Wed, 27 Jul 2011 17:58:27 +0300 |
parents | fa852748e601 |
children | 3156e6616e83 |
files | src/plugins/fts/Makefile.am src/plugins/fts/fts-build-private.h src/plugins/fts/fts-build.c src/plugins/fts/fts-parser-html.c src/plugins/fts/fts-parser.c src/plugins/fts/fts-parser.h src/plugins/fts/html-entities.h |
diffstat | 7 files changed, 567 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
--- a/src/plugins/fts/Makefile.am Wed Jul 27 14:53:52 2011 +0300 +++ b/src/plugins/fts/Makefile.am Wed Jul 27 17:58:27 2011 +0300 @@ -17,16 +17,20 @@ fts-build-indexer.c \ fts-build-mailbox.c \ fts-build-virtual.c \ + fts-parser.c \ + fts-parser-html.c \ fts-plugin.c \ fts-search.c \ fts-search-serialize.c \ fts-storage.c noinst_HEADERS = \ + html-entities.h \ fts-api.h \ fts-api-private.h \ fts-build.h \ fts-build-private.h \ + fts-parser.h \ fts-plugin.h \ fts-search-serialize.h \ fts-storage.h
--- a/src/plugins/fts/fts-build-private.h Wed Jul 27 14:53:52 2011 +0300 +++ b/src/plugins/fts/fts-build-private.h Wed Jul 27 17:58:27 2011 +0300 @@ -27,6 +27,7 @@ uint32_t uid; char *content_type, *content_disposition; + struct fts_parser *body_parser; unsigned int binary_mime_parts:1; unsigned int dtcase:1;
--- a/src/plugins/fts/fts-build.c Wed Jul 27 14:53:52 2011 +0300 +++ b/src/plugins/fts/fts-build.c Wed Jul 27 17:58:27 2011 +0300 @@ -11,6 +11,7 @@ #include "message-decoder.h" #include "../virtual/virtual-storage.h" #include "fts-api-private.h" +#include "fts-parser.h" #include "fts-build-private.h" #define FTS_BUILD_NOTIFY_INTERVAL_SECS 10 @@ -103,13 +104,19 @@ const char *content_type; struct fts_backend_build_key key; + i_assert(ctx->body_parser == NULL); + memset(&key, 0, sizeof(key)); key.uid = ctx->uid; content_type = ctx->content_type != NULL ? ctx->content_type : "text/plain"; - if (strncmp(content_type, "text/", 5) == 0 || - strncmp(content_type, "message/", 8) == 0) { + if (fts_parser_init(content_type, ctx->content_disposition, + &ctx->body_parser)) { + /* extract text using the the returned parser */ + key.type = FTS_BACKEND_BUILD_KEY_BODY_PART; + } else if (strncmp(content_type, "text/", 5) == 0 || + strncmp(content_type, "message/", 8) == 0) { /* text body parts */ key.type = FTS_BACKEND_BUILD_KEY_BODY_PART; } else { @@ -161,6 +168,8 @@ if (raw_block.part != prev_part) { /* body part changed. we're now parsing the end of boundary, possibly followed by message epilogue */ + if (ctx->body_parser != NULL) + fts_parser_deinit(&ctx->body_parser); fts_backend_update_unset_build_key(ctx->update_ctx); prev_part = raw_block.part; i_free_and_null(ctx->content_type); @@ -195,6 +204,8 @@ /* end of headers */ } else { i_assert(body_part); + if (ctx->body_parser != NULL) + fts_parser_more(ctx->body_parser, &block); if (fts_backend_update_build_more(ctx->update_ctx, block.data, block.size) < 0) {
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/plugins/fts/fts-parser-html.c Wed Jul 27 17:58:27 2011 +0300 @@ -0,0 +1,241 @@ +/* Copyright (c) 2011 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "buffer.h" +#include "unichar.h" +#include "message-parser.h" +#include "fts-parser.h" + +/* Zero-width space (​) apparently also belongs here, but that gets a + bit tricky to handle.. is it actually used anywhere? */ +#define HTML_WHITESPACE(c) \ + ((c) == ' ' || (c) == '\t' || (c) == '\r' || (c) == '\n') + +enum html_state { + /* regular text */ + HTML_STATE_TEXT, + /* tag outside "quoted string" */ + HTML_STATE_TAG, + /* tag inside "quoted string" */ + HTML_STATE_TAG_QUOTED, + /* tag -> "escape\ */ + HTML_STATE_TAG_QUOTED_ESCAPE, + /* script/stype content */ + HTML_STATE_IGNORE, + /* comment */ + HTML_STATE_COMMENT, + /* comment is ending, we've seen "--" and now just waiting for ">" */ + HTML_STATE_COMMENT_END +}; + +struct html_fts_parser { + struct fts_parser parser; + + enum html_state state; + buffer_t *input, *output; + bool ignore_next_text; +}; + +struct { + const char *name; + unichar_t chr; +} html_entities[] = { +#include "html-entities.h" +}; + +static struct fts_parser * +fts_parser_html_try_init(const char *content_type ATTR_UNUSED, + const char *content_disposition ATTR_UNUSED) +{ + struct html_fts_parser *parser; + + if (strcasecmp(content_type, "text/html") != 0) + return NULL; + + parser = i_new(struct html_fts_parser, 1); + parser->parser = fts_parser_html; + parser->input = buffer_create_dynamic(default_pool, 512); + parser->output = buffer_create_dynamic(default_pool, 4096); + return &parser->parser; +} + +static bool +parse_tag_name(struct html_fts_parser *parser, + const unsigned char *data, size_t size) +{ + size_t i = 1; + + if (size >= 3 && memcmp(data, "!--", 3) == 0) { + parser->state = HTML_STATE_COMMENT; + return 3; + } + + if (size > 5 && i_memcasecmp(data, "style", 5) == 0) { + i = 5; + } else if 
(size > 6 && i_memcasecmp(data, "script", 6) == 0) { + i = 6; + } else if (size <= 6) { + /* need more data */ + return 0; + } else { + parser->state = HTML_STATE_TAG; + return 1; + } + parser->state = HTML_STATE_TAG; + if (HTML_WHITESPACE(data[i]) || data[i] == '>') + parser->ignore_next_text = TRUE; + return 1; +} + +static bool html_entity_get_unichar(const char *name, unichar_t *chr_r) +{ + unsigned int i; + + for (i = 0; i < N_ELEMENTS(html_entities); i++) { + if (strcasecmp(html_entities[i].name, name) == 0) { + *chr_r = html_entities[i].chr; + return TRUE; + } + } + return FALSE; +} + +static size_t parse_entity(struct html_fts_parser *parser, + const unsigned char *data, size_t size) +{ + char entity[10]; + unichar_t chr; + size_t i; + + for (i = 0; i < size; i++) { + if (data[i] == ';') + break; + if (HTML_WHITESPACE(data[i]) || i >= sizeof(entity)) { + /* broken entity */ + return 1; + } + } + if (i == size) + return 0; + + i_assert(i < sizeof(entity)); + memcpy(entity, data, i); entity[i] = '\0'; + + if (html_entity_get_unichar(entity, &chr)) + uni_ucs4_to_utf8_c(chr, parser->output); + return i + 1; +} + +static size_t +parse_data(struct html_fts_parser *parser, + const unsigned char *data, size_t size) +{ + size_t i, ret; + + for (i = 0; i < size; i++) { + char c = data[i]; + + switch (parser->state) { + case HTML_STATE_TEXT: + if (c == '<') { + ret = parse_tag_name(parser, data+i+1, size-i-1); + if (ret == 0) + return i; + i += ret - 1; + } else if (c == '&') { + ret = parse_entity(parser, data+i+1, size-i-1); + if (ret == 0) + return i; + i += ret - 1; + } else { + buffer_append_c(parser->output, c); + } + break; + case HTML_STATE_TAG: + if (c == '"') + parser->state = HTML_STATE_TAG_QUOTED; + else if (c == '>') { + parser->state = parser->ignore_next_text ? 
+ HTML_STATE_IGNORE : HTML_STATE_TEXT; + } + break; + case HTML_STATE_TAG_QUOTED: + if (c == '"') + parser->state = HTML_STATE_TAG; + else if (c == '\\') + parser->state = HTML_STATE_TAG_QUOTED_ESCAPE; + break; + case HTML_STATE_TAG_QUOTED_ESCAPE: + parser->state = HTML_STATE_TAG_QUOTED; + break; + case HTML_STATE_IGNORE: + if (c == '<') { + parser->state = HTML_STATE_TAG; + parser->ignore_next_text = FALSE; + } + break; + case HTML_STATE_COMMENT: + if (c == '-') { + if (i+1 == size) + return i; + if (data[i+1] == '-') { + parser->state = HTML_STATE_COMMENT_END; + i++; + } + } + break; + case HTML_STATE_COMMENT_END: + if (c == '>') + parser->state = HTML_STATE_TEXT; + else if (!HTML_WHITESPACE(c)) + parser->state = HTML_STATE_COMMENT; + break; + } + } + return i; +} + +static void fts_parser_html_more(struct fts_parser *_parser, + struct message_block *block) +{ + struct html_fts_parser *parser = (struct html_fts_parser *)_parser; + size_t size, buf_orig_size; + + buffer_set_used_size(parser->output, 0); + + if (parser->input->used > 0) { + /* we didn't get enough input the last time to know + what to do. 
*/ + buf_orig_size = parser->input->used; + + size = I_MIN(block->size, 128); + buffer_append(parser->input, block->data, size); + size = parse_data(parser, parser->input->data, + parser->input->used); + + i_assert(size >= buf_orig_size); + block->data += size - buf_orig_size; + block->size -= size - buf_orig_size; + buffer_set_used_size(parser->input, 0); + } + size = parse_data(parser, block->data, block->size); + buffer_append(parser->input, block->data + size, block->size - size); + + block->data = parser->output->data; + block->size = parser->output->used; +} + +static void fts_parser_html_deinit(struct fts_parser *_parser) +{ + struct html_fts_parser *parser = (struct html_fts_parser *)_parser; + + buffer_free(&parser->input); + buffer_free(&parser->output); + i_free(parser); +} + +struct fts_parser fts_parser_html = { + fts_parser_html_try_init, + fts_parser_html_more, + fts_parser_html_deinit +};
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/plugins/fts/fts-parser.c Wed Jul 27 17:58:27 2011 +0300 @@ -0,0 +1,35 @@ +/* Copyright (c) 2011 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "fts-parser.h" + +const struct fts_parser *parsers[] = { + &fts_parser_html +}; + +bool fts_parser_init(const char *content_type, const char *content_disposition, + struct fts_parser **parser_r) +{ + unsigned int i; + + for (i = 0; i < N_ELEMENTS(parsers); i++) { + *parser_r = parsers[i]->try_init(content_type, + content_disposition); + if (*parser_r != NULL) + return TRUE; + } + return FALSE; +} + +void fts_parser_more(struct fts_parser *parser, struct message_block *block) +{ + parser->more(parser, block); +} + +void fts_parser_deinit(struct fts_parser **_parser) +{ + struct fts_parser *parser = *_parser; + + *_parser = NULL; + parser->deinit(parser); +}
/* src/plugins/fts/fts-parser.h */
#ifndef FTS_PARSER_H
#define FTS_PARSER_H

struct message_block;

/* Callback table of a body part parser. */
struct fts_parser {
	/* Returns a parser instance for the given content, or NULL if this
	   parser doesn't handle it. */
	struct fts_parser *(*try_init)(const char *content_type,
				       const char *content_disposition);
	/* Processes one block of the body part. */
	void (*more)(struct fts_parser *parser, struct message_block *block);
	/* Releases the parser instance. */
	void (*deinit)(struct fts_parser *parser);
};

/* Parser for HTML body parts. */
extern struct fts_parser fts_parser_html;

/* Returns TRUE and a parser in parser_r if some parser accepts the content. */
bool fts_parser_init(const char *content_type, const char *content_disposition,
		     struct fts_parser **parser_r);
/* Calls the parser's more() callback for the block. */
void fts_parser_more(struct fts_parser *parser, struct message_block *block);
/* Calls the parser's deinit() callback and sets *parser to NULL. */
void fts_parser_deinit(struct fts_parser **parser);

#endif /* FTS_PARSER_H */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/plugins/fts/html-entities.h Wed Jul 27 17:58:27 2011 +0300 @@ -0,0 +1,253 @@ +{ "quot", 0x0022 }, +{ "amp", 0x0026 }, +{ "apos", 0x0027 }, +{ "lt", 0x003C }, +{ "gt", 0x003E }, +{ "nbsp", 0x00A0 }, +{ "iexcl", 0x00A1 }, +{ "cent", 0x00A2 }, +{ "pound", 0x00A3 }, +{ "curren", 0x00A4 }, +{ "yen", 0x00A5 }, +{ "brvbar", 0x00A6 }, +{ "sect", 0x00A7 }, +{ "uml", 0x00A8 }, +{ "copy", 0x00A9 }, +{ "ordf", 0x00AA }, +{ "laquo", 0x00AB }, +{ "not", 0x00AC }, +{ "shy", 0x00AD }, +{ "reg", 0x00AE }, +{ "macr", 0x00AF }, +{ "deg", 0x00B0 }, +{ "plusmn", 0x00B1 }, +{ "sup2", 0x00B2 }, +{ "sup3", 0x00B3 }, +{ "acute", 0x00B4 }, +{ "micro", 0x00B5 }, +{ "para", 0x00B6 }, +{ "middot", 0x00B7 }, +{ "cedil", 0x00B8 }, +{ "sup1", 0x00B9 }, +{ "ordm", 0x00BA }, +{ "raquo", 0x00BB }, +{ "frac14", 0x00BC }, +{ "frac12", 0x00BD }, +{ "frac34", 0x00BE }, +{ "iquest", 0x00BF }, +{ "Agrave", 0x00C0 }, +{ "Aacute", 0x00C1 }, +{ "Acirc", 0x00C2 }, +{ "Atilde", 0x00C3 }, +{ "Auml", 0x00C4 }, +{ "Aring", 0x00C5 }, +{ "AElig", 0x00C6 }, +{ "Ccedil", 0x00C7 }, +{ "Egrave", 0x00C8 }, +{ "Eacute", 0x00C9 }, +{ "Ecirc", 0x00CA }, +{ "Euml", 0x00CB }, +{ "Igrave", 0x00CC }, +{ "Iacute", 0x00CD }, +{ "Icirc", 0x00CE }, +{ "Iuml", 0x00CF }, +{ "ETH", 0x00D0 }, +{ "Ntilde", 0x00D1 }, +{ "Ograve", 0x00D2 }, +{ "Oacute", 0x00D3 }, +{ "Ocirc", 0x00D4 }, +{ "Otilde", 0x00D5 }, +{ "Ouml", 0x00D6 }, +{ "times", 0x00D7 }, +{ "Oslash", 0x00D8 }, +{ "Ugrave", 0x00D9 }, +{ "Uacute", 0x00DA }, +{ "Ucirc", 0x00DB }, +{ "Uuml", 0x00DC }, +{ "Yacute", 0x00DD }, +{ "THORN", 0x00DE }, +{ "szlig", 0x00DF }, +{ "agrave", 0x00E0 }, +{ "aacute", 0x00E1 }, +{ "acirc", 0x00E2 }, +{ "atilde", 0x00E3 }, +{ "auml", 0x00E4 }, +{ "aring", 0x00E5 }, +{ "aelig", 0x00E6 }, +{ "ccedil", 0x00E7 }, +{ "egrave", 0x00E8 }, +{ "eacute", 0x00E9 }, +{ "ecirc", 0x00EA }, +{ "euml", 0x00EB }, +{ "igrave", 0x00EC }, +{ "iacute", 0x00ED }, +{ "icirc", 0x00EE }, +{ "iuml", 0x00EF }, +{ 
"eth", 0x00F0 }, +{ "ntilde", 0x00F1 }, +{ "ograve", 0x00F2 }, +{ "oacute", 0x00F3 }, +{ "ocirc", 0x00F4 }, +{ "otilde", 0x00F5 }, +{ "ouml", 0x00F6 }, +{ "divide", 0x00F7 }, +{ "oslash", 0x00F8 }, +{ "ugrave", 0x00F9 }, +{ "uacute", 0x00FA }, +{ "ucirc", 0x00FB }, +{ "uuml", 0x00FC }, +{ "yacute", 0x00FD }, +{ "thorn", 0x00FE }, +{ "yuml", 0x00FF }, +{ "OElig", 0x0152 }, +{ "oelig", 0x0153 }, +{ "Scaron", 0x0160 }, +{ "scaron", 0x0161 }, +{ "Yuml", 0x0178 }, +{ "fnof", 0x0192 }, +{ "circ", 0x02C6 }, +{ "tilde", 0x02DC }, +{ "Alpha", 0x0391 }, +{ "Beta", 0x0392 }, +{ "Gamma", 0x0393 }, +{ "Delta", 0x0394 }, +{ "Epsilon", 0x0395 }, +{ "Zeta", 0x0396 }, +{ "Eta", 0x0397 }, +{ "Theta", 0x0398 }, +{ "Iota", 0x0399 }, +{ "Kappa", 0x039A }, +{ "Lambda", 0x039B }, +{ "Mu", 0x039C }, +{ "Nu", 0x039D }, +{ "Xi", 0x039E }, +{ "Omicron", 0x039F }, +{ "Pi", 0x03A0 }, +{ "Rho", 0x03A1 }, +{ "Sigma", 0x03A3 }, +{ "Tau", 0x03A4 }, +{ "Upsilon", 0x03A5 }, +{ "Phi", 0x03A6 }, +{ "Chi", 0x03A7 }, +{ "Psi", 0x03A8 }, +{ "Omega", 0x03A9 }, +{ "alpha", 0x03B1 }, +{ "beta", 0x03B2 }, +{ "gamma", 0x03B3 }, +{ "delta", 0x03B4 }, +{ "epsilon", 0x03B5 }, +{ "zeta", 0x03B6 }, +{ "eta", 0x03B7 }, +{ "theta", 0x03B8 }, +{ "iota", 0x03B9 }, +{ "kappa", 0x03BA }, +{ "lambda", 0x03BB }, +{ "mu", 0x03BC }, +{ "nu", 0x03BD }, +{ "xi", 0x03BE }, +{ "omicron", 0x03BF }, +{ "pi", 0x03C0 }, +{ "rho", 0x03C1 }, +{ "sigmaf", 0x03C2 }, +{ "sigma", 0x03C3 }, +{ "tau", 0x03C4 }, +{ "upsilon", 0x03C5 }, +{ "phi", 0x03C6 }, +{ "chi", 0x03C7 }, +{ "psi", 0x03C8 }, +{ "omega", 0x03C9 }, +{ "thetasym", 0x03D1 }, +{ "upsih", 0x03D2 }, +{ "piv", 0x03D6 }, +{ "ensp", 0x2002 }, +{ "emsp", 0x2003 }, +{ "thinsp", 0x2009 }, +{ "zwnj", 0x200C }, +{ "zwj", 0x200D }, +{ "lrm", 0x200E }, +{ "rlm", 0x200F }, +{ "ndash", 0x2013 }, +{ "mdash", 0x2014 }, +{ "lsquo", 0x2018 }, +{ "rsquo", 0x2019 }, +{ "sbquo", 0x201A }, +{ "ldquo", 0x201C }, +{ "rdquo", 0x201D }, +{ "bdquo", 0x201E }, +{ "dagger", 0x2020 }, +{ "Dagger", 0x2021 
}, +{ "bull", 0x2022 }, +{ "hellip", 0x2026 }, +{ "permil", 0x2030 }, +{ "prime", 0x2032 }, +{ "Prime", 0x2033 }, +{ "lsaquo", 0x2039 }, +{ "rsaquo", 0x203A }, +{ "oline", 0x203E }, +{ "frasl", 0x2044 }, +{ "euro", 0x20AC }, +{ "image", 0x2111 }, +{ "weierp", 0x2118 }, +{ "real", 0x211C }, +{ "trade", 0x2122 }, +{ "alefsym", 0x2135 }, +{ "larr", 0x2190 }, +{ "uarr", 0x2191 }, +{ "rarr", 0x2192 }, +{ "darr", 0x2193 }, +{ "harr", 0x2194 }, +{ "crarr", 0x21B5 }, +{ "lArr", 0x21D0 }, +{ "uArr", 0x21D1 }, +{ "rArr", 0x21D2 }, +{ "dArr", 0x21D3 }, +{ "hArr", 0x21D4 }, +{ "forall", 0x2200 }, +{ "part", 0x2202 }, +{ "exist", 0x2203 }, +{ "empty", 0x2205 }, +{ "nabla", 0x2207 }, +{ "isin", 0x2208 }, +{ "notin", 0x2209 }, +{ "ni", 0x220B }, +{ "prod", 0x220F }, +{ "sum", 0x2211 }, +{ "minus", 0x2212 }, +{ "lowast", 0x2217 }, +{ "radic", 0x221A }, +{ "prop", 0x221D }, +{ "infin", 0x221E }, +{ "ang", 0x2220 }, +{ "and", 0x2227 }, +{ "or", 0x2228 }, +{ "cap", 0x2229 }, +{ "cup", 0x222A }, +{ "int", 0x222B }, +{ "there4", 0x2234 }, +{ "sim", 0x223C }, +{ "cong", 0x2245 }, +{ "asymp", 0x2248 }, +{ "ne", 0x2260 }, +{ "equiv", 0x2261 }, +{ "le", 0x2264 }, +{ "ge", 0x2265 }, +{ "sub", 0x2282 }, +{ "sup", 0x2283 }, +{ "nsub", 0x2284 }, +{ "sube", 0x2286 }, +{ "supe", 0x2287 }, +{ "oplus", 0x2295 }, +{ "otimes", 0x2297 }, +{ "perp", 0x22A5 }, +{ "sdot", 0x22C5 }, +{ "lceil", 0x2308 }, +{ "rceil", 0x2309 }, +{ "lfloor", 0x230A }, +{ "rfloor", 0x230B }, +{ "lang", 0x27E8 }, +{ "rang", 0x27E9 }, +{ "loz", 0x25CA }, +{ "spades", 0x2660 }, +{ "clubs", 0x2663 }, +{ "hearts", 0x2665 }, +{ "diams", 0x2666 }