Mercurial > dovecot > core-2.2
changeset 11316:757cb3148407 HEAD
fts: Backends can now index non-text body parts if they support it.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Mon, 17 May 2010 18:06:57 +0200 |
parents | 7bb35ad5e80e |
children | 4ba05c3702be |
files | src/plugins/fts-solr/fts-backend-solr.c src/plugins/fts-squat/fts-backend-squat.c src/plugins/fts/fts-api-private.h src/plugins/fts/fts-api.c src/plugins/fts/fts-api.h src/plugins/fts/fts-storage.c |
diffstat | 6 files changed, 273 insertions(+), 82 deletions(-) [+] |
line wrap: on
line diff
--- a/src/plugins/fts-solr/fts-backend-solr.c Fri May 14 17:41:34 2010 +0200 +++ b/src/plugins/fts-solr/fts-backend-solr.c Mon May 17 18:06:57 2010 +0200 @@ -561,48 +561,78 @@ xml_encode(str, backend->id_box_name); } -static int -fts_backend_solr_build_more(struct fts_backend_build_context *_ctx, - uint32_t uid, const unsigned char *data, - size_t size, bool headers) +static void +fts_backend_solr_uid_changed(struct solr_fts_backend_build_context *ctx, + uint32_t uid) +{ + if (ctx->post == NULL) { + ctx->post = solr_connection_post_begin(solr_conn); + str_append(ctx->cmd, "<add>"); + } else { + str_append(ctx->cmd, "</field></doc>"); + } + ctx->prev_uid = uid; + ctx->headers = FALSE; + + fts_backend_solr_add_doc_prefix(ctx, uid); + str_printfa(ctx->cmd, "<field name=\"id\">"); + xml_encode_id(ctx->cmd, ctx->ctx.backend, uid, ctx->uid_validity); + str_append(ctx->cmd, "</field>"); +} + +static void +fts_backend_solr_build_hdr(struct fts_backend_build_context *_ctx, + uint32_t uid) { struct solr_fts_backend_build_context *ctx = (struct solr_fts_backend_build_context *)_ctx; - string_t *cmd = ctx->cmd; - /* body comes first, then headers */ - if (ctx->prev_uid != uid) { - /* uid changed */ - if (ctx->post == NULL) { - ctx->post = solr_connection_post_begin(solr_conn); - str_append(cmd, "<add>"); - } else { - str_append(cmd, "</field></doc>"); - } - ctx->prev_uid = uid; - - fts_backend_solr_add_doc_prefix(ctx, uid); - str_printfa(cmd, "<field name=\"id\">"); - xml_encode_id(cmd, _ctx->backend, uid, ctx->uid_validity); - str_append(cmd, "</field>"); - - ctx->headers = headers; - if (headers) { - str_append(cmd, "<field name=\"hdr\">"); - } else { - str_append(cmd, "<field name=\"body\">"); - } - } else if (headers && !ctx->headers) { - str_append(cmd, "</field><field name=\"hdr\">"); - } else { - i_assert(!(!headers && ctx->headers)); + if (uid != ctx->prev_uid) + fts_backend_solr_uid_changed(ctx, uid); + else { + i_assert(!ctx->headers); + str_append(ctx->cmd, "</field>"); } - xml_encode_data(cmd, data, size); - if (str_len(cmd) > SOLR_CMDBUF_SIZE-128) { - solr_connection_post_more(ctx->post, str_data(cmd), - str_len(cmd)); - str_truncate(cmd, 0); + ctx->headers = TRUE; + str_append(ctx->cmd, "<field name=\"hdr\">"); +} + +static bool +fts_backend_solr_build_body_begin(struct fts_backend_build_context *_ctx, + uint32_t uid, const char *content_type, + const char *content_disposition ATTR_UNUSED) +{ + struct solr_fts_backend_build_context *ctx = + (struct solr_fts_backend_build_context *)_ctx; + + if (!fts_backend_default_can_index(content_type)) + return FALSE; + + if (uid != ctx->prev_uid) + fts_backend_solr_uid_changed(ctx, uid); + else { + /* body comes first, then headers */ + i_assert(!ctx->headers); + } + + ctx->headers = FALSE; + str_append(ctx->cmd, "<field name=\"body\">"); + return TRUE; +} + +static int +fts_backend_solr_build_more(struct fts_backend_build_context *_ctx, + const unsigned char *data, size_t size) +{ + struct solr_fts_backend_build_context *ctx = + (struct solr_fts_backend_build_context *)_ctx; + + xml_encode_data(ctx->cmd, data, size); + if (str_len(ctx->cmd) > SOLR_CMDBUF_SIZE-128) { + solr_connection_post_more(ctx->post, str_data(ctx->cmd), + str_len(ctx->cmd)); + str_truncate(ctx->cmd, 0); } return 0; } @@ -806,6 +836,9 @@ fts_backend_solr_get_last_uid, fts_backend_solr_get_all_last_uids, fts_backend_solr_build_init, + fts_backend_solr_build_hdr, + fts_backend_solr_build_body_begin, + NULL, fts_backend_solr_build_more, fts_backend_solr_build_deinit, fts_backend_solr_expunge,
--- a/src/plugins/fts-squat/fts-backend-squat.c Fri May 14 17:41:34 2010 +0200 +++ b/src/plugins/fts-squat/fts-backend-squat.c Mon May 17 18:06:57 2010 +0200 @@ -21,6 +21,8 @@ struct squat_fts_backend_build_context { struct fts_backend_build_context ctx; struct squat_trie_build_context *build_ctx; + enum squat_index_type squat_type; + uint32_t uid; }; static void @@ -127,18 +129,41 @@ return 0; } -static int -fts_backend_squat_build_more(struct fts_backend_build_context *_ctx, - uint32_t uid, const unsigned char *data, - size_t size, bool headers) +static void +fts_backend_squat_build_hdr(struct fts_backend_build_context *_ctx, + uint32_t uid) +{ + struct squat_fts_backend_build_context *ctx = + (struct squat_fts_backend_build_context *)_ctx; + + ctx->squat_type = SQUAT_INDEX_TYPE_HEADER; + ctx->uid = uid; +} + +static bool +fts_backend_squat_build_body_begin(struct fts_backend_build_context *_ctx, + uint32_t uid, const char *content_type, + const char *content_disposition ATTR_UNUSED) { struct squat_fts_backend_build_context *ctx = (struct squat_fts_backend_build_context *)_ctx; - enum squat_index_type squat_type; + + if (!fts_backend_default_can_index(content_type)) + return FALSE; + + ctx->squat_type = SQUAT_INDEX_TYPE_BODY; + ctx->uid = uid; + return TRUE; +} - squat_type = headers ? SQUAT_INDEX_TYPE_HEADER : - SQUAT_INDEX_TYPE_BODY; - return squat_trie_build_more(ctx->build_ctx, uid, squat_type, +static int +fts_backend_squat_build_more(struct fts_backend_build_context *_ctx, + const unsigned char *data, size_t size) +{ + struct squat_fts_backend_build_context *ctx = + (struct squat_fts_backend_build_context *)_ctx; + + return squat_trie_build_more(ctx->build_ctx, ctx->uid, ctx->squat_type, data, size); } @@ -248,6 +273,9 @@ fts_backend_squat_get_last_uid, NULL, fts_backend_squat_build_init, + fts_backend_squat_build_hdr, + fts_backend_squat_build_body_begin, + NULL, fts_backend_squat_build_more, fts_backend_squat_build_deinit, fts_backend_squat_expunge,
--- a/src/plugins/fts/fts-api-private.h Fri May 14 17:41:34 2010 +0200 +++ b/src/plugins/fts/fts-api-private.h Mon May 17 18:06:57 2010 +0200 @@ -13,8 +13,13 @@ int (*build_init)(struct fts_backend *backend, uint32_t *last_uid_r, struct fts_backend_build_context **ctx_r); - int (*build_more)(struct fts_backend_build_context *ctx, uint32_t uid, - const unsigned char *data, size_t size, bool headers); + void (*build_hdr)(struct fts_backend_build_context *ctx, uint32_t uid); + bool (*build_body_begin)(struct fts_backend_build_context *ctx, + uint32_t uid, const char *content_type, + const char *content_disposition); + void (*build_body_end)(struct fts_backend_build_context *ctx); + int (*build_more)(struct fts_backend_build_context *ctx, + const unsigned char *data, size_t size); int (*build_deinit)(struct fts_backend_build_context *ctx); void (*expunge)(struct fts_backend *backend, struct mail *mail); @@ -80,6 +85,8 @@ void fts_backend_register(const struct fts_backend *backend); void fts_backend_unregister(const char *name); +bool fts_backend_default_can_index(const char *content_type); + void fts_filter_uids(ARRAY_TYPE(seq_range) *definite_dest, const ARRAY_TYPE(seq_range) *definite_filter, ARRAY_TYPE(seq_range) *maybe_dest,
--- a/src/plugins/fts/fts-api.c Fri May 14 17:41:34 2010 +0200 +++ b/src/plugins/fts/fts-api.c Mon May 17 18:06:57 2010 +0200 @@ -99,10 +99,29 @@ return ret; } -int fts_backend_build_more(struct fts_backend_build_context *ctx, uint32_t uid, - const unsigned char *data, size_t size, bool headers) +void fts_backend_build_hdr(struct fts_backend_build_context *ctx, uint32_t uid) +{ + ctx->backend->v.build_hdr(ctx, uid); +} + +bool fts_backend_build_body_begin(struct fts_backend_build_context *ctx, + uint32_t uid, const char *content_type, + const char *content_disposition) { - return ctx->backend->v.build_more(ctx, uid, data, size, headers); + return ctx->backend->v.build_body_begin(ctx, uid, content_type, + content_disposition); +} + +void fts_backend_build_body_end(struct fts_backend_build_context *ctx) +{ + if (ctx->backend->v.build_body_end != NULL) + ctx->backend->v.build_body_end(ctx); +} + +int fts_backend_build_more(struct fts_backend_build_context *ctx, + const unsigned char *data, size_t size) +{ + return ctx->backend->v.build_more(ctx, data, size); } int fts_backend_build_deinit(struct fts_backend_build_context **_ctx) @@ -321,3 +340,9 @@ pool_unref(&ctx->pool); return ret; } + +bool fts_backend_default_can_index(const char *content_type) +{ + return strncasecmp(content_type, "text/", 5) == 0 || + strcasecmp(content_type, "message/rfc822") == 0; +}
--- a/src/plugins/fts/fts-api.h Fri May 14 17:41:34 2010 +0200 +++ b/src/plugins/fts/fts-api.h Mon May 17 18:06:57 2010 +0200 @@ -8,8 +8,12 @@ #include "seq-range-array.h" enum fts_lookup_flags { + /* Search within header and/or body. + At least one of these must be set. */ FTS_LOOKUP_FLAG_HEADER = 0x01, FTS_LOOKUP_FLAG_BODY = 0x02, + + /* The key must NOT be found */ FTS_LOOKUP_FLAG_INVERT = 0x04 }; @@ -33,23 +37,42 @@ /* Get the last_uid for the mailbox. */ int fts_backend_get_last_uid(struct fts_backend *backend, uint32_t *last_uid_r); /* Get last_uids for all mailboxes that might be backend mailboxes for a - virtual mailbox. Depending on virtual mailbox configuration, this function - may also return mailboxes that don't really even match the virtual mailbox - patterns. The caller should filter out the list itself. */ + virtual mailbox. The backend can use mailbox_get_virtual_backend_boxes() or + mailbox_get_virtual_box_patterns() functions to get the list of mailboxes. + + Depending on virtual mailbox configuration, this function may also return + mailboxes that don't even match the virtual mailbox patterns. The caller + needs to be able to ignore the unnecessary ones. */ int fts_backend_get_all_last_uids(struct fts_backend *backend, pool_t pool, ARRAY_TYPE(fts_backend_uid_map) *last_uids); -/* Initialize adding new data to the index. last_uid_r is set to the last UID - that exists in the index. */ +/* Initialize adding new data to the index. last_uid_r is set to the last + indexed message's IMAP UID */ int fts_backend_build_init(struct fts_backend *backend, uint32_t *last_uid_r, struct fts_backend_build_context **ctx_r); -/* Add more contents to the index. The data must contain only full valid - UTF-8 characters, but it doesn't need to be NUL-terminated. size contains - the data size in bytes, not characters. headers is TRUE if the data contains - message headers instead of message body. */ -int fts_backend_build_more(struct fts_backend_build_context *ctx, uint32_t uid, - const unsigned char *data, size_t size, - bool headers); +/* Switch to building index for mail's headers or MIME part headers. */ +void fts_backend_build_hdr(struct fts_backend_build_context *ctx, uint32_t uid); +/* Switch to building index for the next body part. If backend doesn't want + to index this body part (based on content type/disposition check), it can + return FALSE and caller will skip to next part. The backend must return + TRUE for all text/xxx and message/rfc822 content types. + + The content_type contains a valid parsed "type/subtype" string. For messages + without (valid) Content-Type header, the content_type is set to "text/plain". + The content_disposition is passed without parsing/validation if it exists, + otherwise it's NULL. */ +bool fts_backend_build_body_begin(struct fts_backend_build_context *ctx, + uint32_t uid, const char *content_type, + const char *content_disposition); +/* Called once when the whole body part has been sent. */ +void fts_backend_build_body_end(struct fts_backend_build_context *ctx); +/* Add more content to the index for the currently selected header/body part. + The data must contain only full valid UTF-8 characters, but it doesn't need + to be NUL-terminated. size contains the data size in bytes, not characters. + This function may be called many times and the data block sizes may be + small. Backend returns 0 if ok, -1 if build should be aborted. */ +int fts_backend_build_more(struct fts_backend_build_context *ctx, + const unsigned char *data, size_t size); /* Finish adding new data to the index. */ int fts_backend_build_deinit(struct fts_backend_build_context **ctx); @@ -57,14 +80,15 @@ bool fts_backend_is_building(struct fts_backend *backend); /* Expunge given mail from the backend. Note that the transaction may still - fail later. */ + fail later, so backend shouldn't do anything irreversible. */ void fts_backend_expunge(struct fts_backend *backend, struct mail *mail); /* Called after transaction has been committed or rollbacked. */ void fts_backend_expunge_finish(struct fts_backend *backend, struct mailbox *box, bool committed); /* Lock/unlock the backend for multiple lookups. Returns 1 if locked, 0 if - locking timeouted, -1 if error. + locking timeouted, -1 if error. If backend doesn't require locking, it + always returns 1. It's not required to call these functions manually, but if you're doing multiple lookup/filter operations this avoids multiple lock/unlock calls. */ @@ -74,10 +98,14 @@ /* Start building a FTS lookup. */ struct fts_backend_lookup_context * fts_backend_lookup_init(struct fts_backend *backend); -/* Add a new search key to the lookup. */ +/* Add a new search key to the lookup. The keys are ANDed together. */ void fts_backend_lookup_add(struct fts_backend_lookup_context *ctx, const char *key, enum fts_lookup_flags flags); -/* Finish the lookup and return found UIDs. */ +/* Finish the lookup and return found UIDs. The definite_uids are returned + to client directly, while for maybe_uids Dovecot first verifies (by + opening and reading the mail) that they really do contain the searched + keys. The maybe_uids is useful with backends that can only filter out + messages, but can't definitively say if the search matched a message. */ int fts_backend_lookup_deinit(struct fts_backend_lookup_context **ctx, ARRAY_TYPE(seq_range) *definite_uids, ARRAY_TYPE(seq_range) *maybe_uids,
--- a/src/plugins/fts/fts-storage.c Fri May 14 17:41:34 2010 +0200 +++ b/src/plugins/fts/fts-storage.c Mon May 17 18:06:57 2010 +0200 @@ -6,6 +6,7 @@ #include "str.h" #include "istream.h" #include "time-util.h" +#include "rfc822-parser.h" #include "message-parser.h" #include "message-decoder.h" #include "mail-namespace.h" @@ -40,6 +41,7 @@ uint32_t uid; string_t *headers; + char *content_type, *content_disposition; }; struct fts_transaction_context { @@ -77,20 +79,52 @@ if (str_len(ctx->headers) == 0) return 0; - if (fts_backend_build_more(ctx->build, ctx->uid, str_data(ctx->headers), - str_len(ctx->headers), TRUE) < 0) + fts_backend_build_hdr(ctx->build, ctx->uid); + if (fts_backend_build_more(ctx->build, str_data(ctx->headers), + str_len(ctx->headers)) < 0) return -1; str_truncate(ctx->headers, 0); return 0; } -static bool fts_build_want_index_part(const struct message_block *block) +static void fts_build_parse_content_type(struct fts_storage_build_context *ctx, + const struct message_header_line *hdr) { - /* we'll index only text/xxx and message/rfc822 parts for now */ - return (block->part->flags & - (MESSAGE_PART_FLAG_TEXT | - MESSAGE_PART_FLAG_MESSAGE_RFC822)) != 0; + struct rfc822_parser_context parser; + string_t *content_type; + + rfc822_parser_init(&parser, hdr->full_value, hdr->full_value_len, NULL); + (void)rfc822_skip_lwsp(&parser); + + T_BEGIN { + content_type = t_str_new(64); + if (rfc822_parse_content_type(&parser, content_type) >= 0) { + i_free(ctx->content_type); + ctx->content_type = i_strdup(str_c(content_type)); + } + } T_END; +} + +static void +fts_build_parse_content_disposition(struct fts_storage_build_context *ctx, + const struct message_header_line *hdr) +{ + /* just pass it as-is to backend. */ + i_free(ctx->content_disposition); + ctx->content_disposition = + i_strndup(hdr->full_value, hdr->full_value_len); +} + +static void fts_parse_mail_header(struct fts_storage_build_context *ctx, + const struct message_block *raw_block) +{ + const struct message_header_line *hdr = raw_block->hdr; + + if (strcasecmp(hdr->name, "Content-Type") == 0) + fts_build_parse_content_type(ctx, hdr); + else if (strcasecmp(hdr->name, "Content-Disposition") == 0) + fts_build_parse_content_disposition(ctx, hdr); } static void fts_build_mail_header(struct fts_storage_build_context *ctx, @@ -114,6 +148,7 @@ struct message_decoder_context *decoder; struct message_block raw_block, block; struct message_part *prev_part, *parts; + bool skip_body = FALSE, body_part = FALSE; int ret; ctx->uid = uid; @@ -125,7 +160,8 @@ parser = message_parser_init(pool_datastack_create(), input, MESSAGE_HEADER_PARSER_FLAG_CLEAN_ONELINE, 0); - decoder = message_decoder_init(MESSAGE_DECODER_FLAG_DTCASE); + decoder = message_decoder_init(MESSAGE_DECODER_FLAG_DTCASE | + MESSAGE_DECODER_FLAG_RETURN_BINARY); for (;;) { ret = message_parser_parse_next_block(parser, &raw_block); i_assert(ret != 0); @@ -134,30 +170,62 @@ ret = 0; break; } - if (raw_block.hdr == NULL && raw_block.size != 0 && - !fts_build_want_index_part(&raw_block)) { - /* skipping this body */ - continue; + + if (raw_block.part != prev_part) { + /* body part changed. we're now parsing the end of + boundary, possibly followed by message epilogue */ + if (!skip_body && prev_part != NULL) { + i_assert(body_part); + fts_backend_build_body_end(ctx->build); + } + prev_part = raw_block.part; + i_free_and_null(ctx->content_type); + i_free_and_null(ctx->content_disposition); + + if (raw_block.size != 0) { + /* multipart. skip until beginning of next + part's headers */ + skip_body = TRUE; + } + } + + if (raw_block.hdr != NULL) { + /* always handle headers */ + } else if (raw_block.size == 0) { + /* end of headers */ + const char *content_type = ctx->content_type == NULL ? + "text/plain" : ctx->content_type; + + skip_body = !fts_backend_build_body_begin(ctx->build, + ctx->uid, content_type, + ctx->content_disposition); + body_part = TRUE; + } else { + if (skip_body) + continue; } if (!message_decoder_decode_next_block(decoder, &raw_block, &block)) continue; - if (block.hdr != NULL) + if (block.hdr != NULL) { + fts_parse_mail_header(ctx, &raw_block); fts_build_mail_header(ctx, &block); - else if (block.size == 0) { + } else if (block.size == 0) { /* end of headers */ str_append_c(ctx->headers, '\n'); } else { - if (fts_backend_build_more(ctx->build, ctx->uid, - block.data, block.size, - FALSE) < 0) { + i_assert(body_part); + if (fts_backend_build_more(ctx->build, + block.data, block.size) < 0) { ret = -1; break; } } } + if (!skip_body && body_part) + fts_backend_build_body_end(ctx->build); if (message_parser_deinit(&parser, &parts) < 0) mail_set_cache_corrupted(ctx->mail, MAIL_FETCH_MESSAGE_PARTS); message_decoder_deinit(&decoder); @@ -483,6 +551,8 @@ str_free(&ctx->headers); mail_search_args_unref(&ctx->search_args); + i_free(ctx->content_type); + i_free(ctx->content_disposition); i_free(ctx); return ret; }