changeset 11316:757cb3148407 HEAD

fts: Backends can now index non-text body parts if they support it.
author Timo Sirainen <tss@iki.fi>
date Mon, 17 May 2010 18:06:57 +0200
parents 7bb35ad5e80e
children 4ba05c3702be
files src/plugins/fts-solr/fts-backend-solr.c src/plugins/fts-squat/fts-backend-squat.c src/plugins/fts/fts-api-private.h src/plugins/fts/fts-api.c src/plugins/fts/fts-api.h src/plugins/fts/fts-storage.c
diffstat 6 files changed, 273 insertions(+), 82 deletions(-) [+]
line wrap: on
line diff
--- a/src/plugins/fts-solr/fts-backend-solr.c	Fri May 14 17:41:34 2010 +0200
+++ b/src/plugins/fts-solr/fts-backend-solr.c	Mon May 17 18:06:57 2010 +0200
@@ -561,48 +561,78 @@
 	xml_encode(str, backend->id_box_name);
 }
 
-static int
-fts_backend_solr_build_more(struct fts_backend_build_context *_ctx,
-			    uint32_t uid, const unsigned char *data,
-			    size_t size, bool headers)
+static void
+fts_backend_solr_uid_changed(struct solr_fts_backend_build_context *ctx,
+			     uint32_t uid)
+{
+	if (ctx->post == NULL) {
+		ctx->post = solr_connection_post_begin(solr_conn);
+		str_append(ctx->cmd, "<add>");
+	} else {
+		str_append(ctx->cmd, "</field></doc>");
+	}
+	ctx->prev_uid = uid;
+	ctx->headers = FALSE;
+	/* begin the new document: prefix first, then its "id" field */
+	fts_backend_solr_add_doc_prefix(ctx, uid);
+	str_printfa(ctx->cmd, "<field name=\"id\">");
+	xml_encode_id(ctx->cmd, ctx->ctx.backend, uid, ctx->uid_validity);
+	str_append(ctx->cmd, "</field>");
+}
+
+static void
+fts_backend_solr_build_hdr(struct fts_backend_build_context *_ctx,
+			   uint32_t uid)
 {
 	struct solr_fts_backend_build_context *ctx =
 		(struct solr_fts_backend_build_context *)_ctx;
-	string_t *cmd = ctx->cmd;
 
-	/* body comes first, then headers */
-	if (ctx->prev_uid != uid) {
-		/* uid changed */
-		if (ctx->post == NULL) {
-			ctx->post = solr_connection_post_begin(solr_conn);
-			str_append(cmd, "<add>");
-		} else {
-			str_append(cmd, "</field></doc>");
-		}
-		ctx->prev_uid = uid;
-
-		fts_backend_solr_add_doc_prefix(ctx, uid);
-		str_printfa(cmd, "<field name=\"id\">");
-		xml_encode_id(cmd, _ctx->backend, uid, ctx->uid_validity);
-		str_append(cmd, "</field>");
-
-		ctx->headers = headers;
-		if (headers) {
-			str_append(cmd, "<field name=\"hdr\">");
-		} else {
-			str_append(cmd, "<field name=\"body\">");
-		}
-	} else if (headers && !ctx->headers) {
-		str_append(cmd, "</field><field name=\"hdr\">");
-	} else {
-		i_assert(!(!headers && ctx->headers));
+	if (uid != ctx->prev_uid)
+		fts_backend_solr_uid_changed(ctx, uid);
+	else {
+		i_assert(!ctx->headers);
+		str_append(ctx->cmd, "</field>");
 	}
 
-	xml_encode_data(cmd, data, size);
-	if (str_len(cmd) > SOLR_CMDBUF_SIZE-128) {
-		solr_connection_post_more(ctx->post, str_data(cmd),
-					  str_len(cmd));
-		str_truncate(cmd, 0);
+	ctx->headers = TRUE;
+	str_append(ctx->cmd, "<field name=\"hdr\">");
+}
+
+static bool
+fts_backend_solr_build_body_begin(struct fts_backend_build_context *_ctx,
+				  uint32_t uid, const char *content_type,
+				  const char *content_disposition ATTR_UNUSED)
+{
+	struct solr_fts_backend_build_context *ctx =
+		(struct solr_fts_backend_build_context *)_ctx;
+
+	if (!fts_backend_default_can_index(content_type))
+		return FALSE;
+
+	if (uid != ctx->prev_uid) {
+		fts_backend_solr_uid_changed(ctx, uid);
+		str_append(ctx->cmd, "<field name=\"body\">");
+	} else {
+		/* body comes first, then headers. an earlier part of this
+		   mail already opened the body field: keep appending to it */
+		i_assert(!ctx->headers);
+	}
+	ctx->headers = FALSE;
+	return TRUE;
+}
+
+static int
+fts_backend_solr_build_more(struct fts_backend_build_context *_ctx,
+			    const unsigned char *data, size_t size)
+{
+	struct solr_fts_backend_build_context *ctx =
+		(struct solr_fts_backend_build_context *)_ctx;
+
+	xml_encode_data(ctx->cmd, data, size);
+	if (str_len(ctx->cmd) > SOLR_CMDBUF_SIZE-128) {
+		solr_connection_post_more(ctx->post, str_data(ctx->cmd),
+					  str_len(ctx->cmd));
+		str_truncate(ctx->cmd, 0);
 	}
 	return 0;
 }
@@ -806,6 +836,9 @@
 		fts_backend_solr_get_last_uid,
 		fts_backend_solr_get_all_last_uids,
 		fts_backend_solr_build_init,
+		fts_backend_solr_build_hdr,
+		fts_backend_solr_build_body_begin,
+		NULL,
 		fts_backend_solr_build_more,
 		fts_backend_solr_build_deinit,
 		fts_backend_solr_expunge,
--- a/src/plugins/fts-squat/fts-backend-squat.c	Fri May 14 17:41:34 2010 +0200
+++ b/src/plugins/fts-squat/fts-backend-squat.c	Mon May 17 18:06:57 2010 +0200
@@ -21,6 +21,8 @@
 struct squat_fts_backend_build_context {
 	struct fts_backend_build_context ctx;
 	struct squat_trie_build_context *build_ctx;
+	enum squat_index_type squat_type;
+	uint32_t uid;
 };
 
 static void
@@ -127,18 +129,41 @@
 	return 0;
 }
 
-static int
-fts_backend_squat_build_more(struct fts_backend_build_context *_ctx,
-			     uint32_t uid, const unsigned char *data,
-			     size_t size, bool headers)
+static void
+fts_backend_squat_build_hdr(struct fts_backend_build_context *_ctx,
+			    uint32_t uid)
+{
+	struct squat_fts_backend_build_context *ctx =
+		(struct squat_fts_backend_build_context *)_ctx;
+	/* just remember these; build_more() hands them to the trie */
+	ctx->squat_type = SQUAT_INDEX_TYPE_HEADER;
+	ctx->uid = uid;
+}
+
+static bool
+fts_backend_squat_build_body_begin(struct fts_backend_build_context *_ctx,
+				   uint32_t uid, const char *content_type,
+				   const char *content_disposition ATTR_UNUSED)
 {
 	struct squat_fts_backend_build_context *ctx =
 		(struct squat_fts_backend_build_context *)_ctx;
-	enum squat_index_type squat_type;
+
+	if (!fts_backend_default_can_index(content_type))
+		return FALSE;
+
+	ctx->squat_type = SQUAT_INDEX_TYPE_BODY;
+	ctx->uid = uid;
+	return TRUE;
+}
 
-	squat_type = headers ? SQUAT_INDEX_TYPE_HEADER :
-		SQUAT_INDEX_TYPE_BODY;
-	return squat_trie_build_more(ctx->build_ctx, uid, squat_type,
+static int
+fts_backend_squat_build_more(struct fts_backend_build_context *_ctx,
+			     const unsigned char *data, size_t size)
+{
+	struct squat_fts_backend_build_context *ctx =
+		(struct squat_fts_backend_build_context *)_ctx;
+
+	return squat_trie_build_more(ctx->build_ctx, ctx->uid, ctx->squat_type,
 				     data, size);
 }
 
@@ -248,6 +273,9 @@
 		fts_backend_squat_get_last_uid,
 		NULL,
 		fts_backend_squat_build_init,
+		fts_backend_squat_build_hdr,
+		fts_backend_squat_build_body_begin,
+		NULL,
 		fts_backend_squat_build_more,
 		fts_backend_squat_build_deinit,
 		fts_backend_squat_expunge,
--- a/src/plugins/fts/fts-api-private.h	Fri May 14 17:41:34 2010 +0200
+++ b/src/plugins/fts/fts-api-private.h	Mon May 17 18:06:57 2010 +0200
@@ -13,8 +13,13 @@
 
 	int (*build_init)(struct fts_backend *backend, uint32_t *last_uid_r,
 			  struct fts_backend_build_context **ctx_r);
-	int (*build_more)(struct fts_backend_build_context *ctx, uint32_t uid,
-			  const unsigned char *data, size_t size, bool headers);
+	void (*build_hdr)(struct fts_backend_build_context *ctx, uint32_t uid);
+	bool (*build_body_begin)(struct fts_backend_build_context *ctx,
+				 uint32_t uid, const char *content_type,
+				 const char *content_disposition);
+	void (*build_body_end)(struct fts_backend_build_context *ctx);
+	int (*build_more)(struct fts_backend_build_context *ctx,
+			  const unsigned char *data, size_t size);
 	int (*build_deinit)(struct fts_backend_build_context *ctx);
 
 	void (*expunge)(struct fts_backend *backend, struct mail *mail);
@@ -80,6 +85,8 @@
 void fts_backend_register(const struct fts_backend *backend);
 void fts_backend_unregister(const char *name);
 
+bool fts_backend_default_can_index(const char *content_type);
+
 void fts_filter_uids(ARRAY_TYPE(seq_range) *definite_dest,
 		     const ARRAY_TYPE(seq_range) *definite_filter,
 		     ARRAY_TYPE(seq_range) *maybe_dest,
--- a/src/plugins/fts/fts-api.c	Fri May 14 17:41:34 2010 +0200
+++ b/src/plugins/fts/fts-api.c	Mon May 17 18:06:57 2010 +0200
@@ -99,10 +99,29 @@
 	return ret;
 }
 
-int fts_backend_build_more(struct fts_backend_build_context *ctx, uint32_t uid,
-			   const unsigned char *data, size_t size, bool headers)
+void fts_backend_build_hdr(struct fts_backend_build_context *ctx, uint32_t uid)
+{
+	ctx->backend->v.build_hdr(ctx, uid);
+}
+
+bool fts_backend_build_body_begin(struct fts_backend_build_context *ctx,
+				  uint32_t uid, const char *content_type,
+				  const char *content_disposition)
 {
-	return ctx->backend->v.build_more(ctx, uid, data, size, headers);
+	return ctx->backend->v.build_body_begin(ctx, uid, content_type,
+						content_disposition);
+}
+
+void fts_backend_build_body_end(struct fts_backend_build_context *ctx)
+{
+	if (ctx->backend->v.build_body_end != NULL)
+		ctx->backend->v.build_body_end(ctx);
+}
+
+int fts_backend_build_more(struct fts_backend_build_context *ctx,
+			   const unsigned char *data, size_t size)
+{
+	return ctx->backend->v.build_more(ctx, data, size);
 }
 
 int fts_backend_build_deinit(struct fts_backend_build_context **_ctx)
@@ -321,3 +340,9 @@
 	pool_unref(&ctx->pool);
 	return ret;
 }
+
+bool fts_backend_default_can_index(const char *content_type)
+{
+	return strncasecmp(content_type, "text/", 5) == 0 ||
+		strcasecmp(content_type, "message/rfc822") == 0;
+}
--- a/src/plugins/fts/fts-api.h	Fri May 14 17:41:34 2010 +0200
+++ b/src/plugins/fts/fts-api.h	Mon May 17 18:06:57 2010 +0200
@@ -8,8 +8,12 @@
 #include "seq-range-array.h"
 
 enum fts_lookup_flags {
+	/* Search within header and/or body.
+	   At least one of these must be set. */
 	FTS_LOOKUP_FLAG_HEADER	= 0x01,
 	FTS_LOOKUP_FLAG_BODY	= 0x02,
+
+	/* The key must NOT be found */
 	FTS_LOOKUP_FLAG_INVERT	= 0x04
 };
 
@@ -33,23 +37,42 @@
 /* Get the last_uid for the mailbox. */
 int fts_backend_get_last_uid(struct fts_backend *backend, uint32_t *last_uid_r);
 /* Get last_uids for all mailboxes that might be backend mailboxes for a
-   virtual mailbox. Depending on virtual mailbox configuration, this function
-   may also return mailboxes that don't really even match the virtual mailbox
-   patterns. The caller should filter out the list itself. */
+   virtual mailbox. The backend can use mailbox_get_virtual_backend_boxes() or
+   mailbox_get_virtual_box_patterns() functions to get the list of mailboxes.
+
+   Depending on virtual mailbox configuration, this function may also return
+   mailboxes that don't even match the virtual mailbox patterns. The caller
+   needs to be able to ignore the unnecessary ones. */
 int fts_backend_get_all_last_uids(struct fts_backend *backend, pool_t pool,
 				  ARRAY_TYPE(fts_backend_uid_map) *last_uids);
 
-/* Initialize adding new data to the index. last_uid_r is set to the last UID
-   that exists in the index. */
+/* Initialize adding new data to the index. last_uid_r is set to the last
+   indexed message's IMAP UID. */
 int fts_backend_build_init(struct fts_backend *backend, uint32_t *last_uid_r,
 			   struct fts_backend_build_context **ctx_r);
-/* Add more contents to the index. The data must contain only full valid
-   UTF-8 characters, but it doesn't need to be NUL-terminated. size contains
-   the data size in bytes, not characters. headers is TRUE if the data contains
-   message headers instead of message body. */
-int fts_backend_build_more(struct fts_backend_build_context *ctx, uint32_t uid,
-			   const unsigned char *data, size_t size,
-			   bool headers);
+/* Switch to building index for mail's headers or MIME part headers. */
+void fts_backend_build_hdr(struct fts_backend_build_context *ctx, uint32_t uid);
+/* Switch to building index for the next body part. If backend doesn't want
+   to index this body part (based on content type/disposition check), it can
+   return FALSE and caller will skip to next part. The backend must return
+   TRUE for all text/xxx and message/rfc822 content types.
+
+   The content_type contains a valid parsed "type/subtype" string. For messages
+   without (valid) Content-Type header, the content_type is set to "text/plain".
+   The content_disposition is passed without parsing/validation if it exists,
+   otherwise it's NULL. */
+bool fts_backend_build_body_begin(struct fts_backend_build_context *ctx,
+				  uint32_t uid, const char *content_type,
+				  const char *content_disposition);
+/* Called once when the whole body part has been sent. */
+void fts_backend_build_body_end(struct fts_backend_build_context *ctx);
+/* Add more content to the index for the currently selected header/body part.
+   The data must contain only full valid UTF-8 characters, but it doesn't need
+   to be NUL-terminated. size contains the data size in bytes, not characters.
+   This function may be called many times and the data block sizes may be
+   small. Backend returns 0 if ok, -1 if build should be aborted. */
+int fts_backend_build_more(struct fts_backend_build_context *ctx,
+			   const unsigned char *data, size_t size);
 /* Finish adding new data to the index. */
 int fts_backend_build_deinit(struct fts_backend_build_context **ctx);
 
@@ -57,14 +80,15 @@
 bool fts_backend_is_building(struct fts_backend *backend);
 
 /* Expunge given mail from the backend. Note that the transaction may still
-   fail later. */
+   fail later, so backend shouldn't do anything irreversible. */
 void fts_backend_expunge(struct fts_backend *backend, struct mail *mail);
 /* Called after transaction has been committed or rollbacked. */
 void fts_backend_expunge_finish(struct fts_backend *backend,
 				struct mailbox *box, bool committed);
 
 /* Lock/unlock the backend for multiple lookups. Returns 1 if locked, 0 if
-   locking timeouted, -1 if error.
+   locking timed out, -1 if error. If backend doesn't require locking, it
+   always returns 1.
 
    It's not required to call these functions manually, but if you're doing
    multiple lookup/filter operations this avoids multiple lock/unlock calls. */
@@ -74,10 +98,14 @@
 /* Start building a FTS lookup. */
 struct fts_backend_lookup_context *
 fts_backend_lookup_init(struct fts_backend *backend);
-/* Add a new search key to the lookup. */
+/* Add a new search key to the lookup. The keys are ANDed together. */
 void fts_backend_lookup_add(struct fts_backend_lookup_context *ctx,
 			    const char *key, enum fts_lookup_flags flags);
-/* Finish the lookup and return found UIDs. */
+/* Finish the lookup and return found UIDs. The definite_uids are returned
+   to client directly, while for maybe_uids Dovecot first verifies (by
+   opening and reading the mail) that they really do contain the searched
+   keys. The maybe_uids is useful with backends that can only filter out
+   messages, but can't definitively say if the search matched a message. */
 int fts_backend_lookup_deinit(struct fts_backend_lookup_context **ctx,
 			      ARRAY_TYPE(seq_range) *definite_uids,
 			      ARRAY_TYPE(seq_range) *maybe_uids,
--- a/src/plugins/fts/fts-storage.c	Fri May 14 17:41:34 2010 +0200
+++ b/src/plugins/fts/fts-storage.c	Mon May 17 18:06:57 2010 +0200
@@ -6,6 +6,7 @@
 #include "str.h"
 #include "istream.h"
 #include "time-util.h"
+#include "rfc822-parser.h"
 #include "message-parser.h"
 #include "message-decoder.h"
 #include "mail-namespace.h"
@@ -40,6 +41,7 @@
 
 	uint32_t uid;
 	string_t *headers;
+	char *content_type, *content_disposition;
 };
 
 struct fts_transaction_context {
@@ -77,20 +79,52 @@
 	if (str_len(ctx->headers) == 0)
 		return 0;
 
-	if (fts_backend_build_more(ctx->build, ctx->uid, str_data(ctx->headers),
-				   str_len(ctx->headers), TRUE) < 0)
+	fts_backend_build_hdr(ctx->build, ctx->uid);
+	if (fts_backend_build_more(ctx->build, str_data(ctx->headers),
+				   str_len(ctx->headers)) < 0)
 		return -1;
 
 	str_truncate(ctx->headers, 0);
 	return 0;
 }
 
-static bool fts_build_want_index_part(const struct message_block *block)
+static void fts_build_parse_content_type(struct fts_storage_build_context *ctx,
+					 const struct message_header_line *hdr)
 {
-	/* we'll index only text/xxx and message/rfc822 parts for now */
-	return (block->part->flags &
-		(MESSAGE_PART_FLAG_TEXT |
-		 MESSAGE_PART_FLAG_MESSAGE_RFC822)) != 0;
+	struct rfc822_parser_context parser;
+	string_t *content_type;
+
+	rfc822_parser_init(&parser, hdr->full_value, hdr->full_value_len, NULL);
+	(void)rfc822_skip_lwsp(&parser);
+
+	T_BEGIN {
+		content_type = t_str_new(64);
+		if (rfc822_parse_content_type(&parser, content_type) >= 0) {
+			i_free(ctx->content_type);
+			ctx->content_type = i_strdup(str_c(content_type));
+		}
+	} T_END;
+}
+
+static void
+fts_build_parse_content_disposition(struct fts_storage_build_context *ctx,
+				    const struct message_header_line *hdr)
+{
+	/* just pass it as-is to backend. */
+	i_free(ctx->content_disposition);
+	ctx->content_disposition =
+		i_strndup(hdr->full_value, hdr->full_value_len);
+}
+
+static void fts_parse_mail_header(struct fts_storage_build_context *ctx,
+				  const struct message_block *raw_block)
+{
+	const struct message_header_line *hdr = raw_block->hdr;
+	/* remember the headers that get passed to build_body_begin() */
+	if (strcasecmp(hdr->name, "Content-Type") == 0)
+		fts_build_parse_content_type(ctx, hdr);
+	else if (strcasecmp(hdr->name, "Content-Disposition") == 0)
+		fts_build_parse_content_disposition(ctx, hdr);
 }
 
 static void fts_build_mail_header(struct fts_storage_build_context *ctx,
@@ -114,6 +148,7 @@
 	struct message_decoder_context *decoder;
 	struct message_block raw_block, block;
 	struct message_part *prev_part, *parts;
+	bool skip_body = FALSE, body_part = FALSE;
 	int ret;
 
 	ctx->uid = uid;
@@ -125,7 +160,8 @@
 	parser = message_parser_init(pool_datastack_create(), input,
 				     MESSAGE_HEADER_PARSER_FLAG_CLEAN_ONELINE,
 				     0);
-	decoder = message_decoder_init(MESSAGE_DECODER_FLAG_DTCASE);
+	decoder = message_decoder_init(MESSAGE_DECODER_FLAG_DTCASE |
+				       MESSAGE_DECODER_FLAG_RETURN_BINARY);
 	for (;;) {
 		ret = message_parser_parse_next_block(parser, &raw_block);
 		i_assert(ret != 0);
@@ -134,30 +170,62 @@
 				ret = 0;
 			break;
 		}
-		if (raw_block.hdr == NULL && raw_block.size != 0 &&
-		    !fts_build_want_index_part(&raw_block)) {
-			/* skipping this body */
-			continue;
+
+		if (raw_block.part != prev_part) {
+			/* body part changed. we're now parsing the end of
+			   boundary, possibly followed by message epilogue */
+			if (!skip_body && prev_part != NULL) {
+				i_assert(body_part);
+				fts_backend_build_body_end(ctx->build);
+			}
+			prev_part = raw_block.part;
+			i_free_and_null(ctx->content_type);
+			i_free_and_null(ctx->content_disposition);
+
+			if (raw_block.size != 0) {
+				/* multipart. skip until beginning of next
+				   part's headers */
+				skip_body = TRUE;
+			}
+		}
+
+		if (raw_block.hdr != NULL) {
+			/* always handle headers */
+		} else if (raw_block.size == 0) {
+			/* end of headers */
+			const char *content_type = ctx->content_type == NULL ?
+				"text/plain" : ctx->content_type;
+
+			skip_body = !fts_backend_build_body_begin(ctx->build,
+					ctx->uid, content_type,
+					ctx->content_disposition);
+			body_part = TRUE;
+		} else {
+			if (skip_body)
+				continue;
 		}
 
 		if (!message_decoder_decode_next_block(decoder, &raw_block,
 						       &block))
 			continue;
 
-		if (block.hdr != NULL)
+		if (block.hdr != NULL) {
+			fts_parse_mail_header(ctx, &raw_block);
 			fts_build_mail_header(ctx, &block);
-		else if (block.size == 0) {
+		} else if (block.size == 0) {
 			/* end of headers */
 			str_append_c(ctx->headers, '\n');
 		} else {
-			if (fts_backend_build_more(ctx->build, ctx->uid,
-						   block.data, block.size,
-						   FALSE) < 0) {
+			i_assert(body_part);
+			if (fts_backend_build_more(ctx->build,
+						   block.data, block.size) < 0) {
 				ret = -1;
 				break;
 			}
 		}
 	}
+	if (!skip_body && body_part)
+		fts_backend_build_body_end(ctx->build);
 	if (message_parser_deinit(&parser, &parts) < 0)
 		mail_set_cache_corrupted(ctx->mail, MAIL_FETCH_MESSAGE_PARTS);
 	message_decoder_deinit(&decoder);
@@ -483,6 +551,8 @@
 
 	str_free(&ctx->headers);
 	mail_search_args_unref(&ctx->search_args);
+	i_free(ctx->content_type);
+	i_free(ctx->content_disposition);
 	i_free(ctx);
 	return ret;
 }