changeset 5341:acd4ed841b01 HEAD

Separated message headers and body in index building and searching. Added Lucene support for indexing and searching them separately.
author Timo Sirainen <tss@iki.fi>
date Fri, 16 Mar 2007 00:20:55 +0200
parents b7c4d7e2cc8c
children 4e13ca0f2f87
files src/plugins/fts-lucene/fts-backend-lucene.c src/plugins/fts-lucene/lucene-wrapper.cc src/plugins/fts-lucene/lucene-wrapper.h src/plugins/fts-squat/fts-backend-squat.c src/plugins/fts/fts-api-private.h src/plugins/fts/fts-api.c src/plugins/fts/fts-api.h src/plugins/fts/fts-storage.c
diffstat 8 files changed, 95 insertions(+), 49 deletions(-) [+]
line wrap: on
line diff
--- a/src/plugins/fts-lucene/fts-backend-lucene.c	Thu Mar 15 23:46:26 2007 +0200
+++ b/src/plugins/fts-lucene/fts-backend-lucene.c	Fri Mar 16 00:20:55 2007 +0200
@@ -109,7 +109,7 @@
 static int
 fts_backend_lucene_build_more(struct fts_backend_build_context *ctx,
 			      uint32_t uid, const unsigned char *data,
-			      size_t size)
+			      size_t size, bool headers)
 {
 	struct lucene_fts_backend *backend =
 		(struct lucene_fts_backend *)ctx->backend;
@@ -122,7 +122,7 @@
 
 	i_assert(backend->lstorage->selected_box == backend->box);
 	return lucene_index_build_more(backend->lstorage->index,
-				       uid, data, size);
+				       uid, data, size, headers);
 }
 
 static int
@@ -167,14 +167,16 @@
 }
 
 static int
-fts_backend_lucene_lookup(struct fts_backend *_backend, const char *key,
-			 ARRAY_TYPE(seq_range) *result)
+fts_backend_lucene_lookup(struct fts_backend *_backend,
+			  enum fts_lookup_flags flags,
+			  const char *key, ARRAY_TYPE(seq_range) *result)
 {
 	struct lucene_fts_backend *backend =
 		(struct lucene_fts_backend *)_backend;
 
 	fts_backend_select(backend);
-	return lucene_index_lookup(backend->lstorage->index, key, result);
+	return lucene_index_lookup(backend->lstorage->index,
+				   flags, key, result);
 }
 
 struct fts_backend fts_backend_lucene = {
--- a/src/plugins/fts-lucene/lucene-wrapper.cc	Thu Mar 15 23:46:26 2007 +0200
+++ b/src/plugins/fts-lucene/lucene-wrapper.cc	Fri Mar 16 00:20:55 2007 +0200
@@ -57,10 +57,12 @@
 public:
 	TokenStream *tokenStream(const TCHAR *fieldName,
 				 CL_NS(util)::Reader *reader) {
-		/* Everything except contents should go as-is without any
+		/* Everything except body/headers should go as-is without any
 		   modifications. Isn't there any easier way to do this than
 		   to implement a whole new RawTokenStream?.. */
-		if (fieldName != 0 && wcscmp(fieldName, L"contents") != 0)
+		if (fieldName != 0 &&
+		    wcscmp(fieldName, L"headers") != 0 &&
+		    wcscmp(fieldName, L"body") != 0)
 			return _CLNEW RawTokenStream(reader);
 
 		return standard::StandardAnalyzer::
@@ -282,7 +284,8 @@
 }
 
 int lucene_index_build_more(struct lucene_index *index, uint32_t uid,
-			    const unsigned char *data, size_t size)
+			    const unsigned char *data, size_t size,
+			    bool headers)
 {
 	unsigned int len;
 	char id[MAX_INT_STRLEN];
@@ -309,7 +312,10 @@
 		index->doc->add(*Field::Text(_T("box"), index->tmailbox_name));
 	}
 
-	index->doc->add(*Field::Text(_T("contents"), dest));
+	if (headers)
+		index->doc->add(*Field::Text(_T("headers"), dest));
+	else
+		index->doc->add(*Field::Text(_T("body"), dest));
 	return 0;
 }
 
@@ -411,12 +417,14 @@
 	}
 }
 
-int lucene_index_lookup(struct lucene_index *index, const char *key,
-			ARRAY_TYPE(seq_range) *result)
+int lucene_index_lookup(struct lucene_index *index, enum fts_lookup_flags flags,
+			const char *key, ARRAY_TYPE(seq_range) *result)
 {
 	const char *quoted_key;
 	int ret = 0;
 
+	i_assert((flags & (FTS_LOOKUP_FLAG_HEADERS|FTS_LOOKUP_FLAG_BODY)) != 0);
+
 	if (lucene_index_open_search(index) <= 0)
 		return -1;
 
@@ -429,15 +437,26 @@
 	lucene_utf8towcs(tkey, quoted_key, len + 1);
 	t_pop();
 
-	Query *content_query = NULL;
+	BooleanQuery lookup_query;
+	Query *content_query1 = NULL, *content_query2 = NULL;
 	try {
-		content_query = QueryParser::parse(tkey, _T("contents"),
-						   index->analyzer);
+		if ((flags & FTS_LOOKUP_FLAG_HEADERS) != 0) {
+			content_query1 = QueryParser::parse(tkey, _T("headers"),
+							    index->analyzer);
+			lookup_query.add(content_query1, false, false);
+		}
+		if ((flags & FTS_LOOKUP_FLAG_BODY) != 0) {
+			content_query2 = QueryParser::parse(tkey, _T("body"),
+							    index->analyzer);
+			lookup_query.add(content_query2, false, false);
+		}
 	} catch (CLuceneError &err) {
 		if (getenv("DEBUG") != NULL) {
 			i_info("lucene: QueryParser::parse(%s) failed: %s",
 			       str_sanitize(key, 40), err.what());
 		}
+		if (content_query1 != NULL)
+			_CLDELETE(content_query1);
 		lucene_index_close(index);
 		return -1;
 	}
@@ -445,7 +464,7 @@
 	BooleanQuery query;
 	Term mailbox_term(_T("box"), index->tmailbox_name);
 	TermQuery mailbox_query(&mailbox_term);
-	query.add(content_query, true, false);
+	query.add(&lookup_query, true, false);
 	query.add(&mailbox_query, true, false);
 
 	try {
@@ -469,6 +488,9 @@
 		ret = -1;
 	}
 
-	_CLDELETE(content_query);
+	if (content_query1 != NULL)
+		_CLDELETE(content_query1);
+	if (content_query2 != NULL)
+		_CLDELETE(content_query2);
 	return ret;
 }
--- a/src/plugins/fts-lucene/lucene-wrapper.h	Thu Mar 15 23:46:26 2007 +0200
+++ b/src/plugins/fts-lucene/lucene-wrapper.h	Fri Mar 16 00:20:55 2007 +0200
@@ -12,12 +12,13 @@
 
 int lucene_index_build_init(struct lucene_index *index, uint32_t *last_uid_r);
 int lucene_index_build_more(struct lucene_index *index, uint32_t uid,
-			    const unsigned char *data, size_t size);
+			    const unsigned char *data, size_t size,
+			    bool headers);
 int lucene_index_build_deinit(struct lucene_index *index);
 
 int lucene_index_expunge(struct lucene_index *index, uint32_t uid);
 
-int lucene_index_lookup(struct lucene_index *index, const char *key,
-			ARRAY_TYPE(seq_range) *result);
+int lucene_index_lookup(struct lucene_index *index, enum fts_lookup_flags flags,
+			const char *key, ARRAY_TYPE(seq_range) *result);
 
 #endif
--- a/src/plugins/fts-squat/fts-backend-squat.c	Thu Mar 15 23:46:26 2007 +0200
+++ b/src/plugins/fts-squat/fts-backend-squat.c	Fri Mar 16 00:20:55 2007 +0200
@@ -85,7 +85,7 @@
 static int
 fts_backend_squat_build_more(struct fts_backend_build_context *_ctx,
 			     uint32_t uid, const unsigned char *data,
-			     size_t size)
+			     size_t size, bool headers __attr_unused__)
 {
 	struct squat_fts_backend_build_context *ctx =
 		(struct squat_fts_backend_build_context *)_ctx;
@@ -179,8 +179,9 @@
 }
 
 static int
-fts_backend_squat_lookup(struct fts_backend *_backend, const char *key,
-			 ARRAY_TYPE(seq_range) *result)
+fts_backend_squat_lookup(struct fts_backend *_backend,
+			 enum fts_lookup_flags flags __attr_unused__,
+			 const char *key, ARRAY_TYPE(seq_range) *result)
 {
 	struct squat_fts_backend *backend =
 		(struct squat_fts_backend *)_backend;
@@ -189,8 +190,9 @@
 }
 
 static int
-fts_backend_squat_filter(struct fts_backend *_backend, const char *key,
-			 ARRAY_TYPE(seq_range) *result)
+fts_backend_squat_filter(struct fts_backend *_backend,
+			 enum fts_lookup_flags flags __attr_unused__,
+			 const char *key, ARRAY_TYPE(seq_range) *result)
 {
 	struct squat_fts_backend *backend =
 		(struct squat_fts_backend *)_backend;
--- a/src/plugins/fts/fts-api-private.h	Thu Mar 15 23:46:26 2007 +0200
+++ b/src/plugins/fts/fts-api-private.h	Fri Mar 16 00:20:55 2007 +0200
@@ -13,7 +13,7 @@
 		(*build_init)(struct fts_backend *backend,
 			      uint32_t *last_uid_r);
 	int (*build_more)(struct fts_backend_build_context *ctx, uint32_t uid,
-			  const unsigned char *data, size_t size);
+			  const unsigned char *data, size_t size, bool headers);
 	int (*build_deinit)(struct fts_backend_build_context *ctx);
 
 	void (*expunge)(struct fts_backend *backend, struct mail *mail);
@@ -23,10 +23,10 @@
 	int (*lock)(struct fts_backend *backend);
 	void (*unlock)(struct fts_backend *backend);
 
-	int (*lookup)(struct fts_backend *backend, const char *key,
-		      ARRAY_TYPE(seq_range) *result);
-	int (*filter)(struct fts_backend *backend, const char *key,
-		      ARRAY_TYPE(seq_range) *result);
+	int (*lookup)(struct fts_backend *backend, enum fts_lookup_flags flags,
+		      const char *key, ARRAY_TYPE(seq_range) *result);
+	int (*filter)(struct fts_backend *backend, enum fts_lookup_flags flags,
+		      const char *key, ARRAY_TYPE(seq_range) *result);
 };
 
 enum fts_backend_flags {
--- a/src/plugins/fts/fts-api.c	Thu Mar 15 23:46:26 2007 +0200
+++ b/src/plugins/fts/fts-api.c	Fri Mar 16 00:20:55 2007 +0200
@@ -67,9 +67,9 @@
 }
 
 int fts_backend_build_more(struct fts_backend_build_context *ctx, uint32_t uid,
-			   const unsigned char *data, size_t size)
+			   const unsigned char *data, size_t size, bool headers)
 {
-	return ctx->backend->v.build_more(ctx, uid, data, size);
+	return ctx->backend->v.build_more(ctx, uid, data, size, headers);
 }
 
 int fts_backend_build_deinit(struct fts_backend_build_context *ctx)
@@ -98,24 +98,24 @@
 	backend->v.unlock(backend);
 }
 
-int fts_backend_lookup(struct fts_backend *backend, const char *key,
-		       ARRAY_TYPE(seq_range) *result)
+int fts_backend_lookup(struct fts_backend *backend, enum fts_lookup_flags flags,
+		       const char *key, ARRAY_TYPE(seq_range) *result)
 {
-	return backend->v.lookup(backend, key, result);
+	return backend->v.lookup(backend, flags, key, result);
 }
 
-int fts_backend_filter(struct fts_backend *backend, const char *key,
-		       ARRAY_TYPE(seq_range) *result)
+int fts_backend_filter(struct fts_backend *backend, enum fts_lookup_flags flags,
+		       const char *key, ARRAY_TYPE(seq_range) *result)
 {
 	ARRAY_TYPE(seq_range) tmp_result;
 	int ret;
 
 	if (backend->v.filter != NULL)
-		return backend->v.filter(backend, key, result);
+		return backend->v.filter(backend, flags, key, result);
 
 	/* do this ourself */
 	i_array_init(&tmp_result, 64);
-	ret = fts_backend_lookup(backend, key, &tmp_result);
+	ret = fts_backend_lookup(backend, flags, key, &tmp_result);
 	if (ret == 0) {
 		const struct seq_range *range;
 		unsigned int i, count;
--- a/src/plugins/fts/fts-api.h	Thu Mar 15 23:46:26 2007 +0200
+++ b/src/plugins/fts/fts-api.h	Fri Mar 16 00:20:55 2007 +0200
@@ -6,6 +6,11 @@
 
 #include "seq-range-array.h"
 
+enum fts_lookup_flags {
+	FTS_LOOKUP_FLAG_HEADERS	= 0x01,
+	FTS_LOOKUP_FLAG_BODY	= 0x02
+};
+
 struct fts_backend *
 fts_backend_init(const char *backend_name, struct mailbox *box);
 void fts_backend_deinit(struct fts_backend *backend);
@@ -19,9 +24,11 @@
 fts_backend_build_init(struct fts_backend *backend, uint32_t *last_uid_r);
 /* Add more contents to the index. The data must contain only full valid
    UTF-8 characters, but it doesn't need to be NUL-terminated. size contains
-   the data size in bytes, not characters. */
+   the data size in bytes, not characters. headers is TRUE if the data contains
+   message headers instead of message body. */
 int fts_backend_build_more(struct fts_backend_build_context *ctx, uint32_t uid,
-			   const unsigned char *data, size_t size);
+			   const unsigned char *data, size_t size,
+			   bool headers);
 /* Finish adding new data to the index. */
 int fts_backend_build_deinit(struct fts_backend_build_context *ctx);
 
@@ -41,12 +48,12 @@
 void fts_backend_unlock(struct fts_backend *backend);
 
 /* Lookup key from the index and return the found UIDs in result. */
-int fts_backend_lookup(struct fts_backend *backend, const char *key,
-		       ARRAY_TYPE(seq_range) *result);
+int fts_backend_lookup(struct fts_backend *backend, enum fts_lookup_flags flags,
+		       const char *key, ARRAY_TYPE(seq_range) *result);
 /* Drop UIDs from the result list for which the key doesn't exist. The idea
    is that with multiple search keywords you first lookup one and then filter
    the rest. */
-int fts_backend_filter(struct fts_backend *backend, const char *key,
-		       ARRAY_TYPE(seq_range) *result);
+int fts_backend_filter(struct fts_backend *backend, enum fts_lookup_flags flags,
+		       const char *key, ARRAY_TYPE(seq_range) *result);
 
 #endif
--- a/src/plugins/fts/fts-storage.c	Thu Mar 15 23:46:26 2007 +0200
+++ b/src/plugins/fts/fts-storage.c	Fri Mar 16 00:20:55 2007 +0200
@@ -110,7 +110,7 @@
 		return 1;
 
 	if (fts_backend_build_more(ctx->build, ctx->uid, str_data(ctx->headers),
-				   str_len(ctx->headers)) < 0)
+				   str_len(ctx->headers), TRUE) < 0)
 		return -1;
 
 	str_truncate(ctx->headers, 0);
@@ -210,8 +210,8 @@
 			}
 		} else {
 			if (fts_backend_build_more(ctx->build, ctx->mail->uid,
-						   block.data,
-						   block.size) < 0) {
+						   block.data, block.size,
+						   FALSE) < 0) {
 				ret = -1;
 				break;
 			}
@@ -366,6 +366,7 @@
 				   ARRAY_TYPE(seq_range) *uid_result)
 {
 	const char *key;
+	enum fts_lookup_flags flags;
 
 	for (; args != NULL; args = args->next) {
 		switch (args->type) {
@@ -392,7 +393,11 @@
 				key = args->hdr_field_name;
 			}
 
-			if (fts_backend_filter(fctx->backend, key,
+			flags = FTS_LOOKUP_FLAG_BODY;
+			if (args->type == SEARCH_TEXT_FAST ||
+			    args->type == SEARCH_TEXT)
+				flags |= FTS_LOOKUP_FLAG_HEADERS;
+			if (fts_backend_filter(fctx->backend, flags, key,
 					       uid_result) < 0) {
 				/* failed, but we already have limited
 				   the search, so just ignore this */
@@ -420,6 +425,7 @@
 			    struct fts_search_context *fctx)
 {
 	struct fts_backend *backend = fctx->backend;
+	enum fts_lookup_flags flags;
 	const char *key;
 	ARRAY_TYPE(seq_range) uid_result;
 
@@ -433,11 +439,17 @@
 
 		/* we're only checking the existence
 		   of the header. */
+		flags = FTS_LOOKUP_FLAG_HEADERS;
 		key = fctx->best_arg->hdr_field_name;
+	} else {
+		flags = FTS_LOOKUP_FLAG_BODY;
+		if (fctx->best_arg->type == SEARCH_TEXT_FAST ||
+		    fctx->best_arg->type == SEARCH_TEXT)
+			flags |= FTS_LOOKUP_FLAG_HEADERS;
 	}
 
 	i_array_init(&uid_result, 64);
-	if (fts_backend_lookup(backend, key, &uid_result) < 0) {
+	if (fts_backend_lookup(backend, flags, key, &uid_result) < 0) {
 		/* failed, fallback to reading everything */
 		array_free(&uid_result);
 		return;