changeset 6905:8c779f3e16be HEAD

Fixes to FTS handling.
author Timo Sirainen <tss@iki.fi>
date Mon, 03 Dec 2007 11:25:49 +0200
parents 275d22eb25ba
children a201579e585d
files src/plugins/fts-squat/squat-trie-private.h src/plugins/fts-squat/squat-trie.c src/plugins/fts/fts-api.c src/plugins/fts/fts-storage.c src/plugins/fts/fts-storage.h
diffstat 5 files changed, 115 insertions(+), 91 deletions(-) [+]
line wrap: on
line diff
--- a/src/plugins/fts-squat/squat-trie-private.h	Mon Dec 03 11:02:44 2007 +0200
+++ b/src/plugins/fts-squat/squat-trie-private.h	Mon Dec 03 11:25:49 2007 +0200
@@ -20,6 +20,10 @@
 	uint32_t root_unused_uids;
 	uint32_t root_next_uid;
 	uint32_t root_uidlist_idx;
+
+	uint8_t partial_len;
+	uint8_t full_len;
+	uint8_t normalize_map[256];
 };
 
 /*
@@ -119,7 +123,7 @@
 	void *mmap_base;
 	size_t mmap_size;
 
-	unsigned char normalize_map[256];
+	unsigned char default_normalize_map[256];
 
 	unsigned int corrupted:1;
 };
--- a/src/plugins/fts-squat/squat-trie.c	Mon Dec 03 11:02:44 2007 +0200
+++ b/src/plugins/fts-squat/squat-trie.c	Mon Dec 03 11:25:49 2007 +0200
@@ -17,10 +17,12 @@
 #include <time.h>
 #include <sys/mman.h>
 
+#define DEFAULT_NORMALIZE_MAP_CHARS \
+	"EOTIRSACDNLMVUGPHBFWYXKJQZ0123456789@.-+#$%_&"
+#define DEFAULT_PARTIAL_LEN 4
+#define DEFAULT_FULL_LEN 4
+
 #define MAX_FAST_LEVEL 3
-#define MAX_PARTIAL_LEN 4
-#define MAX_FULL_LEN 4
-
 #define SEQUENTIAL_COUNT 46
 
 struct squat_trie_build_context {
@@ -62,33 +64,34 @@
 static void squat_trie_normalize_map_build(struct squat_trie *trie)
 {
 	static unsigned char valid_chars[] =
-		"EOTIRSACDNLMVUGPHBFWYXKJQZ0123456789@.-+#$%_&";
+		DEFAULT_NORMALIZE_MAP_CHARS;
 	unsigned int i, j;
 
-	memset(trie->normalize_map, 0, sizeof(trie->normalize_map));
+	memset(trie->default_normalize_map, 0,
+	       sizeof(trie->default_normalize_map));
 
 #if 1
 	for (i = 0, j = 1; i < sizeof(valid_chars)-1; i++) {
 		unsigned char chr = valid_chars[i];
 
 		if (chr >= 'A' && chr <= 'Z')
-			trie->normalize_map[chr-'A'+'a'] = j;
-		trie->normalize_map[chr] = j++;
+			trie->default_normalize_map[chr-'A'+'a'] = j;
+		trie->default_normalize_map[chr] = j++;
 	}
 	i_assert(j <= SEQUENTIAL_COUNT);
 
 	for (i = 128; i < 256; i++)
-		trie->normalize_map[i] = j++;
+		trie->default_normalize_map[i] = j++;
 #else
 	for (i = 0; i < sizeof(valid_chars)-1; i++) {
 		unsigned char chr = valid_chars[i];
 
 		if (chr >= 'A' && chr <= 'Z')
-			trie->normalize_map[chr-'A'+'a'] = chr;
-		trie->normalize_map[chr] = chr;
+			trie->default_normalize_map[chr-'A'+'a'] = chr;
+		trie->default_normalize_map[chr] = chr;
 	}
 	for (i = 128; i < 256; i++)
-		trie->normalize_map[i] = i_toupper(i);
+		trie->default_normalize_map[i] = i_toupper(i);
 #endif
 }
 
@@ -113,6 +116,7 @@
 
 static void squat_trie_clear(struct squat_trie *trie)
 {
+	trie->corrupted = FALSE;
 	node_free(trie, &trie->root);
 	memset(&trie->root, 0, sizeof(trie->root));
 	memset(&trie->hdr, 0, sizeof(trie->hdr));
@@ -166,6 +170,13 @@
 	trie->hdr.version = SQUAT_TRIE_VERSION;
 	trie->hdr.indexid = ioloop_time;
 	trie->hdr.uidvalidity = trie->uidvalidity;
+	trie->hdr.partial_len = DEFAULT_PARTIAL_LEN;
+	trie->hdr.full_len = DEFAULT_FULL_LEN;
+
+	i_assert(sizeof(trie->hdr.normalize_map) ==
+		 sizeof(trie->default_normalize_map));
+	memcpy(trie->hdr.normalize_map, trie->default_normalize_map,
+	       sizeof(trie->hdr.normalize_map));
 }
 
 static int squat_trie_open_fd(struct squat_trie *trie)
@@ -694,16 +705,15 @@
 squat_build_word(struct squat_trie *trie, uint32_t uid,
 		 const unsigned char *data, unsigned int size)
 {
-#if MAX_PARTIAL_LEN > 0
 	unsigned int i;
 
 	for (i = size - 1; i > 0; i--) {
 		if (squat_build_add(trie, uid, data + i,
-				    I_MIN(MAX_PARTIAL_LEN, size - i)) < 0)
+				    I_MIN(trie->hdr.partial_len, size - i)) < 0)
 			return -1;
 	}
-#endif
-	return squat_build_add(trie, uid, data, I_MIN(size, MAX_FULL_LEN));
+	return squat_build_add(trie, uid, data,
+			       I_MIN(size, trie->hdr.full_len));
 }
 
 static unsigned char *
@@ -715,7 +725,7 @@
 
 	dest = t_malloc(size);
 	for (i = 0; i < size; i++)
-		dest[i] = trie->normalize_map[data[i]];
+		dest[i] = trie->hdr.normalize_map[data[i]];
 	return dest;
 }
 
@@ -862,7 +872,7 @@
 	ctx = i_new(struct squat_trie_iterate_context, 1);
 	ctx->trie = trie;
 	ctx->cur.node = &trie->root;
-	i_array_init(&ctx->parents, MAX_FULL_LEN*2);
+	i_array_init(&ctx->parents, trie->hdr.partial_len*2);
 	return ctx;
 }
 
@@ -981,6 +991,23 @@
 	return squat_uidlist_rebuild_finish(ctx, ret < 0);
 }
 
+static bool squat_trie_check_header(struct squat_trie *trie)
+{
+	if (trie->hdr.version != SQUAT_TRIE_VERSION ||
+	    trie->hdr.uidvalidity != trie->uidvalidity)
+		return FALSE;
+
+	if (trie->hdr.partial_len > trie->hdr.full_len) {
+		i_error("Corrupted %s: partial len > full len", trie->path);
+		return FALSE;
+	}
+	if (trie->hdr.full_len == 0) {
+		i_error("Corrupted %s: full len=0", trie->path);
+		return FALSE;
+	}
+	return TRUE;
+}
+
 static int squat_trie_map_header(struct squat_trie *trie)
 {
 	if (trie->locked_file_size == 0) {
@@ -1008,8 +1035,7 @@
 
 	if (trie->hdr.root_offset == 0)
 		return 0;
-	if (trie->hdr.version != SQUAT_TRIE_VERSION ||
-	    trie->hdr.uidvalidity != trie->uidvalidity) {
+	if (!squat_trie_check_header(trie)) {
 		squat_trie_delete(trie);
 		squat_trie_close(trie);
 		squat_trie_header_init(trie);
@@ -1082,7 +1108,7 @@
 	ctx->trie = trie;
 	ctx->first_uid = trie->root.next_uid;
 
-	*last_uid_r = I_MAX(trie->root.next_uid/2, 1) - 1;
+	*last_uid_r = I_MAX((trie->root.next_uid+1)/2, 1) - 1;
 	*ctx_r = ctx;
 	return 0;
 }
@@ -1131,6 +1157,8 @@
 	ret = squat_write_nodes(ctx);
 	ctx->output = NULL;
 
+	if (trie->corrupted)
+		ret = -1;
 	if (ret == 0) {
 		trie->hdr.used_file_size = output->offset;
 		o_stream_seek(output, 0);
@@ -1194,7 +1222,7 @@
 			return -1;
 	}
 
-	*last_uid_r = I_MAX(trie->root.next_uid/2, 1) - 1;
+	*last_uid_r = I_MAX((trie->root.next_uid+1)/2, 1) - 1;
 	return 0;
 }
 
@@ -1334,10 +1362,10 @@
 	int ret;
 
 	do {
-		if (size <= MAX_PARTIAL_LEN)
+		if (size <= ctx->trie->hdr.partial_len)
 			block_len = size;
 		else
-			block_len = MAX_PARTIAL_LEN;
+			block_len = ctx->trie->hdr.partial_len;
 		block = data + size - block_len;
 
 		ret = squat_trie_lookup_data(ctx->trie, block, block_len,
@@ -1357,10 +1385,30 @@
 			seq_range_array_remove_invert_range(ctx->maybe_uids,
 							    &ctx->tmp_uids2);
 		}
-	} while (--size >= MAX_PARTIAL_LEN);
+	} while (--size >= ctx->trie->hdr.partial_len);
 	return 1;
 }
 
+static void squat_trie_add_unknown(struct squat_trie *trie,
+				   ARRAY_TYPE(seq_range) *maybe_uids)
+{
+	struct seq_range *range, new_range;
+	unsigned int count;
+	uint32_t last_uid;
+
+	last_uid = I_MAX((trie->root.next_uid+1)/2, 1) - 1;
+
+	range = array_get_modifiable(maybe_uids, &count);
+	if (count > 0 && range[count-1].seq2 == last_uid) {
+		/* increase the range */
+		range[count-1].seq2 = (uint32_t)-1;
+	} else {
+		new_range.seq1 = last_uid + 1;
+		new_range.seq2 = (uint32_t)-1;
+		array_append(maybe_uids, &new_range, 1);
+	}
+}
+
 int squat_trie_lookup(struct squat_trie *trie, const char *str,
 		      enum squat_index_type type,
 		      ARRAY_TYPE(seq_range) *definite_uids,
@@ -1407,11 +1455,12 @@
 							i - start);
 		}
 		t_pop();
-		return ret < 0 ? -1 :
-			(array_count(maybe_uids) > 0 ? 1 : 0);
+		squat_trie_add_unknown(trie, maybe_uids);
+		return ret < 0 ? -1 : 0;
 	}
 
-	if (MAX_FULL_LEN > MAX_PARTIAL_LEN || size <= MAX_PARTIAL_LEN) {
+	if (size <= trie->hdr.partial_len ||
+	    trie->hdr.full_len > trie->hdr.partial_len) {
 		ret = squat_trie_lookup_data(trie, data, size, &ctx.tmp_uids);
 		if (ret > 0) {
 			squat_trie_filter_type(type, &ctx.tmp_uids,
@@ -1421,7 +1470,7 @@
 		array_clear(definite_uids);
 	}
 
-	if (size <= MAX_PARTIAL_LEN || MAX_PARTIAL_LEN == 0) {
+	if (size <= trie->hdr.partial_len || trie->hdr.partial_len == 0) {
 		/* we have the result */
 		array_clear(maybe_uids);
 	} else {
@@ -1429,9 +1478,8 @@
 						i - start);
 	}
 	t_pop();
-	return ret < 0 ? -1 :
-		(array_count(maybe_uids) > 0 ||
-		 array_count(definite_uids) > 0 ? 1 : 0);
+	squat_trie_add_unknown(trie, maybe_uids);
+	return ret < 0 ? -1 : 0;
 }
 
 struct squat_uidlist *squat_trie_get_uidlist(struct squat_trie *trie)
--- a/src/plugins/fts/fts-api.c	Mon Dec 03 11:02:44 2007 +0200
+++ b/src/plugins/fts/fts-api.c	Mon Dec 03 11:25:49 2007 +0200
@@ -144,16 +144,8 @@
 
 	ret = backend->v.lookup(backend, key, flags & ~FTS_LOOKUP_FLAG_INVERT,
 				definite_uids, maybe_uids);
-	if (ret <= 0) {
-		if (unlikely(ret < 0))
-			return -1;
-		i_assert(array_count(definite_uids) == 0 &&
-			 array_count(maybe_uids) == 0);
-	} else {
-		i_assert(array_count(definite_uids) > 0 ||
-			 array_count(maybe_uids) > 0);
-	}
-
+	if (unlikely(ret < 0))
+		return -1;
 	if ((flags & FTS_LOOKUP_FLAG_INVERT) != 0)
 		fts_lookup_invert(definite_uids, maybe_uids);
 	return 0;
@@ -165,38 +157,35 @@
 		  const ARRAY_TYPE(seq_range) *src_maybe,
 		  const ARRAY_TYPE(seq_range) *src_definite)
 {
-	const struct seq_range *dest, *src;
-	unsigned int i, dest_count, src_count;
-	uint32_t seq, seq2;
-	bool removals;
+	ARRAY_TYPE(seq_range) src_unwanted;
+	const struct seq_range *range;
+	struct seq_range new_range;
+	unsigned int i, count;
+	uint32_t seq;
 
 	/* add/leave to dest_maybe if at least one list has maybe,
 	   and no lists have none */
-	dest = array_get(dest_maybe, &dest_count);
-	src = array_get(dest_maybe, &src_count);
+
+	t_push();
+	/* create unwanted sequences list from both sources */
+	t_array_init(&src_unwanted, 128);
+	new_range.seq1 = 0; new_range.seq2 = (uint32_t)-1;
+	array_append(&src_unwanted, &new_range, 1);
+	seq_range_array_remove_seq_range(&src_unwanted, src_maybe);
+	seq_range_array_remove_seq_range(&src_unwanted, src_definite);
 
 	/* drop unwanted uids */
-	for (i = 0; i < dest_count; ) {
-		seq2 = dest[i].seq2;
-		removals = FALSE;
-		for (seq = dest[i].seq1; seq <= seq2; seq++) {
-			if (!seq_range_exists(src_definite, seq) &&
-			    !seq_range_exists(src_maybe, seq)) {
-				if (seq_range_array_remove(dest_maybe, seq))
-					removals = TRUE;
-			}
-		}
-		if (!removals)
-			i++;
-	}
+	seq_range_array_remove_seq_range(dest_maybe, &src_unwanted);
 
-	/* add new uids */
-	for (i = 0; i < src_count; i++) {
-		for (seq = src[i].seq1; seq <= src[i].seq2; seq++) {
-			if (seq_range_exists(dest_definite, seq))
+	/* add uids that are in dest_definite and src_maybe lists */
+	range = array_get(dest_definite, &count);
+	for (i = 0; i < count; i++) {
+		for (seq = range[i].seq1; seq <= range[i].seq2; seq++) {
+			if (seq_range_exists(src_maybe, seq))
 				seq_range_array_add(dest_maybe, 0, seq);
 		}
 	}
+	t_pop();
 }
 
 int fts_backend_filter(struct fts_backend *backend, const char *key,
--- a/src/plugins/fts/fts-storage.c	Mon Dec 03 11:02:44 2007 +0200
+++ b/src/plugins/fts/fts-storage.c	Mon Dec 03 11:25:49 2007 +0200
@@ -67,13 +67,14 @@
 	return ret;
 }
 
-static int fts_build_mail_flush_headers(struct fts_storage_build_context *ctx)
+static int fts_build_mail_flush_headers(struct fts_storage_build_context *ctx,
+					bool root)
 {
 	if (str_len(ctx->headers) == 0)
 		return 0;
 
 	if (fts_backend_build_more(ctx->build, ctx->uid, str_data(ctx->headers),
-				   str_len(ctx->headers), TRUE) < 0)
+				   str_len(ctx->headers), root) < 0)
 		return -1;
 
 	str_truncate(ctx->headers, 0);
@@ -143,7 +144,8 @@
 			fts_build_mail_header(ctx, &block);
 		else if (block.size == 0) {
 			/* end of headers */
-			ret = fts_build_mail_flush_headers(ctx);
+			bool root = raw_block.part->parent == NULL;
+			ret = fts_build_mail_flush_headers(ctx, root);
 			if (ret < 0)
 				break;
 		} else {
@@ -179,8 +181,6 @@
 		/* no new messages */
 		return 0;
 	}
-	fctx->first_nonindexed_seq = seqset.seq1;
-
 	if (fctx->best_arg->type == SEARCH_HEADER) {
 		/* we're not updating the index just for header lookups */
 		return 0;
@@ -404,24 +404,6 @@
 	}
 }
 
-static int
-search_next_update_seq_finish(struct mail_search_context *ctx,
-			      struct fts_search_context *fctx)
-{
-	struct fts_mailbox *fbox = FTS_CONTEXT(ctx->transaction->box);
-
-	if (fctx->first_nonindexed_seq == 0) {
-		/* everything was indexed. we're done */
-		return 0;
-	}
-	if (ctx->seq < fctx->first_nonindexed_seq) {
-		/* scan the non-indexed messages */
-		fctx->seqs_set = FALSE;
-		ctx->seq = fctx->first_nonindexed_seq - 1;
-	}
-	return fbox->module_ctx.super.search_next_update_seq(ctx);
-}
-
 static int fts_mailbox_search_next_update_seq(struct mail_search_context *ctx)
 {
 	struct fts_mailbox *fbox = FTS_CONTEXT(ctx->transaction->box);
@@ -451,8 +433,10 @@
 
 		/* use whichever is lower of definite/maybe */
 		if (fctx->definite_idx == def_count) {
-			if (fctx->maybe_idx == maybe_count)
-				return search_next_update_seq_finish(ctx, fctx);
+			if (fctx->maybe_idx == maybe_count) {
+				/* we're finished */
+				return 0;
+			}
 			use_maybe = TRUE;
 		} else if (fctx->maybe_idx == maybe_count) {
 			use_maybe = FALSE;
--- a/src/plugins/fts/fts-storage.h	Mon Dec 03 11:02:44 2007 +0200
+++ b/src/plugins/fts/fts-storage.h	Mon Dec 03 11:25:49 2007 +0200
@@ -20,7 +20,6 @@
 
 	ARRAY_TYPE(seq_range) definite_seqs, maybe_seqs;
 	unsigned int definite_idx, maybe_idx;
-	uint32_t first_nonindexed_seq;
 
 	struct fts_backend *build_backend;
 	struct fts_storage_build_context *build_ctx;