changeset 7952:04a3be30e5a6 HEAD

Thread index fixes and optimizations.
author Timo Sirainen <tss@iki.fi>
date Thu, 26 Jun 2008 19:11:56 +0300
parents 5e3b995372d4
children 01c23befce4f
files doc/thread-refs.txt src/lib-index/mail-hash.c src/lib-index/mail-hash.h src/lib-storage/index/Makefile.am src/lib-storage/index/index-thread-finish.c src/lib-storage/index/index-thread-links.c src/lib-storage/index/index-thread-list.c src/lib-storage/index/index-thread-private.h src/lib-storage/index/index-thread.c
diffstat 9 files changed, 745 insertions(+), 193 deletions(-) [+]
line wrap: on
line diff
--- a/doc/thread-refs.txt	Thu Jun 26 08:49:02 2008 +0300
+++ b/doc/thread-refs.txt	Thu Jun 26 19:11:56 2008 +0300
@@ -62,17 +62,18 @@
   parent: Pointer to parent node. Children pointers aren't required.
   uid: Message's UID (0 = expunged or never even existed)
 
-  link_refcount: Number of messages referencing "this node" -> "parent node"
-    (e.g. "References: a b" would increase this message's reference as well as
-    b's reference, regardless of how the links are really added to the thread
-    tree).
+  parent_link_refcount: Number of messages containing "this message" -> "parent
+    message" link, i.e. "number of links to parent node". However since parents
+    can change, not all of these references might be from our current child
+    nodes. When this refcount reaches 0, it means we must detach from our
+    parent.
   expunge_rebuilds: If this message gets expunged, rebuild the thread tree.
   parent_unref_rebuilds: If a link between this node and its child gets
     unreferenced, rebuild the thread tree.
 }
 
 link_reference(parent_node, child_node)
-  parent_node.link_refcount++
+  child_node.parent_link_refcount++
   if is_ancestor(child_node, parent_node)
     // child_node is an ancestor of parent_node. Adding child_node ->
     // parent_node would introduce a loop. If any messages referencing the
@@ -144,8 +145,8 @@
   if parent.parent_unref_rebuilds
     return false
 
-  child.link_refcount--
-  if child.link_refcount == 0
+  child.parent_link_refcount--
+  if child.parent_link_refcount == 0
     child.parent = NIL
   return true  
 
--- a/src/lib-index/mail-hash.c	Thu Jun 26 08:49:02 2008 +0300
+++ b/src/lib-index/mail-hash.c	Thu Jun 26 19:11:56 2008 +0300
@@ -232,6 +232,8 @@
 
 	hdr->hash_size = I_MAX(primes_closest(hash_size), MAIL_HASH_MIN_SIZE);
 	hdr->record_count = 1; /* [0] always exists */
+	hdr->reset_counter = hash->hdr == NULL ? ioloop_time :
+		hash->hdr->reset_counter + 1;
 }
 
 static bool mail_hash_check_header(struct mail_hash *hash,
@@ -603,6 +605,11 @@
 	hash->lock_type = F_UNLCK;
 }
 
+int mail_hash_get_lock_type(struct mail_hash *hash)
+{
+	return hash->lock_type;
+}
+
 int mail_hash_create_excl_locked(struct mail_hash *hash)
 {
 	bool created;
@@ -696,6 +703,7 @@
 	int fd = hash->fd;
 
 	i_assert(hash->lock_type == F_WRLCK);
+	i_assert(trans->hdr.hashed_count <= trans->hdr.record_count);
 
 	if (trans->failed)
 		return -1;
@@ -726,7 +734,8 @@
 		trans->base_count * trans->hdr.record_size;
 	if (fd == -1) {
 		temp_path = t_strconcat(hash->filepath, ".tmp", NULL);
-		fd = open(temp_path, O_RDWR | O_CREAT | O_TRUNC, 0600);
+		fd = open(temp_path, O_RDWR | O_CREAT | O_TRUNC,
+			  hash->index->mode);
 		if (fd == -1) {
 			if (ENOSPACE(errno)) {
 				hash->index->nodiskspace = TRUE;
@@ -807,6 +816,17 @@
 	return trans->failed;
 }
 
+bool mail_hash_transaction_is_in_memory(struct mail_hash_transaction *trans)
+{
+	return trans->hash->in_memory;
+}
+
+struct mail_hash *
+mail_hash_transaction_get_hash(struct mail_hash_transaction *trans)
+{
+	return trans->hash;
+}
+
 void mail_hash_transaction_set_corrupted(struct mail_hash_transaction *trans,
 					 const char *error)
 {
@@ -966,6 +986,11 @@
 	if (trans->hdr.first_hole_idx != 0) {
 		/* allocate the record from the first hole */
 		idx = trans->hdr.first_hole_idx;
+		if (idx >= trans->hdr.record_count) {
+			mail_hash_transaction_set_corrupted(trans,
+				"Corrupted first_hole_idx");
+			idx = 0;
+		}
 		rec = mail_hash_idx(trans, idx);
 
 		memcpy(&trans->hdr.first_hole_idx, rec + 1,
@@ -1104,8 +1129,8 @@
 						"Too low hashed_count");
 			return;
 		}
-		trans->hdr.hashed_count--;
 	}
 
+	trans->hdr.hashed_count--;
 	mail_hash_remove_idx(trans, idx);
 }
--- a/src/lib-index/mail-hash.h	Thu Jun 26 08:49:02 2008 +0300
+++ b/src/lib-index/mail-hash.h	Thu Jun 26 19:11:56 2008 +0300
@@ -47,6 +47,8 @@
 	uint32_t last_uid;
 	/* Number of message records (records with non-zero UID) */
 	uint32_t message_count;
+	/* Increased every time the hash is reset */
+	uint32_t reset_counter;
 };
 
 struct mail_hash_record {
@@ -95,6 +97,8 @@
 int mail_hash_lock_exclusive(struct mail_hash *hash,
 			     enum mail_hash_lock_flags flags);
 void mail_hash_unlock(struct mail_hash *hash);
+/* Returns the current locking state (F_UNLCK, F_RDLCK, F_WRLCK) */
+int mail_hash_get_lock_type(struct mail_hash *hash);
 
 struct mail_hash_transaction *
 mail_hash_transaction_begin(struct mail_hash *hash, unsigned int min_hash_size);
@@ -103,6 +107,12 @@
 /* Returns TRUE if transaction is in broken state because of an earlier
    I/O error or detected file corruption. */
 bool mail_hash_transaction_is_broken(struct mail_hash_transaction *trans);
+/* Returns TRUE if hash is currently being updated in memory. */
+bool mail_hash_transaction_is_in_memory(struct mail_hash_transaction *trans);
+
+/* Returns the hash structure of the transaction. */
+struct mail_hash *
+mail_hash_transaction_get_hash(struct mail_hash_transaction *trans);
 
 /* Clear the entire hash file's contents. */
 void mail_hash_reset(struct mail_hash_transaction *trans);
--- a/src/lib-storage/index/Makefile.am	Thu Jun 26 08:49:02 2008 +0300
+++ b/src/lib-storage/index/Makefile.am	Thu Jun 26 19:11:56 2008 +0300
@@ -26,6 +26,7 @@
 	index-thread.c \
 	index-thread-finish.c \
 	index-thread-links.c \
+	index-thread-list.c \
 	index-transaction.c
 
 headers = \
--- a/src/lib-storage/index/index-thread-finish.c	Thu Jun 26 08:49:02 2008 +0300
+++ b/src/lib-storage/index/index-thread-finish.c	Thu Jun 26 19:11:56 2008 +0300
@@ -37,6 +37,7 @@
 
 	struct mail *tmp_mail;
 	struct mail_hash_transaction *hash_trans;
+	struct mail_thread_list_context *hash_list_ctx;
 
 	ARRAY_DEFINE(roots, struct mail_thread_root_node);
 	ARRAY_DEFINE(shadow_nodes, struct mail_thread_shadow_node);
@@ -127,16 +128,24 @@
 	return 0;
 }
 
+static uint32_t
+thread_lookup_existing(struct thread_finish_context *ctx, uint32_t idx)
+{
+	const struct mail_thread_node *node;
+
+	node = mail_hash_lookup_idx(ctx->hash_trans, idx);
+	i_assert(MAIL_INDEX_NODE_EXISTS(node));
+	i_assert(node->uid_or_id != 0);
+	return node->uid_or_id;
+}
+
 static int
 thread_child_node_fill(struct thread_finish_context *ctx,
 		       struct mail_thread_child_node *child)
 {
-	const struct mail_thread_node *node;
 	int tz;
 
-	node = mail_hash_lookup_idx(ctx->hash_trans, child->idx);
-	i_assert(node->uid != 0 && node->exists);
-	child->uid = node->uid;
+	child->uid = thread_lookup_existing(ctx, child->idx);
 
 	if (!mail_set_uid(ctx->tmp_mail, child->uid)) {
 		/* the UID should have existed. we would have rebuild
@@ -164,7 +173,6 @@
 		     ARRAY_TYPE(mail_thread_child_node) *sorted_children)
 {
 	const struct mail_thread_shadow_node *shadows;
-	const struct mail_thread_node *node;
 	struct mail_thread_child_node child, *children;
 	unsigned int count;
 
@@ -177,9 +185,7 @@
 	i_assert(child.idx != 0);
 	if (shadows[child.idx].next_sibling_idx == 0) {
 		/* only child - don't bother setting sort date */
-		node = mail_hash_lookup_idx(ctx->hash_trans, child.idx);
-		i_assert(node->uid != 0 && node->exists);
-		child.uid = node->uid;
+		child.uid = thread_lookup_existing(ctx, child.idx);
 
 		array_append(sorted_children, &child, 1);
 		return 0;
@@ -202,12 +208,11 @@
 {
 	struct subject_gather_context gather_ctx;
 	struct mail_thread_root_node *roots;
-	const struct mail_thread_node *node;
 	const char *subject;
 	unsigned int i, count;
 	ARRAY_TYPE(mail_thread_child_node) sorted_children;
 	const struct mail_thread_child_node *children;
-	uint32_t idx;
+	uint32_t idx, uid;
 	int ret = 0;
 
 	memset(&gather_ctx, 0, sizeof(gather_ctx));
@@ -240,10 +245,8 @@
 			continue;
 		}
 
-		node = mail_hash_lookup_idx(ctx->hash_trans, idx);
-		i_assert(node->uid != 0 && node->exists);
-
-		if (!mail_set_uid(ctx->tmp_mail, node->uid)) {
+		uid = thread_lookup_existing(ctx, idx);
+		if (!mail_set_uid(ctx->tmp_mail, uid)) {
 			/* the UID should have existed. we would have rebuild
 			   the thread tree otherwise. */
 			mail_hash_transaction_set_corrupted(ctx->hash_trans,
@@ -443,7 +446,8 @@
 	roots = array_get_modifiable(&ctx->roots, &root_count);
 	for (idx = 1; idx < record_count; idx++) {
 		node = mail_hash_lookup_idx(ctx->hash_trans, idx);
-		if (MAIL_HASH_RECORD_IS_DELETED(&node->rec) || !node->exists)
+		if (MAIL_HASH_RECORD_IS_DELETED(&node->rec) ||
+		    !MAIL_INDEX_NODE_EXISTS(node))
 			continue;
 
 		child.idx = idx;
@@ -467,14 +471,17 @@
 	return 0;
 }
 
-static void mail_thread_create_shadows(struct thread_finish_context *ctx,
-				       uint32_t record_count)
+static int mail_thread_create_shadows(struct thread_finish_context *ctx,
+				      uint32_t record_count)
 {
+	struct mail_thread_list_update_context *thread_list_ctx;
 	struct mail_thread_node *node, *parent;
 	struct mail_thread_root_node root;
 	struct mail_thread_child_node child;
+	struct mail_hash *hash;
 	uint8_t *referenced;
 	uint32_t idx, i, j, parent_idx, alloc_size, max;
+	int ret = 0;
 
 	ctx->use_sent_date = FALSE;
 
@@ -485,26 +492,42 @@
 	referenced = i_new(uint8_t, alloc_size);
 	for (idx = 1; idx < record_count; idx++) {
 		node = mail_hash_lookup_idx(ctx->hash_trans, idx);
-		if (MAIL_HASH_RECORD_IS_DELETED(&node->rec))
+		if (MAIL_HASH_RECORD_IS_DELETED(&node->rec)) {
+			/* @UNSAFE: mark deleted records as referenced, so we
+			   don't waste time with them */
+			referenced[idx / 8] |= 1 << (idx % 8);
 			continue;
+		}
 
-		if (node->exists) {
+		if (MAIL_INDEX_NODE_EXISTS(node)) {
 			/* @UNSAFE: don't remove existing nodes */
 			referenced[idx / 8] |= 1 << (idx % 8);
 		}
 
 		if (node->parent_idx == 0) {
+			/* root node - add to roots list */
 			root.node.idx = idx;
-			root.node.uid = node->uid;
-			root.dummy = !node->exists;
+			if (!MAIL_INDEX_NODE_EXISTS(node)) {
+				root.dummy = TRUE;
+				root.node.uid = 0;
+			} else {
+				root.dummy = FALSE;
+				root.node.uid = node->uid_or_id;
+			}
 			array_append(&ctx->roots, &root, 1);
 			continue;
 		}
+		if (node->parent_idx >= record_count) {
+			mail_hash_transaction_set_corrupted(ctx->hash_trans,
+				"parent_idx too large");
+			ret = -1;
+			break;
+		}
 
 		/* @UNSAFE: keep track of referenced nodes */
 		referenced[node->parent_idx / 8] |= 1 << (node->parent_idx % 8);
 
-		if (!node->exists) {
+		if (!MAIL_INDEX_NODE_EXISTS(node)) {
 			/* dummy node */
 			continue;
 		}
@@ -514,7 +537,8 @@
 		   parents, add it as the highest dummy's child. */
 		parent_idx = node->parent_idx;
 		parent = mail_hash_lookup_idx(ctx->hash_trans, parent_idx);
-		while (!parent->exists && parent->parent_idx != 0) {
+		while (!MAIL_INDEX_NODE_EXISTS(parent) &&
+		       parent->parent_idx != 0) {
 			parent_idx = parent->parent_idx;
 			parent = mail_hash_lookup_idx(ctx->hash_trans,
 						      parent_idx);
@@ -523,22 +547,40 @@
 	}
 
 	/* remove unneeded records from hash */
-	for (i = 1; i < alloc_size; i++) {
+	hash = mail_hash_transaction_get_hash(ctx->hash_trans);
+	if (ret < 0 || mail_hash_get_lock_type(hash) != F_WRLCK) {
+		/* thread index isn't locked, can't delete anything */
+		i_free(referenced);
+		return ret;
+	}
+
+	/* FIXME: we could remove everything from thread list by pointing to
+	   existing messages' References: headers */
+	thread_list_ctx = mail_thread_list_update_begin(ctx->hash_list_ctx,
+							ctx->hash_trans);
+	for (i = 0; i < alloc_size; i++) {
 		if (referenced[i] == 0xff)
 			continue;
 
+		j = i == 0 ? 1 : 0;
 		max = i+1 < alloc_size ? 8 : (record_count % 8);
-		for (j = 0; j < max; j++) {
+		for (; j < max; j++) {
 			if ((referenced[i] & (1 << j)) != 0)
 				continue;
 
 			idx = i*8 + j;
 			node = mail_hash_lookup_idx(ctx->hash_trans, idx);
+
+			if (node->ref_index == MAIL_INDEX_NODE_REF_EXT) {
+				mail_thread_list_remove(thread_list_ctx,
+							node->uid_or_id);
+			}
 			mail_hash_remove(ctx->hash_trans, idx,
 					 node->msgid_crc32);
 		}
 	}
 	i_free(referenced);
+	return mail_thread_list_commit(&thread_list_ctx);
 }
 
 static int mail_thread_finish(struct thread_finish_context *ctx,
@@ -556,7 +598,8 @@
 	/* make sure all shadow indexes are accessible directly. */
 	(void)array_idx_modifiable(&ctx->shadow_nodes, hdr->record_count);
 
-	mail_thread_create_shadows(ctx, hdr->record_count);
+	if (mail_thread_create_shadows(ctx, hdr->record_count) < 0)
+		return -1;
 
 	/* (4) */
 	ctx->use_sent_date = TRUE;
@@ -588,11 +631,10 @@
 nodes_change_uids_to_seqs(struct mail_thread_iterate_context *iter, bool root)
 {
 	struct mail_thread_child_node *children;
-	struct mailbox *box;
+	struct mailbox *box = iter->ctx->tmp_mail->box;
 	unsigned int i, count;
 	uint32_t uid, seq;
 
-	box = mailbox_transaction_get_mailbox(iter->ctx->tmp_mail->transaction);
 	children = array_get_modifiable(&iter->children, &count);
 	for (i = 0; i < count; i++) {
 		uid = children[i].uid;
@@ -656,21 +698,27 @@
 struct mail_thread_iterate_context *
 mail_thread_iterate_init_full(struct mail *tmp_mail,
 			      struct mail_hash_transaction *hash_trans,
+			      struct mail_thread_list_context *hash_list_ctx,
 			      enum mail_thread_type thread_type,
 			      bool return_seqs)
 {
 	struct mail_thread_iterate_context *iter;
 	struct thread_finish_context *ctx;
+	struct mail_hash *hash;
 
 	iter = i_new(struct mail_thread_iterate_context, 1);
 	ctx = iter->ctx = i_new(struct thread_finish_context, 1);
 	ctx->refcount = 1;
 	ctx->tmp_mail = tmp_mail;
 	ctx->hash_trans = hash_trans;
+	ctx->hash_list_ctx = hash_list_ctx;
 	ctx->return_seqs = return_seqs;
 	if (mail_thread_finish(ctx, thread_type) < 0)
 		iter->failed = TRUE;
 	else {
+		hash = mail_hash_transaction_get_hash(hash_trans);
+		if (mail_hash_get_lock_type(hash) == F_WRLCK)
+			(void)mail_hash_transaction_write(hash_trans);
 		mail_thread_iterate_fill_root(iter);
 		if (return_seqs)
 			nodes_change_uids_to_seqs(iter, TRUE);
@@ -713,10 +761,7 @@
 
 	if (ret < 0) {
 		struct mail *mail = iter->ctx->tmp_mail;
-		struct mailbox *box;
-
-		box = mailbox_transaction_get_mailbox(mail->transaction);
-		mail_storage_set_internal_error(box->storage);
+		mail_storage_set_internal_error(mail->box->storage);
 	}
 	if (--iter->ctx->refcount == 0) {
 		array_free(&iter->ctx->roots);
--- a/src/lib-storage/index/index-thread-links.c	Thu Jun 26 08:49:02 2008 +0300
+++ b/src/lib-storage/index/index-thread-links.c	Thu Jun 26 19:11:56 2008 +0300
@@ -8,13 +8,15 @@
 
 static struct mail_thread_node *
 thread_msgid_get(struct mail_thread_update_context *ctx, uint32_t ref_uid,
-		 const char *msgid, uint32_t *idx_r)
+		 unsigned int ref_index, const char *msgid, uint32_t *idx_r)
 {
 	struct mail_thread_node *node, new_node;
 	struct msgid_search_key key;
 	const char **msgidp;
 	uint32_t idx;
 
+	i_assert(ref_index != MAIL_INDEX_NODE_REF_MSGID);
+
 	key.msgid = msgid;
 	key.msgid_crc32 = crc32_str_nonzero(msgid);
 
@@ -23,20 +25,18 @@
 		/* not found, create */
 		memset(&new_node, 0, sizeof(new_node));
 		new_node.msgid_crc32 = key.msgid_crc32;
-		new_node.uid = ref_uid;
+		if (ref_index <= MAIL_INDEX_NODE_REF_MAX_VALUE) {
+			new_node.uid_or_id = ref_uid;
+			new_node.ref_index = ref_index;
+		} else {
+			new_node.uid_or_id =
+				mail_thread_list_add(ctx->thread_list_ctx,
+						     msgid);
+			new_node.ref_index = MAIL_INDEX_NODE_REF_EXT;
+		}
 
 		mail_hash_insert(ctx->hash_trans, &key, &new_node, &idx);
 		node = mail_hash_lookup_idx(ctx->hash_trans, idx);
-	} else if (node->uid == 0 && ref_uid != 0) {
-		/* make non-existing node uniquely identifiable */
-		if (node->exists) {
-			mail_hash_transaction_set_corrupted(ctx->hash_trans,
-							    "uid=0 found");
-			ctx->failed = TRUE;
-		} else {
-			node->uid = ref_uid;
-			mail_hash_update(ctx->hash_trans, idx);
-		}
 	}
 
 	/* keep message-ids cached */
@@ -55,11 +55,17 @@
 	struct mail_thread_node unode;
 
 	if (msgid != NULL) {
-		node = thread_msgid_get(ctx, 0, msgid, idx_r);
-		if (!node->exists) {
+		node = thread_msgid_get(ctx, 0, 0, msgid, idx_r);
+		if (!MAIL_INDEX_NODE_EXISTS(node)) {
 			/* add UID to node */
-			node->uid = uid;
-			node->exists = TRUE;
+			if (node->ref_index == MAIL_INDEX_NODE_REF_EXT &&
+			    node->uid_or_id != 0) {
+				mail_thread_list_remove(ctx->thread_list_ctx,
+							node->uid_or_id);
+			}
+
+			node->uid_or_id = uid;
+			node->ref_index = MAIL_INDEX_NODE_REF_MSGID;
 		} else {
 			/* duplicate, keep the original. if the original ever
 			   gets expunged, rebuild. */
@@ -72,8 +78,8 @@
 	if (msgid == NULL) {
 		/* no valid message-id */
 		memset(&unode, 0, sizeof(unode));
-		unode.uid = uid;
-		unode.exists = TRUE;
+		unode.uid_or_id = uid;
+		unode.ref_index = MAIL_INDEX_NODE_REF_MSGID;
 		mail_hash_insert(ctx->hash_trans, NULL, &unode, idx_r);
 	}
 }
@@ -100,7 +106,7 @@
 	parent = mail_hash_lookup_idx(ctx->hash_trans, parent_idx);
 	child = mail_hash_lookup_idx(ctx->hash_trans, child_idx);
 
-	child->link_refcount++;
+	child->parent_link_refcount++;
 	mail_hash_update(ctx->hash_trans, child_idx);
 
 	if (thread_node_has_ancestor(ctx, parent, child)) {
@@ -144,7 +150,7 @@
 		child->parent_idx = parent_idx;
 	else {
 		/* Conflicting parent already exists, keep the original */
-		if (child->exists) {
+		if (MAIL_INDEX_NODE_EXISTS(child)) {
 			/* If this message gets expunged,
 			   the parent is changed. */
 			child->expunge_rebuilds = TRUE;
@@ -166,27 +172,56 @@
 	mail_hash_update(ctx->hash_trans, child_idx);
 }
 
-static void
+struct thread_message_id {
+	const char *str;
+	uint32_t crc32;
+	unsigned int collisions_after;
+};
+
+static const char *
 thread_link_references(struct mail_thread_update_context *ctx, uint32_t ref_uid,
-		       const char **references)
+		       const char *references)
 {
-	const char *msgid, *last_msgid;
+	ARRAY_DEFINE(id_arr, struct thread_message_id);
+	struct thread_message_id *ids;
+	struct thread_message_id id;
 	uint32_t parent_idx, child_idx;
+	unsigned int i, j, count, ref_index;
 
-	last_msgid = message_id_get_next(references);
-	if (last_msgid == NULL)
-		return;
-	(void)thread_msgid_get(ctx, ref_uid, last_msgid, &parent_idx);
+	/* put all message IDs to an array */
+	memset(&id, 0, sizeof(id));
+	t_array_init(&id_arr, 32);
+	while ((id.str = message_id_get_next(&references)) != NULL) {
+		id.crc32 = crc32_str_nonzero(id.str);
+		array_append(&id_arr, &id, 1);
+	}
+
+	ids = array_get_modifiable(&id_arr, &count);
+	if (count <= 1)
+		return count == 0 ? NULL : ids[0].str;
 
-	while ((msgid = message_id_get_next(references)) != NULL) {
-		(void)thread_msgid_get(ctx, ref_uid, msgid, &child_idx);
+	/* count collisions */
+	for (i = 0; i < count; i++) {
+		for (j = i + 1; j < count; j++) {
+			if (ids[i].crc32 == ids[j].crc32)
+				ids[i].collisions_after++;
+		}
+	}
+
+	ref_index = MAIL_INDEX_NODE_REF_REFERENCES_LAST +
+		ids[0].collisions_after;
+	thread_msgid_get(ctx, ref_uid, ref_index, ids[0].str, &parent_idx);
+	for (i = 1; i < count; i++) {
+		ref_index = MAIL_INDEX_NODE_REF_REFERENCES_LAST +
+			ids[i].collisions_after;
+		thread_msgid_get(ctx, ref_uid, ref_index,
+				 ids[i].str, &child_idx);
 		thread_link_reference(ctx, parent_idx, child_idx);
 		parent_idx = child_idx;
-		last_msgid = msgid;
 	}
 
 	/* link the last ID to us */
-	*references = last_msgid;
+	return ids[count-1].str;
 }
 
 static int thread_get_mail_header(struct mail *mail, const char *name,
@@ -203,34 +238,14 @@
 	return 0;
 }
 
-static bool references_are_crc32_unique(const char *references)
-{
-	const char *msgid;
-	uint32_t msgid_crc32;
-	ARRAY_TYPE(uint32_t) crc_arr;
-	const uint32_t *crc;
-	unsigned int i, count;
-
-	t_array_init(&crc_arr, 32);
-	while ((msgid = message_id_get_next(&references)) != NULL) {
-		msgid_crc32 = crc32_str_nonzero(msgid);
-		crc = array_get(&crc_arr, &count);
-		for (i = 0; i < count; i++) {
-			if (crc[i] == msgid_crc32)
-				return FALSE;
-		}
-		array_append(&crc_arr, &msgid_crc32, 1);
-	}
-	return TRUE;
-}
-
 int mail_thread_add(struct mail_thread_update_context *ctx, struct mail *mail)
 {
 	const char *message_id, *in_reply_to, *references, *parent_msgid;
 	const struct mail_thread_node *parent, *old_parent;
 	struct mail_hash_header *hdr;
 	struct mail_thread_node *node;
-	uint32_t idx, parent_idx, ref_uid;
+	uint32_t idx, parent_idx;
+	unsigned int ref_index;
 
 	hdr = mail_hash_get_header(ctx->hash_trans);
 	i_assert(mail->uid > hdr->last_uid);
@@ -241,21 +256,22 @@
 	    thread_get_mail_header(mail, HDR_REFERENCES, &references) < 0)
 		return -1;
 
-	ref_uid = references_are_crc32_unique(references) ? mail->uid : 0;
 	thread_msg_add(ctx, mail->uid, message_id_get_next(&message_id), &idx);
-	thread_link_references(ctx, ref_uid, &references);
+	parent_msgid = thread_link_references(ctx, mail->uid, references);
 
-	if (references != NULL)
-		parent_msgid = references;
+	if (parent_msgid != NULL)
+		ref_index = MAIL_INDEX_NODE_REF_REFERENCES_LAST;
 	else {
 		/* no valid IDs in References:, use In-Reply-To: instead */
 		if (thread_get_mail_header(mail, HDR_IN_REPLY_TO,
 					   &in_reply_to) < 0)
 			return -1;
 		parent_msgid = message_id_get_next(&in_reply_to);
+		ref_index = MAIL_INDEX_NODE_REF_INREPLYTO;
 	}
 	parent = parent_msgid == NULL ? NULL :
-		thread_msgid_get(ctx, ref_uid, parent_msgid, &parent_idx);
+		thread_msgid_get(ctx, mail->uid, ref_index,
+				 parent_msgid, &parent_idx);
 
 	node = mail_hash_lookup_idx(ctx->hash_trans, idx);
 	old_parent = node->parent_idx == 0 ? NULL :
@@ -277,8 +293,10 @@
 
 static bool
 mail_thread_node_lookup(struct mail_thread_update_context *ctx, uint32_t uid,
-			uint32_t *idx_r, struct mail_thread_node **node_r)
+			uint32_t *idx_r, const char **msgid_r,
+			struct mail_thread_node **node_r)
 {
+	struct mail_thread_node *node;
 	struct msgid_search_key key;
 	const char *msgids;
 	int ret;
@@ -295,14 +313,17 @@
 		return FALSE;
 	key.msgid_crc32 = crc32_str_nonzero(key.msgid);
 
-	*node_r = mail_hash_lookup(ctx->hash_trans, &key, idx_r);
-	if (*node_r == NULL)
+	node = mail_hash_lookup(ctx->hash_trans, &key, idx_r);
+	if (node == NULL)
 		return FALSE;
 
-	if ((*node_r)->uid != ctx->tmp_mail->uid) {
+	if (node->ref_index != MAIL_INDEX_NODE_REF_MSGID ||
+	    node->uid_or_id != uid) {
 		/* duplicate Message-ID probably */
 		return FALSE;
 	}
+	*msgid_r = key.msgid;
+	*node_r = node;
 	return TRUE;
 }
 
@@ -333,7 +354,8 @@
 
 static bool
 thread_unref_msgid(struct mail_thread_update_context *ctx,
-		   uint32_t parent_idx, uint32_t child_idx)
+		   uint32_t ref_uid, uint32_t parent_idx,
+		   const char *child_msgid, uint32_t child_idx)
 {
 	struct mail_thread_node *parent, *child;
 
@@ -342,24 +364,39 @@
 		return FALSE;
 
 	child = mail_hash_lookup_idx(ctx->hash_trans, child_idx);
-	if (child->link_refcount == 0) {
+	if (child->parent_link_refcount == 0) {
 		mail_hash_transaction_set_corrupted(ctx->hash_trans,
 						    "unexpected refcount=0");
 		return FALSE;
 	}
-	child->link_refcount--;
-	if (child->link_refcount == 0) {
+	child->parent_link_refcount--;
+	if (child->parent_link_refcount == 0) {
 		/* we don't have a root anymore */
 		child->parent_idx = 0;
 	}
+
+	if (child->uid_or_id == ref_uid &&
+	    child->ref_index != MAIL_INDEX_NODE_REF_EXT) {
+		child->uid_or_id =
+			mail_thread_list_add(ctx->thread_list_ctx, child_msgid);
+		child->ref_index = MAIL_INDEX_NODE_REF_EXT;
+	}
 	mail_hash_update(ctx->hash_trans, child_idx);
+
+	if (parent->uid_or_id == ref_uid &&
+	    parent->ref_index != MAIL_INDEX_NODE_REF_EXT) {
+		parent->uid_or_id =
+			mail_thread_list_add(ctx->thread_list_ctx, child_msgid);
+		parent->ref_index = MAIL_INDEX_NODE_REF_EXT;
+		mail_hash_update(ctx->hash_trans, parent_idx);
+	}
 	return TRUE;
 }
 
 static bool
-thread_unref_links(struct mail_thread_update_context *ctx,
-		   uint32_t last_child_idx, const char *references,
-		   bool *valid_r)
+thread_unref_links(struct mail_thread_update_context *ctx, uint32_t ref_uid,
+		   const char *last_child_msgid, uint32_t last_child_idx,
+		   const char *references, bool *valid_r)
 {
 	const char *msgid;
 	uint32_t parent_idx, child_idx;
@@ -378,22 +415,24 @@
 
 	while ((msgid = message_id_get_next(&references)) != NULL) {
 		if (!thread_msgid_lookup(ctx, msgid, &child_idx) ||
-		    !thread_unref_msgid(ctx, parent_idx, child_idx))
+		    !thread_unref_msgid(ctx, ref_uid, parent_idx,
+					msgid, child_idx))
 			return FALSE;
 		parent_idx = child_idx;
 	}
-	return thread_unref_msgid(ctx, parent_idx, last_child_idx);
+	return thread_unref_msgid(ctx, ref_uid, parent_idx,
+				  last_child_msgid, last_child_idx);
 }
 
 int mail_thread_remove(struct mail_thread_update_context *ctx, uint32_t uid)
 {
 	struct mail_hash_header *hdr;
 	struct mail_thread_node *node;
-	const char *references, *in_reply_to;
+	const char *msgid, *references, *in_reply_to;
 	uint32_t idx, parent_idx;
 	bool have_refs;
 
-	if (!mail_thread_node_lookup(ctx, uid, &idx, &node))
+	if (!mail_thread_node_lookup(ctx, uid, &idx, &msgid, &node))
 		return 0;
 	if (node->expunge_rebuilds)
 		return 0;
@@ -402,7 +441,7 @@
 				  &references) < 0)
 		return -1;
 
-	if (!thread_unref_links(ctx, idx, references, &have_refs))
+	if (!thread_unref_links(ctx, uid, msgid, idx, references, &have_refs))
 		return 0;
 	if (!have_refs) {
 		/* no valid IDs in References:, use In-Reply-To: instead */
@@ -412,15 +451,16 @@
 		in_reply_to = message_id_get_next(&in_reply_to);
 		if (in_reply_to != NULL &&
 		    (!thread_msgid_lookup(ctx, in_reply_to, &parent_idx) ||
-		     !thread_unref_msgid(ctx, parent_idx, idx)))
+		     !thread_unref_msgid(ctx, uid, parent_idx,
+					 in_reply_to, idx)))
 			return 0;
 	}
 
 	/* get the node again, the pointer may have changed */
 	node = mail_hash_lookup_idx(ctx->hash_trans, idx);
 
-	node->uid = 0;
-	node->exists = FALSE;
+	node->uid_or_id = mail_thread_list_add(ctx->thread_list_ctx, msgid);
+	node->ref_index = MAIL_INDEX_NODE_REF_EXT;
 	mail_hash_update(ctx->hash_trans, idx);
 
 	hdr = mail_hash_get_header(ctx->hash_trans);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-storage/index/index-thread-list.c	Thu Jun 26 19:11:56 2008 +0300
@@ -0,0 +1,325 @@
+/* Copyright (c) 2008 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "array.h"
+#include "str.h"
+#include "bsearch-insert-pos.h"
+#include "istream.h"
+#include "ostream.h"
+#include "index-storage.h"
+#include "index-thread-private.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+struct mail_thread_list_context {
+	struct mailbox *box;
+	char *path;
+
+	uint32_t reset_counter;
+	ARRAY_TYPE(uint32_t) ids;
+	ARRAY_TYPE(const_string) msgids;
+	uint32_t last_id;
+	pool_t msgid_pool;
+
+	unsigned int modified:1;
+};
+
+struct mail_thread_list_update_context {
+	struct mail_thread_list_context *ctx;
+	struct mail_hash_transaction *hash_trans;
+
+	unsigned int refreshed:1;
+	unsigned int failed:1;
+};
+
+struct mail_thread_list_context *mail_thread_list_init(struct mailbox *box)
+{
+	struct index_mailbox *ibox = (struct index_mailbox *)box;
+	struct mail_thread_list_context *ctx;
+
+	ctx = i_new(struct mail_thread_list_context, 1);
+	ctx->box = box;
+	ctx->path = i_strdup_printf("%s"MAIL_THREAD_INDEX_SUFFIX".ids",
+				    ibox->index->filepath);
+	ctx->msgid_pool =
+		pool_alloconly_create(MEMPOOL_GROWING"thread msgid pool", 1024);
+	i_array_init(&ctx->ids, 32);
+	i_array_init(&ctx->msgids, 32);
+	return ctx;
+}
+
+void mail_thread_list_deinit(struct mail_thread_list_context **_ctx)
+{
+	struct mail_thread_list_context *ctx = *_ctx;
+
+	*_ctx = NULL;
+	array_free(&ctx->ids);
+	array_free(&ctx->msgids);
+	pool_unref(&ctx->msgid_pool);
+	i_free(ctx->path);
+	i_free(ctx);
+}
+
+struct mail_thread_list_update_context *
+mail_thread_list_update_begin(struct mail_thread_list_context *ctx,
+			      struct mail_hash_transaction *hash_trans)
+{
+	struct mail_thread_list_update_context *update_ctx;
+
+	update_ctx = i_new(struct mail_thread_list_update_context, 1);
+	update_ctx->ctx = ctx;
+	update_ctx->hash_trans = hash_trans;
+
+	if (mail_hash_transaction_is_in_memory(hash_trans))
+		update_ctx->refreshed = TRUE;
+	return update_ctx;
+}
+
+static int uint32_cmp(const void *p1, const void *p2)
+{
+	const uint32_t *u1 = p1, *u2 = p2;
+
+	return *u1 < *u2 ? -1 : (*u1 > *u2 ? 1 : 0);
+}
+
+static int mail_thread_list_read(struct mail_thread_list_context *ctx)
+{
+	struct istream *input;
+	const char *line;
+	uint32_t id, prev_id;
+	int fd, ret = 0;
+
+	array_clear(&ctx->ids);
+	array_clear(&ctx->msgids);
+	p_clear(ctx->msgid_pool);
+
+	fd = open(ctx->path, O_RDONLY);
+	if (fd == -1) {
+		if (errno == ENOENT)
+			return 0;
+		mail_storage_set_critical(ctx->box->storage,
+			"open(%s) failed: %m", ctx->path);
+		return -1;
+	}
+	input = i_stream_create_fd(fd, (size_t)-1, FALSE);
+	if ((line = i_stream_read_next_line(input)) == NULL)
+		ret = -1;
+	else
+		ctx->reset_counter = strtoul(line, NULL, 10);
+
+	prev_id = 0;
+	while ((line = i_stream_read_next_line(input)) != NULL) {
+		for (id = 0; *line >= '0' && *line <= '9'; line++)
+			id = id*10 + (*line - '0');
+		if (*line != ' ' || id <= prev_id) {
+			ret = -1;
+			break;
+		}
+		prev_id = id;
+		line++;
+
+		line = p_strdup(ctx->msgid_pool, line);
+		array_append(&ctx->ids, &id, 1);
+		array_append(&ctx->msgids, &line, 1);
+	}
+	ctx->last_id = prev_id;
+	if (input->stream_errno != 0) {
+		errno = input->stream_errno;
+		mail_storage_set_critical(ctx->box->storage,
+			"read(%s) failed: %m", ctx->path);
+		ret = -1;
+	} else if (ret < 0) {
+		mail_storage_set_critical(ctx->box->storage,
+			"Corrupted thread list file %s", ctx->path);
+	}
+	i_stream_unref(&input);
+
+	if (close(fd) < 0) {
+		mail_storage_set_critical(ctx->box->storage,
+			"close(%s) failed: %m", ctx->path);
+		ret = -1;
+	}
+	return ret;
+}
+
+static int mail_thread_list_refresh(struct mail_thread_list_update_context *ctx)
+{
+	struct mail_hash_header *hdr;
+
+	if (ctx->refreshed)
+		return 0;
+	ctx->refreshed = TRUE;
+
+	if (mail_thread_list_read(ctx->ctx) < 0)
+		return -1;
+
+	hdr = mail_hash_get_header(ctx->hash_trans);
+	if (ctx->ctx->reset_counter != hdr->reset_counter) {
+		array_clear(&ctx->ctx->ids);
+		array_clear(&ctx->ctx->msgids);
+		p_clear(ctx->ctx->msgid_pool);
+	}
+	return 0;
+}
+
+static inline bool
+mail_thread_lookup_idx(struct mail_thread_list_update_context *ctx,
+		       uint32_t id, unsigned int *idx_r)
+{
+	const uint32_t *ids;
+	unsigned int count;
+
+	ids = array_get(&ctx->ctx->ids, &count);
+	return bsearch_insert_pos(&id, ids, I_MIN(count, id), sizeof(*ids),
+				  uint32_cmp, idx_r);
+}
+
+int mail_thread_list_lookup(struct mail_thread_list_update_context *ctx,
+			    uint32_t id, const char **msgid_r)
+{
+	const char *const *msgid_p;
+	unsigned int idx;
+
+	if (!mail_thread_lookup_idx(ctx, id, &idx)) {
+		if (mail_thread_list_refresh(ctx) < 0)
+			return -1;
+		if (!mail_thread_lookup_idx(ctx, id, &idx)) {
+			*msgid_r = NULL;
+			return 0;
+		}
+	}
+
+	msgid_p = array_idx(&ctx->ctx->msgids, idx);
+	*msgid_r = *msgid_p;
+	return 1;
+}
+
+uint32_t mail_thread_list_add(struct mail_thread_list_update_context *ctx,
+			      const char *msgid)
+{
+	if (mail_thread_list_refresh(ctx) < 0)
+		ctx->failed = TRUE;
+
+	msgid = p_strdup(ctx->ctx->msgid_pool, msgid);
+	ctx->ctx->modified = TRUE;
+	ctx->ctx->last_id++;
+	array_append(&ctx->ctx->ids, &ctx->ctx->last_id, 1);
+	array_append(&ctx->ctx->msgids, &msgid, 1);
+	return ctx->ctx->last_id;
+}
+
+void mail_thread_list_remove(struct mail_thread_list_update_context *ctx,
+			     uint32_t id)
+{
+	unsigned int idx;
+
+	if (mail_thread_list_refresh(ctx) < 0)
+		return;
+
+	if (!mail_thread_lookup_idx(ctx, id, &idx)) {
+		mail_storage_set_critical(ctx->ctx->box->storage,
+					  "%s lost ID %u", ctx->ctx->path, id);
+		return;
+	}
+
+	ctx->ctx->modified = TRUE;
+	array_delete(&ctx->ctx->ids, idx, 1);
+	array_delete(&ctx->ctx->msgids, idx, 1);
+}
+
+static int mail_thread_list_write(struct mail_thread_list_context *ctx,
+				  uint32_t reset_counter)
+{
+	struct index_mailbox *ibox = (struct index_mailbox *)ctx->box;
+	const uint32_t *ids;
+	const char *temp_path, *const *msgids;
+	unsigned int i, count;
+	struct ostream *output;
+	string_t *str;
+	int fd, ret = 0;
+
+	ids = array_get(&ctx->ids, &count);
+	if (count == 0) {
+		/* everything deleted */
+		if (unlink(ctx->path) < 0 && errno != ENOENT)
+			i_error("unlink(%s) failed: %m", ctx->path);
+		ctx->modified = FALSE;
+		return 0;
+	}
+	msgids = array_idx(&ctx->msgids, 0);
+
+	/* write all IDs to .tmp file */
+	temp_path = t_strconcat(ctx->path, ".tmp", NULL);
+	fd = open(temp_path, O_CREAT | O_TRUNC | O_WRONLY, ibox->index->mode);
+	if (fd == -1) {
+		mail_storage_set_critical(ctx->box->storage,
+					  "creat(%s) failed: %m", temp_path);
+		return -1;
+	}
+	str = t_str_new(256);
+	output = o_stream_create_fd_file(fd, 0, FALSE);
+	str_printfa(str, "%u\n", reset_counter);
+	o_stream_send(output, str_data(str), str_len(str));
+
+	for (i = 0; i < count; i++) {
+		str_truncate(str, 0);
+		str_printfa(str, "%u %s\n", ids[i], msgids[i]);
+		o_stream_send(output, str_data(str), str_len(str));
+	}
+	if (output->last_failed_errno != 0) {
+		errno = output->last_failed_errno;
+		mail_storage_set_critical(ctx->box->storage,
+					  "write(%s) failed: %m", temp_path);
+		ret = -1;
+	}
+	o_stream_unref(&output);
+	if (close(fd) < 0) {
+		mail_storage_set_critical(ctx->box->storage,
+					  "close(%s) failed: %m", temp_path);
+		ret = -1;
+	}
+	if (ret < 0) {
+		if (unlink(temp_path) < 0)
+			i_error("unlink(%s) failed: %m", temp_path);
+		return -1;
+	}
+
+	/* finish the write by renaming the file */
+	if (rename(temp_path, ctx->path) < 0) {
+		mail_storage_set_critical(ctx->box->storage,
+			"rename(%s, %s) failed: %m", temp_path, ctx->path);
+		return -1;
+	}
+
+	ctx->modified = FALSE;
+	return 0;
+}
+
+int mail_thread_list_commit(struct mail_thread_list_update_context **_ctx)
+{
+	struct mail_thread_list_update_context *ctx = *_ctx;
+	int ret = ctx->failed ? -1 : 0;
+
+	*_ctx = NULL;
+
+	if (ctx->ctx->modified && !ctx->failed &&
+	    !mail_hash_transaction_is_in_memory(ctx->hash_trans)) {
+		struct mail_hash_header *hdr;
+
+		hdr = mail_hash_get_header(ctx->hash_trans);
+		ret = mail_thread_list_write(ctx->ctx, hdr->reset_counter);
+	}
+	i_free(ctx);
+	return ret;
+}
+
+void mail_thread_list_rollback(struct mail_thread_list_update_context **_ctx)
+{
+	struct mail_thread_list_update_context *ctx = *_ctx;
+
+	*_ctx = NULL;
+	i_free(ctx);
+}
--- a/src/lib-storage/index/index-thread-private.h	Thu Jun 26 08:49:02 2008 +0300
+++ b/src/lib-storage/index/index-thread-private.h	Thu Jun 26 19:11:56 2008 +0300
@@ -2,11 +2,14 @@
 #define INDEX_THREAD_PRIVATE_H
 
 struct index_mailbox;
+struct mail_thread_list_context;
 
 #include "crc32.h"
 #include "mail-hash.h"
 #include "mail-thread.h"
 
+#define MAIL_THREAD_INDEX_SUFFIX ".thread"
+
 #define HDR_MESSAGE_ID "message-id"
 #define HDR_IN_REPLY_TO "in-reply-to"
 #define HDR_REFERENCES "references"
@@ -20,25 +23,56 @@
 struct mail_thread_node {
 	struct mail_hash_record rec;
 
-	/* exists=TRUE: UID of the mail this node belongs to
-	   exists=FALSE: UID of some message that references (in References: or
-	   In-Reply-To: header) this node. Of all the valid references exactly
-	   one has the same CRC32 as this node's msgid_crc32. */
-	uint32_t uid;
-	uint32_t parent_idx;
+	/* CRC32 checksum of this node's message ID (except CRC32 0 -> 1).
+	   If there are more nodes with the same checksum, use uid and
+	   ref_index fields to find the original string. */
 	uint32_t msgid_crc32;
+	/* ref_index=0: ID in external thread IDs list (beginning from 1)
+	   ref_index=1: UID of the mail whose Message-ID: header points to
+	     this node.
+	   ref_index>1: UID of some message which has references to this
+	     node. ref_index spcifies how the reference string is found. */
+	uint32_t uid_or_id;
+	/* Index for this node's parent node */
+	unsigned int parent_idx:27;
+	/* 0 = uid contains the index to the wanted Message ID in external
+	       thread IDs list
+	   1 = Message-ID: header's first valid ID
+	   2 = In-Reply-To: header's first valid ID
+	   3 = References: header's last ID with the same CRC32
+	   4 = References: header's second last ID with the same CRC32
+	     (i.e. one non-matching ID already had a CRC32 collision)
+	   ..
+	   31 = References: header's 29th last ID with the same CRC32
+	*/
+	unsigned int ref_index:5;
+	/* Number of messages containing "this message" -> "parent message"
+	   link, i.e. "number of links to parent node". However since parents
+	   can change, not all of these references might be from our current
+	   child nodes. When this refcount reaches 0, it means we must detach
+	   from our parent. */
+	unsigned int parent_link_refcount:30;
+	/* If uid is expunged, rebuild the thread tree. */
+	unsigned int expunge_rebuilds:1;
+	/* If a link between this node and its child gets unreferenced,
+	   rebuild the thread tree. */
+	unsigned int parent_unref_rebuilds:1;
+};
+#define MAIL_INDEX_NODE_REF_EXT 0
+#define MAIL_INDEX_NODE_REF_MSGID 1
+#define MAIL_INDEX_NODE_REF_INREPLYTO 2
+#define MAIL_INDEX_NODE_REF_REFERENCES_LAST 3
+#define MAIL_INDEX_NODE_REF_MAX_VALUE 31
 
-	uint32_t link_refcount:29;
-	uint32_t expunge_rebuilds:1;
-	uint32_t parent_unref_rebuilds:1;
-	uint32_t exists:1;
-};
+#define MAIL_INDEX_NODE_EXISTS(node) \
+	((node)->ref_index == MAIL_INDEX_NODE_REF_MSGID)
 
 struct mail_thread_update_context {
 	struct mail *tmp_mail;
 
 	struct mail_hash *hash;
 	struct mail_hash_transaction *hash_trans;
+	struct mail_thread_list_update_context *thread_list_ctx;
 
 	/* Hash record idx -> Message-ID */
 	ARRAY_DEFINE(msgid_cache, const char *);
@@ -64,9 +98,26 @@
 struct mail_thread_iterate_context *
 mail_thread_iterate_init_full(struct mail *tmp_mail,
 			      struct mail_hash_transaction *hash_trans,
+			      struct mail_thread_list_context *hash_list_ctx,
 			      enum mail_thread_type thread_type,
 			      bool return_seqs);
 
 void index_thread_mailbox_index_opened(struct index_mailbox *ibox);
 
+struct mail_thread_list_context *
+mail_thread_list_init(struct mailbox *box);
+void mail_thread_list_deinit(struct mail_thread_list_context **ctx);
+
+struct mail_thread_list_update_context *
+mail_thread_list_update_begin(struct mail_thread_list_context *ctx,
+			      struct mail_hash_transaction *hash_trans);
+int mail_thread_list_lookup(struct mail_thread_list_update_context *ctx,
+			    uint32_t id, const char **msgid_r);
+uint32_t mail_thread_list_add(struct mail_thread_list_update_context *ctx,
+			      const char *msgid);
+void mail_thread_list_remove(struct mail_thread_list_update_context *ctx,
+			     uint32_t id);
+int mail_thread_list_commit(struct mail_thread_list_update_context **ctx);
+void mail_thread_list_rollback(struct mail_thread_list_update_context **ctx);
+
 #endif
--- a/src/lib-storage/index/index-thread.c	Thu Jun 26 08:49:02 2008 +0300
+++ b/src/lib-storage/index/index-thread.c	Thu Jun 26 19:11:56 2008 +0300
@@ -35,6 +35,8 @@
 	union mailbox_module_context module_ctx;
 	struct mail_hash *hash;
 
+	struct mail_thread_list_context *list_ctx;
+
 	/* set only temporarily while needed */
 	struct mail_thread_update_context *ctx;
 };
@@ -63,69 +65,87 @@
 }
 
 static const char *
-mail_thread_find_msgid_crc32(struct mail_thread_update_context *ctx,
-			     uint32_t msgid_crc32)
+mail_thread_nth_last_refid_crc32(const char *msgids, unsigned int ref_index,
+				 uint32_t msgid_crc32)
 {
-	const char *msgids, *msgid;
-	int ret;
+	const unsigned int idx_from_end =
+		ref_index - MAIL_INDEX_NODE_REF_REFERENCES_LAST + 1;
+	const char **last_msgids, *msgid;
+	unsigned int idx = 0;
 
-	/* if there are any valid references, it's in them */
-	ret = mail_get_first_header(ctx->tmp_mail, HDR_REFERENCES, &msgids);
-	if (ret == 0) {
-		/* no References: header, fallback to In-Reply-To: */
-		if (mail_get_first_header(ctx->tmp_mail, HDR_IN_REPLY_TO,
-					  &msgids) <= 0)
-			return NULL;
-
-		msgid = message_id_get_next(&msgids);
-		if (msgid != NULL && crc32_str_nonzero(msgid) == msgid_crc32)
-			return msgid;
-		return NULL;
-	}
-	if (ret < 0)
-		return NULL;
-
+	last_msgids = t_new(const char *, idx_from_end);
 	while ((msgid = message_id_get_next(&msgids)) != NULL) {
 		if (crc32_str_nonzero(msgid) == msgid_crc32) {
-			/* found it. there aren't any colliding message-id
-			   CRC32s in this message or it wouldn't have been
-			   added as a reference UID. */
-			return msgid;
+			last_msgids[idx % idx_from_end] = msgid;
+			idx++;
 		}
 	}
-	return NULL;
-
+	return last_msgids[idx % idx_from_end];
 }
 
 static const char *
-mail_thread_node_get_msgid(struct mail_thread_update_context *ctx,
+mail_thread_node_get_msgid(struct mail_hash_transaction *trans,
+			   struct mail_thread_update_context *ctx,
 			   const struct mail_thread_node *node, uint32_t idx)
 {
-	const char *msgids, *msgid, **p;
+	const char *msgids, *msgid = NULL, **p;
+	int ret;
+
+	if (node->ref_index == MAIL_INDEX_NODE_REF_EXT) {
+		if (mail_thread_list_lookup(ctx->thread_list_ctx,
+					    node->uid_or_id, &msgid) < 0)
+			return NULL;
+		if (msgid == NULL) {
+			mail_hash_transaction_set_corrupted(trans,
+				"Referenced list msgid lost");
+		}
+		return msgid;
+	}
 
 	p = array_idx_modifiable(&ctx->msgid_cache, idx);
 	if (*p != NULL)
 		return *p;
 
-	if (node->uid == 0)
+	ret = mail_set_uid(ctx->tmp_mail, node->uid_or_id);
+	if (ret <= 0) {
+		if (ret == 0) {
+			mail_hash_transaction_set_corrupted(trans,
+				t_strdup_printf("Referenced UID %u lost",
+						node->uid_or_id));
+		}
 		return NULL;
-
-	if (mail_set_uid(ctx->tmp_mail, node->uid) <= 0)
-		return NULL;
-	if (node->exists) {
-		/* we can get the Message-ID directly */
+	}
+	switch (node->ref_index) {
+	case MAIL_INDEX_NODE_REF_MSGID:
+		/* Message-ID: header */
 		if (mail_get_first_header(ctx->tmp_mail, HDR_MESSAGE_ID,
-					  &msgids) <= 0)
+					  &msgids) < 0)
 			return NULL;
-
+		msgid = message_id_get_next(&msgids);
+		break;
+	case MAIL_INDEX_NODE_REF_INREPLYTO:
+		/* In-Reply-To: header */
+		if (mail_get_first_header(ctx->tmp_mail, HDR_IN_REPLY_TO,
+					  &msgids) < 0)
+			return NULL;
 		msgid = message_id_get_next(&msgids);
-	} else {
-		/* find from a referencing message */
-		msgid = mail_thread_find_msgid_crc32(ctx, node->msgid_crc32);
+		break;
+	default:
+		/* References: header */
+		if (mail_get_first_header(ctx->tmp_mail, HDR_REFERENCES,
+					  &msgids) < 0)
+			return NULL;
+		msgid = mail_thread_nth_last_refid_crc32(msgids,
+							 node->ref_index,
+							 node->msgid_crc32);
+		break;
 	}
 
-	if (msgid == NULL)
+	if (msgid == NULL) {
+		/* shouldn't have happened */
+		mail_hash_transaction_set_corrupted(trans, "Message ID lost");
 		return NULL;
+	}
 
 	*p = p_strdup(ctx->msgid_pool, msgid);
 	return *p;
@@ -139,6 +159,7 @@
 	struct mail_thread_update_context *ctx = tbox->ctx;
 	const struct mail_thread_node *node;
 	const char *msgid;
+	bool ret;
 
 	node = mail_hash_lookup_idx(trans, idx);
 	if (key_rec->msgid_crc32 != node->msgid_crc32)
@@ -148,14 +169,19 @@
 	ctx->cmp_last_idx = idx;
 
 	/* either a match or a collision, need to look closer */
-	msgid = mail_thread_node_get_msgid(ctx, node, ctx->cmp_last_idx);
-	if (msgid == NULL) {
-		/* we couldn't figure out the Message-ID for whatever reason.
-		   we'll need to fallback to rebuilding the whole thread */
-		ctx->rebuild = TRUE;
-		return FALSE;
-	}
-	return strcmp(msgid, key_rec->msgid) == 0;
+	T_BEGIN {
+		msgid = mail_thread_node_get_msgid(trans, ctx, node, idx);
+		if (msgid != NULL)
+			ret = strcmp(msgid, key_rec->msgid) == 0;
+		else {
+			/* we couldn't figure out the Message-ID for whatever
+			   reason. we'll need to fallback to rebuilding the
+			   whole thread. */
+			ctx->rebuild = TRUE;
+			ret = FALSE;
+		}
+	} T_END;
+	return ret;
 }
 
 static unsigned int mail_thread_hash_rec(const void *p)
@@ -358,6 +384,9 @@
 	ctx->t = mailbox_transaction_begin(ctx->box, 0);
 	ctx->search = mailbox_search_init(ctx->t, ctx->search_args, NULL);
 	ctx->thread_ctx.tmp_mail = mail_alloc(ctx->t, 0, NULL);
+	ctx->thread_ctx.thread_list_ctx =
+		mail_thread_list_update_begin(tbox->list_ctx,
+					      ctx->thread_ctx.hash_trans);
 
 	hdr = mail_hash_get_header(ctx->thread_ctx.hash_trans);
 	count = status.messages < hdr->record_count ? 0 :
@@ -465,6 +494,9 @@
 {
 	struct mail_thread_mailbox *tbox = MAIL_THREAD_CONTEXT(ctx->box);
 
+	if (ctx->thread_ctx.thread_list_ctx != NULL)
+		mail_thread_list_rollback(&ctx->thread_ctx.thread_list_ctx);
+
 	mail_hash_transaction_end(&ctx->thread_ctx.hash_trans);
 
 	(void)mailbox_search_deinit(&ctx->search);
@@ -500,8 +532,11 @@
 mail_thread_iterate_init(struct mail_thread_context *ctx,
 			 enum mail_thread_type thread_type, bool write_seqs)
 {
+	struct mail_thread_mailbox *tbox = MAIL_THREAD_CONTEXT(ctx->box);
+
 	return mail_thread_iterate_init_full(ctx->thread_ctx.tmp_mail,
 					     ctx->thread_ctx.hash_trans,
+					     tbox->list_ctx,
 					     thread_type, write_seqs);
 }
 
@@ -526,6 +561,33 @@
 	return seq2 == hash_hdr->message_count;
 }
 
+static void
+mail_thread_expunge_handler_deinit(struct mail_thread_mailbox *tbox,
+				   struct mail_thread_update_context *ctx)
+{
+	struct mailbox_transaction_context *t;
+
+	t = ctx->tmp_mail->transaction;
+
+	if (ctx->failed)
+		mail_thread_list_rollback(&ctx->thread_list_ctx);
+	else {
+		if (mail_thread_list_commit(&ctx->thread_list_ctx) < 0)
+			ctx->failed = TRUE;
+	}
+	if (!ctx->failed)
+		(void)mail_hash_transaction_write(ctx->hash_trans);
+	mail_hash_transaction_end(&ctx->hash_trans);
+	mail_hash_unlock(tbox->hash);
+
+	mail_free(&ctx->tmp_mail);
+	/* don't commit. we're in the middle of syncing and
+	   this transaction isn't marked as external. */
+	(void)mailbox_transaction_rollback(&t);
+	array_free(&ctx->msgid_cache);
+	pool_unref(&ctx->msgid_pool);
+}
+
 static int
 mail_thread_expunge_handler(struct mail_index_sync_map_ctx *sync_ctx,
 			    uint32_t seq, const void *data,
@@ -541,21 +603,8 @@
 		/* deinit */
 		if (!ctx->syncing)
 			return 0;
-		if (ctx->hash != NULL) {
-			t = ctx->tmp_mail->transaction;
-
-			if (!ctx->failed)
-				(void)mail_hash_transaction_write(ctx->hash_trans);
-			mail_hash_transaction_end(&ctx->hash_trans);
-			mail_hash_unlock(tbox->hash);
-
-			mail_free(&ctx->tmp_mail);
-			/* don't commit. we're in the middle of syncing and
-			   this transaction isn't marked as external. */
-			(void)mailbox_transaction_rollback(&t);
-			array_free(&ctx->msgid_cache);
-			pool_unref(&ctx->msgid_pool);
-		}
+		if (ctx->hash != NULL)
+			mail_thread_expunge_handler_deinit(tbox, ctx);
 		i_free_and_null(tbox->ctx);
 		return 0;
 	}
@@ -575,6 +624,9 @@
 
 		ctx->hash = tbox->hash;
 		ctx->hash_trans = mail_hash_transaction_begin(ctx->hash, 0);
+		ctx->thread_list_ctx =
+			mail_thread_list_update_begin(tbox->list_ctx,
+						      ctx->hash_trans);
 		ctx->msgid_pool = pool_alloconly_create(MEMPOOL_GROWING"msgids",
 							20 * APPROX_MSGID_SIZE);
 		i_array_init(&ctx->msgid_cache, 20);
@@ -610,6 +662,7 @@
 
 	i_assert(tbox->ctx == NULL);
 
+	mail_thread_list_deinit(&tbox->list_ctx);
 	mail_hash_free(&tbox->hash);
 	ret = tbox->module_ctx.super.close(box);
 	i_free(tbox);
@@ -626,7 +679,8 @@
 	tbox->module_ctx.super = box->v;
 	box->v.close = mail_thread_mailbox_close;
 
-	tbox->hash = mail_hash_alloc(ibox->index, ".thread",
+	tbox->list_ctx = mail_thread_list_init(box);
+	tbox->hash = mail_hash_alloc(ibox->index, MAIL_THREAD_INDEX_SUFFIX,
 				     sizeof(struct mail_thread_node),
 				     mail_thread_hash_key,
 				     mail_thread_hash_rec,