view src/lib-storage/index/index-thread-finish.c @ 9354:687ac828b964 HEAD

lib-index: modseqs weren't tracked properly within session when changes were done.
author Timo Sirainen <tss@iki.fi>
date Tue, 01 Sep 2009 13:05:03 -0400
parents a2a28c52e951
children 00cd9aacd03c
line wrap: on
line source

/* Copyright (c) 2002-2009 Dovecot authors, see the included COPYING file */

#include "lib.h"
#include "array.h"
#include "hash.h"
#include "imap-base-subject.h"
#include "mail-storage-private.h"
#include "index-thread-private.h"

#include <stdlib.h>

struct mail_thread_shadow_node {
	uint32_t first_child_idx, next_sibling_idx;
};

struct mail_thread_root_node {
	/* node.idx usually points to indexes from mail hash. However
	   REFERENCES step (5) may add temporary dummy roots. They use larger
	   index numbers than exist in the hash. */
	struct mail_thread_child_node node;

	/* Used temporarily by (5)(B) base subject gathering.
	   root_idx1 is node's index in roots[] array + 1.
	   parent_root_idx points to root_idx1, or 0 for root. */
	unsigned int root_idx1;
	uint32_t parent_root_idx1;

	/* subject contained a Re: or Fwd: */
	unsigned int reply_or_forward:1;
	/* a dummy node */
	unsigned int dummy:1;
	/* ignore this node - it's a dummy without children */
	unsigned int ignore:1;
};

struct thread_finish_context {
	unsigned int refcount;

	struct mail *tmp_mail;
	struct mail_thread_cache *cache;

	ARRAY_DEFINE(roots, struct mail_thread_root_node);
	ARRAY_DEFINE(shadow_nodes, struct mail_thread_shadow_node);
	unsigned int next_new_root_idx;

	unsigned int use_sent_date:1;
	unsigned int return_seqs:1;
};

struct mail_thread_iterate_context {
	struct thread_finish_context *ctx;

	ARRAY_TYPE(mail_thread_child_node) children;
	unsigned int next_idx;
	bool failed;
};

struct subject_gather_context {
	struct thread_finish_context *ctx;

	pool_t subject_pool;
	struct hash_table *subject_hash;
};

static void
add_base_subject(struct subject_gather_context *ctx, const char *subject,
		 struct mail_thread_root_node *node)
{
	struct mail_thread_root_node *hash_node;
	char *hash_subject;
	void *key, *value;
	bool is_reply_or_forward;

	subject = imap_get_base_subject_cased(pool_datastack_create(), subject,
					      &is_reply_or_forward);
	/* (ii) If the thread subject is empty, skip this message. */
	if (*subject == '\0')
		return;

	/* (iii) Look up the message associated with the thread
	   subject in the subject table. */
	if (!hash_table_lookup_full(ctx->subject_hash, subject, &key, &value)) {
		/* (iv) If there is no message in the subject table with the
		   thread subject, add the current message and the thread
		   subject to the subject table. */
		hash_subject = p_strdup(ctx->subject_pool, subject);
		hash_table_insert(ctx->subject_hash, hash_subject, node);
	} else {
		hash_subject = key;
		hash_node = value;

		/* Otherwise, if the message in the subject table is not a
		   dummy, AND either of the following criteria are true:

		     The current message is a dummy, OR

                     The message in the subject table is a reply or forward
		     and the current message is not.

		   then replace the message in the subject table with the
		   current message. */
		if (!hash_node->dummy &&
		    (node->dummy ||
		     (hash_node->reply_or_forward && !is_reply_or_forward))) {
			hash_node->parent_root_idx1 = node->root_idx1;
			hash_table_update(ctx->subject_hash, hash_subject, node);
		} else {
			node->parent_root_idx1 = hash_node->root_idx1;
		}
	}

	node->reply_or_forward = is_reply_or_forward;
}

static int mail_thread_child_node_cmp(const void *p1, const void *p2)
{
	const struct mail_thread_child_node *c1 = p1, *c2 = p2;

	if (c1->sort_date < c2->sort_date)
		return -1;
	if (c1->sort_date > c2->sort_date)
		return 1;

	if (c1->uid < c2->uid)
		return -1;
	if (c1->uid > c2->uid)
		return 1;
	return 0;
}

static uint32_t
thread_lookup_existing(struct thread_finish_context *ctx, uint32_t idx)
{
	const struct mail_thread_node *node;

	node = array_idx(&ctx->cache->thread_nodes, idx);
	i_assert(MAIL_THREAD_NODE_EXISTS(node));
	i_assert(node->uid != 0);
	return node->uid;
}

static void
thread_child_node_fill(struct thread_finish_context *ctx,
		       struct mail_thread_child_node *child)
{
	int tz;

	child->uid = thread_lookup_existing(ctx, child->idx);

	if (!mail_set_uid(ctx->tmp_mail, child->uid)) {
		/* the UID should have existed. we would have rebuild
		   the thread tree otherwise. */
		i_unreached();
	}

	/* get sent date if we want to use it and if it's valid */
	if (!ctx->use_sent_date)
		child->sort_date = 0;
	else if (mail_get_date(ctx->tmp_mail, &child->sort_date, &tz) < 0)
		child->sort_date = 0;

	if (child->sort_date == 0) {
		/* fallback to received date */
		(void)mail_get_received_date(ctx->tmp_mail, &child->sort_date);
	}
}

static void
thread_sort_children(struct thread_finish_context *ctx, uint32_t parent_idx,
		     ARRAY_TYPE(mail_thread_child_node) *sorted_children)
{
	const struct mail_thread_shadow_node *shadows;
	struct mail_thread_child_node child, *children;
	unsigned int count;

	memset(&child, 0, sizeof(child));
	array_clear(sorted_children);

	/* add all child indexes to the array */
	shadows = array_get(&ctx->shadow_nodes, &count);
	child.idx = shadows[parent_idx].first_child_idx;
	i_assert(child.idx != 0);
	if (shadows[child.idx].next_sibling_idx == 0) {
		/* only child - don't bother setting sort date */
		child.uid = thread_lookup_existing(ctx, child.idx);

		array_append(sorted_children, &child, 1);
		return;
	}
	while (child.idx != 0) {
		thread_child_node_fill(ctx, &child);

		array_append(sorted_children, &child, 1);
		child.idx = shadows[child.idx].next_sibling_idx;
	}

	/* sort the children */
	children = array_get_modifiable(sorted_children, &count);
	qsort(children, count, sizeof(*children), mail_thread_child_node_cmp);
}

static void gather_base_subjects(struct thread_finish_context *ctx)
{
	struct subject_gather_context gather_ctx;
	struct mail_thread_root_node *roots;
	const char *subject;
	unsigned int i, count;
	ARRAY_TYPE(mail_thread_child_node) sorted_children;
	const struct mail_thread_child_node *children;
	uint32_t idx, uid;

	memset(&gather_ctx, 0, sizeof(gather_ctx));
	gather_ctx.ctx = ctx;

	roots = array_get_modifiable(&ctx->roots, &count);
	if (count == 0)
		return;
	gather_ctx.subject_pool =
		pool_alloconly_create(MEMPOOL_GROWING"base subjects",
				      nearest_power(count * 20));
	gather_ctx.subject_hash =
		hash_table_create(default_pool, gather_ctx.subject_pool,
				  count * 2, str_hash,
				  (hash_cmp_callback_t *)strcmp);

	i_array_init(&sorted_children, 64);
	for (i = 0; i < count; i++) {
		roots[i].root_idx1 = i + 1;
		if (!roots[i].dummy)
			idx = roots[i].node.idx;
		else if (!roots[i].ignore) {
			/* find the oldest child */
			thread_sort_children(ctx, roots[i].node.idx,
					     &sorted_children);
			children = array_idx(&sorted_children, 0);
			idx = children[0].idx;
		} else {
			/* dummy without children */
			continue;
		}

		uid = thread_lookup_existing(ctx, idx);
		if (!mail_set_uid(ctx->tmp_mail, uid)) {
			/* the UID should have existed. we would have rebuild
			   the thread tree otherwise. */
			i_unreached();
		}
		if (mail_get_first_header(ctx->tmp_mail, HDR_SUBJECT,
					  &subject) > 0) T_BEGIN {
			add_base_subject(&gather_ctx, subject, &roots[i]);
		} T_END;
	}
	i_assert(roots[count-1].parent_root_idx1 <= count);
	array_free(&sorted_children);
	hash_table_destroy(&gather_ctx.subject_hash);
	pool_unref(&gather_ctx.subject_pool);
}

static void thread_add_shadow_child(struct thread_finish_context *ctx,
				    uint32_t parent_idx, uint32_t child_idx)
{
	struct mail_thread_shadow_node *parent_shadow, *child_shadow;

	parent_shadow = array_idx_modifiable(&ctx->shadow_nodes, parent_idx);
	child_shadow = array_idx_modifiable(&ctx->shadow_nodes, child_idx);

	child_shadow->next_sibling_idx = parent_shadow->first_child_idx;
	parent_shadow->first_child_idx = child_idx;
}

static void mail_thread_root_thread_merge(struct thread_finish_context *ctx,
					  struct mail_thread_root_node *cur)
{
	struct mail_thread_root_node *roots, *root, new_root;
	struct mail_thread_shadow_node *shadows;
	unsigned int count;
	uint32_t idx, next_idx;

	i_assert(cur->parent_root_idx1 != 0);

	/* The highest parent is the same as the current message in the
	   subject table. */
	roots = array_get_modifiable(&ctx->roots, &count);
	root = cur;
	do {
		i_assert(root->parent_root_idx1 <= count);
		root = &roots[root->parent_root_idx1 - 1];
	} while (root->parent_root_idx1 != 0);
	i_assert(!root->ignore);

	shadows = array_idx_modifiable(&ctx->shadow_nodes, 0);
	if (cur->dummy) {
		/* If both messages are dummies, append the current
                   message's children to the children of the message in
		   the subject table (the children of both messages
		   become siblings), and then delete the current message. */
		i_assert(root->dummy);

		idx = shadows[cur->node.idx].first_child_idx;
		while (idx != 0) {
			next_idx = shadows[idx].next_sibling_idx;
			thread_add_shadow_child(ctx, root->node.idx, idx);
			idx = next_idx;
		}

		shadows[cur->node.idx].first_child_idx = 0;
		cur->ignore = TRUE;
	} else if (root->dummy || (cur->reply_or_forward &&
				   !root->reply_or_forward)) {
		/* a) If the message in the subject table is a dummy and the
		   current message is not, make the current message a
		   child of the message in the subject table (a sibling
		   of its children).

		   b) If the current message is a reply or forward and the
		   message in the subject table is not, make the current
		   message a child of the message in the subject table (a
		   sibling of its children). */
		thread_add_shadow_child(ctx, root->node.idx, cur->node.idx);
		cur->ignore = TRUE;
	} else  {
		/* Otherwise, create a new dummy message and make both
		   the current message and the message in the subject
		   table children of the dummy.  Then replace the message
                   in the subject table with the dummy message. */
		memset(&new_root, 0, sizeof(new_root));
		new_root.root_idx1 = array_count(&ctx->roots) + 1;
		new_root.node.idx = ctx->next_new_root_idx++;
		new_root.dummy = TRUE;

		thread_add_shadow_child(ctx, new_root.node.idx, root->node.idx);
		thread_add_shadow_child(ctx, new_root.node.idx, cur->node.idx);

		root->parent_root_idx1 = new_root.root_idx1;
		root->ignore = TRUE;
		cur->ignore = TRUE;

		/* append last, since it breaks root and cur pointers */
		array_append(&ctx->roots, &new_root, 1);

		/* make sure all shadow indexes are accessible directly */
		(void)array_idx_modifiable(&ctx->shadow_nodes,
					   new_root.node.idx);
	}
}

static bool merge_subject_threads(struct thread_finish_context *ctx)
{
	struct mail_thread_root_node *roots;
	unsigned int i, count;
	bool changed = FALSE;

	roots = array_get_modifiable(&ctx->roots, &count);
	for (i = 0; i < count; i++) {
		if (roots[i].parent_root_idx1 != 0 && !roots[i].ignore) {
			mail_thread_root_thread_merge(ctx, &roots[i]);
			/* more roots may have been added */
			roots = array_idx_modifiable(&ctx->roots, 0);
			changed = TRUE;
		}
	}

	return changed;
}

static void sort_root_nodes(struct thread_finish_context *ctx)
{
	ARRAY_TYPE(mail_thread_child_node) sorted_children;
	const struct mail_thread_child_node *children;
	const struct mail_thread_shadow_node *shadows;
	struct mail_thread_root_node *roots;
	unsigned int i, count, child_count;

	i_array_init(&sorted_children, 64);
	shadows = array_idx(&ctx->shadow_nodes, 0);
	roots = array_get_modifiable(&ctx->roots, &count);
	for (i = 0; i < count; i++) {
		if (roots[i].ignore)
			continue;
		if (roots[i].dummy) {
			/* sort by the first child */
			if (shadows[roots[i].node.idx].first_child_idx == 0) {
				/* childless dummy node */
				roots[i].ignore = TRUE;
				continue;
			}
			thread_sort_children(ctx, roots[i].node.idx,
					     &sorted_children);
			children = array_get(&sorted_children, &child_count);
			if (child_count == 1) {
				/* only one child - deferred step (3).
				   promote the child to the root. */
				roots[i].node = children[0];
				thread_child_node_fill(ctx, &roots[i].node);
				roots[i].dummy = FALSE;
			} else {
				roots[i].node.uid = children[0].uid;
				roots[i].node.sort_date = children[0].sort_date;
			}
		} else {
			thread_child_node_fill(ctx, &roots[i].node);
		}
	}
	array_free(&sorted_children);
	qsort(roots, count, sizeof(*roots), mail_thread_child_node_cmp);
}

static int mail_thread_root_node_idx_cmp(const void *key, const void *value)
{
	const uint32_t *idx = key;
	const struct mail_thread_root_node *root = value;

	return *idx < root->node.idx ? -1 :
		*idx > root->node.idx ? 1 : 0;
}

static void sort_root_nodes_ref2(struct thread_finish_context *ctx,
				 uint32_t record_count)
{
	const struct mail_thread_node *node;
	struct mail_thread_root_node *roots, *root;
	struct mail_thread_child_node child;
	const struct mail_thread_shadow_node *shadows;
	unsigned int root_count;
	uint32_t idx, parent_idx;

	roots = array_get_modifiable(&ctx->roots, &root_count);

	/* drop childless dummy nodes */
	shadows = array_idx(&ctx->shadow_nodes, 0);
	for (idx = 0; idx < root_count; idx++) {
		if (roots[idx].dummy &&
		    shadows[roots[idx].node.idx].first_child_idx == 0)
			roots[idx].ignore = TRUE;
	}

	for (idx = 1; idx < record_count; idx++) {
		node = array_idx(&ctx->cache->thread_nodes, idx);
		if (!MAIL_THREAD_NODE_EXISTS(node))
			continue;

		child.idx = idx;
		thread_child_node_fill(ctx, &child);

		parent_idx = idx;
		while (node->parent_idx != 0) {
			parent_idx = node->parent_idx;
			node = array_idx(&ctx->cache->thread_nodes,
					 node->parent_idx);
		}
		root = bsearch(&parent_idx, roots, root_count, sizeof(*roots),
			       mail_thread_root_node_idx_cmp);
		i_assert(root != NULL);

		if (root->node.sort_date < child.sort_date)
			root->node.sort_date = child.sort_date;
	}
	qsort(roots, root_count, sizeof(*roots), mail_thread_child_node_cmp);
}

static void mail_thread_create_shadows(struct thread_finish_context *ctx,
				       uint32_t record_count)
{
	const struct mail_thread_node *node, *parent;
	struct mail_thread_root_node root;
	struct mail_thread_child_node child;
	uint32_t idx, parent_idx;

	ctx->use_sent_date = FALSE;

	memset(&root, 0, sizeof(root));
	memset(&child, 0, sizeof(child));

	/* We may see dummy messages without parents or children. We can't
	   free them since the nodes are in an array, but they may get reused
	   later so just leave them be. With the current algorithm when this
	   happens all the struct fields are always zero at that point, so
	   we don't even have to try to zero them. */
	for (idx = 1; idx < record_count; idx++) {
		node = array_idx(&ctx->cache->thread_nodes, idx);

		if (node->parent_idx == 0) {
			/* root node - add to roots list */
			root.node.idx = idx;
			if (!MAIL_THREAD_NODE_EXISTS(node)) {
				root.dummy = TRUE;
				root.node.uid = 0;
			} else {
				root.dummy = FALSE;
				root.node.uid = node->uid;
			}
			array_append(&ctx->roots, &root, 1);
			continue;
		}
		i_assert(node->parent_idx < record_count);

		if (!MAIL_THREAD_NODE_EXISTS(node)) {
			/* dummy node */
			continue;
		}

		/* Find the node's first non-dummy parent and add the
		   node as its child. If there are no non-dummy
		   parents, add it as the highest dummy's child. */
		parent_idx = node->parent_idx;
		parent = array_idx(&ctx->cache->thread_nodes, parent_idx);
		while (!MAIL_THREAD_NODE_EXISTS(parent) &&
		       parent->parent_idx != 0) {
			parent_idx = parent->parent_idx;
			parent = array_idx(&ctx->cache->thread_nodes,
					   parent_idx);
		}
		thread_add_shadow_child(ctx, parent_idx, idx);
	}
}

static void mail_thread_finish(struct thread_finish_context *ctx,
			       enum mail_thread_type thread_type)
{
	unsigned int record_count = array_count(&ctx->cache->thread_nodes);

	ctx->next_new_root_idx = record_count + 1;

	/* (2) save root nodes and (3) remove dummy messages */
	i_array_init(&ctx->roots, I_MIN(128, record_count));
	i_array_init(&ctx->shadow_nodes, record_count);
	/* make sure all shadow indexes are accessible directly. */
	(void)array_idx_modifiable(&ctx->shadow_nodes, record_count);

	mail_thread_create_shadows(ctx, record_count);

	/* (4) */
	ctx->use_sent_date = TRUE;
	switch (thread_type) {
	case MAIL_THREAD_REFERENCES:
		sort_root_nodes(ctx);
		/* (5) Gather together messages under the root that have
		   the same base subject text. */
		gather_base_subjects(ctx);
		/* (5.C) Merge threads with the same thread subject. */
		if (merge_subject_threads(ctx)) {
			/* root ordering may have changed, sort them again. */
			sort_root_nodes(ctx);
		}
		break;
	case MAIL_THREAD_REFS:
		sort_root_nodes_ref2(ctx, record_count);
		break;
	default:
		i_unreached();
	}
}

static void
nodes_change_uids_to_seqs(struct mail_thread_iterate_context *iter, bool root)
{
	struct mail_thread_child_node *children;
	struct mailbox *box = iter->ctx->tmp_mail->box;
	unsigned int i, count;
	uint32_t uid, seq;

	children = array_get_modifiable(&iter->children, &count);
	for (i = 0; i < count; i++) {
		uid = children[i].uid;
		if (uid == 0) {
			/* dummy root */
			if (root)
				continue;
			i_unreached();
		} else {
			mailbox_get_seq_range(box, uid, uid, &seq, &seq);
			i_assert(seq != 0);
		}
		children[i].uid = seq;
	}
}

static void
mail_thread_iterate_fill_root(struct mail_thread_iterate_context *iter)
{
	struct mail_thread_root_node *roots;
	unsigned int i, count;

	roots = array_get_modifiable(&iter->ctx->roots, &count);
	i_array_init(&iter->children, count);
	for (i = 0; i < count; i++) {
		if (!roots[i].ignore) {
			if (roots[i].dummy)
				roots[i].node.uid = 0;
			array_append(&iter->children, &roots[i].node, 1);
		}
	}
}

static struct mail_thread_iterate_context *
mail_thread_iterate_children(struct mail_thread_iterate_context *parent_iter,
			     uint32_t parent_idx)
{
	struct mail_thread_iterate_context *child_iter;

	child_iter = i_new(struct mail_thread_iterate_context, 1);
	child_iter->ctx = parent_iter->ctx;
	child_iter->ctx->refcount++;

	i_array_init(&child_iter->children, 8);
	thread_sort_children(child_iter->ctx, parent_idx,
			     &child_iter->children);
	if (child_iter->ctx->return_seqs)
		nodes_change_uids_to_seqs(child_iter, FALSE);
	return child_iter;
}

struct mail_thread_iterate_context *
mail_thread_iterate_init_full(struct mail_thread_cache *cache,
			      struct mail *tmp_mail,
			      enum mail_thread_type thread_type,
			      bool return_seqs)
{
	struct mail_thread_iterate_context *iter;
	struct thread_finish_context *ctx;

	iter = i_new(struct mail_thread_iterate_context, 1);
	ctx = iter->ctx = i_new(struct thread_finish_context, 1);
	ctx->refcount = 1;
	ctx->cache = cache;
	ctx->tmp_mail = tmp_mail;
	ctx->return_seqs = return_seqs;
	mail_thread_finish(ctx, thread_type);

	mail_thread_iterate_fill_root(iter);
	if (return_seqs)
		nodes_change_uids_to_seqs(iter, TRUE);
	return iter;
}

const struct mail_thread_child_node *
mail_thread_iterate_next(struct mail_thread_iterate_context *iter,
			 struct mail_thread_iterate_context **child_iter_r)
{
	const struct mail_thread_child_node *children, *child;
	const struct mail_thread_shadow_node *shadow;
	unsigned int count;

	children = array_get(&iter->children, &count);
	if (iter->next_idx >= count)
		return NULL;

	child = &children[iter->next_idx++];
	if (child_iter_r != NULL) {
		shadow = array_idx(&iter->ctx->shadow_nodes, child->idx);
		*child_iter_r = shadow->first_child_idx == 0 ? NULL :
			mail_thread_iterate_children(iter, child->idx);
	}
	return child;
}

unsigned int mail_thread_iterate_count(struct mail_thread_iterate_context *iter)
{
	return array_count(&iter->children);
}

int mail_thread_iterate_deinit(struct mail_thread_iterate_context **_iter)
{
	struct mail_thread_iterate_context *iter = *_iter;

	*_iter = NULL;

	if (--iter->ctx->refcount == 0) {
		array_free(&iter->ctx->roots);
		array_free(&iter->ctx->shadow_nodes);
		i_free(iter->ctx);
	}
	array_free(&iter->children);
	i_free(iter);
	return 0;
}