view src/lib-storage/mail-sort.c @ 950:ceb3ea5e1a2a HEAD

SORT optimization. It now uses memory to store one or two of the sort criteria items. Should be "fast enough" now, sorting ~4000 messages doesn't take hardly any time.
author Timo Sirainen <tss@iki.fi>
date Sat, 11 Jan 2003 19:48:25 +0200
parents 4f697dde0fca
children 411006be3c66
line wrap: on
line source

/* Copyright (C) 2002 Timo Sirainen */

/* Implementation of draft-ietf-imapext-sort-10 sorting algorithm.
   Pretty messy code actually, adding any sort types requires care.
   This is pretty fast however and takes only as much memory as needed to be
   reasonably fast. */

#include "lib.h"
#include "buffer.h"
#include "hash.h"
#include "ostream.h"
#include "imap-base-subject.h"
#include "mail-sort.h"

#include <stdlib.h>

#define IS_SORT_STRING(type) \
	((type) == MAIL_SORT_CC || (type) == MAIL_SORT_FROM || \
	 (type) == MAIL_SORT_SUBJECT || (type) == MAIL_SORT_TO)

#define IS_SORT_TIME(type) \
	((type) == MAIL_SORT_ARRIVAL || (type) == MAIL_SORT_DATE)

struct mail_sort_context {
	enum mail_sort_type output[MAX_SORT_PROGRAM_SIZE];
	enum mail_sort_type common_mask, cache_mask;

	struct ostream *outstream;
	const struct mail_sort_callbacks *callbacks;
	void *func_context;

	buffer_t *sort_buffer;
	size_t sort_element_size;

	pool_t temp_pool, str_pool;
	struct hash_table *string_table;

	time_t last_arrival, last_date;
	uoff_t last_size;
	char *last_cc, *last_from, *last_subject, *last_to;
};

static void mail_sort_flush(struct mail_sort_context *ctx);

static enum mail_sort_type
mail_sort_normalize(const enum mail_sort_type *input, buffer_t *output)
{
        enum mail_sort_type type, mask = 0;
	int pos, reverse;

	reverse = FALSE;
	for (pos = 0; *input != MAIL_SORT_END; input++) {
		if (*input == MAIL_SORT_REVERSE)
			reverse = !reverse;
		else {
			if ((mask & *input) == 0) {
				if (reverse) {
					type = MAIL_SORT_REVERSE;
					buffer_append(output,
						      &type, sizeof(type));
				}

				buffer_append(output, input, sizeof(*input));
				mask |= *input;
			}

			reverse = FALSE;
		}
	}

	type = MAIL_SORT_END;
	buffer_append(output, &type, sizeof(type));

	return mask;
}

static enum mail_sort_type
mail_sort_get_common_mask(const enum mail_sort_type *input,
			  enum mail_sort_type **output)
{
	enum mail_sort_type mask = 0;

	while (*input == **output && *input != MAIL_SORT_END) {
		if (*input != MAIL_SORT_REVERSE)
			mask |= *input;
		input++; (*output)++;
	}

	return mask;
}

struct mail_sort_context *
mail_sort_init(const enum mail_sort_type *input, enum mail_sort_type *output,
	       struct ostream *outstream,
	       const struct mail_sort_callbacks *callbacks, void *context)
{
	/* @UNSAFE */
	struct mail_sort_context *ctx;
	enum mail_sort_type norm_input[MAX_SORT_PROGRAM_SIZE];
	enum mail_sort_type norm_output[MAX_SORT_PROGRAM_SIZE];
	buffer_t *buf;
	int i;

	ctx = i_new(struct mail_sort_context, 1);
	ctx->temp_pool = pool_alloconly_create("sort temp", 8192);
	ctx->outstream = outstream;

	t_push();
	buf = buffer_create_data(data_stack_pool,
				 norm_input, sizeof(norm_input));
	mail_sort_normalize(input, buf);

	buf = buffer_create_data(data_stack_pool,
				 norm_output, sizeof(norm_output));
	mail_sort_normalize(output, buf);
	t_pop();

	/* remove the common part from output, we already know input is sorted
	   that much so we don't have to worry about it. */
	output = norm_output;
        ctx->common_mask = mail_sort_get_common_mask(norm_input, &output);

	for (i = 0; output[i] != MAIL_SORT_END; i++)
		ctx->output[i] = output[i];
	ctx->output[i] = MAIL_SORT_END;

	/* figure out what data we'd like to cache */
	ctx->sort_element_size = sizeof(unsigned int);
	ctx->cache_mask = 0;

	for (i = 0; output[i] != MAIL_SORT_END; i++) {
		if (IS_SORT_STRING(output[i])) {
			ctx->sort_element_size += sizeof(const char *);

			/* cache the second rule as well, if available */
			if (ctx->cache_mask != 0) {
				ctx->cache_mask |= output[i];
				break;
			}
			ctx->cache_mask |= output[i];
		} else if (IS_SORT_TIME(output[i])) {
			ctx->sort_element_size += sizeof(time_t);
			ctx->cache_mask |= output[i];
			break;
		} else if (output[i] == MAIL_SORT_SIZE) {
			ctx->sort_element_size += sizeof(uoff_t);
			ctx->cache_mask |= output[i];
			break;
		}
	}

	if ((ctx->cache_mask & MAIL_SORT_CC) ||
	    (ctx->cache_mask & MAIL_SORT_FROM) ||
	    (ctx->cache_mask & MAIL_SORT_TO) ||
	    (ctx->cache_mask & MAIL_SORT_SUBJECT)) {
		ctx->str_pool = pool_alloconly_create("sort str", 8192);
		ctx->string_table = hash_create(default_pool, ctx->str_pool,
						0, str_hash,
						(HashCompareFunc)strcmp);
	}

	ctx->sort_buffer = buffer_create_dynamic(system_pool,
						 128 * ctx->sort_element_size,
						 (size_t)-1);

	ctx->callbacks = callbacks;
	ctx->func_context = context;
	return ctx;
}

void mail_sort_deinit(struct mail_sort_context *ctx)
{
	mail_sort_flush(ctx);

	if (ctx->string_table != NULL)
		hash_destroy(ctx->string_table);
	if (ctx->str_pool != NULL)
		pool_unref(ctx->str_pool);
	buffer_free(ctx->sort_buffer);
	pool_unref(ctx->temp_pool);

	i_free(ctx->last_cc);
	i_free(ctx->last_from);
	i_free(ctx->last_subject);
	i_free(ctx->last_to);

	i_free(ctx);
}

static const char *string_table_get(struct mail_sort_context *ctx,
				    const char *str)
{
	char *value;

	if (str == NULL)
		return NULL;
	if (*str == '\0')
		return "";

	value = hash_lookup(ctx->string_table, str);
	if (value == NULL) {
		value = p_strdup(ctx->str_pool, str);
		hash_insert(ctx->string_table, value, value);
	}

	return value;
}

static void mail_sort_check_flush(struct mail_sort_context *ctx,
				  unsigned int id)
{
	const char *str;
	time_t t;
	uoff_t size;
	int changed = FALSE;

	if (ctx->common_mask & MAIL_SORT_ARRIVAL) {
		t = ctx->callbacks->input_time(MAIL_SORT_ARRIVAL, id,
					       ctx->func_context);
		if (t != ctx->last_arrival) {
			ctx->last_arrival = t;
			changed = TRUE;
		}
	}

	if (ctx->common_mask & MAIL_SORT_CC) {
		str = ctx->callbacks->input_mailbox(MAIL_SORT_CC, id,
						    ctx->func_context);
		str = str_ucase(t_strdup_noconst(str));
		if (strcmp(str, ctx->last_cc) != 0) {
			i_free(ctx->last_cc);
			ctx->last_cc = i_strdup(str);
			changed = TRUE;
		}
	}

	if (ctx->common_mask & MAIL_SORT_DATE) {
		t = ctx->callbacks->input_time(MAIL_SORT_DATE, id,
					       ctx->func_context);
		if (t != ctx->last_date) {
			ctx->last_date = t;
			changed = TRUE;
		}
	}

	if (ctx->common_mask & MAIL_SORT_FROM) {
		str = ctx->callbacks->input_mailbox(MAIL_SORT_FROM, id,
						    ctx->func_context);
		str = str_ucase(t_strdup_noconst(str));
		if (strcmp(str, ctx->last_from) != 0) {
			i_free(ctx->last_from);
			ctx->last_from = i_strdup(str);
			changed = TRUE;
		}
	}

	if (ctx->common_mask & MAIL_SORT_SIZE) {
		size = ctx->callbacks->input_uofft(MAIL_SORT_SIZE, id,
						   ctx->func_context);
		if (size != ctx->last_size) {
			ctx->last_size = size;
			changed = TRUE;
		}
	}

	if (ctx->common_mask & MAIL_SORT_SUBJECT) {
		str = ctx->callbacks->input_str(MAIL_SORT_SUBJECT, id,
						ctx->func_context);
		p_clear(ctx->temp_pool);
		str = imap_get_base_subject_cased(ctx->temp_pool, str, NULL);

		if (strcmp(str, ctx->last_subject) != 0) {
			i_free(ctx->last_subject);
			ctx->last_subject = i_strdup(str);
			changed = TRUE;
		}
	}

	if (ctx->common_mask & MAIL_SORT_TO) {
		str = ctx->callbacks->input_mailbox(MAIL_SORT_TO, id,
						    ctx->func_context);
		str = str_ucase(t_strdup_noconst(str));
		if (strcmp(str, ctx->last_to) != 0) {
			i_free(ctx->last_to);
			ctx->last_to = i_strdup(str);
			changed = TRUE;
		}
	}

	if (changed)
		mail_sort_flush(ctx);
}

void mail_sort_input(struct mail_sort_context *ctx, unsigned int id)
{
	/* @UNSAFE */
	unsigned char *buf;
	time_t t;
	uoff_t size;
	const char *str;
	size_t pos;

	t_push();
	if (ctx->common_mask != 0)
		mail_sort_check_flush(ctx, id);

	buf = buffer_append_space(ctx->sort_buffer, ctx->sort_element_size);
	memcpy(buf, &id, sizeof(id)); pos = sizeof(id);

	if (ctx->cache_mask & MAIL_SORT_ARRIVAL) {
		if (ctx->common_mask & MAIL_SORT_ARRIVAL)
			t = ctx->last_arrival;
		else {
			t = ctx->callbacks->input_time(MAIL_SORT_ARRIVAL, id,
						       ctx->func_context);
		}
		memcpy(buf + pos, &t, sizeof(t)); pos += sizeof(t);
	}

	if (ctx->cache_mask & MAIL_SORT_DATE) {
		if (ctx->common_mask & MAIL_SORT_DATE)
			t = ctx->last_date;
		else {
			t = ctx->callbacks->input_time(MAIL_SORT_DATE, id,
						       ctx->func_context);
		}
		memcpy(buf + pos, &t, sizeof(t)); pos += sizeof(t);
	}

	if (ctx->cache_mask & MAIL_SORT_SIZE) {
		if (ctx->common_mask & MAIL_SORT_SIZE)
			size = ctx->last_size;
		else {
			size = ctx->callbacks->input_uofft(MAIL_SORT_SIZE, id,
							   ctx->func_context);
		}

		memcpy(buf + pos, &size, sizeof(size)); pos += sizeof(size);
	}

	if (ctx->cache_mask & MAIL_SORT_CC) {
		if (ctx->common_mask & MAIL_SORT_CC)
			str = ctx->last_cc;
		else {
			str = ctx->callbacks->input_mailbox(MAIL_SORT_CC, id,
							    ctx->func_context);
			if (str != NULL)
				str = str_ucase(t_strdup_noconst(str));
		}
		str = string_table_get(ctx, str);

		memcpy(buf + pos, &str, sizeof(const char *));
		pos += sizeof(const char *);
	}

	if (ctx->cache_mask & MAIL_SORT_FROM) {
		if (ctx->common_mask & MAIL_SORT_FROM)
			str = ctx->last_from;
		else {
			str = ctx->callbacks->input_mailbox(MAIL_SORT_FROM, id,
							    ctx->func_context);
			if (str != NULL)
				str = str_ucase(t_strdup_noconst(str));
		}
		str = string_table_get(ctx, str);

		memcpy(buf + pos, &str, sizeof(const char *));
		pos += sizeof(const char *);
	}

	if (ctx->cache_mask & MAIL_SORT_TO) {
		if (ctx->common_mask & MAIL_SORT_TO)
			str = ctx->last_to;
		else {
			str = ctx->callbacks->input_mailbox(MAIL_SORT_TO, id,
							    ctx->func_context);
			if (str != NULL)
				str = str_ucase(t_strdup_noconst(str));
		}
		str = string_table_get(ctx, str);

		memcpy(buf + pos, &str, sizeof(const char *));
		pos += sizeof(const char *);
	}

	if (ctx->cache_mask & MAIL_SORT_SUBJECT) {
		if (ctx->common_mask & MAIL_SORT_SUBJECT)
			str = ctx->last_subject;
		else {
			str = ctx->callbacks->input_str(MAIL_SORT_SUBJECT, id,
							ctx->func_context);
			p_clear(ctx->temp_pool);
			str = imap_get_base_subject_cased(ctx->temp_pool,
							  str, NULL);
		}
		str = string_table_get(ctx, str);

		memcpy(buf + pos, &str, sizeof(const char *));
		pos += sizeof(const char *);
	}

	i_assert(pos == ctx->sort_element_size);

	t_pop();
}

static struct mail_sort_context *qsort_context;

static time_t get_time(enum mail_sort_type type, const unsigned char *buf,
		       struct mail_sort_context *ctx)
{
	time_t t;

	if ((ctx->cache_mask & type) == 0) {
		return ctx->callbacks->
			input_time(type, *((unsigned int *) buf),
				   ctx->func_context);
	}

	/* use memcpy() to avoid any alignment problems */
	memcpy(&t, buf + sizeof(unsigned int), sizeof(t));
	return t;
}

static time_t get_uofft(enum mail_sort_type type, const unsigned char *buf,
			struct mail_sort_context *ctx)
{
	uoff_t size;

	if ((ctx->cache_mask & type) == 0) {
		return ctx->callbacks->
			input_uofft(type, *((unsigned int *) buf),
				    ctx->func_context);
	}

	/* use memcpy() to avoid any alignment problems */
	memcpy(&size, buf + sizeof(unsigned int), sizeof(size));
	return size;
}
static const char *get_str(enum mail_sort_type type, const unsigned char *buf,
			   struct mail_sort_context *ctx)
{
	const char *str;
	enum mail_sort_type type2;
	int pos;

	if ((ctx->cache_mask & type) == 0) {
		unsigned int id = *((unsigned int *) buf);

		if (type == MAIL_SORT_SUBJECT) {
			str = ctx->callbacks->input_str(MAIL_SORT_SUBJECT, id,
							ctx->func_context);
			p_clear(ctx->temp_pool);
			str = imap_get_base_subject_cased(ctx->temp_pool,
							  str, NULL);
		} else {
			str = ctx->callbacks->input_mailbox(type, id,
							    ctx->func_context);
			if (str != NULL)
				str = str_ucase(t_strdup_noconst(str));

		}
		return str;
	}

	/* figure out where it is. pretty ugly. */
	type2 = (ctx->cache_mask & ~type);

	if (type2 == 0)
		pos = 0;
	else if (IS_SORT_TIME(type2))
		pos = sizeof(time_t);
	else if (type2 == MAIL_SORT_SIZE)
		pos = sizeof(uoff_t);
	else {
		if (type == MAIL_SORT_SUBJECT)
			pos = sizeof(const char *);
		else if (type2 != MAIL_SORT_SUBJECT && type > type2)
			pos = sizeof(const char *);
		else
			pos = 0;
	}

	/* use memcpy() to avoid any alignment problems */
	memcpy(&str, buf + pos + sizeof(unsigned int), sizeof(const char *));
	return str;
}

static int mail_sort_qsort_func(const void *p1, const void *p2)
{
	enum mail_sort_type *output;
	int ret, reverse = FALSE;

	output = qsort_context->output;

	t_push();

	ret = 0;
	for (; *output != MAIL_SORT_END && ret == 0; output++) {
		if (*output == MAIL_SORT_REVERSE) {
			reverse = !reverse;
			continue;
		}

		switch (*output) {
		case MAIL_SORT_ARRIVAL:
		case MAIL_SORT_DATE: {
			time_t r1, r2;

			r1 = get_time(*output, p1, qsort_context);
			r2 = get_time(*output, p2, qsort_context);
			ret = r1 < r2 ? -1 : r1 > r2 ? 1 : 0;
			break;
		}
		case MAIL_SORT_SIZE: {
			uoff_t r1, r2;

			r1 = get_uofft(*output, p1, qsort_context);
			r2 = get_uofft(*output, p2, qsort_context);
			ret = r1 < r2 ? -1 : r1 > r2 ? 1 : 0;
			break;
		}
		case MAIL_SORT_CC:
		case MAIL_SORT_FROM:
		case MAIL_SORT_TO:
		case MAIL_SORT_SUBJECT: {
			const char *s1, *s2;

			s1 = get_str(*output, p1, qsort_context);
			s2 = get_str(*output, p2, qsort_context);
			if (s1 == NULL)
				ret = s2 == NULL ? 0 : -1;
			else if (s2 == NULL)
				ret = 1;
			else
				ret = strcmp(s1, s2);
			break;
		}
		default:
			i_unreached();
		}

		if (reverse) {
			if (ret > 0)
				ret = -1;
			else if (ret < 0)
				ret = 1;
		}

		reverse = FALSE;
	}

	qsort_context->callbacks->input_reset(qsort_context->func_context);

	t_pop();

	return ret != 0 ? ret :
		(*((unsigned int *) p1) < *((unsigned int *) p2) ? -1 : 1);
}

static void mail_sort_flush(struct mail_sort_context *ctx)
{
	unsigned char *arr;
	size_t i, count;

	qsort_context = ctx;

	arr = buffer_get_modifyable_data(ctx->sort_buffer, NULL);
	count = buffer_get_used_size(ctx->sort_buffer) / ctx->sort_element_size;
	qsort(arr, count, ctx->sort_element_size, mail_sort_qsort_func);

	for (i = 0; i < count; i++, arr += ctx->sort_element_size) {
		unsigned int id = *((unsigned int *) arr);

		t_push();
		o_stream_send(ctx->outstream, " ", 1);
		o_stream_send_str(ctx->outstream, dec2str(id));
		t_pop();
	}

	buffer_set_used_size(ctx->sort_buffer, 0);

	if (ctx->string_table != NULL) {
		hash_clear(ctx->string_table, TRUE);
		p_clear(ctx->str_pool);
	}
}