changeset 950:ceb3ea5e1a2a HEAD

SORT optimization. It now uses memory to store one or two of the sort criteria items. Should be "fast enough" now, sorting ~4000 messages doesn't take hardly any time.
author Timo Sirainen <tss@iki.fi>
date Sat, 11 Jan 2003 19:48:25 +0200
parents e601f13d95b1
children 6f1e87a51872
files src/lib-storage/index/index-search.c src/lib-storage/index/index-sort.c src/lib-storage/mail-sort.c src/lib-storage/mail-sort.h
diffstat 4 files changed, 322 insertions(+), 89 deletions(-) [+]
line wrap: on
line diff
--- a/src/lib-storage/index/index-search.c	Sat Jan 11 19:42:55 2003 +0200
+++ b/src/lib-storage/index/index-search.c	Sat Jan 11 19:48:25 2003 +0200
@@ -948,7 +948,7 @@
 		index_sort_ctx.output = output;
 
 		thread_ctx = NULL;
-		sort_ctx = mail_sort_init(sort_unsorted, sorting,
+		sort_ctx = mail_sort_init(sort_unsorted, sorting, output,
 					  &index_sort_callbacks,
 					  &index_sort_ctx);
 		o_stream_send_str(output, "* SORT");
--- a/src/lib-storage/index/index-sort.c	Sat Jan 11 19:42:55 2003 +0200
+++ b/src/lib-storage/index/index-sort.c	Sat Jan 11 19:48:25 2003 +0200
@@ -154,25 +154,10 @@
 	ctx->cached = FALSE;
 }
 
-static void _output(unsigned int *data, size_t count, void *context)
-{
-	struct index_sort_context *ctx = context;
-	size_t i;
-
-	/* FIXME: works only with UIDs! */
-	for (i = 0; i < count; i++) {
-		t_push();
-		o_stream_send(ctx->output, " ", 1);
-		o_stream_send_str(ctx->output, dec2str(data[i]));
-		t_pop();
-	}
-}
-
 struct mail_sort_callbacks index_sort_callbacks = {
 	_input_time,
 	_input_uofft,
 	_input_mailbox,
 	_input_str,
-	_input_reset,
-	_output
+	_input_reset
 };
--- a/src/lib-storage/mail-sort.c	Sat Jan 11 19:42:55 2003 +0200
+++ b/src/lib-storage/mail-sort.c	Sat Jan 11 19:48:25 2003 +0200
@@ -1,24 +1,39 @@
 /* Copyright (C) 2002 Timo Sirainen */
 
-/* Implementation of draft-ietf-imapext-sort-10 sorting algorithm */
+/* Implementation of draft-ietf-imapext-sort-10 sorting algorithm.
+   Pretty messy code actually, adding any sort types requires care.
+   This is pretty fast however and takes only as much memory as needed to be
+   reasonably fast. */
 
 #include "lib.h"
 #include "buffer.h"
+#include "hash.h"
 #include "ostream.h"
 #include "imap-base-subject.h"
 #include "mail-sort.h"
 
 #include <stdlib.h>
 
+#define IS_SORT_STRING(type) \
+	((type) == MAIL_SORT_CC || (type) == MAIL_SORT_FROM || \
+	 (type) == MAIL_SORT_SUBJECT || (type) == MAIL_SORT_TO)
+
+#define IS_SORT_TIME(type) \
+	((type) == MAIL_SORT_ARRIVAL || (type) == MAIL_SORT_DATE)
+
 struct mail_sort_context {
 	enum mail_sort_type output[MAX_SORT_PROGRAM_SIZE];
-	enum mail_sort_type common_mask;
+	enum mail_sort_type common_mask, cache_mask;
 
+	struct ostream *outstream;
 	const struct mail_sort_callbacks *callbacks;
 	void *func_context;
 
 	buffer_t *sort_buffer;
-	pool_t temp_pool;
+	size_t sort_element_size;
+
+	pool_t temp_pool, str_pool;
+	struct hash_table *string_table;
 
 	time_t last_arrival, last_date;
 	uoff_t last_size;
@@ -76,8 +91,10 @@
 
 struct mail_sort_context *
 mail_sort_init(const enum mail_sort_type *input, enum mail_sort_type *output,
+	       struct ostream *outstream,
 	       const struct mail_sort_callbacks *callbacks, void *context)
 {
+	/* @UNSAFE */
 	struct mail_sort_context *ctx;
 	enum mail_sort_type norm_input[MAX_SORT_PROGRAM_SIZE];
 	enum mail_sort_type norm_output[MAX_SORT_PROGRAM_SIZE];
@@ -85,6 +102,8 @@
 	int i;
 
 	ctx = i_new(struct mail_sort_context, 1);
+	ctx->temp_pool = pool_alloconly_create("sort temp", 8192);
+	ctx->outstream = outstream;
 
 	t_push();
 	buf = buffer_create_data(data_stack_pool,
@@ -105,11 +124,45 @@
 		ctx->output[i] = output[i];
 	ctx->output[i] = MAIL_SORT_END;
 
+	/* figure out what data we'd like to cache */
+	ctx->sort_element_size = sizeof(unsigned int);
+	ctx->cache_mask = 0;
+
+	for (i = 0; output[i] != MAIL_SORT_END; i++) {
+		if (IS_SORT_STRING(output[i])) {
+			ctx->sort_element_size += sizeof(const char *);
+
+			/* cache the second rule as well, if available */
+			if (ctx->cache_mask != 0) {
+				ctx->cache_mask |= output[i];
+				break;
+			}
+			ctx->cache_mask |= output[i];
+		} else if (IS_SORT_TIME(output[i])) {
+			ctx->sort_element_size += sizeof(time_t);
+			ctx->cache_mask |= output[i];
+			break;
+		} else if (output[i] == MAIL_SORT_SIZE) {
+			ctx->sort_element_size += sizeof(uoff_t);
+			ctx->cache_mask |= output[i];
+			break;
+		}
+	}
+
+	if ((ctx->cache_mask & MAIL_SORT_CC) ||
+	    (ctx->cache_mask & MAIL_SORT_FROM) ||
+	    (ctx->cache_mask & MAIL_SORT_TO) ||
+	    (ctx->cache_mask & MAIL_SORT_SUBJECT)) {
+		ctx->str_pool = pool_alloconly_create("sort str", 8192);
+		ctx->string_table = hash_create(default_pool, ctx->str_pool,
+						0, str_hash,
+						(HashCompareFunc)strcmp);
+	}
+
 	ctx->sort_buffer = buffer_create_dynamic(system_pool,
-						 128 * sizeof(unsigned int),
+						 128 * ctx->sort_element_size,
 						 (size_t)-1);
 
-	ctx->temp_pool = pool_alloconly_create("Sort", 8192);
 	ctx->callbacks = callbacks;
 	ctx->func_context = context;
 	return ctx;
@@ -118,6 +171,11 @@
 void mail_sort_deinit(struct mail_sort_context *ctx)
 {
 	mail_sort_flush(ctx);
+
+	if (ctx->string_table != NULL)
+		hash_destroy(ctx->string_table);
+	if (ctx->str_pool != NULL)
+		pool_unref(ctx->str_pool);
 	buffer_free(ctx->sort_buffer);
 	pool_unref(ctx->temp_pool);
 
@@ -129,31 +187,23 @@
 	i_free(ctx);
 }
 
-static int addr_strcmp(const char *s1, const char *s2)
+static const char *string_table_get(struct mail_sort_context *ctx,
+				    const char *str)
 {
-	if (s1 == NULL)
-		return s2 == NULL ? 0 : -1;
-	if (s2 == NULL)
-		return 1;
-
-	/* FIXME: maybe create ascii_strcasecmp()? strcasecmp() may compare
-	   non-ASCII too if locale is set. We don't do that now though. */
-	return strcasecmp(s1, s2);
-}
+	char *value;
 
-static int subject_cmp(pool_t pool, const char *s1, const char *s2)
-{
-	int ret;
+	if (str == NULL)
+		return NULL;
+	if (*str == '\0')
+		return "";
 
-	if (s1 == NULL)
-		return s2 == NULL ? 0 : -1;
-	if (s2 == NULL)
-		return 1;
+	value = hash_lookup(ctx->string_table, str);
+	if (value == NULL) {
+		value = p_strdup(ctx->str_pool, str);
+		hash_insert(ctx->string_table, value, value);
+	}
 
-	p_clear(pool);
-	ret = strcmp(imap_get_base_subject_cased(pool, s1, NULL),
-		     imap_get_base_subject_cased(pool, s2, NULL));
-	return ret;
+	return value;
 }
 
 static void mail_sort_check_flush(struct mail_sort_context *ctx,
@@ -174,9 +224,10 @@
 	}
 
 	if (ctx->common_mask & MAIL_SORT_CC) {
-		str = ctx->callbacks->input_str(MAIL_SORT_CC, id,
-						ctx->func_context);
-		if (addr_strcmp(str, ctx->last_cc) != 0) {
+		str = ctx->callbacks->input_mailbox(MAIL_SORT_CC, id,
+						    ctx->func_context);
+		str = str_ucase(t_strdup_noconst(str));
+		if (strcmp(str, ctx->last_cc) != 0) {
 			i_free(ctx->last_cc);
 			ctx->last_cc = i_strdup(str);
 			changed = TRUE;
@@ -193,9 +244,10 @@
 	}
 
 	if (ctx->common_mask & MAIL_SORT_FROM) {
-		str = ctx->callbacks->input_str(MAIL_SORT_FROM, id,
-						ctx->func_context);
-		if (addr_strcmp(str, ctx->last_from) != 0) {
+		str = ctx->callbacks->input_mailbox(MAIL_SORT_FROM, id,
+						    ctx->func_context);
+		str = str_ucase(t_strdup_noconst(str));
+		if (strcmp(str, ctx->last_from) != 0) {
 			i_free(ctx->last_from);
 			ctx->last_from = i_strdup(str);
 			changed = TRUE;
@@ -203,8 +255,8 @@
 	}
 
 	if (ctx->common_mask & MAIL_SORT_SIZE) {
-		size = ctx->callbacks->input_time(MAIL_SORT_SIZE, id,
-						  ctx->func_context);
+		size = ctx->callbacks->input_uofft(MAIL_SORT_SIZE, id,
+						   ctx->func_context);
 		if (size != ctx->last_size) {
 			ctx->last_size = size;
 			changed = TRUE;
@@ -214,7 +266,10 @@
 	if (ctx->common_mask & MAIL_SORT_SUBJECT) {
 		str = ctx->callbacks->input_str(MAIL_SORT_SUBJECT, id,
 						ctx->func_context);
-		if (subject_cmp(ctx->temp_pool, str, ctx->last_subject) != 0) {
+		p_clear(ctx->temp_pool);
+		str = imap_get_base_subject_cased(ctx->temp_pool, str, NULL);
+
+		if (strcmp(str, ctx->last_subject) != 0) {
 			i_free(ctx->last_subject);
 			ctx->last_subject = i_strdup(str);
 			changed = TRUE;
@@ -222,9 +277,10 @@
 	}
 
 	if (ctx->common_mask & MAIL_SORT_TO) {
-		str = ctx->callbacks->input_str(MAIL_SORT_TO, id,
-						ctx->func_context);
-		if (addr_strcmp(str, ctx->last_to) != 0) {
+		str = ctx->callbacks->input_mailbox(MAIL_SORT_TO, id,
+						    ctx->func_context);
+		str = str_ucase(t_strdup_noconst(str));
+		if (strcmp(str, ctx->last_to) != 0) {
 			i_free(ctx->last_to);
 			ctx->last_to = i_strdup(str);
 			changed = TRUE;
@@ -237,26 +293,205 @@
 
 void mail_sort_input(struct mail_sort_context *ctx, unsigned int id)
 {
+	/* @UNSAFE */
+	unsigned char *buf;
+	time_t t;
+	uoff_t size;
+	const char *str;
+	size_t pos;
+
+	t_push();
 	if (ctx->common_mask != 0)
 		mail_sort_check_flush(ctx, id);
 
-	buffer_append(ctx->sort_buffer, &id, sizeof(id));
+	buf = buffer_append_space(ctx->sort_buffer, ctx->sort_element_size);
+	memcpy(buf, &id, sizeof(id)); pos = sizeof(id);
+
+	if (ctx->cache_mask & MAIL_SORT_ARRIVAL) {
+		if (ctx->common_mask & MAIL_SORT_ARRIVAL)
+			t = ctx->last_arrival;
+		else {
+			t = ctx->callbacks->input_time(MAIL_SORT_ARRIVAL, id,
+						       ctx->func_context);
+		}
+		memcpy(buf + pos, &t, sizeof(t)); pos += sizeof(t);
+	}
+
+	if (ctx->cache_mask & MAIL_SORT_DATE) {
+		if (ctx->common_mask & MAIL_SORT_DATE)
+			t = ctx->last_date;
+		else {
+			t = ctx->callbacks->input_time(MAIL_SORT_DATE, id,
+						       ctx->func_context);
+		}
+		memcpy(buf + pos, &t, sizeof(t)); pos += sizeof(t);
+	}
+
+	if (ctx->cache_mask & MAIL_SORT_SIZE) {
+		if (ctx->common_mask & MAIL_SORT_SIZE)
+			size = ctx->last_size;
+		else {
+			size = ctx->callbacks->input_uofft(MAIL_SORT_SIZE, id,
+							   ctx->func_context);
+		}
+
+		memcpy(buf + pos, &size, sizeof(size)); pos += sizeof(size);
+	}
+
+	if (ctx->cache_mask & MAIL_SORT_CC) {
+		if (ctx->common_mask & MAIL_SORT_CC)
+			str = ctx->last_cc;
+		else {
+			str = ctx->callbacks->input_mailbox(MAIL_SORT_CC, id,
+							    ctx->func_context);
+			if (str != NULL)
+				str = str_ucase(t_strdup_noconst(str));
+		}
+		str = string_table_get(ctx, str);
+
+		memcpy(buf + pos, &str, sizeof(const char *));
+		pos += sizeof(const char *);
+	}
+
+	if (ctx->cache_mask & MAIL_SORT_FROM) {
+		if (ctx->common_mask & MAIL_SORT_FROM)
+			str = ctx->last_from;
+		else {
+			str = ctx->callbacks->input_mailbox(MAIL_SORT_FROM, id,
+							    ctx->func_context);
+			if (str != NULL)
+				str = str_ucase(t_strdup_noconst(str));
+		}
+		str = string_table_get(ctx, str);
+
+		memcpy(buf + pos, &str, sizeof(const char *));
+		pos += sizeof(const char *);
+	}
+
+	if (ctx->cache_mask & MAIL_SORT_TO) {
+		if (ctx->common_mask & MAIL_SORT_TO)
+			str = ctx->last_to;
+		else {
+			str = ctx->callbacks->input_mailbox(MAIL_SORT_TO, id,
+							    ctx->func_context);
+			if (str != NULL)
+				str = str_ucase(t_strdup_noconst(str));
+		}
+		str = string_table_get(ctx, str);
+
+		memcpy(buf + pos, &str, sizeof(const char *));
+		pos += sizeof(const char *);
+	}
+
+	if (ctx->cache_mask & MAIL_SORT_SUBJECT) {
+		if (ctx->common_mask & MAIL_SORT_SUBJECT)
+			str = ctx->last_subject;
+		else {
+			str = ctx->callbacks->input_str(MAIL_SORT_SUBJECT, id,
+							ctx->func_context);
+			p_clear(ctx->temp_pool);
+			str = imap_get_base_subject_cased(ctx->temp_pool,
+							  str, NULL);
+		}
+		str = string_table_get(ctx, str);
+
+		memcpy(buf + pos, &str, sizeof(const char *));
+		pos += sizeof(const char *);
+	}
+
+	i_assert(pos == ctx->sort_element_size);
+
+	t_pop();
 }
 
-static struct mail_sort_context *mail_sort_qsort_context;
+static struct mail_sort_context *qsort_context;
+
+static time_t get_time(enum mail_sort_type type, const unsigned char *buf,
+		       struct mail_sort_context *ctx)
+{
+	time_t t;
+
+	if ((ctx->cache_mask & type) == 0) {
+		return ctx->callbacks->
+			input_time(type, *((unsigned int *) buf),
+				   ctx->func_context);
+	}
+
+	/* use memcpy() to avoid any alignment problems */
+	memcpy(&t, buf + sizeof(unsigned int), sizeof(t));
+	return t;
+}
+
+static time_t get_uofft(enum mail_sort_type type, const unsigned char *buf,
+			struct mail_sort_context *ctx)
+{
+	uoff_t size;
+
+	if ((ctx->cache_mask & type) == 0) {
+		return ctx->callbacks->
+			input_uofft(type, *((unsigned int *) buf),
+				    ctx->func_context);
+	}
+
+	/* use memcpy() to avoid any alignment problems */
+	memcpy(&size, buf + sizeof(unsigned int), sizeof(size));
+	return size;
+}
+static const char *get_str(enum mail_sort_type type, const unsigned char *buf,
+			   struct mail_sort_context *ctx)
+{
+	const char *str;
+	enum mail_sort_type type2;
+	int pos;
+
+	if ((ctx->cache_mask & type) == 0) {
+		unsigned int id = *((unsigned int *) buf);
+
+		if (type == MAIL_SORT_SUBJECT) {
+			str = ctx->callbacks->input_str(MAIL_SORT_SUBJECT, id,
+							ctx->func_context);
+			p_clear(ctx->temp_pool);
+			str = imap_get_base_subject_cased(ctx->temp_pool,
+							  str, NULL);
+		} else {
+			str = ctx->callbacks->input_mailbox(type, id,
+							    ctx->func_context);
+			if (str != NULL)
+				str = str_ucase(t_strdup_noconst(str));
+
+		}
+		return str;
+	}
+
+	/* figure out where it is. pretty ugly. */
+	type2 = (ctx->cache_mask & ~type);
+
+	if (type2 == 0)
+		pos = 0;
+	else if (IS_SORT_TIME(type2))
+		pos = sizeof(time_t);
+	else if (type2 == MAIL_SORT_SIZE)
+		pos = sizeof(uoff_t);
+	else {
+		if (type == MAIL_SORT_SUBJECT)
+			pos = sizeof(const char *);
+		else if (type2 != MAIL_SORT_SUBJECT && type > type2)
+			pos = sizeof(const char *);
+		else
+			pos = 0;
+	}
+
+	/* use memcpy() to avoid any alignment problems */
+	memcpy(&str, buf + pos + sizeof(unsigned int), sizeof(const char *));
+	return str;
+}
 
 static int mail_sort_qsort_func(const void *p1, const void *p2)
 {
-	const unsigned int *i1 = p1;
-	const unsigned int *i2 = p2;
 	enum mail_sort_type *output;
-        const struct mail_sort_callbacks *cb;
-	void *ctx;
 	int ret, reverse = FALSE;
 
-	output = mail_sort_qsort_context->output;
-	cb = mail_sort_qsort_context->callbacks;
-	ctx = mail_sort_qsort_context->func_context;
+	output = qsort_context->output;
 
 	t_push();
 
@@ -272,34 +507,35 @@
 		case MAIL_SORT_DATE: {
 			time_t r1, r2;
 
-			r1 = cb->input_time(*output, *i1, ctx);
-			r2 = cb->input_time(*output, *i2, ctx);
+			r1 = get_time(*output, p1, qsort_context);
+			r2 = get_time(*output, p2, qsort_context);
 			ret = r1 < r2 ? -1 : r1 > r2 ? 1 : 0;
 			break;
 		}
 		case MAIL_SORT_SIZE: {
 			uoff_t r1, r2;
 
-			r1 = cb->input_uofft(*output, *i1, ctx);
-			r2 = cb->input_uofft(*output, *i2, ctx);
+			r1 = get_uofft(*output, p1, qsort_context);
+			r2 = get_uofft(*output, p2, qsort_context);
 			ret = r1 < r2 ? -1 : r1 > r2 ? 1 : 0;
 			break;
 		}
 		case MAIL_SORT_CC:
 		case MAIL_SORT_FROM:
-		case MAIL_SORT_TO: {
-			const char *a1, *a2;
+		case MAIL_SORT_TO:
+		case MAIL_SORT_SUBJECT: {
+			const char *s1, *s2;
 
-			a1 = cb->input_mailbox(*output, *i1, ctx);
-			a2 = cb->input_mailbox(*output, *i2, ctx);
-			ret = addr_strcmp(a1, a2);
+			s1 = get_str(*output, p1, qsort_context);
+			s2 = get_str(*output, p2, qsort_context);
+			if (s1 == NULL)
+				ret = s2 == NULL ? 0 : -1;
+			else if (s2 == NULL)
+				ret = 1;
+			else
+				ret = strcmp(s1, s2);
 			break;
 		}
-		case MAIL_SORT_SUBJECT:
-			ret = subject_cmp(mail_sort_qsort_context->temp_pool,
-					  cb->input_str(*output, *i1, ctx),
-					  cb->input_str(*output, *i2, ctx));
-			break;
 		default:
 			i_unreached();
 		}
@@ -314,24 +550,38 @@
 		reverse = FALSE;
 	}
 
-	cb->input_reset(ctx);
+	qsort_context->callbacks->input_reset(qsort_context->func_context);
 
 	t_pop();
 
-	return ret != 0 ? ret : (*i1 < *i2 ? -1 : 1);
+	return ret != 0 ? ret :
+		(*((unsigned int *) p1) < *((unsigned int *) p2) ? -1 : 1);
 }
 
 static void mail_sort_flush(struct mail_sort_context *ctx)
 {
-	unsigned int *arr;
-	size_t count;
+	unsigned char *arr;
+	size_t i, count;
 
-	mail_sort_qsort_context = ctx;
+	qsort_context = ctx;
 
 	arr = buffer_get_modifyable_data(ctx->sort_buffer, NULL);
-	count = buffer_get_used_size(ctx->sort_buffer) / sizeof(unsigned int);
-	qsort(arr, count, sizeof(unsigned int), mail_sort_qsort_func);
+	count = buffer_get_used_size(ctx->sort_buffer) / ctx->sort_element_size;
+	qsort(arr, count, ctx->sort_element_size, mail_sort_qsort_func);
+
+	for (i = 0; i < count; i++, arr += ctx->sort_element_size) {
+		unsigned int id = *((unsigned int *) arr);
 
-	ctx->callbacks->output(arr, count, ctx->func_context);
+		t_push();
+		o_stream_send(ctx->outstream, " ", 1);
+		o_stream_send_str(ctx->outstream, dec2str(id));
+		t_pop();
+	}
+
 	buffer_set_used_size(ctx->sort_buffer, 0);
+
+	if (ctx->string_table != NULL) {
+		hash_clear(ctx->string_table, TRUE);
+		p_clear(ctx->str_pool);
+	}
 }
--- a/src/lib-storage/mail-sort.h	Sat Jan 11 19:42:55 2003 +0200
+++ b/src/lib-storage/mail-sort.h	Sat Jan 11 19:48:25 2003 +0200
@@ -22,9 +22,6 @@
 
 	/* done parsing this message, free all resources */
 	void (*input_reset)(void *context);
-
-	/* result callback */
-	void (*output)(unsigned int *data, size_t count, void *context);
 };
 
 /* input and output are arrays of sort programs ending with MAIL_SORT_END.
@@ -33,6 +30,7 @@
    is known, the less memory is used. */
 struct mail_sort_context *
 mail_sort_init(const enum mail_sort_type *input, enum mail_sort_type *output,
+	       struct ostream *outstream,
 	       const struct mail_sort_callbacks *callbacks, void *context);
 void mail_sort_deinit(struct mail_sort_context *ctx);