changeset 15190:60c4815778fb

lib-index: Simplified writing to dovecot.index.cache file.

The old method was basically:
 - write max. 32 kB to internal buffer
 - flush it by writing to reserved areas (with no locks)

The reserved areas were acquired by doing (whenever needed):
 - lock dovecot.index.cache
 - reserve space in dovecot.index.cache for writing, potentially increasing the file size by writing zero-filled bytes. The size of the reserved area varies.
 - unlock dovecot.index.cache

This worked, but if multiple processes were writing to the cache file, it could have left incomplete reserved areas behind as holes. An attempt was made to fill the holes if they were large enough.

The new method is:
 - write max. 256 kB to internal buffer
 - lock dovecot.index.cache
 - append the buffer to dovecot.index.cache
 - unlock dovecot.index.cache

No reserved areas, holes or anything else weird going on.

Ideally no data would be overwritten in the dovecot.index.cache file, only appended. Unfortunately, some data is currently still overwritten:
 - mail_cache_header.{deleted_space,continued_record_count}
 - mail_cache_header_fields.next_offset when writing a new one
 - mail_cache_header_fields.{last_used,decision}
 - mail_cache_record.prev_offset

The changing headers could eventually be moved to dovecot.index. This, however, is a backwards-incompatible change. The record's prev_offset could maybe simply not be written in those (somewhat rare) problematic situations.
author Timo Sirainen <tss@iki.fi>
date Thu, 04 Oct 2012 02:34:53 +0300
parents aa5c1d162714
children b9cb9c3cdfdc
files src/doveadm/doveadm-dump-index.c src/lib-index/mail-cache-compress.c src/lib-index/mail-cache-private.h src/lib-index/mail-cache-transaction.c src/lib-index/mail-cache.c
diffstat 5 files changed, 151 insertions(+), 569 deletions(-) [+]
line wrap: on
line diff
--- a/src/doveadm/doveadm-dump-index.c	Thu Oct 04 02:08:23 2012 +0300
+++ b/src/doveadm/doveadm-dump-index.c	Thu Oct 04 02:34:53 2012 +0300
@@ -324,8 +324,8 @@
 	       hdr->file_seq, unixdate2str(hdr->file_seq),
 	       hdr->file_seq - hdr->indexid);
 	printf("continued_record_count = %u\n", hdr->continued_record_count);
-	printf("hole_offset .......... = %u\n", hdr->hole_offset);
-	printf("used_file_size ....... = %u\n", hdr->used_file_size);
+	printf("hole_offset (unused) . = %u\n", hdr->unused_old_hole_offset);
+	printf("used_file_size (old) . = %u\n", hdr->backwards_compat_used_file_size);
 	printf("deleted_space ........ = %u\n", hdr->deleted_space);
 	printf("field_header_offset .. = %u (0x%08x nontranslated)\n",
 	       mail_index_offset_to_uint32(hdr->field_header_offset),
--- a/src/lib-index/mail-cache-compress.c	Thu Oct 04 02:08:23 2012 +0300
+++ b/src/lib-index/mail-cache-compress.c	Thu Oct 04 02:34:53 2012 +0300
@@ -272,7 +272,7 @@
 	mail_cache_compress_get_fields(&ctx, used_fields_count);
 	o_stream_nsend(output, ctx.buffer->data, ctx.buffer->used);
 
-	hdr.used_file_size = output->offset;
+	hdr.backwards_compat_used_file_size = output->offset;
 	buffer_free(&ctx.buffer);
 	buffer_free(&ctx.field_seen);
 
@@ -287,12 +287,6 @@
 		array_free(ext_offsets);
 		return -1;
 	}
-
-	if (hdr.used_file_size < MAIL_CACHE_INITIAL_SIZE) {
-		/* grow the file some more. doesn't matter if it fails */
-		(void)file_set_size(fd, MAIL_CACHE_INITIAL_SIZE);
-	}
-
 	o_stream_destroy(&output);
 
 	if (cache->index->fsync_mode == FSYNC_MODE_ALWAYS) {
--- a/src/lib-index/mail-cache-private.h	Thu Oct 04 02:08:23 2012 +0300
+++ b/src/lib-index/mail-cache-private.h	Thu Oct 04 02:34:53 2012 +0300
@@ -13,9 +13,6 @@
 /* Never compress the file if it's smaller than this */
 #define MAIL_CACHE_COMPRESS_MIN_SIZE (1024*50)
 
-/* Don't bother remembering holes smaller than this */
-#define MAIL_CACHE_MIN_HOLE_SIZE 1024
-
 /* Compress the file when deleted space reaches n% of total size */
 #define MAIL_CACHE_COMPRESS_PERCENTAGE 20
 
@@ -27,15 +24,6 @@
    the latest cache header. */
 #define MAIL_CACHE_HEADER_FIELD_CONTINUE_COUNT 4
 
-/* Initial size for the file */
-#define MAIL_CACHE_INITIAL_SIZE (sizeof(struct mail_cache_header) + 10240)
-
-/* When more space is needed, grow the file n% larger than the previous size */
-#define MAIL_CACHE_GROW_PERCENTAGE 10
-
-/* When allocating space for transactions, don't use blocks larger than this. */
-#define MAIL_CACHE_MAX_RESERVED_BLOCK_SIZE (1024*512)
-
 #define MAIL_CACHE_LOCK_TIMEOUT 10
 #define MAIL_CACHE_LOCK_CHANGE_TIMEOUT 300
 
@@ -58,8 +46,8 @@
 
 	uint32_t continued_record_count;
 
-	uint32_t hole_offset;
-	uint32_t used_file_size;
+	uint32_t unused_old_hole_offset;
+	uint32_t backwards_compat_used_file_size;
 	uint32_t deleted_space;
 
 	uint32_t field_header_offset;
@@ -102,17 +90,6 @@
 	/* array of { uint32_t field; [ uint32_t size; ] { .. } } */
 };
 
-struct mail_cache_hole_header {
-	uint32_t next_offset; /* 0 if no holes left */
-	uint32_t size; /* including this header */
-
-	/* make sure we notice if we're treating hole as mail_cache_record.
-	   magic is a large number so if it's treated as size field, it'll
-	   point outside the file */
-#define MAIL_CACHE_HOLE_HEADER_MAGIC 0xffeedeff
-	uint32_t magic;
-};
-
 struct mail_cache_field_private {
 	struct mail_cache_field field;
 
@@ -230,6 +207,8 @@
 
 int mail_cache_write(struct mail_cache *cache, const void *data, size_t size,
 		     uoff_t offset);
+int mail_cache_append(struct mail_cache *cache, const void *data, size_t size,
+		      uint32_t *offset_r);
 
 int mail_cache_header_fields_read(struct mail_cache *cache);
 int mail_cache_header_fields_update(struct mail_cache *cache);
--- a/src/lib-index/mail-cache-transaction.c	Thu Oct 04 02:08:23 2012 +0300
+++ b/src/lib-index/mail-cache-transaction.c	Thu Oct 04 02:34:53 2012 +0300
@@ -15,16 +15,12 @@
 #include <stddef.h>
 #include <sys/stat.h>
 
-#define MAIL_CACHE_WRITE_BUFFER 32768
+#define MAIL_CACHE_INIT_WRITE_BUFFER (1024*16)
+#define MAIL_CACHE_MAX_WRITE_BUFFER (1024*256)
 
 #define CACHE_TRANS_CONTEXT(obj) \
 	MODULE_CONTEXT(obj, cache_mail_index_transaction_module)
 
-struct mail_cache_reservation {
-	uint32_t offset;
-	uint32_t size;
-};
-
 struct mail_cache_transaction_ctx {
 	union mail_index_transaction_module_context module_ctx;
 	struct mail_index_transaction_vfuncs super;
@@ -39,11 +35,9 @@
 	buffer_t *cache_data;
 	ARRAY(uint32_t) cache_data_seq;
 	uint32_t prev_seq;
-	size_t prev_pos;
+	size_t last_rec_pos;
 
-        ARRAY(struct mail_cache_reservation) reservations;
-	uint32_t reserved_space_offset, reserved_space;
-	uint32_t last_grow_size;
+	uoff_t bytes_written;
 
 	unsigned int tried_compression:1;
 	unsigned int changes:1;
@@ -52,10 +46,9 @@
 static MODULE_CONTEXT_DEFINE_INIT(cache_mail_index_transaction_module,
 				  &mail_index_module_register);
 
-static void
-mail_cache_transaction_free_reservations(struct mail_cache_transaction_ctx *ctx);
-static int mail_cache_link_unlocked(struct mail_cache *cache,
-				    uint32_t old_offset, uint32_t new_offset);
+static int mail_cache_transaction_lock(struct mail_cache_transaction_ctx *ctx);
+static int mail_cache_link_locked(struct mail_cache *cache,
+				  uint32_t old_offset, uint32_t new_offset);
 
 static void mail_index_transaction_cache_reset(struct mail_index_transaction *t)
 {
@@ -105,7 +98,6 @@
 	ctx->cache = view->cache;
 	ctx->view = view;
 	ctx->trans = t;
-	i_array_init(&ctx->reservations, 32);
 
 	i_assert(view->transaction == NULL);
 	view->transaction = ctx;
@@ -132,23 +124,27 @@
 	if (array_is_created(&ctx->cache_data_seq))
 		array_clear(&ctx->cache_data_seq);
 	ctx->prev_seq = 0;
-	ctx->prev_pos = 0;
-
-	array_clear(&ctx->reservations);
-	ctx->reserved_space_offset = 0;
-	ctx->reserved_space = 0;
-	ctx->last_grow_size = 0;
+	ctx->last_rec_pos = 0;
 
 	ctx->changes = FALSE;
 }
 
-static void
-mail_cache_transaction_free(struct mail_cache_transaction_ctx **_ctx)
+void mail_cache_transaction_rollback(struct mail_cache_transaction_ctx **_ctx)
 {
 	struct mail_cache_transaction_ctx *ctx = *_ctx;
 
 	*_ctx = NULL;
 
+	if (ctx->bytes_written > 0) {
+		/* we already wrote to the cache file. we can't (or don't want
+		   to) delete that data, so just mark it as deleted space */
+		if (mail_cache_transaction_lock(ctx) > 0) {
+			ctx->cache->hdr_copy.deleted_space +=
+				ctx->bytes_written;
+			(void)mail_cache_unlock(ctx->cache);
+		}
+	}
+
 	MODULE_CONTEXT_UNSET(ctx->trans, cache_mail_index_transaction_module);
 
 	ctx->view->transaction = NULL;
@@ -159,7 +155,6 @@
 		buffer_free(&ctx->cache_data);
 	if (array_is_created(&ctx->cache_data_seq))
 		array_free(&ctx->cache_data_seq);
-	array_free(&ctx->reservations);
 	i_free(ctx);
 }
 
@@ -279,346 +274,14 @@
 	return 1;
 }
 
-static int mail_cache_grow_file(struct mail_cache *cache, size_t size)
-{
-	struct stat st;
-	uoff_t new_fsize, grow_size;
-
-	i_assert(cache->locked);
-
-	/* grow the file */
-	new_fsize = cache->hdr_copy.used_file_size + size;
-	grow_size = new_fsize / 100 * MAIL_CACHE_GROW_PERCENTAGE;
-	if (grow_size < 16384)
-		grow_size = 16384;
-	new_fsize += grow_size;
-	new_fsize &= ~1023;
-
-	if (fstat(cache->fd, &st) < 0) {
-		mail_cache_set_syscall_error(cache, "fstat()");
-		return -1;
-	}
-
-	if ((uoff_t)st.st_size < new_fsize) {
-		if (file_set_size(cache->fd, new_fsize) < 0) {
-			mail_cache_set_syscall_error(cache, "file_set_size()");
-			return -1;
-		}
-	}
-	return 0;
-}
-
-static bool mail_cache_unlink_hole(struct mail_cache *cache, size_t size,
-				   struct mail_cache_hole_header *hole_r)
-{
-	struct mail_cache_header *hdr = &cache->hdr_copy;
-	struct mail_cache_hole_header hole;
-	uint32_t offset, prev_offset;
-
-	i_assert(cache->locked);
-
-	offset = hdr->hole_offset; prev_offset = 0;
-	while (offset != 0) {
-		if (pread_full(cache->fd, &hole, sizeof(hole), offset) <= 0) {
-			mail_cache_set_syscall_error(cache, "pread_full()");
-			return FALSE;
-		}
-
-		if (hole.magic != MAIL_CACHE_HOLE_HEADER_MAGIC) {
-			mail_cache_set_corrupted(cache,
-				"Invalid magic in hole header");
-			return FALSE;
-		}
-
-		if (hole.size >= size)
-			break;
-
-		prev_offset = offset;
-		offset = hole.next_offset;
-	}
-	if (offset == 0)
-		return FALSE;
-
-	if (prev_offset == 0)
-		hdr->hole_offset = hole.next_offset;
-	else {
-		if (mail_cache_write(cache, &hole.next_offset,
-				     sizeof(hole.next_offset), prev_offset) < 0)
-			return FALSE;
-	}
-	hdr->deleted_space -= hole.size;
-	cache->hdr_modified = TRUE;
-
-	hole_r->next_offset = offset;
-	hole_r->size = hole.size;
-	return TRUE;
-}
-
-static void
-mail_cache_transaction_add_reservation(struct mail_cache_transaction_ctx *ctx,
-				       uint32_t offset, uint32_t size)
-{
-	struct mail_cache_reservation res;
-
-	ctx->reserved_space_offset = offset;
-	ctx->reserved_space = size;
-
-	res.offset = offset;
-	res.size = size;
-
-	array_append(&ctx->reservations, &res, 1);
-}
-
-static void
-mail_cache_transaction_partial_commit(struct mail_cache_transaction_ctx *ctx,
-				      uint32_t offset, uint32_t size)
-{
-	struct mail_cache_reservation *res;
-	unsigned int i, count;
-
-	if (offset + size == ctx->cache->hdr_copy.used_file_size &&
-	    offset + size == ctx->reserved_space_offset) {
-		i_assert(ctx->reserved_space == 0);
-		ctx->reserved_space_offset = 0;
-	}
-
-	res = array_get_modifiable(&ctx->reservations, &count);
-	for (i = 0; i < count; i++) {
-		if (res[i].offset == offset) {
-			if (res[i].size == size) {
-				array_delete(&ctx->reservations, i, 1);
-			} else {
-				i_assert(res[i].size > size);
-				res[i].offset += size;
-				res[i].size -= size;
-			}
-			break;
-		}
-	}
-}
-
 static int
-mail_cache_transaction_reserve_more(struct mail_cache_transaction_ctx *ctx,
-				    size_t block_size, bool commit)
+mail_cache_transaction_update_index(struct mail_cache_transaction_ctx *ctx,
+				    uint32_t write_offset)
 {
 	struct mail_cache *cache = ctx->cache;
-	struct mail_cache_header *hdr = &cache->hdr_copy;
-	struct mail_cache_hole_header hole;
-	struct mail_cache_reservation *reservations;
-	unsigned int count;
-
-	i_assert(cache->locked);
-
-	if (mail_cache_unlink_hole(cache, block_size, &hole)) {
-		/* found a large enough hole. */
-		mail_cache_transaction_add_reservation(ctx, hole.next_offset,
-						       hole.size);
-		return 0;
-	}
-
-	if (MAIL_CACHE_IS_UNUSABLE(cache)) {
-		/* mail_cache_unlink_hole() could have noticed corruption */
-		return -1;
-	}
-
-	if ((uint32_t)-1 - hdr->used_file_size < block_size) {
-		mail_index_set_error(cache->index, "Cache file too large: %s",
-				     cache->filepath);
-		return -1;
-	}
-
-	if (!commit && block_size < MAIL_CACHE_MAX_RESERVED_BLOCK_SIZE) {
-		/* allocate some more space than we need */
-		size_t new_block_size = (block_size + ctx->last_grow_size) * 2;
-		if (new_block_size > MAIL_CACHE_MAX_RESERVED_BLOCK_SIZE)
-			new_block_size = MAIL_CACHE_MAX_RESERVED_BLOCK_SIZE;
-
-		if ((uint32_t)-1 - hdr->used_file_size >= new_block_size) {
-			block_size = new_block_size;
-			ctx->last_grow_size = new_block_size;
-		}
-	}
-
-	if (mail_cache_grow_file(ctx->cache, block_size) < 0)
-		return -1;
-
-	if (ctx->reserved_space_offset + ctx->reserved_space ==
-	    hdr->used_file_size) {
-		/* we can simply grow it */
-
-		/* grow reservation. it's probably the last one in the buffer,
-		   but it's not guarateed because we might have used holes
-		   as well */
-		reservations = array_get_modifiable(&ctx->reservations, &count);
-
-		do {
-			i_assert(count > 0);
-			count--;
-		} while (reservations[count].offset +
-			 reservations[count].size != hdr->used_file_size);
-
-		reservations[count].size += block_size;
-		ctx->reserved_space += block_size;
-	} else {
-		mail_cache_transaction_add_reservation(ctx, hdr->used_file_size,
-						       block_size);
-	}
-
-	cache->hdr_modified = TRUE;
-	hdr->used_file_size = ctx->reserved_space_offset + ctx->reserved_space;
-	return 0;
-}
-
-static void
-mail_cache_free_space(struct mail_cache *cache, uint32_t offset, uint32_t size)
-{
-	struct mail_cache_hole_header hole;
-
-	i_assert(cache->locked);
-
-	if (MAIL_CACHE_IS_UNUSABLE(cache))
-		return;
-
-	if (offset + size == cache->hdr_copy.used_file_size) {
-		/* we can just set used_file_size back */
-		cache->hdr_modified = TRUE;
-		cache->hdr_copy.used_file_size = offset;
-	} else if (size >= MAIL_CACHE_MIN_HOLE_SIZE) {
-		/* set it up as a hole */
-		hole.next_offset = cache->hdr_copy.hole_offset;
-		hole.size = size;
-		hole.magic = MAIL_CACHE_HOLE_HEADER_MAGIC;
-
-		if (mail_cache_write(cache, &hole, sizeof(hole), offset) < 0)
-			return;
-
-		cache->hdr_copy.deleted_space += size;
-		cache->hdr_copy.hole_offset = offset;
-		cache->hdr_modified = TRUE;
-	}
-}
-
-static void
-mail_cache_transaction_free_reservations(struct mail_cache_transaction_ctx *ctx)
-{
-	const struct mail_cache_reservation *reservations;
-	unsigned int count;
-
-	if (ctx->reserved_space == 0 && array_count(&ctx->reservations) == 0)
-		return;
-
-	if (mail_cache_transaction_lock(ctx) <= 0)
-		return;
-
-	reservations = array_get(&ctx->reservations, &count);
-
-	/* free flushed data as well. do it from end to beginning so we have
-	   a better chance of updating used_file_size instead of adding holes */
-	while (count > 0) {
-		count--;
-		mail_cache_free_space(ctx->cache,
-				      reservations[count].offset,
-				      reservations[count].size);
-	}
-	(void)mail_cache_unlock(ctx->cache);
-}
-
-static int
-mail_cache_transaction_free_space(struct mail_cache_transaction_ctx *ctx)
-{
-	bool locked = ctx->cache->locked;
-
-	if (ctx->reserved_space == 0)
-		return 0;
-
-	if (!locked) {
-		if (mail_cache_transaction_lock(ctx) <= 0)
-			return 0;
-	}
-
-	/* check again - locking might have reopened the cache file */
-	if (ctx->reserved_space != 0) {
-		i_assert(ctx->cache_file_seq == ctx->cache->hdr->file_seq);
-		mail_cache_free_space(ctx->cache, ctx->reserved_space_offset,
-				      ctx->reserved_space);
-		ctx->reserved_space_offset = 0;
-                ctx->reserved_space = 0;
-	}
-
-	if (!locked) {
-		if (mail_cache_unlock(ctx->cache) < 0)
-			return -1;
-	}
-	return 0;
-}
-
-static int
-mail_cache_transaction_get_space(struct mail_cache_transaction_ctx *ctx,
-				 size_t min_size, size_t max_size,
-				 uint32_t *offset_r, size_t *available_space_r,
-				 bool commit)
-{
-	bool locked = ctx->cache->locked;
-	uint32_t cache_file_seq;
-	size_t size;
-	int ret;
-
-	i_assert((min_size & 3) == 0);
-	i_assert((max_size & 3) == 0);
-
-	if (min_size > ctx->reserved_space) {
-		/* not enough preallocated space in transaction, get more */
-		cache_file_seq = ctx->cache_file_seq;
-		if (!locked) {
-			if ((ret = mail_cache_transaction_lock(ctx)) <= 0)
-				return ret;
-		}
-		ret = mail_cache_transaction_reserve_more(ctx, max_size,
-							  commit);
-		if (!locked) {
-			if (mail_cache_unlock(ctx->cache) < 0)
-				return -1;
-		}
-
-		if (ret < 0)
-			return -1;
-
-		if (cache_file_seq != ctx->cache_file_seq) {
-			/* cache file reopened - need to abort */
-			return 0;
-		}
-
-		size = max_size;
-	} else {
-		size = I_MIN(max_size, ctx->reserved_space);
-	}
-
-	*offset_r = ctx->reserved_space_offset;
-	ctx->reserved_space_offset += size;
-	ctx->reserved_space -= size;
-	*available_space_r = size;
-	i_assert((size & 3) == 0);
-
-	if (size == max_size && commit) {
-		/* final commit - see if we can free the rest of the
-		   reserved space */
-		if (mail_cache_transaction_free_space(ctx) < 0)
-			return -1;
-	}
-
-	i_assert(size >= min_size);
-	return 1;
-}
-
-static int
-mail_cache_transaction_update_index(struct mail_cache_transaction_ctx *ctx,
-				    const struct mail_cache_record *rec,
-				    const uint32_t *seq, uint32_t *seq_idx,
-				    uint32_t seq_limit, uint32_t write_offset,
-				    uint32_t *size_r)
-{
-	struct mail_cache *cache = ctx->cache;
-	uint32_t i, old_offset, orig_write_offset;
+	const struct mail_cache_record *rec = ctx->cache_data->data;
+	const uint32_t *seqs;
+	uint32_t i, seq_count, old_offset;
 
 	mail_index_ext_using_reset_id(ctx->trans, ctx->cache->ext_id,
 				      ctx->cache_file_seq);
@@ -626,9 +289,9 @@
 	/* write the cache_offsets to index file. records' prev_offset
 	   is updated to point to old cache record when index is being
 	   synced. */
-	orig_write_offset = write_offset;
-	for (i = *seq_idx; i < seq_limit; i++) {
-		mail_index_update_ext(ctx->trans, seq[i], cache->ext_id,
+	seqs = array_get(&ctx->cache_data_seq, &seq_count);
+	for (i = 0; i < seq_count; i++) {
+		mail_index_update_ext(ctx->trans, seqs[i], cache->ext_id,
 				      &write_offset, &old_offset);
 		if (old_offset != 0) {
 			/* we added records for this message multiple
@@ -637,20 +300,19 @@
 			   transaction log, we need to do the linking
 			   ourself here. */
 			if (old_offset > write_offset) {
-				if (mail_cache_link_unlocked(cache, old_offset,
-							     write_offset) < 0)
+				if (mail_cache_link_locked(cache, old_offset,
+							   write_offset) < 0)
 					return -1;
 			} else {
 				/* if we're combining multiple transactions,
 				   make sure the one with the smallest offset
 				   is written into index. this is required for
 				   non-file-mmaped cache to work properly. */
-				mail_index_update_ext(ctx->trans, seq[i],
+				mail_index_update_ext(ctx->trans, seqs[i],
 						      cache->ext_id,
 						      &old_offset, NULL);
-				if (mail_cache_link_unlocked(cache,
-							     write_offset,
-							     old_offset) < 0)
+				if (mail_cache_link_locked(cache, write_offset,
+							   old_offset) < 0)
 					return -1;
 			}
 		}
@@ -658,122 +320,77 @@
 		write_offset += rec->size;
 		rec = CONST_PTR_OFFSET(rec, rec->size);
 	}
-
-	*seq_idx = i;
-	*size_r = write_offset - orig_write_offset;
 	return 0;
 }
 
 static int
 mail_cache_transaction_flush(struct mail_cache_transaction_ctx *ctx)
 {
-	struct mail_cache *cache = ctx->cache;
-	const struct mail_cache_record *rec, *tmp_rec;
-	const uint32_t *seq;
-	uint32_t write_offset, write_size, rec_pos, seq_idx, seq_limit;
-	size_t size, max_size;
-	unsigned int seq_count;
+	uint32_t write_offset;
 	int ret;
-	bool commit;
 
-	if (MAIL_CACHE_IS_UNUSABLE(cache))
+	i_assert(!ctx->cache->locked);
+
+	if (mail_cache_transaction_lock(ctx) <= 0)
 		return -1;
 
-	commit = ctx->prev_seq == 0;
-	if (commit) {
-		/* committing, remove the last dummy record */
-		buffer_set_used_size(ctx->cache_data, ctx->prev_pos);
-	}
-
-	if (ctx->cache_file_seq != ctx->cache->hdr->file_seq) {
-		/* cache file reopened - need to abort */
-		mail_cache_transaction_reset(ctx);
-		return 0;
+	/* first write the actual data to cache file */
+	i_assert(ctx->last_rec_pos <= ctx->cache_data->used);
+	if (mail_cache_append(ctx->cache, ctx->cache_data->data,
+			      ctx->last_rec_pos, &write_offset) < 0)
+		ret = -1;
+	else {
+		/* update records' cache offsets to index */
+		ctx->bytes_written += ctx->last_rec_pos;
+		ret = mail_cache_transaction_update_index(ctx, write_offset);
 	}
-
-	rec = buffer_get_data(ctx->cache_data, &size);
-	i_assert(ctx->prev_pos <= size);
-
-	seq = array_get(&ctx->cache_data_seq, &seq_count);
-	seq_limit = 0;
-
-	for (seq_idx = 0, rec_pos = 0; rec_pos < ctx->prev_pos;) {
-		max_size = ctx->prev_pos - rec_pos;
-
-		ret = mail_cache_transaction_get_space(ctx, rec->size,
-						       max_size, &write_offset,
-						       &max_size, commit);
-		if (ret <= 0) {
-			/* error / couldn't lock / cache file reopened */
-			return ret;
-		}
-
-		if (rec_pos + max_size < ctx->prev_pos) {
-			/* see how much we can really write there */
-			tmp_rec = rec;
-			for (size = 0; size + tmp_rec->size <= max_size; ) {
-				seq_limit++;
-				size += tmp_rec->size;
-				tmp_rec = CONST_PTR_OFFSET(tmp_rec,
-							   tmp_rec->size);
-			}
-			max_size = size;
-		} else {
-			seq_limit = seq_count;
-		}
-
-		/* write it to file */
-		i_assert(ctx->cache_file_seq == cache->hdr->file_seq);
-		if (mail_cache_write(cache, rec, max_size, write_offset) < 0)
-			return -1;
-
-		if (mail_cache_transaction_update_index(ctx, rec, seq,
-							&seq_idx, seq_limit,
-							write_offset,
-							&write_size) < 0)
-			return -1;
-
-		rec_pos += write_size;
-		rec = CONST_PTR_OFFSET(rec, write_size);
-	}
+	if (mail_cache_unlock(ctx->cache) < 0)
+		ret = -1;
 
 	/* drop the written data from buffer */
 	buffer_copy(ctx->cache_data, 0,
-		    ctx->cache_data, ctx->prev_pos, (size_t)-1);
+		    ctx->cache_data, ctx->last_rec_pos, (size_t)-1);
 	buffer_set_used_size(ctx->cache_data,
-			     buffer_get_used_size(ctx->cache_data) -
-			     ctx->prev_pos);
-	ctx->prev_pos = 0;
+			     ctx->cache_data->used - ctx->last_rec_pos);
+	ctx->last_rec_pos = 0;
 
 	array_clear(&ctx->cache_data_seq);
-	return 1;
+	return 0;
+}
+
+static void
+mail_cache_transaction_update_last_rec(struct mail_cache_transaction_ctx *ctx)
+{
+	struct mail_cache_record *rec;
+	void *data;
+	size_t size;
+
+	data = buffer_get_modifiable_data(ctx->cache_data, &size);
+	rec = PTR_OFFSET(data, ctx->last_rec_pos);
+	rec->size = size - ctx->last_rec_pos;
+	i_assert(rec->size > sizeof(*rec));
+
+	/* FIXME: here would be a good place to set prev_offset to
+	   avoid doing it later, but avoid circular prev_offsets
+	   when cache is updated multiple times within the same
+	   transaction */
+
+	array_append(&ctx->cache_data_seq, &ctx->prev_seq, 1);
+	ctx->last_rec_pos = size;
 }
 
 static void
 mail_cache_transaction_switch_seq(struct mail_cache_transaction_ctx *ctx)
 {
-	struct mail_cache_record *rec, new_rec;
-	void *data;
-	size_t size;
+	struct mail_cache_record new_rec;
 
 	if (ctx->prev_seq != 0) {
-		/* fix record size */
-		data = buffer_get_modifiable_data(ctx->cache_data, &size);
-		rec = PTR_OFFSET(data, ctx->prev_pos);
-		rec->size = size - ctx->prev_pos;
-		i_assert(rec->size > sizeof(*rec));
-
-		/* FIXME: here would be a good place to set prev_offset to
-		   avoid doing it later, but avoid circular prev_offsets
-		   when cache is updated multiple times within the same
-		   transaction */
-
-		array_append(&ctx->cache_data_seq, &ctx->prev_seq, 1);
-		ctx->prev_pos = size;
+		/* update previously added cache record's size */
+		mail_cache_transaction_update_last_rec(ctx);
 	} else if (ctx->cache_data == NULL) {
 		ctx->cache_data =
 			buffer_create_dynamic(default_pool,
-					      MAIL_CACHE_WRITE_BUFFER);
+					      MAIL_CACHE_INIT_WRITE_BUFFER);
 		i_array_init(&ctx->cache_data_seq, 64);
 	}
 
@@ -787,57 +404,37 @@
 int mail_cache_transaction_commit(struct mail_cache_transaction_ctx **_ctx)
 {
 	struct mail_cache_transaction_ctx *ctx = *_ctx;
-	struct mail_cache *cache = ctx->cache;
 	int ret = 0;
 
-	if (!ctx->changes || MAIL_CACHE_IS_UNUSABLE(cache)) {
-		mail_cache_transaction_free(_ctx);
-		return 0;
-	}
-
-	if (mail_cache_transaction_lock(ctx) <= 0) {
-		mail_cache_transaction_rollback(_ctx);
-		return -1;
+	if (ctx->changes) {
+		if (ctx->prev_seq != 0)
+			mail_cache_transaction_update_last_rec(ctx);
+		if (mail_cache_transaction_flush(ctx) < 0)
+			ret = -1;
+		else {
+			/* successfully wrote everything */
+			ctx->bytes_written = 0;
+		}
+		/* Here would be a good place to do fdatasync() to make sure
+		   everything is written before offsets are updated to index.
+		   However it slows down I/O unneededly and we're pretty good
+		   at catching and fixing cache corruption, so we no longer do
+		   it. */
 	}
-
-	if (ctx->prev_seq != 0)
-                mail_cache_transaction_switch_seq(ctx);
-
-	if (mail_cache_transaction_flush(ctx) < 0)
-		ret = -1;
-
-	/* Here would be a good place to do fdatasync() to make sure
-	   everything is written before offsets are updated to index.
-	   However it slows down I/O unneededly and we're pretty good at
-	   catching and fixing cache corruption, so we no longer do it. */
-
-	if (mail_cache_unlock(cache) < 0)
-		ret = -1;
-	mail_cache_transaction_free(_ctx);
+	mail_cache_transaction_rollback(_ctx);
 	return ret;
 }
 
-void mail_cache_transaction_rollback(struct mail_cache_transaction_ctx **_ctx)
-{
-	struct mail_cache_transaction_ctx *ctx = *_ctx;
-
-	mail_cache_transaction_free_reservations(ctx);
-	mail_cache_transaction_free(_ctx);
-}
-
 static int
 mail_cache_header_fields_write(struct mail_cache_transaction_ctx *ctx,
 			       const buffer_t *buffer)
 {
 	struct mail_cache *cache = ctx->cache;
-	size_t max_size, size = buffer->used;
 	uint32_t offset, hdr_offset;
 
-	if (mail_cache_transaction_get_space(ctx, size, size,
-					     &offset, &max_size, TRUE) <= 0)
-		return -1;
+	i_assert(cache->locked);
 
-	if (mail_cache_write(cache, buffer->data, size, offset) < 0)
+	if (mail_cache_append(cache, buffer->data, buffer->used, &offset) < 0)
 		return -1;
 
 	if (cache->index->fsync_mode == FSYNC_MODE_ALWAYS) {
@@ -846,15 +443,11 @@
 			return -1;
 		}
 	}
+	/* find offset to the previous header's "next_offset" field */
 	if (mail_cache_header_fields_get_next_offset(cache, &hdr_offset) < 0)
 		return -1;
 
-	/* if we rollback the transaction, we must not overwrite this
-	   area because it's already committed after updating the
-	   header offset */
-	mail_cache_transaction_partial_commit(ctx, offset, size);
-
-	/* after it's guaranteed to be in disk, update header offset */
+	/* update the next_offset offset, so our new header will be found */
 	offset = mail_index_uint32_to_offset(offset);
 	if (mail_cache_write(cache, &offset, sizeof(offset), hdr_offset) < 0)
 		return -1;
@@ -1011,13 +604,13 @@
 	if (fixed_size == (unsigned int)-1)
 		full_size += sizeof(data_size32);
 
-	if (ctx->cache_data->used + full_size >
-	    buffer_get_size(ctx->cache_data) && ctx->prev_pos > 0) {
+	if (ctx->cache_data->used + full_size > MAIL_CACHE_MAX_WRITE_BUFFER &&
+	    ctx->last_rec_pos > 0) {
 		/* time to flush our buffer. if flushing fails because the
 		   cache file had been compressed and was reopened, return
 		   without adding the cached data since cache_data buffer
 		   doesn't contain the cache_rec anymore. */
-		if (mail_cache_transaction_flush(ctx) <= 0) {
+		if (mail_cache_transaction_flush(ctx) < 0) {
 			/* make sure the transaction is reset, so we don't
 			   constantly try to flush for each call to this
 			   function */
@@ -1027,7 +620,7 @@
 	}
 
 	buffer_append(ctx->cache_data, &file_field, sizeof(file_field));
-	if (fixed_size == (unsigned int)-1) {
+	if (fixed_size == -1U) {
 		buffer_append(ctx->cache_data, &data_size32,
 			      sizeof(data_size32));
 	}
@@ -1080,8 +673,8 @@
 	return mail_cache_field_exists(ctx->view, seq, field_idx) == 0;
 }
 
-static int mail_cache_link_unlocked(struct mail_cache *cache,
-				    uint32_t old_offset, uint32_t new_offset)
+static int mail_cache_link_locked(struct mail_cache *cache,
+				  uint32_t old_offset, uint32_t new_offset)
 {
 	new_offset += offsetof(struct mail_cache_record, prev_offset);
 	return mail_cache_write(cache, &old_offset, sizeof(old_offset),
@@ -1122,7 +715,7 @@
 		return 0;
 	}
 
-	if (mail_cache_link_unlocked(cache, old_offset, new_offset) < 0)
+	if (mail_cache_link_locked(cache, old_offset, new_offset) < 0)
 		return -1;
 
 	cache->hdr_copy.continued_record_count++;
--- a/src/lib-index/mail-cache.c	Thu Oct 04 02:08:23 2012 +0300
+++ b/src/lib-index/mail-cache.c	Thu Oct 04 02:34:53 2012 +0300
@@ -193,23 +193,30 @@
 static void mail_cache_update_need_compress(struct mail_cache *cache)
 {
 	const struct mail_cache_header *hdr = cache->hdr;
+	struct stat st;
 	unsigned int cont_percentage;
-	uoff_t max_del_space;
+	uoff_t file_size, max_del_space;
+
+	if (fstat(cache->fd, &st) < 0) {
+		if (!ESTALE_FSTAT(errno))
+			mail_cache_set_syscall_error(cache, "fstat()");
+		return;
+	}
+	file_size = st.st_size;
 
         cont_percentage = hdr->continued_record_count * 100 /
 		(cache->index->map->rec_map->records_count == 0 ? 1 :
 		 cache->index->map->rec_map->records_count);
 	if (cont_percentage >= MAIL_CACHE_COMPRESS_CONTINUED_PERCENTAGE &&
-	    hdr->used_file_size >= MAIL_CACHE_COMPRESS_MIN_SIZE) {
+	    file_size >= MAIL_CACHE_COMPRESS_MIN_SIZE) {
 		/* too many continued rows, compress */
 		cache->need_compress_file_seq = hdr->file_seq;
 	}
 
 	/* see if we've reached the max. deleted space in file */
-	max_del_space = hdr->used_file_size / 100 *
-		MAIL_CACHE_COMPRESS_PERCENTAGE;
+	max_del_space = file_size / 100 * MAIL_CACHE_COMPRESS_PERCENTAGE;
 	if (hdr->deleted_space >= max_del_space &&
-	    hdr->used_file_size >= MAIL_CACHE_COMPRESS_MIN_SIZE)
+	    file_size >= MAIL_CACHE_COMPRESS_MIN_SIZE)
 		cache->need_compress_file_seq = hdr->file_seq;
 }
 
@@ -243,25 +250,6 @@
 		mail_cache_set_corrupted(cache, "file_seq is 0");
 		return FALSE;
 	}
-
-	/* only check the header if we're locked */
-	if (!cache->locked)
-		return TRUE;
-
-	if (hdr->used_file_size < sizeof(struct mail_cache_header)) {
-		mail_cache_set_corrupted(cache, "used_file_size too small");
-		return FALSE;
-	}
-	if ((hdr->used_file_size % sizeof(uint32_t)) != 0) {
-		mail_cache_set_corrupted(cache, "used_file_size not aligned");
-		return FALSE;
-	}
-
-	if (cache->mmap_base != NULL &&
-	    hdr->used_file_size > cache->mmap_length) {
-		mail_cache_set_corrupted(cache, "used_file_size too large");
-		return FALSE;
-	}
 	return TRUE;
 }
 
@@ -643,11 +631,10 @@
 	if (cache->field_header_write_pending)
                 ret = mail_cache_header_fields_update(cache);
 
-	cache->locked = FALSE;
-
 	if (MAIL_CACHE_IS_UNUSABLE(cache)) {
 		/* we found it to be broken during the lock. just clean up. */
 		cache->hdr_modified = FALSE;
+		cache->locked = FALSE;
 		return -1;
 	}
 
@@ -665,6 +652,7 @@
 			mail_cache_set_syscall_error(cache, "fdatasync()");
 	}
 
+	cache->locked = FALSE;
 	mail_cache_unlock_file(cache);
 	return ret;
 }
@@ -672,6 +660,8 @@
 int mail_cache_write(struct mail_cache *cache, const void *data, size_t size,
 		     uoff_t offset)
 {
+	i_assert(cache->locked);
+
 	if (pwrite_full(cache->fd, data, size, offset) < 0) {
 		mail_cache_set_syscall_error(cache, "pwrite_full()");
 		return -1;
@@ -687,6 +677,32 @@
 	return 0;
 }
 
+int mail_cache_append(struct mail_cache *cache, const void *data, size_t size,
+		      uint32_t *offset_r)
+{
+	struct stat st;
+
+	if (fstat(cache->fd, &st) < 0) {
+		if (!ESTALE_FSTAT(errno))
+			mail_cache_set_syscall_error(cache, "fstat()");
+		return -1;
+	}
+	if ((uoff_t)st.st_size > (uint32_t)-1 ||
+	    (uint32_t)-1 - (uoff_t)st.st_size < size) {
+		mail_cache_set_corrupted(cache, "Cache file too large");
+		return -1;
+	}
+	*offset_r = st.st_size;
+	if (mail_cache_write(cache, data, size, *offset_r) < 0)
+		return -1;
+
+	/* FIXME: this is updated only so that older Dovecot versions (<=v2.1)
+	   can read this file. we can remove this later. */
+	cache->hdr_modified = TRUE;
+	cache->hdr_copy.backwards_compat_used_file_size = *offset_r + size;
+	return 0;
+}
+
 bool mail_cache_exists(struct mail_cache *cache)
 {
 	return !MAIL_CACHE_IS_UNUSABLE(cache);