view src/doveadm/dsync/dsync-mail.c @ 19674:fc0219628b49

dsync: Improved header hash v2 algorithm to remove repeated '?' chars. This is to help with Yahoo that replaces UTF-8 chars in headers with a single '?' (instead of '?' per each 8bit byte).
author Timo Sirainen <timo.sirainen@dovecot.fi>
date Thu, 28 Jan 2016 20:47:02 +0200
parents 25f06710e671
children d3aa060852e6
line wrap: on
line source

/* Copyright (c) 2013-2016 Dovecot authors, see the included COPYING file */

#include "lib.h"
#include "array.h"
#include "hex-binary.h"
#include "md5.h"
#include "istream.h"
#include "istream-crlf.h"
#include "message-size.h"
#include "mail-storage.h"
#include "dsync-mail.h"

/* NULL-terminated list of the header names whose contents are hashed to
   identify a mail; passed to mailbox_header_lookup_init() by the functions
   in this file.

   These should be good enough to identify all normal mails. The Received:
   header would make it even better, but those can be somewhat large. Also
   these fields can be looked up using IMAP ENVELOPE, which is more efficient
   in some IMAP servers. */
static const char *hashed_headers[] = {
	"Date", "Message-ID", NULL
};

/* Build a header lookup context for box that covers the headers listed in
   hashed_headers. The result of mailbox_header_lookup_init() is returned
   unchanged. */
struct mailbox_header_lookup_ctx *
dsync_mail_get_hash_headers(struct mailbox *box)
{
	struct mailbox_header_lookup_ctx *hash_hdr_ctx;

	hash_hdr_ctx = mailbox_header_lookup_init(box, hashed_headers);
	return hash_hdr_ctx;
}

/* Feed size bytes of header data into md5_ctx.

   With version 1 the bytes are hashed unmodified. With version 2 they are
   normalized first as described in the comment inside the function. Any
   other version value trips the assert. */
void dsync_mail_hash_more(struct md5_context *md5_ctx, unsigned int version,
			  const unsigned char *data, size_t size)
{
	size_t pos, pending;

	i_assert(version == 1 || version == 2);

	if (version == 1) {
		md5_update(md5_ctx, data, size);
		return;
	}
	/* - Dovecot IMAP replaces NULs with 0x80 character.
	   - Dovecot POP3 with outlook-no-nuls workaround replaces NULs
	   with 0x80 character.
	   - Zimbra replaces 8bit chars with '?' in header fetches,
	   but not body fetches.
	   - Yahoo replaces 8bit chars with '?' in partial header
	   fetches, but not POP3 TOP. UTF-8 character sequence writes only a
	   single '?'

	   So we'll just replace all control and 8bit chars with '?' and
	   remove any repeated '?', which hopefully will satisfy everybody.

	   (Keep this code in sync with pop3-migration plugin.)

	   NOTE(review): no state is kept between calls, so a run of
	   replaced characters that is split across two calls still
	   produces one '?' per call.
	   */
	pending = 0; /* first byte not yet passed to md5_update() */
	for (pos = 0; pos < size; pos++) {
		unsigned char chr = data[pos];

		/* TAB and LF are always hashed as-is */
		if (chr == '\t' || chr == '\n')
			continue;
		/* so is printable ASCII, except for a literal '?' */
		if (chr >= 0x20 && chr < 0x7f && chr != '?')
			continue;

		/* chr is hashed as '?'. Only the first character of a run
		   emits it: inside a run pending == pos, so nothing is
		   written. The very first byte of the buffer always counts
		   as the beginning of a run. */
		if (pending < pos || pos == 0) {
			md5_update(md5_ctx, data + pending, pos - pending);
			md5_update(md5_ctx, "?", 1);
		}
		pending = pos + 1;
	}
	/* hash whatever follows the last replaced character */
	md5_update(md5_ctx, data + pending, pos - pending);
}

int dsync_mail_get_hdr_hash(struct mail *mail, unsigned int version,
			    const char **hdr_hash_r)
{
	struct istream *hdr_input, *input;
	struct mailbox_header_lookup_ctx *hdr_ctx;
	struct md5_context md5_ctx;
	unsigned char md5_result[MD5_RESULTLEN];
	const unsigned char *data;
	size_t size;
	int ret = 0;

	hdr_ctx = mailbox_header_lookup_init(mail->box, hashed_headers);
	ret = mail_get_header_stream(mail, hdr_ctx, &hdr_input);
	mailbox_header_lookup_unref(&hdr_ctx);
	if (ret < 0)
		return -1;

	input = i_stream_create_lf(hdr_input);

	md5_init(&md5_ctx);
	while (!i_stream_is_eof(input)) {
		if (i_stream_read_data(input, &data, &size, 0) == -1)
			break;
		if (size == 0)
			break;
		dsync_mail_hash_more(&md5_ctx, version, data, size);
		i_stream_skip(input, size);
	}
	if (input->stream_errno != 0)
		ret = -1;
	i_stream_unref(&input);

	md5_final(&md5_ctx, md5_result);
	*hdr_hash_r = binary_to_hex(md5_result, sizeof(md5_result));
	return ret;
}

/* Zero *dmail_r and fill it from mail: GUID, UID, the input mail
   reference and the save date. If minimal_fill is set, the result is
   marked with minimal_fields and nothing else is looked up; otherwise the
   remaining fields are filled by dsync_mail_fill_nonminimal().

   Returns 0 on success. When a lookup in this function fails, returns -1
   with *error_field_r set to the name of the field that failed. */
int dsync_mail_fill(struct mail *mail, bool minimal_fill,
		    struct dsync_mail *dmail_r, const char **error_field_r)
{
	const char *mail_guid;

	memset(dmail_r, 0, sizeof(*dmail_r));

	if (mail_get_special(mail, MAIL_FETCH_GUID, &mail_guid) < 0) {
		*error_field_r = "GUID";
		return -1;
	}
	dmail_r->guid = mail_guid;

	/* the UID is recorded both as the mail's own UID and as the UID of
	   the input mail */
	dmail_r->uid = mail->uid;
	dmail_r->input_mail_uid = mail->uid;
	dmail_r->input_mail = mail;

	if (mail_get_save_date(mail, &dmail_r->saved_date) < 0) {
		*error_field_r = "saved-date";
		return -1;
	}

	if (minimal_fill) {
		dmail_r->minimal_fields = TRUE;
		return 0;
	}
	return dsync_mail_fill_nonminimal(mail, dmail_r, error_field_r);
}

/* Fill the fields of *dmail_r that dsync_mail_fill() skips for a minimal
   fill: the message input stream, POP3 UIDL, POP3 order and received date.

   Returns 0 on success. On failure returns -1 with *error_field_r set to
   the name of the field whose lookup failed. */
int dsync_mail_fill_nonminimal(struct mail *mail, struct dsync_mail *dmail_r,
			       const char **error_field_r)
{
	const char *order_str;

	if (mail_get_stream(mail, NULL, NULL, &dmail_r->input) < 0) {
		*error_field_r = "body";
		return -1;
	}

	if (mail_get_special(mail, MAIL_FETCH_UIDL_BACKEND,
			     &dmail_r->pop3_uidl) < 0) {
		*error_field_r = "pop3-uidl";
		return -1;
	}

	if (mail_get_special(mail, MAIL_FETCH_POP3_ORDER, &order_str) < 0) {
		*error_field_r = "pop3-order";
		return -1;
	}
	/* An empty string leaves pop3_order as it was. A non-empty value
	   that doesn't parse as an unsigned integer is treated as a
	   can't-happen condition. */
	if (*order_str != '\0' &&
	    str_to_uint(order_str, &dmail_r->pop3_order) < 0)
		i_unreached();

	if (mail_get_received_date(mail, &dmail_r->received_date) < 0) {
		*error_field_r = "received-date";
		return -1;
	}
	return 0;
}

/* Copy the strings of src into dest, duplicating each string into pool.
   dest is initialized from pool only when src has been created and holds
   at least one string; otherwise dest is not touched at all. */
static void
const_string_array_dup(pool_t pool, const ARRAY_TYPE(const_string) *src,
		       ARRAY_TYPE(const_string) *dest)
{
	const char *const *cur, *const *end;
	unsigned int count;

	if (!array_is_created(src))
		return;

	cur = array_get(src, &count);
	if (count == 0)
		return;

	/* reserve room for every string up front */
	p_array_init(dest, pool, count);
	for (end = cur + count; cur != end; cur++) {
		const char *copy = p_strdup(pool, *cur);

		array_append(dest, &copy, 1);
	}
}

/* Copy the mail change *src into *dest_r. Plain values are assigned
   directly, while the GUID, header hash and keyword change strings are
   duplicated into pool.

   dest_r->guid is written only when src->guid is non-NULL, and
   dest_r->keyword_changes only when the source array has entries. */
void dsync_mail_change_dup(pool_t pool, const struct dsync_mail_change *src,
			   struct dsync_mail_change *dest_r)
{
	/* plain value fields */
	dest_r->type = src->type;
	dest_r->uid = src->uid;
	dest_r->modseq = src->modseq;
	dest_r->pvt_modseq = src->pvt_modseq;
	dest_r->add_flags = src->add_flags;
	dest_r->remove_flags = src->remove_flags;
	dest_r->final_flags = src->final_flags;
	dest_r->keywords_reset = src->keywords_reset;
	dest_r->received_timestamp = src->received_timestamp;

	/* fields that need their own copy in pool */
	if (src->guid != NULL) {
		/* an empty GUID is replaced by a string literal instead of
		   being allocated from pool */
		if (*src->guid == '\0')
			dest_r->guid = "";
		else
			dest_r->guid = p_strdup(pool, src->guid);
	}
	dest_r->hdr_hash = p_strdup(pool, src->hdr_hash);
	const_string_array_dup(pool, &src->keyword_changes,
			       &dest_r->keyword_changes);
}