changeset 19674:fc0219628b49

dsync: Improved header hash v2 algorithm to remove repeated '?' chars. This is to help with Yahoo, which replaces each UTF-8 character in headers with a single '?' (instead of one '?' per 8bit byte).
author Timo Sirainen <timo.sirainen@dovecot.fi>
date Thu, 28 Jan 2016 20:47:02 +0200
parents f22a6d0198c4
children 364874711d5b
files src/doveadm/dsync/Makefile.am src/doveadm/dsync/dsync-mail.c src/doveadm/dsync/dsync-mail.h src/doveadm/dsync/test-dsync-mail.c
diffstat 4 files changed, 62 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
--- a/src/doveadm/dsync/Makefile.am	Thu Jan 28 20:23:51 2016 +0200
+++ b/src/doveadm/dsync/Makefile.am	Thu Jan 28 20:47:02 2016 +0200
@@ -58,6 +58,7 @@
 	dsync-transaction-log-scan.h
 
 test_programs = \
+	test-dsync-mail \
 	test-dsync-mailbox-tree-sync
 
 noinst_PROGRAMS = $(test_programs)
@@ -66,6 +67,10 @@
 	../../lib-test/libtest.la \
 	../../lib/liblib.la
 
+test_dsync_mail_SOURCES = test-dsync-mail.c
+test_dsync_mail_LDADD = $(pkglib_LTLIBRARIES) $(test_libs)
+test_dsync_mail_DEPENDENCIES = $(pkglib_LTLIBRARIES) $(test_libs)
+
 test_dsync_mailbox_tree_sync_SOURCES = test-dsync-mailbox-tree-sync.c
 test_dsync_mailbox_tree_sync_LDADD = dsync-mailbox-tree-sync.lo dsync-mailbox-tree.lo $(test_libs)
 test_dsync_mailbox_tree_sync_DEPENDENCIES = $(pkglib_LTLIBRARIES) $(test_libs)
--- a/src/doveadm/dsync/dsync-mail.c	Thu Jan 28 20:23:51 2016 +0200
+++ b/src/doveadm/dsync/dsync-mail.c	Thu Jan 28 20:47:02 2016 +0200
@@ -24,9 +24,8 @@
 	return mailbox_header_lookup_init(box, hashed_headers);
 }
 
-static void
-dsync_mail_hash_more(struct md5_context *md5_ctx, unsigned int version,
-		     const unsigned char *data, size_t size)
+void dsync_mail_hash_more(struct md5_context *md5_ctx, unsigned int version,
+			  const unsigned char *data, size_t size)
 {
 	size_t i, start;
 
@@ -42,18 +41,22 @@
 	   - Zimbra replaces 8bit chars with '?' in header fetches,
 	   but not body fetches.
 	   - Yahoo replaces 8bit chars with '?' in partial header
-	   fetches, but not POP3 TOP.
+	   fetches, but not POP3 TOP. A multi-byte UTF-8 character sequence
+	   is replaced with only a single '?'.
 
-	   So we'll just replace all control and 8bit chars with '?',
-	   which hopefully will satisfy everybody.
+	   So we'll just replace all control and 8bit chars with '?' and
+	   remove any repeated '?', which hopefully will satisfy everybody.
 
 	   (Keep this code in sync with pop3-migration plugin.)
 	   */
 	for (i = start = 0; i < size; i++) {
-		if ((data[i] < 0x20 || data[i] >= 0x80) &&
+		if ((data[i] < 0x20 || data[i] >= 0x7f || data[i] == '?') &&
 		    (data[i] != '\t' && data[i] != '\n')) {
-			md5_update(md5_ctx, data + start, i-start);
-			md5_update(md5_ctx, "?", 1);
+			/* remove repeated '?' */
+			if (start < i || i == 0) {
+				md5_update(md5_ctx, data + start, i-start);
+				md5_update(md5_ctx, "?", 1);
+			}
 			start = i+1;
 		}
 	}
--- a/src/doveadm/dsync/dsync-mail.h	Thu Jan 28 20:23:51 2016 +0200
+++ b/src/doveadm/dsync/dsync-mail.h	Thu Jan 28 20:47:02 2016 +0200
@@ -3,6 +3,7 @@
 
 #include "mail-types.h"
 
+struct md5_context;
 struct mail;
 struct mailbox;
 
@@ -95,4 +96,8 @@
 void dsync_mail_change_dup(pool_t pool, const struct dsync_mail_change *src,
 			   struct dsync_mail_change *dest_r);
 
+/* private: */
+void dsync_mail_hash_more(struct md5_context *md5_ctx, unsigned int version,
+			  const unsigned char *data, size_t size);
+
 #endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/doveadm/dsync/test-dsync-mail.c	Thu Jan 28 20:47:02 2016 +0200
@@ -0,0 +1,40 @@
+/* Copyright (c) 2016 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "md5.h"
+#include "dsync-mail.h"
+#include "test-common.h"
+
+static const unsigned char test_input[] =
+	"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
+	"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
+	"\x20!?x??yz\x7f\x80\x90\xff-plop\xff";
+static const unsigned char test_output[] =
+	"?\t\n? !?x?yz?-plop?";
+
+static void test_dsync_mail_hash_more(void)
+{
+	struct md5_context md5_ctx;
+	unsigned char md5_input[MD5_RESULTLEN], md5_output[MD5_RESULTLEN];
+
+	test_begin("dsync_mail_hash_more v2");
+	md5_init(&md5_ctx);
+	dsync_mail_hash_more(&md5_ctx, 2, test_input, sizeof(test_input)-1);
+	md5_final(&md5_ctx, md5_input);
+
+	md5_init(&md5_ctx);
+	md5_update(&md5_ctx, test_output, sizeof(test_output)-1);
+	md5_final(&md5_ctx, md5_output);
+
+	test_assert(memcmp(md5_input, md5_output, MD5_RESULTLEN) == 0);
+	test_end();
+}
+
+int main(void)
+{
+	static void (*test_functions[])(void) = {
+		test_dsync_mail_hash_more,
+		NULL
+	};
+	return test_run(test_functions);
+}