changeset 18603:e4b62ba0fb5a

lib-fts: Various improvements to test-fts-tokenizer
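The per-character, per-line and random-chunk test variants are folded into a
single test_tokenizer_inputoutput() helper that feeds the same input three
ways (all at once, one UTF-8 character at a time, and in random-sized chunks),
shared TEST_INPUT_TEXT / TEST_INPUT_ADDRESS inputs replace the duplicated
literals, and the generic tokenizer's boundary algorithm is now checked
directly via fts-tokenizer-generic-private.h.

A minimal sketch of the resulting test shape (drawn from the diff below; the
test name and the expected_output contents here are illustrative only):

static void test_fts_tokenizer_example(void)
{
	/* illustrative token list; each real test defines its own */
	static const char *const expected_output[] = { "hello", "world", NULL };
	struct fts_tokenizer *tok;
	const char *error;

	test_begin("fts tokenizer example");
	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL,
					 &tok, &error) == 0);
	/* feeds "hello world" all at once, one UTF-8 character at a time and
	   in random-sized chunks, comparing each token against expected_output */
	test_tokenizer_inputoutput(tok, "hello world", expected_output);
	fts_tokenizer_unref(&tok);
	test_end();
}
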
author Timo Sirainen <tss@iki.fi>
date Sat, 09 May 2015 18:28:04 +0300
parents 7542e3be6721
children c469d8f4cde7
files src/lib-fts/test-fts-tokenizer.c
diffstat 1 files changed, 133 insertions(+), 430 deletions(-)
--- a/src/lib-fts/test-fts-tokenizer.c	Sat May 09 18:00:58 2015 +0300
+++ b/src/lib-fts/test-fts-tokenizer.c	Sat May 09 18:28:04 2015 +0300
@@ -1,16 +1,30 @@
 /* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
 
 #include "lib.h"
-#include "sha2.h"
-#include "hex-binary.h"
+#include "unichar.h"
 #include "test-common.h"
 #include "fts-tokenizer.h"
 #include "fts-tokenizer-private.h"
-/* TODO: fix including and linking of this. */
-/* #include "fts-tokenizer-generic-private.h" */
+#include "fts-tokenizer-generic-private.h"
 
 #include <stdlib.h>
 
+#define TEST_INPUT_TEXT \
+	"hello world\r\n\nAnd there\twas: text galore, " \
+	"abc@example.com, " \
+	"Bar Baz <bar@example.org>, " \
+	"foo@domain " \
+	"1234567890123456789012345678ä," \
+	"12345678901234567890123456789ä," \
+	"123456789012345678901234567890ä," \
+	"and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n " \
+	"(\"Hello world\")3.14 3,14 last"
+#define TEST_INPUT_ADDRESS \
+	"@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \
+	"Bar Baz <bar@example.org>" \
+	"Foo Bar (comment)foo.bar@host.example.org " \
+	"foo, foo@domain"
+
 static void test_fts_tokenizer_find(void)
 {
 	test_begin("fts tokenizer find");
@@ -19,34 +33,79 @@
 	test_end();
 }
 
+static void
+test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
+			   const char *const *expected_output)
+{
+	const unsigned char *input = (const unsigned char *)_input;
+	const char *token;
+	unsigned int i, max, outi, char_len, input_len = strlen(_input);
+
+	/* test all input at once */
+	outi = 0;
+	while (fts_tokenizer_next(tok, input, input_len, &token) > 0) {
+		test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+		outi++;
+	}
+	while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
+		test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+		outi++;
+	}
+	test_assert(expected_output[outi] == NULL);
+
+	/* test input one UTF-8 character at a time */
+	for (i = outi = 0; i < input_len; i += char_len) {
+		char_len = uni_utf8_char_bytes(input[i]);
+		while (fts_tokenizer_next(tok, input+i, char_len, &token) > 0) {
+			test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+			outi++;
+		}
+	}
+	while (fts_tokenizer_final(tok, &token) > 0) {
+		test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+		outi++;
+	}
+	test_assert(expected_output[outi] == NULL);
+
+	/* test input in random chunks */
+	for (i = outi = 0; i < input_len; i += char_len) {
+		max = rand() % (input_len - i) + 1;
+		for (char_len = 0; char_len < max; )
+			char_len += uni_utf8_char_bytes(input[i+char_len]);
+		while (fts_tokenizer_next(tok, input+i, char_len, &token) > 0) {
+			test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+			outi++;
+		}
+	}
+	while (fts_tokenizer_final(tok, &token) > 0) {
+		test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+		outi++;
+	}
+	test_assert(expected_output[outi] == NULL);
+}
+
 static void test_fts_tokenizer_generic_only(void)
 {
-	static const unsigned char input[] =
-		"hello world\r\nAnd there\twas: text "
-		"galore, and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n (\"Hello world\")last ";
+	static const char input[] = TEST_INPUT_TEXT;
 	static const char *const expected_output[] = {
 		"hello", "world", "And",
 		"there", "was", "text", "galore",
+		"abc", "example", "com", "Bar", "Baz",
+		"bar", "example", "org", "foo", "domain",
+		"1234567890123456789012345678ä",
+		"12345678901234567890123456789",
+		"123456789012345678901234567890",
 		"and", "longlonglongabcdefghijklmnopqr",
-		"more", "Hello", "world", "last", NULL
+		"more", "Hello", "world", "3", "14", "3", "14", "last", NULL
 	};
 	struct fts_tokenizer *tok;
-	const char * const *eopp = expected_output;
-	const char *token, *error;
+	const char *error;
 
 	test_begin("fts tokenizer generic simple");
 	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
-/*TODO: Uncomment when fts-tokenizer-generic-private.h inclusion is fixed */
-/*test_assert(((struct generic_fts_tokenizer *) tok)->algorithm ==  BOUNDARY_ALGORITHM_SIMPLE);*/
-	while (fts_tokenizer_next(tok, input, sizeof(input)-1, &token) > 0) {
-		test_assert(strcmp(token, *eopp) == 0);
-		eopp++;
-	}
-	while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
-		test_assert(strcmp(token, *eopp) == 0);
-		eopp++;
-	}
-	test_assert(*eopp == NULL);
+	test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);
+
+	test_tokenizer_inputoutput(tok, input, expected_output);
 	fts_tokenizer_unref(&tok);
 	test_end();
 }
@@ -55,7 +114,7 @@
 {
 	/* with Unicode (UTF-8) U+FF01 (ef bc 81), U+2000 (e2 80 80),
 	   U+205A (e2 81 9a) and U+205F (e2 81 9f) */
-	static const unsigned char input[] =
+	static const char input[] =
 		"hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
 		"galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n";
 	static const char *const expected_output[] = {
@@ -64,61 +123,12 @@
 		"and", "more", NULL
 	};
 	struct fts_tokenizer *tok;
-	const char * const *eopp = expected_output;
-	const char *token, *error;
+	const char *error;
 
 	test_begin("fts tokenizer generic simple with Unicode whitespace");
-	fts_tokenizer_register(fts_tokenizer_generic);
 	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
-	while (fts_tokenizer_next(tok, input, sizeof(input)-1, &token) > 0) {
-		test_assert(strcmp(token, *eopp) == 0);
-		eopp++;
-	}
-	while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
-		test_assert(strcmp(token, *eopp) == 0);
-		eopp++;
-	}
-	test_assert(*eopp == NULL);
+	test_tokenizer_inputoutput(tok, input, expected_output);
 	fts_tokenizer_unref(&tok);
-	fts_tokenizer_unregister(fts_tokenizer_generic);
-	test_end();
-}
-
-static void test_fts_tokenizer_char_generic_only(void)
-{
-	static const unsigned char input[] =
-		"abc@example.com, "
-		"Bar Baz <bar@example.org>, "
-		"foo@domain";
-	static const char *const expected_output[] = {
-		"abc", "example", "com", "Bar", "Baz",
-		"bar", "example", "org", "foo", "domain", NULL
-	};
-	struct fts_tokenizer *tok;
-	const char * const *eopp = expected_output;
-	const char *token, *error;
-	unsigned int i;
-	int ret;
-
-	test_begin("fts tokenizer generic simple input one character at a time");
-	fts_tokenizer_register(fts_tokenizer_generic);
-
-	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
-
-	for (i = 0; i <= sizeof(input)-1; ) {
-		ret = i < sizeof(input)-1 ?
-			fts_tokenizer_next(tok, &input[i], 1, &token) :
-			fts_tokenizer_next(tok, NULL, 0, &token);
-		if (ret == 0) {
-			i++;
-			continue;
-		}
-		test_assert(null_strcmp(token, *eopp) == 0);
-		eopp++;
-	}
-	test_assert(*eopp == NULL);
-	fts_tokenizer_unref(&tok);
-	fts_tokenizer_unregister(fts_tokenizer_generic);
 	test_end();
 }
 
@@ -126,34 +136,25 @@
 
 static void test_fts_tokenizer_generic_tr29_only(void)
 {
-	static const unsigned char input[] =
-		"hello world\r\n\nAnd there\twas: text "
-		"galore, and more.\n\n (\"Hello world\")3.14 3,14 last"
-		" longlonglongabcdefghijklmnopqrstuvwxyz 1.";
+	static const char input[] = TEST_INPUT_TEXT;
 	static const char *const expected_output[] = {
 		"hello", "world", "And",
 		"there", "was", "text", "galore",
-		"and", "more", "Hello", "world", "3.14",
-		"3,14", "last", "longlonglongabcdefghijklmnopqr", "1", NULL
+		"abc", "example.com", "Bar", "Baz",
+		"bar", "example.org", "foo", "domain",
+		"1234567890123456789012345678ä",
+		"12345678901234567890123456789",
+		"123456789012345678901234567890",
+		"and", "longlonglongabcdefghijklmnopqr",
+		"more", "Hello", "world", "3.14", "3,14", "last", NULL
 	};
 	struct fts_tokenizer *tok;
-	const char * const *eopp = expected_output;
-	const char *token, *error;
+	const char *error;
 
 	test_begin("fts tokenizer generic TR29");
-	fts_tokenizer_register(fts_tokenizer_generic);
 	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
-	while (fts_tokenizer_next(tok, input, sizeof(input)-1, &token) > 0) {
-		test_assert(strcmp(token, *eopp) == 0);
-		eopp++;
-	}
-	while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
-		test_assert(strcmp(token, *eopp) == 0);
-		eopp++;
-	}
-	test_assert(*eopp == NULL);
+	test_tokenizer_inputoutput(tok, input, expected_output);
 	fts_tokenizer_unref(&tok);
-	fts_tokenizer_unregister(fts_tokenizer_generic);
 	test_end();
 }
 
@@ -163,7 +164,7 @@
 {
 	/* with Unicode (UTF-8) U+2000 (e2 80 80), U+205A (e2 81 9a) and
 	   U+205F (e2 81 9f) */
-	static const unsigned char input[] =
+	static const char input[] =
 		"hello world\r\nAnd\xE2\x80\x80there\twas: text "
 		"galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n";
 	static const char *const expected_output[] = {
@@ -172,404 +173,112 @@
 		"and", "more", NULL
 	};
 	struct fts_tokenizer *tok;
-	const char * const *eopp = expected_output;
-	const char *token, *error;
+	const char *error;
 
 	test_begin("fts tokenizer generic TR29 with Unicode whitespace");
-	fts_tokenizer_register(fts_tokenizer_generic);
 	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
-	while (fts_tokenizer_next(tok, input, sizeof(input)-1, &token) > 0) {
-		test_assert(strcmp(token, *eopp) == 0);
-		eopp++;
-	}
-	while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
-		test_assert(strcmp(token, *eopp) == 0);
-		eopp++;
-	}
-	test_assert(*eopp == NULL);
+	test_tokenizer_inputoutput(tok, input, expected_output);
 	fts_tokenizer_unref(&tok);
-	fts_tokenizer_unregister(fts_tokenizer_generic);
 	test_end();
 }
 
 static void test_fts_tokenizer_generic_tr29_midnumlet_end(void)
 {
 	/* U+FF0E is EF BC 8E */
-	static const unsigned char input[] =
+	static const char input[] =
 		"hello world\xEF\xBC\x8E";
 	static const char *const expected_output[] = {
 		"hello", "world", NULL
 	};
 	struct fts_tokenizer *tok;
-	const char * const *eopp = expected_output;
-	const char *token, *error;
+	const char *error;
 
 	test_begin("fts tokenizer generic TR29 with MinNumLet U+FF0E at end");
-	fts_tokenizer_register(fts_tokenizer_generic);
 	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
-	while (fts_tokenizer_next(tok, input, sizeof(input)-1, &token) > 0) {
-		test_assert(null_strcmp(token, *eopp) == 0);
-		eopp++;
-	}
-	while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
-		test_assert(null_strcmp(token, *eopp) == 0);
-		eopp++;
-	}
-	test_assert(*eopp == NULL);
+	test_tokenizer_inputoutput(tok, input, expected_output);
 	fts_tokenizer_unref(&tok);
-	fts_tokenizer_unregister(fts_tokenizer_generic);
-	test_end();
-}
-
-static void test_fts_tokenizer_char_generic_tr29_only(void)
-{
-	static const unsigned char input[] =
-		"abc@example.com, "
-		"Bar Baz <bar@example.org>, "
-		"foo@domain";
-	static const char *const expected_output[] = {
-		"abc", "example.com", "Bar", "Baz",
-		"bar", "example.org", "foo", "domain", NULL
-	};
-	struct fts_tokenizer *tok;
-	const char * const *eopp = expected_output;
-	const char *token, *error;
-	unsigned int i;
-	int ret;
-
-	test_begin("fts tokenizer generic TR29 input one character at a time");
-	fts_tokenizer_register(fts_tokenizer_generic);
-
-	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
-
-	for (i = 0; i <= sizeof(input)-1; ) {
-		ret = i < sizeof(input)-1 ?
-			fts_tokenizer_next(tok, &input[i], 1, &token) :
-			fts_tokenizer_next(tok, NULL, 0, &token);
-		if (ret == 0) {
-			i++;
-			continue;
-		}
-		test_assert(null_strcmp(token, *eopp) == 0);
-		eopp++;
-	}
-	test_assert(*eopp == NULL);
-	fts_tokenizer_unref(&tok);
-	fts_tokenizer_unregister(fts_tokenizer_generic);
 	test_end();
 }
 
-static void test_fts_tokenizer_line_address_only(void)
+static void test_fts_tokenizer_address_only(void)
 {
-	static const char *const input[] = {
-		"abc@example.com",
-		" Bar Baz <bar@example.org>",
-		"foo@domain",
-		" moro foo@domain Bar Baz <bar@example.org>"
-	};
+	static const char input[] = TEST_INPUT_ADDRESS;
 	static const char *const expected_output[] = {
-		"abc@example.com", "bar@example.org",
-		"foo@domain", "foo@domain", "bar@example.org", NULL
+		"abc.dfg@example.com", "bar@example.org",
+		"foo.bar@host.example.org", "foo@domain", NULL
 	};
 	struct fts_tokenizer *tok;
-	const char * const *eopp = expected_output;
-	const char *token, *error;
-	unsigned int i;
-	int ret;
-
-	test_begin("fts tokenizer email address only, input one line at a time");
-	fts_tokenizer_register(fts_tokenizer_email_address);
-
-	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);
-
-	for (i = 0; i <= N_ELEMENTS(input);) {
-		ret = i < N_ELEMENTS(input) ?
-			fts_tokenizer_next(tok, (unsigned char *)input[i],
-			                   strlen(input[i]), &token) :
-			fts_tokenizer_next(tok, NULL, 0, &token);
-		if (ret == 0) {
-			i++;
-			continue;
-		}
-		test_assert(null_strcmp(token, *eopp) == 0);
-		eopp++;
-	}
-	test_assert(*eopp == NULL);
-	fts_tokenizer_unref(&tok);
-	fts_tokenizer_unregister(fts_tokenizer_email_address);
-	test_end();
+	const char *error;
 
-}
-static void test_fts_tokenizer_char_address_only(void)
-{
-	static const unsigned char input[] =
-		"@invalid invalid@ abc@example.com, "
-		"Bar Baz <bar@example.org>, "
-		"foo@domain";
-	static const char *const expected_output[] = {
-		"abc@example.com", "bar@example.org",
-		"foo@domain", NULL
-	};
-	struct fts_tokenizer *tok;
-	const char * const *eopp = expected_output;
-	const char *token, *error;
-	unsigned int i;
-	int ret;
-
-	test_begin("fts tokenizer email address only, input one character at a time");
-	fts_tokenizer_register(fts_tokenizer_email_address);
+	test_begin("fts tokenizer email address only");
 	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);
-
-	for (i = 0; i <= sizeof(input)-1; ) {
-		ret = i < sizeof(input)-1 ?
-			fts_tokenizer_next(tok, &input[i], 1, &token) :
-			fts_tokenizer_next(tok, NULL, 0, &token);
-		if (ret == 0) {
-			i++;
-			continue;
-		}
-		test_assert(null_strcmp(token, *eopp) == 0);
-		eopp++;
-	}
-	test_assert(*eopp == NULL);
+	test_tokenizer_inputoutput(tok, input, expected_output);
 	fts_tokenizer_unref(&tok);
-	fts_tokenizer_unregister(fts_tokenizer_email_address);
 	test_end();
 }
 
-static void test_fts_tokenizer_rand_address_only(void)
+static void test_fts_tokenizer_address_parent(void)
 {
-	static const unsigned char input[] =
-		"@invalid invalid@ Abc Dfg <abc.dfg@example.com>, "
-		"Foo Bar (comment)foo.bar@host.example.org foo ";
-
+	static const char input[] = TEST_INPUT_ADDRESS;
 	static const char *const expected_output[] = {
-		"abc.dfg@example.com",
-		"foo.bar@host.example.org",
-		 NULL
-	};
-	struct fts_tokenizer *tok;
-	const char * const *eopp = expected_output;
-	const char *token, *error;
-	unsigned int i, step, step_max = 10;
-	int ret;
-
-	test_begin("fts tokenizer email address, input random length");
-	fts_tokenizer_register(fts_tokenizer_email_address);
-	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL,
-	                                 NULL, &tok, &error) == 0);
-	step = rand() % step_max + 1;
-	for (i = 0; i <= sizeof(input)-1; ) {
-		ret = i < sizeof(input)-1 ?
-			fts_tokenizer_next(tok, &input[i], step, &token) :
-			fts_tokenizer_next(tok, NULL, 0, &token);
-		if (ret == 0) {
-			i += step;
-			step = rand() % step_max + 1;
-			step = I_MIN(step, sizeof(input) - i);
-			continue;
-		}
-		test_assert(null_strcmp(token, *eopp) == 0);
-		eopp++;
-	}
-	test_assert(*eopp == NULL);
-	fts_tokenizer_unref(&tok);
-	fts_tokenizer_unregister(fts_tokenizer_email_address);
-	test_end();
-}
-
-static void test_fts_tokenizer_address_char(void)
-{
-	static const unsigned char input[] =
-		"@invalid invalid@ abc@example.com, "
-		"Bar Baz <bar@example.org>, "
-		"foo@domain";
-	static const char *const expected_output[] = {
-		"invalid", "invalid", "abc", "example", "com", "abc@example.com", "Bar", "Baz",
-		"bar", "example", "org", "bar@example.org",
-		"foo", "domain", "foo@domain", NULL
+		"invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com",
+		"Bar", "Baz", "bar", "example", "org", "bar@example.org",
+		"Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org",
+		"foo", "foo", "domain", "foo@domain", NULL
 	};
 	struct fts_tokenizer *tok, *gen_tok;
-	const char * const *eopp = expected_output;
-	const char *token, *error;
-	unsigned int i;
-	int ret;
+	const char *error;
 
-	test_begin("fts tokenizer email address + parent, input one character at a time");
-
+	test_begin("fts tokenizer email address + parent");
 	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
 	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
-
-	for (i = 0; i <= sizeof(input)-1; ) {
-		ret = i < sizeof(input)-1 ?
-			fts_tokenizer_next(tok, &input[i], 1, &token) :
-			fts_tokenizer_next(tok, NULL, 0, &token);
-		if (ret == 0) {
-			i++;
-			continue;
-		}
-		test_assert(*eopp != NULL);
-		test_assert(null_strcmp(token, *eopp) == 0);
-		eopp++;
-	}
-	test_assert(*eopp == NULL);
+	test_tokenizer_inputoutput(tok, input, expected_output);
 	fts_tokenizer_unref(&tok);
 	fts_tokenizer_unref(&gen_tok);
 	test_end();
 }
 
-static void test_fts_tokenizer_address_line(void)
-{
-	static const char *const input[] = {
-		"@invalid invalid@ abc@example.com, ",
-		"Bar Baz <bar@example.org>, ",
-		"foo@domain, ",
-		"foo@domain Bar Baz <bar@example.org>, "
-	};
-	static const char *const expected_output[] = {
-		"invalid", "invalid", "abc", "example", "com", "abc@example.com", "Bar", "Baz",
-		"bar", "example", "org", "bar@example.org",
-		"foo", "domain", "foo@domain",
-		"foo", "domain", "foo@domain", "Bar", "Baz",
-		"bar", "example", "org", "bar@example.org", NULL
-	};
-	struct fts_tokenizer *tok, *gen_tok;
-	const char * const *eopp = expected_output;
-	const char *token, *error;
-	unsigned int i;
-	int ret;
-
-	test_begin("fts tokenizer email address + parent, input one line at a time");
-
-	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
-	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
-
-	for (i = 0; i <= N_ELEMENTS(input);) {
-		ret = i < N_ELEMENTS(input) ?
-			fts_tokenizer_next(tok, (unsigned char *)input[i],
-			                   strlen(input[i]), &token) :
-			fts_tokenizer_next(tok, NULL, 0, &token);
-		if (ret == 0) {
-			i++;
-			continue;
-		}
-		test_assert(null_strcmp(token, *eopp) == 0);
-		eopp++;
-	}
-	test_assert(*eopp == NULL);
-	fts_tokenizer_unref(&tok);
-	fts_tokenizer_unref(&gen_tok);
-	test_end();
-
-}
-
-static void test_fts_tokenizer_address_rand(void)
-{
-	static const unsigned char input[] =
-		"@invalid invalid@ abc@example.com, "
-		"Bar Baz <bar@example.org>, "
-		"foo@domain";
-	static const char *const expected_output[] = {
-		"invalid", "invalid", "abc", "example", "com", "abc@example.com", "Bar", "Baz",
-		"bar", "example", "org", "bar@example.org",
-		"foo", "domain", "foo@domain", NULL
-	};
-	struct fts_tokenizer *tok, *gen_tok;
-	const char * const *eopp = expected_output;
-	const char *token, *error;
-	unsigned int i, step, step_max = 10;
-	int ret;
-
-	test_begin("fts tokenizer email address + parent, input random length");
-	fts_tokenizer_register(fts_tokenizer_generic);
-	fts_tokenizer_register(fts_tokenizer_email_address);
-
-	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
-	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
-
-	//srand(1424142100); /* had a bug */
-	step = rand() % step_max + 1;
-	for (i = 0; i <= sizeof(input)-1; ) {
-		ret = i < sizeof(input)-1 ?
-		      fts_tokenizer_next(tok, &input[i], step, &token) :
-		      fts_tokenizer_next(tok, NULL, 0, &token);
-		if (ret == 0) {
-			i += step;
-			step = rand() % step_max + 1;
-			step = I_MIN(step, sizeof(input) - i);
-			continue;
-		}
-		test_assert(null_strcmp(token, *eopp) == 0);
-		eopp++;
-	}
-	test_assert(*eopp == NULL);
-	fts_tokenizer_unref(&tok);
-	fts_tokenizer_unref(&gen_tok);
-	fts_tokenizer_unregister(fts_tokenizer_generic);
-	fts_tokenizer_unregister(fts_tokenizer_email_address);
-	test_end();
-}
-
 static void test_fts_tokenizer_address_search(void)
 {
-	static const unsigned char input[] =
-		"@invalid invalid@ abc@example.com, "
-		"Bar Baz <bar@example.org>, "
-		"foo@domain";
+	static const char input[] = TEST_INPUT_ADDRESS;
 	static const char *const expected_output[] = {
-		"invalid", "invalid", "abc@example.com", "Bar", "Baz",
-		"bar@example.org", "foo@domain", NULL
+		"invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com",
+		"Bar", "Baz", "bar@example.org",
+		"Foo", "Bar", "comment", "foo.bar@host.example.org",
+		"foo", "foo@domain", NULL
 	};
-	static const char *const settings[] = { "search", "" };
+	static const char *const settings[] = { "search", "", NULL };
 	struct fts_tokenizer *tok, *gen_tok;
-	const char * const *eopp = expected_output;
 	const char *token, *error;
-	unsigned int i;
-	int ret;
 
-	test_begin("fts tokenizer search email address + parent, input one character at a time");
-
+	test_begin("fts tokenizer search email address + parent");
 	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
 	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
-
-	for (i = 0; i <= sizeof(input)-1; ) {
-		ret = i < sizeof(input)-1 ?
-			fts_tokenizer_next(tok, &input[i], 1, &token) :
-			fts_tokenizer_next(tok, NULL, 0, &token);
-		if (ret == 0) {
-			i++;
-			continue;
-		}
-		test_assert(*eopp != NULL);
-		test_assert(null_strcmp(token, *eopp) == 0);
-		eopp++;
-	}
-	test_assert(*eopp == NULL);
+	test_tokenizer_inputoutput(tok, input, expected_output);
 
 	/* make sure state is forgotten at EOF */
 	test_assert(fts_tokenizer_next(tok, (const void *)"foo", 3, &token) == 0);
-	test_assert(fts_tokenizer_next(tok, NULL, 0, &token) > 0 &&
+	test_assert(fts_tokenizer_final(tok, &token) > 0 &&
 		    strcmp(token, "foo") == 0);
-	test_assert(fts_tokenizer_next(tok, NULL, 0, &token) == 0);
+	test_assert(fts_tokenizer_final(tok, &token) == 0);
 
 	test_assert(fts_tokenizer_next(tok, (const void *)"bar@baz", 7, &token) == 0);
-	test_assert(fts_tokenizer_next(tok, NULL, 0, &token) > 0 &&
+	test_assert(fts_tokenizer_final(tok, &token) > 0 &&
 		    strcmp(token, "bar@baz") == 0);
-	test_assert(fts_tokenizer_next(tok, NULL, 0, &token) == 0);
+	test_assert(fts_tokenizer_final(tok, &token) == 0);
 
 	test_assert(fts_tokenizer_next(tok, (const void *)"foo@", 4, &token) == 0);
-	test_assert(fts_tokenizer_next(tok, NULL, 0, &token) > 0 &&
+	test_assert(fts_tokenizer_final(tok, &token) > 0 &&
 		    strcmp(token, "foo") == 0);
-	test_assert(fts_tokenizer_next(tok, NULL, 0, &token) == 0);
+	test_assert(fts_tokenizer_final(tok, &token) == 0);
 
 	/* test reset explicitly */
 	test_assert(fts_tokenizer_next(tok, (const void *)"a", 1, &token) == 0);
 	fts_tokenizer_reset(tok);
 	test_assert(fts_tokenizer_next(tok, (const void *)"b@c", 3, &token) == 0);
-	test_assert(fts_tokenizer_next(tok, NULL, 0, &token) > 0 &&
+	test_assert(fts_tokenizer_final(tok, &token) > 0 &&
 		    strcmp(token, "b@c") == 0);
-	test_assert(fts_tokenizer_next(tok, NULL, 0, &token) == 0);
-
+	test_assert(fts_tokenizer_final(tok, &token) == 0);
 
 	fts_tokenizer_unref(&tok);
 	fts_tokenizer_unref(&gen_tok);
@@ -582,17 +291,11 @@
 		test_fts_tokenizer_find,
 		test_fts_tokenizer_generic_only,
 		test_fts_tokenizer_generic_unicode_whitespace,
-		test_fts_tokenizer_char_generic_only,
 		test_fts_tokenizer_generic_tr29_only,
 		test_fts_tokenizer_generic_tr29_unicode_whitespace,
-		test_fts_tokenizer_char_generic_tr29_only,
 		test_fts_tokenizer_generic_tr29_midnumlet_end,
-		test_fts_tokenizer_char_address_only,
-		test_fts_tokenizer_line_address_only,
-		test_fts_tokenizer_rand_address_only,
-		test_fts_tokenizer_address_char,
-		test_fts_tokenizer_address_line,
-		test_fts_tokenizer_address_rand,
+		test_fts_tokenizer_address_only,
+		test_fts_tokenizer_address_parent,
 		test_fts_tokenizer_address_search,
 		NULL
 	};
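
For reference, the end-of-input pattern the rewritten tests rely on, shown as a
short sketch based on the fts_tokenizer_next()/fts_tokenizer_final() calls
visible in this diff (the tokenize_buffer() helper itself is hypothetical):
data is pushed with fts_tokenizer_next() until it stops returning tokens, and
any token still buffered at end of input is drained with fts_tokenizer_final().

#include "lib.h"
#include "fts-tokenizer.h"
#include <stdio.h>

/* hypothetical helper: emit all tokens for one complete input buffer */
static void tokenize_buffer(struct fts_tokenizer *tok,
			    const unsigned char *data, size_t size)
{
	const char *token;

	/* returns > 0 while a token is available, 0 when it wants more data */
	while (fts_tokenizer_next(tok, data, size, &token) > 0)
		printf("token: %s\n", token);
	/* flush whatever is still pending once the input is known to be complete */
	while (fts_tokenizer_final(tok, &token) > 0)
		printf("token: %s\n", token);
}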