view src/lib-fts/test-fts-tokenizer.c @ 18573:60f07e741c57

lib-fts: Implemented "search" parameter to fts-tokenizer-address.
author Timo Sirainen <tss@iki.fi>
date Sat, 09 May 2015 13:15:09 +0300
parents 7c1fe66e8855
children 363397c3701e
line wrap: on
line source

/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */

#include "lib.h"
#include "sha2.h"
#include "hex-binary.h"
#include "test-common.h"
#include "fts-tokenizer.h"
#include "fts-tokenizer-private.h"
/* TODO: fix including and linking of this. */
/* #include "fts-tokenizer-generic-private.h" */

#include <stdlib.h>

static void test_fts_tokenizer_generic_only(void)
{
	static const unsigned char input[] =
		"hello world\r\nAnd there\twas: text "
		"galore, and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n (\"Hello world\")last ";
	static const char *const expected_output[] = {
		"hello", "world", "And",
		"there", "was", "text", "galore",
		"and", "longlonglongabcdefghijklmnopqr",
		"more", "Hello", "world", "last", NULL
	};
	const struct fts_tokenizer *tok_class;
	struct fts_tokenizer *tok;
	const char * const *eopp = expected_output;
	const char *token, *error;

	test_begin("fts tokenizer generic simple");
	fts_tokenizers_init();
	tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
	test_assert(fts_tokenizer_create(tok_class, NULL, NULL, &tok, &error) == 0);
/*TODO: Uncomment when fts-tokenizer-generic-private.h inclusion is fixed */
/*test_assert(((struct generic_fts_tokenizer *) tok)->algorithm ==  BOUNDARY_ALGORITHM_SIMPLE);*/
	while (fts_tokenizer_next(tok, input, sizeof(input)-1, &token) > 0) {
		test_assert(strcmp(token, *eopp) == 0);
		eopp++;
	}
	while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
		test_assert(strcmp(token, *eopp) == 0);
		eopp++;
	}
	test_assert(*eopp == NULL);
	fts_tokenizer_unref(&tok);
	fts_tokenizers_deinit();
	test_end();
}

static void test_fts_tokenizer_generic_unicode_whitespace(void)
{
	/* with Unicode(utf8) U+FF01(ef bc 81)(U+2000(e2 80 80) and
	   U+205A(e2 81 9a) and U+205F(e2 81 9f )*/
	static const unsigned char input[] =
		"hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
		"galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n";
	static const char *const expected_output[] = {
		"hello", "world", "And",
		"there", "was", "text", "galore",
		"and", "more", NULL
	};
	const struct fts_tokenizer *tok_class;
	struct fts_tokenizer *tok;
	const char * const *eopp = expected_output;
	const char *token, *error;

	test_begin("fts tokenizer generic simple with Unicode whitespace");
	fts_tokenizer_register(fts_tokenizer_generic);
	tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
	test_assert(fts_tokenizer_create(tok_class, NULL, NULL, &tok, &error) == 0);
	while (fts_tokenizer_next(tok, input, sizeof(input)-1, &token) > 0) {
		test_assert(strcmp(token, *eopp) == 0);
		eopp++;
	}
	while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
		test_assert(strcmp(token, *eopp) == 0);
		eopp++;
	}
	test_assert(*eopp == NULL);
	fts_tokenizer_unref(&tok);
	fts_tokenizer_unregister(fts_tokenizer_generic);
	test_end();
}

static void test_fts_tokenizer_char_generic_only(void)
{
	static const unsigned char input[] =
		"abc@example.com, "
		"Bar Baz <bar@example.org>, "
		"foo@domain";
	static const char *const expected_output[] = {
		"abc", "example", "com", "Bar", "Baz",
		"bar", "example", "org", "foo", "domain", NULL
	};
	struct fts_tokenizer *tok;
	const char * const *eopp = expected_output;
	const char *token, *error;
	unsigned int i;
	int ret;

	test_begin("fts tokenizer generic simple input one character at a time");
	fts_tokenizer_register(fts_tokenizer_generic);

	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);

	for (i = 0; i <= sizeof(input)-1; ) {
		ret = i < sizeof(input)-1 ?
			fts_tokenizer_next(tok, &input[i], 1, &token) :
			fts_tokenizer_next(tok, NULL, 0, &token);
		if (ret == 0) {
			i++;
			continue;
		}
		test_assert(null_strcmp(token, *eopp) == 0);
		eopp++;
	}
	test_assert(*eopp == NULL);
	fts_tokenizer_unref(&tok);
	fts_tokenizer_unregister(fts_tokenizer_generic);
	test_end();
}

const char *const tr29_settings[] = {"algorithm", "tr29", NULL};

static void test_fts_tokenizer_generic_tr29_only(void)
{
	static const unsigned char input[] =
		"hello world\r\n\nAnd there\twas: text "
		"galore, and more.\n\n (\"Hello world\")3.14 3,14 last"
		" longlonglongabcdefghijklmnopqrstuvwxyz 1.";
	static const char *const expected_output[] = {
		"hello", "world", "And",
		"there", "was", "text", "galore",
		"and", "more", "Hello", "world", "3.14",
		"3,14", "last", "longlonglongabcdefghijklmnopqr", "1", NULL
	};
	const struct fts_tokenizer *tok_class;
	struct fts_tokenizer *tok;
	const char * const *eopp = expected_output;
	const char *token, *error;

	test_begin("fts tokenizer generic TR29");
	fts_tokenizer_register(fts_tokenizer_generic);
	tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
	test_assert(fts_tokenizer_create(tok_class, NULL, tr29_settings, &tok, &error) == 0);
	while (fts_tokenizer_next(tok, input, sizeof(input)-1, &token) > 0) {
		test_assert(strcmp(token, *eopp) == 0);
		eopp++;
	}
	while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
		test_assert(strcmp(token, *eopp) == 0);
		eopp++;
	}
	test_assert(*eopp == NULL);
	fts_tokenizer_unref(&tok);
	fts_tokenizer_unregister(fts_tokenizer_generic);
	test_end();
}

/* TODO: U+206F is in "Format" and therefore currently not word break.
   This definitely needs to be remapped. */
static void test_fts_tokenizer_generic_tr29_unicode_whitespace(void)
{
	/* with Unicode(utf8) U+2000(e2 80 80) and U+205A(e2 81 9a) and U+205F(e2
	   81 9f)*/
	static const unsigned char input[] =
		"hello world\r\nAnd\xE2\x80\x80there\twas: text "
		"galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n";
	static const char *const expected_output[] = {
		"hello", "world", "And",
		"there", "was", "text", "galore",
		"and", "more", NULL
	};
	const struct fts_tokenizer *tok_class;
	struct fts_tokenizer *tok;
	const char * const *eopp = expected_output;
	const char *token, *error;

	test_begin("fts tokenizer generic TR29 with Unicode whitespace");
	fts_tokenizer_register(fts_tokenizer_generic);
	tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
	test_assert(fts_tokenizer_create(tok_class, NULL, tr29_settings, &tok, &error) == 0);
	while (fts_tokenizer_next(tok, input, sizeof(input)-1, &token) > 0) {
		test_assert(strcmp(token, *eopp) == 0);
		eopp++;
	}
	while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
		test_assert(strcmp(token, *eopp) == 0);
		eopp++;
	}
	test_assert(*eopp == NULL);
	fts_tokenizer_unref(&tok);
	fts_tokenizer_unregister(fts_tokenizer_generic);
	test_end();
}

static void test_fts_tokenizer_generic_tr29_midnumlet_end(void)
{
	/* u+FF0E is EF BC 8E  */
	static const unsigned char input[] =
		"hello world\xEF\xBC\x8E";
	static const char *const expected_output[] = {
		"hello", "world", NULL
	};
	const struct fts_tokenizer *tok_class;
	struct fts_tokenizer *tok;
	const char * const *eopp = expected_output;
	const char *token, *error;

	test_begin("fts tokenizer generic TR29 with MinNumLet U+FF0E at end");
	fts_tokenizer_register(fts_tokenizer_generic);
	tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
	test_assert(fts_tokenizer_create(tok_class, NULL, tr29_settings, &tok, &error) == 0);
	while (fts_tokenizer_next(tok, input, sizeof(input)-1, &token) > 0) {
		test_assert(null_strcmp(token, *eopp) == 0);
		eopp++;
	}
	while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
		test_assert(null_strcmp(token, *eopp) == 0);
		eopp++;
	}
	test_assert(*eopp == NULL);
	fts_tokenizer_unref(&tok);
	fts_tokenizer_unregister(fts_tokenizer_generic);
	test_end();
}

static void test_fts_tokenizer_char_generic_tr29_only(void)
{
	static const unsigned char input[] =
		"abc@example.com, "
		"Bar Baz <bar@example.org>, "
		"foo@domain";
	static const char *const expected_output[] = {
		"abc", "example.com", "Bar", "Baz",
		"bar", "example.org", "foo", "domain", NULL
	};
	struct fts_tokenizer *tok;
	const char * const *eopp = expected_output;
	const char *token, *error;
	unsigned int i;
	int ret;

	test_begin("fts tokenizer generic TR29 input one character at a time");
	fts_tokenizer_register(fts_tokenizer_generic);

	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);

	for (i = 0; i <= sizeof(input)-1; ) {
		ret = i < sizeof(input)-1 ?
			fts_tokenizer_next(tok, &input[i], 1, &token) :
			fts_tokenizer_next(tok, NULL, 0, &token);
		if (ret == 0) {
			i++;
			continue;
		}
		test_assert(null_strcmp(token, *eopp) == 0);
		eopp++;
	}
	test_assert(*eopp == NULL);
	fts_tokenizer_unref(&tok);
	fts_tokenizer_unregister(fts_tokenizer_generic);
	test_end();
}

static void test_fts_tokenizer_line_address_only(void)
{
	static const char *const input[] = {
		"abc@example.com",
		" Bar Baz <bar@example.org>",
		"foo@domain",
		" moro foo@domain Bar Baz <bar@example.org>"
	};
	static const char *const expected_output[] = {
		"abc@example.com", "bar@example.org",
		"foo@domain", "foo@domain", "bar@example.org", NULL
	};
	struct fts_tokenizer *tok;
	const char * const *eopp = expected_output;
	const char *token, *error;
	unsigned int i;
	int ret;

	test_begin("fts tokenizer email address only, input one line at a time");
	fts_tokenizer_register(fts_tokenizer_email_address);

	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);

	for (i = 0; i <= N_ELEMENTS(input);) {
		ret = i < N_ELEMENTS(input) ?
			fts_tokenizer_next(tok, (unsigned char *)input[i],
			                   strlen(input[i]), &token) :
			fts_tokenizer_next(tok, NULL, 0, &token);
		if (ret == 0) {
			i++;
			continue;
		}
		test_assert(null_strcmp(token, *eopp) == 0);
		eopp++;
	}
	test_assert(*eopp == NULL);
	fts_tokenizer_unref(&tok);
	fts_tokenizer_unregister(fts_tokenizer_email_address);
	test_end();

}
static void test_fts_tokenizer_char_address_only(void)
{
	static const unsigned char input[] =
		"@invalid invalid@ abc@example.com, "
		"Bar Baz <bar@example.org>, "
		"foo@domain";
	static const char *const expected_output[] = {
		"abc@example.com", "bar@example.org",
		"foo@domain", NULL
	};
	struct fts_tokenizer *tok;
	const char * const *eopp = expected_output;
	const char *token, *error;
	unsigned int i;
	int ret;

	test_begin("fts tokenizer email address only, input one character at a time");
	fts_tokenizer_register(fts_tokenizer_email_address);
	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);

	for (i = 0; i <= sizeof(input)-1; ) {
		ret = i < sizeof(input)-1 ?
			fts_tokenizer_next(tok, &input[i], 1, &token) :
			fts_tokenizer_next(tok, NULL, 0, &token);
		if (ret == 0) {
			i++;
			continue;
		}
		test_assert(null_strcmp(token, *eopp) == 0);
		eopp++;
	}
	test_assert(*eopp == NULL);
	fts_tokenizer_unref(&tok);
	fts_tokenizer_unregister(fts_tokenizer_email_address);
	test_end();
}

static void test_fts_tokenizer_rand_address_only(void)
{
	static const unsigned char input[] =
		"@invalid invalid@ Abc Dfg <abc.dfg@example.com>, "
		"Foo Bar (comment)foo.bar@host.example.org foo ";

	static const char *const expected_output[] = {
		"abc.dfg@example.com",
		"foo.bar@host.example.org",
		 NULL
	};
	struct fts_tokenizer *tok;
	const char * const *eopp = expected_output;
	const char *token, *error;
	unsigned int i, step, step_max = 10;
	int ret;

	test_begin("fts tokenizer email address, input random length");
	fts_tokenizer_register(fts_tokenizer_email_address);
	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL,
	                                 NULL, &tok, &error) == 0);
	step = rand() % step_max + 1;
	for (i = 0; i <= sizeof(input)-1; ) {
		ret = i < sizeof(input)-1 ?
			fts_tokenizer_next(tok, &input[i], step, &token) :
			fts_tokenizer_next(tok, NULL, 0, &token);
		if (ret == 0) {
			i += step;
			step = rand() % step_max + 1;
			step = I_MIN(step, sizeof(input) - i);
			continue;
		}
		test_assert(null_strcmp(token, *eopp) == 0);
		eopp++;
	}
	test_assert(*eopp == NULL);
	fts_tokenizer_unref(&tok);
	fts_tokenizer_unregister(fts_tokenizer_email_address);
	test_end();
}

static void test_fts_tokenizer_address_char(void)
{
	static const unsigned char input[] =
		"@invalid invalid@ abc@example.com, "
		"Bar Baz <bar@example.org>, "
		"foo@domain";
	static const char *const expected_output[] = {
		"invalid", "invalid", "abc", "example", "com", "abc@example.com", "Bar", "Baz",
		"bar", "example", "org", "bar@example.org",
		"foo", "domain", "foo@domain", NULL
	};
	struct fts_tokenizer *tok, *gen_tok;
	const char * const *eopp = expected_output;
	const char *token, *error;
	unsigned int i;
	int ret;

	test_begin("fts tokenizer email address + parent, input one character at a time");
	fts_tokenizers_init();

	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);

	for (i = 0; i <= sizeof(input)-1; ) {
		ret = i < sizeof(input)-1 ?
			fts_tokenizer_next(tok, &input[i], 1, &token) :
			fts_tokenizer_next(tok, NULL, 0, &token);
		if (ret == 0) {
			i++;
			continue;
		}
		test_assert(*eopp != NULL);
		test_assert(null_strcmp(token, *eopp) == 0);
		eopp++;
	}
	test_assert(*eopp == NULL);
	fts_tokenizer_unref(&tok);
	fts_tokenizer_unref(&gen_tok);
	fts_tokenizers_deinit();
	test_end();
}

static void test_fts_tokenizer_address_line(void)
{
	static const char *const input[] = {
		"@invalid invalid@ abc@example.com, ",
		"Bar Baz <bar@example.org>, ",
		"foo@domain, ",
		"foo@domain Bar Baz <bar@example.org>, "
	};
	static const char *const expected_output[] = {
		"invalid", "invalid", "abc", "example", "com", "abc@example.com", "Bar", "Baz",
		"bar", "example", "org", "bar@example.org",
		"foo", "domain", "foo@domain",
		"foo", "domain", "foo@domain", "Bar", "Baz",
		"bar", "example", "org", "bar@example.org", NULL
	};
	struct fts_tokenizer *tok, *gen_tok;
	const char * const *eopp = expected_output;
	const char *token, *error;
	unsigned int i;
	int ret;

	test_begin("fts tokenizer email address + parent, input one line at a time");
	fts_tokenizers_init();

	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);

	for (i = 0; i <= N_ELEMENTS(input);) {
		ret = i < N_ELEMENTS(input) ?
			fts_tokenizer_next(tok, (unsigned char *)input[i],
			                   strlen(input[i]), &token) :
			fts_tokenizer_next(tok, NULL, 0, &token);
		if (ret == 0) {
			i++;
			continue;
		}
		test_assert(null_strcmp(token, *eopp) == 0);
		eopp++;
	}
	test_assert(*eopp == NULL);
	fts_tokenizer_unref(&tok);
	fts_tokenizer_unref(&gen_tok);
	fts_tokenizers_deinit();
	test_end();

}

static void test_fts_tokenizer_address_rand(void)
{
	static const unsigned char input[] =
		"@invalid invalid@ abc@example.com, "
		"Bar Baz <bar@example.org>, "
		"foo@domain";
	static const char *const expected_output[] = {
		"invalid", "invalid", "abc", "example", "com", "abc@example.com", "Bar", "Baz",
		"bar", "example", "org", "bar@example.org",
		"foo", "domain", "foo@domain", NULL
	};
	struct fts_tokenizer *tok, *gen_tok;
	const char * const *eopp = expected_output;
	const char *token, *error;
	unsigned int i, step, step_max = 10;
	int ret;

	test_begin("fts tokenizer email address + parent, input random length");
	fts_tokenizer_register(fts_tokenizer_generic);
	fts_tokenizer_register(fts_tokenizer_email_address);

	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);

	//srand(1424142100); /* had a bug */
	step = rand() % step_max + 1;
	for (i = 0; i <= sizeof(input)-1; ) {
		ret = i < sizeof(input)-1 ?
		      fts_tokenizer_next(tok, &input[i], step, &token) :
		      fts_tokenizer_next(tok, NULL, 0, &token);
		if (ret == 0) {
			i += step;
			step = rand() % step_max + 1;
			step = I_MIN(step, sizeof(input) - i);
			continue;
		}
		test_assert(null_strcmp(token, *eopp) == 0);
		eopp++;
	}
	test_assert(*eopp == NULL);
	fts_tokenizer_unref(&tok);
	fts_tokenizer_unref(&gen_tok);
	fts_tokenizer_unregister(fts_tokenizer_generic);
	fts_tokenizer_unregister(fts_tokenizer_email_address);
	test_end();
}

static void test_fts_tokenizer_address_search(void)
{
	static const unsigned char input[] =
		"@invalid invalid@ abc@example.com, "
		"Bar Baz <bar@example.org>, "
		"foo@domain";
	static const char *const expected_output[] = {
		"invalid", "invalid", "abc@example.com", "Bar", "Baz",
		"bar@example.org", "foo@domain", NULL
	};
	static const char *const settings[] = { "search", "" };
	struct fts_tokenizer *tok, *gen_tok;
	const char * const *eopp = expected_output;
	const char *token, *error;
	unsigned int i;
	int ret;

	test_begin("fts tokenizer search email address + parent, input one character at a time");
	fts_tokenizers_init();

	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);

	for (i = 0; i <= sizeof(input)-1; ) {
		ret = i < sizeof(input)-1 ?
			fts_tokenizer_next(tok, &input[i], 1, &token) :
			fts_tokenizer_next(tok, NULL, 0, &token);
		if (ret == 0) {
			i++;
			continue;
		}
		test_assert(*eopp != NULL);
		test_assert(null_strcmp(token, *eopp) == 0);
		eopp++;
	}
	test_assert(*eopp == NULL);
	fts_tokenizer_unref(&tok);
	fts_tokenizer_unref(&gen_tok);
	fts_tokenizers_deinit();
	test_end();
}

int main(void)
{
	static void (*test_functions[])(void) = {
		test_fts_tokenizer_generic_only,
		test_fts_tokenizer_generic_unicode_whitespace,
		test_fts_tokenizer_char_generic_only,
		test_fts_tokenizer_generic_tr29_only,
		test_fts_tokenizer_generic_tr29_unicode_whitespace,
		test_fts_tokenizer_char_generic_tr29_only,
		test_fts_tokenizer_generic_tr29_midnumlet_end,
		test_fts_tokenizer_char_address_only,
		test_fts_tokenizer_line_address_only,
		test_fts_tokenizer_rand_address_only,
		test_fts_tokenizer_address_char,
		test_fts_tokenizer_address_line,
		test_fts_tokenizer_address_rand,
		test_fts_tokenizer_address_search,
		NULL
	};

	return test_run(test_functions);
}