Mercurial > dovecot > core-2.2
changeset 18632:9f06c6054e3e
lib-fts: Improved test-fts-tokenizer to run multiple text inputs
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Mon, 11 May 2015 14:34:50 +0300 |
parents | 103f64df4e77 |
children | 2483039db977 |
files | src/lib-fts/test-fts-tokenizer.c |
diffstat | 1 files changed, 73 insertions(+), 95 deletions(-) [+] |
line wrap: on
line diff
--- a/src/lib-fts/test-fts-tokenizer.c Mon May 11 14:22:05 2015 +0300 +++ b/src/lib-fts/test-fts-tokenizer.c Mon May 11 14:34:50 2015 +0300 @@ -9,22 +9,34 @@ #include <stdlib.h> -#define TEST_INPUT_TEXT \ - "hello world\r\n\nAnd there\twas: text galore, " \ - "abc@example.com, " \ - "Bar Baz <bar@example.org>, " \ - "foo@domain " \ - "1234567890123456789012345678ä," \ - "12345678901234567890123456789ä," \ - "123456789012345678901234567890ä," \ - "and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n " \ - "(\"Hello world\")3.14 3,14 last" #define TEST_INPUT_ADDRESS \ "@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \ "Bar Baz <bar@example.org>" \ "Foo Bar (comment)foo.bar@host.example.org " \ "foo, foo@domain" +static const char *test_inputs[] = { + /* generic things and word truncation: */ + "hello world\r\n\nAnd there\twas: text galore, " + "abc@example.com, " + "Bar Baz <bar@example.org>, " + "foo@domain " + "1234567890123456789012345678ä," + "12345678901234567890123456789ä," + "123456789012345678901234567890ä," + "and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n " + "(\"Hello world\")3.14 3,14 last", + + /* whitespace: with Unicode(utf8) U+FF01(ef bc 81)(U+2000(e2 80 80) and + U+205A(e2 81 9a) and U+205F(e2 81 9f) */ + "hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text " + "galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n", + + /* TR29 MinNumLet U+FF0E at end: u+FF0E is EF BC 8E */ + "hello world\xEF\xBC\x8E" + +}; + static void test_fts_tokenizer_find(void) { test_begin("fts tokenizer find"); @@ -33,16 +45,17 @@ test_end(); } -static void +static unsigned int test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input, - const char *const *expected_output) + const char *const *expected_output, + unsigned int first_outi) { const unsigned char *input = (const unsigned char *)_input; const char *token, *error; - unsigned int i, max, outi, char_len, input_len = strlen(_input); + unsigned int i, outi, max, char_len, input_len = strlen(_input); /* test all input at once */ - outi = 0; + outi = first_outi; while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0) { test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi); outi++; @@ -51,10 +64,11 @@ test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi); outi++; } - test_assert(expected_output[outi] == NULL); + test_assert_idx(expected_output[outi] == NULL, outi); /* test input one byte at a time */ - for (i = outi = 0; i < input_len; i += char_len) { + outi = first_outi; + for (i = 0; i < input_len; i += char_len) { char_len = uni_utf8_char_bytes(input[i]); while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) { test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi); @@ -65,10 +79,11 @@ test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi); outi++; } - test_assert(expected_output[outi] == NULL); + test_assert_idx(expected_output[outi] == NULL, outi); /* test input in random chunks */ - for (i = outi = 0; i < input_len; i += char_len) { + outi = first_outi; + for (i = 0; i < input_len; i += char_len) { max = rand() % (input_len - i) + 1; for (char_len = 0; char_len < max; ) char_len += uni_utf8_char_bytes(input[i+char_len]); @@ -81,12 +96,25 @@ test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi); outi++; } - test_assert(expected_output[outi] == NULL); + test_assert_idx(expected_output[outi] == NULL, outi); + return outi+1; +} + +static void +test_tokenizer_inputs(struct fts_tokenizer *tok, + const char *const *expected_output) +{ + unsigned int i, outi = 0; + + for (i = 0; i < N_ELEMENTS(test_inputs); i++) { + outi = test_tokenizer_inputoutput(tok, test_inputs[i], + expected_output, outi); + } + test_assert_idx(expected_output[outi] == NULL, outi); } static void test_fts_tokenizer_generic_only(void) { - static const char input[] = TEST_INPUT_TEXT; static const char *const expected_output[] = { "hello", "world", "And", "there", "was", "text", "galore", @@ -96,7 +124,15 @@ "12345678901234567890123456789", "123456789012345678901234567890", "and", "longlonglongabcdefghijklmnopqr", - "more", "Hello", "world", "3", "14", "3", "14", "last", NULL + "more", "Hello", "world", "3", "14", "3", "14", "last", NULL, + + "hello", "world", "And", + "there", "was", "text", "galore", + "and", "more", NULL, + + "hello", "world", NULL, + + NULL }; struct fts_tokenizer *tok; const char *error; @@ -105,38 +141,17 @@ test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0); test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE); - test_tokenizer_inputoutput(tok, input, expected_output); - fts_tokenizer_unref(&tok); - test_end(); -} - -static void test_fts_tokenizer_generic_unicode_whitespace(void) -{ - /* with Unicode(utf8) U+FF01(ef bc 81)(U+2000(e2 80 80) and - U+205A(e2 81 9a) and U+205F(e2 81 9f )*/ - static const char input[] = - "hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text " - "galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n"; - static const char *const expected_output[] = { - "hello", "world", "And", - "there", "was", "text", "galore", - "and", "more", NULL - }; - struct fts_tokenizer *tok; - const char *error; - - test_begin("fts tokenizer generic simple with Unicode whitespace"); - test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0); - test_tokenizer_inputoutput(tok, input, expected_output); + test_tokenizer_inputs(tok, expected_output); fts_tokenizer_unref(&tok); test_end(); } const char *const tr29_settings[] = {"algorithm", "tr29", NULL}; +/* TODO: U+206F is in "Format" and therefore currently not word break. + This definitely needs to be remapped. */ static void test_fts_tokenizer_generic_tr29_only(void) { - static const char input[] = TEST_INPUT_TEXT; static const char *const expected_output[] = { "hello", "world", "And", "there", "was", "text", "galore", @@ -146,56 +161,22 @@ "12345678901234567890123456789", "123456789012345678901234567890", "and", "longlonglongabcdefghijklmnopqr", - "more", "Hello", "world", "3.14", "3,14", "last", NULL + "more", "Hello", "world", "3.14", "3,14", "last", NULL, + + "hello", "world", "And", + "there", "was", "text", "galore", + "and", "more", NULL, + + "hello", "world", NULL, + + NULL }; struct fts_tokenizer *tok; const char *error; test_begin("fts tokenizer generic TR29"); test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0); - test_tokenizer_inputoutput(tok, input, expected_output); - fts_tokenizer_unref(&tok); - test_end(); -} - -/* TODO: U+206F is in "Format" and therefore currently not word break. - This definitely needs to be remapped. */ -static void test_fts_tokenizer_generic_tr29_unicode_whitespace(void) -{ - /* with Unicode(utf8) U+2000(e2 80 80) and U+205A(e2 81 9a) and U+205F(e2 - 81 9f)*/ - static const char input[] = - "hello world\r\nAnd\xE2\x80\x80there\twas: text " - "galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n"; - static const char *const expected_output[] = { - "hello", "world", "And", - "there", "was", "text", "galore", - "and", "more", NULL - }; - struct fts_tokenizer *tok; - const char *error; - - test_begin("fts tokenizer generic TR29 with Unicode whitespace"); - test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0); - test_tokenizer_inputoutput(tok, input, expected_output); - fts_tokenizer_unref(&tok); - test_end(); -} - -static void test_fts_tokenizer_generic_tr29_midnumlet_end(void) -{ - /* u+FF0E is EF BC 8E */ - static const char input[] = - "hello world\xEF\xBC\x8E"; - static const char *const expected_output[] = { - "hello", "world", NULL - }; - struct fts_tokenizer *tok; - const char *error; - - test_begin("fts tokenizer generic TR29 with MinNumLet U+FF0E at end"); - test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0); - test_tokenizer_inputoutput(tok, input, expected_output); + test_tokenizer_inputs(tok, expected_output); fts_tokenizer_unref(&tok); test_end(); } @@ -212,7 +193,7 @@ test_begin("fts tokenizer email address only"); test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0); - test_tokenizer_inputoutput(tok, input, expected_output); + test_tokenizer_inputoutput(tok, input, expected_output, 0); fts_tokenizer_unref(&tok); test_end(); } @@ -232,7 +213,7 @@ test_begin("fts tokenizer email address + parent"); test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0); test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0); - test_tokenizer_inputoutput(tok, input, expected_output); + test_tokenizer_inputoutput(tok, input, expected_output, 0); fts_tokenizer_unref(&tok); fts_tokenizer_unref(&gen_tok); test_end(); @@ -254,7 +235,7 @@ test_begin("fts tokenizer search email address + parent"); test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0); test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0); - test_tokenizer_inputoutput(tok, input, expected_output); + test_tokenizer_inputoutput(tok, input, expected_output, 0); /* make sure state is forgotten at EOF */ test_assert(fts_tokenizer_next(tok, (const void *)"foo", 3, &token, &error) == 0); @@ -290,10 +271,7 @@ static void (*test_functions[])(void) = { test_fts_tokenizer_find, test_fts_tokenizer_generic_only, - test_fts_tokenizer_generic_unicode_whitespace, test_fts_tokenizer_generic_tr29_only, - test_fts_tokenizer_generic_tr29_unicode_whitespace, - test_fts_tokenizer_generic_tr29_midnumlet_end, test_fts_tokenizer_address_only, test_fts_tokenizer_address_parent, test_fts_tokenizer_address_search,