Mercurial > dovecot > core-2.2
changeset 18778:e3f9e4c8a338
lib-fts: tokenizers - don't include removed apostrophes as part of the token size
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Mon, 01 Jun 2015 21:48:59 +0300 |
parents | f44961c66a48 |
children | 218c3988e105 |
files | src/lib-fts/fts-tokenizer-generic.c src/lib-fts/test-fts-tokenizer.c |
diffstat | 2 files changed, 19 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/src/lib-fts/fts-tokenizer-generic.c Mon Jun 01 21:35:39 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic.c Mon Jun 01 21:48:59 2015 +0300
@@ -180,6 +180,19 @@
 	size_t append_len, pos = 0, appended = 0;
 	unichar_t c;
 
+	if (size == 0)
+		return;
+	if (data[0] == '\'' && tok->token->used == 0) {
+		/* Skip apostrophes in the beginning of the token.
+		   We need to do it here so that we don't truncate the
+		   token too early. */
+		data++;
+		size--;
+		if (size == 0)
+			return;
+		i_assert(data[0] != '\'');
+	}
+
 	i_assert(tok->max_length >= tok->token->used);
 	append_len = I_MIN(size, tok->max_length - tok->token->used);
--- a/src/lib-fts/test-fts-tokenizer.c Mon Jun 01 21:35:39 2015 +0300
+++ b/src/lib-fts/test-fts-tokenizer.c Mon Jun 01 21:48:59 2015 +0300
@@ -31,6 +31,8 @@
 	"' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",
 
+	"'1234567890123456789012345678ä,"
+
 	/* whitespace: with Unicode(utf8) U+FF01(ef bc 81)(U+2000(e2 80 80) and U+205A(e2 81 9a) and U+205F(e2 81 9f) */
 	"hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
@@ -136,6 +138,8 @@
 	"quoted", "text", "word", "hlo", "words", "you're", "bad", "word", "pre", "post", NULL,
 
+	"1234567890123456789012345678ä",
+
 	"hello", "world", "And", "there", "was", "text", "galore", "and", "more", NULL,
@@ -178,6 +182,8 @@
 	"quoted", "text", "word", "hlo", "words", "you're", "bad", "word", "pre", "post", NULL,
 
+	"1234567890123456789012345678ä",
+
 	"hello", "world", "And", "there", "was", "text", "galore", "and", "more", NULL,