Mercurial > dovecot > core-2.2
changeset 18559:2048dade16e7
lib-fts: Fixed using max_length setting in simple tokenizer
author | Teemu Huovila <teemu.huovila@dovecot.fi> |
---|---|
date | Sat, 09 May 2015 11:16:22 +0300 |
parents | 8d445959df03 |
children | b0a934361563 |
files | src/lib-fts/fts-tokenizer-generic.c src/lib-fts/test-fts-tokenizer.c |
diffstat | 2 files changed, 16 insertions(+), 10 deletions(-) [+] |
line wrap: on
line diff
--- a/src/lib-fts/fts-tokenizer-generic.c	Sat May 09 11:15:50 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic.c	Sat May 09 11:16:22 2015 +0300
@@ -86,7 +86,7 @@
 fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
                                            const char **token_r)
 {
-	*token_r = t_strndup(tok->token->data, tok->token->used);
+	*token_r = t_strndup(tok->token->data, I_MIN(tok->token->used, tok->max_length));
 	buffer_set_used_size(tok->token, 0);
 	return 1;
 }
@@ -147,8 +147,7 @@
 		char_start_i = i;
 		if (data_is_word_boundary(data, size, &i)) {
 			len = char_start_i - start;
-			buffer_append(tok->token, data + start,
-				      I_MIN(len, tok->max_length));
+			buffer_append(tok->token, data + start, len);
 			if (tok->token->used == 0) {
 				/* no text read yet */
 				start = i + 1;
@@ -161,14 +160,16 @@
 	}
 	/* word boundary not found yet */
 	len = i - start;
-	buffer_append(tok->token, data + start, I_MIN(len, tok->max_length));
-
+	buffer_append(tok->token, data + start, len);
 	*skip_r = i;
 
-	if (size == 0 && tok->token->used > 0) {
-		/* return the last token */
+	/* return the last token */
+	if (size == 0 && tok->token->used > 0)
 		return fts_tokenizer_generic_simple_current_token(tok, token_r);
-	}
+
+	/* token too long */
+	if (tok->token->used > tok->max_length)
+		return fts_tokenizer_generic_simple_current_token(tok, token_r);
 
 	return 0;
 }
--- a/src/lib-fts/test-fts-tokenizer.c	Sat May 09 11:15:50 2015 +0300
+++ b/src/lib-fts/test-fts-tokenizer.c	Sat May 09 11:16:22 2015 +0300
@@ -6,6 +6,8 @@
 #include "test-common.h"
 #include "fts-tokenizer.h"
 #include "fts-tokenizer-private.h"
+/* TODO: fix including and linking of this. */
+/* #include "fts-tokenizer-generic-private.h" */
 
 #include <stdlib.h>
 
@@ -13,11 +15,12 @@
 {
 	static const unsigned char input[] =
 		"hello world\r\nAnd there\twas: text "
-		"galore, and more.\n\n (\"Hello world\")last ";
+		"galore, and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n (\"Hello world\")last ";
 	static const char *const expected_output[] = {
 		"hello", "world", "And",
 		"there", "was", "text", "galore",
-		"and", "more", "Hello", "world", "last", NULL
+		"and", "longlonglongabcdefghijklmnopqr",
+		"more", "Hello", "world", "last", NULL
 	};
 	const struct fts_tokenizer *tok_class;
 	struct fts_tokenizer *tok;
@@ -28,6 +31,8 @@
 	fts_tokenizers_init();
 	tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
 	test_assert(fts_tokenizer_create(tok_class, NULL, NULL, &tok, &error) == 0);
+/*TODO: Uncomment when fts-tokenizer-generic-private.h inclusion is fixed */
+/*test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);*/
 	while (fts_tokenizer_next(tok, input, sizeof(input)-1, &token) > 0) {
 		test_assert(strcmp(token, *eopp) == 0);
 		eopp++;