Mercurial > dovecot > core-2.2
changeset 18560:b0a934361563
lib-fts: Improve using max_length in tr29 tokenizer
author | Teemu Huovila <teemu.huovila@dovecot.fi> |
---|---|
date | Sat, 09 May 2015 11:17:03 +0300 |
parents | 2048dade16e7 |
children | 7de648f42bc0 |
files | src/lib-fts/fts-tokenizer-generic.c src/lib-fts/test-fts-tokenizer.c |
diffstat | 2 files changed, 12 insertions(+), 10 deletions(-) [+] |
line wrap: on
line diff
--- a/src/lib-fts/fts-tokenizer-generic.c Sat May 09 11:16:22 2015 +0300 +++ b/src/lib-fts/fts-tokenizer-generic.c Sat May 09 11:17:03 2015 +0300 @@ -469,11 +469,14 @@ const char **token_r) { size_t end_skip = 0; + ssize_t len; if (is_one_past_end(tok)) end_skip = tok->last_size; - *token_r = t_strndup(tok->token->data, tok->token->used - end_skip); + len = I_MIN(tok->token->used, tok->max_length) - end_skip; + i_assert(len > 0); + *token_r = t_strndup(tok->token->data, len); buffer_set_used_size(tok->token, 0); tok->prev_prev_letter = LETTER_TYPE_NONE; tok->prev_letter = LETTER_TYPE_NONE; @@ -525,7 +528,7 @@ (struct generic_fts_tokenizer *)_tok; unichar_t c; - size_t i, char_start_i, start_skip = 0, len; + size_t i, char_start_i, start_skip = 0; enum letter_type lt; /* TODO: Process 8bit chars separately, to speed things up. */ @@ -542,17 +545,15 @@ continue; } if (uni_found_word_boundary(tok, lt)) { - len = I_MIN(char_start_i, tok->max_length); - i_assert(len >= start_skip && size >= start_skip); + i_assert(char_start_i >= start_skip && size >= start_skip); buffer_append(tok->token, data + start_skip, - len - start_skip); + char_start_i - start_skip); *skip_r = i + 1; return fts_tokenizer_generic_tr29_current_token(tok, token_r); } } - len = I_MIN(i, tok->max_length); - i_assert(len >= start_skip && size >= start_skip); - buffer_append(tok->token, data + start_skip, len - start_skip); + i_assert(i >= start_skip && size >= start_skip); + buffer_append(tok->token, data + start_skip, i - start_skip); *skip_r = i; if (size == 0 && tok->token->used > 0) {
--- a/src/lib-fts/test-fts-tokenizer.c Sat May 09 11:16:22 2015 +0300 +++ b/src/lib-fts/test-fts-tokenizer.c Sat May 09 11:17:03 2015 +0300 @@ -126,12 +126,13 @@ { static const unsigned char input[] = "hello world\r\n\nAnd there\twas: text " - "galore, and more.\n\n (\"Hello world\")3.14 3,14 last 1."; + "galore, and more.\n\n (\"Hello world\")3.14 3,14 last" + " longlonglongabcdefghijklmnopqrstuvwxyz 1."; static const char *const expected_output[] = { "hello", "world", "And", "there", "was", "text", "galore", "and", "more", "Hello", "world", "3.14", - "3,14", "last", "1", NULL + "3,14", "last", "longlonglongabcdefghijklmnopqr", "1", NULL }; const struct fts_tokenizer *tok_class; struct fts_tokenizer *tok;