Mercurial > dovecot > core-2.2
changeset 18772:62b201a1ee06
lib-fts: tr29 cleanup - consistently call valid chars "token" and "non-token" chars.
Instead of word/token/text.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Mon, 01 Jun 2015 21:16:35 +0300 |
parents | 04ea590951c1 |
children | b239f075147b |
files | src/lib-fts/fts-tokenizer-generic.c |
diffstat | 1 files changed, 7 insertions(+), 7 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/src/lib-fts/fts-tokenizer-generic.c Mon Jun 01 21:11:55 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic.c Mon Jun 01 21:16:35 2015 +0300
@@ -513,9 +513,8 @@
   TODO: Does this "reverse approach" include too much in "whitespace"?
   TODO: Possibly use is_word_break()?
  */
-static bool is_nonword(enum letter_type lt)
+static bool is_nontoken(enum letter_type lt)
 {
-
 	if (lt == LETTER_TYPE_REGIONAL_INDICATOR || lt == LETTER_TYPE_KATAKANA ||
 	    lt == LETTER_TYPE_HEBREW_LETTER || lt == LETTER_TYPE_ALETTER ||
 	    lt == LETTER_TYPE_NUMERIC)
@@ -561,7 +560,7 @@
 		i_assert(len > 0);
 		len--;
 	}
-	/* we're skipping all non-text at the beginning of the word,
+	/* we're skipping all non-token chars at the beginning of the word,
 	   so by this point we must have something here - even if we just
 	   deleted the last character */
 	i_assert(len > 0);
@@ -596,7 +595,7 @@
  * No word boundary at Start-Of-Text or End-of-Text (Wb1 and WB2).
  * Break just once, not before and after.
  * Break at MidNumLet, except apostrophes (diverging from WB6/WB7).
- * Other things also (e.g. is_nonword(), not really pure tr29. Meant
+ * Other things also (e.g. is_nontoken(), not really pure tr29. Meant
    to assist in finding individual words.
 */
 static bool
@@ -637,9 +636,10 @@
 			i_unreached();
 		i += uni_utf8_char_bytes(data[i]);
 		lt = letter_type(c);
-		if (tok->prev_letter == LETTER_TYPE_NONE && is_nonword(lt)) {
-			/* TODO: test that start_skip works with multibyte utf8 chars */
-			start_skip = i; /* Skip non-token chars at start of data */
+		if (tok->prev_letter == LETTER_TYPE_NONE && is_nontoken(lt)) {
+			/* Skip non-token chars at the beginning of token */
+			i_assert(tok->token->used == 0);
+			start_skip = i;
 			continue;
 		}
 		if (uni_found_word_boundary(tok, lt)) {
```