Mercurial > dovecot > core-2.2
changeset 18818:c909977ec1a1
lib-fts: Added "english-possessive" filter.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Wed, 03 Jun 2015 01:04:49 +0300 |
parents | ff79a2178fd4 |
children | 06505210b25d |
files | src/lib-fts/Makefile.am src/lib-fts/fts-filter-english-possessive.c src/lib-fts/fts-filter.c src/lib-fts/fts-filter.h src/lib-fts/test-fts-filter.c |
diffstat | 5 files changed, 109 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/src/lib-fts/Makefile.am Wed Jun 03 01:04:07 2015 +0300 +++ b/src/lib-fts/Makefile.am Wed Jun 03 01:04:49 2015 +0300 @@ -62,6 +62,7 @@ libfts_la_SOURCES = \ fts-filter.c \ + fts-filter-english-possessive.c \ fts-filter-lowercase.c \ fts-filter-normalizer-icu.c \ fts-filter-stopwords.c \
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/fts-filter-english-possessive.c	Wed Jun 03 01:04:49 2015 +0300
@@ -0,0 +1,61 @@
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "unichar.h"
+#include "fts-common.h"
+#include "fts-filter-private.h"
+
+/* Returns the UTF-8 character whose encoding contains the byte at
+   str[*end_pos]. *end_pos is moved backwards until it points at the first
+   byte of that character. The caller must pass valid UTF-8: the assertions
+   fire if no start byte is found before the beginning of the string or if
+   uni_utf8_get_char() returns <= 0. */
+static unichar_t get_ending_utf8_char(const char *str, size_t *end_pos)
+{
+	unichar_t c;
+
+	while (!UTF8_IS_START_SEQ(str[*end_pos])) {
+		i_assert(*end_pos > 0);
+		*end_pos -= 1;
+	}
+	if (uni_utf8_get_char(str + *end_pos, &c) <= 0)
+		i_unreached();
+	return c;
+}
+
+/* Removes a trailing possessive suffix, i.e. an apostrophe (as defined by
+   IS_APOSTROPHE()) followed by 's' or 'S', from *token. If the suffix is
+   found, *token is pointed at a t_strndup() copy that ends just before the
+   apostrophe; otherwise *token is left unchanged. Always returns 1. */
+static int
+fts_filter_english_possessive_filter(struct fts_filter *filter ATTR_UNUSED,
+				     const char **token,
+				     const char **error_r ATTR_UNUSED)
+{
+	/* strlen() returns size_t, so keep the offset in the same type instead
+	   of narrowing it to unsigned int. */
+	size_t len = strlen(*token);
+	unichar_t c;
+
+	if (len > 1 && ((*token)[len-1] == 's' || (*token)[len-1] == 'S')) {
+		/* Point at the byte just before the trailing 's'; it may be
+		   the last byte of a multi-byte character, so the helper
+		   rewinds len to that character's first byte. */
+		len -= 2;
+		c = get_ending_utf8_char(*token, &len);
+		if (IS_APOSTROPHE(c))
+			*token = t_strndup(*token, len);
+	}
+	return 1;
+}
+
+static const struct fts_filter fts_filter_english_possessive_real = {
+	.class_name = "english-possessive",
+	.v = {
+		NULL,
+		fts_filter_english_possessive_filter,
+		NULL
+	}
+};
+
+const struct fts_filter *fts_filter_english_possessive = &fts_filter_english_possessive_real;
--- a/src/lib-fts/fts-filter.c Wed Jun 03 01:04:07 2015 +0300 +++ b/src/lib-fts/fts-filter.c Wed Jun 03 01:04:49 2015 +0300 @@ -20,6 +20,7 @@ fts_filter_register(fts_filter_stemmer_snowball); fts_filter_register(fts_filter_normalizer_icu); fts_filter_register(fts_filter_lowercase); + fts_filter_register(fts_filter_english_possessive); } void fts_filters_deinit(void)
--- a/src/lib-fts/fts-filter.h Wed Jun 03 01:04:07 2015 +0300 +++ b/src/lib-fts/fts-filter.h Wed Jun 03 01:04:49 2015 +0300 @@ -32,6 +32,9 @@ /* Lowecases the input. Currently only ASCII data is lowercased. */ extern const struct fts_filter *fts_filter_lowercase; +/* Removes <'s> suffix from words. */ +extern const struct fts_filter *fts_filter_english_possessive; + /* Register all built-in filters. */ void fts_filters_init(void); void fts_filters_deinit(void);
--- a/src/lib-fts/test-fts-filter.c Wed Jun 03 01:04:07 2015 +0300 +++ b/src/lib-fts/test-fts-filter.c Wed Jun 03 01:04:49 2015 +0300 @@ -572,6 +572,62 @@ #endif #endif +/* Feeds every input[i] through the english-possessive filter and checks that the call returns 1 and that the resulting token equals expected_output[i]. */ static void test_fts_filter_english_possessive(void) +{ + struct fts_filter *norm = NULL; + const char *input[] = { /* trailing apostrophe with no s after it */ + "foo'", + /* suffix written with the ASCII apostrophe */ + "foo's", + "fooä's", + "foo'S", + "foos'S", + "foo's's", + "foo'ss", + /* same cases with U+2019 as the apostrophe (UTF-8 bytes E2 80 99) */ + "foo\xE2\x80\x99s", + "fooä\xE2\x80\x99s", + "foo\xE2\x80\x99S", + "foos\xE2\x80\x99S", + "foo\xE2\x80\x99s\xE2\x80\x99s", + "foo\xE2\x80\x99ss" + }; + const char *expected_output[] = { /* entries are index-aligned with input[] */ + "foo'", + + "foo", + "fooä", + "foo", + "foos", + "foo's", + "foo'ss", + + "foo", + "fooä", + "foo", + "foos", + "foo\xE2\x80\x99s", + "foo\xE2\x80\x99ss" + }; + const char *error = NULL; + const char *token = NULL; + unsigned int i; + + test_begin("fts filter english possessive"); + + T_BEGIN { + test_assert(fts_filter_create(fts_filter_english_possessive, NULL, NULL, NULL, &norm, &error) == 0); + for (i = 0; i < N_ELEMENTS(input); i++) { + token = input[i]; + test_assert_idx(fts_filter_filter(norm, &token, &error) == 1, i); + test_assert_idx(null_strcmp(token, expected_output[i]) == 0, i); + } + fts_filter_unref(&norm); + } T_END; /* the unref above is expected to have cleared the pointer */ + test_assert(norm == NULL); + test_end(); +} + /* TODO: Functions to test 1. ref-unref pairs 2. multiple registers + an unregister + find */ @@ -600,6 +656,7 @@ test_fts_filter_normalizer_stopwords_stemmer_eng, #endif #endif + test_fts_filter_english_possessive, NULL }; int ret;