Mercurial > dovecot > core-2.2
changeset 18552:95a827d97e5b
fts: Change filter API to be able to return errors
Modify fts_filter_filter() to return an integer status code: 1 if a token
was returned, 0 if it was filtered away, and -1 on error. The token is now
passed as const char **token, which carries both the input and the filtered output.
author | Teemu Huovila <teemu.huovila@dovecot.fi> |
---|---|
date | Sat, 09 May 2015 11:06:45 +0300 |
parents | 7fe766887394 |
children | 3ae8ae7f1022 |
files | src/lib-fts/fts-filter-normalizer-icu.c src/lib-fts/fts-filter-normalizer-simple.c src/lib-fts/fts-filter-private.h src/lib-fts/fts-filter-stemmer-snowball.c src/lib-fts/fts-filter-stopwords.c src/lib-fts/fts-filter.c src/lib-fts/fts-filter.h src/lib-fts/test-fts-filter.c src/plugins/fts/fts-build-mail.c src/plugins/fts/fts-search-args.c |
diffstat | 10 files changed, 126 insertions(+), 95 deletions(-) [+] |
line wrap: on
line diff
--- a/src/lib-fts/fts-filter-normalizer-icu.c Sat May 09 11:05:04 2015 +0300 +++ b/src/lib-fts/fts-filter-normalizer-icu.c Sat May 09 11:06:45 2015 +0300 @@ -98,8 +98,9 @@ return 0; } -static int make_utf8(const UChar *src, char **dst, const char **error_r) +static int make_utf8(const UChar *src, const char **_dst, const char **error_r) { + char *dst; char *retp = NULL; int32_t dsize = 0; int32_t dsize_actual = 0; @@ -120,9 +121,9 @@ i_assert(NULL == retp); dsize++; /* room for '\0' byte */ - *dst = t_malloc(dsize); + dst = t_malloc(dsize); err = U_ZERO_ERROR; - retp = u_strToUTF8WithSub(*dst, dsize, &dsize_actual, src, usrc_len, + retp = u_strToUTF8WithSub(dst, dsize, &dsize_actual, src, usrc_len, UNICODE_REPLACEMENT_CHAR, &sub_num, &err); if (U_FAILURE(err)) i_panic("Lib ICU u_strToUTF8WithSub() failed: %s", @@ -137,8 +138,9 @@ " Substitutions (%d) were made.", sub_num); return -1; } - i_assert(retp == *dst); + i_assert(retp == dst); + *_dst = dst; return 0; } @@ -212,27 +214,24 @@ return 0; } -/* Returns 0 on success and -1 on error. */ -/* TODO: delay errors until _deinit() and return some other values? */ -static const char * -fts_filter_normalizer_icu_filter(struct fts_filter *filter, const char *token) +static int +fts_filter_normalizer_icu_filter(struct fts_filter *filter, const char **token) { UErrorCode err = U_ZERO_ERROR; UChar *utext = NULL; int32_t utext_cap = 0; int32_t utext_len = -1; int32_t utext_limit; - char *normalized = NULL; struct fts_filter_normalizer *np = (struct fts_filter_normalizer *)filter; /* TODO: fix error handling */ if (np->error != NULL) - return NULL; + goto err_exit; - if (make_uchar(token, &utext, &utext_cap) < 0) { + if (make_uchar(*token, &utext, &utext_cap) < 0) { fts_filter_normalizer_icu_error(&np->error, "Conversion to UChar failed"); - return NULL; + goto err_exit; } /* TODO: Some problems here. 
How much longer can the result @@ -249,8 +248,9 @@ size utrans_transUChars indicated */ utext_len++; /* room for '\0' bytes(2) */ utext_cap = utext_len; - if (make_uchar(token, &utext, &utext_cap) < 0) - return NULL; + if (make_uchar(*token, &utext, &utext_cap) < 0) { + goto err_exit; + } i_assert(utext_cap == utext_len); utext_limit = u_strlen(utext); utext_len = -1; @@ -262,13 +262,17 @@ if (U_FAILURE(err)) { icu_error(&np->error, err, "utrans_transUChars()"); - return NULL; + goto err_exit; } - if (make_utf8(utext, &normalized, &np->error) < 0) - return NULL; + if (make_utf8(utext, token, &np->error) < 0) { + goto err_exit; + } - return normalized; + return 1; + err_exit: + *token = NULL; + return -1; } #else @@ -289,7 +293,7 @@ return -1; } -static const char * +static int fts_filter_normalizer_icu_filter(struct fts_filter *filter ATTR_UNUSED, const char *token ATTR_UNUSED) {
--- a/src/lib-fts/fts-filter-normalizer-simple.c Sat May 09 11:05:04 2015 +0300 +++ b/src/lib-fts/fts-filter-normalizer-simple.c Sat May 09 11:06:45 2015 +0300 @@ -48,18 +48,21 @@ return 0; } -static const char * +static int fts_filter_normalizer_simple_filter(struct fts_filter *_filter, - const char *token) + const char **token) { struct fts_filter_normalizer_simple *filter = (struct fts_filter_normalizer_simple *)_filter; str_truncate(filter->str, 0); - if (uni_utf8_to_decomposed_titlecase(token, strlen(token), - filter->str) < 0) - return NULL; - return str_c(filter->str); + if (uni_utf8_to_decomposed_titlecase(*token, strlen(*token), + filter->str) < 0) { + *token = NULL; + return -1; + } + *token = str_c(filter->str); + return 1; } static const struct fts_filter_vfuncs normalizer_filter_vfuncs = {
--- a/src/lib-fts/fts-filter-private.h Sat May 09 11:05:04 2015 +0300 +++ b/src/lib-fts/fts-filter-private.h Sat May 09 11:06:45 2015 +0300 @@ -17,7 +17,7 @@ const char *const *settings, struct fts_filter **filter_r, const char **error_r); - const char * (*filter)(struct fts_filter *filter, const char *token); + int (*filter)(struct fts_filter *filter, const char **token); void (*destroy)(struct fts_filter *filter); };
--- a/src/lib-fts/fts-filter-stemmer-snowball.c Sat May 09 11:05:04 2015 +0300 +++ b/src/lib-fts/fts-filter-stemmer-snowball.c Sat May 09 11:06:45 2015 +0300 @@ -66,18 +66,20 @@ return 0; } -static const char * +static int fts_filter_stemmer_snowball_filter(struct fts_filter *filter, - const char *token) + const char **token) { const sb_symbol *base; int len; struct fts_filter_stemmer_snowball *sp = (struct fts_filter_stemmer_snowball *) filter; - base = sb_stemmer_stem(sp->stemmer, (const unsigned char *)token, strlen(token)); + base = sb_stemmer_stem(sp->stemmer, (const unsigned char *)*token, strlen(*token)); len = sb_stemmer_length(sp->stemmer); - return t_strdup_until(base, base + len); + *token = t_strdup_until(base, base + len); + + return *token != NULL? 1: -1; } #else @@ -101,11 +103,11 @@ { } -static const char * +static int fts_filter_stemmer_snowball_filter(struct fts_filter *filter ATTR_UNUSED, - const char *token ATTR_UNUSED) + const char **token ATTR_UNUSED) { - return NULL; + return -1; } #endif
--- a/src/lib-fts/fts-filter-stopwords.c Sat May 09 11:05:04 2015 +0300 +++ b/src/lib-fts/fts-filter-stopwords.c Sat May 09 11:06:45 2015 +0300 @@ -125,18 +125,20 @@ return ret; } -static const char * -fts_filter_stopwords_filter(struct fts_filter *filter, const char *token) +static int +fts_filter_stopwords_filter(struct fts_filter *filter, const char **token) { const char *stopword; struct fts_filter_stopwords *sp = (struct fts_filter_stopwords *) filter; - stopword = hash_table_lookup(sp->stopwords, token); - if (stopword != NULL) - return NULL; + stopword = hash_table_lookup(sp->stopwords, *token); + if (stopword != NULL) { + *token = NULL; + return 0; + } else - return token; + return 1; } const struct fts_filter_vfuncs stopwords_filter_vfuncs = {
--- a/src/lib-fts/fts-filter.c Sat May 09 11:05:04 2015 +0300 +++ b/src/lib-fts/fts-filter.c Sat May 09 11:06:45 2015 +0300 @@ -92,19 +92,19 @@ /* TODO: Avoid multiple allocations by using a buffer in v->filter? Do this non-recursively? */ -const char * -fts_filter_filter(struct fts_filter *filter, const char *token) +int +fts_filter_filter(struct fts_filter *filter, const char **token) { - const char *filtered = NULL; + int ret; if (filter->parent == NULL) return filter->v->filter(filter, token); - filtered = fts_filter_filter(filter->parent, token); + ret = fts_filter_filter(filter->parent, token); - if(filtered != NULL) - return filter->v->filter(filter, filtered); + if(ret > 0) + return filter->v->filter(filter, token); - return NULL; + return ret; }
--- a/src/lib-fts/fts-filter.h Sat May 09 11:05:04 2015 +0300 +++ b/src/lib-fts/fts-filter.h Sat May 09 11:06:45 2015 +0300 @@ -58,8 +58,11 @@ void fts_filter_ref(struct fts_filter *filter); void fts_filter_unref(struct fts_filter **filter); -/* Returns the filtered token or NULL, if it was completely removed */ -const char * -fts_filter_filter(struct fts_filter *filter, const char *token); +/* Returns 1 if token is returned in *token, 0 if token was filtered + out and -1 on error. + Input is also given via *token. +*/ +int +fts_filter_filter(struct fts_filter *filter, const char **token); #endif
--- a/src/lib-fts/test-fts-filter.c Sat May 09 11:05:04 2015 +0300 +++ b/src/lib-fts/test-fts-filter.c Sat May 09 11:06:45 2015 +0300 @@ -24,7 +24,7 @@ "drive", NULL, NULL, NULL, "reason", NULL, NULL, NULL, "sing"}; const char **ip, **op; - const char *filtered; + const char *token; test_begin("fts filter stopwords, English"); filter_class = fts_filter_find(STOPWORDS_FILTER_NAME); @@ -34,12 +34,14 @@ ip = input; op = output; while (*ip != NULL) { - filtered = fts_filter_filter(filter, *ip); - if (filtered == NULL) + token = *ip; + ret = fts_filter_filter(filter, &token); + test_assert(ret >= 0); + if (ret == 0) test_assert(*op == NULL); else { test_assert(*op != NULL); - test_assert(strcmp(*ip, filtered) == 0); + test_assert(strcmp(*ip, token) == 0); } op++; ip++; @@ -66,7 +68,7 @@ {"kuka", "kenet", "keneen", "testi", "eiv\xC3\xA4t", NULL}; const char *output2[] = {NULL, NULL, NULL, "testi", NULL}; const char **ip, **op; - const char *filtered; + const char *token; test_begin("fts filter stopwords, Finnish"); filter_class = fts_filter_find(STOPWORDS_FILTER_NAME); @@ -76,12 +78,14 @@ ip = input; op = output; while (*ip != NULL) { - filtered = fts_filter_filter(filter, *ip); - if (filtered == NULL) + token = *ip; + ret = fts_filter_filter(filter, &token); + test_assert(ret >= 0); + if (ret == 0) test_assert(*op == NULL); else { test_assert(*op != NULL); - test_assert(strcmp(*ip, filtered) == 0); + test_assert(strcmp(*ip, token) == 0); } op++; ip++; @@ -95,12 +99,13 @@ ip = input2; op = output2; while (*ip != NULL) { - filtered = fts_filter_filter(filter, *ip); - if (filtered == NULL) + token = *ip; + ret = fts_filter_filter(filter, &token); + if (ret == 0) test_assert(*op == NULL); else { test_assert(*op != NULL); - test_assert(strcmp(*ip, filtered) == 0); + test_assert(strcmp(*ip, token) == 0); } op++; ip++; @@ -127,7 +132,7 @@ "quelconque", NULL, "l\xE2\x80\x99""av\xC3\xA8nement",}; const char **ip, **op; - const char *filtered; + const char *token; test_begin("fts 
filter stopwords, French"); filter_class = fts_filter_find(STOPWORDS_FILTER_NAME); @@ -137,12 +142,14 @@ ip = input; op = output; while (*ip != NULL) { - filtered = fts_filter_filter(filter, *ip); - if (filtered == NULL) + token = *ip; + ret = fts_filter_filter(filter, &token); + test_assert(ret >= 0); + if (ret == 0) test_assert(*op == NULL); else { test_assert(*op != NULL); - test_assert(strcmp(*ip, filtered) == 0); + test_assert(strcmp(*ip, token) == 0); } op++; ip++; @@ -177,7 +184,7 @@ struct fts_filter *stemmer; const char *error; struct fts_language language = { .name = "EN" }; - const char *base = NULL; + const char *token = NULL; const char * const tokens[] = { "dries" ,"friendlies", "All", "human", "beings", "are", "born", "free", "and", "equal", "in", "dignity", "and", @@ -199,9 +206,10 @@ test_assert(ret == 0); bpp = bases; for (tpp=tokens; *tpp != NULL; tpp++) { - base = fts_filter_filter(stemmer, *tpp); - test_assert(base != NULL); - test_assert(null_strcmp(base, *bpp) == 0); + token = *tpp; + ret = fts_filter_filter(stemmer, &token); + test_assert(token != NULL); + test_assert(null_strcmp(token, *bpp) == 0); bpp++; } fts_filter_unref(&stemmer); @@ -216,7 +224,7 @@ struct fts_filter *stemmer; const char *error; struct fts_language language = { .name = "fRench" }; - const char *base = NULL; + const char *token = NULL; const char * const tokens[] = { "Tous", "les", "\xC3\xAAtres", "humains", "naissent", "libres", "et", "\xC3\xA9gaux", "en", "dignit\xC3\xA9", @@ -233,9 +241,10 @@ test_assert(ret == 0); bpp = bases; for (tpp=tokens; *tpp != NULL; tpp++) { - base = fts_filter_filter(stemmer, *tpp); - test_assert(base != NULL); - test_assert(null_strcmp(base, *bpp) == 0); + token = *tpp; + ret = fts_filter_filter(stemmer, &token); + test_assert(token != NULL); + test_assert(null_strcmp(token, *bpp) == 0); bpp++; } fts_filter_unref(&stemmer); @@ -251,7 +260,7 @@ struct fts_filter *filter; const char *error; struct fts_language language = { .name = "eN" }; - 
const char *base = NULL; + const char *token = NULL; const char * const tokens[] = { "dries" ,"friendlies", "All", "human", "beings", "are", "born", "free", "and", "equal", "in", "dignity", "and", @@ -279,12 +288,13 @@ bpp = bases; for (tpp=tokens; *tpp != NULL; tpp++) { - base = fts_filter_filter(stemmer, *tpp); - if (base == NULL) + token = *tpp; + ret = fts_filter_filter(stemmer, &token); + if (ret == 0) test_assert(*bpp == NULL); else { test_assert(*bpp != NULL); - test_assert(null_strcmp(*bpp, base) == 0); + test_assert(null_strcmp(*bpp, token) == 0); } bpp++; } @@ -322,7 +332,7 @@ const char * const settings[] = {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC", NULL}; const char *error = NULL; - const char *normalized = NULL; + const char *token = NULL; unsigned int i; test_begin("fts filter normalizer Swedish short text"); @@ -333,8 +343,9 @@ test_assert(ret == 0); for (i = 0; i < N_ELEMENTS(input); i++) { if (input[i] != NULL) { - test_assert_idx((normalized = fts_filter_filter(norm, input[i])) != NULL, i); - test_assert_idx(null_strcmp(normalized, expected_output[i]) == 0, i); + token = input[i]; + test_assert_idx(fts_filter_filter(norm, &token) == 1, i); + test_assert_idx(null_strcmp(token, expected_output[i]) == 0, i); } } fts_filter_unref(&norm); @@ -366,7 +377,7 @@ "vem kan segla forutan vind?\naaooaa" }; const char *error = NULL; - const char *normalized = NULL; + const char *token = NULL; unsigned int i; test_begin("fts filter normalizer Swedish short text using default ID"); @@ -377,8 +388,9 @@ test_assert(ret == 0); for (i = 0; i < N_ELEMENTS(input); i++) { if (input[i] != NULL) { - test_assert_idx((normalized = fts_filter_filter(norm, input[i])) != NULL, i); - test_assert_idx(null_strcmp(normalized, expected_output[i]) == 0, i); + token = input[i]; + test_assert_idx(fts_filter_filter(norm, &token) == 1, i); + test_assert_idx(null_strcmp(token, expected_output[i]) == 0, i); } } fts_filter_unref(&norm); @@ -398,7 +410,7 @@ {"id", 
"Any-Lower; NFKD; [: Nonspacing Mark :] Remove", NULL}; char buf[4096] = {0}; const char *error = NULL; - const char *normalized = NULL; + const char *tokens; int ret; unsigned char sha512_digest[SHA512_RESULTLEN]; struct sha512_ctx ctx; @@ -424,11 +436,11 @@ test_assert(input != NULL); sha512_init(&ctx); while (NULL != fgets(buf, sizeof(buf), input)) { - - if ((normalized = fts_filter_filter(norm, buf)) == NULL){ + tokens = buf; + if (fts_filter_filter(norm, &tokens) != 1){ break; } - sha512_loop(&ctx, normalized, strlen(normalized)); + sha512_loop(&ctx, tokens, strlen(tokens)); } fclose(input); sha512_result(&ctx, sha512_digest); @@ -470,7 +482,7 @@ //{"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC", NULL}; {"id", "Lower", NULL}; struct fts_language language = { .name = "En" }; - const char *base = NULL; + const char *token = NULL; const char * const tokens[] = { "dries" ,"friendlies", "All", "human", "beings", "are", "born", "free", "and", "equal", "in", "dignity", "and", @@ -503,12 +515,13 @@ bpp = bases; for (tpp = tokens; *tpp != NULL; tpp++) { - base = fts_filter_filter(stemmer, *tpp); - if (base == NULL) + token = *tpp; + ret = fts_filter_filter(stemmer, &token); + if (ret == 0) test_assert(*bpp == NULL); else { test_assert(*bpp != NULL); - test_assert(strcasecmp(*bpp, base) == 0); + test_assert(strcasecmp(*bpp, token) == 0); } bpp++; }
--- a/src/plugins/fts/fts-build-mail.c Sat May 09 11:05:04 2015 +0300 +++ b/src/plugins/fts/fts-build-mail.c Sat May 09 11:06:45 2015 +0300 @@ -249,9 +249,11 @@ tokenizer = fts_user_get_index_tokenizer(ctx->update_ctx->backend->ns->user); while ((ret = fts_tokenizer_next(tokenizer, data, size, &token)) > 0) { if (filter != NULL) { - token = fts_filter_filter(filter, token); - if (token == NULL) + ret = fts_filter_filter(filter, &token); + if (ret == 0) continue; + if (ret < 0) + break; } if (fts_backend_update_build_more(ctx->update_ctx, (const void *)token,
--- a/src/plugins/fts/fts-search-args.c Sat May 09 11:05:04 2015 +0300 +++ b/src/plugins/fts/fts-search-args.c Sat May 09 11:06:45 2015 +0300 @@ -64,6 +64,7 @@ struct fts_user_language *const *langp; ARRAY_TYPE(const_string) tokens; const char *token2; + int ret; t_array_init(&tokens, 4); /* first add the word exactly as it without any tokenization */ @@ -73,9 +74,10 @@ /* add the word filtered */ array_foreach(languages, langp) { - token2 = (*langp)->filter == NULL ? token : - fts_filter_filter((*langp)->filter, token); - if (token2 != NULL) { + token2 = t_strdup(token); + if ((*langp)->filter != NULL) + ret = fts_filter_filter((*langp)->filter, &token2); + if (ret > 0) { token2 = t_strdup(token2); array_append(&tokens, &token2, 1); }