changeset 18552:95a827d97e5b

fts: Change filter API to be able to return errors. Modify fts_filter_filter() to return integer status codes: it returns 1 if a token was returned, 0 if it was filtered away, and -1 on error. The token is now passed in and returned through the same const char **token parameter.
author Teemu Huovila <teemu.huovila@dovecot.fi>
date Sat, 09 May 2015 11:06:45 +0300
parents 7fe766887394
children 3ae8ae7f1022
files src/lib-fts/fts-filter-normalizer-icu.c src/lib-fts/fts-filter-normalizer-simple.c src/lib-fts/fts-filter-private.h src/lib-fts/fts-filter-stemmer-snowball.c src/lib-fts/fts-filter-stopwords.c src/lib-fts/fts-filter.c src/lib-fts/fts-filter.h src/lib-fts/test-fts-filter.c src/plugins/fts/fts-build-mail.c src/plugins/fts/fts-search-args.c
diffstat 10 files changed, 126 insertions(+), 95 deletions(-) [+]
line wrap: on
line diff
--- a/src/lib-fts/fts-filter-normalizer-icu.c	Sat May 09 11:05:04 2015 +0300
+++ b/src/lib-fts/fts-filter-normalizer-icu.c	Sat May 09 11:06:45 2015 +0300
@@ -98,8 +98,9 @@
 	return 0;
 }
 
-static int make_utf8(const UChar *src, char **dst, const char **error_r)
+static int make_utf8(const UChar *src, const char **_dst, const char **error_r)
 {
+	char *dst;
 	char *retp = NULL;
 	int32_t dsize = 0;
 	int32_t dsize_actual = 0;
@@ -120,9 +121,9 @@
 	i_assert(NULL == retp);
 
 	dsize++; /* room for '\0' byte */
-	*dst = t_malloc(dsize);
+	dst = t_malloc(dsize);
 	err = U_ZERO_ERROR;
-	retp = u_strToUTF8WithSub(*dst, dsize, &dsize_actual, src, usrc_len,
+	retp = u_strToUTF8WithSub(dst, dsize, &dsize_actual, src, usrc_len,
 	                         UNICODE_REPLACEMENT_CHAR, &sub_num, &err);
 	if (U_FAILURE(err))
 		i_panic("Lib ICU u_strToUTF8WithSub() failed: %s",
@@ -137,8 +138,9 @@
 		                    " Substitutions (%d) were made.", sub_num);
 		return -1;
 	}
-	i_assert(retp == *dst);
+	i_assert(retp == dst);
 
+	*_dst = dst;
 	return 0;
 }
 
@@ -212,27 +214,24 @@
 	return 0;
 }
 
-/* Returns 0 on success and -1 on error. */
-/* TODO: delay errors until _deinit() and return some other values? */
-static const char *
-fts_filter_normalizer_icu_filter(struct fts_filter *filter, const char *token)
+static int
+fts_filter_normalizer_icu_filter(struct fts_filter *filter, const char **token)
 {
 	UErrorCode err = U_ZERO_ERROR;
 	UChar *utext = NULL;
 	int32_t utext_cap = 0;
 	int32_t utext_len = -1;
 	int32_t utext_limit;
-	char *normalized = NULL;
 	struct fts_filter_normalizer *np =
 		(struct fts_filter_normalizer *)filter;
 
 	/* TODO: fix error handling */
 	if (np->error != NULL)
-		return NULL;
+		goto err_exit;
 
-	if (make_uchar(token, &utext, &utext_cap) < 0) {
+	if (make_uchar(*token, &utext, &utext_cap) < 0) {
 		fts_filter_normalizer_icu_error(&np->error, "Conversion to UChar failed");
-		return NULL;
+		goto err_exit;
 	}
 	/*
 	   TODO: Some problems here.  How much longer can the result
@@ -249,8 +248,9 @@
 		   size utrans_transUChars indicated */
 		utext_len++; /* room for '\0' bytes(2) */
 		utext_cap = utext_len;
-		if (make_uchar(token, &utext, &utext_cap) < 0)
-			return NULL;
+		if (make_uchar(*token, &utext, &utext_cap) < 0) {
+			goto err_exit;
+		}
 		i_assert(utext_cap ==  utext_len);
 		utext_limit = u_strlen(utext);
 		utext_len = -1;
@@ -262,13 +262,17 @@
 
 	if (U_FAILURE(err)) {
 		icu_error(&np->error, err, "utrans_transUChars()");
-		return NULL;
+		goto err_exit;
 	}
 
-	if (make_utf8(utext, &normalized, &np->error) < 0)
-		return NULL;
+	if (make_utf8(utext, token, &np->error) < 0) {
+		goto err_exit;
+	}
 
-	return normalized;
+	return 1;
+ err_exit:
+	*token = NULL;
+	return -1;
 }
 
 #else
@@ -289,7 +293,7 @@
 	return -1;
 }
 
-static const char *
+static int
 fts_filter_normalizer_icu_filter(struct fts_filter *filter ATTR_UNUSED,
 				 const char *token ATTR_UNUSED)
 {
--- a/src/lib-fts/fts-filter-normalizer-simple.c	Sat May 09 11:05:04 2015 +0300
+++ b/src/lib-fts/fts-filter-normalizer-simple.c	Sat May 09 11:06:45 2015 +0300
@@ -48,18 +48,21 @@
 	return 0;
 }
 
-static const char *
+static int
 fts_filter_normalizer_simple_filter(struct fts_filter *_filter,
-				    const char *token)
+				    const char **token)
 {
 	struct fts_filter_normalizer_simple *filter =
 		(struct fts_filter_normalizer_simple *)_filter;
 
 	str_truncate(filter->str, 0);
-	if (uni_utf8_to_decomposed_titlecase(token, strlen(token),
-					     filter->str) < 0)
-		return NULL;
-	return str_c(filter->str);
+	if (uni_utf8_to_decomposed_titlecase(*token, strlen(*token),
+	                                     filter->str) < 0) {
+		*token = NULL;
+		return -1;
+	}
+	*token = str_c(filter->str);
+	return 1;
 }
 
 static const struct fts_filter_vfuncs normalizer_filter_vfuncs = {
--- a/src/lib-fts/fts-filter-private.h	Sat May 09 11:05:04 2015 +0300
+++ b/src/lib-fts/fts-filter-private.h	Sat May 09 11:06:45 2015 +0300
@@ -17,7 +17,7 @@
 	              const char *const *settings,
 	              struct fts_filter **filter_r,
 	              const char **error_r);
-	const char * (*filter)(struct fts_filter *filter, const char *token);
+	int (*filter)(struct fts_filter *filter, const char **token);
 	void (*destroy)(struct fts_filter *filter);
 };
 
--- a/src/lib-fts/fts-filter-stemmer-snowball.c	Sat May 09 11:05:04 2015 +0300
+++ b/src/lib-fts/fts-filter-stemmer-snowball.c	Sat May 09 11:06:45 2015 +0300
@@ -66,18 +66,20 @@
 	return 0;
 }
 
-static const char *
+static int
 fts_filter_stemmer_snowball_filter(struct fts_filter *filter,
-                                   const char *token)
+                                   const char **token)
 {
 	const sb_symbol *base;
 	int len;
 	struct fts_filter_stemmer_snowball *sp =
 		(struct fts_filter_stemmer_snowball *) filter;
 
-	base = sb_stemmer_stem(sp->stemmer, (const unsigned char *)token, strlen(token));
+	base = sb_stemmer_stem(sp->stemmer, (const unsigned char *)*token, strlen(*token));
 	len = sb_stemmer_length(sp->stemmer);
-	return t_strdup_until(base, base + len);
+	*token = t_strdup_until(base, base + len);
+
+	return *token != NULL? 1: -1;
 }
 
 #else
@@ -101,11 +103,11 @@
 {
 }
 
-static const char *
+static int
 fts_filter_stemmer_snowball_filter(struct fts_filter *filter ATTR_UNUSED,
-                                   const char *token ATTR_UNUSED)
+                                   const char **token ATTR_UNUSED)
 {
-	return NULL;
+	return -1;
 }
 
 #endif
--- a/src/lib-fts/fts-filter-stopwords.c	Sat May 09 11:05:04 2015 +0300
+++ b/src/lib-fts/fts-filter-stopwords.c	Sat May 09 11:06:45 2015 +0300
@@ -125,18 +125,20 @@
 	return ret;
 }
 
-static const char *
-fts_filter_stopwords_filter(struct fts_filter *filter, const char *token)
+static int
+fts_filter_stopwords_filter(struct fts_filter *filter, const char **token)
 {
 	const char *stopword;
 	struct fts_filter_stopwords *sp =
 		(struct fts_filter_stopwords *) filter;
 
-	stopword = hash_table_lookup(sp->stopwords, token);
-	if (stopword != NULL)
-		return NULL;
+	stopword = hash_table_lookup(sp->stopwords, *token);
+	if (stopword != NULL) {
+		*token = NULL;
+		return 0;
+	}
 	else
-		return token;
+		return 1;
 }
 
 const struct fts_filter_vfuncs stopwords_filter_vfuncs = {
--- a/src/lib-fts/fts-filter.c	Sat May 09 11:05:04 2015 +0300
+++ b/src/lib-fts/fts-filter.c	Sat May 09 11:06:45 2015 +0300
@@ -92,19 +92,19 @@
 
 /* TODO: Avoid multiple allocations by using a buffer in v->filter?
  Do this non-recursively? */
-const char *
-fts_filter_filter(struct fts_filter *filter, const char *token)
+int
+fts_filter_filter(struct fts_filter *filter, const char **token)
 
 {
-	const char *filtered = NULL;
+	int ret;
 
 	if (filter->parent == NULL)
 		return filter->v->filter(filter, token);
 
-	filtered = fts_filter_filter(filter->parent, token);
+	ret = fts_filter_filter(filter->parent, token);
 
-	if(filtered != NULL)
-		return filter->v->filter(filter, filtered);
+	if(ret > 0)
+		return filter->v->filter(filter, token);
 
-	return NULL;
+	return ret;
 }
--- a/src/lib-fts/fts-filter.h	Sat May 09 11:05:04 2015 +0300
+++ b/src/lib-fts/fts-filter.h	Sat May 09 11:06:45 2015 +0300
@@ -58,8 +58,11 @@
 void fts_filter_ref(struct fts_filter *filter);
 void fts_filter_unref(struct fts_filter **filter);
 
-/* Returns the filtered token or NULL, if it was completely removed */
-const char *
-fts_filter_filter(struct fts_filter *filter, const char *token);
+/* Returns 1 if token is returned in *token, 0 if token was filtered
+   out and -1 on error.
+   Input is also given via *token.
+*/
+int
+fts_filter_filter(struct fts_filter *filter, const char **token);
 
 #endif
--- a/src/lib-fts/test-fts-filter.c	Sat May 09 11:05:04 2015 +0300
+++ b/src/lib-fts/test-fts-filter.c	Sat May 09 11:06:45 2015 +0300
@@ -24,7 +24,7 @@
 	                       "drive", NULL, NULL, NULL, "reason",
 	                       NULL, NULL, NULL,  "sing"};
 	const char **ip, **op;
-	const char *filtered;
+	const char *token;
 
 	test_begin("fts filter stopwords, English");
 	filter_class = fts_filter_find(STOPWORDS_FILTER_NAME);
@@ -34,12 +34,14 @@
 	ip = input;
 	op = output;
 	while (*ip != NULL) {
-		filtered = fts_filter_filter(filter, *ip);
-		if (filtered == NULL)
+		token = *ip;
+		ret = fts_filter_filter(filter, &token);
+		test_assert(ret >= 0);
+		if (ret == 0)
 			test_assert(*op == NULL);
 		else {
 			test_assert(*op != NULL);
-			test_assert(strcmp(*ip, filtered)  == 0);
+			test_assert(strcmp(*ip, token)  == 0);
 		}
 		op++;
 		ip++;
@@ -66,7 +68,7 @@
 		{"kuka", "kenet", "keneen", "testi", "eiv\xC3\xA4t", NULL};
 	const char *output2[] = {NULL, NULL, NULL, "testi", NULL};
 	const char **ip, **op;
-	const char *filtered;
+	const char *token;
 
 	test_begin("fts filter stopwords, Finnish");
 	filter_class = fts_filter_find(STOPWORDS_FILTER_NAME);
@@ -76,12 +78,14 @@
 	ip = input;
 	op = output;
 	while (*ip != NULL) {
-		filtered = fts_filter_filter(filter, *ip);
-		if (filtered == NULL)
+		token = *ip;
+		ret = fts_filter_filter(filter, &token);
+		test_assert(ret >= 0);
+		if (ret == 0)
 			test_assert(*op == NULL);
 		else {
 			test_assert(*op != NULL);
-			test_assert(strcmp(*ip, filtered)  == 0);
+			test_assert(strcmp(*ip, token)  == 0);
 		}
 		op++;
 		ip++;
@@ -95,12 +99,13 @@
 	ip = input2;
 	op = output2;
 	while (*ip != NULL) {
-		filtered = fts_filter_filter(filter, *ip);
-		if (filtered == NULL)
+		token = *ip;
+		ret = fts_filter_filter(filter, &token);
+		if (ret == 0)
 			test_assert(*op == NULL);
 		else {
 			test_assert(*op != NULL);
-			test_assert(strcmp(*ip, filtered)  == 0);
+			test_assert(strcmp(*ip, token)  == 0);
 		}
 		op++;
 		ip++;
@@ -127,7 +132,7 @@
 	                        "quelconque", NULL, 
 	                        "l\xE2\x80\x99""av\xC3\xA8nement",};
 	const char **ip, **op;
-	const char *filtered;
+	const char *token;
 
 	test_begin("fts filter stopwords, French");
 	filter_class = fts_filter_find(STOPWORDS_FILTER_NAME);
@@ -137,12 +142,14 @@
 	ip = input;
 	op = output;
 	while (*ip != NULL) {
-		filtered = fts_filter_filter(filter, *ip);
-		if (filtered == NULL)
+		token = *ip;
+		ret = fts_filter_filter(filter, &token);
+		test_assert(ret >= 0);
+		if (ret == 0)
 			test_assert(*op == NULL);
 		else {
 			test_assert(*op != NULL);
-			test_assert(strcmp(*ip, filtered)  == 0);
+			test_assert(strcmp(*ip, token)  == 0);
 		}
 		op++;
 		ip++;
@@ -177,7 +184,7 @@
 	struct fts_filter *stemmer;
 	const char *error;
 	struct fts_language language = { .name = "EN" };
-	const char *base = NULL;
+	const char *token = NULL;
 	const char * const tokens[] = {
 		"dries" ,"friendlies", "All", "human", "beings", "are",
 		 "born", "free", "and", "equal", "in", "dignity", "and",
@@ -199,9 +206,10 @@
 	test_assert(ret == 0);
 	bpp = bases;
 	for (tpp=tokens; *tpp != NULL; tpp++) {
-		base = fts_filter_filter(stemmer, *tpp);
-		test_assert(base != NULL);
-		test_assert(null_strcmp(base, *bpp) == 0);
+		token = *tpp;
+		ret = fts_filter_filter(stemmer, &token);
+		test_assert(token != NULL);
+		test_assert(null_strcmp(token, *bpp) == 0);
 		bpp++;
 	}
 	fts_filter_unref(&stemmer);
@@ -216,7 +224,7 @@
 	struct fts_filter *stemmer;
 	const char *error;
 	struct fts_language language = { .name = "fRench" };
-	const char *base = NULL;
+	const char *token = NULL;
 	const char * const tokens[] = {
 		"Tous", "les", "\xC3\xAAtres", "humains", "naissent",
 		"libres", "et",	"\xC3\xA9gaux", "en", "dignit\xC3\xA9",
@@ -233,9 +241,10 @@
 	test_assert(ret == 0);
 	bpp = bases;
 	for (tpp=tokens; *tpp != NULL; tpp++) {
-		base = fts_filter_filter(stemmer, *tpp);
-		test_assert(base != NULL);
-		test_assert(null_strcmp(base, *bpp) == 0);
+		token = *tpp;
+		ret = fts_filter_filter(stemmer, &token);
+		test_assert(token != NULL);
+		test_assert(null_strcmp(token, *bpp) == 0);
 		bpp++;
 	}
 	fts_filter_unref(&stemmer);
@@ -251,7 +260,7 @@
 	struct fts_filter *filter;
 	const char *error;
 	struct fts_language language = { .name = "eN" };
-	const char *base = NULL;
+	const char *token = NULL;
 	const char * const tokens[] = {
 		"dries" ,"friendlies", "All", "human", "beings", "are",
 		 "born", "free", "and", "equal", "in", "dignity", "and",
@@ -279,12 +288,13 @@
 
 	bpp = bases;
 	for (tpp=tokens; *tpp != NULL; tpp++) {
-		base = fts_filter_filter(stemmer, *tpp);
-		if (base == NULL)
+		token = *tpp;
+		ret = fts_filter_filter(stemmer, &token);
+		if (ret == 0)
 			test_assert(*bpp == NULL);
 		else {
 			test_assert(*bpp != NULL);
-			test_assert(null_strcmp(*bpp, base)  == 0);
+			test_assert(null_strcmp(*bpp, token)  == 0);
 		}
 		bpp++;
 	}
@@ -322,7 +332,7 @@
 	const char * const settings[] =
 		{"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC", NULL};
 	const char *error = NULL;
-	const char *normalized = NULL;
+	const char *token = NULL;
 	unsigned int i;
 
 	test_begin("fts filter normalizer Swedish short text");
@@ -333,8 +343,9 @@
 		test_assert(ret == 0);
 		for (i = 0; i < N_ELEMENTS(input); i++) {
 			if (input[i] != NULL) {
-				test_assert_idx((normalized = fts_filter_filter(norm, input[i])) != NULL, i);
-				test_assert_idx(null_strcmp(normalized, expected_output[i]) == 0, i);
+				token = input[i];
+				test_assert_idx(fts_filter_filter(norm, &token) == 1, i);
+				test_assert_idx(null_strcmp(token, expected_output[i]) == 0, i);
 			}
 		}
 		fts_filter_unref(&norm);
@@ -366,7 +377,7 @@
 		"vem kan segla forutan vind?\naaooaa"
 	};
 	const char *error = NULL;
-	const char *normalized = NULL;
+	const char *token = NULL;
 	unsigned int i;
 
 	test_begin("fts filter normalizer Swedish short text using default ID");
@@ -377,8 +388,9 @@
 		test_assert(ret == 0);
 		for (i = 0; i < N_ELEMENTS(input); i++) {
 			if (input[i] != NULL) {
-				test_assert_idx((normalized = fts_filter_filter(norm, input[i])) != NULL, i);
-				test_assert_idx(null_strcmp(normalized, expected_output[i]) == 0, i);
+				token = input[i];
+				test_assert_idx(fts_filter_filter(norm, &token) == 1, i);
+				test_assert_idx(null_strcmp(token, expected_output[i]) == 0, i);
 			}
 		}
 		fts_filter_unref(&norm);
@@ -398,7 +410,7 @@
 		{"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove", NULL};
 	char buf[4096] = {0};
 	const char *error = NULL;
-	const char *normalized = NULL;
+	const char *tokens;
 	int ret;
 	unsigned char sha512_digest[SHA512_RESULTLEN];
 	struct sha512_ctx ctx;
@@ -424,11 +436,11 @@
 		test_assert(input != NULL);
 		sha512_init(&ctx);
 		while (NULL != fgets(buf, sizeof(buf), input)) {
-
-			if ((normalized = fts_filter_filter(norm, buf)) == NULL){
+			tokens = buf;
+			if (fts_filter_filter(norm, &tokens) != 1){
 				break;
 			}
-			sha512_loop(&ctx, normalized, strlen(normalized));
+			sha512_loop(&ctx, tokens, strlen(tokens));
 		}
 		fclose(input);
 		sha512_result(&ctx, sha512_digest);
@@ -470,7 +482,7 @@
 		//{"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC", NULL};
 		{"id", "Lower", NULL};
 	struct fts_language language = { .name = "En" };
-	const char *base = NULL;
+	const char *token = NULL;
 	const char * const tokens[] = {
 		"dries" ,"friendlies", "All", "human", "beings", "are",
 		"born", "free", "and", "equal", "in", "dignity", "and",
@@ -503,12 +515,13 @@
 
 	bpp = bases;
 	for (tpp = tokens; *tpp != NULL; tpp++) {
-		base = fts_filter_filter(stemmer, *tpp);
-		if (base == NULL)
+		token = *tpp;
+		ret = fts_filter_filter(stemmer, &token);
+		if (ret == 0)
 			test_assert(*bpp == NULL);
 		else {
 			test_assert(*bpp != NULL);
-			test_assert(strcasecmp(*bpp, base)  == 0);
+			test_assert(strcasecmp(*bpp, token)  == 0);
 		}
 		bpp++;
 	}
--- a/src/plugins/fts/fts-build-mail.c	Sat May 09 11:05:04 2015 +0300
+++ b/src/plugins/fts/fts-build-mail.c	Sat May 09 11:06:45 2015 +0300
@@ -249,9 +249,11 @@
 	tokenizer = fts_user_get_index_tokenizer(ctx->update_ctx->backend->ns->user);
 	while ((ret = fts_tokenizer_next(tokenizer, data, size, &token)) > 0) {
 		if (filter != NULL) {
-			token = fts_filter_filter(filter, token);
-			if (token == NULL)
+			ret = fts_filter_filter(filter, &token);
+			if (ret == 0)
 				continue;
+			if (ret < 0)
+				break;
 		}
 		if (fts_backend_update_build_more(ctx->update_ctx,
 						  (const void *)token,
--- a/src/plugins/fts/fts-search-args.c	Sat May 09 11:05:04 2015 +0300
+++ b/src/plugins/fts/fts-search-args.c	Sat May 09 11:06:45 2015 +0300
@@ -64,6 +64,7 @@
 	struct fts_user_language *const *langp;
 	ARRAY_TYPE(const_string) tokens;
 	const char *token2;
+	int ret;
 
 	t_array_init(&tokens, 4);
 	/* first add the word exactly as it without any tokenization */
@@ -73,9 +74,10 @@
 
 	/* add the word filtered */
 	array_foreach(languages, langp) {
-		token2 = (*langp)->filter == NULL ? token :
-			fts_filter_filter((*langp)->filter, token);
-		if (token2 != NULL) {
+		token2 = t_strdup(token);
+		if ((*langp)->filter != NULL)
+			ret = fts_filter_filter((*langp)->filter, &token2);
+		if (ret > 0) {
 			token2 = t_strdup(token2);
 			array_append(&tokens, &token2, 1);
 		}