changeset 19934:5d5b2fd1b95e

lib-fts: Limit maximum length of addresses found. The address tokenizer now takes a "maxlen" parameter, which defaults to 254 bytes. Previously an address, or anything resembling one, could be of any length. This could cause trouble in FTS backends.
author Teemu Huovila <teemu.huovila@dovecot.fi>
date Tue, 15 Mar 2016 10:48:31 +0200
parents 159b933b617d
children 64db1cafe6e9
files src/lib-fts/fts-tokenizer-address.c src/lib-fts/test-fts-tokenizer.c
diffstat 2 files changed, 33 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/src/lib-fts/fts-tokenizer-address.c	Tue Mar 15 10:47:20 2016 +0200
+++ b/src/lib-fts/fts-tokenizer-address.c	Tue Mar 15 10:48:31 2016 +0200
@@ -5,10 +5,13 @@
 #include "buffer.h"
 #include "rfc822-parser.h"
 #include "fts-tokenizer-private.h"
+#include "fts-tokenizer-common.h"
 
 #define IS_DTEXT(c) \
 	(rfc822_atext_chars[(int)(unsigned char)(c)] == 2)
 
+#define FTS_DEFAULT_ADDRESS_MAX_LENGTH 254
+
 enum email_address_parser_state {
 	EMAIL_ADDRESS_PARSER_STATE_NONE = 0,
 	EMAIL_ADDRESS_PARSER_STATE_LOCALPART,
@@ -21,6 +24,7 @@
 	enum email_address_parser_state state;
 	string_t *last_word;
 	string_t *parent_data; /* Copy of input data between tokens. */
+	unsigned int max_length;
 	bool search;
 };
 
@@ -31,13 +35,20 @@
 {
 	struct email_address_fts_tokenizer *tok;
 	bool search = FALSE;
+	unsigned int max_length = FTS_DEFAULT_ADDRESS_MAX_LENGTH;
 	unsigned int i;
 
 	for (i = 0; settings[i] != NULL; i += 2) {
-		const char *key = settings[i];
+		const char *key = settings[i], *value = settings[i+1];
 
 		if (strcmp(key, "search") == 0) {
 			search = TRUE;
+		} else if (strcmp(key, "maxlen") == 0) {
+			if (str_to_uint(value, &max_length) < 0 ||
+			    max_length == 0) {
+				*error_r = t_strdup_printf("Invalid maxlen setting: %s", value);
+				return -1;
+			}
 		} else {
 			*error_r = t_strdup_printf("Unknown setting: %s", key);
 			return -1;
@@ -48,6 +59,7 @@
 	tok->tokenizer = *fts_tokenizer_email_address;
 	tok->last_word = str_new(default_pool, 128);
 	tok->parent_data = str_new(default_pool, 128);
+	tok->max_length = max_length;
 	tok->search = search;
 	*tokenizer_r = &tok->tokenizer;
 	return 0;
@@ -69,7 +81,20 @@
 {
 	tok->tokenizer.skip_parents = TRUE;
 	tok->state = EMAIL_ADDRESS_PARSER_STATE_NONE;
-	*token_r = t_strdup(str_c(tok->last_word));
+	if (str_len(tok->last_word) > tok->max_length) {
+		str_truncate(tok->last_word, tok->max_length);
+		/* As future proofing, delete partial utf8.
+		   IS_DTEXT() does not actually allow utf8 addresses
+		   yet though. */
+		const unsigned char *data = tok->last_word->data;
+		size_t len = tok->last_word->used;
+		fts_tokenizer_delete_trailing_partial_char(data, &len);
+		i_assert(len <= tok->max_length);
+		*token_r = len == 0 ? "" :
+			t_strndup(tok->last_word->data, len);
+	} else {
+		*token_r = t_strdup(str_c(tok->last_word));
+	}
 }
 
 static bool
--- a/src/lib-fts/test-fts-tokenizer.c	Tue Mar 15 10:47:20 2016 +0200
+++ b/src/lib-fts/test-fts-tokenizer.c	Tue Mar 15 10:48:31 2016 +0200
@@ -12,7 +12,8 @@
 	"@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \
 	"Bar Baz <bar@example.org>" \
 	"Foo Bar (comment)foo.bar@host.example.org " \
-	"foo, foo@domain"
+	"foo, foo@domain" \
+	"abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.tld"
 
 static const char *test_inputs[] = {
 	/* generic things and word truncation: */
@@ -307,7 +308,8 @@
 	static const char input[] = TEST_INPUT_ADDRESS;
 	static const char *const expected_output[] = {
 		"abc.dfg@example.com", "bar@example.org",
-		"foo.bar@host.example.org", "foo@domain", NULL
+		"foo.bar@host.example.org", "foo@domain",
+		"abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu", NULL
 	};
 	struct fts_tokenizer *tok;
 	const char *error;
@@ -326,7 +328,7 @@
 		"invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com",
 		"Bar", "Baz", "bar", "example", "org", "bar@example.org",
 		"Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org",
-		"foo", "foo", "domain", "foo@domain", NULL
+		"foo", "foo", "domain", "foo@domain", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyzabcde",  "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz","tld", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",  NULL
 	};
 	struct fts_tokenizer *tok, *gen_tok;
 	const char *error;
@@ -358,7 +360,7 @@
 		"invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com",
 		"Bar", "Baz", "bar@example.org",
 		"Foo", "Bar", "comment", "foo.bar@host.example.org",
-		"foo", "foo@domain", NULL
+		"foo", "foo@domain", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu", NULL
 	};
 	static const char *const settings[] = { "search", "", NULL };
 	struct fts_tokenizer *tok, *gen_tok;