Mercurial > dovecot > core-2.2
changeset 19934:5d5b2fd1b95e
lib-fts: Limit maximum length of addresses found.
The address tokenizer now takes a "maxlen" parameter, which
defaults to 254 bytes.
Previously, addresses (or anything resembling one) could
be of any length. This could cause trouble in FTS backends.
author | Teemu Huovila <teemu.huovila@dovecot.fi> |
---|---|
date | Tue, 15 Mar 2016 10:48:31 +0200 |
parents | 159b933b617d |
children | 64db1cafe6e9 |
files | src/lib-fts/fts-tokenizer-address.c src/lib-fts/test-fts-tokenizer.c |
diffstat | 2 files changed, 33 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
--- a/src/lib-fts/fts-tokenizer-address.c Tue Mar 15 10:47:20 2016 +0200 +++ b/src/lib-fts/fts-tokenizer-address.c Tue Mar 15 10:48:31 2016 +0200 @@ -5,10 +5,13 @@ #include "buffer.h" #include "rfc822-parser.h" #include "fts-tokenizer-private.h" +#include "fts-tokenizer-common.h" #define IS_DTEXT(c) \ (rfc822_atext_chars[(int)(unsigned char)(c)] == 2) +#define FTS_DEFAULT_ADDRESS_MAX_LENGTH 254 + enum email_address_parser_state { EMAIL_ADDRESS_PARSER_STATE_NONE = 0, EMAIL_ADDRESS_PARSER_STATE_LOCALPART, @@ -21,6 +24,7 @@ enum email_address_parser_state state; string_t *last_word; string_t *parent_data; /* Copy of input data between tokens. */ + unsigned int max_length; bool search; }; @@ -31,13 +35,20 @@ { struct email_address_fts_tokenizer *tok; bool search = FALSE; + unsigned int max_length = FTS_DEFAULT_ADDRESS_MAX_LENGTH; unsigned int i; for (i = 0; settings[i] != NULL; i += 2) { - const char *key = settings[i]; + const char *key = settings[i], *value = settings[i+1]; if (strcmp(key, "search") == 0) { search = TRUE; + } else if (strcmp(key, "maxlen") == 0) { + if (str_to_uint(value, &max_length) < 0 || + max_length == 0) { + *error_r = t_strdup_printf("Invalid maxlen setting: %s", value); + return -1; + } } else { *error_r = t_strdup_printf("Unknown setting: %s", key); return -1; @@ -48,6 +59,7 @@ tok->tokenizer = *fts_tokenizer_email_address; tok->last_word = str_new(default_pool, 128); tok->parent_data = str_new(default_pool, 128); + tok->max_length = max_length; tok->search = search; *tokenizer_r = &tok->tokenizer; return 0; @@ -69,7 +81,20 @@ { tok->tokenizer.skip_parents = TRUE; tok->state = EMAIL_ADDRESS_PARSER_STATE_NONE; - *token_r = t_strdup(str_c(tok->last_word)); + if (str_len(tok->last_word) > tok->max_length) { + str_truncate(tok->last_word, tok->max_length); + /* As future proofing, delete partial utf8. + IS_DTEXT() does not actually allow utf8 addresses + yet though. 
*/ + const unsigned char *data = tok->last_word->data; + size_t len = tok->last_word->used; + fts_tokenizer_delete_trailing_partial_char(data, &len); + i_assert(len <= tok->max_length); + *token_r = len == 0 ? "" : + t_strndup(tok->last_word->data, len); + } else { + *token_r = t_strdup(str_c(tok->last_word)); + } } static bool
--- a/src/lib-fts/test-fts-tokenizer.c Tue Mar 15 10:47:20 2016 +0200 +++ b/src/lib-fts/test-fts-tokenizer.c Tue Mar 15 10:48:31 2016 +0200 @@ -12,7 +12,8 @@ "@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \ "Bar Baz <bar@example.org>" \ "Foo Bar (comment)foo.bar@host.example.org " \ - "foo, foo@domain" + "foo, foo@domain" \ + "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.tld" static const char *test_inputs[] = { /* generic things and word truncation: */ @@ -307,7 +308,8 @@ static const char input[] = TEST_INPUT_ADDRESS; static const char *const expected_output[] = { "abc.dfg@example.com", "bar@example.org", - "foo.bar@host.example.org", "foo@domain", NULL + "foo.bar@host.example.org", "foo@domain", + "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu", NULL }; struct fts_tokenizer *tok; const char *error; @@ -326,7 +328,7 @@ "invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com", "Bar", "Baz", "bar", "example", "org", "bar@example.org", "Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org", - "foo", "foo", "domain", "foo@domain", NULL + "foo", "foo", "domain", "foo@domain", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyzabcde", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", 
"abcdefghijklmnopqrstuvxyz","tld", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu", NULL }; struct fts_tokenizer *tok, *gen_tok; const char *error; @@ -358,7 +360,7 @@ "invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com", "Bar", "Baz", "bar@example.org", "Foo", "Bar", "comment", "foo.bar@host.example.org", - "foo", "foo@domain", NULL + "foo", "foo@domain", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu", NULL }; static const char *const settings[] = { "search", "", NULL }; struct fts_tokenizer *tok, *gen_tok;