view src/lib-fts/fts-tokenizer-address.c @ 18566:b9f85e125639

lib-fts: Removed unnecessary code from fts-address-tokenizer. chars_after_at() was only used to check if local-part was empty, but it was checked at a state where it never could have been empty.
author Timo Sirainen <tss@iki.fi>
date Sat, 09 May 2015 12:32:46 +0300
parents 7fe766887394
children bcfe4c592427
line wrap: on
line source

/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */

#include "lib.h"
#include "str.h"
#include "buffer.h"
#include "fts-tokenizer-private.h"

#define FTS_DEFAULT_NO_PARENT FALSE
#define FTS_DEFAULT_SEARCH FALSE

enum email_address_parser_state {
	EMAIL_ADDRESS_PARSER_STATE_NONE = 0,
	EMAIL_ADDRESS_PARSER_STATE_LOCALPART,
	EMAIL_ADDRESS_PARSER_STATE_DOMAIN,
	EMAIL_ADDRESS_PARSER_STATE_COMPLETE
};

/* Per-instance state of the e-mail address tokenizer. */
struct email_address_fts_tokenizer {
	/* Generic tokenizer part. Kept as the first member because this file
	   casts between struct fts_tokenizer * and this struct. */
	struct fts_tokenizer tokenizer;
	/* Parser progress for the address being collected in last_word. */
	enum email_address_parser_state state;
	/* The (possibly still partial) address collected so far. */
	string_t *last_word;
	/* Copy of input data between tokens.
	   TODO: could be buffer_t maybe */
	string_t *parent_data;
	/* Set by the "no_parent" setting: when TRUE the raw input is not
	   buffered into parent_data and never returned as a token. */
	bool no_parent;
	/* Set by the "search" setting. Currently it only triggers a debug
	   message when parent data is returned. */
	bool search;
};

/*
   Extracted from core rfc822-parser.c

   atext        =       ALPHA / DIGIT / ; Any character except controls,
                        "!" / "#" /     ;  SP, and specials.
                        "$" / "%" /     ;  Used for atoms
                        "&" / "'" /
                        "*" / "+" /
                        "-" / "/" /
                        "=" / "?" /
                        "^" / "_" /
                        "`" / "{" /
                        "|" / "}" /
                        "~"

  MIME:

  token := 1*<any (US-ASCII) CHAR except SPACE, CTLs,
              or tspecials>
  tspecials :=  "(" / ")" / "<" / ">" / "@" /
                "," / ";" / ":" / "\" / <">
                "/" / "[" / "]" / "?" / "="

  So token is same as dot-atom, except stops also at '/', '?' and '='.
*/

/* Character class table indexed by byte value.
   atext chars are marked with 1, alpha and digits with 2,
   atext-but-mime-tspecials with 4. All bytes 128-255 are also marked
   with 2, i.e. they are treated like alphanumerics. Everything else
   (controls, SP, specials such as '@' and '.') is 0.

   NOTE(review): the table is writable and has external linkage; since it was
   extracted from core rfc822-parser.c, confirm that it cannot clash with a
   symbol of the same name defined there. */
unsigned char rfc822_atext_chars[256] = {
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0-15 */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16-31 */
	0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 4, /* 32-47 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 4, 0, 4, /* 48-63 */
	0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 64-79 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 1, 1, /* 80-95 */
	1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 96-111 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 0, /* 112-127 */

	/* 128-255: every non-ASCII byte is class 2 */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};

/* TRUE for any byte with a non-zero class in rfc822_atext_chars, i.e. atext
   (including the MIME tspecials '/', '=' and '?') and all bytes >= 128.
   The argument is converted to unsigned char first, so a plain (possibly
   signed) char can be passed. */
#define IS_ATEXT(c) \
	(rfc822_atext_chars[(int)(unsigned char)(c)] != 0)
/* TRUE only for bytes of class 2: ASCII letters and digits and all bytes
   >= 128. Used for the characters accepted in the domain part. */
#define IS_DTEXT(c) \
	(rfc822_atext_chars[(int)(unsigned char)(c)] == 2)


/* Creates a new e-mail address tokenizer.

   settings is a NULL-terminated array that is walked two entries at a time;
   only the first entry of each pair (the key) is looked at. Presumably the
   second entry is the key's value - verify against the caller. Recognized
   keys are "no_parent" and "search"; both simply switch the option on.

   Returns 0 and stores the tokenizer in *tokenizer_r on success. On an
   unknown key returns -1 with a message in *error_r; nothing has been
   allocated at that point. */
static int
fts_tokenizer_email_address_create(const char *const *settings,
				   struct fts_tokenizer **tokenizer_r,
				   const char **error_r)
{
	struct email_address_fts_tokenizer *addr_tok;
	bool want_no_parent = FTS_DEFAULT_NO_PARENT;
	bool want_search = FTS_DEFAULT_SEARCH;
	unsigned int idx;

	/* validate all settings before allocating anything */
	for (idx = 0; settings[idx] != NULL; idx += 2) {
		const char *name = settings[idx];

		if (strcmp(name, "no_parent") == 0) {
			want_no_parent = TRUE;
			continue;
		}
		if (strcmp(name, "search") == 0) {
			want_search = TRUE;
			continue;
		}
		*error_r = t_strdup_printf("Unknown setting: %s", name);
		return -1;
	}

	addr_tok = i_new(struct email_address_fts_tokenizer, 1);
	/* start from the tokenizer template (name and vfuncs) */
	addr_tok->tokenizer = *fts_tokenizer_email_address;
	addr_tok->no_parent = want_no_parent;
	addr_tok->search = want_search;
	addr_tok->last_word = str_new(default_pool, 128);
	addr_tok->parent_data = str_new(default_pool, 128);

	*tokenizer_r = &addr_tok->tokenizer;
	return 0;
}

/* Frees a tokenizer created by fts_tokenizer_email_address_create(): both
   string buffers first, then the tokenizer struct itself. */
static void fts_tokenizer_email_address_destroy(struct fts_tokenizer *_tok)
{
	struct email_address_fts_tokenizer *addr_tok =
		(struct email_address_fts_tokenizer *)_tok;

	str_free(&addr_tok->parent_data);
	str_free(&addr_tok->last_word);
	i_free(addr_tok);
}

/* Hands out the address collected in last_word as a token.

   A copy of last_word is stored in *token_r, the parser is reset to the
   NONE state and skip_parents is raised on the generic tokenizer (it is
   lowered again at the start of the next call to the next() vfunc).
   Always returns 1. */
static int
fts_tokenizer_address_current_token(struct email_address_fts_tokenizer *tok,
                                    const char **token_r)
{
	const char *address = t_strdup(str_c(tok->last_word));

	tok->state = EMAIL_ADDRESS_PARSER_STATE_NONE;
	tok->tokenizer.skip_parents = TRUE;

	*token_r = address;
	return 1;
}

/* Hands out the raw input buffered for the parent tokenizer.

   A copy of parent_data is stored in *token_r and the buffer is emptied.
   The parser state is left untouched. Always returns 1. */
static int
fts_tokenizer_address_parent_data(struct email_address_fts_tokenizer *tok,
                                  const char **token_r)
{
	const char *buffered;

	/* TODO: search option removes address from data here. */
	if (tok->search && tok->state >= EMAIL_ADDRESS_PARSER_STATE_DOMAIN)
		i_debug("Would remove current token");

	/* copy first, the buffer is reused for the following input */
	buffered = t_strdup(str_c(tok->parent_data));
	str_truncate(tok->parent_data, 0);

	*token_r = buffered;
	return 1;
}

/* Counts the leading bytes of data that cannot start a local-part, i.e. the
   bytes that are neither atext nor '.'. Returns how many bytes can be
   skipped (0..size). */
static size_t skip_nonlocal_part(const unsigned char *data, size_t size)
{
	size_t i;

	for (i = 0; i < size; i++) {
		/* Yes, a dot can start an address. De facto before de jure. */
		if (IS_ATEXT(data[i]) || data[i] == '.')
			break;
	}
	return i;
}

/* TODO: 
   - DONT dereference *p past size!
*/
static enum email_address_parser_state
fts_tokenizer_email_address_parse_local(struct email_address_fts_tokenizer *tok,
                                        const unsigned char *data, size_t size,
                                        size_t *skip_r)
{
	size_t pos = 0;
	const unsigned char *p = data;
	bool at = FALSE;

	while (pos < size && (IS_ATEXT(*p) || (*p == '@' || *p == '.'))) {
		if (*p == '@')
			at = TRUE;
		pos++;
		p++;
		if (at)
			break;
	}
	 /* localpart and @ */
	if (at && (pos > 1 || str_len(tok->last_word) > 0)) {
		str_append_n(tok->last_word, data, pos);
		*skip_r = pos;
		return EMAIL_ADDRESS_PARSER_STATE_DOMAIN;
	}

	/* localpart, @ not included yet */
	if (pos > 0 && (IS_ATEXT(*(p-1)) || *(p-1) == '.')) {
		str_append_n(tok->last_word, data, pos);
		*skip_r = pos;
		return  EMAIL_ADDRESS_PARSER_STATE_LOCALPART;
	}
	/* not a localpart. skip past rest of no-good chars. */
	pos += skip_nonlocal_part(p, size - pos);
	*skip_r = pos;
	return EMAIL_ADDRESS_PARSER_STATE_NONE;
}

/* Parses (the continuation of) the domain of an address from the beginning
   of data. last_word already holds "local-part@" and maybe the beginning of
   the domain.

   Domain characters are ASCII letters and digits, bytes >= 128, '.' and '-'.
   The hyphen is accepted explicitly because it is a valid hostname
   character, although the character table does not class it as
   alphanumeric; without it "user@my-host.tld" would be cut at the '-'.

   The consumed bytes are appended to last_word and *skip_r is set to their
   count. Returns
   - COMPLETE: a non-domain byte follows in data, so the domain has ended.
   - DOMAIN: all of data was consumed; the domain may continue in the next
     chunk of input.

   TODO:
   - allow address literals
   - reject "@..."
   - reject "@.host.tld"
   - reject an empty domain: "user@" followed by a non-domain byte is
     currently reported as COMPLETE
*/
static enum email_address_parser_state
fts_tokenizer_email_address_parse_domain(struct email_address_fts_tokenizer *tok,
                                         const unsigned char *data, size_t size,
                                         size_t *skip_r)
{
	size_t pos = 0;

	while (pos < size &&
	       (IS_DTEXT(data[pos]) || data[pos] == '.' || data[pos] == '-'))
		pos++;

	str_append_n(tok->last_word, data, pos);
	*skip_r = pos;

	if (pos < size) {
		/* a terminating byte was seen: a complete domain name */
		return EMAIL_ADDRESS_PARSER_STATE_COMPLETE;
	}
	/* All good, but possibly not complete. */
	return EMAIL_ADDRESS_PARSER_STATE_DOMAIN;
}

/* Buffers size bytes of raw input for the parent tokenizer. Does nothing
   when the tokenizer was created with the no_parent setting. */
static void
fts_tokenizer_address_update_parent(struct email_address_fts_tokenizer *tok,
                                    const unsigned char *data, size_t size)
{
	if (tok->no_parent)
		return;

	str_append_n(tok->parent_data, data, size);
}
/* The next() vfunc: feeds data to the address parser.

   Returns 1 when a token was stored in *token_r, 0 when all of data was
   consumed without producing a token. *skip_r is set to the number of bytes
   of data that were consumed on every return path, including the
   end-of-input returns.

   Unless no_parent is set, the raw input is buffered as well and returned
   as a token of its own before the address: when an address completes, this
   call returns the buffered parent data and the following call returns the
   address itself.

   size == 0 signals the end of input: lingering parent data is flushed
   first, then an address whose domain was not yet terminated. */
static int
fts_tokenizer_email_address_next(struct fts_tokenizer *_tok,
                                 const unsigned char *data, size_t size,
                                 size_t *skip_r, const char **token_r)
{
	struct email_address_fts_tokenizer *tok =
		(struct email_address_fts_tokenizer *)_tok;
	size_t pos = 0, local_skip;

	/* Nothing consumed yet. Set here so that the early returns below,
	   in particular the end-of-input ones, never leave *skip_r unset. */
	*skip_r = 0;

	if (tok->tokenizer.skip_parents == TRUE)
		tok->tokenizer.skip_parents = FALSE;

	if (tok->state == EMAIL_ADDRESS_PARSER_STATE_COMPLETE) {
		/* the previous call returned the parent data of a completed
		   address; now return the address itself */
		return fts_tokenizer_address_current_token(tok, token_r);
	}

	/* end of data, output lingering tokens. first the parents data, then
	   possibly our token, if complete enough */
	if (size == 0) {
		if (!tok->no_parent && str_len(tok->parent_data) > 0)
			return fts_tokenizer_address_parent_data(tok, token_r);

		if (tok->state == EMAIL_ADDRESS_PARSER_STATE_DOMAIN)
			return fts_tokenizer_address_current_token(tok, token_r);
	}

	/* 1) regular input data OR
	   2) circle around to return completed address */
	while(pos < size || tok->state == EMAIL_ADDRESS_PARSER_STATE_COMPLETE) {

		switch (tok->state) {
		case EMAIL_ADDRESS_PARSER_STATE_NONE:
			/* no part of address found yet. remove possible
			   earlier data */
			str_truncate(tok->last_word, 0);

			/* fall through */
		case EMAIL_ADDRESS_PARSER_STATE_LOCALPART:
			/* last_word is empty or has the beginnings of a valid
			   local-part, but no '@' found yet. continue parsing
			   the beginning of data to see if it contains a full
			   local-part@ */
			tok->state =
				fts_tokenizer_email_address_parse_local(tok,
				                                        data + pos,
				                                        size - pos,
				                                        &local_skip);
			fts_tokenizer_address_update_parent(tok, data+pos,
			                                    local_skip);
			pos += local_skip;

			break;
		case EMAIL_ADDRESS_PARSER_STATE_DOMAIN:
			/* last_word has a local-part@ and maybe the beginning
			   of a domain. continue parsing the beginning of data
			   to see if it contains a valid domain. */

			tok->state =
				fts_tokenizer_email_address_parse_domain(tok,
				                                        data + pos,
				                                        size - pos,
				                                        &local_skip);
			fts_tokenizer_address_update_parent(tok, data+pos,
			                                    local_skip);
			pos += local_skip;

			break;
		case EMAIL_ADDRESS_PARSER_STATE_COMPLETE:
			/* skip tailing non-atext */
			local_skip = skip_nonlocal_part(data+pos, size - pos);
			*skip_r = pos + local_skip;
			fts_tokenizer_address_update_parent(tok, data+pos,
			                                    local_skip);
			/* with a parent, its data goes out first; the state
			   stays COMPLETE so the next call returns the
			   address */
			if (!tok->no_parent)
				return fts_tokenizer_address_parent_data(tok, token_r);
			else {
				return fts_tokenizer_address_current_token(tok, token_r);
			}
		default:
			i_unreached();
		}

	}
	/* all input consumed, no token ready yet */
	*skip_r = pos;
	return 0;
}

/* Virtual function table of the address tokenizer. The initializer is
   positional: create, destroy and next, in that order. */
static const struct fts_tokenizer_vfuncs email_address_tokenizer_vfuncs = {
	fts_tokenizer_email_address_create,
	fts_tokenizer_email_address_destroy,
	fts_tokenizer_email_address_next
};

/* Tokenizer template: its name plus the vfuncs above. create() copies it
   into every new tokenizer instance. */
static const struct fts_tokenizer fts_tokenizer_email_address_real = {
	.name = FTS_TOKENIZER_EMAIL_ADDRESS_NAME,
	.v = &email_address_tokenizer_vfuncs
};
/* Exported handle to the template. */
const struct fts_tokenizer *fts_tokenizer_email_address =
	&fts_tokenizer_email_address_real;