view src/lib-fts/fts-tokenizer-generic.c @ 18576:3bcd04bf5635

lib-fts: Use case-sensitive settings comparisons in fts-tokenizer Dovecot in general doesn't allow case-insensitive settings.
author Timo Sirainen <tss@iki.fi>
date Sat, 09 May 2015 13:20:29 +0300
parents b0a934361563
children eff53aa9cb58
line wrap: on
line source

/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */

#include "lib.h"
#include "buffer.h"
#include "unichar.h"
#include "bsearch-insert-pos.h"
#include "fts-tokenizer-private.h"
#include "fts-tokenizer-generic-private.h"
#include "word-boundary-data.c"
#include "word-break-data.c"

#define FTS_DEFAULT_TOKEN_MAX_LENGTH 30

static unsigned char fts_ascii_word_boundaries[128] = {
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0-15 */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 16-31 */

	1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, /* 32-47:  !"#$%&()*+,-./ */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, /* 48-63: :;<=>? */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 64-79: @ */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, /* 80-95: [\]^ */
	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 96-111: ` */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0  /* 112-127: {|}~ */
};

static int
fts_tokenizer_generic_create(const char *const *settings,
			     struct fts_tokenizer **tokenizer_r,
			     const char **error_r)
{
	struct generic_fts_tokenizer *tok;
	unsigned int max_length = FTS_DEFAULT_TOKEN_MAX_LENGTH;
	enum boundary_algorithm algo = BOUNDARY_ALGORITHM_SIMPLE;
	unsigned int i;

	for (i = 0; settings[i] != NULL; i += 2) {
		const char *key = settings[i], *value = settings[i+1];

		if (strcmp(key, "maxlen") == 0) {
			if (str_to_uint(value, &max_length) < 0 ||
			    max_length == 0) {
				*error_r = t_strdup_printf(
					"Invalid maxlen setting: %s", value);
				return -1;
			}
		} else if (strcmp(key, "algorithm") == 0) {
			if (strcmp(value, ALGORITHM_TR29_NAME) == 0)
				algo = BOUNDARY_ALGORITHM_TR29;
			else if (strcmp(value, ALGORITHM_SIMPLE_NAME) == 0)
				;
			else {
				*error_r = t_strdup_printf(
				        "Invalid algorithm: %s", value);
				return -1;
			}
		} else {
			*error_r = t_strdup_printf("Unknown setting: %s", key);
			return -1;
		}
	}

	tok = i_new(struct generic_fts_tokenizer, 1);
	if (algo == BOUNDARY_ALGORITHM_TR29)
		tok->tokenizer.v = &generic_tokenizer_vfuncs_tr29;
	else
		tok->tokenizer.v = &generic_tokenizer_vfuncs_simple;
	tok->max_length = max_length;
	tok->algorithm = algo;
	tok->token = buffer_create_dynamic(default_pool, 64);

	*tokenizer_r = &tok->tokenizer;
	return 0;
}

static void
fts_tokenizer_generic_destroy(struct fts_tokenizer *_tok)
{
	struct generic_fts_tokenizer *tok =
		(struct generic_fts_tokenizer *)_tok;

	buffer_free(&tok->token);
	i_free(tok);
}

static int
fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
                                           const char **token_r)
{
	*token_r = t_strndup(tok->token->data, I_MIN(tok->token->used, tok->max_length));
	buffer_set_used_size(tok->token, 0);
	return 1;
}

/* TODO: This is duplicated from unichar.c */
static bool uint32_find(const uint32_t *data, unsigned int count,
			uint32_t value, unsigned int *idx_r)
{
	BINARY_NUMBER_SEARCH(data, count, value, idx_r);
}

static bool is_word_break(unichar_t c)
{
	unsigned int idx;

	/* Unicode General Punctuation, including deprecated characters. */
	if (c >= 0x2000 && c <= 0x206f)
		return TRUE;

	/* From word-break-data.c, which is generated from PropList.txt. */
	if (uint32_find(White_Space, N_ELEMENTS(White_Space), c, &idx))
		return TRUE;
	if (uint32_find(Dash, N_ELEMENTS(Dash), c, &idx))
		return TRUE;
	if (uint32_find(Terminal_Punctuation, N_ELEMENTS(Terminal_Punctuation), c, &idx))
		return TRUE;
	if (uint32_find(STerm, N_ELEMENTS(STerm), c, &idx))
		return TRUE;
	if (uint32_find(Pattern_White_Space, N_ELEMENTS(Pattern_White_Space), c, &idx))
		return TRUE;
	return FALSE;
}

static bool
data_is_word_boundary(const unsigned char *data, size_t size, size_t *i)
{
	unichar_t c;

	if (data[*i] < 0x80)
		return fts_ascii_word_boundaries[data[*i]] != 0;
	/* unicode punctuation? */
	if (uni_utf8_get_char_n(data + *i, size - *i, &c) <= 0)
		i_unreached();
	*i += uni_utf8_char_bytes(data[*i]) - 1;
	return is_word_break(c);
}

static int
fts_tokenizer_generic_next_simple(struct fts_tokenizer *_tok,
                                  const unsigned char *data, size_t size,
                                  size_t *skip_r, const char **token_r)
{
	struct generic_fts_tokenizer *tok =
		(struct generic_fts_tokenizer *)_tok;
	size_t i, char_start_i, len, start = 0;

	for (i = 0; i < size; i++) {
		char_start_i = i;
		if (data_is_word_boundary(data, size, &i)) {
			len = char_start_i - start;
			buffer_append(tok->token, data + start, len);
			if (tok->token->used == 0) {
				/* no text read yet */
				start = i + 1;
				continue;
			}
			/* word boundary found - return a new token */
			*skip_r = i + 1;
			return fts_tokenizer_generic_simple_current_token(tok, token_r);
		}
	}
	/* word boundary not found yet */
	len = i - start;
	buffer_append(tok->token, data + start, len);
	*skip_r = i;

	/* return the last token */
	if (size == 0 && tok->token->used > 0)
		return fts_tokenizer_generic_simple_current_token(tok, token_r);

	/* token too long */
	if (tok->token->used > tok->max_length)
		return fts_tokenizer_generic_simple_current_token(tok, token_r);
	return 0;
}

/* TODO: Arrange array searches roughly in order of likelyhood of a match.
   TODO: Make some array of the arrays, so this can be a foreach loop.
   TODO: Check for Hangul.
   TODO: Add Hyphens U+002D HYPHEN-MINUS, U+2010 HYPHEN, possibly also
   U+058A ( ֊ ) ARMENIAN HYPHEN, and U+30A0 KATAKANA-HIRAGANA DOUBLE
   HYPHEN.
   TODO
*/
static enum letter_type letter_type(unichar_t c)
{
	unsigned int idx;

	if (uint32_find(CR, N_ELEMENTS(CR), c, &idx))
		return LETTER_TYPE_CR;
	if (uint32_find(LF, N_ELEMENTS(LF), c, &idx))
		return LETTER_TYPE_LF;
	if (uint32_find(Newline, N_ELEMENTS(Newline), c, &idx))
		return LETTER_TYPE_NEWLINE;
	if (uint32_find(Extend, N_ELEMENTS(Extend), c, &idx))
		return LETTER_TYPE_EXTEND;
	if (uint32_find(Regional_Indicator, N_ELEMENTS(Regional_Indicator), c, &idx))
		return LETTER_TYPE_REGIONAL_INDICATOR;
	if (uint32_find(Format, N_ELEMENTS(Format), c, &idx))
		return LETTER_TYPE_FORMAT;
	if (uint32_find(Katakana, N_ELEMENTS(Katakana), c, &idx))
		return LETTER_TYPE_KATAKANA;
	if (uint32_find(Hebrew_Letter, N_ELEMENTS(Hebrew_Letter), c, &idx))
		return LETTER_TYPE_HEBREW_LETTER;
	if (uint32_find(ALetter, N_ELEMENTS(ALetter), c, &idx))
		return LETTER_TYPE_ALETTER;
	if (uint32_find(Single_Quote, N_ELEMENTS(Single_Quote), c, &idx))
		return LETTER_TYPE_SINGLE_QUOTE;
	if (uint32_find(Double_Quote, N_ELEMENTS(Double_Quote), c, &idx))
		return LETTER_TYPE_DOUBLE_QUOTE;
	if (uint32_find(MidNumLet, N_ELEMENTS(MidNumLet), c, &idx))
		return LETTER_TYPE_MIDNUMLET;
	if (uint32_find(MidLetter, N_ELEMENTS(MidLetter), c, &idx))
		return LETTER_TYPE_MIDLETTER;
	if (uint32_find(MidNum, N_ELEMENTS(MidNum), c, &idx))
		return LETTER_TYPE_MIDNUM;
	if (uint32_find(Numeric, N_ELEMENTS(Numeric), c, &idx))
		return LETTER_TYPE_NUMERIC;
	if (uint32_find(ExtendNumLet, N_ELEMENTS(ExtendNumLet), c, &idx))
		return LETTER_TYPE_EXTENDNUMLET;
	return LETTER_TYPE_OTHER;
}

static bool letter_panic(struct generic_fts_tokenizer *tok ATTR_UNUSED)
{
	i_panic("Letter type should not be used.");
}

/* WB3, WB3a and WB3b, but really different since we try to eat
   whitespace between words. */
static bool letter_cr_lf_newline(struct generic_fts_tokenizer *tok ATTR_UNUSED)
{
	return TRUE;
}

static bool letter_extend_format(struct generic_fts_tokenizer *tok ATTR_UNUSED)
{
	/* WB4 */
	return FALSE;
}

static bool letter_regional_indicator(struct generic_fts_tokenizer *tok)
{
	/* WB13c */
	if (tok->prev_letter == LETTER_TYPE_REGIONAL_INDICATOR)
		return FALSE;

	return TRUE; /* Any / Any */
}

static bool letter_katakana(struct generic_fts_tokenizer *tok)
{
	/* WB13 */
	if (tok->prev_letter == LETTER_TYPE_KATAKANA)
		return FALSE;

	/* WB13b */
	if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
		return FALSE;

	return TRUE; /* Any / Any */
}

static bool letter_hebrew(struct generic_fts_tokenizer *tok)
{
	/* WB5 */
	if (tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
		return FALSE;

	/* WB7 WB7c */
	if (tok->prev_prev_letter == LETTER_TYPE_HEBREW_LETTER &&
	    (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE ||
	     tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
	     tok->prev_letter == LETTER_TYPE_MIDLETTER ||
	     tok->prev_letter == LETTER_TYPE_DOUBLE_QUOTE))
		return FALSE;

	/* WB10 */
	if (tok->prev_letter == LETTER_TYPE_NUMERIC)
		return FALSE;

	/* WB13b */
	if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
		return FALSE;

	return TRUE; /* Any / Any */
}

static bool letter_aletter(struct generic_fts_tokenizer *tok)
{
	/* WB5 */
	if (tok->prev_letter == LETTER_TYPE_ALETTER)
		return FALSE;

	/* WB7 */
	if (tok->prev_prev_letter == LETTER_TYPE_ALETTER &&
	    (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE ||
	     tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
	     tok->prev_letter == LETTER_TYPE_MIDLETTER))
		return FALSE;

	/* WB10 */
	if (tok->prev_letter == LETTER_TYPE_NUMERIC)
		return FALSE;

	/* WB13b */
	if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
		return FALSE;


	return TRUE; /* Any / Any */
}

static bool letter_single_quote(struct generic_fts_tokenizer *tok)
{
	/* WB6 */
	if (tok->prev_letter == LETTER_TYPE_ALETTER ||
	    tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
		return FALSE;

	/* WB12 */
	if (tok->prev_letter == LETTER_TYPE_NUMERIC)
		return FALSE;

	return TRUE; /* Any / Any */
}

static bool letter_double_quote(struct generic_fts_tokenizer *tok)
{

	if (tok->prev_letter == LETTER_TYPE_DOUBLE_QUOTE)
		return FALSE;

	return TRUE; /* Any / Any */
}

static bool letter_midnumlet(struct generic_fts_tokenizer *tok)
{
	/* WB6 */
	if (tok->prev_letter == LETTER_TYPE_ALETTER ||
	    tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
		return FALSE;

	/* WB12 */
	if (tok->prev_letter == LETTER_TYPE_NUMERIC)
		return FALSE;

	return TRUE; /* Any / Any */
}

static bool letter_midletter(struct generic_fts_tokenizer *tok)
{
	/* WB6 */
	if (tok->prev_letter == LETTER_TYPE_ALETTER ||
	    tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
		return FALSE;

	return TRUE; /* Any / Any */
}

static bool letter_midnum(struct generic_fts_tokenizer *tok)
{
	/* WB12 */
	if (tok->prev_letter == LETTER_TYPE_NUMERIC)
		return FALSE;

	return TRUE; /* Any / Any */
}

static bool letter_numeric(struct generic_fts_tokenizer *tok)
{
	/* WB8 */
	if (tok->prev_letter == LETTER_TYPE_NUMERIC)
		return FALSE;

	/* WB9 */
	if (tok->prev_letter == LETTER_TYPE_ALETTER ||
	    tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
		return FALSE;

	/* WB11 */
	if(tok->prev_prev_letter == LETTER_TYPE_NUMERIC &&
	   (tok->prev_letter == LETTER_TYPE_MIDNUM ||
	    tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
	    tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE))
		return FALSE;

	/* WB13b */
	if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
		return FALSE;

	return TRUE; /* Any / Any */
}

static bool letter_extendnumlet(struct generic_fts_tokenizer *tok)
{

	/* WB13a */
	if (tok->prev_letter == LETTER_TYPE_ALETTER ||
	    tok->prev_letter == LETTER_TYPE_HEBREW_LETTER ||
	    tok->prev_letter == LETTER_TYPE_NUMERIC ||
	    tok->prev_letter == LETTER_TYPE_KATAKANA ||
	    tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
		return FALSE;

	return TRUE; /* Any / Any */
}

static bool letter_other(struct generic_fts_tokenizer *tok ATTR_UNUSED)

{
	return TRUE; /* Any / Any */
}

static void
add_prev_letter(struct generic_fts_tokenizer *tok, enum letter_type lt)
{
	if(tok->prev_letter != LETTER_TYPE_NONE) {
		tok->prev_prev_letter = tok->prev_letter;
		tok->prev_letter = lt;
	} else
		tok->prev_letter = lt;
}

/*
   TODO: Define what to skip between words.
   TODO: Include double quotation marks? Messes up parsing?
   TODO: Does this "reverse approach" include too much in "whitespace"?
   TODO: Possibly use is_word_break()?
 */
static bool is_nonword(enum letter_type lt)
{

	if (lt == LETTER_TYPE_REGIONAL_INDICATOR || lt == LETTER_TYPE_KATAKANA ||
	    lt == LETTER_TYPE_HEBREW_LETTER || lt == LETTER_TYPE_ALETTER ||
	    lt == LETTER_TYPE_SINGLE_QUOTE || lt == LETTER_TYPE_NUMERIC)
		return FALSE; /* TODO: Include LETTER_TYPE_DOUBLE_QUOTE? */

	return TRUE;
}

/* The way things are done WB6/7 and WB11/12 "false positives" can
   leave trailing unwanted chars. They are searched for here. This is
   very kludgy and should be coded into the rules themselves
   somehow.
*/
static bool is_one_past_end(struct generic_fts_tokenizer *tok)
{

	/* Short circuit for simple algorithm. */
	if (tok->prev_letter == LETTER_TYPE_NONE)
		return FALSE;

	/* WB6/7 false positive detected at one past end. */
	if (tok->prev_letter == LETTER_TYPE_MIDLETTER ||
	    tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
	    tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE )
		return TRUE;

	/* WB12/12 false positive detected at one past end. */
	if (tok->prev_letter == LETTER_TYPE_MIDNUM ||
	    tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
	    tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE)
		return TRUE;

	return FALSE;
}
static int
fts_tokenizer_generic_tr29_current_token(struct generic_fts_tokenizer *tok,
                                         const char **token_r)
{
	size_t end_skip = 0;
	ssize_t len;

	if (is_one_past_end(tok))
		end_skip = tok->last_size;

	len = I_MIN(tok->token->used, tok->max_length) - end_skip;
	i_assert(len > 0);
	*token_r = t_strndup(tok->token->data, len);
	buffer_set_used_size(tok->token, 0);
	tok->prev_prev_letter = LETTER_TYPE_NONE;
	tok->prev_letter = LETTER_TYPE_NONE;
	return 1;
}
/*
  Find word boundaries in input text. Based on Unicode standard annex
  #29, but tailored for FTS purposes.
  http://www.unicode.org/reports/tr29/

  Adaptions: No word boundary at Start-Of-Text or End-of-Text (Wb1 and
  WB2). Break just once, not before and after.  Other things also, not
  really pure tr29. Meant to assist in finding individual words.

  TODO: If this letter_fns based approach is too kludgy, do a FSM with function
  pointers and transition tables.

  TODO: Alternative idea: Replace everything with a super simplistic
  "lt != ALETTER, HEBREW, NUMERIC, ... --> word break"

  TODO: Rules get split up over several functions. Is it too
  confusing?
*/
static bool
uni_found_word_boundary(struct generic_fts_tokenizer *tok, enum letter_type lt)
{
	/* No rule knows what to do with just one char, except the linebreaks
	   we eat away (above) anyway. */
	if (tok->prev_letter == LETTER_TYPE_NONE)
		goto false_out;

	if (letter_fns[lt].fn(tok))
		return TRUE;

 false_out:
	/* Extend and format types are ignored. */
	if (lt == LETTER_TYPE_EXTEND || lt == LETTER_TYPE_FORMAT)
		return FALSE;
	add_prev_letter(tok,lt);
	return FALSE;
}

static int
fts_tokenizer_generic_next_tr29(struct fts_tokenizer *_tok,
			   const unsigned char *data, size_t size,
                                size_t *skip_r, const char **token_r)
{
	struct generic_fts_tokenizer *tok =
		(struct generic_fts_tokenizer *)_tok;

	unichar_t c;
	size_t i, char_start_i, start_skip = 0;
	enum letter_type lt;

	/* TODO: Process 8bit chars separately, to speed things up. */
	for (i = 0; i < size; i++) {
		char_start_i = i;
		if (uni_utf8_get_char_n(data + i, size - i, &c) <= 0)
			i_unreached();
		tok->last_size = uni_utf8_char_bytes(data[i]);
		i += tok->last_size - 1; /* Utf8 bytes > 1, for() handles the 1 byte increment. */
		lt = letter_type(c);
		if (tok->prev_letter == LETTER_TYPE_NONE && is_nonword(lt)) {
			/* TODO: test that start_skip works with multibyte utf8 chars */
			start_skip = i + 1; /* Skip non-token chars at start of data */
			continue;
		}
		if (uni_found_word_boundary(tok, lt)) {
			i_assert(char_start_i >= start_skip && size >= start_skip);
			buffer_append(tok->token, data + start_skip,
			              char_start_i - start_skip);
			*skip_r = i + 1;
			return fts_tokenizer_generic_tr29_current_token(tok, token_r);
		}
	}
	i_assert(i >= start_skip && size >= start_skip);
	buffer_append(tok->token, data + start_skip, i - start_skip);
	*skip_r = i;

	if (size == 0 && tok->token->used > 0) {
		/* return the last token */
		*skip_r = 0;
		return fts_tokenizer_generic_tr29_current_token(tok, token_r);
	}
	return 0;
}

static int
fts_tokenizer_generic_next(struct fts_tokenizer *_tok ATTR_UNUSED,
			   const unsigned char *data ATTR_UNUSED,
                           size_t size ATTR_UNUSED,
                           size_t *skip_r ATTR_UNUSED,
                           const char **token_r ATTR_UNUSED)
{
	i_unreached();
}

static const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs = {
	fts_tokenizer_generic_create,
	fts_tokenizer_generic_destroy,
	fts_tokenizer_generic_next
};

static const struct fts_tokenizer fts_tokenizer_generic_real = {
	.name = FTS_TOKENIZER_GENERIC_NAME,
	.v = &generic_tokenizer_vfuncs
};
const struct fts_tokenizer *fts_tokenizer_generic = &fts_tokenizer_generic_real;

const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_simple = {
	fts_tokenizer_generic_create,
	fts_tokenizer_generic_destroy,
	fts_tokenizer_generic_next_simple
};
const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_tr29 = {
	fts_tokenizer_generic_create,
	fts_tokenizer_generic_destroy,
	fts_tokenizer_generic_next_tr29
};