view src/lib-fts/fts-filter-stopwords.c @ 21390:2e2563132d5f

Updated copyright notices to include the year 2017.
author Stephan Bosch <stephan.bosch@dovecot.fi>
date Wed, 11 Jan 2017 02:51:13 +0100
parents 5bdcdff02638
children cb108f786fb4
line wrap: on
line source

/* Copyright (c) 2014-2017 Dovecot authors, see the included COPYING file */

#include "lib.h"
#include "array.h"
#include "istream.h"
#include "strfuncs.h"
#include "hash.h"
#include "unichar.h"
#include "fts-language.h"
#include "fts-filter-private.h"

/* Path of a stopword list: printf-style format taking the stopwords
   directory and then the language name. */
#define STOPWORDS_FILE_FORMAT "%s/stopwords_%s.txt"

/* The word on a line ends at the first of these characters; the rest of
   the line is ignored. */
#define STOPWORDS_CUTCHARS "|#\t "
/* A line is skipped entirely if one of these characters appears before the
   point where the word is cut. */
#define STOPWORDS_DISALLOWED_CHARS "/\\<>.,\":()\t\n\r"

/* State of one stopwords filter instance. */
struct fts_filter_stopwords {
	/* Generic filter part. Must stay the first member: the filter
	   callbacks cast a struct fts_filter pointer back to this struct. */
	struct fts_filter filter;
	/* Pool-allocated language record; only its name is filled in by
	   the create function. */
	struct fts_language *lang;
	/* Pool that holds this struct, the language record and the
	   stopword strings. */
	pool_t pool;
	/* Set of stopwords: every word is stored as both key and value.
	   Created on the first filter call, not at creation time. */
	HASH_TABLE(const char *, const char *) stopwords;
	/* Directory containing the stopwords_<language>.txt files. */
	const char *stopwords_dir;
};

/*
 * Load the stopword list of the filter's language into filter->stopwords.
 *
 * The file path is built from STOPWORDS_FILE_FORMAT using
 * filter->stopwords_dir and filter->lang->name. Every line contributes at
 * most one word: the text before the first STOPWORDS_CUTCHARS character.
 * Lines where that text is empty, or where a STOPWORDS_DISALLOWED_CHARS
 * character occurs inside it, are skipped.
 *
 * filter->stopwords must already be created by the caller.
 *
 * Returns 0 on success. Returns -1 and sets *error_r (formatted with
 * t_strdup_printf()) if the input stream reported an error. A successfully
 * read file that yields no words only logs a warning.
 */
static int fts_filter_stopwords_read_list(struct fts_filter_stopwords *filter,
					  const char **error_r)
{
	struct istream *input;
	const char *line, *word, *path;
	int ret = 0;
	size_t len;

	path = t_strdup_printf(STOPWORDS_FILE_FORMAT,
			       filter->stopwords_dir, filter->lang->name);

	input = i_stream_create_file(path, IO_BLOCK_SIZE);
	while ((line = i_stream_read_next_line(input)) != NULL) {
		/* The word ends where a comment or whitespace begins. */
		len = strcspn(line, STOPWORDS_CUTCHARS);
		if (len == 0)
			continue;
		/* Reject the line if a disallowed character is part of
		   the word itself. */
		if (strcspn(line, STOPWORDS_DISALLOWED_CHARS) < len)
			continue;
		word = p_strndup(filter->pool, line, len);
		/* The table is used as a set, and a list may contain the
		   same word more than once. Use update (insert-or-replace)
		   instead of insert so that a repeated word is simply
		   accepted. */
		hash_table_update(filter->stopwords, word, word);
	}

	/* A NULL line means either end of file or a stream error; tell
	   them apart here. */
	if (input->stream_errno != 0) {
		*error_r = t_strdup_printf("Failed to read stopword list %s: %s",
					   path, i_stream_get_error(input));
		ret = -1;
	}

	if (ret == 0 && hash_table_count(filter->stopwords) == 0)
		i_warning("Stopwords list \"%s\" seems empty. Is the file correctly formatted?", path);

	i_stream_destroy(&input);
	return ret;
}

/*
 * Free a stopwords filter: destroy the stopword table if it was ever
 * created, then drop the reference to the filter's pool.
 */
static void fts_filter_stopwords_destroy(struct fts_filter *filter)
{
	struct fts_filter_stopwords *stopwords_filter =
		(struct fts_filter_stopwords *)filter;

	/* The table is created by the first filter call, so it may not
	   exist yet. */
	if (hash_table_is_created(stopwords_filter->stopwords)) {
		hash_table_destroy(&stopwords_filter->stopwords);
	}

	/* The table goes first, the pool last. */
	pool_unref(&stopwords_filter->pool);
}

/*
 * Create a stopwords filter for the given language.
 *
 * settings is a NULL-terminated array of alternating keys and values. The
 * only recognized key is "stopwords_dir", which overrides the default
 * directory DATADIR"/stopwords".
 *
 * Returns 0 and stores the new filter in *filter_r. Returns -1 and sets
 * *error_r if an unknown setting key is found; nothing is allocated in
 * that case. The stopword file itself is not read here.
 */
static int
fts_filter_stopwords_create(const struct fts_language *lang,
                            const char *const *settings,
                            struct fts_filter **filter_r,
                            const char **error_r)
{
	struct fts_filter_stopwords *sp;
	pool_t pp;
	const char *dir = NULL;
	unsigned int i;

	/* Validate all settings before allocating anything. */
	for (i = 0; settings[i] != NULL; i += 2) {
		const char *key = settings[i], *value = settings[i+1];

		if (strcmp(key, "stopwords_dir") == 0) {
			dir = value;
		} else {
			*error_r = t_strdup_printf("Unknown setting: %s", key);
			return -1;
		}
	}
	/* Size the pool for the struct that is actually allocated from it
	   first, so that this allocation does not force it to grow. */
	pp = pool_alloconly_create(MEMPOOL_GROWING"fts_filter_stopwords",
	                           sizeof(*sp));
	sp = p_new(pp, struct fts_filter_stopwords, 1);
	sp->filter = *fts_filter_stopwords;
	sp->pool = pp;
	/* Keep a private language record; only the name is copied. */
	sp->lang = p_malloc(sp->pool, sizeof(struct fts_language));
	sp->lang->name = p_strdup(sp->pool, lang->name);
	if (dir != NULL)
		sp->stopwords_dir = p_strdup(pp, dir);
	else
		sp->stopwords_dir = DATADIR"/stopwords";
	*filter_r = &sp->filter;
	return 0;
}

/*
 * Check *token against the stopword list.
 *
 * The list is loaded on the first call. If loading fails, the table is
 * destroyed again so that the next call retries the load and reports the
 * error, instead of running with an empty or partial list.
 *
 * Returns 1 if the token is not in the list, 0 if it is, and -1 with
 * *error_r set if the list could not be loaded. *token is never modified.
 */
static int
fts_filter_stopwords_filter(struct fts_filter *filter, const char **token,
			    const char **error_r)
{
	struct fts_filter_stopwords *sp =
		(struct fts_filter_stopwords *) filter;

	if (!hash_table_is_created(sp->stopwords)) {
		hash_table_create(&sp->stopwords, sp->pool, 0, str_hash, strcmp);
		if (fts_filter_stopwords_read_list(sp, error_r) < 0) {
			/* Do not keep a table that looks loaded but is
			   not: later calls would skip the load and pass
			   every token through as a non-stopword. */
			hash_table_destroy(&sp->stopwords);
			return -1;
		}
	}
	return hash_table_lookup(sp->stopwords, *token) == NULL ? 1 : 0;
}

/* Template for the "stopwords" filter class; the create function copies it
   into every new instance. The callbacks are given positionally, so their
   order must match the declaration of the callback struct (declared
   outside this file). */
const struct fts_filter fts_filter_stopwords_real = {
	.class_name = "stopwords",
	.v = {
		fts_filter_stopwords_create,
		fts_filter_stopwords_filter,
		fts_filter_stopwords_destroy
	}
};
/* Pointer through which the filter class is referenced. */
const struct fts_filter *fts_filter_stopwords = &fts_filter_stopwords_real;