changeset 4621:446646de0c4a HEAD

--with-lucene now enables Lucene full text search indexing. Note that using it breaks IMAP RFC compliance. It also seems to have problems finding text that contains special characters, such as email addresses. You can enable it anyway by loading the fts and fts_lucene plugins and setting fts=lucene in the plugin section.
author Timo Sirainen <tss@iki.fi>
date Mon, 18 Sep 2006 02:15:53 +0300
parents f66c874602a4
children 1febe4a45c36
files configure.in src/plugins/Makefile.am src/plugins/fts-lucene/.cvsignore src/plugins/fts-lucene/Makefile.am src/plugins/fts-lucene/fts-backend-lucene.c src/plugins/fts-lucene/fts-lucene-plugin.c src/plugins/fts-lucene/fts-lucene-plugin.h src/plugins/fts-lucene/lucene-wrapper.cc src/plugins/fts-lucene/lucene-wrapper.h
diffstat 9 files changed, 535 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/configure.in	Mon Sep 18 02:08:35 2006 +0300
+++ b/configure.in	Mon Sep 18 02:15:53 2006 +0300
@@ -218,6 +218,16 @@
 	fi,
 	want_sqlite=no)
 
+dnl Optional CLucene support: off by default, enabled by --with-lucene (any
+dnl value other than "no"). BUILD_LUCENE is the matching automake conditional.
+AC_ARG_WITH(lucene,
+[  --with-lucene           Build with CLucene full text search support],
+	if test x$withval = xno; then
+		want_lucene=no
+	else
+		want_lucene=yes
+	fi,
+	want_lucene=no)
+AM_CONDITIONAL(BUILD_LUCENE, test "$want_lucene" = "yes")
+
 AC_ARG_WITH(ssl,
 [  --with-ssl=[gnutls|openssl] Build with GNUTLS or OpenSSL (default)],
 	if test x$withval = xno; then
@@ -1801,6 +1811,7 @@
 src/plugins/convert/Makefile
 src/plugins/expire/Makefile
 src/plugins/fts/Makefile
+src/plugins/fts-lucene/Makefile
 src/plugins/quota/Makefile
 src/plugins/imap-quota/Makefile
 src/plugins/trash/Makefile
--- a/src/plugins/Makefile.am	Mon Sep 18 02:08:35 2006 +0300
+++ b/src/plugins/Makefile.am	Mon Sep 18 02:15:53 2006 +0300
@@ -2,4 +2,8 @@
 ZLIB = zlib
 endif
 
-SUBDIRS = acl convert expire fts quota imap-quota trash $(ZLIB)
+if BUILD_LUCENE
+FTS_LUCENE = fts-lucene
+endif
+
+SUBDIRS = acl convert expire fts quota imap-quota trash $(ZLIB) $(FTS_LUCENE)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugins/fts-lucene/.cvsignore	Mon Sep 18 02:15:53 2006 +0300
@@ -0,0 +1,8 @@
+*.la
+*.lo
+*.o
+.deps
+.libs
+Makefile
+Makefile.in
+so_locations
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugins/fts-lucene/Makefile.am	Mon Sep 18 02:15:53 2006 +0300
@@ -0,0 +1,29 @@
+# Include paths for lib, lib-mail, lib-storage and the generic fts plugin.
+# NOTE(review): -I/usr/lib is a hard-coded system path, presumably needed for
+# a CLucene header installed under the library directory -- confirm, and
+# prefer a path detected by configure.
+AM_CPPFLAGS = \
+	-I$(top_srcdir)/src/lib \
+	-I$(top_srcdir)/src/lib-mail \
+	-I$(top_srcdir)/src/lib-storage \
+	-I$(top_srcdir)/src/plugins/fts \
+	-I/usr/lib
+
+lib02_fts_lucene_plugin_la_LDFLAGS = -module -avoid-version
+
+module_LTLIBRARIES = \
+	lib02_fts_lucene_plugin.la
+
+lib02_fts_lucene_plugin_la_LIBADD = \
+	-lclucene
+
+lib02_fts_lucene_plugin_la_SOURCES = \
+	fts-lucene-plugin.c \
+	fts-backend-lucene.c \
+	lucene-wrapper.cc
+
+noinst_HEADERS = \
+	fts-lucene-plugin.h
+
+# Symlink the installed plugin into the imap and lda module directories,
+# replacing any link left by a previous install.
+install-exec-local:
+	for d in imap lda; do \
+	  $(mkdir_p) $(DESTDIR)$(moduledir)/$$d; \
+	  rm -f $(DESTDIR)$(moduledir)/$$d/lib02_fts_lucene_plugin.so; \
+	  $(LN_S) ../lib02_fts_lucene_plugin.so $(DESTDIR)$(moduledir)/$$d; \
+	done
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugins/fts-lucene/fts-backend-lucene.c	Mon Sep 18 02:15:53 2006 +0300
@@ -0,0 +1,111 @@
+/* Copyright (C) 2006 Timo Sirainen */
+
+#include "lib.h"
+#include "lucene-wrapper.h"
+#include "fts-lucene-plugin.h"
+
+/* Lucene implementation of an FTS backend. The generic backend must remain
+   the first member: the callbacks in this file cast a struct fts_backend *
+   back to this type. */
+struct lucene_fts_backend {
+	struct fts_backend backend;
+	/* wrapper handle created by lucene_index_init() */
+	struct lucene_index *index;
+
+	/* filled in by build_init(), then set to the UID of each build_more() */
+	uint32_t last_uid;
+};
+
+/* Allocate a Lucene backend for the index stored at `path` and return a
+   pointer to its embedded generic backend. */
+static struct fts_backend *fts_backend_lucene_init(const char *path)
+{
+	struct lucene_fts_backend *lucene;
+
+	lucene = i_new(struct lucene_fts_backend, 1);
+	lucene->index = lucene_index_init(path);
+	/* start from a copy of the backend template (name + vfuncs) */
+	lucene->backend = fts_backend_lucene;
+	return &lucene->backend;
+}
+
+/* Release the Lucene index handle and the backend allocated by
+   fts_backend_lucene_init(). */
+static void fts_backend_lucene_deinit(struct fts_backend *_backend)
+{
+	struct lucene_fts_backend *lucene;
+
+	lucene = (struct lucene_fts_backend *)_backend;
+	lucene_index_deinit(lucene->index);
+	i_free(lucene);
+}
+
+/* Begin an index build. Always returns a context; if the Lucene index can't
+   be prepared the context is only marked as failed. *last_uid_r receives the
+   backend's last_uid as filled in by lucene_index_build_init(). */
+static struct fts_backend_build_context *
+fts_backend_lucene_build_init(struct fts_backend *_backend, uint32_t *last_uid_r)
+{
+	struct fts_backend_build_context *build_ctx;
+	struct lucene_fts_backend *lucene;
+
+	lucene = (struct lucene_fts_backend *)_backend;
+
+	build_ctx = i_new(struct fts_backend_build_context, 1);
+	build_ctx->backend = _backend;
+
+	if (lucene_index_build_init(lucene->index, &lucene->last_uid) < 0)
+		build_ctx->failed = TRUE;
+	*last_uid_r = lucene->last_uid;
+
+	return build_ctx;
+}
+
+/* Feed `size` bytes of text for message `uid` to the index. UIDs must not
+   decrease between calls. Returns -1 if the build has already failed,
+   otherwise whatever lucene_index_build_more() returns. */
+static int
+fts_backend_lucene_build_more(struct fts_backend_build_context *ctx,
+			      uint32_t uid, const unsigned char *data,
+			      size_t size)
+{
+	struct lucene_fts_backend *backend;
+
+	if (ctx->failed)
+		return -1;
+
+	backend = (struct lucene_fts_backend *)ctx->backend;
+	i_assert(uid >= backend->last_uid);
+	backend->last_uid = uid;
+
+	return lucene_index_build_more(backend->index, uid, data, size);
+}
+
+/* Finish an index build: let the wrapper flush and close the Lucene writer,
+   then free the build context. Returns 0 on success, -1 if the build had
+   already failed or if finishing the index failed. */
+static int
+fts_backend_lucene_build_deinit(struct fts_backend_build_context *ctx)
+{
+	struct lucene_fts_backend *backend =
+		(struct lucene_fts_backend *)ctx->backend;
+	int ret = ctx->failed ? -1 : 0;
+
+	/* flush/optimize/close errors must reach the caller, otherwise a
+	   failed index write would be reported as success */
+	if (lucene_index_build_deinit(backend->index) < 0)
+		ret = -1;
+	i_free(ctx);
+	return ret;
+}
+
+/* Backend lookup callback: forwards `key` and `result` to the Lucene index
+   wrapper and returns its status. */
+static int
+fts_backend_lucene_lookup(struct fts_backend *_backend, const char *key,
+			 ARRAY_TYPE(seq_range) *result)
+{
+	struct lucene_fts_backend *lucene;
+
+	lucene = (struct lucene_fts_backend *)_backend;
+	return lucene_index_lookup(lucene->index, key, result);
+}
+
+/* Backend filter callback: forwards `key` and `result` to the Lucene index
+   wrapper and returns its status. */
+static int
+fts_backend_lucene_filter(struct fts_backend *_backend, const char *key,
+			 ARRAY_TYPE(seq_range) *result)
+{
+	struct lucene_fts_backend *lucene;
+
+	lucene = (struct lucene_fts_backend *)_backend;
+	return lucene_index_filter(lucene->index, key, result);
+}
+
+/* Backend template registered by the plugin; fts_backend_lucene_init()
+   copies it into every backend instance. The initializer is positional, so
+   the order must match struct fts_backend (declared outside this file). */
+struct fts_backend fts_backend_lucene = {
+	/* backend name (the plugin unregisters by this name) */
+	"lucene",
+	/* NOTE(review): meaning of this flag is defined by struct fts_backend,
+	   which is not visible here -- confirm */
+	TRUE,
+
+	{
+		fts_backend_lucene_init,
+		fts_backend_lucene_deinit,
+		fts_backend_lucene_build_init,
+		fts_backend_lucene_build_more,
+		fts_backend_lucene_build_deinit,
+		fts_backend_lucene_lookup,
+		fts_backend_lucene_filter
+	}
+};
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugins/fts-lucene/fts-lucene-plugin.c	Mon Sep 18 02:15:53 2006 +0300
@@ -0,0 +1,14 @@
+/* Copyright (C) 2006 Timo Sirainen */
+
+#include "lib.h"
+#include "fts-lucene-plugin.h"
+
+/* Plugin entry point: register the Lucene backend with the fts plugin. */
+void fts_lucene_plugin_init(void)
+{
+	fts_backend_register(&fts_backend_lucene);
+}
+
+/* Plugin exit point: unregister the Lucene backend by its name. */
+void fts_lucene_plugin_deinit(void)
+{
+	fts_backend_unregister(fts_backend_lucene.name);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugins/fts-lucene/fts-lucene-plugin.h	Mon Sep 18 02:15:53 2006 +0300
@@ -0,0 +1,11 @@
+#ifndef __FTS_LUCENE_PLUGIN_H
+#define __FTS_LUCENE_PLUGIN_H
+
+#include "fts-api-private.h"
+
+/* Backend template defined in fts-backend-lucene.c */
+extern struct fts_backend fts_backend_lucene;
+
+/* Register / unregister the Lucene backend with the fts plugin. */
+void fts_lucene_plugin_init(void);
+void fts_lucene_plugin_deinit(void);
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugins/fts-lucene/lucene-wrapper.cc	Mon Sep 18 02:15:53 2006 +0300
@@ -0,0 +1,327 @@
+/* Copyright (C) 2006 Timo Sirainen */
+
+extern "C" {
+#include "lib.h"
+#include "str-sanitize.h"
+#include "lucene-wrapper.h"
+};
+#include <CLucene.h>
+
+using namespace lucene::document;
+using namespace lucene::index;
+using namespace lucene::search;
+using namespace lucene::queryParser;
+
+/* State for one Lucene index directory. The CLucene objects are created on
+   demand and are all released by lucene_index_close(). */
+struct lucene_index {
+	/* private copy of the index path given to lucene_index_init() */
+	char *path;
+
+	IndexReader *reader;
+	IndexWriter *writer;
+	IndexSearcher *searcher;
+	lucene::analysis::standard::StandardAnalyzer *analyzer;
+
+	/* document being assembled for prev_uid; NULL when nothing is pending */
+	Document *doc;
+	/* prev_uid: UID of the message `doc` belongs to (0 between builds).
+	   last_uid: UID read from the index by lucene_index_build_init(). */
+	uint32_t prev_uid, last_uid;
+};
+
+/* Number of bytes to skip for a UTF-8 sequence, indexed by its first byte:
+   0x00-0xBF -> 1, 0xC0-0xDF -> 2, 0xE0-0xEF -> 3, 0xF0-0xF7 -> 4,
+   0xF8-0xFB -> 5, 0xFC-0xFD -> 6, 0xFE-0xFF -> 1. */
+static const uint8_t utf8_skip_table[256] = {
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
+};
+
+/* Allocate an index handle for `path`. The path is copied; nothing is opened
+   until a build or lookup needs it. */
+struct lucene_index *lucene_index_init(const char *path)
+{
+	struct lucene_index *idx = i_new(struct lucene_index, 1);
+
+	idx->path = i_strdup(path);
+	return idx;
+}
+
+/* Release the reader, writer, searcher and analyzer, if any. The pending
+   document (index->doc) is not touched here. */
+static void lucene_index_close(struct lucene_index *index)
+{
+	_CLDELETE(index->reader);
+	_CLDELETE(index->writer);
+	_CLDELETE(index->searcher);
+	_CLDELETE(index->analyzer);
+}
+
+/* Close all CLucene objects and free the handle together with its copy of
+   the path. */
+void lucene_index_deinit(struct lucene_index *index)
+{
+	lucene_index_close(index);
+	i_free(index->path);
+	i_free(index);
+}
+
+/* Make sure index->reader is open.
+   Returns 1 if a reader is available, 0 if no index exists at the path,
+   -1 if opening failed (the error is logged). */
+static int lucene_index_open(struct lucene_index *index)
+{
+	if (index->reader == NULL) {
+		if (!IndexReader::indexExists(index->path))
+			return 0;
+
+		try {
+			index->reader = IndexReader::open(index->path);
+		} catch (CLuceneError &err) {
+			i_error("lucene: IndexReader::open(%s): %s", index->path, err.what());
+			return -1;
+		}
+	}
+	return 1;
+}
+
+/* Make sure a searcher (and the analyzer used for query parsing) exists.
+   Returns 1 when the searcher is ready; otherwise passes through the 0 / -1
+   result of lucene_index_open(). */
+static int lucene_index_open_search(struct lucene_index *index)
+{
+	if (index->searcher != NULL)
+		return 1;
+
+	int ret = lucene_index_open(index);
+	if (ret <= 0)
+		return ret;
+
+	/* an analyzer may already exist; only create one if it's missing */
+	if (index->analyzer == NULL) {
+		index->analyzer =
+			_CLNEW lucene::analysis::standard::StandardAnalyzer();
+	}
+	index->searcher = _CLNEW IndexSearcher(index->reader);
+	return 1;
+}
+
+/* Parse the decimal "uid" field of `doc` into *uid_r.
+   Returns 0 on success. Returns -1 and logs an error if the field is missing
+   or empty, contains a non-digit, or doesn't fit into 32 bits; all of these
+   mean the index is corrupted. *uid_r is written only on success. */
+static int lucene_doc_get_uid(struct lucene_index *index,
+			      Document *doc, uint32_t *uid_r)
+{
+	Field *field = doc->getField(_T("uid"));
+	TCHAR *uid = field == NULL ? NULL : field->stringValue();
+	if (uid == NULL || *uid == 0) {
+		i_error("lucene: Corrupted FTS index %s: No UID for document",
+			index->path);
+		return -1;
+	}
+
+	uint32_t num = 0;
+	for (; *uid != 0; uid++) {
+		/* reject garbage instead of silently computing a bogus UID;
+		   the second test is num*10 + digit > UINT32_MAX without
+		   overflowing */
+		if (*uid < '0' || *uid > '9' ||
+		    num > ((uint32_t)-1 - (uint32_t)(*uid - '0')) / 10) {
+			i_error("lucene: Corrupted FTS index %s: "
+				"Invalid UID for document", index->path);
+			return -1;
+		}
+		num = num*10 + (*uid - '0');
+	}
+	*uid_r = num;
+	return 0;
+}
+
+/* Read the UID stored in the highest-numbered document of the open reader.
+   *last_uid_r is set to 0 when the reader reports no documents.
+   Returns 0 on success, -1 if that document's UID can't be read. */
+static int
+lucene_index_get_last_uid(struct lucene_index *index, uint32_t *last_uid_r)
+{
+	int32_t doc_count = index->reader->maxDoc();
+
+	if (doc_count == 0) {
+		*last_uid_r = 0;
+		return 0;
+	}
+
+	/* the fetched document is ours to delete on both outcomes */
+	Document *last_doc = index->reader->document(doc_count-1);
+	int ret = lucene_doc_get_uid(index, last_doc, last_uid_r) < 0 ? -1 : 0;
+	_CLDELETE(last_doc);
+	return ret;
+}
+
+/* Prepare the index for adding documents.
+   *last_uid_r receives the UID of the last indexed message, or 0 if no index
+   exists yet. Creates the index on first use. If a writer is already open it
+   is reused. Returns 0 on success, -1 on error (logged). */
+int lucene_index_build_init(struct lucene_index *index, uint32_t *last_uid_r)
+{
+	/* Maximum number of terms indexed per field. This must be large
+	   enough for message text; 10000 is Lucene's documented default.
+	   (MAX_INT_STRLEN is the size of a digit buffer and would truncate
+	   every field after a couple of dozen terms.) */
+	const int32_t max_terms_per_field = 10000;
+
+	if (lucene_index_open(index) < 0)
+		return -1;
+
+	if (index->reader == NULL)
+		index->last_uid = 0;
+	else {
+		if (lucene_index_get_last_uid(index, &index->last_uid) < 0)
+			return -1;
+	}
+	*last_uid_r = index->last_uid;
+
+	if (index->writer != NULL)
+		return 0;
+
+	bool exists = IndexReader::indexExists(index->path);
+	/* don't leak an analyzer that is already allocated */
+	if (index->analyzer == NULL) {
+		index->analyzer =
+			_CLNEW lucene::analysis::standard::StandardAnalyzer();
+	}
+	try {
+		index->writer = _CLNEW IndexWriter(index->path,
+						   index->analyzer, !exists);
+	} catch (CLuceneError &err) {
+		i_error("lucene: IndexWriter(%s) failed: %s",
+			index->path, err.what());
+		return -1;
+	}
+
+	index->writer->setMaxFieldLength(max_terms_per_field);
+	return 0;
+}
+
+/* Count the UTF-8 characters in the first `size` bytes of `datap`, stopping
+   early at a NUL byte. Pass (size_t)-1 for a NUL-terminated string.
+   Sequence lengths come from utf8_skip_table; continuation bytes are not
+   validated. Asserts that the last sequence doesn't extend past `size`. */
+static unsigned int utf8_strlen_n(const void *datap, size_t size)
+{
+	const unsigned char *data = (const unsigned char *)datap;
+	unsigned int len = 0;
+	size_t i;
+
+	/* no end pointer is computed: data + size would overflow for
+	   size == (size_t)-1 */
+	for (i = 0; i < size && data[i] != '\0'; ) {
+		i += utf8_skip_table[data[i] & 0xff];
+		/* NOTE(review): aborts on input ending in a truncated
+		   multi-byte sequence -- confirm callers never pass one */
+		i_assert(i <= size);
+		len++;
+	}
+	return len;
+}
+
+/* Write the pending document, if any, to the index and release it.
+   Returns 0 on success or when nothing was pending, -1 if adding the
+   document failed (logged). The document is released in both cases. */
+static int lucene_index_build_flush(struct lucene_index *index)
+{
+	if (index->doc == NULL)
+		return 0;
+
+	int ret;
+	try {
+		index->writer->addDocument(index->doc);
+		ret = 0;
+	} catch (CLuceneError &err) {
+		i_error("lucene: IndexWriter::addDocument(%s) failed: %s",
+			index->path, err.what());
+		ret = -1;
+	}
+
+	_CLDELETE(index->doc);
+	index->doc = NULL;
+	return ret;
+}
+
+/* Add `size` bytes of UTF-8 text belonging to message `uid`.
+   Text for the same UID is accumulated into one document. When the UID
+   changes, the previous document is flushed to the index and a new one is
+   started. The last document is written by lucene_index_build_deinit().
+   `uid` must be larger than the last UID already in the index and `size`
+   must be non-zero. Returns 0 on success, -1 if flushing the previous
+   document failed. */
+int lucene_index_build_more(struct lucene_index *index, uint32_t uid,
+			    const unsigned char *data, size_t size)
+{
+	unsigned int len;
+
+	i_assert(uid > index->last_uid);
+	i_assert(size > 0);
+
+	/* data is bounded by size and need not be NUL-terminated: convert
+	   exactly len characters and terminate the wide string ourselves */
+	len = utf8_strlen_n(data, size);
+	wchar_t dest[len+1];
+	lucene_utf8towcs(dest, (const char *)data, len);
+	dest[len] = 0;
+
+	if (uid != index->prev_uid) {
+		char id[MAX_INT_STRLEN];
+		TCHAR tid[MAX_INT_STRLEN];
+
+		if (lucene_index_build_flush(index) < 0)
+			return -1;
+		index->prev_uid = uid;
+
+		index->doc = _CLNEW Document();
+		i_snprintf(id, sizeof(id), "%u", uid);
+		STRCPY_AtoT(tid, id, MAX_INT_STRLEN);
+		index->doc->add(*Field::Text(_T("uid"), tid));
+	}
+
+	/* only collect the text here; the document is written exactly once
+	   by lucene_index_build_flush(), which also catches CLucene errors */
+	index->doc->add(*Field::Text(_T("contents"), dest));
+	return 0;
+}
+
+/* Finish a build: write the pending document, optimize and close the writer,
+   then release all CLucene objects. Each step is attempted even if an
+   earlier one failed. Returns 0 on success, -1 if there was no writer or any
+   step failed (logged). */
+int lucene_index_build_deinit(struct lucene_index *index)
+{
+	index->prev_uid = 0;
+	if (index->writer == NULL) {
+		lucene_index_close(index);
+		return -1;
+	}
+
+	int ret = lucene_index_build_flush(index) < 0 ? -1 : 0;
+
+	try {
+		index->writer->optimize();
+	} catch (CLuceneError &err) {
+		i_error("lucene: IndexWriter::optimize(%s) failed: %s",
+			index->path, err.what());
+		ret = -1;
+	}
+
+	try {
+		index->writer->close();
+	} catch (CLuceneError &err) {
+		i_error("lucene: IndexWriter::close(%s) failed: %s",
+			index->path, err.what());
+		ret = -1;
+	}
+
+	lucene_index_close(index);
+	return ret;
+}
+
+/* Search the "contents" field for `key` as a quoted phrase and add the UID
+   of every hit to `result`.
+   Returns 0 on success, -1 if the index can't be opened (including when it
+   doesn't exist), the query can't be parsed, the search fails or a hit has
+   no valid UID. The index is closed again before returning once it was
+   opened.
+   NOTE(review): `key` is embedded in the query without escaping, so a key
+   containing '"' changes the query syntax -- confirm whether callers
+   sanitize it. */
+int lucene_index_lookup(struct lucene_index *index, const char *key,
+			ARRAY_TYPE(seq_range) *result)
+{
+	const char *quoted_key;
+	int ret = 0;
+
+	if (lucene_index_open_search(index) <= 0)
+		return -1;
+
+	t_push();
+	quoted_key = t_strdup_printf("\"%s\"", key);
+	unsigned int len = utf8_strlen_n(quoted_key, (size_t)-1);
+	wchar_t tkey[len + 1];
+	/* convert exactly len characters and terminate explicitly instead of
+	   relying on the converter to do it */
+	lucene_utf8towcs(tkey, quoted_key, len);
+	tkey[len] = 0;
+	t_pop();
+
+	Query *query = NULL;
+	try {
+		query = QueryParser::parse(tkey, _T("contents"),
+					   index->analyzer);
+	} catch (CLuceneError &err) {
+		i_error("lucene: QueryParser::parse(%s) failed: %s",
+			str_sanitize(key, 40), err.what());
+		lucene_index_close(index);
+		return -1;
+	}
+
+	/* declared outside the try block so that the hits are released even
+	   if iterating them throws */
+	Hits *hits = NULL;
+	try {
+		hits = index->searcher->search(query);
+
+		for (int32_t i = 0; i < hits->length(); i++) {
+			uint32_t uid;
+
+			if (lucene_doc_get_uid(index, &hits->doc(i),
+					       &uid) < 0) {
+				ret = -1;
+				break;
+			}
+
+			seq_range_array_add(result, 0, uid);
+		}
+	} catch (CLuceneError &err) {
+		i_error("lucene: search(%s) failed: %s",
+			index->path, err.what());
+		ret = -1;
+	}
+
+	_CLDELETE(hits);
+	_CLDELETE(query);
+	lucene_index_close(index);
+	return ret;
+}
+
+/* Not implemented yet: ignores all arguments and always returns -1. */
+int lucene_index_filter(struct lucene_index *index, const char *key,
+			ARRAY_TYPE(seq_range) *result)
+{
+	/* FIXME: implement */
+	return -1;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugins/fts-lucene/lucene-wrapper.h	Mon Sep 18 02:15:53 2006 +0300
@@ -0,0 +1,19 @@
+#ifndef __LUCENE_WRAPPER_H
+#define __LUCENE_WRAPPER_H
+
+#include "fts-api-private.h"
+
+/* Create / destroy a handle for the index at `path` (the path is copied). */
+struct lucene_index *lucene_index_init(const char *path);
+void lucene_index_deinit(struct lucene_index *index);
+
+/* Index building. build_init() stores the last indexed UID in *last_uid_r
+   (0 if there is no index yet). build_more() adds text for `uid`, which must
+   be larger than that UID; `size` must be non-zero. build_deinit() writes the
+   pending document and closes the index. All return 0 on success, -1 on
+   error. */
+int lucene_index_build_init(struct lucene_index *index, uint32_t *last_uid_r);
+int lucene_index_build_more(struct lucene_index *index, uint32_t uid,
+			    const unsigned char *data, size_t size);
+int lucene_index_build_deinit(struct lucene_index *index);
+
+/* lookup() adds the UIDs of messages matching `key` to `result` and returns
+   0, or -1 on error. filter() is not implemented and always returns -1. */
+int lucene_index_lookup(struct lucene_index *index, const char *key,
+			ARRAY_TYPE(seq_range) *result);
+int lucene_index_filter(struct lucene_index *index, const char *key,
+			ARRAY_TYPE(seq_range) *result);
+
+#endif