changeset 13186:b099ac847f86

fts-lucene: Added initial support for language detection and stemming. This requires textcat and snowball (libstemmer) libraries. This can be enabled by setting: fts_lucene = textcat_dir=/usr/share/libtextcat \ textcat_conf=/etc/dovecot/textcat.conf I'm not yet sure how good of an idea it is to actually use this, so it needs some testing.. For example how bad is it if language is detected incorrectly?
author Timo Sirainen <tss@iki.fi>
date Tue, 09 Aug 2011 15:05:18 +0300
parents 9d784aab6398
children 1de8c2e4adb2
files configure.in src/plugins/fts-lucene/Makefile.am src/plugins/fts-lucene/Snowball.cc src/plugins/fts-lucene/SnowballAnalyzer.h src/plugins/fts-lucene/SnowballFilter.h src/plugins/fts-lucene/fts-backend-lucene.c src/plugins/fts-lucene/fts-lucene-plugin.c src/plugins/fts-lucene/fts-lucene-plugin.h src/plugins/fts-lucene/lucene-wrapper.cc src/plugins/fts-lucene/lucene-wrapper.h
diffstat 10 files changed, 455 insertions(+), 19 deletions(-) [+]
line wrap: on
line diff
--- a/configure.in	Tue Aug 09 14:13:13 2011 +0300
+++ b/configure.in	Tue Aug 09 15:05:18 2011 +0300
@@ -2607,6 +2607,16 @@
 fi
 AM_CONDITIONAL(BUILD_SOLR, test "$have_solr" = "yes")
 
+if test "$want_lucene" = "yes"; then
+  AC_CHECK_LIB(textcat, special_textcat_Init, [
+    AC_CHECK_LIB(stemmer, sb_stemmer_new, [
+      have_lucene_textcat=yes
+      AC_DEFINE(HAVE_LUCENE_TEXTCAT,, Define if you want textcat and stemming support for CLucene)
+    ])
+  ])
+fi
+AM_CONDITIONAL(BUILD_LUCENE_TEXTCAT, test "$have_lucene_textcat" = "yes")
+
 dnl **
 dnl ** Settings
 dnl **
--- a/src/plugins/fts-lucene/Makefile.am	Tue Aug 09 14:13:13 2011 +0300
+++ b/src/plugins/fts-lucene/Makefile.am	Tue Aug 09 15:05:18 2011 +0300
@@ -12,14 +12,29 @@
 module_LTLIBRARIES = \
 	lib21_fts_lucene_plugin.la
 
+if BUILD_LUCENE_TEXTCAT
+TEXTCAT_LIBS = -lstemmer -ltextcat
+endif
+
 lib21_fts_lucene_plugin_la_LIBADD = \
-	-lclucene-shared -lclucene-core
+	-lclucene-shared -lclucene-core $(TEXTCAT_LIBS)
 
 lib21_fts_lucene_plugin_la_SOURCES = \
 	fts-lucene-plugin.c \
 	fts-backend-lucene.c \
-	lucene-wrapper.cc
+	lucene-wrapper.cc \
+	Snowball.cc
 
 noinst_HEADERS = \
 	fts-lucene-plugin.h \
-	lucene-wrapper.h
+	lucene-wrapper.h \
+	SnowballAnalyzer.h \
+	SnowballFilter.h
+
+if BUILD_LUCENE_TEXTCAT
+exampledir = $(docdir)/example-config
+example_DATA = \
+	textcat.conf
+else
+EXTRA_DIST = textcat.conf
+endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugins/fts-lucene/Snowball.cc	Tue Aug 09 15:05:18 2011 +0300
@@ -0,0 +1,124 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#include <CLucene.h>
+#include "SnowballAnalyzer.h"
+#include "SnowballFilter.h"
+#include <CLucene/util/CLStreams.h>
+#include <CLucene/analysis/Analyzers.h>
+#include <CLucene/analysis/standard/StandardTokenizer.h>
+#include <CLucene/analysis/standard/StandardFilter.h>
+
+extern "C" {
+#include "lib.h"
+#include "buffer.h"
+#include "unichar.h"
+#include "lucene-wrapper.h"
+};
+
+CL_NS_USE(analysis)
+CL_NS_USE(util)
+CL_NS_USE2(analysis,standard)
+
+CL_NS_DEF2(analysis,snowball)
+
+  /** Builds the named analyzer with no stop words. */
+  SnowballAnalyzer::SnowballAnalyzer(const char* language) {
+    this->language = strdup(language);
+	stopSet = NULL;
+  }
+
+  SnowballAnalyzer::~SnowballAnalyzer(){
+	  free(language); /* allocated with strdup(), must be freed with free() */
+	  if ( stopSet != NULL )
+		  _CLDELETE(stopSet);
+  }
+
+  /** Builds the named analyzer with the given stop words.
+  */
+  SnowballAnalyzer::SnowballAnalyzer(const char* language, const TCHAR** stopWords) {
+    this->language = strdup(language);
+
+    stopSet = _CLNEW CLTCSetList(true);
+	StopFilter::fillStopTable(stopSet,stopWords);
+  }
+
+  TokenStream* SnowballAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) {
+	 return this->tokenStream(fieldName,reader,false);
+  }
+
+  /** Constructs a {@link StandardTokenizer} filtered by a {@link
+      StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
+  TokenStream* SnowballAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader, bool deleteReader) {
+		BufferedReader* bufferedReader = reader->__asBufferedReader();
+		TokenStream* result;
+
+		if ( bufferedReader == NULL )
+			result =  _CLNEW StandardTokenizer( _CLNEW FilteredBufferedReader(reader, deleteReader), true );
+		else
+			result = _CLNEW StandardTokenizer(bufferedReader, deleteReader);
+
+	 result = _CLNEW StandardFilter(result, true);
+    result = _CLNEW CL_NS(analysis)::LowerCaseFilter(result, true);
+    if (stopSet != NULL)
+      result = _CLNEW CL_NS(analysis)::StopFilter(result, true, stopSet);
+    result = _CLNEW SnowballFilter(result, language, true);
+    return result;
+  }
+  
+  
+  
+  
+  
+  
+  
+    /** Construct the named stemming filter.
+   *
+   * @param in the input tokens to stem
+   * @param name the name of a stemmer
+   */
+	SnowballFilter::SnowballFilter(TokenStream* in, const char* language, bool deleteTS):
+		TokenFilter(in,deleteTS)
+	{
+		stemmer = sb_stemmer_new(language, NULL); //use utf8 encoding
+
+		if ( stemmer == NULL ){
+			_CLTHROWA(CL_ERR_IllegalArgument, "language not available for stemming\n"); //todo: richer error
+		}
+    }
+
+	SnowballFilter::~SnowballFilter(){
+		sb_stemmer_delete(stemmer);
+	}
+
+  /** Returns the next input Token, after being stemmed */
+  Token* SnowballFilter::next(Token* token){
+    if (input->next(token) == NULL)
+      return NULL;
+
+	unsigned char utf8text[LUCENE_MAX_WORD_LEN*5+1];
+	unsigned int len = I_MIN(LUCENE_MAX_WORD_LEN, token->termLength());
+
+	buffer_t buf = { 0, 0, { 0, 0, 0, 0, 0 } };
+	i_assert(sizeof(wchar_t) == sizeof(unichar_t));
+	buffer_create_data(&buf, utf8text, sizeof(utf8text));
+	uni_ucs4_to_utf8((const unichar_t *)token->termBuffer(), len, &buf);
+
+    const sb_symbol* stemmed = sb_stemmer_stem(stemmer, utf8text, buf.used);
+	if ( stemmed == NULL )
+		_CLTHROWA(CL_ERR_Runtime,"Out of memory");
+
+	int stemmedLen=sb_stemmer_length(stemmer);
+
+	unsigned int tchartext_size = uni_utf8_strlen_n(stemmed, stemmedLen) + 1;
+	TCHAR tchartext[tchartext_size];
+	lucene_utf8_n_to_tchar(stemmed,stemmedLen,tchartext,tchartext_size);
+	token->set(tchartext,token->startOffset(), token->endOffset(), token->type());
+	return token;
+  }
+
+
+CL_NS_END2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugins/fts-lucene/SnowballAnalyzer.h	Tue Aug 09 15:05:18 2011 +0300
@@ -0,0 +1,44 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_analysis_snowball_analyser_
+#define _lucene_analysis_snowball_analyser_
+
+#include "CLucene/analysis/AnalysisHeader.h"
+
+CL_CLASS_DEF(util,BufferedReader)
+CL_NS_DEF2(analysis,snowball)
+
+/** Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
+ * LowerCaseFilter}, {@link StopFilter} and {@link SnowballFilter}.
+ *
+ * Available stemmers are listed in {@link net.sf.snowball.ext}.  The name of a
+ * stemmer is the part of the class name before "Stemmer", e.g., the stemmer in
+ * {@link EnglishStemmer} is named "English".
+ */
+class CLUCENE_CONTRIBS_EXPORT SnowballAnalyzer: public Analyzer {
+  char* language;
+  CLTCSetList* stopSet;
+
+public:
+  /** Builds the named analyzer with no stop words. */
+  SnowballAnalyzer(const char* language="english");
+
+  /** Builds the named analyzer with the given stop words.
+  */
+  SnowballAnalyzer(const char* language, const TCHAR** stopWords);
+
+  ~SnowballAnalyzer();
+
+  /** Constructs a {@link StandardTokenizer} filtered by a {@link
+      StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
+  TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
+  TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader, bool deleteReader);
+};
+
+CL_NS_END2
+#endif
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugins/fts-lucene/SnowballFilter.h	Tue Aug 09 15:05:18 2011 +0300
@@ -0,0 +1,41 @@
+/*------------------------------------------------------------------------------
+* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
+*
+* Distributable under the terms of either the Apache License (Version 2.0) or
+* the GNU Lesser General Public License, as specified in the COPYING file.
+------------------------------------------------------------------------------*/
+#ifndef _lucene_analysis_snowball_filter_
+#define _lucene_analysis_snowball_filter_
+
+#include "CLucene/analysis/AnalysisHeader.h"
+#include "libstemmer.h"
+
+CL_NS_DEF2(analysis,snowball)
+
+/** A filter that stems words using a Snowball-generated stemmer.
+ *
+ * Available stemmers are listed in {@link net.sf.snowball.ext}.  The name of a
+ * stemmer is the part of the class name before "Stemmer", e.g., the stemmer in
+ * {@link EnglishStemmer} is named "English".
+ *
+ * Note: todo: This is not thread safe...
+ */
+class CLUCENE_CONTRIBS_EXPORT SnowballFilter: public TokenFilter {
+	struct sb_stemmer * stemmer;
+public:
+
+  /** Construct the named stemming filter.
+   *
+   * @param in the input tokens to stem
+   * @param name the name of a stemmer
+   */
+	SnowballFilter(TokenStream* in, const char* language, bool deleteTS);
+
+	~SnowballFilter();
+
+    /** Returns the next input Token, after being stemmed */
+    Token* next(Token* token);
+};
+
+CL_NS_END2
+#endif
--- a/src/plugins/fts-lucene/fts-backend-lucene.c	Tue Aug 09 14:13:13 2011 +0300
+++ b/src/plugins/fts-lucene/fts-backend-lucene.c	Tue Aug 09 15:05:18 2011 +0300
@@ -102,15 +102,18 @@
 {
 	struct lucene_fts_backend *backend =
 		(struct lucene_fts_backend *)_backend;
-	struct mailbox_list *list = _backend->ns->list;
+	struct fts_lucene_user *fuser =
+		FTS_LUCENE_USER_CONTEXT(_backend->ns->user);
 	const char *path;
 
-	path = mailbox_list_get_path(list, NULL,
+	path = mailbox_list_get_path(_backend->ns->list, NULL,
 				     MAILBOX_LIST_PATH_TYPE_INDEX);
 	i_assert(path != NULL); /* fts already checked this */
 
 	backend->dir_path = i_strconcat(path, "/"LUCENE_INDEX_DIR_NAME, NULL);
-	backend->index = lucene_index_init(backend->dir_path);
+	backend->index = lucene_index_init(backend->dir_path,
+					   fuser->set.textcat_dir,
+					   fuser->set.textcat_conf);
 	return 0;
 }
 
--- a/src/plugins/fts-lucene/fts-lucene-plugin.c	Tue Aug 09 14:13:13 2011 +0300
+++ b/src/plugins/fts-lucene/fts-lucene-plugin.c	Tue Aug 09 15:05:18 2011 +0300
@@ -1,19 +1,85 @@
 /* Copyright (c) 2006-2011 Dovecot authors, see the included COPYING file */
 
 #include "lib.h"
-#include "mail-storage-private.h"
+#include "mail-storage-hooks.h"
 #include "fts-lucene-plugin.h"
 
 const char *fts_lucene_plugin_version = DOVECOT_VERSION;
 
-unsigned int fts_lucene_storage_module_id;
+struct fts_lucene_user_module fts_lucene_user_module =
+	MODULE_CONTEXT_INIT(&mail_user_module_register);
+
+static int
+fts_lucene_plugin_init_settings(struct mail_user *user,
+				struct fts_lucene_settings *set,
+				const char *str)
+{
+	const char *const *tmp;
+
+	for (tmp = t_strsplit_spaces(str, " "); *tmp != NULL; tmp++) {
+		if (strncmp(*tmp, "textcat_conf=", 13) == 0) {
+			set->textcat_conf = p_strdup(user->pool, *tmp + 13);
+		} else if (strncmp(*tmp, "textcat_dir=", 12) == 0) {
+			set->textcat_dir = p_strdup(user->pool, *tmp + 12);
+		} else {
+			i_error("fts_lucene: Invalid setting: %s", *tmp);
+			return -1;
+		}
+	}
+	if (set->textcat_conf != NULL && set->textcat_dir == NULL) {
+		i_error("fts_lucene: textcat_conf set, but textcat_dir unset");
+		return -1;
+	}
+	if (set->textcat_conf == NULL && set->textcat_dir != NULL) {
+		i_error("fts_lucene: textcat_dir set, but textcat_conf unset");
+		return -1;
+	}
+#ifndef HAVE_LUCENE_TEXTCAT
+	if (set->textcat_conf != NULL) {
+		i_error("fts_lucene: textcat_conf set, "
+			"but Dovecot built without textcat support");
+		return -1;
+	}
+#endif
+	return 0;
+}
+
+static void fts_lucene_mail_user_create(struct mail_user *user, const char *env)
+{
+	struct fts_lucene_user *fuser;
+
+	fuser = p_new(user->pool, struct fts_lucene_user, 1);
+	if (fts_lucene_plugin_init_settings(user, &fuser->set, env) < 0) {
+		/* invalid settings, disabling */
+		return;
+	}
+
+	MODULE_CONTEXT_SET(user, fts_lucene_user_module, fuser);
+}
+
+static void fts_lucene_mail_user_created(struct mail_user *user)
+{
+	const char *env;
+
+	env = mail_user_plugin_getenv(user, "fts_lucene");
+	if (env != NULL)
+		fts_lucene_mail_user_create(user, env);
+}
+
+static struct mail_storage_hooks fts_lucene_mail_storage_hooks = {
+	.mail_user_created = fts_lucene_mail_user_created
+};
 
 void fts_lucene_plugin_init(struct module *module ATTR_UNUSED)
 {
 	fts_backend_register(&fts_backend_lucene);
+	mail_storage_hooks_add(module, &fts_lucene_mail_storage_hooks);
 }
 
 void fts_lucene_plugin_deinit(void)
 {
 	fts_backend_unregister(fts_backend_lucene.name);
+	mail_storage_hooks_remove(&fts_lucene_mail_storage_hooks);
 }
+
+const char *fts_lucene_plugin_dependencies[] = { "fts", NULL };
--- a/src/plugins/fts-lucene/fts-lucene-plugin.h	Tue Aug 09 14:13:13 2011 +0300
+++ b/src/plugins/fts-lucene/fts-lucene-plugin.h	Tue Aug 09 15:05:18 2011 +0300
@@ -1,9 +1,24 @@
 #ifndef FTS_LUCENE_PLUGIN_H
 #define FTS_LUCENE_PLUGIN_H
 
+#include "module-context.h"
+#include "mail-user.h"
 #include "fts-api-private.h"
 
+#define FTS_LUCENE_USER_CONTEXT(obj) \
+	MODULE_CONTEXT(obj, fts_lucene_user_module)
+
+struct fts_lucene_settings {
+	const char *textcat_conf, *textcat_dir;
+};
+
+struct fts_lucene_user {
+	union mail_user_module_context module_ctx;
+	struct fts_lucene_settings set;
+};
+
 extern struct fts_backend fts_backend_lucene;
+extern MODULE_CONTEXT_DEFINE(fts_lucene_user_module, &mail_user_module_register);
 
 void fts_lucene_plugin_init(struct module *module);
 void fts_lucene_plugin_deinit(void);
--- a/src/plugins/fts-lucene/lucene-wrapper.cc	Tue Aug 09 14:13:13 2011 +0300
+++ b/src/plugins/fts-lucene/lucene-wrapper.cc	Tue Aug 09 15:05:18 2011 +0300
@@ -13,16 +13,22 @@
 
 #include <dirent.h>
 #include <sys/stat.h>
+#ifdef HAVE_LUCENE_TEXTCAT
+#include <libtextcat/textcat.h>
+#endif
 };
 #include <CLucene.h>
 #include <CLucene/util/CLStreams.h>
 #include <CLucene/search/MultiPhraseQuery.h>
+#include "SnowballAnalyzer.h"
 
 /* Lucene's default is 10000. Use it here also.. */
 #define MAX_TERMS_PER_DOCUMENT 10000
 
 #define LUCENE_LOCK_OVERRIDE_SECS 60
 
+#define DEFAULT_LANGUAGE "english"
+
 using namespace lucene::document;
 using namespace lucene::index;
 using namespace lucene::search;
@@ -31,26 +35,49 @@
 using namespace lucene::analysis;
 using namespace lucene::util;
 
+struct lucene_analyzer {
+	char *lang;
+	Analyzer *analyzer;
+};
+
 struct lucene_index {
 	char *path;
+	char *textcat_dir, *textcat_conf;
 	wchar_t mailbox_guid[MAILBOX_GUID_HEX_LENGTH + 1];
 
 	IndexReader *reader;
 	IndexWriter *writer;
 	IndexSearcher *searcher;
-	Analyzer *analyzer;
+
+	Analyzer *default_analyzer, *cur_analyzer;
+	ARRAY_DEFINE(analyzers, struct lucene_analyzer);
 
 	Document *doc;
 	uint32_t prev_uid;
 };
 
-struct lucene_index *lucene_index_init(const char *path)
+static void *textcat = NULL;
+static bool textcat_broken = FALSE;
+static int textcat_refcount = 0;
+
+struct lucene_index *lucene_index_init(const char *path,
+				       const char *textcat_dir,
+				       const char *textcat_conf)
 {
 	struct lucene_index *index;
 
 	index = i_new(struct lucene_index, 1);
 	index->path = i_strdup(path);
-	index->analyzer = _CLNEW standard::StandardAnalyzer();
+	index->textcat_dir = i_strdup(textcat_dir);
+	index->textcat_conf = i_strdup(textcat_conf);
+#ifdef HAVE_LUCENE_TEXTCAT
+	index->default_analyzer = _CLNEW snowball::SnowballAnalyzer(DEFAULT_LANGUAGE);
+#else
+	index->default_analyzer = _CLNEW standard::StandardAnalyzer();
+#endif
+	i_array_init(&index->analyzers, 32);
+	textcat_refcount++;
+
 	return index;
 }
 
@@ -63,15 +90,29 @@
 
 void lucene_index_deinit(struct lucene_index *index)
 {
+	struct lucene_analyzer *a;
+
 	lucene_index_close(index);
-	_CLDELETE(index->analyzer);
+	array_foreach_modifiable(&index->analyzers, a) {
+		i_free(a->lang);
+		_CLDELETE(a->analyzer);
+	}
+	array_free(&index->analyzers);
+	if (--textcat_refcount == 0 && textcat != NULL) {
+#ifdef HAVE_LUCENE_TEXTCAT
+		textcat_Done(textcat);
+#endif
+		textcat = NULL;
+	}
+	_CLDELETE(index->default_analyzer);
+	i_free(index->textcat_dir);
+	i_free(index->textcat_conf);
 	i_free(index->path);
 	i_free(index);
 }
 
-static void
-lucene_utf8_n_to_tchar(const unsigned char *src, size_t srcsize,
-		       wchar_t *dest, size_t destsize)
+void lucene_utf8_n_to_tchar(const unsigned char *src, size_t srcsize,
+			    wchar_t *dest, size_t destsize)
 {
 	ARRAY_TYPE(unichars) dest_arr;
 	buffer_t buf = { 0, 0, { 0, 0, 0, 0, 0 } };
@@ -245,7 +286,8 @@
 	bool exists = IndexReader::indexExists(index->path);
 	try {
 		index->writer = _CLNEW IndexWriter(index->path,
-						   index->analyzer, !exists);
+						   index->default_analyzer,
+						   !exists);
 	} catch (CLuceneError &err) {
 		lucene_handle_error(index, err, "IndexWriter()");
 		return -1;
@@ -254,6 +296,64 @@
 	return 0;
 }
 
+static Analyzer *get_analyzer(struct lucene_index *index, const char *lang)
+{
+	const struct lucene_analyzer *a;
+	struct lucene_analyzer new_analyzer;
+	Analyzer *analyzer;
+
+	array_foreach(&index->analyzers, a) {
+		if (strcmp(a->lang, lang) == 0)
+			return a->analyzer;
+	}
+
+	memset(&new_analyzer, 0, sizeof(new_analyzer));
+	new_analyzer.lang = i_strdup(lang);
+	new_analyzer.analyzer = _CLNEW snowball::SnowballAnalyzer(lang);
+	array_append_i(&index->analyzers.arr, &new_analyzer, 1);
+	return new_analyzer.analyzer;
+}
+
+#ifdef HAVE_LUCENE_TEXTCAT
+static Analyzer *
+guess_analyzer(struct lucene_index *index, const void *data, size_t size)
+{
+	const char *lang;
+
+	if (textcat_broken)
+		return NULL;
+
+	if (textcat == NULL) {
+		textcat = index->textcat_conf == NULL ? NULL :
+			special_textcat_Init(index->textcat_conf,
+					     index->textcat_dir);
+		if (textcat == NULL) {
+			textcat_broken = TRUE;
+			return NULL;
+		}
+	}
+
+	/* try to guess the language */
+	lang = textcat_Classify(textcat, (const char *)data,
+				I_MIN(size, 500));
+	const char *p = strchr(lang, ']');
+	if (lang[0] != '[' || p == NULL)
+		return NULL;
+	lang = t_strdup_until(lang+1, p);
+	if (strcmp(lang, DEFAULT_LANGUAGE) == 0)
+		return index->default_analyzer;
+
+	return get_analyzer(index, lang);
+}
+#else
+static Analyzer *
+guess_analyzer(struct lucene_index *index ATTR_UNUSED,
+	       const void *data ATTR_UNUSED, size_t size ATTR_UNUSED)
+{
+	return NULL;
+}
+#endif
+
 static int lucene_index_build_flush(struct lucene_index *index)
 {
 	int ret = 0;
@@ -262,7 +362,10 @@
 		return 0;
 
 	try {
-		index->writer->addDocument(index->doc);
+		index->writer->addDocument(index->doc,
+					   index->cur_analyzer != NULL ?
+					   index->cur_analyzer :
+					   index->default_analyzer);
 	} catch (CLuceneError &err) {
 		lucene_handle_error(index, err, "IndexWriter::addDocument()");
 		ret = -1;
@@ -270,6 +373,7 @@
 
 	_CLDELETE(index->doc);
 	index->doc = NULL;
+	index->cur_analyzer = NULL;
 	return ret;
 }
 
@@ -307,6 +411,8 @@
 		if (fts_header_want_indexed(hdr_name))
 			index->doc->add(*_CLNEW Field(wname, dest, Field::STORE_NO | Field::INDEX_TOKENIZED));
 	} else if (size > 0) {
+		if (index->cur_analyzer == NULL)
+			index->cur_analyzer = guess_analyzer(index, data, size);
 		index->doc->add(*_CLNEW Field(_T("body"), dest, Field::STORE_NO | Field::INDEX_TOKENIZED));
 	}
 	return 0;
@@ -453,7 +559,7 @@
 
 	IndexWriter *writer = NULL;
 	try {
-		writer = _CLNEW IndexWriter(index->path, index->analyzer, false);
+		writer = _CLNEW IndexWriter(index->path, index->default_analyzer, false);
 		writer->optimize();
 	} catch (CLuceneError &err) {
 		lucene_handle_error(index, err, "IndexWriter::optimize()");
@@ -562,7 +668,12 @@
 		 const TCHAR *key, const struct mail_search_arg *arg)
 {
 	const TCHAR *wvalue = t_lucene_utf8_to_tchar(arg->value.str);
-	return getFieldQuery(index->analyzer, key, wvalue, arg->fuzzy);
+	Analyzer *analyzer = guess_analyzer(index, arg->value.str,
+					    strlen(arg->value.str));
+	if (analyzer == NULL)
+		analyzer = index->default_analyzer;
+
+	return getFieldQuery(analyzer, key, wvalue, arg->fuzzy);
 }
 
 static bool
--- a/src/plugins/fts-lucene/lucene-wrapper.h	Tue Aug 09 14:13:13 2011 +0300
+++ b/src/plugins/fts-lucene/lucene-wrapper.h	Tue Aug 09 15:05:18 2011 +0300
@@ -2,10 +2,13 @@
 #define LUCENE_WRAPPER_H
 
 #include "fts-api-private.h"
+#include "mail-types.h"
 
 #define MAILBOX_GUID_HEX_LENGTH (MAIL_GUID_128_SIZE*2)
 
-struct lucene_index *lucene_index_init(const char *path);
+struct lucene_index *lucene_index_init(const char *path,
+				       const char *textcat_dir,
+				       const char *textcat_conf);
 void lucene_index_deinit(struct lucene_index *index);
 
 void lucene_index_select_mailbox(struct lucene_index *index,
@@ -34,4 +37,8 @@
 			      struct mail_search_arg *args, bool and_args,
 			      struct fts_multi_result *result);
 
+/* internal: */
+void lucene_utf8_n_to_tchar(const unsigned char *src, size_t srcsize,
+			    wchar_t *dest, size_t destsize);
+
 #endif