Mercurial > dovecot > core-2.2
changeset 15142:a200fdbc1fa0
fts-lucene: Added a "normalize" option to pass data through the user's normalizer function.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Tue, 18 Sep 2012 20:09:03 +0300 |
parents | 99305e4dd403 |
children | f9424b066dcb |
files | src/plugins/fts-lucene/Snowball.cc src/plugins/fts-lucene/SnowballAnalyzer.h src/plugins/fts-lucene/SnowballFilter.h src/plugins/fts-lucene/fts-lucene-plugin.c src/plugins/fts-lucene/fts-lucene-plugin.h src/plugins/fts-lucene/lucene-wrapper.cc |
diffstat | 6 files changed, 51 insertions(+), 11 deletions(-) [+] |
line wrap: on
line diff
--- a/src/plugins/fts-lucene/Snowball.cc Tue Sep 18 20:07:21 2012 +0300 +++ b/src/plugins/fts-lucene/Snowball.cc Tue Sep 18 20:09:03 2012 +0300 @@ -26,8 +26,9 @@ CL_NS_DEF2(analysis,snowball) /** Builds the named analyzer with no stop words. */ - SnowballAnalyzer::SnowballAnalyzer(const char* language) { + SnowballAnalyzer::SnowballAnalyzer(normalizer_func_t *normalizer, const char* language) { this->language = strdup(language); + this->normalizer = normalizer; stopSet = NULL; prevstream = NULL; } @@ -67,7 +68,7 @@ result = _CLNEW CL_NS(analysis)::LowerCaseFilter(result, true); if (stopSet != NULL) result = _CLNEW CL_NS(analysis)::StopFilter(result, true, stopSet); - result = _CLNEW SnowballFilter(result, language, true); + result = _CLNEW SnowballFilter(result, normalizer, language, true); return result; } @@ -87,10 +88,11 @@ * @param in the input tokens to stem * @param name the name of a stemmer */ - SnowballFilter::SnowballFilter(TokenStream* in, const char* language, bool deleteTS): + SnowballFilter::SnowballFilter(TokenStream* in, normalizer_func_t *normalizer, const char* language, bool deleteTS): TokenFilter(in,deleteTS) { stemmer = sb_stemmer_new(language, NULL); //use utf8 encoding + this->normalizer = normalizer; if ( stemmer == NULL ){ _CLTHROWA(CL_ERR_IllegalArgument, "language not available for stemming\n"); //todo: richer error @@ -120,10 +122,24 @@ int stemmedLen=sb_stemmer_length(stemmer); - unsigned int tchartext_size = uni_utf8_strlen_n(stemmed, stemmedLen) + 1; - TCHAR tchartext[tchartext_size]; - lucene_utf8_n_to_tchar(stemmed,stemmedLen,tchartext,tchartext_size); - token->set(tchartext,token->startOffset(), token->endOffset(), token->type()); + if (normalizer == NULL) { + unsigned int tchartext_size = + uni_utf8_strlen_n(stemmed, stemmedLen) + 1; + TCHAR tchartext[tchartext_size]; + lucene_utf8_n_to_tchar(stemmed, stemmedLen, tchartext, tchartext_size); + token->set(tchartext,token->startOffset(), token->endOffset(), token->type()); + } else 
T_BEGIN { + buffer_t *norm_buf = buffer_create_dynamic(pool_datastack_create(), + stemmedLen); + normalizer(stemmed, stemmedLen, norm_buf); + + unsigned int tchartext_size = + uni_utf8_strlen_n(norm_buf->data, norm_buf->used) + 1; + TCHAR tchartext[tchartext_size]; + lucene_utf8_n_to_tchar((const unsigned char *)norm_buf->data, + norm_buf->used, tchartext, tchartext_size); + token->set(tchartext,token->startOffset(), token->endOffset(), token->type()); + } T_END; return token; }
--- a/src/plugins/fts-lucene/SnowballAnalyzer.h Tue Sep 18 20:07:21 2012 +0300 +++ b/src/plugins/fts-lucene/SnowballAnalyzer.h Tue Sep 18 20:09:03 2012 +0300 @@ -7,6 +7,10 @@ #ifndef _lucene_analysis_snowball_analyser_ #define _lucene_analysis_snowball_analyser_ +extern "C" { +#include "lib.h" +#include "unichar.h" +}; #include "CLucene/analysis/AnalysisHeader.h" CL_CLASS_DEF(util,BufferedReader) @@ -21,12 +25,13 @@ */ class CLUCENE_CONTRIBS_EXPORT SnowballAnalyzer: public Analyzer { char* language; + normalizer_func_t *normalizer; CLTCSetList* stopSet; TokenStream *prevstream; public: /** Builds the named analyzer with no stop words. */ - SnowballAnalyzer(const char* language="english"); + SnowballAnalyzer(normalizer_func_t *normalizer, const char* language="english"); /** Builds the named analyzer with the given stop words. */
--- a/src/plugins/fts-lucene/SnowballFilter.h Tue Sep 18 20:07:21 2012 +0300 +++ b/src/plugins/fts-lucene/SnowballFilter.h Tue Sep 18 20:09:03 2012 +0300 @@ -22,6 +22,7 @@ */ class CLUCENE_CONTRIBS_EXPORT SnowballFilter: public TokenFilter { struct sb_stemmer * stemmer; + normalizer_func_t *normalizer; public: /** Construct the named stemming filter. @@ -29,7 +30,7 @@ * @param in the input tokens to stem * @param name the name of a stemmer */ - SnowballFilter(TokenStream* in, const char* language, bool deleteTS); + SnowballFilter(TokenStream* in, normalizer_func_t *normalizer, const char* language, bool deleteTS); ~SnowballFilter();
--- a/src/plugins/fts-lucene/fts-lucene-plugin.c Tue Sep 18 20:07:21 2012 +0300 +++ b/src/plugins/fts-lucene/fts-lucene-plugin.c Tue Sep 18 20:09:03 2012 +0300 @@ -28,6 +28,8 @@ set->textcat_dir = p_strdup(user->pool, *tmp + 12); } else if (strncmp(*tmp, "whitespace_chars=", 17) == 0) { set->whitespace_chars = p_strdup(user->pool, *tmp + 17); + } else if (strcmp(*tmp, "normalize") == 0) { + set->normalize = TRUE; } else { i_error("fts_lucene: Invalid setting: %s", *tmp); return -1; @@ -49,6 +51,11 @@ "but Dovecot built without stemmer support"); return -1; } + if (set->normalize) { + i_error("fts_lucene: normalize not currently supported " + "without stemmer support"); + return -1; + } #else if (set->default_language == NULL) set->default_language = "english"; @@ -71,6 +78,8 @@ crc = set->default_language == NULL ? 0 : crc32_str(set->default_language); crc = crc32_str_more(crc, set->whitespace_chars); + if (set->normalize) + crc = crc32_str_more(crc, "n"); return crc; }
--- a/src/plugins/fts-lucene/fts-lucene-plugin.h Tue Sep 18 20:07:21 2012 +0300 +++ b/src/plugins/fts-lucene/fts-lucene-plugin.h Tue Sep 18 20:09:03 2012 +0300 @@ -12,6 +12,7 @@ const char *default_language; const char *textcat_conf, *textcat_dir; const char *whitespace_chars; + bool normalize; }; struct fts_lucene_user {
--- a/src/plugins/fts-lucene/lucene-wrapper.cc Tue Sep 18 20:07:21 2012 +0300 +++ b/src/plugins/fts-lucene/lucene-wrapper.cc Tue Sep 18 20:09:03 2012 +0300 @@ -10,6 +10,7 @@ #include "mail-index.h" #include "mail-search.h" #include "mail-namespace.h" +#include "mailbox-list-private.h" #include "mail-storage.h" #include "fts-expunge-log.h" #include "fts-lucene-plugin.h" @@ -58,6 +59,7 @@ char *path; struct mailbox_list *list; struct fts_lucene_settings set; + normalizer_func_t *normalizer; wchar_t mailbox_guid[MAILBOX_GUID_HEX_LENGTH + 1]; @@ -107,6 +109,8 @@ index = i_new(struct lucene_index, 1); index->path = i_strdup(path); index->list = list; + index->normalizer = !set->normalize ? NULL : + list->ns->user->default_normalizer; if (set != NULL) index->set = *set; else { @@ -115,9 +119,11 @@ } #ifdef HAVE_LUCENE_STEMMER index->default_analyzer = - _CLNEW snowball::SnowballAnalyzer(index->set.default_language); + _CLNEW snowball::SnowballAnalyzer(index->normalizer, + index->set.default_language); #else index->default_analyzer = _CLNEW standard::StandardAnalyzer(); + i_assert(index->normalizer == NULL); #endif i_array_init(&index->analyzers, 32); textcat_refcount++; @@ -397,6 +403,7 @@ #ifdef HAVE_LUCENE_TEXTCAT static Analyzer *get_analyzer(struct lucene_index *index, const char *lang) { + normalizer_func_t *normalizer = index->normalizer; const struct lucene_analyzer *a; struct lucene_analyzer new_analyzer; Analyzer *analyzer; @@ -408,7 +415,8 @@ memset(&new_analyzer, 0, sizeof(new_analyzer)); new_analyzer.lang = i_strdup(lang); - new_analyzer.analyzer = _CLNEW snowball::SnowballAnalyzer(lang); + new_analyzer.analyzer = + _CLNEW snowball::SnowballAnalyzer(normalizer, lang); array_append_i(&index->analyzers.arr, &new_analyzer, 1); return new_analyzer.analyzer; }