Mercurial > dovecot > core-2.2
view src/plugins/fts-lucene/lucene-wrapper.cc @ 13258:6b8ef63846d7
fts-lucene: Fix to previous change: actually use WildcardQuery, not TermQuery..
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Tue, 16 Aug 2011 20:04:29 +0300 |
parents | 16d8a1c1543f |
children | b0777f44c999 |
line wrap: on
line source
/* Copyright (c) 2006-2010 Dovecot authors, see the included COPYING file */ extern "C" { #include "lib.h" #include "array.h" #include "unichar.h" #include "hash.h" #include "hex-binary.h" #include "mail-index.h" #include "mail-search.h" #include "mail-namespace.h" #include "mail-storage.h" #include "fts-expunge-log.h" #include "lucene-wrapper.h" #include <sys/stat.h> #ifdef HAVE_LUCENE_TEXTCAT # include <libtextcat/textcat.h> #endif }; #include <CLucene.h> #include <CLucene/util/CLStreams.h> #include <CLucene/search/MultiPhraseQuery.h> #include "SnowballAnalyzer.h" /* Lucene's default is 10000. Use it here also.. */ #define MAX_TERMS_PER_DOCUMENT 10000 #define LUCENE_LOCK_OVERRIDE_SECS 60 #define DEFAULT_LANGUAGE "english" using namespace lucene::document; using namespace lucene::index; using namespace lucene::search; using namespace lucene::queryParser; using namespace lucene::analysis; using namespace lucene::analysis; using namespace lucene::util; struct lucene_analyzer { char *lang; Analyzer *analyzer; }; struct lucene_index { char *path; char *textcat_dir, *textcat_conf; wchar_t mailbox_guid[MAILBOX_GUID_HEX_LENGTH + 1]; IndexReader *reader; IndexWriter *writer; IndexSearcher *searcher; Analyzer *default_analyzer, *cur_analyzer; ARRAY_DEFINE(analyzers, struct lucene_analyzer); Document *doc; uint32_t prev_uid; }; struct rescan_context { struct lucene_index *index; struct mailbox_list *list; struct mailbox *box; mail_guid_128_t box_guid; int box_ret; ARRAY_TYPE(seq_range) uids; struct seq_range_iter uids_iter; unsigned int uids_iter_n; uint32_t last_existing_uid; bool warned; }; static void *textcat = NULL; static bool textcat_broken = FALSE; static int textcat_refcount = 0; struct lucene_index *lucene_index_init(const char *path, const char *textcat_dir, const char *textcat_conf) { struct lucene_index *index; index = i_new(struct lucene_index, 1); index->path = i_strdup(path); index->textcat_dir = i_strdup(textcat_dir); index->textcat_conf = i_strdup(textcat_conf); #ifdef HAVE_LUCENE_TEXTCAT index->default_analyzer = _CLNEW snowball::SnowballAnalyzer(DEFAULT_LANGUAGE); #else index->default_analyzer = _CLNEW standard::StandardAnalyzer(); #endif i_array_init(&index->analyzers, 32); textcat_refcount++; return index; } void lucene_index_close(struct lucene_index *index) { _CLDELETE(index->reader); _CLDELETE(index->writer); _CLDELETE(index->searcher); } void lucene_index_deinit(struct lucene_index *index) { struct lucene_analyzer *a; lucene_index_close(index); array_foreach_modifiable(&index->analyzers, a) { i_free(a->lang); _CLDELETE(a->analyzer); } array_free(&index->analyzers); if (--textcat_refcount == 0 && textcat != NULL) { #ifdef HAVE_LUCENE_TEXTCAT textcat_Done(textcat); #endif textcat = NULL; } _CLDELETE(index->default_analyzer); i_free(index->textcat_dir); i_free(index->textcat_conf); i_free(index->path); i_free(index); } void lucene_utf8_n_to_tchar(const unsigned char *src, size_t srcsize, wchar_t *dest, size_t destsize) { ARRAY_TYPE(unichars) dest_arr; buffer_t buf = { 0, 0, { 0, 0, 0, 0, 0 } }; i_assert(sizeof(wchar_t) == sizeof(unichar_t)); buffer_create_data(&buf, dest, sizeof(wchar_t) * destsize); array_create_from_buffer(&dest_arr, &buf, sizeof(wchar_t)); if (uni_utf8_to_ucs4_n(src, srcsize, &dest_arr) < 0) i_unreached(); i_assert(array_count(&dest_arr)+1 == destsize); dest[destsize-1] = 0; } static const wchar_t *t_lucene_utf8_to_tchar(const char *str) { ARRAY_TYPE(unichars) dest_arr; const unichar_t *ret; i_assert(sizeof(wchar_t) == sizeof(unichar_t)); t_array_init(&dest_arr, strlen(str) + 1); if (uni_utf8_to_ucs4(str, &dest_arr) < 0) i_unreached(); (void)array_append_space(&dest_arr); ret = array_idx(&dest_arr, 0); return (const wchar_t *)ret; } void lucene_index_select_mailbox(struct lucene_index *index, const wchar_t guid[MAILBOX_GUID_HEX_LENGTH]) { memcpy(index->mailbox_guid, guid, MAILBOX_GUID_HEX_LENGTH * sizeof(wchar_t)); index->mailbox_guid[MAILBOX_GUID_HEX_LENGTH] = '\0'; } void lucene_index_unselect_mailbox(struct lucene_index *index) { memset(index->mailbox_guid, 0, sizeof(index->mailbox_guid)); } static void lucene_handle_error(struct lucene_index *index, CLuceneError &err, const char *msg) { const char *what = err.what(); i_error("lucene index %s: %s failed: %s", index->path, msg, what); } static int lucene_index_open(struct lucene_index *index) { if (index->reader != NULL) return 1; if (!IndexReader::indexExists(index->path)) return 0; try { index->reader = IndexReader::open(index->path); } catch (CLuceneError &err) { lucene_handle_error(index, err, "IndexReader::open()"); return -1; } return 1; } static int lucene_index_open_search(struct lucene_index *index) { int ret; if (index->searcher != NULL) return 1; if ((ret = lucene_index_open(index)) <= 0) return ret; index->searcher = _CLNEW IndexSearcher(index->reader); return 1; } static int lucene_doc_get_uid(struct lucene_index *index, Document *doc, uint32_t *uid_r) { Field *field = doc->getField(_T("uid")); const TCHAR *uid = field == NULL ? NULL : field->stringValue(); if (uid == NULL) { i_error("lucene: Corrupted FTS index %s: No UID for document", index->path); return -1; } uint32_t num = 0; while (*uid != 0) { num = num*10 + (*uid - '0'); uid++; } *uid_r = num; return 0; } int lucene_index_get_last_uid(struct lucene_index *index, uint32_t *last_uid_r) { int ret = 0; *last_uid_r = 0; if ((ret = lucene_index_open_search(index)) <= 0) return ret; Term mailbox_term(_T("box"), index->mailbox_guid); TermQuery query(&mailbox_term); uint32_t last_uid = 0; try { Hits *hits = index->searcher->search(&query); for (size_t i = 0; i < hits->length(); i++) { uint32_t uid; if (lucene_doc_get_uid(index, &hits->doc(i), &uid) < 0) { ret = -1; break; } if (uid > last_uid) last_uid = uid; } _CLDELETE(hits); } catch (CLuceneError &err) { lucene_handle_error(index, err, "last_uid search"); ret = -1; } *last_uid_r = last_uid; return ret; } int lucene_index_get_doc_count(struct lucene_index *index, uint32_t *count_r) { int ret; if (index->reader == NULL) { lucene_index_close(index); if ((ret = lucene_index_open(index)) < 0) return -1; if (ret == 0) { *count_r = 0; return 0; } } *count_r = index->reader->numDocs(); return 0; } int lucene_index_build_init(struct lucene_index *index) { const char *lock_path; struct stat st; lucene_index_close(index); lock_path = t_strdup_printf("%s/write.lock", index->path); if (stat(lock_path, &st) == 0 && st.st_mtime < time(NULL) - LUCENE_LOCK_OVERRIDE_SECS) { if (unlink(lock_path) < 0) i_error("unlink(%s) failed: %m"); } bool exists = IndexReader::indexExists(index->path); try { index->writer = _CLNEW IndexWriter(index->path, index->default_analyzer, !exists); } catch (CLuceneError &err) { lucene_handle_error(index, err, "IndexWriter()"); return -1; } index->writer->setMaxFieldLength(MAX_TERMS_PER_DOCUMENT); return 0; } static Analyzer *get_analyzer(struct lucene_index *index, const char *lang) { const struct lucene_analyzer *a; struct lucene_analyzer new_analyzer; Analyzer *analyzer; array_foreach(&index->analyzers, a) { if (strcmp(a->lang, lang) == 0) return a->analyzer; } memset(&new_analyzer, 0, sizeof(new_analyzer)); new_analyzer.lang = i_strdup(lang); new_analyzer.analyzer = _CLNEW snowball::SnowballAnalyzer(lang); array_append_i(&index->analyzers.arr, &new_analyzer, 1); return new_analyzer.analyzer; } #ifdef HAVE_LUCENE_TEXTCAT static Analyzer * guess_analyzer(struct lucene_index *index, const void *data, size_t size) { const char *lang; if (textcat_broken) return NULL; if (textcat == NULL) { textcat = index->textcat_conf == NULL ? NULL : special_textcat_Init(index->textcat_conf, index->textcat_dir); if (textcat == NULL) { textcat_broken = TRUE; return NULL; } } /* try to guess the language */ lang = textcat_Classify(textcat, (const char *)data, I_MIN(size, 500)); const char *p = strchr(lang, ']'); if (lang[0] != '[' || p == NULL) return NULL; lang = t_strdup_until(lang+1, p); if (strcmp(lang, DEFAULT_LANGUAGE) == 0) return index->default_analyzer; return get_analyzer(index, lang); } #else static Analyzer * guess_analyzer(struct lucene_index *index ATTR_UNUSED, const void *data ATTR_UNUSED, size_t size ATTR_UNUSED) { return NULL; } #endif static int lucene_index_build_flush(struct lucene_index *index) { int ret = 0; if (index->doc == NULL) return 0; try { index->writer->addDocument(index->doc, index->cur_analyzer != NULL ? index->cur_analyzer : index->default_analyzer); } catch (CLuceneError &err) { lucene_handle_error(index, err, "IndexWriter::addDocument()"); ret = -1; } _CLDELETE(index->doc); index->doc = NULL; index->cur_analyzer = NULL; return ret; } int lucene_index_build_more(struct lucene_index *index, uint32_t uid, const unsigned char *data, size_t size, const char *hdr_name) { wchar_t id[MAX_INT_STRLEN]; size_t namesize, datasize; if (uid != index->prev_uid) { if (lucene_index_build_flush(index) < 0) return -1; index->prev_uid = uid; index->doc = _CLNEW Document(); swprintf(id, N_ELEMENTS(id), L"%u", uid); index->doc->add(*_CLNEW Field(_T("uid"), id, Field::STORE_YES | Field::INDEX_UNTOKENIZED)); index->doc->add(*_CLNEW Field(_T("box"), index->mailbox_guid, Field::STORE_YES | Field::INDEX_UNTOKENIZED)); } datasize = uni_utf8_strlen_n(data, size) + 1; wchar_t dest[datasize]; lucene_utf8_n_to_tchar(data, size, dest, datasize); if (hdr_name != NULL) { /* hdr_name should be ASCII, but don't break in case it isn't */ hdr_name = t_str_lcase(hdr_name); namesize = uni_utf8_strlen(hdr_name) + 1; wchar_t wname[namesize]; lucene_utf8_n_to_tchar((const unsigned char *)hdr_name, strlen(hdr_name), wname, namesize); index->doc->add(*_CLNEW Field(_T("hdr"), wname, Field::STORE_NO | Field::INDEX_UNTOKENIZED)); index->doc->add(*_CLNEW Field(_T("hdr"), dest, Field::STORE_NO | Field::INDEX_TOKENIZED)); if (fts_header_want_indexed(hdr_name)) index->doc->add(*_CLNEW Field(wname, dest, Field::STORE_NO | Field::INDEX_TOKENIZED)); } else if (size > 0) { if (index->cur_analyzer == NULL) index->cur_analyzer = guess_analyzer(index, data, size); index->doc->add(*_CLNEW Field(_T("body"), dest, Field::STORE_NO | Field::INDEX_TOKENIZED)); } return 0; } int lucene_index_build_deinit(struct lucene_index *index) { int ret = 0; if (index->prev_uid == 0) { /* no changes. */ return 0; } index->prev_uid = 0; if (index->writer == NULL) { lucene_index_close(index); return -1; } if (lucene_index_build_flush(index) < 0) ret = -1; try { index->writer->close(); } catch (CLuceneError &err) { lucene_handle_error(index, err, "IndexWriter::close()"); ret = -1; } lucene_index_close(index); return ret; } static int wcharguid_to_guid(mail_guid_128_t *dest, const wchar_t *src) { buffer_t buf = { 0, 0, { 0, 0, 0, 0, 0 } }; char src_chars[MAIL_GUID_128_SIZE*2 + 1]; unsigned int i; for (i = 0; i < sizeof(src_chars)-1; i++) { if ((src[i] >= '0' && src[i] <= '9') || (src[i] >= 'a' && src[i] <= 'f')) src_chars[i] = src[i]; else return -1; } if (src[i] != '\0') return -1; src_chars[i] = '\0'; buffer_create_data(&buf, dest, sizeof(*dest)); return hex_to_binary(src_chars, &buf); } static int rescan_get_uids(struct mailbox *box, ARRAY_TYPE(seq_range) *uids) { struct mailbox_status status; if (mailbox_get_status(box, STATUS_MESSAGES, &status) < 0) return -1; if (status.messages > 0) T_BEGIN { ARRAY_TYPE(seq_range) seqs; t_array_init(&seqs, 2); seq_range_array_add_range(&seqs, 1, status.messages); mailbox_get_uid_range(box, &seqs, uids); } T_END; return 0; } static int rescan_finish(struct rescan_context *ctx) { int ret; ret = fts_index_set_last_uid(ctx->box, ctx->last_existing_uid); mailbox_free(&ctx->box); return ret; } static int fts_lucene_get_mailbox_guid(struct lucene_index *index, Document *doc, mail_guid_128_t *guid_r) { Field *field = doc->getField(_T("box")); const TCHAR *box_guid = field == NULL ? NULL : field->stringValue(); if (box_guid == NULL) { i_error("lucene: Corrupted FTS index %s: No mailbox for document", index->path); return -1; } if (wcharguid_to_guid(guid_r, box_guid) < 0) { i_error("lucene: Corrupted FTS index %s: " "box field not in expected format", index->path); return -1; } return 0; } static int rescan_open_mailbox(struct rescan_context *ctx, Document *doc) { mail_guid_128_t guid; int ret; if (fts_lucene_get_mailbox_guid(ctx->index, doc, &guid) < 0) return 0; if (memcmp(guid, ctx->box_guid, sizeof(guid)) == 0) { /* same as last one */ return ctx->box_ret; } memcpy(ctx->box_guid, guid, sizeof(ctx->box_guid)); if (ctx->box != NULL) rescan_finish(ctx); ctx->box = mailbox_alloc_guid(ctx->list, guid, MAILBOX_FLAG_KEEP_RECENT); if (mailbox_open(ctx->box) < 0) { enum mail_error error; const char *errstr; errstr = mailbox_get_last_error(ctx->box, &error); if (error == MAIL_ERROR_NOTFOUND) ret = 0; else { i_error("lucene: Couldn't open mailbox %s: %s", mailbox_get_vname(ctx->box), errstr); ret = -1; } mailbox_free(&ctx->box); ctx->box_ret = ret; return ret; } if (mailbox_sync(ctx->box, (enum mailbox_sync_flags)0) < 0) { i_error("lucene: Failed to sync mailbox %s: %s", mailbox_get_vname(ctx->box), mailbox_get_last_error(ctx->box, NULL)); mailbox_free(&ctx->box); ctx->box_ret = -1; return -1; } array_clear(&ctx->uids); rescan_get_uids(ctx->box, &ctx->uids); ctx->warned = FALSE; ctx->last_existing_uid = 0; ctx->uids_iter_n = 0; seq_range_array_iter_init(&ctx->uids_iter, &ctx->uids); ctx->box_ret = 1; return 1; } static int rescan_next(struct rescan_context *ctx, Document *doc) { uint32_t lucene_uid, idx_uid; if (lucene_doc_get_uid(ctx->index, doc, &lucene_uid) < 0) return 0; if (seq_range_array_iter_nth(&ctx->uids_iter, ctx->uids_iter_n, &idx_uid)) { if (idx_uid == lucene_uid) { ctx->uids_iter_n++; ctx->last_existing_uid = idx_uid; return 1; } if (idx_uid < lucene_uid) { /* lucene is missing an UID from the middle. delete the rest of the messages from this mailbox and reindex. */ if (!ctx->warned) { i_warning("lucene: Mailbox %s " "missing UIDs in the middle", mailbox_get_vname(ctx->box)); ctx->warned = TRUE; } } else { /* UID has been expunged from index. delete from lucene as well. */ } return 0; } else { /* the rest of the messages have been expunged from index */ return 0; } } int lucene_index_rescan(struct lucene_index *index, struct mailbox_list *list) { static const TCHAR *sort_fields[] = { _T("box"), _T("uid"), NULL }; struct rescan_context ctx; mail_guid_128_t guid; bool failed = false; int ret; if ((ret = lucene_index_open_search(index)) <= 0) return ret; Term term(_T("box"), _T("*")); WildcardQuery query(&term); Sort sort(sort_fields); memset(&ctx, 0, sizeof(ctx)); ctx.index = index; ctx.list = list; i_array_init(&ctx.uids, 128); try { Hits *hits = index->searcher->search(&query, &sort); for (size_t i = 0; i < hits->length(); i++) { ret = rescan_open_mailbox(&ctx, &hits->doc(i)); if (ret > 0) ret = rescan_next(&ctx, &hits->doc(i)); if (ret < 0) failed = true; else if (ret == 0) index->reader->deleteDocument(hits->id(i)); } _CLDELETE(hits); index->reader->close(); lucene_index_close(index); } catch (CLuceneError &err) { lucene_handle_error(index, err, "rescan search"); failed = true; } if (ctx.box != NULL) rescan_finish(&ctx); array_free(&ctx.uids); return failed ? -1 : 0; } static void guid128_to_wguid(const mail_guid_128_t guid, wchar_t wguid_hex[MAILBOX_GUID_HEX_LENGTH + 1]) { buffer_t buf = { 0, 0, { 0, 0, 0, 0, 0 } }; unsigned char guid_hex[MAILBOX_GUID_HEX_LENGTH]; unsigned int i; buffer_create_data(&buf, guid_hex, MAILBOX_GUID_HEX_LENGTH); binary_to_hex_append(&buf, guid, MAIL_GUID_128_SIZE); for (i = 0; i < MAILBOX_GUID_HEX_LENGTH; i++) wguid_hex[i] = guid_hex[i]; wguid_hex[i] = '\0'; } static void lucene_index_query_range_add(BooleanQuery *query, wchar_t *wuid, wchar_t max_char) { wchar_t i; for (i = wuid[0]; i <= max_char; i++) { wuid[0] = i; Term *term = _CLNEW Term(_T("uid"), wuid); query->add(_CLNEW WildcardQuery(term), true, BooleanClause::SHOULD); _CLDECDELETE(term); } } static int lucene_index_expunge_record(struct lucene_index *index, const struct fts_expunge_log_read_record *rec) { const struct seq_range *range; unsigned int count; int ret; if ((ret = lucene_index_open_search(index)) <= 0) return ret; range = array_get(&rec->uids, &count); BooleanQuery query; BooleanQuery uids_query; /* RangeQuery actually just adds each term within the range to the search query, causing "too many clauses" at some point. So use WildcardQuery to get something approximately true. */ uint32_t seq1 = range[0].seq1, seq2 = range[count-1].seq2; if (seq2 / seq1 > 10) { /* just iterate through everything */ } else { wchar_t wuid1[MAX_INT_STRLEN], wuid2[MAX_INT_STRLEN]; unsigned int i; swprintf(wuid1, N_ELEMENTS(wuid1), L"%u", range[0].seq1); swprintf(wuid2, N_ELEMENTS(wuid2), L"%u", range[count-1].seq2); for (i = 1; wuid1[i] != '\0'; i++) wuid1[i] = '?'; for (i = 1; wuid2[i] != '\0'; i++) wuid2[i] = '?'; if (wcslen(wuid1) == wcslen(wuid2)) { /* for example: 1???..9??? */ lucene_index_query_range_add(&uids_query, wuid1, wuid2[0]); } else { /* for example: 4?? .. 5??? */ lucene_index_query_range_add(&uids_query, wuid1, '9'); wchar_t max = wuid2[0]; wuid2[0] = '1'; lucene_index_query_range_add(&uids_query, wuid2, max); } query.add(&uids_query, BooleanClause::MUST); } wchar_t wguid[MAILBOX_GUID_HEX_LENGTH + 1]; guid128_to_wguid(rec->mailbox_guid, wguid); Term term(_T("box"), wguid); TermQuery mailbox_query(&term); query.add(&mailbox_query, BooleanClause::MUST); try { Hits *hits = index->searcher->search(&query); for (size_t i = 0; i < hits->length(); i++) { uint32_t uid; if (lucene_doc_get_uid(index, &hits->doc(i), &uid) < 0 || seq_range_exists(&rec->uids, uid)) index->reader->deleteDocument(hits->id(i)); } _CLDELETE(hits); } catch (CLuceneError &err) { lucene_handle_error(index, err, "expunge search"); ret = -1; } return ret < 0 ? -1 : 0; } int lucene_index_expunge_from_log(struct lucene_index *index, struct fts_expunge_log *log) { struct fts_expunge_log_read_ctx *ctx; const struct fts_expunge_log_read_record *rec; int ret = 0, ret2; ctx = fts_expunge_log_read_begin(log); while ((rec = fts_expunge_log_read_next(ctx)) != NULL) { if (lucene_index_expunge_record(index, rec) < 0) { ret = -1; break; } } try { if (index->reader != NULL) index->reader->close(); lucene_index_close(index); } catch (CLuceneError &err) { lucene_handle_error(index, err, "expunge delete"); ret = -1; } ret2 = fts_expunge_log_read_end(&ctx); if (ret < 0 || ret2 < 0) return -1; return ret2; } int lucene_index_optimize(struct lucene_index *index) { int ret = 0; if (IndexReader::isLocked(index->path)) IndexReader::unlock(index->path); IndexWriter *writer = NULL; try { writer = _CLNEW IndexWriter(index->path, index->default_analyzer, false); writer->optimize(); } catch (CLuceneError &err) { lucene_handle_error(index, err, "IndexWriter::optimize()"); ret = -1; } if (writer != NULL) _CLDELETE(writer); return ret; } // Mostly copy&pasted from CLucene's QueryParser static Query* getFieldQuery(Analyzer *analyzer, const TCHAR* _field, const TCHAR* queryText, bool fuzzy) { // Use the analyzer to get all the tokens, and then build a TermQuery, // PhraseQuery, or nothing based on the term count StringReader reader(queryText); TokenStream* source = analyzer->tokenStream(_field, &reader); CLVector<CL_NS(analysis)::Token*, Deletor::Object<CL_NS(analysis)::Token> > v; CL_NS(analysis)::Token* t = NULL; int32_t positionCount = 0; bool severalTokensAtSamePosition = false; while (true) { t = _CLNEW Token(); try { Token* _t = source->next(t); if (_t == NULL) _CLDELETE(t); }_CLCATCH_ERR(CL_ERR_IO, _CLLDELETE(source);_CLLDELETE(t);,{ t = NULL; }); if (t == NULL) break; v.push_back(t); if (t->getPositionIncrement() != 0) positionCount += t->getPositionIncrement(); else severalTokensAtSamePosition = true; } try { source->close(); } _CLCATCH_ERR_CLEANUP(CL_ERR_IO, {_CLLDELETE(source);_CLLDELETE(t);} ); /* cleanup */ _CLLDELETE(source); if (v.size() == 0) return NULL; else if (v.size() == 1) { Term* tm = _CLNEW Term(_field, v.at(0)->termBuffer()); Query* ret; if (fuzzy) ret = _CLNEW FuzzyQuery( tm ); else ret = _CLNEW PrefixQuery( tm ); _CLDECDELETE(tm); return ret; } else { if (severalTokensAtSamePosition) { if (positionCount == 1) { // no phrase query: BooleanQuery* q = _CLNEW BooleanQuery(true); for(size_t i=0; i<v.size(); i++ ){ Term* tm = _CLNEW Term(_field, v.at(i)->termBuffer()); q->add(_CLNEW TermQuery(tm), true, BooleanClause::SHOULD); _CLDECDELETE(tm); } return q; }else { MultiPhraseQuery* mpq = _CLNEW MultiPhraseQuery(); CLArrayList<Term*> multiTerms; int32_t position = -1; for (size_t i = 0; i < v.size(); i++) { t = v.at(i); if (t->getPositionIncrement() > 0 && multiTerms.size() > 0) { ValueArray<Term*> termsArray(multiTerms.size()); multiTerms.toArray(termsArray.values); mpq->add(&termsArray,position); multiTerms.clear(); } position += t->getPositionIncrement(); multiTerms.push_back(_CLNEW Term(_field, t->termBuffer())); } ValueArray<Term*> termsArray(multiTerms.size()); multiTerms.toArray(termsArray.values); mpq->add(&termsArray,position); return mpq; } }else { PhraseQuery* pq = _CLNEW PhraseQuery(); int32_t position = -1; for (size_t i = 0; i < v.size(); i++) { t = v.at(i); Term* tm = _CLNEW Term(_field, t->termBuffer()); position += t->getPositionIncrement(); pq->add(tm,position); _CLDECDELETE(tm); } return pq; } } } static Query * lucene_get_query(struct lucene_index *index, const TCHAR *key, const struct mail_search_arg *arg) { const TCHAR *wvalue = t_lucene_utf8_to_tchar(arg->value.str); Analyzer *analyzer = guess_analyzer(index, arg->value.str, strlen(arg->value.str)); if (analyzer == NULL) analyzer = index->default_analyzer; return getFieldQuery(analyzer, key, wvalue, arg->fuzzy); } static bool lucene_add_definite_query(struct lucene_index *index, BooleanQuery &query, struct mail_search_arg *arg, bool and_args) { Query *q; if (arg->match_not && !and_args) { /* FIXME: we could handle this by doing multiple queries.. */ return false; } switch (arg->type) { case SEARCH_TEXT: { BooleanQuery *bq = _CLNEW BooleanQuery(); Query *q1 = lucene_get_query(index, _T("hdr"), arg); Query *q2 = lucene_get_query(index, _T("body"), arg); if (q1 == NULL && q2 == NULL) q = NULL; else { if (q1 != NULL) bq->add(q1, true, BooleanClause::SHOULD); if (q2 != NULL) bq->add(q2, true, BooleanClause::SHOULD); q = bq; } break; } case SEARCH_BODY: q = lucene_get_query(index, _T("body"), arg); break; case SEARCH_HEADER: case SEARCH_HEADER_ADDRESS: case SEARCH_HEADER_COMPRESS_LWSP: if (!fts_header_want_indexed(arg->hdr_field_name)) return false; if (*arg->value.str == '\0') { /* FIXME: handle existence of a search key */ return false; } q = lucene_get_query(index, t_lucene_utf8_to_tchar(arg->hdr_field_name), arg); break; default: return false; } if (q == NULL) { /* couldn't handle this search after all (e.g. trying to search a stop word) */ return false; } if (!and_args) query.add(q, true, BooleanClause::SHOULD); else if (!arg->match_not) query.add(q, true, BooleanClause::MUST); else query.add(q, true, BooleanClause::MUST_NOT); return true; } static bool lucene_add_maybe_query(struct lucene_index *index, BooleanQuery &query, struct mail_search_arg *arg, bool and_args) { Query *q; if (arg->match_not && !and_args) { /* FIXME: we could handle this by doing multiple queries.. */ return false; } switch (arg->type) { case SEARCH_HEADER: case SEARCH_HEADER_ADDRESS: case SEARCH_HEADER_COMPRESS_LWSP: if (fts_header_want_indexed(arg->hdr_field_name)) return false; /* we can check if the search key exists in some header and filter out the messages that have no chance of matching */ q = lucene_get_query(index, _T("hdr"), arg); break; default: return false; } if (q == NULL) { /* couldn't handle this search after all (e.g. trying to search a stop word) */ return false; } if (!and_args) query.add(q, true, BooleanClause::SHOULD); else if (!arg->match_not) query.add(q, true, BooleanClause::MUST); else query.add(q, true, BooleanClause::MUST_NOT); return true; } static int lucene_index_search(struct lucene_index *index, Query &search_query, struct fts_result *result, ARRAY_TYPE(seq_range) *uids_r) { struct fts_score_map *score; int ret = 0; BooleanQuery query; query.add(&search_query, BooleanClause::MUST); Term mailbox_term(_T("box"), index->mailbox_guid); TermQuery mailbox_query(&mailbox_term); query.add(&mailbox_query, BooleanClause::MUST); try { Hits *hits = index->searcher->search(&query); uint32_t last_uid = 0; if (result != NULL) result->scores_sorted = true; for (size_t i = 0; i < hits->length(); i++) { uint32_t uid; if (lucene_doc_get_uid(index, &hits->doc(i), &uid) < 0) { ret = -1; break; } if (result != NULL) { if (uid < last_uid) result->scores_sorted = false; last_uid = uid; seq_range_array_add(uids_r, 0, uid); score = array_append_space(&result->scores); score->uid = uid; score->score = hits->score(i); } } _CLDELETE(hits); return ret; } catch (CLuceneError &err) { lucene_handle_error(index, err, "search"); return -1; } } int lucene_index_lookup(struct lucene_index *index, struct mail_search_arg *args, bool and_args, struct fts_result *result) { struct mail_search_arg *arg; if (lucene_index_open_search(index) <= 0) return -1; BooleanQuery def_query; bool have_definites = false; for (arg = args; arg != NULL; arg = arg->next) { if (lucene_add_definite_query(index, def_query, arg, and_args)) { arg->match_always = true; have_definites = true; } } if (have_definites) { if (lucene_index_search(index, def_query, result, &result->definite_uids) < 0) return -1; } BooleanQuery maybe_query; bool have_maybies = false; for (arg = args; arg != NULL; arg = arg->next) { if (lucene_add_maybe_query(index, maybe_query, arg, and_args)) { arg->match_always = true; have_maybies = true; } } if (have_maybies) { if (lucene_index_search(index, maybe_query, NULL, &result->maybe_uids) < 0) return -1; } return 0; } static int lucene_index_search_multi(struct lucene_index *index, struct hash_table *guids, Query &search_query, struct fts_multi_result *result) { struct fts_score_map *score; int ret = 0; BooleanQuery query; query.add(&search_query, BooleanClause::MUST); BooleanQuery mailbox_query; struct hash_iterate_context *iter; void *key, *value; iter = hash_table_iterate_init(guids); while (hash_table_iterate(iter, &key, &value)) { Term *term = _CLNEW Term(_T("box"), (wchar_t *)key); TermQuery *q = _CLNEW TermQuery(term); mailbox_query.add(q, true, BooleanClause::SHOULD); } hash_table_iterate_deinit(&iter); query.add(&mailbox_query, BooleanClause::MUST); try { Hits *hits = index->searcher->search(&query); for (size_t i = 0; i < hits->length(); i++) { uint32_t uid; Field *field = hits->doc(i).getField(_T("box")); const TCHAR *box_guid = field == NULL ? NULL : field->stringValue(); if (box_guid == NULL) { i_error("lucene: Corrupted FTS index %s: No mailbox for document", index->path); ret = -1; break; } struct fts_result *br = (struct fts_result *) hash_table_lookup(guids, (const void *)box_guid); if (br == NULL) { i_warning("lucene: Returned unexpected mailbox with GUID %ls", box_guid); continue; } if (lucene_doc_get_uid(index, &hits->doc(i), &uid) < 0) { ret = -1; break; } if (!array_is_created(&br->definite_uids)) { p_array_init(&br->definite_uids, result->pool, 32); p_array_init(&br->scores, result->pool, 32); } seq_range_array_add(&br->definite_uids, 0, uid); score = array_append_space(&br->scores); score->uid = uid; score->score = hits->score(i); } _CLDELETE(hits); return ret; } catch (CLuceneError &err) { lucene_handle_error(index, err, "multi search"); return -1; } } int lucene_index_lookup_multi(struct lucene_index *index, struct hash_table *guids, struct mail_search_arg *args, bool and_args, struct fts_multi_result *result) { struct mail_search_arg *arg; if (lucene_index_open_search(index) <= 0) return -1; BooleanQuery def_query; bool have_definites = false; for (arg = args; arg != NULL; arg = arg->next) { if (lucene_add_definite_query(index, def_query, arg, and_args)) { arg->match_always = true; have_definites = true; } } if (have_definites) { if (lucene_index_search_multi(index, guids, def_query, result) < 0) return -1; } return 0; } struct lucene_index_iter { struct lucene_index *index; struct lucene_index_record rec; Term *term; WildcardQuery *query; Sort *sort; Hits *hits; size_t i; bool failed; }; struct lucene_index_iter * lucene_index_iter_init(struct lucene_index *index) { static const TCHAR *sort_fields[] = { _T("box"), _T("uid"), NULL }; struct lucene_index_iter *iter; int ret; iter = i_new(struct lucene_index_iter, 1); iter->index = index; if ((ret = lucene_index_open_search(index)) <= 0) { if (ret < 0) iter->failed = true; return iter; } iter->term = _CLNEW Term(_T("box"), _T("*")); iter->query = _CLNEW WildcardQuery(iter->term); iter->sort = _CLNEW Sort(sort_fields); try { iter->hits = index->searcher->search(iter->query, iter->sort); } catch (CLuceneError &err) { lucene_handle_error(index, err, "rescan search"); iter->failed = true; } return iter; } const struct lucene_index_record * lucene_index_iter_next(struct lucene_index_iter *iter) { if (iter->hits == NULL) return NULL; if (iter->i == iter->hits->length()) return NULL; Document *doc = &iter->hits->doc(iter->i); iter->i++; memset(&iter->rec, 0, sizeof(iter->rec)); (void)fts_lucene_get_mailbox_guid(iter->index, doc, &iter->rec.mailbox_guid); (void)lucene_doc_get_uid(iter->index, doc, &iter->rec.uid); return &iter->rec; } int lucene_index_iter_deinit(struct lucene_index_iter **_iter) { struct lucene_index_iter *iter = *_iter; int ret = iter->failed ? -1 : 0; *_iter = NULL; if (iter->hits != NULL) _CLDELETE(iter->hits); if (iter->query != NULL) { _CLDELETE(iter->query); _CLDELETE(iter->sort); _CLDELETE(iter->term); } i_free(iter); return ret; }