Mercurial > dovecot > original-hg > dovecot-1.2
changeset 5341:acd4ed841b01 HEAD
Separate headers and body in building and searching. Added support for
Lucene to index/search them separately.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Fri, 16 Mar 2007 00:20:55 +0200 |
parents | b7c4d7e2cc8c |
children | 4e13ca0f2f87 |
files | src/plugins/fts-lucene/fts-backend-lucene.c src/plugins/fts-lucene/lucene-wrapper.cc src/plugins/fts-lucene/lucene-wrapper.h src/plugins/fts-squat/fts-backend-squat.c src/plugins/fts/fts-api-private.h src/plugins/fts/fts-api.c src/plugins/fts/fts-api.h src/plugins/fts/fts-storage.c |
diffstat | 8 files changed, 95 insertions(+), 49 deletions(-) [+] |
line wrap: on
line diff
--- a/src/plugins/fts-lucene/fts-backend-lucene.c Thu Mar 15 23:46:26 2007 +0200 +++ b/src/plugins/fts-lucene/fts-backend-lucene.c Fri Mar 16 00:20:55 2007 +0200 @@ -109,7 +109,7 @@ static int fts_backend_lucene_build_more(struct fts_backend_build_context *ctx, uint32_t uid, const unsigned char *data, - size_t size) + size_t size, bool headers) { struct lucene_fts_backend *backend = (struct lucene_fts_backend *)ctx->backend; @@ -122,7 +122,7 @@ i_assert(backend->lstorage->selected_box == backend->box); return lucene_index_build_more(backend->lstorage->index, - uid, data, size); + uid, data, size, headers); } static int @@ -167,14 +167,16 @@ } static int -fts_backend_lucene_lookup(struct fts_backend *_backend, const char *key, - ARRAY_TYPE(seq_range) *result) +fts_backend_lucene_lookup(struct fts_backend *_backend, + enum fts_lookup_flags flags, + const char *key, ARRAY_TYPE(seq_range) *result) { struct lucene_fts_backend *backend = (struct lucene_fts_backend *)_backend; fts_backend_select(backend); - return lucene_index_lookup(backend->lstorage->index, key, result); + return lucene_index_lookup(backend->lstorage->index, + flags, key, result); } struct fts_backend fts_backend_lucene = {
--- a/src/plugins/fts-lucene/lucene-wrapper.cc Thu Mar 15 23:46:26 2007 +0200 +++ b/src/plugins/fts-lucene/lucene-wrapper.cc Fri Mar 16 00:20:55 2007 +0200 @@ -57,10 +57,12 @@ public: TokenStream *tokenStream(const TCHAR *fieldName, CL_NS(util)::Reader *reader) { - /* Everything except contents should go as-is without any + /* Everything except body/headers should go as-is without any modifications. Isn't there any easier way to do this than to implement a whole new RawTokenStream?.. */ - if (fieldName != 0 && wcscmp(fieldName, L"contents") != 0) + if (fieldName != 0 && + wcscmp(fieldName, L"headers") != 0 && + wcscmp(fieldName, L"body") != 0) return _CLNEW RawTokenStream(reader); return standard::StandardAnalyzer:: @@ -282,7 +284,8 @@ } int lucene_index_build_more(struct lucene_index *index, uint32_t uid, - const unsigned char *data, size_t size) + const unsigned char *data, size_t size, + bool headers) { unsigned int len; char id[MAX_INT_STRLEN]; @@ -309,7 +312,10 @@ index->doc->add(*Field::Text(_T("box"), index->tmailbox_name)); } - index->doc->add(*Field::Text(_T("contents"), dest)); + if (headers) + index->doc->add(*Field::Text(_T("headers"), dest)); + else + index->doc->add(*Field::Text(_T("body"), dest)); return 0; } @@ -411,12 +417,14 @@ } } -int lucene_index_lookup(struct lucene_index *index, const char *key, - ARRAY_TYPE(seq_range) *result) +int lucene_index_lookup(struct lucene_index *index, enum fts_lookup_flags flags, + const char *key, ARRAY_TYPE(seq_range) *result) { const char *quoted_key; int ret = 0; + i_assert((flags & (FTS_LOOKUP_FLAG_HEADERS|FTS_LOOKUP_FLAG_BODY)) != 0); + if (lucene_index_open_search(index) <= 0) return -1; @@ -429,15 +437,26 @@ lucene_utf8towcs(tkey, quoted_key, len + 1); t_pop(); - Query *content_query = NULL; + BooleanQuery lookup_query; + Query *content_query1 = NULL, *content_query2 = NULL; try { - content_query = QueryParser::parse(tkey, _T("contents"), - index->analyzer); + if ((flags & FTS_LOOKUP_FLAG_HEADERS) != 0) { + content_query1 = QueryParser::parse(tkey, _T("headers"), + index->analyzer); + lookup_query.add(content_query1, false, false); + } + if ((flags & FTS_LOOKUP_FLAG_BODY) != 0) { + content_query2 = QueryParser::parse(tkey, _T("body"), + index->analyzer); + lookup_query.add(content_query2, false, false); + } } catch (CLuceneError &err) { if (getenv("DEBUG") != NULL) { i_info("lucene: QueryParser::parse(%s) failed: %s", str_sanitize(key, 40), err.what()); } + if (content_query1 != NULL) + _CLDELETE(content_query1); lucene_index_close(index); return -1; } @@ -445,7 +464,7 @@ BooleanQuery query; Term mailbox_term(_T("box"), index->tmailbox_name); TermQuery mailbox_query(&mailbox_term); - query.add(content_query, true, false); + query.add(&lookup_query, true, false); query.add(&mailbox_query, true, false); try { @@ -469,6 +488,9 @@ ret = -1; } - _CLDELETE(content_query); + if (content_query1 != NULL) + _CLDELETE(content_query1); + if (content_query2 != NULL) + _CLDELETE(content_query2); return ret; }
--- a/src/plugins/fts-lucene/lucene-wrapper.h Thu Mar 15 23:46:26 2007 +0200 +++ b/src/plugins/fts-lucene/lucene-wrapper.h Fri Mar 16 00:20:55 2007 +0200 @@ -12,12 +12,13 @@ int lucene_index_build_init(struct lucene_index *index, uint32_t *last_uid_r); int lucene_index_build_more(struct lucene_index *index, uint32_t uid, - const unsigned char *data, size_t size); + const unsigned char *data, size_t size, + bool headers); int lucene_index_build_deinit(struct lucene_index *index); int lucene_index_expunge(struct lucene_index *index, uint32_t uid); -int lucene_index_lookup(struct lucene_index *index, const char *key, - ARRAY_TYPE(seq_range) *result); +int lucene_index_lookup(struct lucene_index *index, enum fts_lookup_flags flags, + const char *key, ARRAY_TYPE(seq_range) *result); #endif
--- a/src/plugins/fts-squat/fts-backend-squat.c Thu Mar 15 23:46:26 2007 +0200 +++ b/src/plugins/fts-squat/fts-backend-squat.c Fri Mar 16 00:20:55 2007 +0200 @@ -85,7 +85,7 @@ static int fts_backend_squat_build_more(struct fts_backend_build_context *_ctx, uint32_t uid, const unsigned char *data, - size_t size) + size_t size, bool headers __attr_unused__) { struct squat_fts_backend_build_context *ctx = (struct squat_fts_backend_build_context *)_ctx; @@ -179,8 +179,9 @@ } static int -fts_backend_squat_lookup(struct fts_backend *_backend, const char *key, - ARRAY_TYPE(seq_range) *result) +fts_backend_squat_lookup(struct fts_backend *_backend, + enum fts_lookup_flags flags __attr_unused__, + const char *key, ARRAY_TYPE(seq_range) *result) { struct squat_fts_backend *backend = (struct squat_fts_backend *)_backend; @@ -189,8 +190,9 @@ } static int -fts_backend_squat_filter(struct fts_backend *_backend, const char *key, - ARRAY_TYPE(seq_range) *result) +fts_backend_squat_filter(struct fts_backend *_backend, + enum fts_lookup_flags flags __attr_unused__, + const char *key, ARRAY_TYPE(seq_range) *result) { struct squat_fts_backend *backend = (struct squat_fts_backend *)_backend;
--- a/src/plugins/fts/fts-api-private.h Thu Mar 15 23:46:26 2007 +0200 +++ b/src/plugins/fts/fts-api-private.h Fri Mar 16 00:20:55 2007 +0200 @@ -13,7 +13,7 @@ (*build_init)(struct fts_backend *backend, uint32_t *last_uid_r); int (*build_more)(struct fts_backend_build_context *ctx, uint32_t uid, - const unsigned char *data, size_t size); + const unsigned char *data, size_t size, bool headers); int (*build_deinit)(struct fts_backend_build_context *ctx); void (*expunge)(struct fts_backend *backend, struct mail *mail); @@ -23,10 +23,10 @@ int (*lock)(struct fts_backend *backend); void (*unlock)(struct fts_backend *backend); - int (*lookup)(struct fts_backend *backend, const char *key, - ARRAY_TYPE(seq_range) *result); - int (*filter)(struct fts_backend *backend, const char *key, - ARRAY_TYPE(seq_range) *result); + int (*lookup)(struct fts_backend *backend, enum fts_lookup_flags flags, + const char *key, ARRAY_TYPE(seq_range) *result); + int (*filter)(struct fts_backend *backend, enum fts_lookup_flags flags, + const char *key, ARRAY_TYPE(seq_range) *result); }; enum fts_backend_flags {
--- a/src/plugins/fts/fts-api.c Thu Mar 15 23:46:26 2007 +0200 +++ b/src/plugins/fts/fts-api.c Fri Mar 16 00:20:55 2007 +0200 @@ -67,9 +67,9 @@ } int fts_backend_build_more(struct fts_backend_build_context *ctx, uint32_t uid, - const unsigned char *data, size_t size) + const unsigned char *data, size_t size, bool headers) { - return ctx->backend->v.build_more(ctx, uid, data, size); + return ctx->backend->v.build_more(ctx, uid, data, size, headers); } int fts_backend_build_deinit(struct fts_backend_build_context *ctx) @@ -98,24 +98,24 @@ backend->v.unlock(backend); } -int fts_backend_lookup(struct fts_backend *backend, const char *key, - ARRAY_TYPE(seq_range) *result) +int fts_backend_lookup(struct fts_backend *backend, enum fts_lookup_flags flags, + const char *key, ARRAY_TYPE(seq_range) *result) { - return backend->v.lookup(backend, key, result); + return backend->v.lookup(backend, flags, key, result); } -int fts_backend_filter(struct fts_backend *backend, const char *key, - ARRAY_TYPE(seq_range) *result) +int fts_backend_filter(struct fts_backend *backend, enum fts_lookup_flags flags, + const char *key, ARRAY_TYPE(seq_range) *result) { ARRAY_TYPE(seq_range) tmp_result; int ret; if (backend->v.filter != NULL) - return backend->v.filter(backend, key, result); + return backend->v.filter(backend, flags, key, result); /* do this ourself */ i_array_init(&tmp_result, 64); - ret = fts_backend_lookup(backend, key, &tmp_result); + ret = fts_backend_lookup(backend, flags, key, &tmp_result); if (ret == 0) { const struct seq_range *range; unsigned int i, count;
--- a/src/plugins/fts/fts-api.h Thu Mar 15 23:46:26 2007 +0200 +++ b/src/plugins/fts/fts-api.h Fri Mar 16 00:20:55 2007 +0200 @@ -6,6 +6,11 @@ #include "seq-range-array.h" +enum fts_lookup_flags { + FTS_LOOKUP_FLAG_HEADERS = 0x01, + FTS_LOOKUP_FLAG_BODY = 0x02 +}; + struct fts_backend * fts_backend_init(const char *backend_name, struct mailbox *box); void fts_backend_deinit(struct fts_backend *backend); @@ -19,9 +24,11 @@ fts_backend_build_init(struct fts_backend *backend, uint32_t *last_uid_r); /* Add more contents to the index. The data must contain only full valid UTF-8 characters, but it doesn't need to be NUL-terminated. size contains - the data size in bytes, not characters. */ + the data size in bytes, not characters. headers is TRUE if the data contains + message headers instead of message body. */ int fts_backend_build_more(struct fts_backend_build_context *ctx, uint32_t uid, - const unsigned char *data, size_t size); + const unsigned char *data, size_t size, + bool headers); /* Finish adding new data to the index. */ int fts_backend_build_deinit(struct fts_backend_build_context *ctx); @@ -41,12 +48,12 @@ void fts_backend_unlock(struct fts_backend *backend); /* Lookup key from the index and return the found UIDs in result. */ -int fts_backend_lookup(struct fts_backend *backend, const char *key, - ARRAY_TYPE(seq_range) *result); +int fts_backend_lookup(struct fts_backend *backend, enum fts_lookup_flags flags, + const char *key, ARRAY_TYPE(seq_range) *result); /* Drop UIDs from the result list for which the key doesn't exist. The idea is that with multiple search keywords you first lookup one and then filter the rest. */ -int fts_backend_filter(struct fts_backend *backend, const char *key, - ARRAY_TYPE(seq_range) *result); +int fts_backend_filter(struct fts_backend *backend, enum fts_lookup_flags flags, + const char *key, ARRAY_TYPE(seq_range) *result); #endif
--- a/src/plugins/fts/fts-storage.c Thu Mar 15 23:46:26 2007 +0200 +++ b/src/plugins/fts/fts-storage.c Fri Mar 16 00:20:55 2007 +0200 @@ -110,7 +110,7 @@ return 1; if (fts_backend_build_more(ctx->build, ctx->uid, str_data(ctx->headers), - str_len(ctx->headers)) < 0) + str_len(ctx->headers), TRUE) < 0) return -1; str_truncate(ctx->headers, 0); @@ -210,8 +210,8 @@ } } else { if (fts_backend_build_more(ctx->build, ctx->mail->uid, - block.data, - block.size) < 0) { + block.data, block.size, + FALSE) < 0) { ret = -1; break; } @@ -366,6 +366,7 @@ ARRAY_TYPE(seq_range) *uid_result) { const char *key; + enum fts_lookup_flags flags; for (; args != NULL; args = args->next) { switch (args->type) { @@ -392,7 +393,11 @@ key = args->hdr_field_name; } - if (fts_backend_filter(fctx->backend, key, + flags = FTS_LOOKUP_FLAG_BODY; + if (args->type == SEARCH_TEXT_FAST || + args->type == SEARCH_TEXT) + flags |= FTS_LOOKUP_FLAG_HEADERS; + if (fts_backend_filter(fctx->backend, flags, key, uid_result) < 0) { /* failed, but we already have limited the search, so just ignore this */ @@ -420,6 +425,7 @@ struct fts_search_context *fctx) { struct fts_backend *backend = fctx->backend; + enum fts_lookup_flags flags; const char *key; ARRAY_TYPE(seq_range) uid_result; @@ -433,11 +439,17 @@ /* we're only checking the existence of the header. */ + flags = FTS_LOOKUP_FLAG_HEADERS; key = fctx->best_arg->hdr_field_name; + } else { + flags = FTS_LOOKUP_FLAG_BODY; + if (fctx->best_arg->type == SEARCH_TEXT_FAST || + fctx->best_arg->type == SEARCH_TEXT) + flags |= FTS_LOOKUP_FLAG_HEADERS; } i_array_init(&uid_result, 64); - if (fts_backend_lookup(backend, key, &uid_result) < 0) { + if (fts_backend_lookup(backend, flags, key, &uid_result) < 0) { /* failed, fallback to reading everything */ array_free(&uid_result); return;