Mercurial > dovecot > core-2.2
changeset 13259:b0777f44c999
fts-lucene: Another attempt to avoid "Too Many Clauses" error by not using ? wildcard.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Tue, 16 Aug 2011 21:08:23 +0300 |
parents | 6b8ef63846d7 |
children | bd0e2c2d0f1d |
files | src/plugins/fts-lucene/lucene-wrapper.cc |
diffstat | 1 files changed, 82 insertions(+), 37 deletions(-) [+] |
line wrap: on
line diff
--- a/src/plugins/fts-lucene/lucene-wrapper.cc Tue Aug 16 20:04:29 2011 +0300 +++ b/src/plugins/fts-lucene/lucene-wrapper.cc Tue Aug 16 21:08:23 2011 +0300 @@ -688,12 +688,31 @@ static void lucene_index_query_range_add(BooleanQuery *query, wchar_t *wuid, - wchar_t max_char) + unsigned int idx, wchar_t max_char) { - wchar_t i; + wchar_t c; + + if (wuid[idx] == '1' && max_char == '9') { + /* 123[1..9]* = 123* */ + wuid[idx] = '*'; + wuid[idx+1] = '\0'; - for (i = wuid[0]; i <= max_char; i++) { - wuid[0] = i; + Term *term = _CLNEW Term(_T("uid"), wuid); + query->add(_CLNEW WildcardQuery(term), true, BooleanClause::SHOULD); + _CLDECDELETE(term); + return; + } + + if (wuid[idx+1] == '\0') { + /* 2..5 */ + } else { + /* 2* .. 5* */ + wuid[idx+1] = '*'; + wuid[idx+2] = '\0'; + } + + for (c = wuid[idx]; c <= max_char; c++) { + wuid[idx] = c; Term *term = _CLNEW Term(_T("uid"), wuid); query->add(_CLNEW WildcardQuery(term), true, BooleanClause::SHOULD); @@ -701,6 +720,63 @@ } } +static bool +lucene_index_add_uid_filter(BooleanQuery *query, uint32_t seq1, uint32_t seq2) +{ + unsigned int i; + + i_assert(seq1 <= seq2); + + /* RangeQuery actually just adds each term within the range to the + search query, causing "too many clauses" at some point. + So use WildcardQuery to get something approximately true. */ + wchar_t wuid1[MAX_INT_STRLEN+2], wuid2[MAX_INT_STRLEN+2]; + swprintf(wuid1, N_ELEMENTS(wuid1), L"%u", seq1); + swprintf(wuid2, N_ELEMENTS(wuid2), L"%u", seq2); + + if (wcslen(wuid1) == wcslen(wuid2)) { + /* find the common prefix and use it */ + for (i = 0; wuid1[i] != '\0'; i++) { + if (wuid1[i] != wuid2[i]) + break; + } + if (i == 0 && wuid1[i] == '1' && wuid2[i] == '9') { + /* need to scan everything */ + return false; + } + + if (wuid1[i] != '\0') { + lucene_index_query_range_add(query, wuid1, i, wuid2[i]); + return true; + } + /* only a single UID */ + Term *term = _CLNEW Term(_T("uid"), wuid1); + query->add(_CLNEW TermQuery(term), true, BooleanClause::SHOULD); + _CLDECDELETE(term); + return true; + } + if (wcslen(wuid1)+1 != wcslen(wuid2)) { + /* need to scan everything */ + return false; + } + + /* we can optimize this only if seq1 starts with >1 higher number than + seq2. For example: + + 40 .. 500 -> have to go through everything + 40 .. 400 -> -''- + 40 .. 300 -> -''- + 40 .. 200 -> we can avoid 3* + 40 .. 100 -> we can avoid 2* and 3* */ + if (wuid1[0] <= wuid2[0]+1) + return false; + + lucene_index_query_range_add(query, wuid1, 0, '9'); + wuid1[0] = '1'; + lucene_index_query_range_add(query, wuid1, 0, wuid2[0]); + return true; +} + static int lucene_index_expunge_record(struct lucene_index *index, const struct fts_expunge_log_read_record *rec) @@ -716,40 +792,9 @@ BooleanQuery query; BooleanQuery uids_query; - /* RangeQuery actually just adds each term within the range to the - search query, causing "too many clauses" at some point. - So use WildcardQuery to get something approximately true. */ - uint32_t seq1 = range[0].seq1, seq2 = range[count-1].seq2; - - if (seq2 / seq1 > 10) { - /* just iterate through everything */ - } else { - wchar_t wuid1[MAX_INT_STRLEN], wuid2[MAX_INT_STRLEN]; - unsigned int i; - - swprintf(wuid1, N_ELEMENTS(wuid1), L"%u", range[0].seq1); - swprintf(wuid2, N_ELEMENTS(wuid2), L"%u", range[count-1].seq2); - - for (i = 1; wuid1[i] != '\0'; i++) - wuid1[i] = '?'; - for (i = 1; wuid2[i] != '\0'; i++) - wuid2[i] = '?'; - - if (wcslen(wuid1) == wcslen(wuid2)) { - /* for example: 1???..9??? */ - lucene_index_query_range_add(&uids_query, - wuid1, wuid2[0]); - } else { - /* for example: 4?? .. 5??? */ - lucene_index_query_range_add(&uids_query, - wuid1, '9'); - wchar_t max = wuid2[0]; - wuid2[0] = '1'; - lucene_index_query_range_add(&uids_query, - wuid2, max); - } + if (lucene_index_add_uid_filter(&uids_query, range[0].seq1, + range[count-1].seq2)) query.add(&uids_query, BooleanClause::MUST); - } wchar_t wguid[MAILBOX_GUID_HEX_LENGTH + 1]; guid128_to_wguid(rec->mailbox_guid, wguid);