Mercurial > dovecot > original-hg > dovecot-1.2
diff src/lib-mail/message-body-search.c @ 608:debb8468514e HEAD
SEARCH CHARSET now works properly with message bodies, and in general body
searching works more correctly by decoding base64/qp data. Non-text MIME
parts are currently not included in search, that could be made optional.
Also the body is parsed separately for each keyword, that could be
optimized.
Changed base64_decode() behaviour so that it can accept non-base64 data as
well, ie. line feeds etc.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Wed, 13 Nov 2002 13:08:18 +0200 |
parents | |
children | dd574ac271c1 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lib-mail/message-body-search.c Wed Nov 13 13:08:18 2002 +0200 @@ -0,0 +1,400 @@ +/* Copyright (C) 2002 Timo Sirainen */ + +#include "lib.h" +#include "base64.h" +#include "ibuffer.h" +#include "charset-utf8.h" +#include "rfc822-tokenize.h" +#include "quoted-printable.h" +#include "message-parser.h" +#include "message-content-parser.h" +#include "message-header-search.h" +#include "message-body-search.h" + +#define DECODE_BLOCK_SIZE 8192 + +typedef struct { + Pool pool; + + const char *key; + size_t key_len; + + const char *charset; + unsigned int unknown_charset:1; +} BodySearchContext; + +typedef struct { + BodySearchContext *body_ctx; + + HeaderSearchContext *hdr_search_ctx; + CharsetTranslation *translation; + + unsigned char decode_buf[DECODE_BLOCK_SIZE]; + size_t decode_buf_used; + + size_t *matches; + ssize_t match_count; + + const char *content_type; + const char *content_charset; + + unsigned int content_qp:1; + unsigned int content_base64:1; + unsigned int content_unknown:1; + unsigned int content_type_text:1; /* text/any or message/any */ + unsigned int found:1; +} PartSearchContext; + +static void parse_content_type(const Rfc822Token *tokens, int count, + void *context) +{ + PartSearchContext *ctx = context; + + if (ctx->content_type != NULL && tokens[0].token == 'A') { + ctx->content_type = rfc822_tokens_get_value(tokens, count); + ctx->content_type_text = + strncasecmp(ctx->content_type, "text/", 5) == 0 || + strncasecmp(ctx->content_type, "message/", 8) == 0; + } +} + +static void parse_content_type_param(const Rfc822Token *name, + const Rfc822Token *value, + int value_count, void *context) +{ + PartSearchContext *ctx = context; + + if (name->len != 7 || strncasecmp(name->ptr, "charset", 7) != 0) + return; + + if (ctx->content_charset == NULL) { + ctx->content_charset = + rfc822_tokens_get_value(value, value_count); + } +} + +static void parse_content_encoding(const Rfc822Token *tokens, + int count __attr_unused__, void *context) +{ + PartSearchContext *ctx = context; + + if (tokens[0].token != 'A') + return; + + switch (tokens[0].len) { + case 4: + if (strncasecmp(tokens[0].ptr, "7bit", 4) != 0 && + strncasecmp(tokens[0].ptr, "8bit", 4) != 0) + ctx->content_unknown = TRUE; + break; + case 6: + if (strncasecmp(tokens[0].ptr, "base64", 6) == 0) + ctx->content_base64 = TRUE; + else if (strncasecmp(tokens[0].ptr, "binary", 6) != 0) + ctx->content_unknown = TRUE; + break; + case 16: + if (strncasecmp(tokens[0].ptr, "quoted-printable", 16) == 0) + ctx->content_qp = TRUE; + else + ctx->content_unknown = TRUE; + break; + default: + ctx->content_unknown = TRUE; + break; + } +} + +static void header_find(MessagePart *part __attr_unused__, + const char *name, size_t name_len, + const char *value, size_t value_len, void *context) +{ + PartSearchContext *ctx = context; + + if (ctx->found) + return; + + ctx->found = message_header_search(value, &value_len, + ctx->hdr_search_ctx); + + if (name_len == 12 && strncasecmp(name, "Content-Type", 12) == 0) { + (void)message_content_parse_header(t_strndup(value, value_len), + parse_content_type, + parse_content_type_param, + ctx); + } else if (name_len == 25 && + strncasecmp(name, "Content-Transfer-Encoding", 25) == 0) { + (void)message_content_parse_header(t_strndup(value, value_len), + parse_content_encoding, + NULL, ctx); + } +} + +static int message_search_header(PartSearchContext *ctx, IBuffer *inbuf) +{ + ctx->hdr_search_ctx = message_header_search_init(data_stack_pool, + ctx->body_ctx->key, + ctx->body_ctx->charset, + NULL); + + /* we default to text content-type */ + ctx->content_type_text = TRUE; + message_parse_header(NULL, inbuf, NULL, header_find, ctx); + + return ctx->found; +} + +static int message_search_decoded_block(PartSearchContext *ctx, + const unsigned char *data, size_t size) +{ + const unsigned char *p, *end, *key; + size_t key_len; + ssize_t i; + int found; + + key = (const unsigned char *) ctx->body_ctx->key; + key_len = ctx->body_ctx->key_len; + + end = data + size; found = 0; + for (p = data; p != end; p++) { + for (i = ctx->match_count-1; i >= 0; i--) { + if (key[ctx->matches[i]] == *p) { + if (++ctx->matches[i] == key_len) { + /* full match */ + p++; + found = TRUE; + break; + } + } else { + /* non-match */ + ctx->match_count--; + if (i != ctx->match_count) { + memmove(ctx->matches + i, + ctx->matches + i + 1, + ctx->match_count - i); + } + } + } + + if (found) + break; + + if (*p == key[0]) { + if (key_len == 1) { + /* only one character in search key */ + p++; + found = 1; + break; + } + i_assert((size_t)ctx->match_count < key_len); + ctx->matches[ctx->match_count++] = 1; + } + } + + return found; +} + +static int message_search_body_block(PartSearchContext *ctx, + const unsigned char *data, size_t size) +{ + const unsigned char *inbuf; + unsigned char outbuf[DECODE_BLOCK_SIZE]; + size_t inbuf_size, outbuf_size, max_size; + + if (ctx->body_ctx->unknown_charset || ctx->translation == NULL) + return message_search_decoded_block(ctx, data, size); + + while (size > 0) { + if (ctx->decode_buf_used == 0) { + inbuf = data; + inbuf_size = I_MIN(size, sizeof(ctx->decode_buf)); + + data += inbuf_size; + size -= inbuf_size; + } else { + /* some characters already in buffer, ie. last + conversion contained partial data */ + max_size = sizeof(ctx->decode_buf) - + ctx->decode_buf_used; + if (max_size > size) + max_size = size; + + memcpy(ctx->decode_buf + ctx->decode_buf_used, + data, max_size); + ctx->decode_buf_used += max_size; + + inbuf = ctx->decode_buf; + inbuf_size = ctx->decode_buf_used; + + data += max_size; + size -= max_size; + } + + outbuf_size = sizeof(outbuf); + if (!charset_to_ucase_utf8(ctx->translation, + &inbuf, &inbuf_size, + outbuf, &outbuf_size)) { + /* something failed */ + return -1; + } + + if (message_search_decoded_block(ctx, outbuf, outbuf_size)) + return 1; + + if (inbuf_size > 0) { + /* partial input, save it */ + memmove(ctx->decode_buf, inbuf, inbuf_size); + ctx->decode_buf_used = inbuf_size; + } + } + + return 0; +} + +static int message_search_body(PartSearchContext *ctx, IBuffer *inbuf, + MessagePart *part) +{ + const unsigned char *data, *decoded; + unsigned char *decodebuf; + size_t data_size, decoded_size, pos; + uoff_t old_limit; + ssize_t ret; + int found; + + if (ctx->content_unknown) { + /* unknown content-encoding-type, ignore */ + return FALSE; + } + + if (!ctx->content_type_text) { + /* non-text content, ignore - FIXME: should be configurable? */ + return FALSE; + } + + ctx->translation = charset_to_utf8_begin(ctx->content_charset != NULL ? + ctx->content_charset : "ascii", + NULL); + + ctx->match_count = 0; + ctx->matches = t_malloc(sizeof(size_t) * ctx->body_ctx->key_len); + + i_buffer_skip(inbuf, part->physical_pos + + part->header_size.physical_size - inbuf->v_offset); + + old_limit = inbuf->v_limit; + i_buffer_set_read_limit(inbuf, inbuf->v_offset + + part->body_size.physical_size); + + found = FALSE; pos = 0; + while (i_buffer_read_data(inbuf, &data, &data_size, pos) > 0) { + /* limit the size of t_malloc()s */ + if (data_size > DECODE_BLOCK_SIZE) + data_size = DECODE_BLOCK_SIZE; + pos = data_size; + + t_push(); + if (ctx->content_qp) { + decoded = decodebuf = t_malloc(data_size); + decoded_size = quoted_printable_decode(data, &data_size, + decodebuf); + } else if (ctx->content_base64) { + decoded_size = MAX_BASE64_DECODED_SIZE(data_size); + decoded = decodebuf = t_malloc(decoded_size); + + ret = base64_decode(data, &data_size, decodebuf); + decoded_size = ret < 0 ? 0 : (size_t)decoded_size; + } else { + decoded = data; + decoded_size = data_size; + } + + ret = message_search_body_block(ctx, decoded, decoded_size); + if (ret != 0) { + t_pop(); + found = ret > 0; + break; + } + + t_pop(); + i_buffer_skip(inbuf, data_size); + pos -= data_size; + } + + i_buffer_set_read_limit(inbuf, old_limit); + + if (ctx->translation != NULL) + charset_to_utf8_end(ctx->translation); + return found; +} + +static int message_body_search_init(BodySearchContext *ctx, const char *key, + const char *charset, int *unknown_charset) +{ + size_t size; + + memset(ctx, 0, sizeof(BodySearchContext)); + + /* get the key uppercased */ + size = strlen(key); + key = charset_to_ucase_utf8_string(charset, unknown_charset, + (const unsigned char *) key, &size); + if (key == NULL) + return FALSE; + + i_assert(size <= SSIZE_T_MAX/sizeof(size_t)); + + ctx->key = key; + ctx->key_len = size; + ctx->charset = charset; + ctx->unknown_charset = charset == NULL; + + return TRUE; +} + +static int message_body_search_ctx(BodySearchContext *ctx, IBuffer *inbuf, + MessagePart *part) +{ + PartSearchContext part_ctx; + int found; + + found = FALSE; + while (part != NULL && !found) { + i_assert(inbuf->v_offset <= part->physical_pos); + + i_buffer_skip(inbuf, part->physical_pos - inbuf->v_offset); + + memset(&part_ctx, 0, sizeof(part_ctx)); + part_ctx.body_ctx = ctx; + + t_push(); + + if (message_search_header(&part_ctx, inbuf)) { + found = TRUE; + } else if (part->children != NULL) { + /* multipart/xxx or message/rfc822 */ + if (message_body_search_ctx(ctx, inbuf, part->children)) + found = TRUE; + } else { + if (message_search_body(&part_ctx, inbuf, part)) + found = TRUE; + } + + t_pop(); + + part = part->next; + } + + return found; +} + +int message_body_search(const char *key, const char *charset, + int *unknown_charset, IBuffer *inbuf, + MessagePart *part) +{ + BodySearchContext ctx; + + if (!message_body_search_init(&ctx, key, charset, unknown_charset)) + return -1; + + return message_body_search_ctx(&ctx, inbuf, part); +}