Mercurial > dovecot > core-2.2
view src/plugins/fts/fts-build-mail.c @ 15714:90710c6c3beb
Updated copyright notices to include year 2013.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Sat, 02 Feb 2013 17:01:07 +0200 |
parents | 13e74bd5ac8c |
children | 36ef72481934 |
line wrap: on
line source
* Copyright (c) 2006-2013 Dovecot authors, see the included COPYING file */ #include "lib.h" #include "istream.h" #include "buffer.h" #include "str.h" #include "rfc822-parser.h" #include "message-address.h" #include "message-parser.h" #include "message-decoder.h" #include "mail-storage.h" #include "fts-parser.h" #include "fts-api-private.h" #include "fts-build-mail.h" /* there are other characters as well, but this doesn't have to be exact */ #define IS_WORD_WHITESPACE(c) \ ((c) == ' ' || (c) == '\t' || (c) == '\n') /* if we see a word larger than this, just go ahead and split it from wherever */ #define MAX_WORD_SIZE 1024 struct fts_mail_build_context { struct mail *mail; struct fts_backend_update_context *update_ctx; char *content_type, *content_disposition; struct fts_parser *body_parser; buffer_t *word_buf; }; static void fts_build_parse_content_type(struct fts_mail_build_context *ctx, const struct message_header_line *hdr) { struct rfc822_parser_context parser; string_t *content_type; rfc822_parser_init(&parser, hdr->full_value, hdr->full_value_len, NULL); rfc822_skip_lwsp(&parser); T_BEGIN { content_type = t_str_new(64); if (rfc822_parse_content_type(&parser, content_type) >= 0) { i_free(ctx->content_type); ctx->content_type = str_lcase(i_strdup(str_c(content_type))); } } T_END; } static void fts_build_parse_content_disposition(struct fts_mail_build_context *ctx, const struct message_header_line *hdr) { /* just pass it as-is to backend. */ i_free(ctx->content_disposition); ctx->content_disposition = i_strndup(hdr->full_value, hdr->full_value_len); } static void fts_parse_mail_header(struct fts_mail_build_context *ctx, const struct message_block *raw_block) { const struct message_header_line *hdr = raw_block->hdr; if (strcasecmp(hdr->name, "Content-Type") == 0) fts_build_parse_content_type(ctx, hdr); else if (strcasecmp(hdr->name, "Content-Disposition") == 0) fts_build_parse_content_disposition(ctx, hdr); } static void fts_build_unstructured_header(struct fts_mail_build_context *ctx, const struct message_header_line *hdr) { const unsigned char *data = hdr->full_value; unsigned char *buf = NULL; unsigned int i; /* @UNSAFE: if there are any NULs, replace them with spaces */ for (i = 0; i < hdr->full_value_len; i++) { if (data[i] == '\0') { if (buf == NULL) { buf = i_malloc(hdr->full_value_len); memcpy(buf, data, i); data = buf; } buf[i] = ' '; } else if (buf != NULL) { buf[i] = data[i]; } } (void)fts_backend_update_build_more(ctx->update_ctx, data, hdr->full_value_len); i_free(buf); } static void fts_build_mail_header(struct fts_mail_build_context *ctx, const struct message_block *block) { const struct message_header_line *hdr = block->hdr; struct fts_backend_build_key key; if (hdr->eoh) return; /* hdr->full_value is always set because we get the block from message_decoder */ memset(&key, 0, sizeof(key)); key.uid = ctx->mail->uid; key.type = block->part->physical_pos == 0 ? FTS_BACKEND_BUILD_KEY_HDR : FTS_BACKEND_BUILD_KEY_MIME_HDR; key.hdr_name = hdr->name; if (!fts_backend_update_set_build_key(ctx->update_ctx, &key)) return; if (!message_header_is_address(hdr->name)) { /* regular unstructured header */ fts_build_unstructured_header(ctx, hdr); } else T_BEGIN { /* message address. normalize it to give better search results. */ struct message_address *addr; string_t *str; addr = message_address_parse(pool_datastack_create(), hdr->full_value, hdr->full_value_len, -1U, TRUE); str = t_str_new(hdr->full_value_len); message_address_write(str, addr); (void)fts_backend_update_build_more(ctx->update_ctx, str_data(str), str_len(str)); } T_END; } static bool fts_build_body_begin(struct fts_mail_build_context *ctx, bool *binary_body_r) { struct mail_storage *storage; const char *content_type; struct fts_backend_build_key key; i_assert(ctx->body_parser == NULL); *binary_body_r = FALSE; memset(&key, 0, sizeof(key)); key.uid = ctx->mail->uid; content_type = ctx->content_type != NULL ? ctx->content_type : "text/plain"; if (strncmp(content_type, "multipart/", 10) == 0) { /* multiparts are never indexed, only their contents */ return FALSE; } storage = mailbox_get_storage(ctx->mail->box); if (fts_parser_init(mail_storage_get_user(storage), content_type, ctx->content_disposition, &ctx->body_parser)) { /* extract text using the the returned parser */ *binary_body_r = TRUE; key.type = FTS_BACKEND_BUILD_KEY_BODY_PART; } else if (strncmp(content_type, "text/", 5) == 0 || strncmp(content_type, "message/", 8) == 0) { /* text body parts */ key.type = FTS_BACKEND_BUILD_KEY_BODY_PART; } else { /* possibly binary */ if ((ctx->update_ctx->backend->flags & FTS_BACKEND_FLAG_BINARY_MIME_PARTS) == 0) return FALSE; *binary_body_r = TRUE; key.type = FTS_BACKEND_BUILD_KEY_BODY_PART_BINARY; } if (ctx->body_parser == NULL) ctx->body_parser = fts_parser_text_init(); key.body_content_type = content_type; key.body_content_disposition = ctx->content_disposition; return fts_backend_update_set_build_key(ctx->update_ctx, &key); } static int fts_build_body_block(struct fts_mail_build_context *ctx, struct message_block *block, bool last) { unsigned int i; i_assert(block->hdr == NULL); if ((ctx->update_ctx->backend->flags & FTS_BACKEND_FLAG_BUILD_FULL_WORDS) == 0) { return fts_backend_update_build_more(ctx->update_ctx, block->data, block->size); } /* we'll need to send only full words to the backend */ if (ctx->word_buf != NULL && ctx->word_buf->used > 0) { /* continuing previous word */ for (i = 0; i < block->size; i++) { if (IS_WORD_WHITESPACE(block->data[i])) break; } buffer_append(ctx->word_buf, block->data, i); block->data += i; block->size -= i; if (block->size == 0 && ctx->word_buf->used < MAX_WORD_SIZE && !last) { /* word is still not finished */ return 0; } /* we have a full word, index it */ if (fts_backend_update_build_more(ctx->update_ctx, ctx->word_buf->data, ctx->word_buf->used) < 0) return -1; buffer_set_used_size(ctx->word_buf, 0); } /* find the boundary for last word */ if (last) i = block->size; else { for (i = block->size; i > 0; i--) { if (IS_WORD_WHITESPACE(block->data[i-1])) break; } } if (fts_backend_update_build_more(ctx->update_ctx, block->data, i) < 0) return -1; if (i < block->size) { if (ctx->word_buf == NULL) { ctx->word_buf = buffer_create_dynamic(default_pool, 128); } buffer_append(ctx->word_buf, block->data + i, block->size - i); } return 0; } static int fts_body_parser_finish(struct fts_mail_build_context *ctx) { struct message_block block; int ret = 0; do { memset(&block, 0, sizeof(block)); fts_parser_more(ctx->body_parser, &block); if (fts_build_body_block(ctx, &block, FALSE) < 0) { ret = -1; break; } } while (block.size > 0); fts_parser_deinit(&ctx->body_parser); return ret; } static int fts_build_mail_real(struct fts_backend_update_context *update_ctx, struct mail *mail) { struct fts_mail_build_context ctx; struct istream *input; struct message_parser_ctx *parser; struct message_decoder_context *decoder; struct message_block raw_block, block; struct message_part *prev_part, *parts; bool skip_body = FALSE, body_part = FALSE, body_added = FALSE; bool binary_body; int ret; if (mail_get_stream(mail, NULL, NULL, &input) < 0) return mail->expunged ? 0 : -1; memset(&ctx, 0, sizeof(ctx)); ctx.update_ctx = update_ctx; ctx.mail = mail; prev_part = NULL; parser = message_parser_init(pool_datastack_create(), input, MESSAGE_HEADER_PARSER_FLAG_CLEAN_ONELINE, 0); decoder = message_decoder_init(update_ctx->normalizer, 0); for (;;) { ret = message_parser_parse_next_block(parser, &raw_block); i_assert(ret != 0); if (ret < 0) { if (input->stream_errno == 0) ret = 0; break; } if (raw_block.part != prev_part) { /* body part changed. we're now parsing the end of boundary, possibly followed by message epilogue */ if (ctx.body_parser != NULL) { if (fts_body_parser_finish(&ctx) < 0) { ret = -1; break; } } message_decoder_set_return_binary(decoder, FALSE); fts_backend_update_unset_build_key(update_ctx); prev_part = raw_block.part; i_free_and_null(ctx.content_type); i_free_and_null(ctx.content_disposition); if (raw_block.size != 0) { /* multipart. skip until beginning of next part's headers */ skip_body = TRUE; } } if (raw_block.hdr != NULL) { /* always handle headers */ } else if (raw_block.size == 0) { /* end of headers */ skip_body = !fts_build_body_begin(&ctx, &binary_body); if (binary_body) message_decoder_set_return_binary(decoder, TRUE); body_part = TRUE; } else { if (skip_body) continue; } if (!message_decoder_decode_next_block(decoder, &raw_block, &block)) continue; if (block.hdr != NULL) { fts_parse_mail_header(&ctx, &raw_block); fts_build_mail_header(&ctx, &block); } else if (block.size == 0) { /* end of headers */ } else { i_assert(body_part); if (ctx.body_parser != NULL) fts_parser_more(ctx.body_parser, &block); if (fts_build_body_block(&ctx, &block, FALSE) < 0) { ret = -1; break; } body_added = TRUE; } } if (ret == 0 && ctx.body_parser != NULL) ret = fts_body_parser_finish(&ctx); if (ret == 0 && body_part && !skip_body && !body_added) { /* make sure body is added even when it doesn't exist */ block.data = NULL; block.size = 0; ret = fts_build_body_block(&ctx, &block, TRUE); } if (message_parser_deinit(&parser, &parts) < 0) mail_set_cache_corrupted(mail, MAIL_FETCH_MESSAGE_PARTS); message_decoder_deinit(&decoder); i_free(ctx.content_type); i_free(ctx.content_disposition); if (ctx.word_buf != NULL) buffer_free(&ctx.word_buf); return ret < 0 ? -1 : 1; } int fts_build_mail(struct fts_backend_update_context *update_ctx, struct mail *mail) { int ret; T_BEGIN { ret = fts_build_mail_real(update_ctx, mail); } T_END; return ret; }