view src/lib-mail/message-snippet.c @ 19552:0f22db71df7a

global: freshen copyright git ls-files | xargs perl -p -i -e 's/(\d+)-201[0-5]/$1-2016/g;s/ (201[0-5]) Dovecot/ $1-2016 Dovecot/'
author Timo Sirainen <timo.sirainen@dovecot.fi>
date Wed, 13 Jan 2016 12:24:03 +0200
parents c02969e65b64
children 7d7723409083
line wrap: on
line source

/* Copyright (c) 2015-2016 Dovecot authors, see the included COPYING file */

#include "lib.h"
#include "buffer.h"
#include "str.h"
#include "istream.h"
#include "mail-html2text.h"
#include "message-parser.h"
#include "message-decoder.h"
#include "message-snippet.h"

enum snippet_state {
	/* beginning of the line */
	SNIPPET_STATE_NEWLINE = 0,
	/* within normal text */
	SNIPPET_STATE_NORMAL,
	/* within quoted text - skip until EOL */
	SNIPPET_STATE_QUOTED
};

struct snippet_context {
	string_t *snippet;
	unsigned int chars_left;
	enum snippet_state state;
	bool add_whitespace;
	struct mail_html2text *html2text;
	buffer_t *plain_output;
};

static bool snippet_generate(struct snippet_context *ctx,
			     const unsigned char *data, size_t size)
{
	unsigned int i, count;

	if (ctx->html2text != NULL) {
		buffer_set_used_size(ctx->plain_output, 0);
		mail_html2text_more(ctx->html2text, data, size,
				    ctx->plain_output);
		data = ctx->plain_output->data;
		size = ctx->plain_output->used;
	}

	/* message-decoder should feed us only valid and complete
	   UTF-8 input */
	for (i = 0; i < size; i += count) {
		count = 1;
		switch (ctx->state) {
		case SNIPPET_STATE_NEWLINE:
			if (data[i] == '>' && ctx->html2text == NULL) {
				ctx->state = SNIPPET_STATE_QUOTED;
				break;
			}
			ctx->state = SNIPPET_STATE_NORMAL;
			/* fallthrough */
		case SNIPPET_STATE_NORMAL:
			if (data[i] == '\r' || data[i] == '\n' ||
			    data[i] == '\t' || data[i] == ' ') {
				ctx->add_whitespace = TRUE;
				if (data[i] == '\n')
					ctx->state = SNIPPET_STATE_NEWLINE;
				break;
			}
			if (ctx->add_whitespace) {
				str_append_c(ctx->snippet, ' ');
				ctx->add_whitespace = FALSE;
				if (ctx->chars_left-- == 0)
					return FALSE;
			}
			if (ctx->chars_left-- == 0)
				return FALSE;
			count = uni_utf8_char_bytes(data[i]);
			i_assert(i + count <= size);
			str_append_n(ctx->snippet, data + i, count);
			break;
		case SNIPPET_STATE_QUOTED:
			if (data[i] == '\n')
				ctx->state = SNIPPET_STATE_NEWLINE;
			break;
		}
	}
	return TRUE;
}

int message_snippet_generate(struct istream *input,
			     unsigned int max_snippet_chars,
			     string_t *snippet)
{
	struct message_parser_ctx *parser;
	struct message_part *parts;
	struct message_decoder_context *decoder;
	struct message_block raw_block, block;
	struct snippet_context ctx;
	pool_t pool;
	int ret;

	memset(&ctx, 0, sizeof(ctx));
	pool = pool_alloconly_create("message snippet", 1024);
	ctx.snippet = snippet;
	ctx.chars_left = max_snippet_chars;

	parser = message_parser_init(pool_datastack_create(), input, 0, 0);
	decoder = message_decoder_init(NULL, 0);
	while ((ret = message_parser_parse_next_block(parser, &raw_block)) > 0) {
		if (!message_decoder_decode_next_block(decoder, &raw_block, &block))
			continue;
		if (block.size == 0) {
			const char *ct;

			if (block.hdr != NULL)
				continue;

			/* end of headers - verify that we can use this
			   Content-Type. we get here only once, because we
			   always handle only one non-multipart MIME part. */
			ct = message_decoder_current_content_type(decoder);
			if (ct == NULL)
				/* text/plain */ ;
			else if (mail_html2text_content_type_match(ct)) {
				ctx.html2text = mail_html2text_init(MAIL_HTML2TEXT_FLAG_SKIP_QUOTED);
				ctx.plain_output = buffer_create_dynamic(pool, 1024);
			} else if (strncasecmp(ct, "text/", 5) != 0)
				break;
			continue;
		}
		if (!snippet_generate(&ctx, block.data, block.size))
			break;
	}
	i_assert(ret != 0);
	message_decoder_deinit(&decoder);
	if (message_parser_deinit(&parser, &parts) < 0)
		i_unreached();
	if (ctx.html2text != NULL)
		mail_html2text_deinit(&ctx.html2text);
	pool_unref(&pool);
	return input->stream_errno == 0 ? 0 : -1;
}