view src/lib-mail/message-parser.c @ 765:553f050c8313 HEAD

Added buffer API. Point is to hide all buffer writing behind this API which verifies that nothing overflows. Much better than doing the same checks all around the code, even if it is slightly slower. Buffer reading is still mostly done directly, that isn't such a big security risk and I can't think of a reasonable API for it anyway.
author Timo Sirainen <tss@iki.fi>
date Sun, 08 Dec 2002 07:23:07 +0200
parents f57c52738f90
children e49f7397af98
line wrap: on
line source

/* Copyright (C) 2002 Timo Sirainen */

#include "lib.h"
#include "istream.h"
#include "rfc822-tokenize.h"
#include "message-content-parser.h"
#include "message-parser.h"
#include "message-size.h"

typedef struct _MessageBoundary {
	struct _MessageBoundary *next;

	MessagePart *part;
	const char *boundary;
	size_t len;
} MessageBoundary;

typedef struct {
	Pool pool;
	MessagePart *part;

	char *last_boundary;
	char *last_content_type;
	MessageBoundary *boundaries;

	MessageHeaderFunc func;
	void *context;
} MessageParseContext;

static MessagePart *message_parse_part(IStream *input,
				       MessageParseContext *parse_ctx);
static MessagePart *message_parse_body(IStream *input,
				       MessageBoundary *boundaries,
				       MessageSize *body_size);
static MessagePart *message_skip_boundary(IStream *input,
					  MessageBoundary *boundaries,
					  MessageSize *boundary_size);

static void message_size_add_part(MessageSize *dest, MessagePart *part)
{
	dest->physical_size +=
		part->header_size.physical_size +
		part->body_size.physical_size;
	dest->virtual_size +=
		part->header_size.virtual_size +
		part->body_size.virtual_size;
	dest->lines += part->header_size.lines + part->body_size.lines;
}

static MessagePart *message_part_append(Pool pool, MessagePart *parent)
{
	MessagePart *part, **list;

	part = p_new(pool, MessagePart, 1);
	part->parent = parent;

	/* set child position */
	part->physical_pos =
		parent->physical_pos +
		parent->body_size.physical_size +
		parent->header_size.physical_size;

	list = &part->parent->children;
	while (*list != NULL)
		list = &(*list)->next;

	*list = part;
	return part;
}

static void parse_content_type(const Rfc822Token *tokens, int count,
			       void *context)
{
	MessageParseContext *parse_ctx = context;
	const char *str;

	if (tokens[0].token != 'A')
		return;

	if (parse_ctx->last_content_type != NULL)
		return;

	str = rfc822_tokens_get_value(tokens, count);
	parse_ctx->last_content_type = p_strdup(parse_ctx->pool, str);

	if (strcasecmp(str, "message/rfc822") == 0)
		parse_ctx->part->flags |= MESSAGE_PART_FLAG_MESSAGE_RFC822;
	else if (strncasecmp(str, "text/", 5) == 0)
		parse_ctx->part->flags |= MESSAGE_PART_FLAG_TEXT;
	else if (strncasecmp(str, "multipart/", 10) == 0) {
		parse_ctx->part->flags |= MESSAGE_PART_FLAG_MULTIPART;

		if (strcasecmp(str+10, "digest") == 0) {
			parse_ctx->part->flags |=
				MESSAGE_PART_FLAG_MULTIPART_DIGEST;
		}
	}
}

static void parse_content_type_param(const Rfc822Token *name,
				     const Rfc822Token *value,
				     int value_count, void *context)
{
	MessageParseContext *parse_ctx = context;
	const char *str;

	if ((parse_ctx->part->flags & MESSAGE_PART_FLAG_MULTIPART) == 0 ||
	    name->len != 8 || strncasecmp(name->ptr, "boundary", 8) != 0)
		return;

	if (parse_ctx->last_boundary == NULL) {
		str = rfc822_tokens_get_value(value, value_count);
		parse_ctx->last_boundary = p_strdup(parse_ctx->pool, str);
	}
}

static void parse_header_field(MessagePart *part,
			       const char *name, size_t name_len,
			       const char *value, size_t value_len,
			       void *context)
{
	MessageParseContext *parse_ctx = context;

	/* call the user-defined header parser */
	if (parse_ctx->func != NULL) {
		parse_ctx->func(part, name, name_len, value, value_len,
				parse_ctx->context);
	}

	if (name_len == 12 && strncasecmp(name, "Content-Type", 12) == 0) {
		/* we need to know the boundary */
		(void)message_content_parse_header(t_strndup(value, value_len),
						   parse_content_type,
						   parse_content_type_param,
						   parse_ctx);
	}
}

static MessagePart *message_parse_multipart(IStream *input,
					    MessageParseContext *parse_ctx)
{
	MessagePart *parent_part, *next_part, *part;
	MessageBoundary *b;

	/* multipart message. add new boundary */
	b = t_new(MessageBoundary, 1);
	b->part = parse_ctx->part;
	b->boundary = parse_ctx->last_boundary;
	b->len = strlen(b->boundary);

	b->next = parse_ctx->boundaries;
	parse_ctx->boundaries = b;

	/* reset fields */
	parse_ctx->last_boundary = NULL;
	parse_ctx->last_content_type = NULL;

	/* skip the data before the first boundary */
	parent_part = parse_ctx->part;
	next_part = message_skip_boundary(input, parse_ctx->boundaries,
					  &parent_part->body_size);

	/* now, parse the parts */
	while (next_part == parent_part) {
		/* new child */
		part = message_part_append(parse_ctx->pool, parent_part);

                parse_ctx->part = part;
		next_part = message_parse_part(input, parse_ctx);

		/* update our size */
		message_size_add_part(&parent_part->body_size, part);

		if (next_part != parent_part)
			break;

		/* skip the boundary */
		next_part = message_skip_boundary(input, parse_ctx->boundaries,
						  &parent_part->body_size);
	}

	/* remove boundary */
	i_assert(parse_ctx->boundaries == b);
	parse_ctx->boundaries = b->next;
	return next_part;
}

static MessagePart *message_parse_part(IStream *input,
				       MessageParseContext *parse_ctx)
{
	MessagePart *next_part, *part;
	uoff_t hdr_size;

	message_parse_header(parse_ctx->part, input,
			     &parse_ctx->part->header_size,
			     parse_header_field, parse_ctx);

	/* update message position/size */
	hdr_size = parse_ctx->part->header_size.physical_size;

	if (parse_ctx->last_boundary != NULL)
		return message_parse_multipart(input, parse_ctx);

	if (parse_ctx->last_content_type == NULL) {
		if (parse_ctx->part->parent != NULL &&
		    (parse_ctx->part->parent->flags &
		     MESSAGE_PART_FLAG_MULTIPART_DIGEST)) {
			/* when there's no content-type specified and we're
			   below multipart/digest, the assume message/rfc822
			   content-type */
			parse_ctx->part->flags |=
				MESSAGE_PART_FLAG_MESSAGE_RFC822;
		} else {
			/* otherwise we default to text/plain */
			parse_ctx->part->flags |= MESSAGE_PART_FLAG_TEXT;
		}
	}

	parse_ctx->last_boundary = NULL;
        parse_ctx->last_content_type = NULL;

	if (parse_ctx->part->flags & MESSAGE_PART_FLAG_MESSAGE_RFC822) {
		/* message/rfc822 part - the message body begins with
		   headers again, this works pretty much the same as
		   a single multipart/mixed item */
		part = message_part_append(parse_ctx->pool, parse_ctx->part);

		parse_ctx->part = part;
		next_part = message_parse_part(input, parse_ctx);
		parse_ctx->part = part->parent;

		/* our body size is the size of header+body in message/rfc822 */
		message_size_add_part(&part->parent->body_size, part);
	} else {
		/* normal message, read until the next boundary */
		part = parse_ctx->part;
		next_part = message_parse_body(input, parse_ctx->boundaries,
					       &part->body_size);
	}

	return next_part;
}

MessagePart *message_parse(Pool pool, IStream *input,
			   MessageHeaderFunc func, void *context)
{
	MessagePart *part;
	MessageParseContext parse_ctx;

	memset(&parse_ctx, 0, sizeof(parse_ctx));
	parse_ctx.pool = pool;
	parse_ctx.func = func;
	parse_ctx.context = context;
	parse_ctx.part = part = p_new(pool, MessagePart, 1);

	message_parse_part(input, &parse_ctx);
	return part;
}

/* skip over to next line increasing message size */
static void message_skip_line(IStream *input, MessageSize *msg_size)
{
	const unsigned char *msg;
	size_t i, size, startpos;

	startpos = 0;

	while (i_stream_read_data(input, &msg, &size, startpos) > 0) {
		for (i = startpos; i < size; i++) {
			if (msg[i] == '\n') {
				if (msg_size != NULL) {
					if (i == 0 || msg[i-1] != '\r')
						msg_size->virtual_size++;
					msg_size->lines++;
				}
				break;
			}
		}

		if (i < size) {
			startpos = i+1;
			break;
		}

		/* leave the last character, it may be \r */
		i_stream_skip(input, i - 1);
		startpos = 1;

		if (msg_size != NULL) {
			msg_size->physical_size += i - 1;
			msg_size->virtual_size += i - 1;
		}
	}

	i_stream_skip(input, startpos);

	if (msg_size != NULL) {
		msg_size->physical_size += startpos;
		msg_size->virtual_size += startpos;
	}
}

void message_parse_header(MessagePart *part, IStream *input,
			  MessageSize *hdr_size,
			  MessageHeaderFunc func, void *context)
{
	const unsigned char *msg;
	size_t i, size, parse_size, startpos, missing_cr_count;
	size_t line_start, colon_pos, end_pos, name_len, value_len;
	int ret;

	if (hdr_size != NULL)
		memset(hdr_size, 0, sizeof(MessageSize));

	missing_cr_count = startpos = line_start = 0;
	colon_pos = UINT_MAX;
	for (;;) {
		ret = i_stream_read_data(input, &msg, &size, startpos+1);
		if (ret == -2) {
			/* overflow, line is too long. just skip it. */
			i_assert(size > 2);

                        message_skip_line(input, hdr_size);
			startpos = line_start = 0;
			colon_pos = UINT_MAX;
			continue;
		}

		if (ret < 0 || (ret <= 0 && size == startpos)) {
			/* EOF and nothing in buffer. the later check is
			   needed only when there's no message body */
			break;
		}

		parse_size = size <= startpos+1 ? size : size-1;
		for (i = startpos; i < parse_size; i++) {
			if (msg[i] == ':' && colon_pos == UINT_MAX) {
				colon_pos = i;
				continue;
			}

			if (msg[i] != '\n')
				continue;

			if (hdr_size != NULL)
				hdr_size->lines++;

			if (i == 0 || msg[i-1] != '\r') {
				/* missing CR */
				missing_cr_count++;
			}

			if (i == 0 || (i == 1 && msg[i-1] == '\r')) {
				/* no headers at all */
				break;
			}

			if ((i > 0 && msg[i-1] == '\n') ||
			    (i > 1 && msg[i-2] == '\n' && msg[i-1] == '\r')) {
				/* \n\n or \n\r\n - end of headers */
				break;
			}

			/* make sure the header doesn't continue to next line */
			if (i+1 == size || !IS_LWSP(msg[i+1])) {
				if (colon_pos != UINT_MAX &&
				    colon_pos != line_start && func != NULL &&
				    !IS_LWSP(msg[line_start])) {
					/* we have a valid header line */

					/* get length of name-field */
					end_pos = colon_pos-1;
					while (end_pos > line_start &&
					       IS_LWSP(msg[end_pos]))
						end_pos--;
					name_len = end_pos - line_start + 1;

					/* get length of value field. skip
					   only the initial LWSP after ':'.
					   some fields may want to keep
					   the extra spaces.. */
					colon_pos++;
					if (colon_pos < i &&
					    IS_LWSP(msg[colon_pos]))
						colon_pos++;
					value_len = i - colon_pos;
					if (msg[i-1] == '\r') value_len--;

					/* and finally call the function */
					func(part,
					     (const char *) msg + line_start,
					     name_len,
					     (const char *) msg + colon_pos,
					     value_len, context);
				}

				colon_pos = UINT_MAX;
				line_start = i+1;
			}
		}

		if (i < parse_size) {
			/* end of header */
			startpos = i+1;
			break;
		}

		/* leave the last line to buffer */
		if (colon_pos != UINT_MAX)
			colon_pos -= line_start;
		if (hdr_size != NULL)
			hdr_size->physical_size += line_start;
		i_stream_skip(input, line_start);

		startpos = i-line_start;
		line_start = 0;
	}

	i_stream_skip(input, startpos);

	if (hdr_size != NULL) {
		hdr_size->physical_size += startpos;
		hdr_size->virtual_size +=
			hdr_size->physical_size + missing_cr_count;
		i_assert(hdr_size->virtual_size >= hdr_size->physical_size);
	}

	if (func != NULL) {
		/* "end of headers" notify */
		func(part, "", 0, "", 0, context);
	}
}

static MessageBoundary *boundary_find(MessageBoundary *boundaries,
				      const char *msg, size_t len)
{
	while (boundaries != NULL) {
		if (boundaries->len <= len &&
		    strncmp(boundaries->boundary, msg, boundaries->len) == 0)
			return boundaries;

		boundaries = boundaries->next;
	}

	return NULL;
}

/* read until next boundary is found. if skip_over = FALSE, stop at the
   [\r]\n before the boundary, otherwise leave it right after the known
   boundary so the ending "--" can be checked. */
static MessageBoundary *
message_find_boundary(IStream *input, MessageBoundary *boundaries,
		      MessageSize *msg_size, int skip_over)
{
	MessageBoundary *boundary;
	const unsigned char *msg;
	size_t i, size, startpos, line_start, missing_cr_count;

	boundary = NULL;
	missing_cr_count = startpos = line_start = 0;

	while (i_stream_read_data(input, &msg, &size, startpos) > 0) {
		for (i = startpos; i < size; i++) {
			if (msg[i] != '\n')
				continue;

			if (i > line_start+2 && msg[line_start] == '-' &&
			    msg[line_start+1] == '-') {
				/* possible boundary */
				boundary = boundary_find(boundaries,
					(const char *) msg + line_start + 2,
					i - line_start - 2);
				if (boundary != NULL)
					break;
			}

			if (i == 0 || msg[i-1] != '\r') {
				/* missing CR */
				missing_cr_count++;
			}

			msg_size->lines++;
			line_start = i+1;
		}

		if (boundary != NULL)
			break;

		if (i - line_start > 128 &&
		    msg[line_start] == '-' && msg[line_start+1] == '-') {
			/* long partial line, see if it's a boundary.
			   RFC-2046 says that the boundaries must be
			   70 chars without "--" or less. We allow
			   a bit larger.. */
			boundary = boundary_find(boundaries,
					(const char *) msg + line_start + 2,
					i - line_start - 2);
			if (boundary != NULL)
				break;

			/* nope, we can skip over the line, just
			   leave the last char since it may be \r */
			i--;
		} else {
			/* leave the last line to buffer, it may be
			   boundary */
			i = line_start;
			if (i > 2) i -= 2; /* leave the \r\n too */
			line_start -= i;
		}

		i_stream_skip(input, i);
		msg_size->physical_size += i;
		msg_size->virtual_size += i;

		startpos = size - i;
	}

	if (boundary != NULL) {
		if (skip_over) {
			/* leave the pointer right after the boundary */
			line_start += 2 + boundary->len;
		} else if (line_start > 0 && msg[line_start-1] == '\n') {
			/* leave the \r\n before the boundary */
			line_start--;
			msg_size->lines--;

			if (line_start > 0 && msg[line_start-1] == '\r')
				line_start--;
			else
				missing_cr_count--;
		}
		startpos = line_start;
	}

	i_stream_skip(input, startpos);
	msg_size->physical_size += startpos;
	msg_size->virtual_size += startpos + missing_cr_count;

	i_assert(msg_size->virtual_size >= msg_size->physical_size);

	return boundary;
}

static MessagePart *message_parse_body(IStream *input,
				       MessageBoundary *boundaries,
				       MessageSize *body_size)
{
	MessageBoundary *boundary;

	if (boundaries == NULL) {
		message_get_body_size(input, body_size, (uoff_t)-1);
		return NULL;
	} else {
		boundary = message_find_boundary(input, boundaries,
						 body_size, FALSE);
		return boundary == NULL ? NULL : boundary->part;
	}
}

/* skip data until next boundary is found. if it's end boundary,
   skip the footer as well. */
static MessagePart *message_skip_boundary(IStream *input,
					  MessageBoundary *boundaries,
					  MessageSize *boundary_size)
{
	MessageBoundary *boundary;
	const unsigned char *msg;
	size_t size;
	int end_boundary;

	boundary = message_find_boundary(input, boundaries,
					 boundary_size, TRUE);
	if (boundary == NULL)
		return NULL;

	/* now, see if it's end boundary */
	end_boundary = FALSE;
	if (i_stream_read_data(input, &msg, &size, 1) > 0)
		end_boundary = msg[0] == '-' && msg[1] == '-';

	/* skip the rest of the line */
	message_skip_line(input, boundary_size);

	if (end_boundary) {
		/* skip the footer */
		return message_parse_body(input, boundaries, boundary_size);
	}

	return boundary == NULL ? NULL : boundary->part;
}