changeset 18504:a9f8a617dc02

lib-mail: Added qp-decoder, which is a rewritten quoted_printable_decode(). The main benefit is that qp-decoder allows feeding data to it in smaller pieces. It can also give better error reporting.
author Timo Sirainen <tss@iki.fi>
date Sun, 03 May 2015 14:50:01 +0300
parents 333533e2d231
children a8e9fdcb17c5
files src/lib-mail/Makefile.am src/lib-mail/qp-decoder.c src/lib-mail/qp-decoder.h src/lib-mail/test-qp-decoder.c
diffstat 4 files changed, 406 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/src/lib-mail/Makefile.am	Wed Apr 29 17:45:30 2015 +0200
+++ b/src/lib-mail/Makefile.am	Sun May 03 14:50:01 2015 +0300
@@ -31,6 +31,7 @@
 	message-size.c \
 	message-snippet.c \
 	ostream-dot.c \
+	qp-decoder.c \
 	quoted-printable.c \
 	rfc2231-parser.c \
 	rfc822-parser.c
@@ -65,6 +66,7 @@
 	message-size.h \
 	message-snippet.h \
 	ostream-dot.h \
+	qp-decoder.h \
 	quoted-printable.h \
 	rfc2231-parser.h \
 	rfc822-parser.h
@@ -91,6 +93,7 @@
 	test-message-part \
 	test-message-snippet \
 	test-ostream-dot \
+	test-qp-decoder \
 	test-quoted-printable \
 	test-rfc2231-parser
 
@@ -181,6 +184,10 @@
 test_ostream_dot_LDADD = ostream-dot.lo $(test_libs)
 test_ostream_dot_DEPENDENCIES = $(test_deps)
 
+test_qp_decoder_SOURCES = test-qp-decoder.c
+test_qp_decoder_LDADD = qp-decoder.lo $(test_libs)
+test_qp_decoder_DEPENDENCIES = $(test_deps)
+
 test_quoted_printable_SOURCES = test-quoted-printable.c
 test_quoted_printable_LDADD = quoted-printable.lo $(test_libs)
 test_quoted_printable_DEPENDENCIES = $(test_deps)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-mail/qp-decoder.c	Sun May 03 14:50:01 2015 +0300
@@ -0,0 +1,285 @@
+/* Copyright (c) 2002-2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "buffer.h"
+#include "hex-binary.h"
+#include "qp-decoder.h"
+
+/* Quoted-printable lines can be max 76 characters. If we've seen more than
+   that much whitespace, it means there really shouldn't be anything else left
+   in the line except trailing whitespace. The decoder stops buffering
+   whitespace once the buffer has grown past this limit. */
+#define QP_MAX_WHITESPACE_LEN 76
+
+/* TRUE for the characters (space and tab) that are buffered as potential
+   trailing whitespace instead of being written to the output right away. */
+#define QP_IS_TRAILING_WHITESPACE(c) \
+	((c) == ' ' || (c) == '\t')
+
+/* Decoder states. The state is stored in struct qp_decoder, so it's kept
+   across qp_decoder_more() calls. */
+enum qp_state {
+	/* plain text, nothing pending */
+	STATE_TEXT = 0,
+	/* buffering spaces/tabs until we know whether they're trailing */
+	STATE_WHITESPACE,
+	/* seen '=' */
+	STATE_EQUALS,
+	/* seen '=' followed by one or more spaces/tabs */
+	STATE_EQUALS_WHITESPACE,
+	/* seen '=' and the first hex digit */
+	STATE_HEX2,
+	/* seen CR (possibly after buffered whitespace), expecting LF */
+	STATE_CR,
+	/* seen '=' [whitespace] CR, expecting LF to end the soft line break */
+	STATE_SOFTCR
+};
+
+struct qp_decoder {
+	/* decoded output is appended here; given by the caller in
+	   qp_decoder_init() */
+	buffer_t *dest;
+	/* spaces/tabs seen so far that haven't been written to dest yet */
+	buffer_t *whitespace;
+	enum qp_state state;
+	/* first hex digit seen after '=', used in STATE_HEX2 */
+	char hexchar;
+};
+
+/* Allocate a new decoder, which appends its decoded output to dest. */
+struct qp_decoder *qp_decoder_init(buffer_t *dest)
+{
+	struct qp_decoder *decoder = i_new(struct qp_decoder, 1);
+
+	/* buffer for spaces/tabs that are still pending */
+	decoder->whitespace = buffer_create_dynamic(default_pool, 80);
+	decoder->dest = dest;
+	return decoder;
+}
+
+/* Free the decoder and set *_qp to NULL, so the caller isn't left with a
+   dangling pointer. The dest buffer belongs to the caller and isn't freed.
+   Does nothing if *_qp is already NULL. */
+void qp_decoder_deinit(struct qp_decoder **_qp)
+{
+	struct qp_decoder *qp = *_qp;
+
+	if (qp == NULL)
+		return;
+	*_qp = NULL;
+
+	buffer_free(&qp->whitespace);
+	i_free(qp);
+}
+
+/* Handle input in STATE_TEXT. Copies plain text from src to the output until
+   the first character that needs a state change ('=', CR, space or tab).
+   LFs are written out as CRLF without leaving the text state.
+   Returns the number of bytes consumed from src, including the character
+   that caused the state change. */
+static size_t
+qp_decoder_more_text(struct qp_decoder *qp, const unsigned char *src,
+		     size_t src_size)
+{
+	/* start = beginning of the text that hasn't been copied to dest yet.
+	   ret = src_size unless a state change stops the scan earlier. */
+	size_t i, start = 0, ret = src_size;
+
+	for (i = 0; i < src_size; i++) {
+		if (src[i] > '=') {
+			/* fast path: all the special characters handled
+			   below are '=' or smaller */
+			continue;
+		}
+		switch (src[i]) {
+		case '=':
+			qp->state = STATE_EQUALS;
+			break;
+		case '\r':
+			qp->state = STATE_CR;
+			break;
+		case '\n':
+			/* LF without preceding CR */
+			buffer_append(qp->dest, src+start, i-start);
+			buffer_append(qp->dest, "\r\n", 2);
+			start = i+1;
+			continue;
+		case ' ':
+		case '\t':
+			/* may be trailing whitespace: buffer it instead of
+			   writing it to dest */
+			i_assert(qp->whitespace->used == 0);
+			qp->state = STATE_WHITESPACE;
+			buffer_append_c(qp->whitespace, src[i]);
+			break;
+		default:
+			continue;
+		}
+		/* state changed: consume this character and stop scanning */
+		ret = i+1;
+		break;
+	}
+	/* copy the remaining plain text. the character that changed the
+	   state (if any) is at src[i] and isn't included. */
+	buffer_append(qp->dest, src+start, i-start);
+	return ret;
+}
+
+/* Append the buffered whitespace to the output and empty the buffer. */
+static void qp_decoder_flush_whitespace(struct qp_decoder *qp)
+{
+	buffer_append_buf(qp->dest, qp->whitespace, 0, (size_t)-1);
+	buffer_set_used_size(qp->whitespace, 0);
+}
+
+/* The pending input turned out not to be valid quoted-printable. Write the
+   pending characters to the output, return to the text state and give the
+   reason in error_r. Must not be called in STATE_TEXT or STATE_WHITESPACE. */
+static void qp_decoder_invalid(struct qp_decoder *qp, const char **error_r)
+{
+	const char *reason = NULL;
+
+	switch (qp->state) {
+	case STATE_EQUALS:
+		buffer_append_c(qp->dest, '=');
+		reason = "'=' not followed by two hex digits";
+		break;
+	case STATE_HEX2:
+		buffer_append_c(qp->dest, '=');
+		buffer_append_c(qp->dest, qp->hexchar);
+		reason = "'=<hex>' not followed by a hex digit";
+		break;
+	case STATE_EQUALS_WHITESPACE:
+		buffer_append_c(qp->dest, '=');
+		qp_decoder_flush_whitespace(qp);
+		reason = "'=<whitespace>' not followed by newline";
+		break;
+	case STATE_SOFTCR:
+		/* same as STATE_CR, except the '=' is also pending */
+		buffer_append_c(qp->dest, '=');
+		/* fall through */
+	case STATE_CR:
+		qp_decoder_flush_whitespace(qp);
+		buffer_append_c(qp->dest, '\r');
+		reason = "CR not followed by LF";
+		break;
+	case STATE_TEXT:
+	case STATE_WHITESPACE:
+		i_unreached();
+	}
+	*error_r = reason;
+	qp->state = STATE_TEXT;
+	i_assert(*error_r != NULL);
+}
+
+/* Returns TRUE if c is a hex digit. Lowercase a-f isn't strictly valid in
+   quoted-printable, but it's allowed anyway. */
+static bool qp_is_hex_digit(unsigned char c)
+{
+	return (c >= '0' && c <= '9') ||
+		(c >= 'A' && c <= 'F') ||
+		(c >= 'a' && c <= 'f');
+}
+
+/* Remember the error and its position, unless an earlier error has already
+   been recorded. Only the first error is returned to the caller. */
+static void
+qp_decoder_record_error(size_t pos, const char *error,
+			size_t *invalid_src_pos_r, const char **error_r)
+{
+	if (*invalid_src_pos_r == (size_t)-1) {
+		*invalid_src_pos_r = pos;
+		*error_r = error;
+	}
+}
+
+/* The pending input can't be continued with the character at pos: write the
+   pending input to the output and record the error. The decoder is back in
+   the text state afterwards, so the caller must process the character at
+   pos again. */
+static void
+qp_decoder_handle_invalid(struct qp_decoder *qp, size_t pos,
+			  size_t *invalid_src_pos_r, const char **error_r)
+{
+	const char *error;
+
+	qp_decoder_invalid(qp, &error);
+	qp_decoder_record_error(pos, error, invalid_src_pos_r, error_r);
+}
+
+/* Decode src_size bytes from src and append the result to the decoder's
+   output buffer. Sequences left unfinished at the end of src are remembered
+   in the decoder state and continued by the next call.
+
+   Returns 0 if no errors were found; invalid_src_pos_r is then (size_t)-1
+   and error_r is NULL. Returns -1 if there were errors; invalid_src_pos_r is
+   then the position in src where the first error was noticed and error_r
+   its description. Decoding continues after errors. */
+int qp_decoder_more(struct qp_decoder *qp, const unsigned char *src,
+		    size_t src_size, size_t *invalid_src_pos_r,
+		    const char **error_r)
+{
+	size_t i;
+
+	*invalid_src_pos_r = (size_t)-1;
+	*error_r = NULL;
+
+	for (i = 0; i < src_size; ) {
+		switch (qp->state) {
+		case STATE_TEXT:
+			i += qp_decoder_more_text(qp, src+i, src_size-i);
+			/* don't increment i any more than we already did,
+			   so continue instead of break */
+			continue;
+		case STATE_WHITESPACE:
+			if (QP_IS_TRAILING_WHITESPACE(src[i])) {
+				/* more whitespace */
+				if (qp->whitespace->used <= QP_MAX_WHITESPACE_LEN)
+					buffer_append_c(qp->whitespace, src[i]);
+			} else if (src[i] == '\r') {
+				qp->state = STATE_CR;
+			} else if (src[i] == '\n') {
+				/* drop the trailing whitespace. the state
+				   stays the same, which is fine now that the
+				   whitespace buffer is empty. */
+				buffer_append(qp->dest, "\r\n", 2);
+				buffer_set_used_size(qp->whitespace, 0);
+			} else {
+				/* this wasn't trailing whitespace.
+				   put it back. */
+				buffer_append_buf(qp->dest, qp->whitespace,
+						  0, (size_t)-1);
+				if (qp->whitespace->used > QP_MAX_WHITESPACE_LEN) {
+					/* we already truncated some of the
+					   whitespace away, because the line
+					   is too long */
+					qp_decoder_record_error(i,
+						"Too much whitespace",
+						invalid_src_pos_r, error_r);
+				}
+				buffer_set_used_size(qp->whitespace, 0);
+				qp->state = STATE_TEXT;
+				continue; /* don't increment i */
+			}
+			break;
+		case STATE_EQUALS:
+			if (qp_is_hex_digit(src[i])) {
+				qp->hexchar = src[i];
+				qp->state = STATE_HEX2;
+			} else if (QP_IS_TRAILING_WHITESPACE(src[i])) {
+				i_assert(qp->whitespace->used == 0);
+				buffer_append_c(qp->whitespace, src[i]);
+				qp->state = STATE_EQUALS_WHITESPACE;
+			} else if (src[i] == '\r') {
+				qp->state = STATE_SOFTCR;
+			} else if (src[i] == '\n') {
+				/* soft line break: nothing is written */
+				qp->state = STATE_TEXT;
+			} else {
+				/* invalid input */
+				qp_decoder_handle_invalid(qp, i,
+					invalid_src_pos_r, error_r);
+				continue; /* don't increment i */
+			}
+			break;
+		case STATE_HEX2:
+			if (qp_is_hex_digit(src[i])) {
+				char data[3];
+
+				data[0] = qp->hexchar;
+				data[1] = src[i];
+				data[2] = '\0';
+				/* both characters were already verified to
+				   be hex digits */
+				if (hex_to_binary(data, qp->dest) < 0)
+					i_unreached();
+				qp->state = STATE_TEXT;
+			} else {
+				/* invalid input */
+				qp_decoder_handle_invalid(qp, i,
+					invalid_src_pos_r, error_r);
+				continue; /* don't increment i */
+			}
+			break;
+		case STATE_EQUALS_WHITESPACE:
+			if (QP_IS_TRAILING_WHITESPACE(src[i])) {
+				/* whitespace past the limit isn't buffered:
+				   if this isn't going to get truncated
+				   anyway, it's going to be an error */
+				if (qp->whitespace->used <= QP_MAX_WHITESPACE_LEN)
+					buffer_append_c(qp->whitespace, src[i]);
+			} else if (src[i] == '\r') {
+				qp->state = STATE_SOFTCR;
+			} else if (src[i] == '\n') {
+				buffer_set_used_size(qp->whitespace, 0);
+				qp->state = STATE_TEXT;
+			} else {
+				/* =<whitespace> not followed by [CR]LF
+				   is invalid. */
+				qp_decoder_handle_invalid(qp, i,
+					invalid_src_pos_r, error_r);
+				continue; /* don't increment i */
+			}
+			break;
+		case STATE_CR:
+		case STATE_SOFTCR:
+			if (src[i] == '\n') {
+				buffer_set_used_size(qp->whitespace, 0);
+				/* only a hard line break is written out */
+				if (qp->state != STATE_SOFTCR)
+					buffer_append(qp->dest, "\r\n", 2);
+				qp->state = STATE_TEXT;
+			} else {
+				qp_decoder_handle_invalid(qp, i,
+					invalid_src_pos_r, error_r);
+				continue; /* don't increment i */
+			}
+			break;
+		}
+		i++;
+	}
+	i_assert((*invalid_src_pos_r == (size_t)-1) == (*error_r == NULL));
+	return *invalid_src_pos_r == (size_t)-1 ? 0 : -1;
+}
+
+/* Flush the decoder at the end of input. An unfinished '=' sequence or a CR
+   without LF is written to the output and reported via error_r with a -1
+   return value. Otherwise returns 0 and sets error_r to NULL; whitespace that
+   is still pending is dropped. The decoder is always left in the text state
+   with an empty whitespace buffer. */
+int qp_decoder_finish(struct qp_decoder *qp, const char **error_r)
+{
+	int ret = 0;
+
+	switch (qp->state) {
+	case STATE_TEXT:
+	case STATE_WHITESPACE:
+		*error_r = NULL;
+		break;
+	default:
+		/* something was left unfinished */
+		qp_decoder_invalid(qp, error_r);
+		ret = -1;
+		break;
+	}
+	buffer_set_used_size(qp->whitespace, 0);
+	qp->state = STATE_TEXT;
+	return ret;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-mail/qp-decoder.h	Sun May 03 14:50:01 2015 +0300
@@ -0,0 +1,19 @@
+#ifndef QP_DECODER_H
+#define QP_DECODER_H
+
+/* Initialize quoted-printable decoder. Write all the decoded output to dest. */
+struct qp_decoder *qp_decoder_init(buffer_t *dest);
+/* Free the decoder. The dest buffer isn't freed. */
+void qp_decoder_deinit(struct qp_decoder **qp);
+
+/* Translate more quoted printable data into binary. Returns 0 if input was
+   valid, -1 if there were some decoding errors (which were skipped over).
+   On error invalid_src_pos_r is set to the position in src where the first
+   error was noticed and error_r to its description. If there were no errors,
+   invalid_src_pos_r is set to (size_t)-1 and error_r to NULL. The decoder
+   keeps its state between calls, so the input can be fed in pieces.
+   LFs without preceding CR are returned as CRLF (but =0A isn't). */
+int qp_decoder_more(struct qp_decoder *qp, const unsigned char *src,
+		    size_t src_size, size_t *invalid_src_pos_r,
+		    const char **error_r);
+/* Finish decoding any pending input. Returns the same as qp_decoder_more().
+   error_r is set to NULL when 0 is returned.
+   This function also resets the entire decoder state, so the same decoder can
+   be used to decode more data if wanted. */
+int qp_decoder_finish(struct qp_decoder *qp, const char **error_r);
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-mail/test-qp-decoder.c	Sun May 03 14:50:01 2015 +0300
@@ -0,0 +1,95 @@
+/* Copyright (c) 2007-2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "qp-decoder.h"
+#include "test-common.h"
+
+/* One decoder test case */
+struct test_quoted_printable_decode_data {
+	const char *input;
+	/* expected decoder output */
+	const char *output;
+	/* expected position of the first error. checked only when the
+	   decoding failed. */
+	size_t error_pos;
+	/* expected result: 0 = input is valid, -1 = decoding error */
+	int ret;
+};
+
+/* Decode each test input twice with the same decoder: first with a single
+   qp_decoder_more() call and then one byte at a time. Both ways must give
+   the expected return value and output. */
+static void test_qp_decoder(void)
+{
+#define WHITESPACE10 "   \t   \t \t"
+#define WHITESPACE70 WHITESPACE10 WHITESPACE10 WHITESPACE10 WHITESPACE10 WHITESPACE10 WHITESPACE10 WHITESPACE10
+	static struct test_quoted_printable_decode_data tests[] = {
+		/* valid input */
+		{ "foo  \r\nbar=\n", "foo\r\nbar", 0, 0 },
+		{ "foo\t=\nbar", "foo\tbar", 0, 0 },
+		{ "foo = \n=01", "foo \001", 0, 0 },
+		{ "foo =\t\r\nbar", "foo bar", 0, 0 },
+		{ "foo =\r\n=01", "foo \001", 0, 0 },
+		{ "foo  \nbar=\r\n", "foo\r\nbar", 0, 0 },
+		{ "=0A=0D  ", "\n\r", 0, 0 },
+		{ "foo_bar", "foo_bar", 0, 0 },
+		{ "\n\n", "\r\n\r\n", 0, 0 },
+		{ "\r\n\n\n\r\n", "\r\n\r\n\r\n\r\n", 0, 0 },
+
+		/* invalid input, which is expected in the output unchanged */
+		{ "foo=", "foo=", 4, -1 },
+		{ "foo= \t", "foo= \t", 6, -1 },
+		{ "foo= \r", "foo= \r", 6, -1 },
+		{ "foo= \r bar", "foo= \r bar", 6, -1 },
+		{ "foo=A", "foo=A", 5, -1 },
+		{ "foo=Ax", "foo=Ax", 5, -1 },
+		{ "foo=Ax=xy", "foo=Ax=xy", 5, -1 },
+
+		/* above 76 whitespaces is invalid and gets truncated
+		   (at 77th whitespace because of the current implementation) */
+		{ WHITESPACE70"      7\n", WHITESPACE70"      7\r\n", 0, 0 },
+		{ WHITESPACE70"       8\n", WHITESPACE70"       8\r\n", 77, -1 },
+		{ WHITESPACE70"        9\n", WHITESPACE70"       9\r\n", 78, -1 },
+		{ WHITESPACE70"         0\n", WHITESPACE70"       0\r\n", 79, -1 }
+	};
+	string_t *str;
+	unsigned int i, j;
+
+	test_begin("qp-decoder");
+	str = t_str_new(128);
+	for (i = 0; i < N_ELEMENTS(tests); i++) {
+		const char *input = tests[i].input;
+		struct qp_decoder *qp = qp_decoder_init(str);
+		size_t error_pos;
+		const char *error;
+		int ret;
+
+		/* try all at once */
+		ret = qp_decoder_more(qp, (const void *)input, strlen(input),
+				      &error_pos, &error);
+		/* an error that is noticed only by finish() is counted as
+		   being at the end of the input */
+		if (qp_decoder_finish(qp, &error) < 0 && ret == 0) {
+			error_pos = strlen(input);
+			ret = -1;
+		}
+		test_assert_idx(ret == tests[i].ret, i);
+		test_assert_idx(ret == 0 || error_pos == tests[i].error_pos, i);
+		test_assert_idx(strcmp(str_c(str), tests[i].output) == 0, i);
+
+		/* try in small pieces. the decoder is reused after the
+		   finish() call above. error positions aren't checked here. */
+		str_truncate(str, 0);
+		ret = 0;
+		for (j = 0; input[j] != '\0'; j++) {
+			unsigned char c = input[j];
+			if (qp_decoder_more(qp, &c, 1, &error_pos, &error) < 0)
+				ret = -1;
+		}
+		if (qp_decoder_finish(qp, &error) < 0)
+			ret = -1;
+		test_assert_idx(ret == tests[i].ret, i);
+		test_assert_idx(strcmp(str_c(str), tests[i].output) == 0, i);
+
+		qp_decoder_deinit(&qp);
+		str_truncate(str, 0);
+	}
+	test_end();
+}
+
+/* Run the qp-decoder tests and return test_run()'s result. */
+int main(void)
+{
+	/* NULL-terminated list of test functions */
+	static void (*test_functions[])(void) = {
+		test_qp_decoder,
+		NULL
+	};
+	return test_run(test_functions);
+}