changeset 6915:671c2eb25f3d HEAD

Remove illegal UTF-8 sequences from output.
author Timo Sirainen <tss@iki.fi>
date Mon, 03 Dec 2007 15:41:01 +0200
parents 9c3f0e180751
children 0b8a78914db7
files src/lib-mail/message-decoder.c
diffstat 1 files changed, 64 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/src/lib-mail/message-decoder.c	Mon Dec 03 15:06:27 2007 +0200
+++ b/src/lib-mail/message-decoder.c	Mon Dec 03 15:41:01 2007 +0200
@@ -209,6 +209,65 @@
 	ctx->translation_size = 0;
 }
 
+static inline unsigned int
+is_valid_utf8_seq(const unsigned char *input, unsigned int size)
+{
+	/* Length of the multi-byte UTF-8 sequence at input, or 0 if it is
+	   malformed or longer than size. Expects input[0] >= 0x80. */
+	size_t i, len = uni_utf8_char_bytes(input[0]);
+
+	if (unlikely(len < 2 || len > size))
+		return 0; /* not a multi-byte lead byte, or truncated */
+	for (i = 1; i < len; i++) {
+		if (unlikely((input[i] & 0xc0) != 0x80))
+			return 0; /* trailing bytes must be 10xxxxxx */
+	}
+	return len;
+}
+
+/* Return input unchanged if it is entirely valid UTF-8; otherwise build a
+   copy in tmpbuf with the invalid bytes removed. Length in *output_size_r. */
+static const unsigned char *
+get_valid_utf8(const unsigned char *input, size_t size, buffer_t *tmpbuf,
+	       size_t *output_size_r)
+{
+	size_t pos = 0, seq_len;
+
+	/* Pass 1: measure the leading run of well-formed data. */
+	while (pos < size) {
+		seq_len = input[pos] < 0x80 ? 1 :
+			is_valid_utf8_seq(input + pos, size - pos);
+		if (unlikely(seq_len == 0))
+			break;
+		pos += seq_len;
+	}
+	if (pos >= size) {
+		/* nothing had to be dropped - hand back the caller's buffer */
+		*output_size_r = size;
+		return input;
+	}
+
+	/* Pass 2: keep the clean prefix, skip the offending byte, then copy
+	   the rest while discarding every byte that is not part of a valid
+	   sequence. */
+	buffer_set_used_size(tmpbuf, 0);
+	buffer_append(tmpbuf, input, pos);
+	for (pos++; pos < size; pos += seq_len) {
+		if (input[pos] < 0x80) {
+			buffer_append_c(tmpbuf, input[pos]);
+			seq_len = 1;
+			continue;
+		}
+		seq_len = is_valid_utf8_seq(input + pos, size - pos);
+		if (seq_len == 0)
+			seq_len = 1; /* drop one byte and resync */
+		else
+			buffer_append(tmpbuf, input + pos, seq_len);
+	}
+	*output_size_r = tmpbuf->used;
+	return tmpbuf->data;
+}
+
 static bool message_decode_body(struct message_decoder_context *ctx,
 				struct message_block *input,
 				struct message_block *output)
@@ -309,12 +368,13 @@
 			output->data = ctx->buf2->data;
 			output->size = ctx->buf2->used;
 		} else {
-			output->data = data;
-			output->size = size;
+			output->data = get_valid_utf8(data, size, ctx->buf2,
+						      &output->size);
 		}
 	} else if (ctx->charset_trans == NULL) {
-		output->data = data;
-		output->size = size;
+		/* unknown charset */
+		output->data = get_valid_utf8(data, size, ctx->buf2,
+					      &output->size);
 	} else {
 		buffer_set_used_size(ctx->buf2, 0);
 		if (ctx->translation_size != 0)