Mercurial > dovecot > original-hg > dovecot-1.2
changeset 6915:671c2eb25f3d HEAD
Remove illegal UTF-8 sequences from output.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Mon, 03 Dec 2007 15:41:01 +0200 |
parents | 9c3f0e180751 |
children | 0b8a78914db7 |
files | src/lib-mail/message-decoder.c |
diffstat | 1 files changed, 64 insertions(+), 4 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/src/lib-mail/message-decoder.c	Mon Dec 03 15:06:27 2007 +0200
+++ b/src/lib-mail/message-decoder.c	Mon Dec 03 15:41:01 2007 +0200
@@ -209,6 +209,65 @@
 	ctx->translation_size = 0;
 }
 
+static inline unsigned int
+is_valid_utf8_seq(const unsigned char *input, unsigned int size)
+{
+	size_t i, len;
+
+	len = uni_utf8_char_bytes(input[0]);
+	if (unlikely(len > size))
+		return 0;
+
+	for (i = 0; i < len; i++) {
+		if (unlikely(uni_utf8_char_bytes(input[i]) != len-i))
+			return 0;
+	}
+	return len;
+}
+
+static const unsigned char *
+get_valid_utf8(const unsigned char *input, size_t size, buffer_t *tmpbuf,
+	       size_t *output_size_r)
+{
+	size_t i, len;
+
+	/* find the first invalid utf8 sequence */
+	for (i = 0; i < size;) {
+		if (input[i] < 0x80)
+			i++;
+		else {
+			len = is_valid_utf8_seq(input + i, size-i);
+			if (unlikely(len == 0))
+				goto broken;
+			i += len;
+		}
+	}
+	/* we can use it as-is */
+	*output_size_r = size;
+	return input;
+broken:
+	/* broken utf-8 input - skip the broken characters */
+	buffer_set_used_size(tmpbuf, 0);
+	buffer_append(tmpbuf, input, i++);
+
+	while (i < size) {
+		if (input[i] < 0x80) {
+			buffer_append_c(tmpbuf, input[i++]);
+			continue;
+		}
+
+		len = is_valid_utf8_seq(input + i, size-i);
+		if (len == 0) {
+			i++;
+			continue;
+		}
+		buffer_append(tmpbuf, input + i, len);
+		i += len;
+	}
+	*output_size_r = tmpbuf->used;
+	return tmpbuf->data;
+}
+
 static bool message_decode_body(struct message_decoder_context *ctx,
 				struct message_block *input,
 				struct message_block *output)
@@ -309,12 +368,13 @@
 			output->data = ctx->buf2->data;
 			output->size = ctx->buf2->used;
 		} else {
-			output->data = data;
-			output->size = size;
+			output->data = get_valid_utf8(data, size, ctx->buf2,
+						      &output->size);
 		}
 	} else if (ctx->charset_trans == NULL) {
-		output->data = data;
-		output->size = size;
+		/* unknown charset */
+		output->data = get_valid_utf8(data, size, ctx->buf2,
+					      &output->size);
 	} else {
 		buffer_set_used_size(ctx->buf2, 0);
 		if (ctx->translation_size != 0)
```