Mercurial > dovecot > core-2.2
changeset 4608:61e0fe257a83 HEAD
Added message-decoder to decode message's contents into readable UTF-8.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Sun, 17 Sep 2006 19:23:44 +0300 |
parents | 71b3570946f8 |
children | 48a16f1254b5 |
files | src/lib-mail/Makefile.am src/lib-mail/message-decoder.c src/lib-mail/message-decoder.h |
diffstat | 3 files changed, 345 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/src/lib-mail/Makefile.am Sun Sep 17 19:22:31 2006 +0300 +++ b/src/lib-mail/Makefile.am Sun Sep 17 19:23:44 2006 +0300 @@ -10,6 +10,7 @@ message-body-search.c \ message-content-parser.c \ message-date.c \ + message-decoder.c \ message-header-decode.c \ message-header-parser.c \ message-header-search.c \ @@ -28,6 +29,7 @@ message-body-search.h \ message-content-parser.h \ message-date.h \ + message-decoder.h \ message-header-decode.h \ message-header-parser.h \ message-header-search.h \
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lib-mail/message-decoder.c Sun Sep 17 19:23:44 2006 +0300 @@ -0,0 +1,323 @@ +/* Copyright (C) 2006 Timo Sirainen */ + +#include "lib.h" +#include "buffer.h" +#include "strescape.h" +#include "base64.h" +#include "charset-utf8.h" +#include "quoted-printable.h" +#include "message-parser.h" +#include "message-content-parser.h" +#include "message-header-decode.h" +#include "message-decoder.h" + +enum content_type { + CONTENT_TYPE_UNKNOWN = 0, + CONTENT_TYPE_BINARY, + CONTENT_TYPE_QP, + CONTENT_TYPE_BASE64 +}; + +/* Both base64 and q-p takes max 3 bytes per character */ +#define MAX_ENCODING_BUF_SIZE 2 + +/* UTF-8 takes max 5 bytes per character. Not sure about others, but I'd think + 10 is more than enough for everyone.. */ +#define MAX_TRANSLATION_BUF_SIZE 10 + +struct message_decoder_context { + struct message_header_line hdr; + buffer_t *buf, *buf2; + + struct charset_translation *charset_trans; + char translation_buf[MAX_TRANSLATION_BUF_SIZE]; + unsigned int translation_size; + + char encoding_buf[MAX_ENCODING_BUF_SIZE]; + unsigned int encoding_size; + + char *content_charset; + enum content_type content_type; + + unsigned int charset_utf8:1; +}; + +struct message_decoder_context *message_decoder_init(void) +{ + struct message_decoder_context *ctx; + + ctx = i_new(struct message_decoder_context, 1); + ctx->buf = buffer_create_dynamic(default_pool, 8192); + ctx->buf2 = buffer_create_dynamic(default_pool, 8192); + return ctx; +} + +void message_decoder_deinit(struct message_decoder_context **_ctx) +{ + struct message_decoder_context *ctx = *_ctx; + + *_ctx = NULL; + + buffer_free(ctx->buf); + buffer_free(ctx->buf2); + i_free(ctx); +} + +static bool +message_decode_header_callback(const unsigned char *data, size_t size, + const char *charset, void *context) +{ + struct message_decoder_context *ctx = context; + struct charset_translation *t; + bool unknown_charset; + + if (charset == NULL || strcasecmp(charset, "UTF-8") == 0) { + /* ASCII */ + buffer_append(ctx->buf, data, size); + return TRUE; + } + + t = charset_to_utf8_begin(charset, &unknown_charset); + if (unknown_charset) { + /* let's just ignore this part */ + return TRUE; + } + + /* ignore any errors */ + (void)charset_to_ucase_utf8_full(t, data, &size, ctx->buf); + charset_to_utf8_end(&t); + return TRUE; +} + +static void parse_content_encoding(const unsigned char *value, size_t value_len, + void *context) +{ + struct message_decoder_context *ctx = context; + + ctx->content_type = CONTENT_TYPE_UNKNOWN; + + switch (value_len) { + case 4: + if (memcasecmp(value, "7bit", 4) == 0 || + memcasecmp(value, "8bit", 4) == 0) + ctx->content_type = CONTENT_TYPE_BINARY; + break; + case 6: + if (memcasecmp(value, "base64", 6) == 0) + ctx->content_type = CONTENT_TYPE_BASE64; + else if (memcasecmp(value, "binary", 6) == 0) + ctx->content_type = CONTENT_TYPE_BINARY; + break; + case 16: + if (memcasecmp(value, "quoted-printable", 16) == 0) + ctx->content_type = CONTENT_TYPE_QP; + break; + } +} + +static void +parse_content_type_param(const unsigned char *name, size_t name_len, + const unsigned char *value, size_t value_len, + bool value_quoted, void *context) +{ + struct message_decoder_context *ctx = context; + + if (name_len == 7 && memcasecmp(name, "charset", 7) == 0 && + ctx->content_charset == NULL) { + ctx->content_charset = i_strndup(value, value_len); + if (value_quoted) str_unescape(ctx->content_charset); + + ctx->charset_utf8 = charset_is_utf8(ctx->content_charset); + } +} + +static bool message_decode_header(struct message_decoder_context *ctx, + struct message_header_line *hdr, + struct message_block *output) +{ + if (hdr->continues) { + hdr->use_full_value = TRUE; + return FALSE; + } + + if (hdr->name_len == 12 && + strcasecmp(hdr->name, "Content-Type") == 0) { + message_content_parse_header(hdr->full_value, + hdr->full_value_len, + NULL, + parse_content_type_param, ctx); + } + if (hdr->name_len == 25 && + strcasecmp(hdr->name, "Content-Transfer-Encoding") == 0) { + message_content_parse_header(hdr->full_value, + hdr->full_value_len, + parse_content_encoding, + NULL, ctx); + } + + buffer_set_used_size(ctx->buf, 0); + message_header_decode(hdr->full_value, hdr->full_value_len, + message_decode_header_callback, ctx); + + ctx->hdr = *hdr; + ctx->hdr.full_value = ctx->buf->data; + ctx->hdr.full_value_len = ctx->buf->used; + ctx->hdr.value_len = 0; + + output->hdr = &ctx->hdr; + return TRUE; +} + +static void translation_buf_decode(struct message_decoder_context *ctx, + const unsigned char **data, size_t *size) +{ + unsigned char trans_buf[MAX_TRANSLATION_BUF_SIZE+1]; + size_t pos, skip; + + /* @UNSAFE */ + memcpy(trans_buf, ctx->translation_buf, ctx->translation_size); + skip = sizeof(trans_buf) - ctx->translation_size; + if (skip > *size) + skip = *size; + memcpy(trans_buf + ctx->translation_size, data, skip); + + pos = *size; + (void)charset_to_ucase_utf8_full(ctx->charset_trans, + *data, &pos, ctx->buf2); + + i_assert(pos > ctx->translation_size); + skip = (ctx->translation_size + skip) - pos; + + i_assert(*size >= skip); + *data += skip; + *size -= skip; + + ctx->translation_size = 0; +} + +static bool message_decode_body(struct message_decoder_context *ctx, + struct message_block *input, + struct message_block *output) +{ + unsigned char new_buf[MAX_ENCODING_BUF_SIZE+1]; + const unsigned char *data = NULL; + size_t pos, size = 0, skip = 0; + bool unknown_charset; + + if (ctx->charset_trans == NULL && !ctx->charset_utf8) { + ctx->charset_trans = + charset_to_utf8_begin(ctx->content_charset != NULL ? + ctx->content_charset : "UTF-8", + &unknown_charset); + } + + if (ctx->encoding_size != 0) { + /* @UNSAFE */ + memcpy(new_buf, ctx->encoding_buf, ctx->encoding_size); + skip = sizeof(new_buf) - ctx->encoding_size; + if (skip > input->size) + skip = input->size; + memcpy(new_buf + ctx->encoding_size, input->data, skip); + } + + switch (ctx->content_type) { + case CONTENT_TYPE_UNKNOWN: + /* just skip this body */ + return FALSE; + + case CONTENT_TYPE_BINARY: + data = input->data; + size = pos = input->size; + break; + case CONTENT_TYPE_QP: + buffer_set_used_size(ctx->buf, 0); + if (ctx->encoding_size != 0) { + quoted_printable_decode(new_buf, + ctx->encoding_size + skip, + &pos, ctx->buf); + i_assert(pos > ctx->encoding_size); + skip = (ctx->encoding_size + skip) - pos; + } + + quoted_printable_decode(input->data, input->size, + &pos, ctx->buf); + pos += skip; + data = ctx->buf->data; + size = ctx->buf->used; + break; + case CONTENT_TYPE_BASE64: + buffer_set_used_size(ctx->buf, 0); + if (ctx->encoding_size != 0) { + if (base64_decode(new_buf, ctx->encoding_size + skip, + &pos, ctx->buf) < 0) { + /* corrupted base64 data, don't bother with + the rest of it */ + return FALSE; + } + i_assert(pos > ctx->encoding_size); + skip = (ctx->encoding_size + skip) - pos; + } + if (base64_decode(input->data + skip, input->size - skip, + &pos, ctx->buf) < 0) { + /* corrupted base64 data, don't bother with + the rest of it */ + return FALSE; + } + pos += skip; + data = ctx->buf->data; + size = ctx->buf->used; + break; + } + + if (pos != input->size) { + /* @UNSAFE */ + ctx->encoding_size = input->size - pos; + i_assert(ctx->encoding_size <= sizeof(ctx->encoding_buf)); + memcpy(ctx->encoding_buf, input->data + pos, + ctx->encoding_size); + } + + if (ctx->charset_utf8) { + output->data = data; + output->size = size; + } else { + buffer_set_used_size(ctx->buf2, 0); + if (ctx->translation_size != 0) + translation_buf_decode(ctx, &data, &size); + + pos = size; + (void)charset_to_ucase_utf8_full(ctx->charset_trans, + data, &pos, ctx->buf2); + if (pos != size) { + ctx->translation_size = size - pos; + i_assert(ctx->translation_size <= + sizeof(ctx->translation_buf)); + memcpy(ctx->translation_buf, data + pos, + ctx->translation_size); + } + output->data = ctx->buf2->data; + output->size = ctx->buf2->used; + } + + output->hdr = NULL; + return TRUE; +} + +bool message_decoder_decode_next_block(struct message_decoder_context *ctx, + struct message_block *input, + struct message_block *output) +{ + if (input->part != output->part) { + /* MIME part changed. */ + i_free_and_null(ctx->content_charset); + ctx->content_type = CONTENT_TYPE_BINARY; + ctx->charset_utf8 = TRUE; + } + + output->part = input->part; + + if (input->hdr != NULL) + return message_decode_header(ctx, input->hdr, output); + else + return message_decode_body(ctx, input, output); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lib-mail/message-decoder.h Sun Sep 17 19:23:44 2006 +0300 @@ -0,0 +1,20 @@ +#ifndef __MESSAGE_DECODER_H +#define __MESSAGE_DECODER_H + +struct message_block; + +/* Decode message's contents as UTF-8, both the headers and the MIME bodies. + The bodies are decoded from quoted-printable and base64 formats if needed. */ +struct message_decoder_context *message_decoder_init(void); +void message_decoder_deinit(struct message_decoder_context **ctx); + +/* Decode input and return decoded output. Headers are returned only in their + full multiline forms. + + Returns TRUE if output is given, FALSE if more data is needed. If the input + ends in a partial character, it's returned in the next output. */ +bool message_decoder_decode_next_block(struct message_decoder_context *ctx, + struct message_block *input, + struct message_block *output); + +#endif