Mercurial > dovecot > original-hg > dovecot-1.2
changeset 6951:1f70c72e4312 HEAD
Moved uni_utf8_get_valid_data() to lib/
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Sat, 08 Dec 2007 15:45:17 +0200 |
parents | 63e225ab7361 |
children | 08e4d7efcd6a |
files | src/lib-mail/message-decoder.c src/lib/unichar.c src/lib/unichar.h |
diffstat | 3 files changed, 71 insertions(+), 63 deletions(-) [+] |
line wrap: on
line diff
--- a/src/lib-mail/message-decoder.c Sat Dec 08 15:42:25 2007 +0200 +++ b/src/lib-mail/message-decoder.c Sat Dec 08 15:45:17 2007 +0200 @@ -207,65 +207,6 @@ ctx->translation_size = 0; } -static inline unsigned int -is_valid_utf8_seq(const unsigned char *input, unsigned int size) -{ - size_t i, len; - - len = uni_utf8_char_bytes(input[0]); - if (unlikely(len > size)) - return 0; - - for (i = 0; i < len; i++) { - if (unlikely(uni_utf8_char_bytes(input[i]) != len-i)) - return 0; - } - return len; -} - -static const unsigned char * -get_valid_utf8(const unsigned char *input, size_t size, buffer_t *tmpbuf, - size_t *output_size_r) -{ - size_t i, len; - - /* find the first invalid utf8 sequence */ - for (i = 0; i < size;) { - if (input[i] < 0x80) - i++; - else { - len = is_valid_utf8_seq(input + i, size-i); - if (unlikely(len == 0)) - goto broken; - i += len; - } - } - /* we can use it as-is */ - *output_size_r = size; - return input; -broken: - /* broken utf-8 input - skip the broken characters */ - buffer_set_used_size(tmpbuf, 0); - buffer_append(tmpbuf, input, i++); - - while (i < size) { - if (input[i] < 0x80) { - buffer_append_c(tmpbuf, input[i++]); - continue; - } - - len = is_valid_utf8_seq(input + i, size-i); - if (len == 0) { - i++; - continue; - } - buffer_append(tmpbuf, input + i, len); - i += len; - } - *output_size_r = tmpbuf->used; - return tmpbuf->data; -} - static void message_decode_body_init_charset(struct message_decoder_context *ctx) { enum charset_flags flags; @@ -382,13 +323,14 @@ output->data = ctx->buf2->data; output->size = ctx->buf2->used; } else { - output->data = get_valid_utf8(data, size, ctx->buf2, - &output->size); + output->data = + uni_utf8_get_valid_data(data, size, ctx->buf2, + &output->size); } } else if (ctx->charset_trans == NULL) { /* unknown charset */ - output->data = get_valid_utf8(data, size, ctx->buf2, - &output->size); + output->data = uni_utf8_get_valid_data(data, size, ctx->buf2, + &output->size); } else { buffer_set_used_size(ctx->buf2, 0); if (ctx->translation_size != 0)
--- a/src/lib/unichar.c Sat Dec 08 15:42:25 2007 +0200 +++ b/src/lib/unichar.c Sat Dec 08 15:45:17 2007 +0200 @@ -285,3 +285,62 @@ } return 0; } + +static inline unsigned int +is_valid_utf8_seq(const unsigned char *input, unsigned int size) +{ + size_t i, len; + + len = uni_utf8_char_bytes(input[0]); + if (unlikely(len > size)) + return 0; + + for (i = 0; i < len; i++) { + if (unlikely(uni_utf8_char_bytes(input[i]) != len-i)) + return 0; + } + return len; +} + +const unsigned char * +uni_utf8_get_valid_data(const unsigned char *input, size_t size, + buffer_t *tmpbuf, size_t *output_size_r) +{ + size_t i, len; + + /* find the first invalid utf8 sequence */ + for (i = 0; i < size;) { + if (input[i] < 0x80) + i++; + else { + len = is_valid_utf8_seq(input + i, size-i); + if (unlikely(len == 0)) + goto broken; + i += len; + } + } + /* we can use it as-is */ + *output_size_r = size; + return input; +broken: + /* broken utf-8 input - skip the broken characters */ + buffer_set_used_size(tmpbuf, 0); + buffer_append(tmpbuf, input, i++); + + while (i < size) { + if (input[i] < 0x80) { + buffer_append_c(tmpbuf, input[i++]); + continue; + } + + len = is_valid_utf8_seq(input + i, size-i); + if (len == 0) { + i++; + continue; + } + buffer_append(tmpbuf, input + i, len); + i += len; + } + *output_size_r = tmpbuf->used; + return tmpbuf->data; +}
--- a/src/lib/unichar.h Sat Dec 08 15:42:25 2007 +0200 +++ b/src/lib/unichar.h Sat Dec 08 15:45:17 2007 +0200 @@ -40,4 +40,11 @@ int uni_utf8_to_decomposed_titlecase(const void *input, size_t max_len, buffer_t *output); +/* If input contains only valid UTF-8 input, return it directly. If input + contains invalid UTF-8 input, write only valid UTF-8 characters to the + given buffer and return it. */ +const unsigned char * +uni_utf8_get_valid_data(const unsigned char *input, size_t size, + buffer_t *tmpbuf, size_t *output_size_r); + #endif