Mercurial > dovecot > original-hg > dovecot-1.2
changeset 7185:6f014a866f38 HEAD
Replace invalid UTF8 input with a replacement character.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Tue, 22 Jan 2008 09:31:59 +0200 |
parents | 7416737df8b8 |
children | d48c419a27ca |
files | src/lib/unichar.c src/lib/unichar.h |
diffstat | 2 files changed, 27 insertions(+), 4 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/src/lib/unichar.c	Tue Jan 22 08:49:24 2008 +0200
+++ b/src/lib/unichar.c	Tue Jan 22 09:31:59 2008 +0200
@@ -260,6 +260,22 @@
 	return TRUE;
 }
 
+static void output_add_replacement_char(buffer_t *output)
+{
+	/* 0xfffd */
+	static const unsigned char replacement_utf8[] = { 0xef, 0xbf, 0xbd };
+#define REPLACEMENT_UTF8_LEN 3
+
+	if (output->used >= REPLACEMENT_UTF8_LEN &&
+	    memcmp(CONST_PTR_OFFSET(output->data,
+				    output->used - REPLACEMENT_UTF8_LEN),
+		   replacement_utf8, REPLACEMENT_UTF8_LEN) == 0) {
+		/* don't add the replacement char multiple times */
+		return;
+	}
+	buffer_append(output, replacement_utf8, REPLACEMENT_UTF8_LEN);
+}
+
 int uni_utf8_to_decomposed_titlecase(const void *_input, size_t max_len,
 				     buffer_t *output)
 {
@@ -273,6 +289,7 @@
 			/* invalid input. try the next byte. */
 			ret = -1;
 			input++; max_len--;
+			output_add_replacement_char(output);
 			continue;
 		}
 		bytes = uni_utf8_char_bytes(*input);
@@ -327,6 +344,7 @@
 
 	/* broken utf-8 input - skip the broken characters */
 	buffer_append(buf, input, i++);
+	output_add_replacement_char(buf);
 	while (i < size) {
 		if (input[i] < 0x80) {
 			buffer_append_c(buf, input[i++]);
@@ -336,6 +354,7 @@
 		len = is_valid_utf8_seq(input + i, size-i);
 		if (len == 0) {
 			i++;
+			output_add_replacement_char(buf);
 			continue;
 		}
 		buffer_append(buf, input + i, len);
```
```diff
--- a/src/lib/unichar.h	Tue Jan 22 08:49:24 2008 +0200
+++ b/src/lib/unichar.h	Tue Jan 22 09:31:59 2008 +0200
@@ -1,6 +1,9 @@
 #ifndef UNICHAR_H
 #define UNICHAR_H
 
+/* Character used to replace invalid input. */
+#define UNICODE_REPLACEMENT_CHAR 0xfffd
+
 typedef uint32_t unichar_t;
 ARRAY_DEFINE_TYPE(unichars, unichar_t);
 
@@ -37,13 +40,14 @@
 
 /* Convert UTF-8 input to titlecase and decompose the titlecase characters to
    output buffer. Returns 0 if ok, -1 if input was invalid. This generates
-   output that's compatible with i;unicode-casemap comparator. */
+   output that's compatible with i;unicode-casemap comparator. Invalid input
+   is replaced with unicode replacement character (0xfffd). */
 int uni_utf8_to_decomposed_titlecase(const void *input, size_t max_len,
 				     buffer_t *output);
 
-/* If input contains only valid UTF-8 characters, return TRUE. If input
-   contains invalid UTF-8 characters, write only the valid ones to buf and
-   return FALSE. */
+/* If input contains only valid UTF-8 characters, return TRUE without updating
+   buf. If input contains invalid UTF-8 characters, replace them with unicode
+   replacement character (0xfffd), write the output to buf and return FALSE. */
 bool uni_utf8_get_valid_data(const unsigned char *input, size_t size,
 			     buffer_t *buf);
 
```