# HG changeset patch # User Timo Sirainen # Date 1225566721 -7200 # Node ID 81c0fe5bd635995615a822f4a5bb481a15b2671a # Parent 59fc12b2b08ab6e6e28b4571bbf3a155590b3afe Added code for encoding and decoding IMAP's modified-UTF7 strings. diff -r 59fc12b2b08a -r 81c0fe5bd635 src/lib-imap/Makefile.am --- a/src/lib-imap/Makefile.am Sat Nov 01 21:11:36 2008 +0200 +++ b/src/lib-imap/Makefile.am Sat Nov 01 21:12:01 2008 +0200 @@ -15,6 +15,7 @@ imap-parser.c \ imap-quote.c \ imap-seqset.c \ + imap-utf7.c \ imap-util.c headers = \ @@ -27,6 +28,7 @@ imap-parser.h \ imap-quote.h \ imap-seqset.h \ + imap-utf7.h \ imap-util.h if INSTALL_HEADERS diff -r 59fc12b2b08a -r 81c0fe5bd635 src/lib-imap/imap-utf7.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lib-imap/imap-utf7.c Sat Nov 01 21:12:01 2008 +0200 @@ -0,0 +1,249 @@ +/* Copyright (c) 2008 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "str.h" +#include "unichar.h" +#include "imap-utf7.h" + +static const char imap_b64enc[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; + +#define XX 0xff +static const unsigned char imap_b64dec[256] = { + XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, + XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, + XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,62, 63,XX,XX,XX, + 52,53,54,55, 56,57,58,59, 60,61,XX,XX, XX,XX,XX,XX, + XX, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14, + 15,16,17,18, 19,20,21,22, 23,24,25,XX, XX,XX,XX,XX, + XX,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40, + 41,42,43,44, 45,46,47,48, 49,50,51,XX, XX,XX,XX,XX, + XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, + XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, + XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, + XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, + XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, + XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, + XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, + XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX, XX,XX,XX,XX +}; + +static void +mbase64_encode(string_t *dest, const unsigned char *in, unsigned int len) +{ + str_append_c(dest, '&'); + while (len >= 3) { + str_append_c(dest, imap_b64enc[in[0] >> 2]); + str_append_c(dest, imap_b64enc[((in[0] & 3) << 4) | + (in[1] >> 4)]); + str_append_c(dest, imap_b64enc[((in[1] & 0x0f) << 2) | + ((in[2] & 0xc0) >> 6)]); + str_append_c(dest, imap_b64enc[in[2] & 0x3f]); + in += 3; + len -= 3; + } + if (len > 0) { + str_append_c(dest, imap_b64enc[in[0] >> 2]); + if (len == 1) + str_append_c(dest, imap_b64enc[(in[0] & 0x03) << 4]); + else { + str_append_c(dest, imap_b64enc[((in[0] & 0x03) << 4) | + (in[1] >> 4)]); + str_append_c(dest, imap_b64enc[(in[1] & 0x0f) << 2]); + } + } + str_append_c(dest, '-'); +} + +int imap_utf8_to_utf7(const char *src, string_t *dest) +{ + const char *p; + unichar_t chr; + uint8_t *utf16, *u; + uint16_t u16; + + for (p = src; *p != '\0'; p++) { + if (*p == '&' || (unsigned char)*p >= 0x80) + break; + } + if (*p == '\0') { + /* no ASCII characters that need to be encoded */ + str_append(dest, src); + return 0; + } + + /* at least one encoded character */ + str_append_n(dest, src, p-src); + utf16 = t_malloc(strlen(p)*2); + while (*p != '\0') { + if (*p == '&') { + str_append(dest, "&-"); + p++; + continue; + } + if ((unsigned char)*p < 0x80) { + str_append_c(dest, *p); + p++; + continue; + } + + u = utf16; + while ((unsigned char)*p >= 0x80) { + if (uni_utf8_get_char(p, &chr) <= 0) + return -1; + /* @UNSAFE */ + if (chr < UTF16_SURROGATE_BASE) { + *u++ = chr >> 8; + *u++ = chr & 0xff; + } else { + u16 = UTF16_SURROGATE_HIGH(chr); + *u++ = u16 >> 8; + *u++ = u16 & 0xff; + u16 = UTF16_SURROGATE_LOW(chr); + *u++ = u16 >> 8; + *u++ = u16 & 0xff; + } + p += uni_utf8_char_bytes(*p); + } + mbase64_encode(dest, utf16, u-utf16); + } + return 0; +} + +static int utf16buf_to_utf8(string_t *dest, const unsigned char output[4], + unsigned int *_pos, unsigned int len) +{ + unsigned int pos = *_pos; + uint16_t high, low; + unichar_t chr; + + if (len % 2 != 0) + return -1; + + high = (output[pos % 4] << 8) | output[(pos+1) % 4]; + if (high < UTF16_SURROGATE_HIGH_FIRST || + high > UTF16_SURROGATE_HIGH_MAX) { + /* single byte */ + uni_ucs4_to_utf8_c(high, dest); + *_pos = (pos + 2) % 4; + return 0; + } + + if (high > UTF16_SURROGATE_HIGH_LAST) + return -1; + if (len != 4) { + /* missing the second character */ + return -1; + } + + low = (output[(pos+2)%4] << 8) | output[(pos+3) % 4]; + if (low < UTF16_SURROGATE_LOW_FIRST || low > UTF16_SURROGATE_LOW_LAST) + return -1; + + chr = UTF16_SURROGATE_BASE + + (((high & UTF16_SURROGATE_MASK) << UTF16_SURROGATE_SHIFT) | + (low & UTF16_SURROGATE_MASK)); + uni_ucs4_to_utf8_c(chr, dest); + return 0; +} + +static int mbase64_decode_to_utf8(string_t *dest, const char **_src) +{ + const char *src = *_src; + unsigned char input[4], output[4]; + unsigned int outstart = 0, outpos = 0; + + while (*src != '-') { + input[0] = imap_b64dec[(uint8_t)src[0]]; + input[1] = imap_b64dec[(uint8_t)src[1]]; + if (input[0] == 0xff || input[1] == 0xff) + return -1; + + output[outpos % 4] = (input[0] << 2) | (input[1] >> 4); + if (++outpos % 4 == outstart) { + if (utf16buf_to_utf8(dest, output, &outstart, 4) < 0) + return -1; + } + + input[2] = imap_b64dec[(uint8_t)src[2]]; + if (input[2] == 0xff) { + if (src[2] != '-') + return -1; + + src += 2; + break; + } + + output[outpos % 4] = (input[1] << 4) | (input[2] >> 2); + if (++outpos % 4 == outstart) { + if (utf16buf_to_utf8(dest, output, &outstart, 4) < 0) + return -1; + } + + input[3] = imap_b64dec[(uint8_t)src[3]]; + if (input[3] == 0xff) { + if (src[3] != '-') + return -1; + + src += 3; + break; + } + + output[outpos % 4] = ((input[2] << 6) & 0xc0) | input[3]; + if (++outpos % 4 == outstart) { + if (utf16buf_to_utf8(dest, output, &outstart, 4) < 0) + return -1; + } + + src += 4; + } + if (outstart != outpos % 4) { + if (utf16buf_to_utf8(dest, output, &outstart, + (4 + outpos - outstart) % 4) < 0) + return -1; + } + + /* found ending '-' */ + *_src = src + 1; + return 0; +} + +int imap_utf7_to_utf8(const char *src, string_t *dest) +{ + const char *p; + + for (p = src; *p != '\0'; p++) { + if (*p == '&' || (unsigned char)*p >= 0x80) + break; + } + if (*p == '\0') { + /* no IMAP-UTF-7 encoded characters */ + str_append(dest, src); + return 0; + } + if ((unsigned char)*p >= 0x80) { + /* 8bit characters - the input is broken */ + return -1; + } + + /* at least one encoded character */ + str_append_n(dest, src, p-src); + while (*p != '\0') { + if (*p == '&') { + if (*++p == '-') { + str_append_c(dest, '&'); + p++; + } else { + if (mbase64_decode_to_utf8(dest, &p) < 0) + return -1; + if (p[0] == '&' && p[1] != '-') { + /* &...-& */ + return -1; + } + } + } else { + str_append_c(dest, *p++); + } + } + return 0; +} diff -r 59fc12b2b08a -r 81c0fe5bd635 src/lib-imap/imap-utf7.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lib-imap/imap-utf7.h Sat Nov 01 21:12:01 2008 +0200 @@ -0,0 +1,11 @@ +#ifndef IMAP_UTF7_H +#define IMAP_UTF7_H + +/* Convert an UTF-8 string to IMAP-UTF-7. Returns 0 if ok, -1 if src isn't + valid UTF-8. */ +int imap_utf8_to_utf7(const char *src, string_t *dest); +/* Convert IMAP-UTF-7 string to UTF-8. Returns 0 if ok, -1 if src isn't + valid IMAP-UTF-7. */ +int imap_utf7_to_utf8(const char *src, string_t *dest); + +#endif diff -r 59fc12b2b08a -r 81c0fe5bd635 src/tests/test-imap.c --- a/src/tests/test-imap.c Sat Nov 01 21:11:36 2008 +0200 +++ b/src/tests/test-imap.c Sat Nov 01 21:12:01 2008 +0200 @@ -1,7 +1,10 @@ /* Copyright (c) 2008 Dovecot authors, see the included COPYING file */ #include "lib.h" +#include "str.h" +#include "unichar.h" #include "imap-match.h" +#include "imap-utf7.h" #include "test-common.h" struct test_imap_match { @@ -75,10 +78,99 @@ } } +static void test_imap_utf7(void) +{ + static const char *to_utf7[] = { + "&&x&&", "&-&-x&-&-", + "~peter/mail/台北/日本語", "~peter/mail/&U,BTFw-/&ZeVnLIqe-", + "tietäjä", "tiet&AOQ-j&AOQ-", + "p", NULL, + NULL + }; + static const char *invalid_utf7[] = { + "&Jjo!", + "&U,BTFw-&ZeVnLIqe-", + NULL + }; + string_t *src, *dest; + const char *orig_src; + unsigned int i, j; + unichar_t chr; + bool success, all_success = TRUE; + + src = t_str_new(256); + dest = t_str_new(256); + + for (i = 0; to_utf7[i] != NULL; i += 2) { + str_truncate(dest, 0); + if (imap_utf8_to_utf7(to_utf7[i], dest) < 0) + success = to_utf7[i+1] == NULL; + else { + success = to_utf7[i+1] != NULL && + strcmp(to_utf7[i+1], str_c(dest)) == 0; + } + if (!success) { + test_out(t_strdup_printf("imap_utf8_to_utf7(%d)", i/2), + FALSE); + all_success = FALSE; + } else if (to_utf7[i+1] != NULL) { + str_truncate(dest, 0); + if (imap_utf7_to_utf8(to_utf7[i+1], dest) < 0 || + strcmp(to_utf7[i], str_c(dest)) != 0) { + test_out(t_strdup_printf("imap_utf7_to_utf8(%d)", i/2), + FALSE); + all_success = FALSE; + } + } + } + if (all_success) + test_out("imap_utf8_to_utf7()", TRUE); + + success = TRUE; + for (chr = 0xffff; chr <= 0x10010; chr++) { + for (i = 1; i <= 10; i++) { + str_truncate(src, 0); + str_truncate(dest, 0); + for (j = 0; j < i; j++) { + if (j % 3 == 0) + str_append_c(src, 'x'); + if (j % 5 == 0) + str_append_c(src, '&'); + uni_ucs4_to_utf8_c(chr, src); + } + + orig_src = t_strdup(str_c(src)); + str_truncate(src, 0); + + if (imap_utf8_to_utf7(orig_src, dest) < 0) + success = FALSE; + else if (imap_utf7_to_utf8(str_c(dest), src) < 0) + success = FALSE; + else + success = strcmp(str_c(src), orig_src) == 0; + if (!success) + goto end; + } + } +end: + test_out("imap_utf7_to_utf8(reverse)", success); + for (i = 0; invalid_utf7[i] != NULL; i++) { + str_truncate(dest, 0); + if (imap_utf7_to_utf8(invalid_utf7[i], dest) == 0) { + test_out(t_strdup_printf("imap_utf7_to_utf8(invalid.%d)", i), + FALSE); + all_success = FALSE; + } + } + if (all_success) + test_out("imap_utf7_to_utf8(invalid)", TRUE); +} + int main(void) { test_init(); test_imap_match(); + test_imap_utf7(); return test_deinit(); }