# HG changeset patch
# User Stephan Bosch
# Date 1523664351 -7200
# Node ID a556724ce39b6e02b9b49986d39cbfe00796cfc9
# Parent  9899d141ec9e2e3ea244af75c5a7f681cc635dd7
lib: Implement str_sanitize_utf8().

Unlike str_sanitize(), this function truncates strings based on a UTF8 code
point limit rather than a maximum size in bytes. Also, the Unicode replacement
character is used to mark invalid/control characters and an ellipsis character
is used to indicate the string truncation. For the normal str_sanitize() this is
done using a question mark and triple dots respectively.

diff -r 9899d141ec9e -r a556724ce39b src/lib/str-sanitize.c
--- a/src/lib/str-sanitize.c	Tue Apr 17 21:31:38 2018 +0200
+++ b/src/lib/str-sanitize.c	Sat Apr 14 02:05:51 2018 +0200
@@ -22,6 +22,30 @@
 	return i;
 }
 
+
+/* Return the byte length of the leading part of src that needs no
+   sanitizing: at most max_chars valid UTF-8 code points, none of which
+   starts with a byte below 32. */
+static size_t
+str_sanitize_skip_start_utf8(const char *src, uintmax_t max_chars)
+{
+	unichar_t chr;
+	uintmax_t c;
+	size_t i;
+
+	for (i = 0, c = 0; c < max_chars && src[i] != '\0'; ) {
+		int len = uni_utf8_get_char(src+i, &chr);
+		if (len <= 0)
+			break;
+		if ((unsigned char)src[i] < 32)
+			break;
+		c++;
+		i += len;
+	}
+	i_assert(c <= max_chars);
+	return i;
+}
+
 static void str_sanitize_truncate_char(string_t *dest, unsigned int initial_pos)
 {
 	const unsigned char *data = str_data(dest);
@@ -77,6 +101,50 @@
 	}
 }
 
+void str_sanitize_append_utf8(string_t *dest, const char *src,
+			      uintmax_t max_cps)
+{
+	/* Start of the most recently appended code point. Begins at the
+	   current end of dest, so that truncation can never remove data
+	   that was in dest before this call. */
+	size_t last_pos = str_len(dest);
+	unichar_t chr;
+	uintmax_t c;
+	size_t i;
+
+	i_assert(max_cps > 0);
+
+	for (i = 0, c = 0; c < max_cps && src[i] != '\0'; ) {
+		int len = uni_utf8_get_char(src+i, &chr);
+		if (len == 0)
+			break; /* input ended too early */
+
+		last_pos = str_len(dest);
+		if (len < 0) {
+			/* invalid UTF-8: each bad byte becomes one
+			   replacement code point in dest, so count it
+			   against max_cps as well */
+			str_append(dest, UNICODE_REPLACEMENT_CHAR_UTF8);
+			i++;
+			c++;
+			continue;
+		}
+		if ((unsigned char)src[i] < 32)
+			str_append(dest, UNICODE_REPLACEMENT_CHAR_UTF8);
+		else
+			str_append_n(dest, src+i, len);
+		i += len;
+		c++;
+	}
+
+	if (src[i] != '\0') {
+		/* input remains (limit reached or incomplete sequence): drop
+		   the last code point appended here, if any, and add ellipsis */
+		str_truncate(dest, last_pos);
+		str_append(dest, UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8);
+	}
+}
+
 const char *str_sanitize(const char *src, size_t max_bytes)
 {
 	string_t *str;
@@ -93,3 +161,22 @@
 	str_sanitize_append(str, src, max_bytes);
 	return str_c(str);
 }
+
+const char *str_sanitize_utf8(const char *src, uintmax_t max_cps)
+{
+	string_t *str;
+	size_t i;
+
+	if (src == NULL)
+		return NULL;
+
+	/* if the whole string is clean and within the limit, return it as is */
+	i = str_sanitize_skip_start_utf8(src, max_cps);
+	if (src[i] == '\0')
+		return src;
+
+	str = t_str_new(I_MIN(max_cps, 256));
+	str_sanitize_append_utf8(str, src, max_cps);
+	return str_c(str);
+}
+
diff -r 9899d141ec9e -r a556724ce39b src/lib/str-sanitize.h
--- a/src/lib/str-sanitize.h	Tue Apr 17 21:31:38 2018 +0200
+++ b/src/lib/str-sanitize.h	Sat Apr 14 02:05:51 2018 +0200
@@ -6,8 +6,17 @@
    src is treated as UTF-8 input, but max_bytes is in bytes instead of
    UTF-8 characters. */
 void str_sanitize_append(string_t *dest, const char *src, size_t max_bytes);
+/* All control characters and invalid UTF-8 bytes in src will be appended as
+   the unicode replacement character (U+FFFD). If src has more than max_cps
+   unicode code points, it's truncated with a horizontal ellipsis character
+   (U+2026) appended to the end. Existing content of dest is never removed. */
+void str_sanitize_append_utf8(string_t *dest, const char *src,
+			      uintmax_t max_cps);
 /* Return src sanitized. If there are no changes, src pointer is returned.
    If src is NULL, returns NULL. */
 const char *str_sanitize(const char *src, size_t max_bytes);
+/* The unicode version of str_sanitize() using str_sanitize_append_utf8()
+   internally. */
+const char *str_sanitize_utf8(const char *src, uintmax_t max_cps);
 
 #endif
diff -r 9899d141ec9e -r a556724ce39b src/lib/test-str-sanitize.c
--- a/src/lib/test-str-sanitize.c	Tue Apr 17 21:31:38 2018 +0200
+++ b/src/lib/test-str-sanitize.c	Sat Apr 14 02:05:51 2018 +0200
@@ -10,7 +10,7 @@
 	const char *sanitized; /* NULL for no change */
 };
 
-void test_str_sanitize(void)
+static void test_str_sanitize_max_bytes(void)
 {
 	static struct str_sanitize_test tests[] = {
 		{ NULL, 2, NULL },
@@ -64,3 +64,68 @@
 	}
 	test_end();
 }
+
+static void test_str_sanitize_max_codepoints(void)
+{
+	static const struct str_sanitize_test tests[] = {
+		{ NULL, 2, NULL },
+		{ "", 2, NULL },
+		{ "a", 2, NULL },
+		{ "ab", 2, NULL },
+		{ "abc", 2, "a\xE2\x80\xA6" },
+		{ "abcd", 3, "ab\xE2\x80\xA6" },
+		{ "abcde", 4, "abc\xE2\x80\xA6" },
+		{ "\xD1\x81", 1, "\xD1\x81" },
+		{ "\xD1\x81", 2, "\xD1\x81" },
+		{ "\xD1\x81", 3, NULL },
+		{ "\xC3\xA4\xC3\xA4zyxa", 1, "\xE2\x80\xA6" },
+		{ "\xC3\xA4\xC3\xA4zyxa", 2, "\xC3\xA4\xE2\x80\xA6" },
+		{ "\xC3\xA4\xC3\xA4zyxa", 3, "\xC3\xA4\xC3\xA4\xE2\x80\xA6" },
+		{ "\xC3\xA4\xC3\xA4zyxa", 4, "\xC3\xA4\xC3\xA4z\xE2\x80\xA6" },
+		{ "\xC3\xA4\xC3\xA4zyxa", 5, "\xC3\xA4\xC3\xA4zy\xE2\x80\xA6" },
+		{ "\xC3\xA4\xC3\xA4zyxa", 6, "\xC3\xA4\xC3\xA4zyxa" },
+		{ "\xC3\xA4\xC3\xA4zyxa", 7, "\xC3\xA4\xC3\xA4zyxa" },
+		{ "\xC3\xA4\xC3\xA4zyxa", 8, "\xC3\xA4\xC3\xA4zyxa" },
+		{ "\001x\x1fy\x81", 10, "\xEF\xBF\xBDx\xEF\xBF\xBDy\xEF\xBF\xBD" },
+		/* invalid bytes count against max_cps */
+		{ "\x81\x81\x81", 2, "\xEF\xBF\xBD\xE2\x80\xA6" },
+		/* incomplete sequence must not truncate existing dest data */
+		{ "\xC3", 2, "\xE2\x80\xA6" }
+	};
+	const char *str;
+	string_t *str2;
+	unsigned int i;
+
+	test_begin("str_sanitize_utf8");
+	for (i = 0; i < N_ELEMENTS(tests); i++) {
+		str = str_sanitize_utf8(tests[i].str, tests[i].max_len);
+		if (tests[i].sanitized != NULL)
+			test_assert_idx(null_strcmp(str, tests[i].sanitized) == 0, i);
+		else
+			test_assert_idx(str == tests[i].str, i);
+	}
+	test_end();
+
+	test_begin("str_sanitize_append_utf8");
+	str2 = t_str_new(128);
+	for (i = 0; i < N_ELEMENTS(tests); i++) {
+		if (tests[i].str == NULL)
+			continue;
+		str_truncate(str2, 0);
+		str_append(str2, "1234567890");
+		str_sanitize_append_utf8(str2, tests[i].str, tests[i].max_len);
+
+		test_assert_idx(strncmp(str_c(str2), "1234567890", 10) == 0, i);
+		if (tests[i].sanitized != NULL)
+			test_assert_idx(strcmp(str_c(str2)+10, tests[i].sanitized) == 0, i);
+		else
+			test_assert_idx(strcmp(str_c(str2)+10, tests[i].str) == 0, i);
+	}
+	test_end();
+}
+
+void test_str_sanitize(void)
+{
+	test_str_sanitize_max_bytes();
+	test_str_sanitize_max_codepoints();
+}