changeset 22922:a556724ce39b

lib: Implement str_sanitize_utf8(). Unlike str_sanitize(), this function truncates strings based on a UTF8 code point limit rather than a maximum size in bytes. Also, the Unicode replacement character is used to mark invalid/control characters and an ellipsis character is used to indicate the string truncation. For the normal str_sanitize() this is done using a question mark and triple dots respectively.
author Stephan Bosch <stephan.bosch@dovecot.fi>
date Sat, 14 Apr 2018 02:05:51 +0200
parents 9899d141ec9e
children afb793aaaccb
files src/lib/str-sanitize.c src/lib/str-sanitize.h src/lib/test-str-sanitize.c
diffstat 3 files changed, 146 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/src/lib/str-sanitize.c	Tue Apr 17 21:31:38 2018 +0200
+++ b/src/lib/str-sanitize.c	Sat Apr 14 02:05:51 2018 +0200
@@ -22,6 +22,27 @@
 	return i;
 }
 
+
+/* Returns how many leading bytes of src need no sanitizing: stops at invalid
+   UTF-8, a control char (< 32), max_chars characters, or the NUL. */
+static size_t
+str_sanitize_skip_start_utf8(const char *src, uintmax_t max_chars)
+{
+	uintmax_t count = 0;
+	size_t pos = 0;
+	unichar_t chr;
+
+	while (count < max_chars && src[pos] != '\0') {
+		int len = uni_utf8_get_char(src + pos, &chr);
+		if (len <= 0 || (unsigned char)src[pos] < 32)
+			break;
+		pos += len;
+		count++;
+	}
+	i_assert(count <= max_chars);
+	return pos;
+}
+
 static void str_sanitize_truncate_char(string_t *dest, unsigned int initial_pos)
 {
 	const unsigned char *data = str_data(dest);
@@ -77,6 +98,42 @@
 	}
 }
 
+/* Appends src to dest with control characters and invalid UTF-8 bytes
+   replaced by U+FFFD. At most max_cps code points are appended (an invalid
+   byte counts as one); if src doesn't fit, the last one is U+2026. */
+void str_sanitize_append_utf8(string_t *dest, const char *src,
+			      uintmax_t max_cps)
+{
+	/* start of the last appended code point; initially the end of dest,
+	   so that an early break never truncates pre-existing data */
+	size_t last_pos = str_len(dest);
+	unichar_t chr;
+	uintmax_t c;
+	size_t i;
+
+	i_assert(max_cps > 0);
+	for (i = 0, c = 0; c < max_cps && src[i] != '\0'; c++) {
+		int len = uni_utf8_get_char(src+i, &chr);
+		if (len == 0)
+			break; /* input ended too early */
+		last_pos = str_len(dest);
+		if (len < 0) {
+			/* invalid UTF-8: replace this single byte */
+			str_append(dest, UNICODE_REPLACEMENT_CHAR_UTF8);
+			len = 1;
+		} else if ((unsigned char)src[i] < 32) {
+			str_append(dest, UNICODE_REPLACEMENT_CHAR_UTF8);
+		} else {
+			str_append_n(dest, src+i, len);
+		}
+		i += len;
+	}
+	if (src[i] != '\0') {
+		str_truncate(dest, last_pos);
+		str_append(dest, UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8);
+	}
+}
+
 const char *str_sanitize(const char *src, size_t max_bytes)
 {
 	string_t *str;
@@ -93,3 +150,21 @@
 	str_sanitize_append(str, src, max_bytes);
 	return str_c(str);
 }
+
+/* Returns src itself when it is NULL or already clean and within max_cps
+   code points; otherwise a sanitized copy in a t_str_new() string.
+   max_cps must be > 0 whenever src needs sanitizing. */
+const char *str_sanitize_utf8(const char *src, uintmax_t max_cps)
+{
+	string_t *sanitized;
+
+	if (src == NULL)
+		return NULL;
+	if (src[str_sanitize_skip_start_utf8(src, max_cps)] == '\0')
+		return src; /* nothing to replace or truncate */
+
+	sanitized = t_str_new(I_MIN(max_cps, 256));
+	str_sanitize_append_utf8(sanitized, src, max_cps);
+	return str_c(sanitized);
+}
+
--- a/src/lib/str-sanitize.h	Tue Apr 17 21:31:38 2018 +0200
+++ b/src/lib/str-sanitize.h	Sat Apr 14 02:05:51 2018 +0200
@@ -6,8 +6,17 @@
    src is treated as UTF-8 input, but max_bytes is in bytes instead of
    UTF-8 characters. */
 void str_sanitize_append(string_t *dest, const char *src, size_t max_bytes);
+/* All control characters and invalid UTF-8 bytes in src will be appended as
+   the Unicode replacement character (U+FFFD). If src has more than max_cps
+   Unicode code points, it's truncated with a horizontal ellipsis character
+   (U+2026) appended to the end. max_cps must be larger than 0. */
+void str_sanitize_append_utf8(string_t *dest, const char *src,
+			      uintmax_t max_cps);
 /* Return src sanitized. If there are no changes, src pointer is returned.
    If src is NULL, returns NULL. */
 const char *str_sanitize(const char *src, size_t max_bytes);
+/* The Unicode version of str_sanitize(), limited by code points instead of
+   bytes. Uses str_sanitize_append_utf8() internally. */
+const char *str_sanitize_utf8(const char *src, uintmax_t max_cps);
 
 #endif
--- a/src/lib/test-str-sanitize.c	Tue Apr 17 21:31:38 2018 +0200
+++ b/src/lib/test-str-sanitize.c	Sat Apr 14 02:05:51 2018 +0200
@@ -10,7 +10,7 @@
 	const char *sanitized; /* NULL for no change */
 };
 
-void test_str_sanitize(void)
+static void test_str_sanitize_max_bytes(void)
 {
 	static struct str_sanitize_test tests[] = {
 		{ NULL,    2, NULL },
@@ -64,3 +64,64 @@
 	}
 	test_end();
 }
+/* Table-driven tests for str_sanitize_utf8() and str_sanitize_append_utf8(). */
+static void test_str_sanitize_max_codepoints(void)
+{
+	static const struct str_sanitize_test tests[] = {
+		{ NULL,    2, NULL },
+		{ "",      2, NULL },
+		{ "a",     2, NULL },
+		{ "ab",    2, NULL },
+		{ "abc",   2, "a\xE2\x80\xA6" }, /* E2 80 A6 = U+2026 ellipsis */
+		{ "abcd",  3, "ab\xE2\x80\xA6" },
+		{ "abcde", 4, "abc\xE2\x80\xA6" },
+		{ "\xD1\x81",     1, "\xD1\x81" }, /* a single 2-byte character */
+		{ "\xD1\x81",     2, "\xD1\x81" },
+		{ "\xD1\x81",     3, NULL },
+		{ "\xC3\xA4\xC3\xA4zyxa", 1, "\xE2\x80\xA6" },
+		{ "\xC3\xA4\xC3\xA4zyxa", 2, "\xC3\xA4\xE2\x80\xA6" },
+		{ "\xC3\xA4\xC3\xA4zyxa", 3, "\xC3\xA4\xC3\xA4\xE2\x80\xA6" },
+		{ "\xC3\xA4\xC3\xA4zyxa", 4, "\xC3\xA4\xC3\xA4z\xE2\x80\xA6" },
+		{ "\xC3\xA4\xC3\xA4zyxa", 5, "\xC3\xA4\xC3\xA4zy\xE2\x80\xA6" },
+		{ "\xC3\xA4\xC3\xA4zyxa", 6, "\xC3\xA4\xC3\xA4zyxa" },
+		{ "\xC3\xA4\xC3\xA4zyxa", 7, "\xC3\xA4\xC3\xA4zyxa" },
+		{ "\xC3\xA4\xC3\xA4zyxa", 8, "\xC3\xA4\xC3\xA4zyxa" },
+		{ "\001x\x1fy\x81", 10, "\xEF\xBF\xBDx\xEF\xBF\xBDy\xEF\xBF\xBD" } /* EF BF BD = U+FFFD */
+	};
+	const char *str;
+	string_t *str2;
+	unsigned int i;
+	/* Pass 1: a NULL expectation means src itself must be returned. */
+	test_begin("str_sanitize_utf8");
+	for (i = 0; i < N_ELEMENTS(tests); i++) {
+		str = str_sanitize_utf8(tests[i].str, tests[i].max_len);
+		if (tests[i].sanitized != NULL)
+			test_assert_idx(null_strcmp(str, tests[i].sanitized) == 0, i);
+		else /* unchanged input must come back as the same pointer */
+			test_assert_idx(str == tests[i].str, i);
+	}
+	test_end();
+	/* Pass 2: same table, appending after an existing prefix in str2. */
+	test_begin("str_sanitize_append_utf8");
+	str2 = t_str_new(128);
+	for (i = 0; i < N_ELEMENTS(tests); i++) {
+		if (tests[i].str == NULL)
+			continue; /* only str_sanitize_utf8() checks for NULL */
+		str_truncate(str2, 0);
+		str_append(str2, "1234567890"); /* prefix that must stay intact */
+		str_sanitize_append_utf8(str2, tests[i].str, tests[i].max_len);
+
+		test_assert_idx(strncmp(str_c(str2), "1234567890", 10) == 0, i);
+		if (tests[i].sanitized != NULL)
+			test_assert_idx(strcmp(str_c(str2)+10, tests[i].sanitized) == 0, i);
+		else /* NULL expectation: appended text equals the input */
+			test_assert_idx(strcmp(str_c(str2)+10, tests[i].str) == 0, i);
+	}
+	test_end();
+}
+
+void test_str_sanitize(void) /* runs both str_sanitize test groups in order */
+{
+	test_str_sanitize_max_bytes(); /* the former test_str_sanitize() body */
+	test_str_sanitize_max_codepoints(); /* the *_utf8() variants */
+}