changeset 10212:f68c2cc1b32b HEAD

str_sanitize(): Don't break UTF-8 input.
author Timo Sirainen <tss@iki.fi>
date Wed, 28 Oct 2009 13:50:55 -0400
parents 104edcb89a70
children 6c32cc350164
files src/lib/str-sanitize.c src/lib/test-str-sanitize.c
diffstat 2 files changed, 38 insertions(+), 12 deletions(-) [+]
line wrap: on
line diff
--- a/src/lib/str-sanitize.c	Tue Oct 27 22:44:39 2009 -0400
+++ b/src/lib/str-sanitize.c	Wed Oct 28 13:50:55 2009 -0400
@@ -1,32 +1,52 @@
 /* Copyright (c) 2004-2009 Dovecot authors, see the included COPYING file */
 
 #include "lib.h"
+#include "unichar.h"
 #include "str.h"
 #include "str-sanitize.h"
 
 static size_t str_sanitize_skip_start(const char *src, size_t max_len)
 {
+	unsigned int len;
+	unichar_t chr;
 	size_t i;
 
-	for (i = 0; i < max_len; i++) {
-		if (((unsigned char)src[i] & 0x7f) < 32)
+	for (i = 0; i < max_len; ) {
+		len = uni_utf8_char_bytes(src[i]);
+		if (uni_utf8_get_char(src+i, &chr) <= 0)
 			break;
+		if ((unsigned char)src[i] < 32)
+			break;
+		i += len;
 	}
 	return i;
 }
 
 void str_sanitize_append(string_t *dest, const char *src, size_t max_len)
 {
+	unsigned int len;
+	unichar_t chr;
 	size_t i;
+	int ret;
 
-	i = str_sanitize_skip_start(src, max_len);
-	str_append_n(dest, src, i);
-
-	for (; i < max_len && src[i] != '\0'; i++) {
-		if (((unsigned char)src[i] & 0x7f) < 32)
+	for (i = 0; i < max_len && src[i] != '\0'; ) {
+		len = uni_utf8_char_bytes(src[i]);
+		ret = uni_utf8_get_char(src+i, &chr);
+		if (ret <= 0) {
+			/* invalid UTF-8 */
+			str_append_c(dest, '?');
+			if (ret == 0) {
+				/* input ended too early */
+				return;
+			}
+			i++;
+			continue;
+		}
+		if ((unsigned char)src[i] < 32)
 			str_append_c(dest, '?');
 		else
 			str_append_c(dest, src[i]);
+		i += len;
 	}
 
 	if (src[i] != '\0') {
--- a/src/lib/test-str-sanitize.c	Tue Oct 27 22:44:39 2009 -0400
+++ b/src/lib/test-str-sanitize.c	Wed Oct 28 13:50:55 2009 -0400
@@ -17,7 +17,10 @@
 		{ "ab", 2 },
 		{ "abc", 2 },
 		{ "abcd", 3 },
-		{ "abcde", 4 }
+		{ "abcde", 4 },
+		{ "с", 10 },
+		{ "с", 1 },
+		{ "\001x\x1fy\x81", 10 }
 	};
 	static const char *output[] = {
 		NULL,
@@ -26,15 +29,18 @@
 		"ab",
 		"...",
 		"...",
-		"a..."
+		"a...",
+		"с",
+		"с",
+		"?x?y?"
 	};
 	const char *str;
 	unsigned int i;
-	bool success;
 
+	test_begin("str_sanitize");
 	for (i = 0; i < N_ELEMENTS(input); i++) {
 		str = str_sanitize(input[i].str, input[i].max_len);
-		success = null_strcmp(output[i], str) == 0;
-		test_out(t_strdup_printf("str_sanitize(%d)", i), success);
+		test_assert(null_strcmp(output[i], str) == 0);
 	}
+	test_end();
 }