changeset 9456:23abbf14279c HEAD

str_sanitize(): Don't break UTF-8 input.
author Timo Sirainen <tss@iki.fi>
date Wed, 28 Oct 2009 13:50:57 -0400
parents bd4a6f500c75
children 778a6418f54b
files src/lib/str-sanitize.c
diffstat 1 files changed, 27 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/src/lib/str-sanitize.c	Tue Oct 27 22:47:25 2009 -0400
+++ b/src/lib/str-sanitize.c	Wed Oct 28 13:50:57 2009 -0400
@@ -1,32 +1,52 @@
 /* Copyright (c) 2004-2009 Dovecot authors, see the included COPYING file */
 
 #include "lib.h"
+#include "unichar.h"
 #include "str.h"
 #include "str-sanitize.h"
 
 static size_t str_sanitize_skip_start(const char *src, size_t max_len)
 {
+	unsigned int len;
+	unichar_t chr;
 	size_t i;
 
-	for (i = 0; i < max_len; i++) {
-		if (((unsigned char)src[i] & 0x7f) < 32)
+	for (i = 0; i < max_len; ) {
+		len = uni_utf8_char_bytes(src[i]);
+		if (uni_utf8_get_char(src+i, &chr) <= 0)
 			break;
+		if ((unsigned char)src[i] < 32)
+			break;
+		i += len;
 	}
 	return i;
 }
 
 void str_sanitize_append(string_t *dest, const char *src, size_t max_len)
 {
+	unsigned int len;
+	unichar_t chr;
 	size_t i;
+	int ret;
 
-	i = str_sanitize_skip_start(src, max_len);
-	str_append_n(dest, src, i);
-
-	for (; i < max_len && src[i] != '\0'; i++) {
-		if (((unsigned char)src[i] & 0x7f) < 32)
+	for (i = 0; i < max_len && src[i] != '\0'; ) {
+		len = uni_utf8_char_bytes(src[i]);
+		ret = uni_utf8_get_char(src+i, &chr);
+		if (ret <= 0) {
+			/* invalid UTF-8 */
+			str_append_c(dest, '?');
+			if (ret == 0) {
+				/* input ended too early */
+				return;
+			}
+			i++;
+			continue;
+		}
+		if ((unsigned char)src[i] < 32)
 			str_append_c(dest, '?');
 		else
 			str_append_c(dest, src[i]);
+		i += len;
 	}
 
 	if (src[i] != '\0') {