changeset 7185:6f014a866f38 HEAD

Replace invalid UTF8 input with a replacement character.
author Timo Sirainen <tss@iki.fi>
date Tue, 22 Jan 2008 09:31:59 +0200
parents 7416737df8b8
children d48c419a27ca
files src/lib/unichar.c src/lib/unichar.h
diffstat 2 files changed, 27 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/src/lib/unichar.c	Tue Jan 22 08:49:24 2008 +0200
+++ b/src/lib/unichar.c	Tue Jan 22 09:31:59 2008 +0200
@@ -260,6 +260,22 @@
 	return TRUE;
 }
 
+static void output_add_replacement_char(buffer_t *output)
+{
+	/* U+FFFD (the Unicode replacement character) encoded as UTF-8 */
+	static const unsigned char replacement_utf8[] = { 0xef, 0xbf, 0xbd };
+#define REPLACEMENT_UTF8_LEN 3 /* number of bytes in replacement_utf8 */
+
+	if (output->used >= REPLACEMENT_UTF8_LEN &&
+	    memcmp(CONST_PTR_OFFSET(output->data,
+				    output->used - REPLACEMENT_UTF8_LEN),
+		   replacement_utf8, REPLACEMENT_UTF8_LEN) == 0) {
+		/* output already ends with the replacement char - don't add it again */
+		return;
+	}
+	buffer_append(output, replacement_utf8, REPLACEMENT_UTF8_LEN);
+}
+
 int uni_utf8_to_decomposed_titlecase(const void *_input, size_t max_len,
 				     buffer_t *output)
 {
@@ -273,6 +289,7 @@
 			/* invalid input. try the next byte. */
 			ret = -1;
 			input++; max_len--;
+			output_add_replacement_char(output);
 			continue;
 		}
 		bytes = uni_utf8_char_bytes(*input);
@@ -327,6 +344,7 @@
 	/* broken utf-8 input - skip the broken characters */
 	buffer_append(buf, input, i++);
 
+	output_add_replacement_char(buf);
 	while (i < size) {
 		if (input[i] < 0x80) {
 			buffer_append_c(buf, input[i++]);
@@ -336,6 +354,7 @@
 		len = is_valid_utf8_seq(input + i, size-i);
 		if (len == 0) {
 			i++;
+			output_add_replacement_char(buf);
 			continue;
 		}
 		buffer_append(buf, input + i, len);
--- a/src/lib/unichar.h	Tue Jan 22 08:49:24 2008 +0200
+++ b/src/lib/unichar.h	Tue Jan 22 09:31:59 2008 +0200
@@ -1,6 +1,9 @@
 #ifndef UNICHAR_H
 #define UNICHAR_H
 
+/* Character used to replace invalid input. */
+#define UNICODE_REPLACEMENT_CHAR 0xfffd
+
 typedef uint32_t unichar_t;
 ARRAY_DEFINE_TYPE(unichars, unichar_t);
 
@@ -37,13 +40,14 @@
 
 /* Convert UTF-8 input to titlecase and decompose the titlecase characters to
    output buffer. Returns 0 if ok, -1 if input was invalid. This generates
-   output that's compatible with i;unicode-casemap comparator. */
+   output that's compatible with i;unicode-casemap comparator. Invalid input
+   is replaced with unicode replacement character (0xfffd). */
 int uni_utf8_to_decomposed_titlecase(const void *input, size_t max_len,
 				     buffer_t *output);
 
-/* If input contains only valid UTF-8 characters, return TRUE. If input
-   contains invalid UTF-8 characters, write only the valid ones to buf and
-   return FALSE. */
+/* If input contains only valid UTF-8 characters, return TRUE without updating
+   buf. If input contains invalid UTF-8 characters, replace them with unicode
+   replacement character (0xfffd), write the output to buf and return FALSE. */
 bool uni_utf8_get_valid_data(const unsigned char *input, size_t size,
 			     buffer_t *buf);