changeset 7187:d9b87e3ce6c8 HEAD

Treat replacement characters as non-indexed chars.
author Timo Sirainen <tss@iki.fi>
date Tue, 22 Jan 2008 09:33:40 +0200
parents d48c419a27ca
children febb2592e616
files src/plugins/fts-squat/squat-trie.c
diffstat 1 files changed, 13 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/src/plugins/fts-squat/squat-trie.c	Tue Jan 22 09:32:27 2008 +0200
+++ b/src/plugins/fts-squat/squat-trie.c	Tue Jan 22 09:33:40 2008 +0200
@@ -821,12 +821,23 @@
 squat_data_normalize(struct squat_trie *trie, const unsigned char *data,
 		     unsigned int size)
 {
+	static const unsigned char replacement_utf8[] = { 0xef, 0xbf, 0xbd };
 	unsigned char *dest;
 	unsigned int i;
 
 	dest = t_malloc(size);
-	for (i = 0; i < size; i++)
-		dest[i] = trie->hdr.normalize_map[data[i]];
+	for (i = 0; i < size; i++) {
+		if (data[i] == replacement_utf8[0] && i + 2 < size &&
+		    data[i+1] == replacement_utf8[1] &&
+		    data[i+2] == replacement_utf8[2]) {
+			/* Don't index replacement character */
+			dest[i++] = 0;
+			dest[i++] = 0;
+			dest[i] = 0;
+		} else {
+			dest[i] = trie->hdr.normalize_map[data[i]];
+		}
+	}
 	return dest;
 }