changeset 13129:4ed44f06c54c

fts: Don't pass NUL bytes to FTS backend. It can confuse them.
author Timo Sirainen <tss@iki.fi>
date Wed, 03 Aug 2011 18:59:07 +0300
parents 62c8eadd09d2
children 3b4612e2a25a
files src/plugins/fts/fts-build.c src/plugins/fts/fts-parser.c src/plugins/fts/fts-parser.h
diffstat 3 files changed, 68 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/src/plugins/fts/fts-build.c	Wed Aug 03 18:58:45 2011 +0300
+++ b/src/plugins/fts/fts-build.c	Wed Aug 03 18:59:07 2011 +0300
@@ -56,6 +56,31 @@
 		fts_build_parse_content_disposition(ctx, hdr);
 }
 
+/* Feed an unstructured header value to the FTS backend, with any NUL
+   bytes replaced by spaces. hdr->full_value itself is left untouched. */
+static void
+fts_build_unstructured_header(struct fts_storage_build_context *ctx,
+			      const struct message_header_line *hdr)
+{
+	const unsigned char *data = hdr->full_value;
+	unsigned char *buf = NULL;
+	unsigned int i;
+
+	/* @UNSAFE: on the first NUL, copy the value to buf and fix it there */
+	for (i = 0; i < hdr->full_value_len; i++) {
+		if (data[i] != '\0')
+			continue;
+		if (buf == NULL) {
+			buf = i_malloc(hdr->full_value_len);
+			memcpy(buf, data, hdr->full_value_len);
+		}
+		buf[i] = ' ';
+	}
+	(void)fts_backend_update_build_more(ctx->update_ctx,
+			buf != NULL ? buf : data, hdr->full_value_len);
+	i_free(buf);
+}
+
 static void fts_build_mail_header(struct fts_storage_build_context *ctx,
 				  const struct message_block *block)
 {
@@ -78,9 +103,8 @@
 
 	if (!message_header_is_address(hdr->name)) {
 		/* regular unstructured header */
-		(void)fts_backend_update_build_more(ctx->update_ctx,
-						    hdr->full_value,
-						    hdr->full_value_len);
+		// FIXME: get rid of potential NULs
+		fts_build_unstructured_header(ctx, hdr);
 	} else T_BEGIN {
 		/* message address. normalize it to give better
 		   search results. */
@@ -136,6 +160,8 @@
 		*binary_body_r = TRUE;
 		key.type = FTS_BACKEND_BUILD_KEY_BODY_PART_BINARY;
 	}
+	if (ctx->body_parser == NULL)
+		ctx->body_parser = fts_parser_text_init();
 	key.body_content_type = content_type;
 	key.body_content_disposition = ctx->content_disposition;
 	return fts_backend_update_set_build_key(ctx->update_ctx, &key);
--- a/src/plugins/fts/fts-parser.c	Wed Aug 03 18:58:45 2011 +0300
+++ b/src/plugins/fts/fts-parser.c	Wed Aug 03 18:59:07 2011 +0300
@@ -26,11 +26,41 @@
 	return FALSE;
 }
 
+struct fts_parser *fts_parser_text_init(void)
+{
+	return i_new(struct fts_parser, 1); /* presumably zeroed: no v.* set */
+}
+
+/* Tells whether the given data contains any NUL bytes.
+   Returns TRUE if at least one of the first size bytes is '\0'. */
+static bool data_has_nuls(const unsigned char *data, size_t size)
+{
+	for (; size > 0; size--, data++) {
+		if (*data == '\0')
+			return TRUE;
+	}
+	return FALSE;
+}
+
+/* Overwrites each NUL byte in buf's modifiable data with a space. */
+static void replace_nul_bytes(buffer_t *buf)
+{
+	size_t left;
+	unsigned char *pos = buffer_get_modifiable_data(buf, &left);
+
+	for (; left > 0; left--, pos++) {
+		if (*pos == '\0')
+			*pos = ' ';
+	}
+}
+
 void fts_parser_more(struct fts_parser *parser, struct message_block *block)
 {
-	parser->v.more(parser, block);
+	if (parser->v.more != NULL)
+		parser->v.more(parser, block);
 
-	if (!uni_utf8_data_is_valid(block->data, block->size)) {
+	if (!uni_utf8_data_is_valid(block->data, block->size) ||
+	    data_has_nuls(block->data, block->size)) {
 		/* output isn't valid UTF-8. make it. */
 		if (parser->utf8_output == NULL) {
 			parser->utf8_output =
@@ -40,6 +70,7 @@
 		}
 		(void)uni_utf8_get_valid_data(block->data, block->size,
 					      parser->utf8_output);
+		replace_nul_bytes(parser->utf8_output);
 		block->data = parser->utf8_output->data;
 		block->size = parser->utf8_output->used;
 	}
@@ -53,5 +84,8 @@
 
 	if (parser->utf8_output != NULL)
 		buffer_free(&parser->utf8_output);
-	parser->v.deinit(parser);
+	if (parser->v.deinit != NULL)
+		parser->v.deinit(parser);
+	else
+		i_free(parser);
 }
--- a/src/plugins/fts/fts-parser.h	Wed Aug 03 18:58:45 2011 +0300
+++ b/src/plugins/fts/fts-parser.h	Wed Aug 03 18:59:07 2011 +0300
@@ -23,6 +23,8 @@
 bool fts_parser_init(struct mail_user *user,
 		     const char *content_type, const char *content_disposition,
 		     struct fts_parser **parser_r);
+struct fts_parser *fts_parser_text_init(void);
+
 /* The parser is initially called with message body blocks. Once message is
    finished, it's still called with incoming size=0 while the parser increases
    it to non-zero. */