Mercurial > dovecot > original-hg > dovecot-1.2
changeset 9604:cf0da2cd31fb HEAD
fts-solr: Replace characters not valid for XML with replacement char.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Fri, 20 Aug 2010 20:38:26 +0100 |
parents | 5efba9f9f0a7 |
children | 7e959d397a35 |
files | src/plugins/fts-solr/fts-backend-solr.c |
diffstat | 1 files changed, 32 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/src/plugins/fts-solr/fts-backend-solr.c	Fri Aug 20 20:37:31 2010 +0100
+++ b/src/plugins/fts-solr/fts-backend-solr.c	Fri Aug 20 20:38:26 2010 +0100
@@ -4,6 +4,7 @@
 #include "array.h"
 #include "str.h"
 #include "strescape.h"
+#include "unichar.h"
 #include "mail-storage-private.h"
 #include "mail-namespace.h"
 #include "solr-connection.h"
@@ -74,9 +75,25 @@
 	return name;
 }
 
+static bool is_valid_xml_char(unichar_t chr)
+{
+	/* Valid characters in XML:
+
+	   #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
+	   [#x10000-#x10FFFF]
+
+	   This function gets called only for #x80 and higher */
+	if (chr > 0xd7ff && chr < 0xe000)
+		return FALSE;
+	if (chr > 0xfffd && chr < 0x10000)
+		return FALSE;
+	return chr < 0x10ffff;
+}
+
 static void
 xml_encode_data(string_t *dest, const unsigned char *data, unsigned int len)
 {
+	unichar_t chr;
 	unsigned int i;
 
 	for (i = 0; i < len; i++) {
@@ -101,11 +118,26 @@
 				/* SOLR doesn't like control characters.
 				   replace them with spaces. */
 				str_append_c(dest, ' ');
+			} else if (data[i] >= 0x80) {
+				/* make sure the character is valid for XML
+				   so we don't get XML parser errors */
+				unsigned int char_len =
+					uni_utf8_char_bytes(data[0]);
+				if (i + char_len <= len &&
+				    uni_utf8_get_char_n(data, len, &chr) == 0 &&
+				    is_valid_xml_char(chr))
+					str_append_n(dest, data + i, char_len);
+				else {
+					str_append_n(dest, utf8_replacement_char,
+						     UTF8_REPLACEMENT_CHAR_LEN);
+				}
+				i += char_len - 1;
 			} else {
 				str_append_c(dest, data[i]);
 			}
 			break;
 		}
+		i += uni_utf8_char_bytes(data[0]);
 	}
 }
 
```