Mercurial > dovecot > original-hg > dovecot-1.2
comparison src/plugins/fts-solr/fts-backend-solr.c @ 9604:cf0da2cd31fb HEAD
fts-solr: Replace characters not valid for XML with replacement char.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Fri, 20 Aug 2010 20:38:26 +0100 |
parents | 00cd9aacd03c |
children | ac0855b1bbea |
comparison
equal
deleted
inserted
replaced
9603:5efba9f9f0a7 | 9604:cf0da2cd31fb |
---|---|
2 | 2 |
3 #include "lib.h" | 3 #include "lib.h" |
4 #include "array.h" | 4 #include "array.h" |
5 #include "str.h" | 5 #include "str.h" |
6 #include "strescape.h" | 6 #include "strescape.h" |
7 #include "unichar.h" | |
7 #include "mail-storage-private.h" | 8 #include "mail-storage-private.h" |
8 #include "mail-namespace.h" | 9 #include "mail-namespace.h" |
9 #include "solr-connection.h" | 10 #include "solr-connection.h" |
10 #include "fts-solr-plugin.h" | 11 #include "fts-solr-plugin.h" |
11 | 12 |
72 fts_box_name_get_root(&ns, &name); | 73 fts_box_name_get_root(&ns, &name); |
73 *ns_r = ns; | 74 *ns_r = ns; |
74 return name; | 75 return name; |
75 } | 76 } |
76 | 77 |
78 static bool is_valid_xml_char(unichar_t chr) | |
79 { | |
80 /* Valid characters in XML: | |
81 | |
82 #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | | |
83 [#x10000-#x10FFFF] | |
84 | |
85 This function gets called only for #x80 and higher */ | |
86 if (chr > 0xd7ff && chr < 0xe000) | |
87 return FALSE; | |
88 if (chr > 0xfffd && chr < 0x10000) | |
89 return FALSE; | |
90 return chr < 0x10ffff; | |
91 } | |
92 | |
77 static void | 93 static void |
78 xml_encode_data(string_t *dest, const unsigned char *data, unsigned int len) | 94 xml_encode_data(string_t *dest, const unsigned char *data, unsigned int len) |
79 { | 95 { |
96 unichar_t chr; | |
80 unsigned int i; | 97 unsigned int i; |
81 | 98 |
82 for (i = 0; i < len; i++) { | 99 for (i = 0; i < len; i++) { |
83 switch (data[i]) { | 100 switch (data[i]) { |
84 case '&': | 101 case '&': |
99 default: | 116 default: |
100 if (data[i] < 32) { | 117 if (data[i] < 32) { |
101 /* SOLR doesn't like control characters. | 118 /* SOLR doesn't like control characters. |
102 replace them with spaces. */ | 119 replace them with spaces. */ |
103 str_append_c(dest, ' '); | 120 str_append_c(dest, ' '); |
121 } else if (data[i] >= 0x80) { | |
122 /* make sure the character is valid for XML | |
123 so we don't get XML parser errors */ | |
124 unsigned int char_len = | |
125 uni_utf8_char_bytes(data[0]); | |
126 if (i + char_len <= len && | |
127 uni_utf8_get_char_n(data, len, &chr) == 0 && | |
128 is_valid_xml_char(chr)) | |
129 str_append_n(dest, data + i, char_len); | |
130 else { | |
131 str_append_n(dest, utf8_replacement_char, | |
132 UTF8_REPLACEMENT_CHAR_LEN); | |
133 } | |
134 i += char_len - 1; | |
104 } else { | 135 } else { |
105 str_append_c(dest, data[i]); | 136 str_append_c(dest, data[i]); |
106 } | 137 } |
107 break; | 138 break; |
108 } | 139 } |
140 i += uni_utf8_char_bytes(data[0]); | |
109 } | 141 } |
110 } | 142 } |
111 | 143 |
112 static void xml_encode(string_t *dest, const char *str) | 144 static void xml_encode(string_t *dest, const char *str) |
113 { | 145 { |