comparison src/plugins/fts-solr/fts-backend-solr.c @ 9604:cf0da2cd31fb HEAD

fts-solr: Replace characters not valid for XML with replacement char.
author Timo Sirainen <tss@iki.fi>
date Fri, 20 Aug 2010 20:38:26 +0100
parents 00cd9aacd03c
children ac0855b1bbea
comparison
equal deleted inserted replaced
9603:5efba9f9f0a7 9604:cf0da2cd31fb
2 2
3 #include "lib.h" 3 #include "lib.h"
4 #include "array.h" 4 #include "array.h"
5 #include "str.h" 5 #include "str.h"
6 #include "strescape.h" 6 #include "strescape.h"
7 #include "unichar.h"
7 #include "mail-storage-private.h" 8 #include "mail-storage-private.h"
8 #include "mail-namespace.h" 9 #include "mail-namespace.h"
9 #include "solr-connection.h" 10 #include "solr-connection.h"
10 #include "fts-solr-plugin.h" 11 #include "fts-solr-plugin.h"
11 12
72 fts_box_name_get_root(&ns, &name); 73 fts_box_name_get_root(&ns, &name);
73 *ns_r = ns; 74 *ns_r = ns;
74 return name; 75 return name;
75 } 76 }
76 77
78 static bool is_valid_xml_char(unichar_t chr)
79 {
80 /* Valid characters in XML:
81
82 #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
83 [#x10000-#x10FFFF]
84
85 This function gets called only for #x80 and higher */
86 if (chr > 0xd7ff && chr < 0xe000)
87 return FALSE;
88 if (chr > 0xfffd && chr < 0x10000)
89 return FALSE;
90 return chr < 0x10ffff;
91 }
92
77 static void 93 static void
78 xml_encode_data(string_t *dest, const unsigned char *data, unsigned int len) 94 xml_encode_data(string_t *dest, const unsigned char *data, unsigned int len)
79 { 95 {
96 unichar_t chr;
80 unsigned int i; 97 unsigned int i;
81 98
82 for (i = 0; i < len; i++) { 99 for (i = 0; i < len; i++) {
83 switch (data[i]) { 100 switch (data[i]) {
84 case '&': 101 case '&':
99 default: 116 default:
100 if (data[i] < 32) { 117 if (data[i] < 32) {
101 /* SOLR doesn't like control characters. 118 /* SOLR doesn't like control characters.
102 replace them with spaces. */ 119 replace them with spaces. */
103 str_append_c(dest, ' '); 120 str_append_c(dest, ' ');
121 } else if (data[i] >= 0x80) {
122 /* make sure the character is valid for XML
123 so we don't get XML parser errors */
124 unsigned int char_len =
125 uni_utf8_char_bytes(data[0]);
126 if (i + char_len <= len &&
127 uni_utf8_get_char_n(data, len, &chr) == 0 &&
128 is_valid_xml_char(chr))
129 str_append_n(dest, data + i, char_len);
130 else {
131 str_append_n(dest, utf8_replacement_char,
132 UTF8_REPLACEMENT_CHAR_LEN);
133 }
134 i += char_len - 1;
104 } else { 135 } else {
105 str_append_c(dest, data[i]); 136 str_append_c(dest, data[i]);
106 } 137 }
107 break; 138 break;
108 } 139 }
140 i += uni_utf8_char_bytes(data[0]);
109 } 141 }
110 } 142 }
111 143
112 static void xml_encode(string_t *dest, const char *str) 144 static void xml_encode(string_t *dest, const char *str)
113 { 145 {