Mercurial > dovecot > core-2.2
annotate src/lib-fts/fts-language.c @ 18417:cf04173f3f69
lib-fts: Fixed default textcat datadir paths.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Mon, 20 Apr 2015 17:01:12 +0300 |
parents | 81e5b977e5c5 |
children | 50ef619ce58a |
rev | line source |
---|---|
18414 | 1 /* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */ |
2 | |
3 #include "lib.h" | |
4 #include "array.h" | |
5 #include "fts-language.h" | |
6 #include "strfuncs.h" | |
7 #include "llist.h" | |
8 | |
9 #ifdef HAVE_LIBEXTTEXTCAT_TEXTCAT_H | |
10 # include <libexttextcat/textcat.h> | |
11 # define HAVE_TEXTCAT | |
12 #elif defined (HAVE_LIBTEXTCAT_TEXTCAT_H) | |
13 # include <libtextcat/textcat.h> | |
14 # define HAVE_TEXTCAT | |
15 #elif defined (HAVE_FTS_TEXTCAT) | |
16 # include <textcat.h> | |
17 # define HAVE_TEXTCAT | |
18 #endif | |
19 | |
/* Old textcat.h versions misspell TEXTCAT_RESULT_UNKNOWN as
   TEXTCAT_RESULT_UNKOWN. Provide the correctly spelled name for them. */
#if !defined(TEXTCAT_RESULT_UNKNOWN) && defined(TEXTCAT_RESULT_UNKOWN)
#  define TEXTCAT_RESULT_UNKNOWN TEXTCAT_RESULT_UNKOWN
#endif

/* Upper limit for the number of input bytes that are given to textcat when
   detecting the language. */
#define DETECT_STR_MAX_LEN 200
27 | |
28 struct fts_language_list { | |
29 pool_t pool; | |
30 ARRAY_TYPE(fts_language) languages; | |
31 const char *textcat_config; | |
32 const char *textcat_datadir; | |
33 void *textcat_handle; | |
34 bool textcat_failed; | |
35 }; | |
36 | |
37 const struct fts_language fts_languages[] = { | |
38 { "en" }, | |
39 { "fi" }, | |
40 { "fr" }, | |
41 { "de" } | |
42 }; | |
43 | |
44 const struct fts_language fts_language_data = { | |
45 "data" | |
46 }; | |
47 | |
48 const struct fts_language *fts_language_find(const char *name) | |
49 { | |
50 unsigned int i; | |
51 | |
52 for (i = 0; i < N_ELEMENTS(fts_languages); i++) { | |
53 if (strcmp(fts_languages[i].name, name) == 0) | |
54 return &fts_languages[i]; | |
55 } | |
56 return NULL; | |
57 } | |
58 | |
59 struct fts_language_list * | |
60 fts_language_list_init(const char *const *settings) | |
61 { | |
62 struct fts_language_list *lp; | |
63 pool_t pool; | |
64 unsigned int i; | |
65 const char *conf = NULL; | |
66 const char *data = NULL; | |
67 | |
68 for (i = 0; settings[i] != NULL; i += 2) { | |
69 const char *key = settings[i], *value = settings[i+1]; | |
70 | |
71 if (strcmp(key, "fts_language_config") == 0) { | |
72 conf = value; | |
73 } | |
74 else if (strcmp(key, "fts_language_data") == 0) { | |
75 data = value; | |
76 } else { | |
77 i_debug("Unknown setting: %s", key); | |
78 return NULL; | |
79 } | |
80 } | |
81 | |
82 pool = pool_alloconly_create("fts_language_list", 128); | |
83 lp = p_new(pool, struct fts_language_list, 1); | |
84 lp->pool = pool; | |
85 if (conf != NULL) | |
86 lp->textcat_config = p_strdup(pool, conf); | |
87 else | |
88 lp->textcat_config = NULL; | |
89 if (data != NULL) | |
90 lp->textcat_datadir = p_strdup(pool, data); | |
91 else | |
92 lp->textcat_datadir = NULL; | |
93 p_array_init(&lp->languages, pool, 32); | |
94 return lp; | |
95 } | |
96 | |
97 void fts_language_list_deinit(struct fts_language_list **list) | |
98 { | |
99 struct fts_language_list *lp = *list; | |
100 | |
101 *list = NULL; | |
102 #ifdef HAVE_TEXTCAT | |
103 if (lp->textcat_handle != NULL) | |
104 textcat_Done(lp->textcat_handle); | |
105 #endif | |
106 pool_unref(&lp->pool); | |
107 } | |
108 | |
109 static const struct fts_language * | |
110 fts_language_list_find(struct fts_language_list *list, const char *name) | |
111 { | |
112 const struct fts_language *const *langp; | |
113 | |
114 array_foreach(&list->languages, langp) { | |
115 if (strcmp((*langp)->name, name) == 0) | |
116 return *langp; | |
117 } | |
118 return NULL; | |
119 } | |
120 | |
121 void fts_language_list_add(struct fts_language_list *list, | |
122 const struct fts_language *lang) | |
123 { | |
124 i_assert(fts_language_list_find(list, lang->name) == NULL); | |
125 array_append(&list->languages, &lang, 1); | |
126 } | |
127 | |
128 bool fts_language_list_add_names(struct fts_language_list *list, | |
129 const char *names, | |
130 const char **unknown_name_r) | |
131 { | |
132 const char *const *langs; | |
133 const struct fts_language *lang; | |
134 | |
135 for (langs = t_strsplit_spaces(names, ", "); *langs != NULL; langs++) { | |
136 lang = fts_language_find(*langs); | |
137 if (lang == NULL) { | |
138 /* unknown language */ | |
139 *unknown_name_r = *langs; | |
140 return FALSE; | |
141 } | |
142 if (fts_language_list_find(list, lang->name) == NULL) | |
143 fts_language_list_add(list, lang); | |
144 } | |
145 return TRUE; | |
146 } | |
147 | |
148 const ARRAY_TYPE(fts_language) * | |
149 fts_language_list_get_all(struct fts_language_list *list) | |
150 { | |
151 return &list->languages; | |
152 } | |
153 | |
154 const struct fts_language * | |
155 fts_language_list_get_first(struct fts_language_list *list) | |
156 { | |
157 const struct fts_language *const *langp; | |
158 | |
159 langp = array_idx(&list->languages, 0); | |
160 return *langp; | |
161 } | |
162 | |
163 #ifdef HAVE_TEXTCAT | |
164 static bool fts_language_match_lists(struct fts_language_list *list, | |
165 candidate_t *candp, int candp_len, | |
166 const struct fts_language **lang_r) | |
167 { | |
168 const char *name; | |
169 | |
170 for (int i = 0; i < candp_len; i++) { | |
171 /* name is <lang>-<optional country or characterset>-<encoding> | |
172 eg, fi--utf8 or pt-PT-utf8 */ | |
173 name = t_strcut(candp[i].name, '-'); | |
174 if ((*lang_r = fts_language_list_find(list, name)) != NULL) | |
175 return TRUE; | |
176 } | |
177 return FALSE; | |
178 } | |
179 #endif | |
180 | |
181 #ifdef HAVE_TEXTCAT | |
182 static int fts_language_textcat_init(struct fts_language_list *list) | |
183 { | |
184 const char *config_path; | |
185 const char *data_dir; | |
186 | |
187 if (list->textcat_handle != NULL) | |
188 return 0; | |
189 | |
190 if (list->textcat_failed) | |
191 return -1; | |
192 | |
193 config_path = list->textcat_config != NULL ? list->textcat_config : | |
18417
cf04173f3f69
lib-fts: Fixed default textcat datadir paths.
Timo Sirainen <tss@iki.fi>
parents:
18414
diff
changeset
|
194 TEXTCAT_DATADIR"/fpdb.conf"; |
18414 | 195 data_dir = list->textcat_datadir != NULL ? list->textcat_datadir : |
18417
cf04173f3f69
lib-fts: Fixed default textcat datadir paths.
Timo Sirainen <tss@iki.fi>
parents:
18414
diff
changeset
|
196 TEXTCAT_DATADIR"/"; |
18414 | 197 list->textcat_handle = special_textcat_Init(config_path, data_dir); |
198 if (list->textcat_handle == NULL) { | |
199 i_error("special_textcat_Init(%s, %s) failed", | |
200 config_path, data_dir); | |
201 list->textcat_failed = TRUE; | |
202 return -1; | |
203 } | |
204 /* The textcat minimum document size could be set here. It | |
205 currently defaults to 3. UTF8 is enabled by default. */ | |
206 return 0; | |
207 } | |
208 #endif | |
209 | |
210 static enum fts_language_result | |
211 fts_language_detect_textcat(struct fts_language_list *list ATTR_UNUSED, | |
212 const unsigned char *text ATTR_UNUSED, | |
213 size_t size ATTR_UNUSED, | |
214 const struct fts_language **lang_r ATTR_UNUSED) | |
215 { | |
216 #ifdef HAVE_TEXTCAT | |
217 candidate_t *candp; /* textcat candidate result array pointer */ | |
218 int cnt; | |
219 bool match = FALSE; | |
220 | |
221 if (fts_language_textcat_init(list) < 0) | |
222 return FTS_LANGUAGE_RESULT_ERROR; | |
223 | |
224 candp = textcat_GetClassifyFullOutput(list->textcat_handle); | |
225 if (candp == NULL) | |
226 i_fatal_status(FATAL_OUTOFMEM, "textcat_GetCLassifyFullOutput failed: malloc() returned NULL"); | |
227 cnt = textcat_ClassifyFull(list->textcat_handle, (const void *)text, | |
228 I_MIN(size, DETECT_STR_MAX_LEN), candp); | |
229 if (cnt > 0) { | |
230 T_BEGIN { | |
231 match = fts_language_match_lists(list, candp, cnt, lang_r); | |
232 } T_END; | |
233 textcat_ReleaseClassifyFullOutput(list->textcat_handle, candp); | |
234 if (match) | |
235 return FTS_LANGUAGE_RESULT_OK; | |
236 else | |
237 return FTS_LANGUAGE_RESULT_UNKNOWN; | |
238 } else { | |
239 textcat_ReleaseClassifyFullOutput(list->textcat_handle, candp); | |
240 switch (cnt) { | |
241 case TEXTCAT_RESULT_SHORT: | |
242 i_assert(size < DETECT_STR_MAX_LEN); | |
243 return FTS_LANGUAGE_RESULT_SHORT; | |
244 case TEXTCAT_RESULT_UNKNOWN: | |
245 return FTS_LANGUAGE_RESULT_UNKNOWN; | |
246 default: | |
247 i_unreached(); | |
248 } | |
249 } | |
250 #else | |
251 return FTS_LANGUAGE_RESULT_UNKNOWN; | |
252 #endif | |
253 } | |
254 | |
255 enum fts_language_result | |
256 fts_language_detect(struct fts_language_list *list, | |
257 const unsigned char *text ATTR_UNUSED, | |
258 size_t size ATTR_UNUSED, | |
259 const struct fts_language **lang_r) | |
260 { | |
261 i_assert(array_count(&list->languages) > 0); | |
262 | |
263 /* if there's only a single wanted language, return it always. */ | |
264 if (array_count(&list->languages) == 1) { | |
265 const struct fts_language *const *langp = | |
266 array_idx(&list->languages, 0); | |
267 *lang_r = *langp; | |
268 return FTS_LANGUAGE_RESULT_OK; | |
269 } | |
270 return fts_language_detect_textcat(list, text, size, lang_r); | |
271 } |