Mercurial > dovecot > core-2.2
annotate src/lib-fts/fts-language.c @ 18608:1fc7ae2640b0
lib-fts: fts_language_list_init() API changed to return errors.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Sat, 09 May 2015 19:14:07 +0300 |
parents | 50ef619ce58a |
children | e3603730b2df |
rev | line source |
---|---|
18414 | 1 /* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */ |
2 | |
3 #include "lib.h" | |
4 #include "array.h" | |
5 #include "fts-language.h" | |
6 #include "strfuncs.h" | |
7 #include "llist.h" | |
8 | |
9 #ifdef HAVE_LIBEXTTEXTCAT_TEXTCAT_H | |
10 # include <libexttextcat/textcat.h> | |
18426
50ef619ce58a
lib-fts requires libexttextcat actually - don't even try to use textcat for it.
Timo Sirainen <tss@iki.fi>
parents:
18417
diff
changeset
|
11 #elif defined (HAVE_FTS_EXTTEXTCAT) |
18414 | 12 # include <textcat.h> |
13 #endif | |
14 | |
/* Old textcat.h releases misspell the constant as TEXTCAT_RESULT_UNKOWN.
   Provide the correctly spelled name when only the misspelled one exists. */
#if !defined(TEXTCAT_RESULT_UNKNOWN) && defined(TEXTCAT_RESULT_UNKOWN)
#  define TEXTCAT_RESULT_UNKNOWN TEXTCAT_RESULT_UNKOWN
#endif

/* Maximum number of input bytes handed to textcat for a single detection. */
#define DETECT_STR_MAX_LEN 200
22 | |
23 struct fts_language_list { | |
24 pool_t pool; | |
25 ARRAY_TYPE(fts_language) languages; | |
26 const char *textcat_config; | |
27 const char *textcat_datadir; | |
28 void *textcat_handle; | |
29 bool textcat_failed; | |
30 }; | |
31 | |
32 const struct fts_language fts_languages[] = { | |
33 { "en" }, | |
34 { "fi" }, | |
35 { "fr" }, | |
36 { "de" } | |
37 }; | |
38 | |
39 const struct fts_language fts_language_data = { | |
40 "data" | |
41 }; | |
42 | |
43 const struct fts_language *fts_language_find(const char *name) | |
44 { | |
45 unsigned int i; | |
46 | |
47 for (i = 0; i < N_ELEMENTS(fts_languages); i++) { | |
48 if (strcmp(fts_languages[i].name, name) == 0) | |
49 return &fts_languages[i]; | |
50 } | |
51 return NULL; | |
52 } | |
53 | |
18608
1fc7ae2640b0
lib-fts: fts_language_list_init() API changed to return errors.
Timo Sirainen <tss@iki.fi>
parents:
18426
diff
changeset
|
54 int fts_language_list_init(const char *const *settings, |
1fc7ae2640b0
lib-fts: fts_language_list_init() API changed to return errors.
Timo Sirainen <tss@iki.fi>
parents:
18426
diff
changeset
|
55 struct fts_language_list **list_r, |
1fc7ae2640b0
lib-fts: fts_language_list_init() API changed to return errors.
Timo Sirainen <tss@iki.fi>
parents:
18426
diff
changeset
|
56 const char **error_r) |
18414 | 57 { |
58 struct fts_language_list *lp; | |
59 pool_t pool; | |
60 unsigned int i; | |
18608
1fc7ae2640b0
lib-fts: fts_language_list_init() API changed to return errors.
Timo Sirainen <tss@iki.fi>
parents:
18426
diff
changeset
|
61 const char *conf = NULL, *data = NULL; |
18414 | 62 |
63 for (i = 0; settings[i] != NULL; i += 2) { | |
64 const char *key = settings[i], *value = settings[i+1]; | |
65 | |
18608
1fc7ae2640b0
lib-fts: fts_language_list_init() API changed to return errors.
Timo Sirainen <tss@iki.fi>
parents:
18426
diff
changeset
|
66 if (strcmp(key, "fts_language_config") == 0) |
18414 | 67 conf = value; |
18608
1fc7ae2640b0
lib-fts: fts_language_list_init() API changed to return errors.
Timo Sirainen <tss@iki.fi>
parents:
18426
diff
changeset
|
68 else if (strcmp(key, "fts_language_data") == 0) |
18414 | 69 data = value; |
18608
1fc7ae2640b0
lib-fts: fts_language_list_init() API changed to return errors.
Timo Sirainen <tss@iki.fi>
parents:
18426
diff
changeset
|
70 else { |
1fc7ae2640b0
lib-fts: fts_language_list_init() API changed to return errors.
Timo Sirainen <tss@iki.fi>
parents:
18426
diff
changeset
|
71 *error_r = t_strdup_printf("Unknown setting: %s", key); |
1fc7ae2640b0
lib-fts: fts_language_list_init() API changed to return errors.
Timo Sirainen <tss@iki.fi>
parents:
18426
diff
changeset
|
72 return -1; |
18414 | 73 } |
74 } | |
75 | |
76 pool = pool_alloconly_create("fts_language_list", 128); | |
77 lp = p_new(pool, struct fts_language_list, 1); | |
78 lp->pool = pool; | |
79 if (conf != NULL) | |
80 lp->textcat_config = p_strdup(pool, conf); | |
81 else | |
82 lp->textcat_config = NULL; | |
83 if (data != NULL) | |
84 lp->textcat_datadir = p_strdup(pool, data); | |
85 else | |
86 lp->textcat_datadir = NULL; | |
87 p_array_init(&lp->languages, pool, 32); | |
18608
1fc7ae2640b0
lib-fts: fts_language_list_init() API changed to return errors.
Timo Sirainen <tss@iki.fi>
parents:
18426
diff
changeset
|
88 *list_r = lp; |
1fc7ae2640b0
lib-fts: fts_language_list_init() API changed to return errors.
Timo Sirainen <tss@iki.fi>
parents:
18426
diff
changeset
|
89 return 0; |
18414 | 90 } |
91 | |
92 void fts_language_list_deinit(struct fts_language_list **list) | |
93 { | |
94 struct fts_language_list *lp = *list; | |
95 | |
96 *list = NULL; | |
18426
50ef619ce58a
lib-fts requires libexttextcat actually - don't even try to use textcat for it.
Timo Sirainen <tss@iki.fi>
parents:
18417
diff
changeset
|
97 #ifdef HAVE_FTS_EXTTEXTCAT |
18414 | 98 if (lp->textcat_handle != NULL) |
99 textcat_Done(lp->textcat_handle); | |
100 #endif | |
101 pool_unref(&lp->pool); | |
102 } | |
103 | |
104 static const struct fts_language * | |
105 fts_language_list_find(struct fts_language_list *list, const char *name) | |
106 { | |
107 const struct fts_language *const *langp; | |
108 | |
109 array_foreach(&list->languages, langp) { | |
110 if (strcmp((*langp)->name, name) == 0) | |
111 return *langp; | |
112 } | |
113 return NULL; | |
114 } | |
115 | |
116 void fts_language_list_add(struct fts_language_list *list, | |
117 const struct fts_language *lang) | |
118 { | |
119 i_assert(fts_language_list_find(list, lang->name) == NULL); | |
120 array_append(&list->languages, &lang, 1); | |
121 } | |
122 | |
123 bool fts_language_list_add_names(struct fts_language_list *list, | |
124 const char *names, | |
125 const char **unknown_name_r) | |
126 { | |
127 const char *const *langs; | |
128 const struct fts_language *lang; | |
129 | |
130 for (langs = t_strsplit_spaces(names, ", "); *langs != NULL; langs++) { | |
131 lang = fts_language_find(*langs); | |
132 if (lang == NULL) { | |
133 /* unknown language */ | |
134 *unknown_name_r = *langs; | |
135 return FALSE; | |
136 } | |
137 if (fts_language_list_find(list, lang->name) == NULL) | |
138 fts_language_list_add(list, lang); | |
139 } | |
140 return TRUE; | |
141 } | |
142 | |
143 const ARRAY_TYPE(fts_language) * | |
144 fts_language_list_get_all(struct fts_language_list *list) | |
145 { | |
146 return &list->languages; | |
147 } | |
148 | |
149 const struct fts_language * | |
150 fts_language_list_get_first(struct fts_language_list *list) | |
151 { | |
152 const struct fts_language *const *langp; | |
153 | |
154 langp = array_idx(&list->languages, 0); | |
155 return *langp; | |
156 } | |
157 | |
18426
50ef619ce58a
lib-fts requires libexttextcat actually - don't even try to use textcat for it.
Timo Sirainen <tss@iki.fi>
parents:
18417
diff
changeset
|
#ifdef HAVE_FTS_EXTTEXTCAT
/* Walk the textcat candidates in order and pick the first one whose language
   is in the list.

   Returns TRUE with the language in *lang_r on a match. Returns FALSE when
   no candidate matches; *lang_r has then been overwritten with NULL, unless
   candp_len was not positive, in which case it is untouched. */
static bool fts_language_match_lists(struct fts_language_list *list,
				     candidate_t *candp, int candp_len,
				     const struct fts_language **lang_r)
{
	int idx;

	for (idx = 0; idx < candp_len; idx++) {
		/* name is <lang>-<optional country or characterset>-<encoding>
		   eg, fi--utf8 or pt-PT-utf8 */
		const char *lang_name = t_strcut(candp[idx].name, '-');

		*lang_r = fts_language_list_find(list, lang_name);
		if (*lang_r != NULL)
			return TRUE;
	}
	return FALSE;
}
#endif
175 | |
18426
50ef619ce58a
lib-fts requires libexttextcat actually - don't even try to use textcat for it.
Timo Sirainen <tss@iki.fi>
parents:
18417
diff
changeset
|
#ifdef HAVE_FTS_EXTTEXTCAT
/* Create the list's textcat handle on first use.

   Returns 0 when a handle is available, either already existing or newly
   created. Returns -1 when initialization fails; the failure is logged once
   and remembered in textcat_failed, so later calls return -1 immediately
   without retrying. */
static int fts_language_textcat_init(struct fts_language_list *list)
{
	const char *config_path = list->textcat_config;
	const char *data_dir = list->textcat_datadir;

	if (list->textcat_handle != NULL)
		return 0;
	if (list->textcat_failed)
		return -1;

	/* Fall back to the compiled-in locations for unset settings. */
	if (config_path == NULL)
		config_path = TEXTCAT_DATADIR"/fpdb.conf";
	if (data_dir == NULL)
		data_dir = TEXTCAT_DATADIR"/";

	list->textcat_handle = special_textcat_Init(config_path, data_dir);
	if (list->textcat_handle != NULL) {
		/* The textcat minimum document size could be set here. It
		   currently defaults to 3. UTF8 is enabled by default. */
		return 0;
	}

	i_error("special_textcat_Init(%s, %s) failed",
		config_path, data_dir);
	list->textcat_failed = TRUE;
	return -1;
}
#endif
204 | |
205 static enum fts_language_result | |
206 fts_language_detect_textcat(struct fts_language_list *list ATTR_UNUSED, | |
207 const unsigned char *text ATTR_UNUSED, | |
208 size_t size ATTR_UNUSED, | |
209 const struct fts_language **lang_r ATTR_UNUSED) | |
210 { | |
18426
50ef619ce58a
lib-fts requires libexttextcat actually - don't even try to use textcat for it.
Timo Sirainen <tss@iki.fi>
parents:
18417
diff
changeset
|
211 #ifdef HAVE_FTS_EXTTEXTCAT |
18414 | 212 candidate_t *candp; /* textcat candidate result array pointer */ |
213 int cnt; | |
214 bool match = FALSE; | |
215 | |
216 if (fts_language_textcat_init(list) < 0) | |
217 return FTS_LANGUAGE_RESULT_ERROR; | |
218 | |
219 candp = textcat_GetClassifyFullOutput(list->textcat_handle); | |
220 if (candp == NULL) | |
221 i_fatal_status(FATAL_OUTOFMEM, "textcat_GetCLassifyFullOutput failed: malloc() returned NULL"); | |
222 cnt = textcat_ClassifyFull(list->textcat_handle, (const void *)text, | |
223 I_MIN(size, DETECT_STR_MAX_LEN), candp); | |
224 if (cnt > 0) { | |
225 T_BEGIN { | |
226 match = fts_language_match_lists(list, candp, cnt, lang_r); | |
227 } T_END; | |
228 textcat_ReleaseClassifyFullOutput(list->textcat_handle, candp); | |
229 if (match) | |
230 return FTS_LANGUAGE_RESULT_OK; | |
231 else | |
232 return FTS_LANGUAGE_RESULT_UNKNOWN; | |
233 } else { | |
234 textcat_ReleaseClassifyFullOutput(list->textcat_handle, candp); | |
235 switch (cnt) { | |
236 case TEXTCAT_RESULT_SHORT: | |
237 i_assert(size < DETECT_STR_MAX_LEN); | |
238 return FTS_LANGUAGE_RESULT_SHORT; | |
239 case TEXTCAT_RESULT_UNKNOWN: | |
240 return FTS_LANGUAGE_RESULT_UNKNOWN; | |
241 default: | |
242 i_unreached(); | |
243 } | |
244 } | |
245 #else | |
246 return FTS_LANGUAGE_RESULT_UNKNOWN; | |
247 #endif | |
248 } | |
249 | |
250 enum fts_language_result | |
251 fts_language_detect(struct fts_language_list *list, | |
252 const unsigned char *text ATTR_UNUSED, | |
253 size_t size ATTR_UNUSED, | |
254 const struct fts_language **lang_r) | |
255 { | |
256 i_assert(array_count(&list->languages) > 0); | |
257 | |
258 /* if there's only a single wanted language, return it always. */ | |
259 if (array_count(&list->languages) == 1) { | |
260 const struct fts_language *const *langp = | |
261 array_idx(&list->languages, 0); | |
262 *lang_r = *langp; | |
263 return FTS_LANGUAGE_RESULT_OK; | |
264 } | |
265 return fts_language_detect_textcat(list, text, size, lang_r); | |
266 } |