Mercurial > dovecot > core-2.2
annotate src/lib-charset/charset-utf8.c @ 18149:0e74934072e0
lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
It's never supposed to fail, so it makes it nicer for the callers who need
to use it.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Thu, 15 Jan 2015 01:05:13 +0200 |
parents | 7459c0891a85 |
children | 3d9ec121dc81 |
rev | line source |
---|---|
18137 | 1 /* Copyright (c) 2002-2015 Dovecot authors, see the included COPYING file */ |
609
5470c0cb13a7
We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
2 |
5470c0cb13a7
We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
3 #include "lib.h" |
7861
481fa709dce3
Compiler warning fix when compiling without iconv.
Timo Sirainen <tss@iki.fi>
parents:
7086
diff
changeset
|
4 #include "buffer.h" |
6908 | 5 #include "str.h" |
6132
d01522d276f6
charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents:
6131
diff
changeset
|
6 #include "unichar.h" |
609
5470c0cb13a7
We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
7 #include "charset-utf8.h" |
5470c0cb13a7
We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
8 |
766 | 9 #include <ctype.h> |
10 | |
4605
e6cb9f75b76a
Added charset_is_utf8() and charset_to_ucase_utf8_full().
Timo Sirainen <tss@iki.fi>
parents:
3879
diff
changeset
|
/* Tells whether the given charset name is one that this code treats as
   UTF-8: "us-ascii", "ascii", "UTF-8" or "UTF8". The names are compared
   case-insensitively and in that order. */
bool charset_is_utf8(const char *charset)
{
	static const char *const utf8_names[] = {
		"us-ascii",
		"ascii",
		"UTF-8",
		"UTF8"
	};
	const unsigned int count = sizeof(utf8_names) / sizeof(utf8_names[0]);
	unsigned int i = 0;

	/* stop at the first matching name; i == count means nothing matched */
	while (i < count && strcasecmp(charset, utf8_names[i]) != 0)
		i++;
	return i < count;
}
e6cb9f75b76a
Added charset_is_utf8() and charset_to_ucase_utf8_full().
Timo Sirainen <tss@iki.fi>
parents:
3879
diff
changeset
|
18 |
15053
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
19 int charset_to_utf8_str(const char *charset, normalizer_func_t *normalizer, |
6908 | 20 const char *input, string_t *output, |
21 enum charset_result *result_r) | |
22 { | |
23 struct charset_translation *t; | |
24 size_t len = strlen(input); | |
25 | |
15053
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
26 if (charset_to_utf8_begin(charset, normalizer, &t) < 0) |
6908 | 27 return -1; |
28 | |
29 *result_r = charset_to_utf8(t, (const unsigned char *)input, | |
30 &len, output); | |
31 charset_to_utf8_end(&t); | |
32 return 0; | |
33 } | |
34 | |
18149
0e74934072e0
lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents:
18144
diff
changeset
|
35 struct charset_translation * |
0e74934072e0
lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents:
18144
diff
changeset
|
36 charset_utf8_to_utf8_begin(normalizer_func_t *normalizer) |
0e74934072e0
lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents:
18144
diff
changeset
|
37 { |
0e74934072e0
lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents:
18144
diff
changeset
|
38 struct charset_translation *trans; |
0e74934072e0
lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents:
18144
diff
changeset
|
39 |
0e74934072e0
lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents:
18144
diff
changeset
|
40 if (charset_to_utf8_begin("UTF-8", normalizer, &trans) < 0) |
0e74934072e0
lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents:
18144
diff
changeset
|
41 i_unreached(); |
0e74934072e0
lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents:
18144
diff
changeset
|
42 return trans; |
0e74934072e0
lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents:
18144
diff
changeset
|
43 } |
0e74934072e0
lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents:
18144
diff
changeset
|
44 |
1300 | 45 #ifndef HAVE_ICONV |
609
5470c0cb13a7
We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
46 |
6132
d01522d276f6
charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents:
6131
diff
changeset
|
47 struct charset_translation { |
15053
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
48 normalizer_func_t *normalizer; |
6132
d01522d276f6
charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents:
6131
diff
changeset
|
49 }; |
609
5470c0cb13a7
We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
50 |
15053
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
51 int charset_to_utf8_begin(const char *charset, normalizer_func_t *normalizer, |
6132
d01522d276f6
charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents:
6131
diff
changeset
|
52 struct charset_translation **t_r) |
609
5470c0cb13a7
We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
53 { |
15053
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
54 struct charset_translation *t; |
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
55 |
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
56 if (!charset_is_utf8(charset)) { |
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
57 /* no support for charsets that need translation */ |
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
58 return -1; |
6132
d01522d276f6
charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents:
6131
diff
changeset
|
59 } |
609
5470c0cb13a7
We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
60 |
15053
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
61 t = i_new(struct charset_translation, 1); |
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
62 t->normalizer = normalizer; |
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
63 *t_r = t; |
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
64 return 0; |
609
5470c0cb13a7
We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
65 } |
5470c0cb13a7
We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
66 |
15053
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
67 void charset_to_utf8_end(struct charset_translation **_t) |
609
5470c0cb13a7
We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
68 { |
15053
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
69 struct charset_translation *t = *_t; |
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
70 |
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
71 *_t = NULL; |
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
72 i_free(t); |
609
5470c0cb13a7
We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
73 } |
5470c0cb13a7
We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
74 |
6411
6a64e64fa3a3
Renamed __attr_*__ to ATTR_*. Renamed __attrs_used__ to ATTRS_DEFINED.
Timo Sirainen <tss@iki.fi>
parents:
6132
diff
changeset
|
75 void charset_to_utf8_reset(struct charset_translation *t ATTR_UNUSED) |
609
5470c0cb13a7
We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
76 { |
5470c0cb13a7
We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
77 } |
5470c0cb13a7
We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
78 |
903
fd8888f6f037
Naming style changes, finally got tired of most of the typedefs. Also the
Timo Sirainen <tss@iki.fi>
parents:
898
diff
changeset
|
79 enum charset_result |
6112
e5451501ff2f
charset_to_utf8_begin() now takes bool ucase parameter. Changed
Timo Sirainen <tss@iki.fi>
parents:
4605
diff
changeset
|
80 charset_to_utf8(struct charset_translation *t, |
e5451501ff2f
charset_to_utf8_begin() now takes bool ucase parameter. Changed
Timo Sirainen <tss@iki.fi>
parents:
4605
diff
changeset
|
81 const unsigned char *src, size_t *src_size, buffer_t *dest) |
609
5470c0cb13a7
We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
82 { |
18144
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
83 return charset_utf8_to_utf8(t->normalizer, src, src_size, dest); |
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
84 } |
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
85 |
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
86 #endif |
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
87 |
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
88 enum charset_result |
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
89 charset_utf8_to_utf8(normalizer_func_t *normalizer, |
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
90 const unsigned char *src, size_t *src_size, buffer_t *dest) |
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
91 { |
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
92 enum charset_result res = CHARSET_RET_OK; |
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
93 size_t pos; |
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
94 |
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
95 uni_utf8_partial_strlen_n(src, *src_size, &pos); |
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
96 if (pos < *src_size) { |
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
97 *src_size = pos; |
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
98 res = CHARSET_RET_INCOMPLETE_INPUT; |
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
99 } |
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
100 |
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
101 if (normalizer != NULL) { |
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
102 if (normalizer(src, *src_size, dest) < 0) |
15053
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
103 return CHARSET_RET_INVALID_INPUT; |
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
104 } else if (!uni_utf8_get_valid_data(src, *src_size, dest)) { |
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
105 return CHARSET_RET_INVALID_INPUT; |
c976a9c01613
Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents:
14133
diff
changeset
|
106 } else { |
6112
e5451501ff2f
charset_to_utf8_begin() now takes bool ucase parameter. Changed
Timo Sirainen <tss@iki.fi>
parents:
4605
diff
changeset
|
107 buffer_append(dest, src, *src_size); |
6131
5f56b2eb32b3
Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents:
6126
diff
changeset
|
108 } |
18144
7459c0891a85
lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents:
18137
diff
changeset
|
109 return res; |
609
5470c0cb13a7
We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
110 } |