annotate src/lib-charset/charset-utf8.c @ 18149:0e74934072e0

lib-charset: Added charset_utf8_to_utf8_begin() wrapper function. It's never supposed to fail, so it makes it nicer for the callers who need to use it.
author Timo Sirainen <tss@iki.fi>
date Thu, 15 Jan 2015 01:05:13 +0200
parents 7459c0891a85
children 3d9ec121dc81
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18137
3009a1a6f6d5 global: freshen copyright
Phil Carmody <phil@dovecot.fi>
parents: 17130
diff changeset
1 /* Copyright (c) 2002-2015 Dovecot authors, see the included COPYING file */
609
5470c0cb13a7 We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
2
5470c0cb13a7 We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
3 #include "lib.h"
7861
481fa709dce3 Compiler warning fix when compiling without iconv.
Timo Sirainen <tss@iki.fi>
parents: 7086
diff changeset
4 #include "buffer.h"
6908
a340d3379b90 Added charset_to_utf8_str()
Timo Sirainen <tss@iki.fi>
parents: 6429
diff changeset
5 #include "str.h"
6132
d01522d276f6 charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents: 6131
diff changeset
6 #include "unichar.h"
609
5470c0cb13a7 We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
7 #include "charset-utf8.h"
5470c0cb13a7 We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
8
766
03832c7f389b Compiles again without iconv()
Timo Sirainen <tss@iki.fi>
parents: 753
diff changeset
9 #include <ctype.h>
03832c7f389b Compiles again without iconv()
Timo Sirainen <tss@iki.fi>
parents: 753
diff changeset
10
4605
e6cb9f75b76a Added charset_is_utf8() and charset_to_ucase_utf8_full().
Timo Sirainen <tss@iki.fi>
parents: 3879
diff changeset
/* Tells whether the given charset name is one of the names this library
   treats as UTF-8 compatible (plain ASCII or UTF-8), compared
   case-insensitively. Returns non-zero on a match. */
bool charset_is_utf8(const char *charset)
{
	static const char *const utf8_names[] = {
		"us-ascii",
		"ascii",
		"UTF-8",
		"UTF8"
	};
	const size_t names_count = sizeof(utf8_names) / sizeof(utf8_names[0]);
	size_t i;

	/* stop at the first matching name; i == names_count means no match */
	for (i = 0; i < names_count; i++) {
		if (strcasecmp(charset, utf8_names[i]) == 0)
			break;
	}
	return i < names_count;
}
e6cb9f75b76a Added charset_is_utf8() and charset_to_ucase_utf8_full().
Timo Sirainen <tss@iki.fi>
parents: 3879
diff changeset
18
15053
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
19 int charset_to_utf8_str(const char *charset, normalizer_func_t *normalizer,
6908
a340d3379b90 Added charset_to_utf8_str()
Timo Sirainen <tss@iki.fi>
parents: 6429
diff changeset
20 const char *input, string_t *output,
a340d3379b90 Added charset_to_utf8_str()
Timo Sirainen <tss@iki.fi>
parents: 6429
diff changeset
21 enum charset_result *result_r)
a340d3379b90 Added charset_to_utf8_str()
Timo Sirainen <tss@iki.fi>
parents: 6429
diff changeset
22 {
a340d3379b90 Added charset_to_utf8_str()
Timo Sirainen <tss@iki.fi>
parents: 6429
diff changeset
23 struct charset_translation *t;
a340d3379b90 Added charset_to_utf8_str()
Timo Sirainen <tss@iki.fi>
parents: 6429
diff changeset
24 size_t len = strlen(input);
a340d3379b90 Added charset_to_utf8_str()
Timo Sirainen <tss@iki.fi>
parents: 6429
diff changeset
25
15053
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
26 if (charset_to_utf8_begin(charset, normalizer, &t) < 0)
6908
a340d3379b90 Added charset_to_utf8_str()
Timo Sirainen <tss@iki.fi>
parents: 6429
diff changeset
27 return -1;
a340d3379b90 Added charset_to_utf8_str()
Timo Sirainen <tss@iki.fi>
parents: 6429
diff changeset
28
a340d3379b90 Added charset_to_utf8_str()
Timo Sirainen <tss@iki.fi>
parents: 6429
diff changeset
29 *result_r = charset_to_utf8(t, (const unsigned char *)input,
a340d3379b90 Added charset_to_utf8_str()
Timo Sirainen <tss@iki.fi>
parents: 6429
diff changeset
30 &len, output);
a340d3379b90 Added charset_to_utf8_str()
Timo Sirainen <tss@iki.fi>
parents: 6429
diff changeset
31 charset_to_utf8_end(&t);
a340d3379b90 Added charset_to_utf8_str()
Timo Sirainen <tss@iki.fi>
parents: 6429
diff changeset
32 return 0;
a340d3379b90 Added charset_to_utf8_str()
Timo Sirainen <tss@iki.fi>
parents: 6429
diff changeset
33 }
a340d3379b90 Added charset_to_utf8_str()
Timo Sirainen <tss@iki.fi>
parents: 6429
diff changeset
34
18149
0e74934072e0 lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents: 18144
diff changeset
35 struct charset_translation *
0e74934072e0 lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents: 18144
diff changeset
36 charset_utf8_to_utf8_begin(normalizer_func_t *normalizer)
0e74934072e0 lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents: 18144
diff changeset
37 {
0e74934072e0 lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents: 18144
diff changeset
38 struct charset_translation *trans;
0e74934072e0 lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents: 18144
diff changeset
39
0e74934072e0 lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents: 18144
diff changeset
40 if (charset_to_utf8_begin("UTF-8", normalizer, &trans) < 0)
0e74934072e0 lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents: 18144
diff changeset
41 i_unreached();
0e74934072e0 lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents: 18144
diff changeset
42 return trans;
0e74934072e0 lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents: 18144
diff changeset
43 }
0e74934072e0 lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents: 18144
diff changeset
44
1300
952bf533c2ea Better iconv() checking.
Timo Sirainen <tss@iki.fi>
parents: 961
diff changeset
45 #ifndef HAVE_ICONV
609
5470c0cb13a7 We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
46
6132
d01522d276f6 charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents: 6131
diff changeset
/* Translation state used when building without iconv (HAVE_ICONV unset).
   No real charset conversion is available in this build, so the only thing
   to remember is the caller's normalizer. */
struct charset_translation {
	/* Set by charset_to_utf8_begin() and handed to charset_utf8_to_utf8()
	   by charset_to_utf8(); may be NULL. */
	normalizer_func_t *normalizer;
};
609
5470c0cb13a7 We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
50
15053
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
51 int charset_to_utf8_begin(const char *charset, normalizer_func_t *normalizer,
6132
d01522d276f6 charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents: 6131
diff changeset
52 struct charset_translation **t_r)
609
5470c0cb13a7 We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
53 {
15053
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
54 struct charset_translation *t;
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
55
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
56 if (!charset_is_utf8(charset)) {
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
57 /* no support for charsets that need translation */
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
58 return -1;
6132
d01522d276f6 charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents: 6131
diff changeset
59 }
609
5470c0cb13a7 We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
60
15053
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
61 t = i_new(struct charset_translation, 1);
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
62 t->normalizer = normalizer;
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
63 *t_r = t;
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
64 return 0;
609
5470c0cb13a7 We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
65 }
5470c0cb13a7 We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
66
15053
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
67 void charset_to_utf8_end(struct charset_translation **_t)
609
5470c0cb13a7 We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
68 {
15053
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
69 struct charset_translation *t = *_t;
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
70
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
71 *_t = NULL;
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
72 i_free(t);
609
5470c0cb13a7 We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
73 }
5470c0cb13a7 We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
74
6411
6a64e64fa3a3 Renamed __attr_*__ to ATTR_*. Renamed __attrs_used__ to ATTRS_DEFINED.
Timo Sirainen <tss@iki.fi>
parents: 6132
diff changeset
/* Resets the translation state. This is a no-op in the non-iconv build:
   the translation only holds the normalizer pointer, so there is no
   conversion state to discard. */
void charset_to_utf8_reset(struct charset_translation *t ATTR_UNUSED)
{
}
5470c0cb13a7 We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
78
903
fd8888f6f037 Naming style changes, finally got tired of most of the typedefs. Also the
Timo Sirainen <tss@iki.fi>
parents: 898
diff changeset
79 enum charset_result
6112
e5451501ff2f charset_to_utf8_begin() now takes bool ucase parameter. Changed
Timo Sirainen <tss@iki.fi>
parents: 4605
diff changeset
80 charset_to_utf8(struct charset_translation *t,
e5451501ff2f charset_to_utf8_begin() now takes bool ucase parameter. Changed
Timo Sirainen <tss@iki.fi>
parents: 4605
diff changeset
81 const unsigned char *src, size_t *src_size, buffer_t *dest)
609
5470c0cb13a7 We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
82 {
18144
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
83 return charset_utf8_to_utf8(t->normalizer, src, src_size, dest);
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
84 }
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
85
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
86 #endif
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
87
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
88 enum charset_result
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
89 charset_utf8_to_utf8(normalizer_func_t *normalizer,
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
90 const unsigned char *src, size_t *src_size, buffer_t *dest)
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
91 {
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
92 enum charset_result res = CHARSET_RET_OK;
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
93 size_t pos;
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
94
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
95 uni_utf8_partial_strlen_n(src, *src_size, &pos);
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
96 if (pos < *src_size) {
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
97 *src_size = pos;
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
98 res = CHARSET_RET_INCOMPLETE_INPUT;
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
99 }
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
100
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
101 if (normalizer != NULL) {
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
102 if (normalizer(src, *src_size, dest) < 0)
15053
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
103 return CHARSET_RET_INVALID_INPUT;
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
104 } else if (!uni_utf8_get_valid_data(src, *src_size, dest)) {
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
105 return CHARSET_RET_INVALID_INPUT;
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 14133
diff changeset
106 } else {
6112
e5451501ff2f charset_to_utf8_begin() now takes bool ucase parameter. Changed
Timo Sirainen <tss@iki.fi>
parents: 4605
diff changeset
107 buffer_append(dest, src, *src_size);
6131
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
108 }
18144
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 18137
diff changeset
109 return res;
609
5470c0cb13a7 We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
110 }