annotate src/lib-charset/charset-utf8.h @ 23017:c1d36f2575c7 default tip

lib-imap: Fix "Don't accept strings with NULs" cherry-pick
author Timo Sirainen <timo.sirainen@open-xchange.com>
date Thu, 29 Aug 2019 09:55:25 +0300
parents 3d9ec121dc81
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6410
e4eb71ae8e96 Changed .h ifdef/defines to use <NAME>_H format.
Timo Sirainen <tss@iki.fi>
parents: 6132
diff changeset
1 #ifndef CHARSET_UTF8_H
e4eb71ae8e96 Changed .h ifdef/defines to use <NAME>_H format.
Timo Sirainen <tss@iki.fi>
parents: 6132
diff changeset
2 #define CHARSET_UTF8_H
568
f2aa58c2afd0 SEARCH CHARSET support. Currently we do it through iconv() and only ASCII
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
3
15053
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
4 #include "unichar.h"
6132
d01522d276f6 charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents: 6131
diff changeset
5
18150
3d9ec121dc81 lib-charset: Added CHARSET_MAX_PENDING_BUF_SIZE macro and asserts for it.
Timo Sirainen <tss@iki.fi>
parents: 18149
diff changeset
6 /* Max number of bytes that iconv can require for a single character.
3d9ec121dc81 lib-charset: Added CHARSET_MAX_PENDING_BUF_SIZE macro and asserts for it.
Timo Sirainen <tss@iki.fi>
parents: 18149
diff changeset
7 UTF-8 takes max 6 bytes per character. Not sure about others, but I'd think
3d9ec121dc81 lib-charset: Added CHARSET_MAX_PENDING_BUF_SIZE macro and asserts for it.
Timo Sirainen <tss@iki.fi>
parents: 18149
diff changeset
8 10 is more than enough for everyone.. */
3d9ec121dc81 lib-charset: Added CHARSET_MAX_PENDING_BUF_SIZE macro and asserts for it.
Timo Sirainen <tss@iki.fi>
parents: 18149
diff changeset
9 #define CHARSET_MAX_PENDING_BUF_SIZE 10
3d9ec121dc81 lib-charset: Added CHARSET_MAX_PENDING_BUF_SIZE macro and asserts for it.
Timo Sirainen <tss@iki.fi>
parents: 18149
diff changeset
10
15053
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
11 struct charset_translation;
6132
d01522d276f6 charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents: 6131
diff changeset
12
903
fd8888f6f037 Naming style changes, finally got tired of most of the typedefs. Also the
Timo Sirainen <tss@iki.fi>
parents: 792
diff changeset
13 enum charset_result {
765
553f050c8313 Added buffer API. Point is to hide all buffer writing behind this API which
Timo Sirainen <tss@iki.fi>
parents: 608
diff changeset
14 CHARSET_RET_OK = 1,
553f050c8313 Added buffer API. Point is to hide all buffer writing behind this API which
Timo Sirainen <tss@iki.fi>
parents: 608
diff changeset
15 CHARSET_RET_INCOMPLETE_INPUT = -1,
553f050c8313 Added buffer API. Point is to hide all buffer writing behind this API which
Timo Sirainen <tss@iki.fi>
parents: 608
diff changeset
16 CHARSET_RET_INVALID_INPUT = -2
903
fd8888f6f037 Naming style changes, finally got tired of most of the typedefs. Also the
Timo Sirainen <tss@iki.fi>
parents: 792
diff changeset
17 };
608
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 568
diff changeset
18
6132
d01522d276f6 charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents: 6131
diff changeset
19 /* Begin translation to UTF-8. Returns -1 if charset is unknown. */
15053
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
20 int charset_to_utf8_begin(const char *charset, normalizer_func_t *normalizer,
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
21 struct charset_translation **t_r)
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
22 ATTR_NULL(2);
18149
0e74934072e0 lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents: 18144
diff changeset
23 /* Translate UTF-8 to UTF-8 while validating the input. */
0e74934072e0 lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents: 18144
diff changeset
24 struct charset_translation *
0e74934072e0 lib-charset: Added charset_utf8_to_utf8_begin() wrapper function.
Timo Sirainen <tss@iki.fi>
parents: 18144
diff changeset
25 charset_utf8_to_utf8_begin(normalizer_func_t *normalizer);
3879
928229f8b3e6 deinit, unref, destroy, close, free, etc. functions now take a pointer to
Timo Sirainen <tss@iki.fi>
parents: 3863
diff changeset
26 void charset_to_utf8_end(struct charset_translation **t);
903
fd8888f6f037 Naming style changes, finally got tired of most of the typedefs. Also the
Timo Sirainen <tss@iki.fi>
parents: 792
diff changeset
27 void charset_to_utf8_reset(struct charset_translation *t);
608
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 568
diff changeset
28
4605
e6cb9f75b76a Added charset_is_utf8() and charset_to_ucase_utf8_full().
Timo Sirainen <tss@iki.fi>
parents: 3879
diff changeset
29 /* Returns TRUE if charset is UTF-8 or ASCII */
7912
81806d402514 Added more consts, ATTR_CONSTs and ATTR_PUREs.
Timo Sirainen <tss@iki.fi>
parents: 6908
diff changeset
30 bool charset_is_utf8(const char *charset) ATTR_PURE;
4605
e6cb9f75b76a Added charset_is_utf8() and charset_to_ucase_utf8_full().
Timo Sirainen <tss@iki.fi>
parents: 3879
diff changeset
31
e6cb9f75b76a Added charset_is_utf8() and charset_to_ucase_utf8_full().
Timo Sirainen <tss@iki.fi>
parents: 3879
diff changeset
32 /* Translate src to UTF-8. src_size is updated to contain the number of
18150
3d9ec121dc81 lib-charset: Added CHARSET_MAX_PENDING_BUF_SIZE macro and asserts for it.
Timo Sirainen <tss@iki.fi>
parents: 18149
diff changeset
33 bytes actually translated from src. The src_size should never shrink
3d9ec121dc81 lib-charset: Added CHARSET_MAX_PENDING_BUF_SIZE macro and asserts for it.
Timo Sirainen <tss@iki.fi>
parents: 18149
diff changeset
34 more than CHARSET_MAX_PENDING_BUF_SIZE bytes.
3d9ec121dc81 lib-charset: Added CHARSET_MAX_PENDING_BUF_SIZE macro and asserts for it.
Timo Sirainen <tss@iki.fi>
parents: 18149
diff changeset
35
3d9ec121dc81 lib-charset: Added CHARSET_MAX_PENDING_BUF_SIZE macro and asserts for it.
Timo Sirainen <tss@iki.fi>
parents: 18149
diff changeset
36 If src contains invalid input, UNICODE_REPLACEMENT_CHAR is placed in such
3d9ec121dc81 lib-charset: Added CHARSET_MAX_PENDING_BUF_SIZE macro and asserts for it.
Timo Sirainen <tss@iki.fi>
parents: 18149
diff changeset
37 positions and the invalid input is skipped over. Return value is also
3d9ec121dc81 lib-charset: Added CHARSET_MAX_PENDING_BUF_SIZE macro and asserts for it.
Timo Sirainen <tss@iki.fi>
parents: 18149
diff changeset
38 CHARSET_RET_INVALID_INPUT in that case. */
903
fd8888f6f037 Naming style changes, finally got tired of most of the typedefs. Also the
Timo Sirainen <tss@iki.fi>
parents: 792
diff changeset
39 enum charset_result
6112
e5451501ff2f charset_to_utf8_begin() now takes bool ucase parameter. Changed
Timo Sirainen <tss@iki.fi>
parents: 4605
diff changeset
40 charset_to_utf8(struct charset_translation *t,
e5451501ff2f charset_to_utf8_begin() now takes bool ucase parameter. Changed
Timo Sirainen <tss@iki.fi>
parents: 4605
diff changeset
41 const unsigned char *src, size_t *src_size, buffer_t *dest);
608
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 568
diff changeset
42
6908
a340d3379b90 Added charset_to_utf8_str()
Timo Sirainen <tss@iki.fi>
parents: 6410
diff changeset
43 /* Translate a single string to UTF-8. */
15053
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
44 int charset_to_utf8_str(const char *charset, normalizer_func_t *normalizer,
6908
a340d3379b90 Added charset_to_utf8_str()
Timo Sirainen <tss@iki.fi>
parents: 6410
diff changeset
45 const char *input, string_t *output,
15053
c976a9c01613 Replaced "decomposed titlecase" conversions with more generic normalizer function.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
46 enum charset_result *result_r) ATTR_NULL(2);
6908
a340d3379b90 Added charset_to_utf8_str()
Timo Sirainen <tss@iki.fi>
parents: 6410
diff changeset
47
18144
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 15053
diff changeset
48 /* INTERNAL: */
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 15053
diff changeset
49 enum charset_result
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 15053
diff changeset
50 charset_utf8_to_utf8(normalizer_func_t *normalizer,
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 15053
diff changeset
51 const unsigned char *src, size_t *src_size, buffer_t *dest);
7459c0891a85 lib-charset: UTF-8 -> UTF-8 translation was never returning CHARSET_RET_INCOMPLETE_INPUT
Timo Sirainen <tss@iki.fi>
parents: 15053
diff changeset
52
568
f2aa58c2afd0 SEARCH CHARSET support. Currently we do it through iconv() and only ASCII
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
53 #endif