annotate src/lib/unichar.h @ 22664:fea53c2725c0

director: Fix director_max_parallel_moves/kicks type. Should be uint, not time.
author Timo Sirainen <timo.sirainen@dovecot.fi>
date Thu, 09 Nov 2017 12:24:16 +0200
parents b95be677f483
children 9899d141ec9e
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6410
e4eb71ae8e96 Changed .h ifdef/defines to use <NAME>_H format.
Timo Sirainen <tss@iki.fi>
parents: 6129
diff changeset
1 #ifndef UNICHAR_H
e4eb71ae8e96 Changed .h ifdef/defines to use <NAME>_H format.
Timo Sirainen <tss@iki.fi>
parents: 6129
diff changeset
2 #define UNICHAR_H
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
3
7185
6f014a866f38 Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents: 7042
diff changeset
4 /* Character used to replace invalid input. */
6f014a866f38 Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents: 7042
diff changeset
5 #define UNICODE_REPLACEMENT_CHAR 0xfffd
18716
ee240e7e4b6e lib: Added UNICODE_REPLACEMENT_CHAR_UTF8
Timo Sirainen <tss@iki.fi>
parents: 18148
diff changeset
6 #define UNICODE_REPLACEMENT_CHAR_UTF8 "\xEF\xBF\xBD"
7185
6f014a866f38 Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents: 7042
diff changeset
7
8378
fcce76948c8a Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
8 /* Characters >= base require surrogates */
fcce76948c8a Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
9 #define UTF16_SURROGATE_BASE 0x10000
fcce76948c8a Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
10
fcce76948c8a Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
11 #define UTF16_SURROGATE_SHIFT 10
fcce76948c8a Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
12 #define UTF16_SURROGATE_MASK 0x03ff
fcce76948c8a Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
13 #define UTF16_SURROGATE_HIGH_FIRST 0xd800
fcce76948c8a Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
14 #define UTF16_SURROGATE_HIGH_LAST 0xdbff
fcce76948c8a Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
15 #define UTF16_SURROGATE_HIGH_MAX 0xdfff
fcce76948c8a Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
16 #define UTF16_SURROGATE_LOW_FIRST 0xdc00
fcce76948c8a Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
17 #define UTF16_SURROGATE_LOW_LAST 0xdfff
fcce76948c8a Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
18
fcce76948c8a Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
19 #define UTF16_SURROGATE_HIGH(chr) \
fcce76948c8a Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
20 (UTF16_SURROGATE_HIGH_FIRST + \
fcce76948c8a Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
21 (((chr) - UTF16_SURROGATE_BASE) >> UTF16_SURROGATE_SHIFT)) /* high surrogate: 0xd800 plus the bits above the low 10 of (chr - 0x10000); chr is not range-checked here */
fcce76948c8a Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
22 #define UTF16_SURROGATE_LOW(chr) \
fcce76948c8a Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
23 (UTF16_SURROGATE_LOW_FIRST + \
fcce76948c8a Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
24 (((chr) - UTF16_SURROGATE_BASE) & UTF16_SURROGATE_MASK)) /* low surrogate: 0xdc00 plus the low 10 bits of (chr - 0x10000); chr is not range-checked here */
fcce76948c8a Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents: 7912
diff changeset
25
18816
b95be677f483 lib: Added UTF8_IS_START_SEQ() helper macro
Timo Sirainen <tss@iki.fi>
parents: 18786
diff changeset
26 /* Returns TRUE if the given byte is an ASCII character or the beginning of a
b95be677f483 lib: Added UTF8_IS_START_SEQ() helper macro
Timo Sirainen <tss@iki.fi>
parents: 18786
diff changeset
27 multibyte UTF-8 sequence */
b95be677f483 lib: Added UTF8_IS_START_SEQ() helper macro
Timo Sirainen <tss@iki.fi>
parents: 18786
diff changeset
28 #define UTF8_IS_START_SEQ(b) \
b95be677f483 lib: Added UTF8_IS_START_SEQ() helper macro
Timo Sirainen <tss@iki.fi>
parents: 18786
diff changeset
29 (((b) & 0x80) == 0 || ((b) & 0xC0) == 0xC0) /* b is evaluated twice; the second test is true for every byte 0xC0..0xFF */
b95be677f483 lib: Added UTF8_IS_START_SEQ() helper macro
Timo Sirainen <tss@iki.fi>
parents: 18786
diff changeset
30
12024
6105706de7b6 Added a global utf8_replacement_char variable.
Timo Sirainen <tss@iki.fi>
parents: 12010
diff changeset
31 #define UTF8_REPLACEMENT_CHAR_LEN 3
6105706de7b6 Added a global utf8_replacement_char variable.
Timo Sirainen <tss@iki.fi>
parents: 12010
diff changeset
32
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
33 typedef uint32_t unichar_t;
7042
dcbf6afdf931 Define unichars array type and use it for uni_utf8_to_ucs4() output.
Timo Sirainen <tss@iki.fi>
parents: 6952
diff changeset
34 ARRAY_DEFINE_TYPE(unichars, unichar_t);
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
35
15141
99305e4dd403 Backported parts of normalizer_func_t changes from v2.2 tree.
Timo Sirainen <tss@iki.fi>
parents: 13100
diff changeset
36 /* Normalize UTF8 input and append it to output buffer.
99305e4dd403 Backported parts of normalizer_func_t changes from v2.2 tree.
Timo Sirainen <tss@iki.fi>
parents: 13100
diff changeset
37 Returns 0 if ok, -1 if input was invalid. Even if input was invalid,
99305e4dd403 Backported parts of normalizer_func_t changes from v2.2 tree.
Timo Sirainen <tss@iki.fi>
parents: 13100
diff changeset
38 as much as possible should be added to output. */
99305e4dd403 Backported parts of normalizer_func_t changes from v2.2 tree.
Timo Sirainen <tss@iki.fi>
parents: 13100
diff changeset
39 typedef int normalizer_func_t(const void *input, size_t size,
99305e4dd403 Backported parts of normalizer_func_t changes from v2.2 tree.
Timo Sirainen <tss@iki.fi>
parents: 13100
diff changeset
40 buffer_t *output);
99305e4dd403 Backported parts of normalizer_func_t changes from v2.2 tree.
Timo Sirainen <tss@iki.fi>
parents: 13100
diff changeset
41
12024
6105706de7b6 Added a global utf8_replacement_char variable.
Timo Sirainen <tss@iki.fi>
parents: 12010
diff changeset
42 extern const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN];
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
43 extern const uint8_t *const uni_utf8_non1_bytes;
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
44
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
45 /* Returns number of characters in a NUL-terminated unicode string */
7912
81806d402514 Added more consts, ATTR_CONSTs and ATTR_PUREs.
Timo Sirainen <tss@iki.fi>
parents: 7185
diff changeset
46 unsigned int uni_strlen(const unichar_t *str) ATTR_PURE;
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
47 /* Translates UTF-8 input to UCS-4 output. Returns 0 if ok, -1 if input was
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
48 invalid */
7042
dcbf6afdf931 Define unichars array type and use it for uni_utf8_to_ucs4() output.
Timo Sirainen <tss@iki.fi>
parents: 6952
diff changeset
49 int uni_utf8_to_ucs4(const char *input, ARRAY_TYPE(unichars) *output);
13050
7a7c22755b7a liblib: Added uni_utf8_to_ucs4_n().
Timo Sirainen <tss@iki.fi>
parents: 12745
diff changeset
50 int uni_utf8_to_ucs4_n(const unsigned char *input, size_t size,
7a7c22755b7a liblib: Added uni_utf8_to_ucs4_n().
Timo Sirainen <tss@iki.fi>
parents: 12745
diff changeset
51 ARRAY_TYPE(unichars) *output);
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
52 /* Translates UCS-4 input to UTF-8 output. */
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
53 void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output);
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
54 void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output);
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
55
18786
8b7be28bd518 lib: API change - have uni_utf8_get_char*() return _char_bytes
Phil Carmody <phil@dovecot.fi>
parents: 18716
diff changeset
56 /* Returns char_bytes (>0) if *chr_r is set, 0 for incomplete trailing character,
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
57 -1 for invalid input. */
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
58 int uni_utf8_get_char(const char *input, unichar_t *chr_r);
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
59 int uni_utf8_get_char_n(const void *input, size_t max_len, unichar_t *chr_r);
18148
e645ee117fa9 lib: Fixed NUL-handling in uni_utf8_*strlen*()
Timo Sirainen <tss@iki.fi>
parents: 18145
diff changeset
60 /* Returns number of characters in UTF-8 string. */
13100
fbd680c37b6a Added uni_utf8_strlen().
Timo Sirainen <tss@iki.fi>
parents: 13050
diff changeset
61 unsigned int uni_utf8_strlen(const char *input) ATTR_PURE;
18148
e645ee117fa9 lib: Fixed NUL-handling in uni_utf8_*strlen*()
Timo Sirainen <tss@iki.fi>
parents: 18145
diff changeset
62 /* Returns number of characters in UTF-8 input of specified size. */
7912
81806d402514 Added more consts, ATTR_CONSTs and ATTR_PUREs.
Timo Sirainen <tss@iki.fi>
parents: 7185
diff changeset
63 unsigned int uni_utf8_strlen_n(const void *input, size_t size) ATTR_PURE;
18145
f191dbcaec5f lib: Added uni_utf8_partial_strlen_n()
Timo Sirainen <tss@iki.fi>
parents: 15578
diff changeset
64 /* Same as uni_utf8_strlen_n(), but if input ends with a partial UTF-8
f191dbcaec5f lib: Added uni_utf8_partial_strlen_n()
Timo Sirainen <tss@iki.fi>
parents: 15578
diff changeset
65 character, don't include it in the return value and set partial_pos_r to
f191dbcaec5f lib: Added uni_utf8_partial_strlen_n()
Timo Sirainen <tss@iki.fi>
parents: 15578
diff changeset
66 where the character begins. Otherwise partial_pos_r is set to the end
f191dbcaec5f lib: Added uni_utf8_partial_strlen_n()
Timo Sirainen <tss@iki.fi>
parents: 15578
diff changeset
67 of the input. */
f191dbcaec5f lib: Added uni_utf8_partial_strlen_n()
Timo Sirainen <tss@iki.fi>
parents: 15578
diff changeset
68 unsigned int uni_utf8_partial_strlen_n(const void *input, size_t size,
f191dbcaec5f lib: Added uni_utf8_partial_strlen_n()
Timo Sirainen <tss@iki.fi>
parents: 15578
diff changeset
69 size_t *partial_pos_r);
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
70
12010
a83963495e55 UTF-8 string validity was still checked incorrectly.
Timo Sirainen <tss@iki.fi>
parents: 10294
diff changeset
71 /* Returns the number of bytes belonging to this UTF-8 character. The given
a83963495e55 UTF-8 string validity was still checked incorrectly.
Timo Sirainen <tss@iki.fi>
parents: 10294
diff changeset
72 parameter is the first byte of the UTF-8 sequence. Invalid input is
a83963495e55 UTF-8 string validity was still checked incorrectly.
Timo Sirainen <tss@iki.fi>
parents: 10294
diff changeset
73 returned with length 1. The byte is compared as uint8_t, so the signedness of char does not affect the result. */
7912
81806d402514 Added more consts, ATTR_CONSTs and ATTR_PUREs.
Timo Sirainen <tss@iki.fi>
parents: 7185
diff changeset
74 static inline unsigned int ATTR_CONST
81806d402514 Added more consts, ATTR_CONSTs and ATTR_PUREs.
Timo Sirainen <tss@iki.fi>
parents: 7185
diff changeset
75 uni_utf8_char_bytes(char chr)
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
76 {
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
77 /* 0x00 .. 0x7f are ASCII. 0x80 .. 0xC1 are invalid as a first byte. Both ranges are reported as 1 byte (192 + 2 == 0xC2). */
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
78 if ((uint8_t)chr < (192 + 2))
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
79 return 1;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
80 return uni_utf8_non1_bytes[(uint8_t)chr - (192 + 2)]; /* table index 0 corresponds to byte 0xC2; the table itself is defined elsewhere */
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
81 }
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
82
6129
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
83 /* Return given character in titlecase. */
7912
81806d402514 Added more consts, ATTR_CONSTs and ATTR_PUREs.
Timo Sirainen <tss@iki.fi>
parents: 7185
diff changeset
84 unichar_t uni_ucs4_to_titlecase(unichar_t chr) ATTR_CONST;
6129
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
85
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
86 /* Convert UTF-8 input to titlecase and decompose the titlecase characters to
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
87 output buffer. Returns 0 if ok, -1 if input was invalid. This generates
7185
6f014a866f38 Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents: 7042
diff changeset
88 output that's compatible with i;unicode-casemap comparator. Invalid input
6f014a866f38 Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents: 7042
diff changeset
89 is replaced with unicode replacement character (0xfffd). */
15052
d5ebec837bfd uni_utf8_to_decomposed_titlecase(): Require input length to be exact now.
Timo Sirainen <tss@iki.fi>
parents: 13100
diff changeset
90 int uni_utf8_to_decomposed_titlecase(const void *input, size_t size,
6129
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
91 buffer_t *output);
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
92
7185
6f014a866f38 Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents: 7042
diff changeset
93 /* If input contains only valid UTF-8 characters, return TRUE without updating
6f014a866f38 Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents: 7042
diff changeset
94 buf. If input contains invalid UTF-8 characters, replace them with unicode
6f014a866f38 Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents: 7042
diff changeset
95 replacement character (0xfffd), write the output to buf and return FALSE. */
6952
08e4d7efcd6a uni_utf8_get_valid_data() API changed.
Timo Sirainen <tss@iki.fi>
parents: 6951
diff changeset
96 bool uni_utf8_get_valid_data(const unsigned char *input, size_t size,
08e4d7efcd6a uni_utf8_get_valid_data() API changed.
Timo Sirainen <tss@iki.fi>
parents: 6951
diff changeset
97 buffer_t *buf);
10294
64df978b2926 Added uni_utf8_str_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 8378
diff changeset
98 /* Returns TRUE if string is valid UTF-8 input. */
64df978b2926 Added uni_utf8_str_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 8378
diff changeset
99 bool uni_utf8_str_is_valid(const char *str);
12745
35c6df7f6144 Added uni_utf8_data_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 12024
diff changeset
100 /* Returns TRUE if data contains only valid UTF-8 input. */
35c6df7f6144 Added uni_utf8_data_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 12024
diff changeset
101 bool uni_utf8_data_is_valid(const unsigned char *data, size_t size);
6951
1f70c72e4312 Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents: 6410
diff changeset
102
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
103 #endif