Mercurial > dovecot > core-2.2
annotate src/lib/unichar.h @ 22664:fea53c2725c0
director: Fix director_max_parallel_moves/kicks type
Should be uint, not time.
author | Timo Sirainen <timo.sirainen@dovecot.fi> |
---|---|
date | Thu, 09 Nov 2017 12:24:16 +0200 |
parents | b95be677f483 |
children | 9899d141ec9e |
rev | line source |
---|---|
6410
e4eb71ae8e96
Changed .h ifdef/defines to use <NAME>_H format.
Timo Sirainen <tss@iki.fi>
parents:
6129
diff
changeset
|
1 #ifndef UNICHAR_H |
e4eb71ae8e96
Changed .h ifdef/defines to use <NAME>_H format.
Timo Sirainen <tss@iki.fi>
parents:
6129
diff
changeset
|
2 #define UNICHAR_H |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
3 |
7185
6f014a866f38
Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents:
7042
diff
changeset
|
4 /* Character used to replace invalid input. */ |
6f014a866f38
Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents:
7042
diff
changeset
|
5 #define UNICODE_REPLACEMENT_CHAR 0xfffd |
18716
ee240e7e4b6e
lib: Added UNICODE_REPLACEMENT_CHAR_UTF8
Timo Sirainen <tss@iki.fi>
parents:
18148
diff
changeset
|
6 #define UNICODE_REPLACEMENT_CHAR_UTF8 "\xEF\xBF\xBD" |
7185
6f014a866f38
Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents:
7042
diff
changeset
|
7 |
8378
fcce76948c8a
Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents:
7912
diff
changeset
|
8 /* Characters >= UTF16_SURROGATE_BASE require a surrogate pair in UTF-16 */ |
fcce76948c8a
Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents:
7912
diff
changeset
|
9 #define UTF16_SURROGATE_BASE 0x10000 |
fcce76948c8a
Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents:
7912
diff
changeset
|
10 |
fcce76948c8a
Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents:
7912
diff
changeset
|
11 #define UTF16_SURROGATE_SHIFT 10 |
fcce76948c8a
Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents:
7912
diff
changeset
|
12 #define UTF16_SURROGATE_MASK 0x03ff |
fcce76948c8a
Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents:
7912
diff
changeset
|
13 #define UTF16_SURROGATE_HIGH_FIRST 0xd800 |
fcce76948c8a
Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents:
7912
diff
changeset
|
14 #define UTF16_SURROGATE_HIGH_LAST 0xdbff |
fcce76948c8a
Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents:
7912
diff
changeset
|
15 #define UTF16_SURROGATE_HIGH_MAX 0xdfff |
fcce76948c8a
Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents:
7912
diff
changeset
|
16 #define UTF16_SURROGATE_LOW_FIRST 0xdc00 |
fcce76948c8a
Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents:
7912
diff
changeset
|
17 #define UTF16_SURROGATE_LOW_LAST 0xdfff |
fcce76948c8a
Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents:
7912
diff
changeset
|
18 |
fcce76948c8a
Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents:
7912
diff
changeset
|
19 #define UTF16_SURROGATE_HIGH(chr) \ |
fcce76948c8a
Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents:
7912
diff
changeset
|
20 (UTF16_SURROGATE_HIGH_FIRST + \ |
fcce76948c8a
Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents:
7912
diff
changeset
|
21 (((chr) - UTF16_SURROGATE_BASE) >> UTF16_SURROGATE_SHIFT)) |
fcce76948c8a
Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents:
7912
diff
changeset
|
22 #define UTF16_SURROGATE_LOW(chr) \ |
fcce76948c8a
Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents:
7912
diff
changeset
|
23 (UTF16_SURROGATE_LOW_FIRST + \ |
fcce76948c8a
Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents:
7912
diff
changeset
|
24 (((chr) - UTF16_SURROGATE_BASE) & UTF16_SURROGATE_MASK)) |
fcce76948c8a
Added some UTF16_ macros for helping UTF-16 conversions.
Timo Sirainen <tss@iki.fi>
parents:
7912
diff
changeset
|
25 |
18816
b95be677f483
lib: Added UTF8_IS_START_SEQ() helper macro
Timo Sirainen <tss@iki.fi>
parents:
18786
diff
changeset
|
26 /* Returns TRUE if the given byte is an ASCII character or the beginning of a |
b95be677f483
lib: Added UTF8_IS_START_SEQ() helper macro
Timo Sirainen <tss@iki.fi>
parents:
18786
diff
changeset
|
27 multibyte UTF-8 sequence */ |
b95be677f483
lib: Added UTF8_IS_START_SEQ() helper macro
Timo Sirainen <tss@iki.fi>
parents:
18786
diff
changeset
|
28 #define UTF8_IS_START_SEQ(b) \ |
b95be677f483
lib: Added UTF8_IS_START_SEQ() helper macro
Timo Sirainen <tss@iki.fi>
parents:
18786
diff
changeset
|
29 (((b) & 0x80) == 0 || ((b) & 0xC0) == 0xC0) |
b95be677f483
lib: Added UTF8_IS_START_SEQ() helper macro
Timo Sirainen <tss@iki.fi>
parents:
18786
diff
changeset
|
30 |
12024
6105706de7b6
Added a global utf8_replacement_char variable.
Timo Sirainen <tss@iki.fi>
parents:
12010
diff
changeset
|
31 #define UTF8_REPLACEMENT_CHAR_LEN 3 |
6105706de7b6
Added a global utf8_replacement_char variable.
Timo Sirainen <tss@iki.fi>
parents:
12010
diff
changeset
|
32 |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
33 typedef uint32_t unichar_t; |
7042
dcbf6afdf931
Define unichars array type and use it for uni_utf8_to_ucs4() output.
Timo Sirainen <tss@iki.fi>
parents:
6952
diff
changeset
|
34 ARRAY_DEFINE_TYPE(unichars, unichar_t); |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
35 |
15141
99305e4dd403
Backported parts of normalizer_func_t changes from v2.2 tree.
Timo Sirainen <tss@iki.fi>
parents:
13100
diff
changeset
|
36 /* Normalize UTF8 input and append it to output buffer. |
99305e4dd403
Backported parts of normalizer_func_t changes from v2.2 tree.
Timo Sirainen <tss@iki.fi>
parents:
13100
diff
changeset
|
37 Returns 0 if ok, -1 if input was invalid. Even if input was invalid, |
99305e4dd403
Backported parts of normalizer_func_t changes from v2.2 tree.
Timo Sirainen <tss@iki.fi>
parents:
13100
diff
changeset
|
38 as much as possible should be added to output. */ |
99305e4dd403
Backported parts of normalizer_func_t changes from v2.2 tree.
Timo Sirainen <tss@iki.fi>
parents:
13100
diff
changeset
|
39 typedef int normalizer_func_t(const void *input, size_t size, |
99305e4dd403
Backported parts of normalizer_func_t changes from v2.2 tree.
Timo Sirainen <tss@iki.fi>
parents:
13100
diff
changeset
|
40 buffer_t *output); |
99305e4dd403
Backported parts of normalizer_func_t changes from v2.2 tree.
Timo Sirainen <tss@iki.fi>
parents:
13100
diff
changeset
|
41 |
12024
6105706de7b6
Added a global utf8_replacement_char variable.
Timo Sirainen <tss@iki.fi>
parents:
12010
diff
changeset
|
42 extern const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN]; |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
43 extern const uint8_t *const uni_utf8_non1_bytes; |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
44 |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
45 /* Returns number of characters in a NUL-terminated unicode string */ |
7912
81806d402514
Added more consts, ATTR_CONSTs and ATTR_PUREs.
Timo Sirainen <tss@iki.fi>
parents:
7185
diff
changeset
|
46 unsigned int uni_strlen(const unichar_t *str) ATTR_PURE; |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
47 /* Translates UTF-8 input to UCS-4 output. Returns 0 if ok, -1 if input was |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
48 invalid */ |
7042
dcbf6afdf931
Define unichars array type and use it for uni_utf8_to_ucs4() output.
Timo Sirainen <tss@iki.fi>
parents:
6952
diff
changeset
|
49 int uni_utf8_to_ucs4(const char *input, ARRAY_TYPE(unichars) *output); |
13050
7a7c22755b7a
liblib: Added uni_utf8_to_ucs4_n().
Timo Sirainen <tss@iki.fi>
parents:
12745
diff
changeset
|
50 int uni_utf8_to_ucs4_n(const unsigned char *input, size_t size, |
7a7c22755b7a
liblib: Added uni_utf8_to_ucs4_n().
Timo Sirainen <tss@iki.fi>
parents:
12745
diff
changeset
|
51 ARRAY_TYPE(unichars) *output); |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
52 /* Translates UCS-4 input to UTF-8 output. */ |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
53 void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output); |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
54 void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output); |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
55 |
18786
8b7be28bd518
lib: API change - have uni_utf8_get_char*() return _char_bytes
Phil Carmody <phil@dovecot.fi>
parents:
18716
diff
changeset
|
56 /* Returns char_bytes (>0) if *chr_r is set, 0 for incomplete trailing character, |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
57 -1 for invalid input. */ |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
58 int uni_utf8_get_char(const char *input, unichar_t *chr_r); |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
59 int uni_utf8_get_char_n(const void *input, size_t max_len, unichar_t *chr_r); |
18148
e645ee117fa9
lib: Fixed NUL-handling in uni_utf8_*strlen*()
Timo Sirainen <tss@iki.fi>
parents:
18145
diff
changeset
|
60 /* Returns number of characters in UTF-8 string. */ |
13100 | 61 unsigned int uni_utf8_strlen(const char *input) ATTR_PURE; |
18148
e645ee117fa9
lib: Fixed NUL-handling in uni_utf8_*strlen*()
Timo Sirainen <tss@iki.fi>
parents:
18145
diff
changeset
|
62 /* Returns number of characters in UTF-8 input of specified size. */ |
7912
81806d402514
Added more consts, ATTR_CONSTs and ATTR_PUREs.
Timo Sirainen <tss@iki.fi>
parents:
7185
diff
changeset
|
63 unsigned int uni_utf8_strlen_n(const void *input, size_t size) ATTR_PURE; |
18145
f191dbcaec5f
lib: Added uni_utf8_partial_strlen_n()
Timo Sirainen <tss@iki.fi>
parents:
15578
diff
changeset
|
64 /* Same as uni_utf8_strlen_n(), but if input ends with a partial UTF-8 |
f191dbcaec5f
lib: Added uni_utf8_partial_strlen_n()
Timo Sirainen <tss@iki.fi>
parents:
15578
diff
changeset
|
65 character, don't include it in the return value and set partial_pos_r to |
f191dbcaec5f
lib: Added uni_utf8_partial_strlen_n()
Timo Sirainen <tss@iki.fi>
parents:
15578
diff
changeset
|
66 where the character begins. Otherwise partial_pos_r is set to the end |
f191dbcaec5f
lib: Added uni_utf8_partial_strlen_n()
Timo Sirainen <tss@iki.fi>
parents:
15578
diff
changeset
|
67 of the input. */ |
f191dbcaec5f
lib: Added uni_utf8_partial_strlen_n()
Timo Sirainen <tss@iki.fi>
parents:
15578
diff
changeset
|
68 unsigned int uni_utf8_partial_strlen_n(const void *input, size_t size, |
f191dbcaec5f
lib: Added uni_utf8_partial_strlen_n()
Timo Sirainen <tss@iki.fi>
parents:
15578
diff
changeset
|
69 size_t *partial_pos_r); |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
70 |
12010
a83963495e55
UTF-8 string validity was still checked incorrectly.
Timo Sirainen <tss@iki.fi>
parents:
10294
diff
changeset
|
71 /* Returns the number of bytes belonging to this UTF-8 character. The given |
a83963495e55
UTF-8 string validity was still checked incorrectly.
Timo Sirainen <tss@iki.fi>
parents:
10294
diff
changeset
|
72 parameter is the first byte of the UTF-8 sequence. Invalid input is |
a83963495e55
UTF-8 string validity was still checked incorrectly.
Timo Sirainen <tss@iki.fi>
parents:
10294
diff
changeset
|
73 returned with length 1. */ |
7912
81806d402514
Added more consts, ATTR_CONSTs and ATTR_PUREs.
Timo Sirainen <tss@iki.fi>
parents:
7185
diff
changeset
|
74 static inline unsigned int ATTR_CONST |
81806d402514
Added more consts, ATTR_CONSTs and ATTR_PUREs.
Timo Sirainen <tss@iki.fi>
parents:
7185
diff
changeset
|
75 uni_utf8_char_bytes(char chr) |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
76 { |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
77 /* 0x00 .. 0x7f are ASCII. 0x80 .. 0xC1 are invalid as a first byte. Both ranges are reported as length 1; bytes from 0xC2 (192 + 2) upwards get their length from uni_utf8_non1_bytes, which is indexed relative to 0xC2. */ |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
78 if ((uint8_t)chr < (192 + 2)) |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
79 return 1; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
80 return uni_utf8_non1_bytes[(uint8_t)chr - (192 + 2)]; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
81 } |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
82 |
6129
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
83 /* Return given character in titlecase. */ |
7912
81806d402514
Added more consts, ATTR_CONSTs and ATTR_PUREs.
Timo Sirainen <tss@iki.fi>
parents:
7185
diff
changeset
|
84 unichar_t uni_ucs4_to_titlecase(unichar_t chr) ATTR_CONST; |
6129
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
85 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
86 /* Convert UTF-8 input to titlecase and decompose the titlecase characters to |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
87 output buffer. Returns 0 if ok, -1 if input was invalid. This generates |
7185
6f014a866f38
Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents:
7042
diff
changeset
|
88 output that's compatible with i;unicode-casemap comparator. Invalid input |
6f014a866f38
Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents:
7042
diff
changeset
|
89 is replaced with unicode replacement character (0xfffd). */ |
15052
d5ebec837bfd
uni_utf8_to_decomposed_titlecase(): Require input length to be exact now.
Timo Sirainen <tss@iki.fi>
parents:
13100
diff
changeset
|
90 int uni_utf8_to_decomposed_titlecase(const void *input, size_t size, |
6129
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
91 buffer_t *output); |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
92 |
7185
6f014a866f38
Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents:
7042
diff
changeset
|
93 /* If input contains only valid UTF-8 characters, return TRUE without updating |
6f014a866f38
Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents:
7042
diff
changeset
|
94 buf. If input contains invalid UTF-8 characters, replace them with unicode |
6f014a866f38
Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents:
7042
diff
changeset
|
95 replacement character (0xfffd), write the output to buf and return FALSE. */ |
6952
08e4d7efcd6a
uni_utf8_get_valid_data() API changed.
Timo Sirainen <tss@iki.fi>
parents:
6951
diff
changeset
|
96 bool uni_utf8_get_valid_data(const unsigned char *input, size_t size, |
08e4d7efcd6a
uni_utf8_get_valid_data() API changed.
Timo Sirainen <tss@iki.fi>
parents:
6951
diff
changeset
|
97 buffer_t *buf); |
10294 | 98 /* Returns TRUE if string is valid UTF-8 input. */ |
99 bool uni_utf8_str_is_valid(const char *str); | |
12745
35c6df7f6144
Added uni_utf8_data_is_valid().
Timo Sirainen <tss@iki.fi>
parents:
12024
diff
changeset
|
100 /* Returns TRUE if data contains only valid UTF-8 input. */ |
35c6df7f6144
Added uni_utf8_data_is_valid().
Timo Sirainen <tss@iki.fi>
parents:
12024
diff
changeset
|
101 bool uni_utf8_data_is_valid(const unsigned char *data, size_t size); |
6951
1f70c72e4312
Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents:
6410
diff
changeset
|
102 |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
103 #endif |