annotate src/lib/unichar.c @ 9603:5efba9f9f0a7 HEAD

Added a global utf8_replacement_char variable.
author Timo Sirainen <tss@iki.fi>
date Fri, 20 Aug 2010 20:37:31 +0100
parents cc7aa7a4dd6d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
9532
00cd9aacd03c Updated copyright notices to include year 2010.
Timo Sirainen <tss@iki.fi>
parents: 9476
diff changeset
1 /* Copyright (c) 2005-2010 Dovecot authors, see the included COPYING file */
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
2
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
3 #include "lib.h"
7042
dcbf6afdf931 Define unichars array type and use it for uni_utf8_to_ucs4() output.
Timo Sirainen <tss@iki.fi>
parents: 6953
diff changeset
4 #include "array.h"
6129
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
5 #include "bsearch-insert-pos.h"
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
6 #include "unichar.h"
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
7
6129
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
8 #include "unicodemap.c"
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
9
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
10 #define HANGUL_FIRST 0xac00
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
11 #define HANGUL_LAST 0xd7a3
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
12
9603
5efba9f9f0a7 Added a global utf8_replacement_char variable.
Timo Sirainen <tss@iki.fi>
parents: 9602
diff changeset
/* The bytes 0xef 0xbf 0xbd are the UTF-8 encoding of U+FFFD (REPLACEMENT
   CHARACTER). UTF8_REPLACEMENT_CHAR_LEN is defined outside this file
   (presumably 3, in unichar.h - confirm). */
const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN] =
	{ 0xef, 0xbf, 0xbd }; /* 0xfffd */
5efba9f9f0a7 Added a global utf8_replacement_char variable.
Timo Sirainen <tss@iki.fi>
parents: 9602
diff changeset
15
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
/* Table of UTF-8 sequence lengths (values 2..6, followed by two 1s) with
   256 - 192 - 2 entries. Presumably entry i describes lead byte
   (192 + 2) + i, i.e. 0xc2..0xff, and is consulted by
   uni_utf8_char_bytes() - confirm against unichar.h. */
static const uint8_t utf8_non1_bytes[256 - 192 - 2] = {
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

/* Externally visible read-only pointer to the table above. */
const uint8_t *const uni_utf8_non1_bytes = utf8_non1_bytes;
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
22
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
23 unsigned int uni_strlen(const unichar_t *str)
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
24 {
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
25 unsigned int len = 0;
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
26
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
27 for (len = 0; str[len] != 0; len++) ;
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
28
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
29 return len;
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
30 }
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
31
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
32 int uni_utf8_get_char(const char *input, unichar_t *chr_r)
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
33 {
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
34 return uni_utf8_get_char_n((const unsigned char *)input, (size_t)-1,
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
35 chr_r);
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
36 }
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
37
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
38 int uni_utf8_get_char_n(const void *_input, size_t max_len, unichar_t *chr_r)
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
39 {
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
40 const unsigned char *input = _input;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
41 unichar_t chr;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
42 unsigned int i, len;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
43 int ret;
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
44
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
45 i_assert(max_len > 0);
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
46
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
47 if (*input < 0x80) {
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
48 *chr_r = *input;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
49 return 1;
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
50 }
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
51
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
52 /* first byte has len highest bits set, followed by zero bit.
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
53 the rest of the bits are used as the highest bits of the value. */
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
54 chr = *input;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
55 len = uni_utf8_char_bytes(*input);
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
56 switch (len) {
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
57 case 2:
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
58 chr &= 0x1f;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
59 break;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
60 case 3:
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
61 chr &= 0x0f;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
62 break;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
63 case 4:
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
64 chr &= 0x07;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
65 break;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
66 case 5:
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
67 chr &= 0x03;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
68 break;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
69 case 6:
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
70 chr &= 0x01;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
71 break;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
72 default:
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
73 /* only 7bit chars should have len==1 */
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
74 i_assert(len == 1);
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
75 return -1;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
76 }
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
77
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
78 if (len <= max_len)
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
79 ret = 1;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
80 else {
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
81 /* check first if the input is invalid before returning 0 */
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
82 ret = 0;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
83 len = max_len;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
84 }
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
85
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
86 /* the following bytes must all be 10xxxxxx */
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
87 for (i = 1; i < len; i++) {
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
88 if ((input[i] & 0xc0) != 0x80)
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
89 return input[i] == '\0' ? 0 : -1;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
90
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
91 chr <<= 6;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
92 chr |= input[i] & 0x3f;
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
93 }
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
94
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
95 *chr_r = chr;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
96 return ret;
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
97 }
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
98
7042
dcbf6afdf931 Define unichars array type and use it for uni_utf8_to_ucs4() output.
Timo Sirainen <tss@iki.fi>
parents: 6953
diff changeset
99 int uni_utf8_to_ucs4(const char *input, ARRAY_TYPE(unichars) *output)
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
100 {
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
101 unichar_t chr;
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
102
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
103 while (*input != '\0') {
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
104 if (uni_utf8_get_char(input, &chr) <= 0) {
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
105 /* invalid input */
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
106 return -1;
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
107 }
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
108 input += uni_utf8_char_bytes(*input);
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
109
7042
dcbf6afdf931 Define unichars array type and use it for uni_utf8_to_ucs4() output.
Timo Sirainen <tss@iki.fi>
parents: 6953
diff changeset
110 array_append(output, &chr, 1);
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
111 }
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
112 return 0;
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
113 }
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
114
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
115 void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output)
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
116 {
7084
d8e7699ac68e uni_ucs4_to_utf8(): Check len>0 first so we don't access input[len].
Timo Sirainen <tss@iki.fi>
parents: 7042
diff changeset
117 for (; len > 0 && *input != '\0'; input++, len--)
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
118 uni_ucs4_to_utf8_c(*input, output);
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
119 }
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
120
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
121 void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output)
4899
c98008a7e9b7 Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
122 {
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
123 unsigned char first;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
124 int bitpos;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
125
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
126 if (chr < 0x80) {
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
127 buffer_append_c(output, chr);
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
128 return;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
129 }
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
130
6780
8b11c9f06bbf Assert fix
Timo Sirainen <tss@iki.fi>
parents: 6429
diff changeset
131 i_assert(chr < 0x80000000); /* 1 << (5*6 + 1) */
5683
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
132
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
133 if (chr < (1 << (6 + 5))) {
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
134 /* 110xxxxx */
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
135 bitpos = 6;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
136 first = 0x80 | 0x40;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
137 } else if (chr < (1 << ((2*6) + 4))) {
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
138 /* 1110xxxx */
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
139 bitpos = 2*6;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
140 first = 0x80 | 0x40 | 0x20;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
141 } else if (chr < (1 << ((3*6) + 3))) {
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
142 /* 11110xxx */
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
143 bitpos = 3*6;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
144 first = 0x80 | 0x40 | 0x20 | 0x10;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
145 } else if (chr < (1 << ((4*6) + 2))) {
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
146 /* 111110xx */
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
147 bitpos = 4*6;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
148 first = 0x80 | 0x40 | 0x20 | 0x10 | 0x08;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
149 } else {
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
150 /* 1111110x */
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
151 bitpos = 5*6;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
152 first = 0x80 | 0x40 | 0x20 | 0x10 | 0x08 | 0x04;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
153 }
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
154 buffer_append_c(output, first | (chr >> bitpos));
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
155
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
156 do {
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
157 bitpos -= 6;
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
158 buffer_append_c(output, 0x80 | ((chr >> bitpos) & 0x3f));
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
159 } while (bitpos > 0);
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
160 }
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
161
8101787cdd1c Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents: 4899
diff changeset
/* Counts the characters in the first size bytes of _input, stopping early
   at a NUL byte. Each character's length is taken from
   uni_utf8_char_bytes() applied to its first byte; the following bytes are
   not inspected. A final character whose length would extend past size is
   not counted. */
unsigned int uni_utf8_strlen_n(const void *_input, size_t size)
{
	const unsigned char *bytes = _input;
	unsigned int count = 0;
	size_t pos = 0;

	while (pos < size && bytes[pos] != '\0') {
		pos += uni_utf8_char_bytes(bytes[pos]);
		if (pos > size)
			return count;
		count++;
	}
	return count;
}
6129
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
176
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
/* Searches the count-element array data for value using the
   BINARY_NUMBER_SEARCH() macro (presumably from bsearch-insert-pos.h).
   The macro expansion supplies this function's return statement. Callers
   in this file treat a TRUE result as "found" and then use *idx_r as the
   index of the match. Presumably data must be sorted in ascending order -
   confirm against the macro definition. */
static bool uint16_find(const uint16_t *data, unsigned int count,
			uint16_t value, unsigned int *idx_r)
{
	BINARY_NUMBER_SEARCH(data, count, value, idx_r);
}
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
182
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
/* 32-bit counterpart of uint16_find(): searches the count-element array
   data for value using the BINARY_NUMBER_SEARCH() macro, whose expansion
   supplies the return statement. Callers in this file treat a TRUE result
   as "found" and then use *idx_r as the index of the match. Presumably
   data must be sorted in ascending order - confirm against the macro
   definition. */
static bool uint32_find(const uint32_t *data, unsigned int count,
			uint32_t value, unsigned int *idx_r)
{
	BINARY_NUMBER_SEARCH(data, count, value, idx_r);
}
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
188
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
189 unichar_t uni_ucs4_to_titlecase(unichar_t chr)
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
190 {
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
191 unsigned int idx;
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
192
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
193 if (chr <= 0xffff) {
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
194 if (!uint16_find(titlecase16_keys, N_ELEMENTS(titlecase16_keys),
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
195 chr, &idx))
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
196 return chr;
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
197 else
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
198 return titlecase16_values[idx];
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
199 } else {
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
200 if (!uint32_find(titlecase32_keys, N_ELEMENTS(titlecase32_keys),
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
201 chr, &idx))
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
202 return chr;
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
203 else
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
204 return titlecase32_values[idx];
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
205 }
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
206 }
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
207
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
208 static bool uni_ucs4_decompose_uni(unichar_t *chr)
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
209 {
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
210 unsigned int idx;
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
211
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
212 if (*chr <= 0xffff) {
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
213 if (!uint16_find(uni16_decomp_keys,
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
214 N_ELEMENTS(uni16_decomp_keys),
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
215 *chr, &idx))
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
216 return FALSE;
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
217 *chr = uni16_decomp_values[idx];
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
218 } else {
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
219 if (!uint32_find(uni32_decomp_keys,
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
220 N_ELEMENTS(uni32_decomp_keys),
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
221 *chr, &idx))
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
222 return FALSE;
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
223 *chr = uni32_decomp_values[idx];
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
224 }
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
225 return TRUE;
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
226 }
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
227
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
228 static void uni_ucs4_decompose_hangul_utf8(unichar_t chr, buffer_t *output)
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
229 {
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
230 #define SBase HANGUL_FIRST
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
231 #define LBase 0x1100
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
232 #define VBase 0x1161
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
233 #define TBase 0x11A7
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
234 #define LCount 19
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
235 #define VCount 21
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
236 #define TCount 28
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
237 #define NCount (VCount * TCount)
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
238 unsigned int SIndex = chr - SBase;
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
239 unichar_t L = LBase + SIndex / NCount;
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
240 unichar_t V = VBase + (SIndex % NCount) / TCount;
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
241 unichar_t T = TBase + SIndex % TCount;
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
242
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
243 uni_ucs4_to_utf8_c(L, output);
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
244 uni_ucs4_to_utf8_c(V, output);
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
245 if (T != TBase) uni_ucs4_to_utf8_c(T, output);
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
246 }
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
247
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
248 static bool uni_ucs4_decompose_multi_utf8(unichar_t chr, buffer_t *output)
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
249 {
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
250 const uint16_t *value;
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
251 unsigned int idx;
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
252
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
253 if (chr > 0xffff)
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
254 return FALSE;
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
255
9405
9a8c565adbe1 New UnicodeMap.txt has >16bit multi-decomposition keys, support them.
Timo Sirainen <tss@iki.fi>
parents: 8590
diff changeset
256 if (!uint32_find(multidecomp_keys, N_ELEMENTS(multidecomp_keys),
6129
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
257 chr, &idx))
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
258 return FALSE;
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
259
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
260 value = &multidecomp_values[multidecomp_offsets[idx]];
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
261 for (; *value != 0; value++)
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
262 uni_ucs4_to_utf8_c(*value, output);
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
263 return TRUE;
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
264 }
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
265
7185
6f014a866f38 Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents: 7157
diff changeset
266 static void output_add_replacement_char(buffer_t *output)
6f014a866f38 Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents: 7157
diff changeset
267 {
9603
5efba9f9f0a7 Added a global utf8_replacement_char variable.
Timo Sirainen <tss@iki.fi>
parents: 9602
diff changeset
268 if (output->used >= UTF8_REPLACEMENT_CHAR_LEN &&
7185
6f014a866f38 Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents: 7157
diff changeset
269 memcmp(CONST_PTR_OFFSET(output->data,
9603
5efba9f9f0a7 Added a global utf8_replacement_char variable.
Timo Sirainen <tss@iki.fi>
parents: 9602
diff changeset
270 output->used - UTF8_REPLACEMENT_CHAR_LEN),
5efba9f9f0a7 Added a global utf8_replacement_char variable.
Timo Sirainen <tss@iki.fi>
parents: 9602
diff changeset
271 utf8_replacement_char, UTF8_REPLACEMENT_CHAR_LEN) == 0) {
7185
6f014a866f38 Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents: 7157
diff changeset
272 /* don't add the replacement char multiple times */
6f014a866f38 Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents: 7157
diff changeset
273 return;
6f014a866f38 Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents: 7157
diff changeset
274 }
9603
5efba9f9f0a7 Added a global utf8_replacement_char variable.
Timo Sirainen <tss@iki.fi>
parents: 9602
diff changeset
275 buffer_append(output, utf8_replacement_char, UTF8_REPLACEMENT_CHAR_LEN);
7185
6f014a866f38 Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents: 7157
diff changeset
276 }
6f014a866f38 Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents: 7157
diff changeset
277
6129
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
278 int uni_utf8_to_decomposed_titlecase(const void *_input, size_t max_len,
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
279 buffer_t *output)
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
280 {
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
281 const unsigned char *input = _input;
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
282 unsigned int bytes;
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
283 unichar_t chr;
6953
edd296d164db uni_utf8_to_decomposed_titlecase(): If we encounter invalid UTF-8 input,
Timo Sirainen <tss@iki.fi>
parents: 6952
diff changeset
284 int ret = 0;
6129
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
285
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
286 while (max_len > 0 && *input != '\0') {
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
287 if (uni_utf8_get_char_n(input, max_len, &chr) <= 0) {
6953
edd296d164db uni_utf8_to_decomposed_titlecase(): If we encounter invalid UTF-8 input,
Timo Sirainen <tss@iki.fi>
parents: 6952
diff changeset
288 /* invalid input. try the next byte. */
edd296d164db uni_utf8_to_decomposed_titlecase(): If we encounter invalid UTF-8 input,
Timo Sirainen <tss@iki.fi>
parents: 6952
diff changeset
289 ret = -1;
edd296d164db uni_utf8_to_decomposed_titlecase(): If we encounter invalid UTF-8 input,
Timo Sirainen <tss@iki.fi>
parents: 6952
diff changeset
290 input++; max_len--;
7185
6f014a866f38 Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents: 7157
diff changeset
291 output_add_replacement_char(output);
6953
edd296d164db uni_utf8_to_decomposed_titlecase(): If we encounter invalid UTF-8 input,
Timo Sirainen <tss@iki.fi>
parents: 6952
diff changeset
292 continue;
6129
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
293 }
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
294 bytes = uni_utf8_char_bytes(*input);
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
295 input += bytes;
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
296 max_len -= bytes;
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
297
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
298 chr = uni_ucs4_to_titlecase(chr);
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
299 if (chr >= HANGUL_FIRST && chr <= HANGUL_LAST)
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
300 uni_ucs4_decompose_hangul_utf8(chr, output);
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
301 else if (uni_ucs4_decompose_uni(&chr) ||
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
302 !uni_ucs4_decompose_multi_utf8(chr, output))
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
303 uni_ucs4_to_utf8_c(chr, output);
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
304 }
6953
edd296d164db uni_utf8_to_decomposed_titlecase(): If we encounter invalid UTF-8 input,
Timo Sirainen <tss@iki.fi>
parents: 6952
diff changeset
305 return ret;
6129
04b9eb27283c Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents: 5683
diff changeset
306 }
6951
1f70c72e4312 Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents: 6780
diff changeset
307
1f70c72e4312 Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents: 6780
diff changeset
/* Check whether input begins with a structurally valid multi-byte UTF-8
   sequence.

   input - points at the first byte of the candidate sequence
   size  - number of bytes available at input. This is size_t rather than
           unsigned int because callers pass a size_t remainder, which
           must not be truncated before the length comparison below.

   Returns the sequence length in bytes, or 0 if the sequence is invalid:
   the expected length is 1 (callers handle single-byte characters
   themselves before calling this), the sequence doesn't fit into the
   available input, or one of the following bytes isn't a continuation
   byte. */
static inline unsigned int
is_valid_utf8_seq(const unsigned char *input, size_t size)
{
	unsigned int i, len;

	len = uni_utf8_char_bytes(input[0]);
	if (unlikely(len > size || len == 1))
		return 0;

	/* the rest of the chars should be in 0x80..0xbf range.
	   anything else is start of a sequence or invalid */
	for (i = 1; i < len; i++) {
		if (unlikely(input[i] < 0x80 || input[i] > 0xbf))
			return 0;
	}
	return len;
}
1f70c72e4312 Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents: 6780
diff changeset
325
9476
bf2fb1679cb4 Added uni_utf8_str_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 9405
diff changeset
/* Scan size bytes of input for the first invalid UTF-8 sequence.

   Returns 0 if none was found; *pos_r is not written in that case.
   Returns -1 and stores the offset of the offending byte in *pos_r
   otherwise. */
static int uni_utf8_find_invalid_pos(const unsigned char *input, size_t size,
				     size_t *pos_r)
{
	size_t pos = 0, seq_len;

	while (pos < size) {
		if (input[pos] < 0x80) {
			/* plain 7-bit byte */
			pos++;
			continue;
		}

		seq_len = is_valid_utf8_seq(input + pos, size - pos);
		if (unlikely(seq_len == 0)) {
			*pos_r = pos;
			return -1;
		}
		pos += seq_len;
	}
	return 0;
}
bf2fb1679cb4 Added uni_utf8_str_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 9405
diff changeset
346
bf2fb1679cb4 Added uni_utf8_str_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 9405
diff changeset
347 bool uni_utf8_get_valid_data(const unsigned char *input, size_t size,
bf2fb1679cb4 Added uni_utf8_str_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 9405
diff changeset
348 buffer_t *buf)
bf2fb1679cb4 Added uni_utf8_str_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 9405
diff changeset
349 {
bf2fb1679cb4 Added uni_utf8_str_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 9405
diff changeset
350 size_t i, len;
bf2fb1679cb4 Added uni_utf8_str_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 9405
diff changeset
351
bf2fb1679cb4 Added uni_utf8_str_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 9405
diff changeset
352 if (uni_utf8_find_invalid_pos(input, size, &i) == 0)
bf2fb1679cb4 Added uni_utf8_str_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 9405
diff changeset
353 return TRUE;
bf2fb1679cb4 Added uni_utf8_str_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 9405
diff changeset
354
6951
1f70c72e4312 Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents: 6780
diff changeset
355 /* broken utf-8 input - skip the broken characters */
6952
08e4d7efcd6a uni_utf8_get_valid_data() API changed.
Timo Sirainen <tss@iki.fi>
parents: 6951
diff changeset
356 buffer_append(buf, input, i++);
6951
1f70c72e4312 Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents: 6780
diff changeset
357
7185
6f014a866f38 Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents: 7157
diff changeset
358 output_add_replacement_char(buf);
6951
1f70c72e4312 Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents: 6780
diff changeset
359 while (i < size) {
1f70c72e4312 Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents: 6780
diff changeset
360 if (input[i] < 0x80) {
6952
08e4d7efcd6a uni_utf8_get_valid_data() API changed.
Timo Sirainen <tss@iki.fi>
parents: 6951
diff changeset
361 buffer_append_c(buf, input[i++]);
6951
1f70c72e4312 Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents: 6780
diff changeset
362 continue;
1f70c72e4312 Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents: 6780
diff changeset
363 }
1f70c72e4312 Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents: 6780
diff changeset
364
1f70c72e4312 Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents: 6780
diff changeset
365 len = is_valid_utf8_seq(input + i, size-i);
1f70c72e4312 Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents: 6780
diff changeset
366 if (len == 0) {
1f70c72e4312 Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents: 6780
diff changeset
367 i++;
7185
6f014a866f38 Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents: 7157
diff changeset
368 output_add_replacement_char(buf);
6951
1f70c72e4312 Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents: 6780
diff changeset
369 continue;
1f70c72e4312 Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents: 6780
diff changeset
370 }
6952
08e4d7efcd6a uni_utf8_get_valid_data() API changed.
Timo Sirainen <tss@iki.fi>
parents: 6951
diff changeset
371 buffer_append(buf, input + i, len);
6951
1f70c72e4312 Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents: 6780
diff changeset
372 i += len;
1f70c72e4312 Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents: 6780
diff changeset
373 }
6952
08e4d7efcd6a uni_utf8_get_valid_data() API changed.
Timo Sirainen <tss@iki.fi>
parents: 6951
diff changeset
374 return FALSE;
6951
1f70c72e4312 Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents: 6780
diff changeset
375 }
9476
bf2fb1679cb4 Added uni_utf8_str_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 9405
diff changeset
376
bf2fb1679cb4 Added uni_utf8_str_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 9405
diff changeset
377 bool uni_utf8_str_is_valid(const char *str)
bf2fb1679cb4 Added uni_utf8_str_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 9405
diff changeset
378 {
bf2fb1679cb4 Added uni_utf8_str_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 9405
diff changeset
379 size_t i;
bf2fb1679cb4 Added uni_utf8_str_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 9405
diff changeset
380
bf2fb1679cb4 Added uni_utf8_str_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 9405
diff changeset
381 return uni_utf8_find_invalid_pos((const unsigned char *)str,
bf2fb1679cb4 Added uni_utf8_str_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 9405
diff changeset
382 strlen(str), &i) == 0;
bf2fb1679cb4 Added uni_utf8_str_is_valid().
Timo Sirainen <tss@iki.fi>
parents: 9405
diff changeset
383 }