Mercurial > dovecot > original-hg > dovecot-1.2
annotate src/lib/unichar.c @ 9603:5efba9f9f0a7 HEAD
Added a global utf8_replacement_char variable.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Fri, 20 Aug 2010 20:37:31 +0100 |
parents | cc7aa7a4dd6d |
children |
rev | line source |
---|---|
9532
00cd9aacd03c
Updated copyright notices to include year 2010.
Timo Sirainen <tss@iki.fi>
parents:
9476
diff
changeset
|
1 /* Copyright (c) 2005-2010 Dovecot authors, see the included COPYING file */ |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
2 |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
3 #include "lib.h" |
7042
dcbf6afdf931
Define unichars array type and use it for uni_utf8_to_ucs4() output.
Timo Sirainen <tss@iki.fi>
parents:
6953
diff
changeset
|
4 #include "array.h" |
6129
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
5 #include "bsearch-insert-pos.h" |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
6 #include "unichar.h" |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
7 |
6129
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
8 #include "unicodemap.c" |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
9 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
10 #define HANGUL_FIRST 0xac00 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
11 #define HANGUL_LAST 0xd7a3 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
12 |
9603
5efba9f9f0a7
Added a global utf8_replacement_char variable.
Timo Sirainen <tss@iki.fi>
parents:
9602
diff
changeset
|
13 const unsigned char utf8_replacement_char[UTF8_REPLACEMENT_CHAR_LEN] = |
5efba9f9f0a7
Added a global utf8_replacement_char variable.
Timo Sirainen <tss@iki.fi>
parents:
9602
diff
changeset
|
14 { 0xef, 0xbf, 0xbd }; /* 0xfffd */ |
5efba9f9f0a7
Added a global utf8_replacement_char variable.
Timo Sirainen <tss@iki.fi>
parents:
9602
diff
changeset
|
15 |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
/* Total length in bytes of a UTF-8 sequence, by lead byte. The table has
   256 - 192 - 2 = 62 entries, which matches the lead bytes 0xc2..0xff.
   NOTE(review): presumably indexed with (byte - 0xc2) by
   uni_utf8_char_bytes() in unichar.h - confirm against the header. */
static const uint8_t utf8_non1_bytes[256 - 192 - 2] = {
	/* 30 entries: 2-byte sequences */
	2,2,2,2,2,2,2,2,2,2,
	2,2,2,2,2,2,2,2,2,2,
	2,2,2,2,2,2,2,2,2,2,
	/* 16 entries: 3-byte sequences */
	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
	/* 8 entries: 4-byte sequences */
	4,4,4,4,4,4,4,4,
	/* 4 entries: 5-byte sequences */
	5,5,5,5,
	/* 2 entries: 6-byte sequences */
	6,6,
	/* last 2 entries: not valid lead bytes, treated as 1 byte long */
	1,1
};

/* Exported read-only view of the table above. */
const uint8_t *const uni_utf8_non1_bytes = utf8_non1_bytes;
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
22 |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
23 unsigned int uni_strlen(const unichar_t *str) |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
24 { |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
25 unsigned int len = 0; |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
26 |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
27 for (len = 0; str[len] != 0; len++) ; |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
28 |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
29 return len; |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
30 } |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
31 |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
32 int uni_utf8_get_char(const char *input, unichar_t *chr_r) |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
33 { |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
34 return uni_utf8_get_char_n((const unsigned char *)input, (size_t)-1, |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
35 chr_r); |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
36 } |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
37 |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
38 int uni_utf8_get_char_n(const void *_input, size_t max_len, unichar_t *chr_r) |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
39 { |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
40 const unsigned char *input = _input; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
41 unichar_t chr; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
42 unsigned int i, len; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
43 int ret; |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
44 |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
45 i_assert(max_len > 0); |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
46 |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
47 if (*input < 0x80) { |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
48 *chr_r = *input; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
49 return 1; |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
50 } |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
51 |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
52 /* first byte has len highest bits set, followed by zero bit. |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
53 the rest of the bits are used as the highest bits of the value. */ |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
54 chr = *input; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
55 len = uni_utf8_char_bytes(*input); |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
56 switch (len) { |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
57 case 2: |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
58 chr &= 0x1f; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
59 break; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
60 case 3: |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
61 chr &= 0x0f; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
62 break; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
63 case 4: |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
64 chr &= 0x07; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
65 break; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
66 case 5: |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
67 chr &= 0x03; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
68 break; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
69 case 6: |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
70 chr &= 0x01; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
71 break; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
72 default: |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
73 /* only 7bit chars should have len==1 */ |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
74 i_assert(len == 1); |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
75 return -1; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
76 } |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
77 |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
78 if (len <= max_len) |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
79 ret = 1; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
80 else { |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
81 /* check first if the input is invalid before returning 0 */ |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
82 ret = 0; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
83 len = max_len; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
84 } |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
85 |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
86 /* the following bytes must all be 10xxxxxx */ |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
87 for (i = 1; i < len; i++) { |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
88 if ((input[i] & 0xc0) != 0x80) |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
89 return input[i] == '\0' ? 0 : -1; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
90 |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
91 chr <<= 6; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
92 chr |= input[i] & 0x3f; |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
93 } |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
94 |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
95 *chr_r = chr; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
96 return ret; |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
97 } |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
98 |
7042
dcbf6afdf931
Define unichars array type and use it for uni_utf8_to_ucs4() output.
Timo Sirainen <tss@iki.fi>
parents:
6953
diff
changeset
|
99 int uni_utf8_to_ucs4(const char *input, ARRAY_TYPE(unichars) *output) |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
100 { |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
101 unichar_t chr; |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
102 |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
103 while (*input != '\0') { |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
104 if (uni_utf8_get_char(input, &chr) <= 0) { |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
105 /* invalid input */ |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
106 return -1; |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
107 } |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
108 input += uni_utf8_char_bytes(*input); |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
109 |
7042
dcbf6afdf931
Define unichars array type and use it for uni_utf8_to_ucs4() output.
Timo Sirainen <tss@iki.fi>
parents:
6953
diff
changeset
|
110 array_append(output, &chr, 1); |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
111 } |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
112 return 0; |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
113 } |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
114 |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
115 void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output) |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
116 { |
7084
d8e7699ac68e
uni_ucs4_to_utf8(): Check len>0 first so we don't access input[len].
Timo Sirainen <tss@iki.fi>
parents:
7042
diff
changeset
|
117 for (; len > 0 && *input != '\0'; input++, len--) |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
118 uni_ucs4_to_utf8_c(*input, output); |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
119 } |
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
120 |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
121 void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output) |
4899
c98008a7e9b7
Added unichar_t UCS-4 type and some ucs4/utf8 functions.
Timo Sirainen <tss@iki.fi>
parents:
diff
changeset
|
122 { |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
123 unsigned char first; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
124 int bitpos; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
125 |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
126 if (chr < 0x80) { |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
127 buffer_append_c(output, chr); |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
128 return; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
129 } |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
130 |
6780 | 131 i_assert(chr < 0x80000000); /* 1 << (5*6 + 1) */ |
5683
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
132 |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
133 if (chr < (1 << (6 + 5))) { |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
134 /* 110xxxxx */ |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
135 bitpos = 6; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
136 first = 0x80 | 0x40; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
137 } else if (chr < (1 << ((2*6) + 4))) { |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
138 /* 1110xxxx */ |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
139 bitpos = 2*6; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
140 first = 0x80 | 0x40 | 0x20; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
141 } else if (chr < (1 << ((3*6) + 3))) { |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
142 /* 11110xxx */ |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
143 bitpos = 3*6; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
144 first = 0x80 | 0x40 | 0x20 | 0x10; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
145 } else if (chr < (1 << ((4*6) + 2))) { |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
146 /* 111110xx */ |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
147 bitpos = 4*6; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
148 first = 0x80 | 0x40 | 0x20 | 0x10 | 0x08; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
149 } else { |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
150 /* 1111110x */ |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
151 bitpos = 5*6; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
152 first = 0x80 | 0x40 | 0x20 | 0x10 | 0x08 | 0x04; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
153 } |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
154 buffer_append_c(output, first | (chr >> bitpos)); |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
155 |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
156 do { |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
157 bitpos -= 6; |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
158 buffer_append_c(output, 0x80 | ((chr >> bitpos) & 0x3f)); |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
159 } while (bitpos > 0); |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
160 } |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
161 |
8101787cdd1c
Rewrote some code and cleaned up the API
Timo Sirainen <tss@iki.fi>
parents:
4899
diff
changeset
|
/* Count the UTF-8 characters in the first size bytes of _input, stopping
   early at a NUL byte. Character lengths are taken from the lead byte via
   uni_utf8_char_bytes(); continuation bytes aren't validated here. A final
   character whose length would reach past size is not counted. */
unsigned int uni_utf8_strlen_n(const void *_input, size_t size)
{
	const unsigned char *bytes = _input;
	unsigned int count = 0;
	size_t pos = 0;

	while (pos < size && bytes[pos] != '\0') {
		pos += uni_utf8_char_bytes(bytes[pos]);
		if (pos > size) {
			/* the last character is cut off */
			break;
		}
		count++;
	}
	return count;
}
6129
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
176 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
177 static bool uint16_find(const uint16_t *data, unsigned int count, |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
178 uint16_t value, unsigned int *idx_r) |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
179 { |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
180 BINARY_NUMBER_SEARCH(data, count, value, idx_r); |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
181 } |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
182 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
183 static bool uint32_find(const uint32_t *data, unsigned int count, |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
184 uint32_t value, unsigned int *idx_r) |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
185 { |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
186 BINARY_NUMBER_SEARCH(data, count, value, idx_r); |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
187 } |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
188 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
189 unichar_t uni_ucs4_to_titlecase(unichar_t chr) |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
190 { |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
191 unsigned int idx; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
192 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
193 if (chr <= 0xffff) { |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
194 if (!uint16_find(titlecase16_keys, N_ELEMENTS(titlecase16_keys), |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
195 chr, &idx)) |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
196 return chr; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
197 else |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
198 return titlecase16_values[idx]; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
199 } else { |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
200 if (!uint32_find(titlecase32_keys, N_ELEMENTS(titlecase32_keys), |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
201 chr, &idx)) |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
202 return chr; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
203 else |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
204 return titlecase32_values[idx]; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
205 } |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
206 } |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
207 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
208 static bool uni_ucs4_decompose_uni(unichar_t *chr) |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
209 { |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
210 unsigned int idx; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
211 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
212 if (*chr <= 0xffff) { |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
213 if (!uint16_find(uni16_decomp_keys, |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
214 N_ELEMENTS(uni16_decomp_keys), |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
215 *chr, &idx)) |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
216 return FALSE; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
217 *chr = uni16_decomp_values[idx]; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
218 } else { |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
219 if (!uint32_find(uni32_decomp_keys, |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
220 N_ELEMENTS(uni32_decomp_keys), |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
221 *chr, &idx)) |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
222 return FALSE; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
223 *chr = uni32_decomp_values[idx]; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
224 } |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
225 return TRUE; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
226 } |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
227 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
228 static void uni_ucs4_decompose_hangul_utf8(unichar_t chr, buffer_t *output) |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
229 { |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
230 #define SBase HANGUL_FIRST |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
231 #define LBase 0x1100 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
232 #define VBase 0x1161 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
233 #define TBase 0x11A7 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
234 #define LCount 19 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
235 #define VCount 21 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
236 #define TCount 28 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
237 #define NCount (VCount * TCount) |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
238 unsigned int SIndex = chr - SBase; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
239 unichar_t L = LBase + SIndex / NCount; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
240 unichar_t V = VBase + (SIndex % NCount) / TCount; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
241 unichar_t T = TBase + SIndex % TCount; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
242 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
243 uni_ucs4_to_utf8_c(L, output); |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
244 uni_ucs4_to_utf8_c(V, output); |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
245 if (T != TBase) uni_ucs4_to_utf8_c(T, output); |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
246 } |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
247 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
248 static bool uni_ucs4_decompose_multi_utf8(unichar_t chr, buffer_t *output) |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
249 { |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
250 const uint16_t *value; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
251 unsigned int idx; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
252 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
253 if (chr > 0xffff) |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
254 return FALSE; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
255 |
9405
9a8c565adbe1
New UnicodeMap.txt has >16bit multi-decomposition keys, support them.
Timo Sirainen <tss@iki.fi>
parents:
8590
diff
changeset
|
256 if (!uint32_find(multidecomp_keys, N_ELEMENTS(multidecomp_keys), |
6129
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
257 chr, &idx)) |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
258 return FALSE; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
259 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
260 value = &multidecomp_values[multidecomp_offsets[idx]]; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
261 for (; *value != 0; value++) |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
262 uni_ucs4_to_utf8_c(*value, output); |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
263 return TRUE; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
264 } |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
265 |
7185
6f014a866f38
Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents:
7157
diff
changeset
|
266 static void output_add_replacement_char(buffer_t *output) |
6f014a866f38
Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents:
7157
diff
changeset
|
267 { |
9603
5efba9f9f0a7
Added a global utf8_replacement_char variable.
Timo Sirainen <tss@iki.fi>
parents:
9602
diff
changeset
|
268 if (output->used >= UTF8_REPLACEMENT_CHAR_LEN && |
7185
6f014a866f38
Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents:
7157
diff
changeset
|
269 memcmp(CONST_PTR_OFFSET(output->data, |
9603
5efba9f9f0a7
Added a global utf8_replacement_char variable.
Timo Sirainen <tss@iki.fi>
parents:
9602
diff
changeset
|
270 output->used - UTF8_REPLACEMENT_CHAR_LEN), |
5efba9f9f0a7
Added a global utf8_replacement_char variable.
Timo Sirainen <tss@iki.fi>
parents:
9602
diff
changeset
|
271 utf8_replacement_char, UTF8_REPLACEMENT_CHAR_LEN) == 0) { |
7185
6f014a866f38
Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents:
7157
diff
changeset
|
272 /* don't add the replacement char multiple times */ |
6f014a866f38
Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents:
7157
diff
changeset
|
273 return; |
6f014a866f38
Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents:
7157
diff
changeset
|
274 } |
9603
5efba9f9f0a7
Added a global utf8_replacement_char variable.
Timo Sirainen <tss@iki.fi>
parents:
9602
diff
changeset
|
275 buffer_append(output, utf8_replacement_char, UTF8_REPLACEMENT_CHAR_LEN); |
7185
6f014a866f38
Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents:
7157
diff
changeset
|
276 } |
6f014a866f38
Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents:
7157
diff
changeset
|
277 |
6129
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
278 int uni_utf8_to_decomposed_titlecase(const void *_input, size_t max_len, |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
279 buffer_t *output) |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
280 { |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
281 const unsigned char *input = _input; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
282 unsigned int bytes; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
283 unichar_t chr; |
6953
edd296d164db
uni_utf8_to_decomposed_titlecase(): If we encounter invalid UTF-8 input,
Timo Sirainen <tss@iki.fi>
parents:
6952
diff
changeset
|
284 int ret = 0; |
6129
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
285 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
286 while (max_len > 0 && *input != '\0') { |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
287 if (uni_utf8_get_char_n(input, max_len, &chr) <= 0) { |
6953
edd296d164db
uni_utf8_to_decomposed_titlecase(): If we encounter invalid UTF-8 input,
Timo Sirainen <tss@iki.fi>
parents:
6952
diff
changeset
|
288 /* invalid input. try the next byte. */ |
edd296d164db
uni_utf8_to_decomposed_titlecase(): If we encounter invalid UTF-8 input,
Timo Sirainen <tss@iki.fi>
parents:
6952
diff
changeset
|
289 ret = -1; |
edd296d164db
uni_utf8_to_decomposed_titlecase(): If we encounter invalid UTF-8 input,
Timo Sirainen <tss@iki.fi>
parents:
6952
diff
changeset
|
290 input++; max_len--; |
7185
6f014a866f38
Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents:
7157
diff
changeset
|
291 output_add_replacement_char(output); |
6953
edd296d164db
uni_utf8_to_decomposed_titlecase(): If we encounter invalid UTF-8 input,
Timo Sirainen <tss@iki.fi>
parents:
6952
diff
changeset
|
292 continue; |
6129
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
293 } |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
294 bytes = uni_utf8_char_bytes(*input); |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
295 input += bytes; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
296 max_len -= bytes; |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
297 |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
298 chr = uni_ucs4_to_titlecase(chr); |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
299 if (chr >= HANGUL_FIRST && chr <= HANGUL_LAST) |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
300 uni_ucs4_decompose_hangul_utf8(chr, output); |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
301 else if (uni_ucs4_decompose_uni(&chr) || |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
302 !uni_ucs4_decompose_multi_utf8(chr, output)) |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
303 uni_ucs4_to_utf8_c(chr, output); |
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
304 } |
6953
edd296d164db
uni_utf8_to_decomposed_titlecase(): If we encounter invalid UTF-8 input,
Timo Sirainen <tss@iki.fi>
parents:
6952
diff
changeset
|
305 return ret; |
6129
04b9eb27283c
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
Timo Sirainen <tss@iki.fi>
parents:
5683
diff
changeset
|
306 } |
6951
1f70c72e4312
Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents:
6780
diff
changeset
|
307 |
1f70c72e4312
Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents:
6780
diff
changeset
|
/* Check if input begins with a structurally valid multi-byte UTF-8
   sequence. size is the number of bytes available in input and must be at
   least 1.

   Returns the length of the sequence in bytes, or 0 if the sequence is
   truncated, its first byte can't start a multi-byte sequence or one of its
   continuation bytes is outside 0x80..0xbf.

   size is size_t, because the callers pass the remaining size_t length of
   their buffer. With a narrower type the length could get truncated and a
   valid sequence in a very large buffer would be reported as invalid. */
static inline unsigned int
is_valid_utf8_seq(const unsigned char *input, size_t size)
{
	unsigned int i, len;

	len = uni_utf8_char_bytes(input[0]);
	/* callers handle bytes below 0x80 themselves, so a length of 1 here
	   means the byte isn't a valid start of a sequence */
	if (unlikely(len > size || len == 1))
		return 0;

	/* the rest of the chars should be in 0x80..0xbf range.
	   anything else is start of a sequence or invalid */
	for (i = 1; i < len; i++) {
		if (unlikely(input[i] < 0x80 || input[i] > 0xbf))
			return 0;
	}
	return len;
}
1f70c72e4312
Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents:
6780
diff
changeset
|
325 |
/* Scan size bytes of input for the first invalid UTF-8 sequence.

   Returns 0 if none was found; *pos_r is not written in that case.
   Returns -1 and stores the offset of the first invalid sequence to *pos_r
   otherwise. */
static int uni_utf8_find_invalid_pos(const unsigned char *input, size_t size,
				     size_t *pos_r)
{
	size_t pos = 0, seq_len;

	while (pos < size) {
		if (input[pos] < 0x80) {
			/* plain 7bit character */
			pos++;
			continue;
		}

		seq_len = is_valid_utf8_seq(input + pos, size - pos);
		if (unlikely(seq_len == 0)) {
			*pos_r = pos;
			return -1;
		}
		pos += seq_len;
	}
	return 0;
}
346 | |
347 bool uni_utf8_get_valid_data(const unsigned char *input, size_t size, | |
348 buffer_t *buf) | |
349 { | |
350 size_t i, len; | |
351 | |
352 if (uni_utf8_find_invalid_pos(input, size, &i) == 0) | |
353 return TRUE; | |
354 | |
6951
1f70c72e4312
Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents:
6780
diff
changeset
|
355 /* broken utf-8 input - skip the broken characters */ |
6952
08e4d7efcd6a
uni_utf8_get_valid_data() API changed.
Timo Sirainen <tss@iki.fi>
parents:
6951
diff
changeset
|
356 buffer_append(buf, input, i++); |
6951
1f70c72e4312
Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents:
6780
diff
changeset
|
357 |
7185
6f014a866f38
Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents:
7157
diff
changeset
|
358 output_add_replacement_char(buf); |
6951
1f70c72e4312
Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents:
6780
diff
changeset
|
359 while (i < size) { |
1f70c72e4312
Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents:
6780
diff
changeset
|
360 if (input[i] < 0x80) { |
6952
08e4d7efcd6a
uni_utf8_get_valid_data() API changed.
Timo Sirainen <tss@iki.fi>
parents:
6951
diff
changeset
|
361 buffer_append_c(buf, input[i++]); |
6951
1f70c72e4312
Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents:
6780
diff
changeset
|
362 continue; |
1f70c72e4312
Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents:
6780
diff
changeset
|
363 } |
1f70c72e4312
Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents:
6780
diff
changeset
|
364 |
1f70c72e4312
Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents:
6780
diff
changeset
|
365 len = is_valid_utf8_seq(input + i, size-i); |
1f70c72e4312
Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents:
6780
diff
changeset
|
366 if (len == 0) { |
1f70c72e4312
Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents:
6780
diff
changeset
|
367 i++; |
7185
6f014a866f38
Replace invalid UTF8 input with a replacement character.
Timo Sirainen <tss@iki.fi>
parents:
7157
diff
changeset
|
368 output_add_replacement_char(buf); |
6951
1f70c72e4312
Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents:
6780
diff
changeset
|
369 continue; |
1f70c72e4312
Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents:
6780
diff
changeset
|
370 } |
6952
08e4d7efcd6a
uni_utf8_get_valid_data() API changed.
Timo Sirainen <tss@iki.fi>
parents:
6951
diff
changeset
|
371 buffer_append(buf, input + i, len); |
6951
1f70c72e4312
Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents:
6780
diff
changeset
|
372 i += len; |
1f70c72e4312
Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents:
6780
diff
changeset
|
373 } |
6952
08e4d7efcd6a
uni_utf8_get_valid_data() API changed.
Timo Sirainen <tss@iki.fi>
parents:
6951
diff
changeset
|
374 return FALSE; |
6951
1f70c72e4312
Moved uni_utf8_get_valid_data() to lib/
Timo Sirainen <tss@iki.fi>
parents:
6780
diff
changeset
|
375 } |
9476 | 376 |
377 bool uni_utf8_str_is_valid(const char *str) | |
378 { | |
379 size_t i; | |
380 | |
381 return uni_utf8_find_invalid_pos((const unsigned char *)str, | |
382 strlen(str), &i) == 0; | |
383 } |