annotate src/lib-charset/charset-iconv.c @ 9586:97b702abd132 HEAD

lib-charset: Don't assert-crash when iconv() skips lots of invalid input.
author Timo Sirainen <tss@iki.fi>
date Mon, 21 Jun 2010 21:17:58 +0100
parents 00cd9aacd03c
children 91c605339e45
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
9532
00cd9aacd03c Updated copyright notices to include year 2010.
Timo Sirainen <tss@iki.fi>
parents: 8871
diff changeset
1 /* Copyright (c) 2002-2010 Dovecot authors, see the included COPYING file */
568
f2aa58c2afd0 SEARCH CHARSET support. Currently we do it through iconv() and only ASCII
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
2
f2aa58c2afd0 SEARCH CHARSET support. Currently we do it through iconv() and only ASCII
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
3 #include "lib.h"
765
553f050c8313 Added buffer API. Point is to hide all buffer writing behind this API which
Timo Sirainen <tss@iki.fi>
parents: 753
diff changeset
4 #include "buffer.h"
6131
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
5 #include "unichar.h"
579
e524da896d92 Several minor fixes: signess, casting away const, missing static, etc.
Timo Sirainen <tss@iki.fi>
parents: 568
diff changeset
6 #include "charset-utf8.h"
568
f2aa58c2afd0 SEARCH CHARSET support. Currently we do it through iconv() and only ASCII
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
7
1300
952bf533c2ea Better iconv() checking.
Timo Sirainen <tss@iki.fi>
parents: 903
diff changeset
8 #ifdef HAVE_ICONV
568
f2aa58c2afd0 SEARCH CHARSET support. Currently we do it through iconv() and only ASCII
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
9
f2aa58c2afd0 SEARCH CHARSET support. Currently we do it through iconv() and only ASCII
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
10 #include <iconv.h>
608
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
11 #include <ctype.h>
568
f2aa58c2afd0 SEARCH CHARSET support. Currently we do it through iconv() and only ASCII
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
12
903
fd8888f6f037 Naming style changes, finally got tired of most of the typedefs. Also the
Timo Sirainen <tss@iki.fi>
parents: 898
diff changeset
13 struct charset_translation {
608
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
14 iconv_t cd;
6132
d01522d276f6 charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents: 6131
diff changeset
15 enum charset_flags flags;
608
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
16 };
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
17
6132
d01522d276f6 charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents: 6131
diff changeset
18 int charset_to_utf8_begin(const char *charset, enum charset_flags flags,
d01522d276f6 charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents: 6131
diff changeset
19 struct charset_translation **t_r)
608
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
20 {
903
fd8888f6f037 Naming style changes, finally got tired of most of the typedefs. Also the
Timo Sirainen <tss@iki.fi>
parents: 898
diff changeset
21 struct charset_translation *t;
608
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
22 iconv_t cd;
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
23
5502
212bbdc55065 Cleanup
Timo Sirainen <tss@iki.fi>
parents: 4605
diff changeset
24 if (charset_is_utf8(charset))
1991
689f791b480f iconv_t isn't necessarily pointer.
Timo Sirainen <tss@iki.fi>
parents: 1471
diff changeset
25 cd = (iconv_t)-1;
5502
212bbdc55065 Cleanup
Timo Sirainen <tss@iki.fi>
parents: 4605
diff changeset
26 else {
609
5470c0cb13a7 We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents: 608
diff changeset
27 cd = iconv_open("UTF-8", charset);
6132
d01522d276f6 charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents: 6131
diff changeset
28 if (cd == (iconv_t)-1)
d01522d276f6 charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents: 6131
diff changeset
29 return -1;
608
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
30 }
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
31
903
fd8888f6f037 Naming style changes, finally got tired of most of the typedefs. Also the
Timo Sirainen <tss@iki.fi>
parents: 898
diff changeset
32 t = i_new(struct charset_translation, 1);
608
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
33 t->cd = cd;
6132
d01522d276f6 charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents: 6131
diff changeset
34 t->flags = flags;
d01522d276f6 charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents: 6131
diff changeset
35 *t_r = t;
d01522d276f6 charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents: 6131
diff changeset
36 return 0;
608
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
37 }
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
38
3879
928229f8b3e6 deinit, unref, destroy, close, free, etc. functions now take a pointer to
Timo Sirainen <tss@iki.fi>
parents: 3863
diff changeset
39 void charset_to_utf8_end(struct charset_translation **_t)
608
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
40 {
3879
928229f8b3e6 deinit, unref, destroy, close, free, etc. functions now take a pointer to
Timo Sirainen <tss@iki.fi>
parents: 3863
diff changeset
41 struct charset_translation *t = *_t;
928229f8b3e6 deinit, unref, destroy, close, free, etc. functions now take a pointer to
Timo Sirainen <tss@iki.fi>
parents: 3863
diff changeset
42
928229f8b3e6 deinit, unref, destroy, close, free, etc. functions now take a pointer to
Timo Sirainen <tss@iki.fi>
parents: 3863
diff changeset
43 *_t = NULL;
928229f8b3e6 deinit, unref, destroy, close, free, etc. functions now take a pointer to
Timo Sirainen <tss@iki.fi>
parents: 3863
diff changeset
44
1991
689f791b480f iconv_t isn't necessarily pointer.
Timo Sirainen <tss@iki.fi>
parents: 1471
diff changeset
45 if (t->cd != (iconv_t)-1)
608
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
46 iconv_close(t->cd);
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
47 i_free(t);
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
48 }
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
49
903
fd8888f6f037 Naming style changes, finally got tired of most of the typedefs. Also the
Timo Sirainen <tss@iki.fi>
parents: 898
diff changeset
50 void charset_to_utf8_reset(struct charset_translation *t)
608
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
51 {
1991
689f791b480f iconv_t isn't necessarily pointer.
Timo Sirainen <tss@iki.fi>
parents: 1471
diff changeset
52 if (t->cd != (iconv_t)-1)
608
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
53 (void)iconv(t->cd, NULL, NULL, NULL, NULL);
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
54 }
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
55
6125
b9c1336fd4e4 Removed CHARSET_RET_OUTPUT_FULL, it can't happen anymore.
Timo Sirainen <tss@iki.fi>
parents: 6122
diff changeset
56 static bool
6122
d86581f4a0c6 charset_to_utf8() isn't used anymore, so renamed charset_to_utf8_full() to it.
Timo Sirainen <tss@iki.fi>
parents: 6114
diff changeset
57 charset_to_utf8_try(struct charset_translation *t,
6125
b9c1336fd4e4 Removed CHARSET_RET_OUTPUT_FULL, it can't happen anymore.
Timo Sirainen <tss@iki.fi>
parents: 6122
diff changeset
58 const unsigned char *src, size_t *src_size, buffer_t *dest,
b9c1336fd4e4 Removed CHARSET_RET_OUTPUT_FULL, it can't happen anymore.
Timo Sirainen <tss@iki.fi>
parents: 6122
diff changeset
59 enum charset_result *result)
765
553f050c8313 Added buffer API. Point is to hide all buffer writing behind this API which
Timo Sirainen <tss@iki.fi>
parents: 753
diff changeset
60 {
553f050c8313 Added buffer API. Point is to hide all buffer writing behind this API which
Timo Sirainen <tss@iki.fi>
parents: 753
diff changeset
61 ICONV_CONST char *ic_srcbuf;
6131
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
62 char tmpbuf[8192], *ic_destbuf;
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
63 size_t srcleft, destleft;
6132
d01522d276f6 charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents: 6131
diff changeset
64 bool dtcase = (t->flags & CHARSET_FLAG_DECOMP_TITLECASE) != 0;
6125
b9c1336fd4e4 Removed CHARSET_RET_OUTPUT_FULL, it can't happen anymore.
Timo Sirainen <tss@iki.fi>
parents: 6122
diff changeset
65 bool ret = TRUE;
765
553f050c8313 Added buffer API. Point is to hide all buffer writing behind this API which
Timo Sirainen <tss@iki.fi>
parents: 753
diff changeset
66
1991
689f791b480f iconv_t isn't necessarily pointer.
Timo Sirainen <tss@iki.fi>
parents: 1471
diff changeset
67 if (t->cd == (iconv_t)-1) {
609
5470c0cb13a7 We can support UTF-8 charset too without any translations.
Timo Sirainen <tss@iki.fi>
parents: 608
diff changeset
68 /* no translation needed - just copy it to outbuf uppercased */
6131
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
69 *result = CHARSET_RET_OK;
6132
d01522d276f6 charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents: 6131
diff changeset
70 if (!dtcase) {
6122
d86581f4a0c6 charset_to_utf8() isn't used anymore, so renamed charset_to_utf8_full() to it.
Timo Sirainen <tss@iki.fi>
parents: 6114
diff changeset
71 buffer_append(dest, src, *src_size);
6131
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
72 return TRUE;
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
73 }
6125
b9c1336fd4e4 Removed CHARSET_RET_OUTPUT_FULL, it can't happen anymore.
Timo Sirainen <tss@iki.fi>
parents: 6122
diff changeset
74
6131
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
75 if (uni_utf8_to_decomposed_titlecase(src, *src_size, dest) < 0)
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
76 *result = CHARSET_RET_INVALID_INPUT;
6125
b9c1336fd4e4 Removed CHARSET_RET_OUTPUT_FULL, it can't happen anymore.
Timo Sirainen <tss@iki.fi>
parents: 6122
diff changeset
77 return TRUE;
608
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
78 }
6132
d01522d276f6 charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents: 6131
diff changeset
79 if (!dtcase) {
6131
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
80 destleft = buffer_get_size(dest) - dest->used;
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
81 if (destleft < *src_size) {
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
82 /* The buffer is most likely too small to hold the
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
83 output, so increase it at least to the input size. */
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
84 destleft = *src_size;
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
85 }
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
86 ic_destbuf = buffer_append_space_unsafe(dest, destleft);
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
87 } else {
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
88 destleft = sizeof(tmpbuf);
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
89 ic_destbuf = tmpbuf;
6122
d86581f4a0c6 charset_to_utf8() isn't used anymore, so renamed charset_to_utf8_full() to it.
Timo Sirainen <tss@iki.fi>
parents: 6114
diff changeset
90 }
608
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
91
785
d96cbba73a8b Don't use Buffers with read-only data, just makes it more difficult without
Timo Sirainen <tss@iki.fi>
parents: 766
diff changeset
92 srcleft = *src_size;
d96cbba73a8b Don't use Buffers with read-only data, just makes it more difficult without
Timo Sirainen <tss@iki.fi>
parents: 766
diff changeset
93 ic_srcbuf = (ICONV_CONST char *) src;
765
553f050c8313 Added buffer API. Point is to hide all buffer writing behind this API which
Timo Sirainen <tss@iki.fi>
parents: 753
diff changeset
94
553f050c8313 Added buffer API. Point is to hide all buffer writing behind this API which
Timo Sirainen <tss@iki.fi>
parents: 753
diff changeset
95 if (iconv(t->cd, &ic_srcbuf, &srcleft,
553f050c8313 Added buffer API. Point is to hide all buffer writing behind this API which
Timo Sirainen <tss@iki.fi>
parents: 753
diff changeset
96 &ic_destbuf, &destleft) != (size_t)-1)
6125
b9c1336fd4e4 Removed CHARSET_RET_OUTPUT_FULL, it can't happen anymore.
Timo Sirainen <tss@iki.fi>
parents: 6122
diff changeset
97 *result = CHARSET_RET_OK;
b9c1336fd4e4 Removed CHARSET_RET_OUTPUT_FULL, it can't happen anymore.
Timo Sirainen <tss@iki.fi>
parents: 6122
diff changeset
98 else if (errno == E2BIG) {
b9c1336fd4e4 Removed CHARSET_RET_OUTPUT_FULL, it can't happen anymore.
Timo Sirainen <tss@iki.fi>
parents: 6122
diff changeset
99 /* set result just to avoid compiler warning */
b9c1336fd4e4 Removed CHARSET_RET_OUTPUT_FULL, it can't happen anymore.
Timo Sirainen <tss@iki.fi>
parents: 6122
diff changeset
100 *result = CHARSET_RET_INCOMPLETE_INPUT;
b9c1336fd4e4 Removed CHARSET_RET_OUTPUT_FULL, it can't happen anymore.
Timo Sirainen <tss@iki.fi>
parents: 6122
diff changeset
101 ret = FALSE;
b9c1336fd4e4 Removed CHARSET_RET_OUTPUT_FULL, it can't happen anymore.
Timo Sirainen <tss@iki.fi>
parents: 6122
diff changeset
102 } else if (errno == EINVAL)
b9c1336fd4e4 Removed CHARSET_RET_OUTPUT_FULL, it can't happen anymore.
Timo Sirainen <tss@iki.fi>
parents: 6122
diff changeset
103 *result = CHARSET_RET_INCOMPLETE_INPUT;
765
553f050c8313 Added buffer API. Point is to hide all buffer writing behind this API which
Timo Sirainen <tss@iki.fi>
parents: 753
diff changeset
104 else {
553f050c8313 Added buffer API. Point is to hide all buffer writing behind this API which
Timo Sirainen <tss@iki.fi>
parents: 753
diff changeset
105 /* should be EILSEQ */
6125
b9c1336fd4e4 Removed CHARSET_RET_OUTPUT_FULL, it can't happen anymore.
Timo Sirainen <tss@iki.fi>
parents: 6122
diff changeset
106 *result = CHARSET_RET_INVALID_INPUT;
7240
fcfe2ea5c3ed Crashfix for converting invalid input to UTF-8.
Timo Sirainen <tss@iki.fi>
parents: 7186
diff changeset
107 if (!dtcase)
fcfe2ea5c3ed Crashfix for converting invalid input to UTF-8.
Timo Sirainen <tss@iki.fi>
parents: 7186
diff changeset
108 buffer_set_used_size(dest, dest->used - destleft);
7186
d48c419a27ca Fixed handling invalid charset input. Replace it with replacement character.
Timo Sirainen <tss@iki.fi>
parents: 7086
diff changeset
109 uni_ucs4_to_utf8_c(UNICODE_REPLACEMENT_CHAR, dest);
6125
b9c1336fd4e4 Removed CHARSET_RET_OUTPUT_FULL, it can't happen anymore.
Timo Sirainen <tss@iki.fi>
parents: 6122
diff changeset
110 return TRUE;
765
553f050c8313 Added buffer API. Point is to hide all buffer writing behind this API which
Timo Sirainen <tss@iki.fi>
parents: 753
diff changeset
111 }
6131
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
112 *src_size -= srcleft;
608
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
113
6132
d01522d276f6 charset_to_utf8_begin() API change.
Timo Sirainen <tss@iki.fi>
parents: 6131
diff changeset
114 if (!dtcase) {
6131
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
115 /* give back the memory we didn't use */
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
116 buffer_set_used_size(dest, dest->used - destleft);
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
117 } else {
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
118 size_t tmpsize = sizeof(tmpbuf) - destleft;
5f56b2eb32b3 Use uni_utf8_to_decomposed_titlecase() to have proper case-insensitive UTF-8
Timo Sirainen <tss@iki.fi>
parents: 6126
diff changeset
119
8842
96102e6cd333 Charset conversion: Don't panic if iconv() doesn't produce valid UTF-8 output.
Timo Sirainen <tss@iki.fi>
parents: 8590
diff changeset
120 /* we just converted data to UTF-8. it shouldn't be invalid,
96102e6cd333 Charset conversion: Don't panic if iconv() doesn't produce valid UTF-8 output.
Timo Sirainen <tss@iki.fi>
parents: 8590
diff changeset
121 but Solaris iconv appears to pass invalid data through
96102e6cd333 Charset conversion: Don't panic if iconv() doesn't produce valid UTF-8 output.
Timo Sirainen <tss@iki.fi>
parents: 8590
diff changeset
122 sometimes (e.g. 8 bit characters with UTF-7) */
96102e6cd333 Charset conversion: Don't panic if iconv() doesn't produce valid UTF-8 output.
Timo Sirainen <tss@iki.fi>
parents: 8590
diff changeset
123 (void)uni_utf8_to_decomposed_titlecase(tmpbuf, tmpsize, dest);
6112
e5451501ff2f charset_to_utf8_begin() now takes bool ucase parameter. Changed
Timo Sirainen <tss@iki.fi>
parents: 5502
diff changeset
124 }
765
553f050c8313 Added buffer API. Point is to hide all buffer writing behind this API which
Timo Sirainen <tss@iki.fi>
parents: 753
diff changeset
125 return ret;
553f050c8313 Added buffer API. Point is to hide all buffer writing behind this API which
Timo Sirainen <tss@iki.fi>
parents: 753
diff changeset
126 }
608
debb8468514e SEARCH CHARSET now works properly with message bodies, and in general body
Timo Sirainen <tss@iki.fi>
parents: 579
diff changeset
127
4605
e6cb9f75b76a Added charset_is_utf8() and charset_to_ucase_utf8_full().
Timo Sirainen <tss@iki.fi>
parents: 3879
diff changeset
128 enum charset_result
6122
d86581f4a0c6 charset_to_utf8() isn't used anymore, so renamed charset_to_utf8_full() to it.
Timo Sirainen <tss@iki.fi>
parents: 6114
diff changeset
129 charset_to_utf8(struct charset_translation *t,
d86581f4a0c6 charset_to_utf8() isn't used anymore, so renamed charset_to_utf8_full() to it.
Timo Sirainen <tss@iki.fi>
parents: 6114
diff changeset
130 const unsigned char *src, size_t *src_size, buffer_t *dest)
4605
e6cb9f75b76a Added charset_is_utf8() and charset_to_ucase_utf8_full().
Timo Sirainen <tss@iki.fi>
parents: 3879
diff changeset
131 {
8871
7c21256e3598 charset_to_utf8() may have tried to allocate a lot of memory in some conditions.
Timo Sirainen <tss@iki.fi>
parents: 8842
diff changeset
132 bool dtcase = (t->flags & CHARSET_FLAG_DECOMP_TITLECASE) != 0;
6125
b9c1336fd4e4 Removed CHARSET_RET_OUTPUT_FULL, it can't happen anymore.
Timo Sirainen <tss@iki.fi>
parents: 6122
diff changeset
133 enum charset_result result;
9586
97b702abd132 lib-charset: Don't assert-crash when iconv() skips lots of invalid input.
Timo Sirainen <tss@iki.fi>
parents: 9532
diff changeset
134 size_t pos, used, size, prev_pos = 0, prev_used = 0;
6125
b9c1336fd4e4 Removed CHARSET_RET_OUTPUT_FULL, it can't happen anymore.
Timo Sirainen <tss@iki.fi>
parents: 6122
diff changeset
135 bool ret;
4605
e6cb9f75b76a Added charset_is_utf8() and charset_to_ucase_utf8_full().
Timo Sirainen <tss@iki.fi>
parents: 3879
diff changeset
136
e6cb9f75b76a Added charset_is_utf8() and charset_to_ucase_utf8_full().
Timo Sirainen <tss@iki.fi>
parents: 3879
diff changeset
137 for (pos = 0;;) {
e6cb9f75b76a Added charset_is_utf8() and charset_to_ucase_utf8_full().
Timo Sirainen <tss@iki.fi>
parents: 3879
diff changeset
138 size = *src_size - pos;
6125
b9c1336fd4e4 Removed CHARSET_RET_OUTPUT_FULL, it can't happen anymore.
Timo Sirainen <tss@iki.fi>
parents: 6122
diff changeset
139 ret = charset_to_utf8_try(t, src + pos, &size, dest, &result);
4605
e6cb9f75b76a Added charset_is_utf8() and charset_to_ucase_utf8_full().
Timo Sirainen <tss@iki.fi>
parents: 3879
diff changeset
140 pos += size;
e6cb9f75b76a Added charset_is_utf8() and charset_to_ucase_utf8_full().
Timo Sirainen <tss@iki.fi>
parents: 3879
diff changeset
141
6125
b9c1336fd4e4 Removed CHARSET_RET_OUTPUT_FULL, it can't happen anymore.
Timo Sirainen <tss@iki.fi>
parents: 6122
diff changeset
142 if (ret) {
4605
e6cb9f75b76a Added charset_is_utf8() and charset_to_ucase_utf8_full().
Timo Sirainen <tss@iki.fi>
parents: 3879
diff changeset
143 *src_size = pos;
6125
b9c1336fd4e4 Removed CHARSET_RET_OUTPUT_FULL, it can't happen anymore.
Timo Sirainen <tss@iki.fi>
parents: 6122
diff changeset
144 return result;
4605
e6cb9f75b76a Added charset_is_utf8() and charset_to_ucase_utf8_full().
Timo Sirainen <tss@iki.fi>
parents: 3879
diff changeset
145 }
e6cb9f75b76a Added charset_is_utf8() and charset_to_ucase_utf8_full().
Timo Sirainen <tss@iki.fi>
parents: 3879
diff changeset
146
8871
7c21256e3598 charset_to_utf8() may have tried to allocate a lot of memory in some conditions.
Timo Sirainen <tss@iki.fi>
parents: 8842
diff changeset
147 if (!dtcase) {
7c21256e3598 charset_to_utf8() may have tried to allocate a lot of memory in some conditions.
Timo Sirainen <tss@iki.fi>
parents: 8842
diff changeset
148 /* force buffer to grow */
7c21256e3598 charset_to_utf8() may have tried to allocate a lot of memory in some conditions.
Timo Sirainen <tss@iki.fi>
parents: 8842
diff changeset
149 used = dest->used;
7c21256e3598 charset_to_utf8() may have tried to allocate a lot of memory in some conditions.
Timo Sirainen <tss@iki.fi>
parents: 8842
diff changeset
150 size = buffer_get_size(dest) - used + 1;
7c21256e3598 charset_to_utf8() may have tried to allocate a lot of memory in some conditions.
Timo Sirainen <tss@iki.fi>
parents: 8842
diff changeset
151 (void)buffer_append_space_unsafe(dest, size);
7c21256e3598 charset_to_utf8() may have tried to allocate a lot of memory in some conditions.
Timo Sirainen <tss@iki.fi>
parents: 8842
diff changeset
152 buffer_set_used_size(dest, used);
7c21256e3598 charset_to_utf8() may have tried to allocate a lot of memory in some conditions.
Timo Sirainen <tss@iki.fi>
parents: 8842
diff changeset
153 } else {
9586
97b702abd132 lib-charset: Don't assert-crash when iconv() skips lots of invalid input.
Timo Sirainen <tss@iki.fi>
parents: 9532
diff changeset
154 i_assert(dest->used != prev_used || pos != prev_pos);
97b702abd132 lib-charset: Don't assert-crash when iconv() skips lots of invalid input.
Timo Sirainen <tss@iki.fi>
parents: 9532
diff changeset
155 prev_pos = pos;
8871
7c21256e3598 charset_to_utf8() may have tried to allocate a lot of memory in some conditions.
Timo Sirainen <tss@iki.fi>
parents: 8842
diff changeset
156 prev_used = dest->used;
7c21256e3598 charset_to_utf8() may have tried to allocate a lot of memory in some conditions.
Timo Sirainen <tss@iki.fi>
parents: 8842
diff changeset
157 }
4605
e6cb9f75b76a Added charset_is_utf8() and charset_to_ucase_utf8_full().
Timo Sirainen <tss@iki.fi>
parents: 3879
diff changeset
158 }
e6cb9f75b76a Added charset_is_utf8() and charset_to_ucase_utf8_full().
Timo Sirainen <tss@iki.fi>
parents: 3879
diff changeset
159 }
e6cb9f75b76a Added charset_is_utf8() and charset_to_ucase_utf8_full().
Timo Sirainen <tss@iki.fi>
parents: 3879
diff changeset
160
568
f2aa58c2afd0 SEARCH CHARSET support. Currently we do it through iconv() and only ASCII
Timo Sirainen <tss@iki.fi>
parents:
diff changeset
161 #endif