Mercurial > dovecot > original-hg > dovecot-1.2
changeset 5683:8101787cdd1c HEAD
Rewrote some code and cleaned up the API
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Mon, 11 Jun 2007 04:37:29 +0300 |
parents | ff5ba9cb6cd0 |
children | d7302155b97f |
files | src/lib/unichar.c src/lib/unichar.h |
diffstat | 2 files changed, 122 insertions(+), 189 deletions(-) [+] |
line wrap: on
line diff
--- a/src/lib/unichar.c Mon Jun 11 02:27:55 2007 +0300 +++ b/src/lib/unichar.c Mon Jun 11 04:37:29 2007 +0300 @@ -1,49 +1,15 @@ -/* Copyright (C) 2005 Timo Sirainen */ - -/* Contains code from GLIB: - * - * Copyright (C) 1999 Tom Tromey - * Copyright (C) 2000 Red Hat, Inc. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - */ +/* Copyright (C) 2005-2007 Timo Sirainen */ #include "lib.h" #include "buffer.h" #include "unichar.h" -#define UTF8_LENGTH(Char) \ - ((Char) < 0x80 ? 1 : \ - ((Char) < 0x800 ? 2 : \ - ((Char) < 0x10000 ? 3 : \ - ((Char) < 0x200000 ? 4 : \ - ((Char) < 0x4000000 ? 5 : 6))))) - -static const char utf8_skip_data[256] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1 +static const uint8_t utf8_non1_bytes[256 - 192 - 2] = { + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1 }; -const char *const uni_utf8_skip = utf8_skip_data; +const uint8_t *const uni_utf8_non1_bytes = utf8_non1_bytes; unsigned int uni_strlen(const unichar_t *str) { @@ -54,146 +20,71 @@ return len; } -unichar_t uni_utf8_get_char(const char *input) +int uni_utf8_get_char(const char *input, unichar_t *chr_r) { - return uni_utf8_get_char_len((const unsigned char *)input, (size_t)-1); + return uni_utf8_get_char_n((const unsigned char *)input, (size_t)-1, + chr_r); } -unichar_t uni_utf8_get_char_len(const unsigned char *input, size_t max_len) +int uni_utf8_get_char_n(const void *_input, size_t max_len, unichar_t *chr_r) { - unsigned int i, len; - unichar_t wc = *input; - - i_assert(max_len > 0); + const unsigned char *input = _input; + unichar_t chr; + unsigned int i, len; + int ret; - if (wc < 0x80) - { - return wc; - } - else if (wc < 0xc0) - { - return (unichar_t)-1; - } - else if (wc < 0xe0) - { - len = 2; - wc &= 0x1f; - } - else if (wc < 0xf0) - { - len = 3; - wc &= 0x0f; - } - else if (wc < 0xf8) - { - len = 4; - wc &= 0x07; - } - else if (wc < 0xfc) - { - len = 5; - wc &= 0x03; - } - else if (wc < 0xfe) - { - len = 6; - wc &= 0x01; - } - else - { - return (unichar_t)-1; - } + i_assert(max_len > 0); - if (max_len != (size_t)-1 && len > max_len) - { - for (i = 1; i < max_len; i++) - { - if ((input[i] & 0xc0) != 0x80) - return (unichar_t)-1; - } - return (unichar_t)-2; - } - - for (i = 1; i < len; ++i) - { - if ((input[i] & 0xc0) != 0x80) - { - if (input[i] != '\0') - return (unichar_t)-1; - else - return (unichar_t)-2; + if (*input < 0x80) { + *chr_r = *input; + return 1; } - wc <<= 6; - wc |= (input[i] & 0x3f); - } - - if (UTF8_LENGTH(wc) != len) - return (unichar_t)-1; - - return wc; -} - -/** - * g_unichar_to_utf8: - * @c: a ISO10646 character code - * @outbuf: output buffer, must have at least 6 bytes of space. - * If %NULL, the length will be computed and returned - * and nothing will be written to @outbuf. - * - * Converts a single character to UTF-8. - * - * Return value: number of bytes written - **/ -static int -g_unichar_to_utf8(unichar_t c, char *outbuf) -{ - unsigned int len = 0; - int first; - int i; + /* first byte has len highest bits set, followed by zero bit. + the rest of the bits are used as the highest bits of the value. */ + chr = *input; + len = uni_utf8_char_bytes(*input); + switch (len) { + case 2: + chr &= 0x1f; + break; + case 3: + chr &= 0x0f; + break; + case 4: + chr &= 0x07; + break; + case 5: + chr &= 0x03; + break; + case 6: + chr &= 0x01; + break; + default: + /* only 7bit chars should have len==1 */ + i_assert(len == 1); + return -1; + } - if (c < 0x80) - { - first = 0; - len = 1; - } - else if (c < 0x800) - { - first = 0xc0; - len = 2; - } - else if (c < 0x10000) - { - first = 0xe0; - len = 3; - } - else if (c < 0x200000) - { - first = 0xf0; - len = 4; - } - else if (c < 0x4000000) - { - first = 0xf8; - len = 5; - } - else - { - first = 0xfc; - len = 6; - } + if (len <= max_len) + ret = 1; + else { + /* check first if the input is invalid before returning 0 */ + ret = 0; + len = max_len; + } - if (outbuf) - { - for (i = len - 1; i > 0; --i) - { - outbuf[i] = (c & 0x3f) | 0x80; - c >>= 6; + /* the following bytes must all be 10xxxxxx */ + for (i = 1; i < len; i++) { + if ((input[i] & 0xc0) != 0x80) + return input[i] == '\0' ? 0 : -1; + + chr <<= 6; + chr |= input[i] & 0x3f; } - outbuf[0] = c | first; - } - return len; + *chr_r = chr; + return ret; } int uni_utf8_to_ucs4(const char *input, buffer_t *output) @@ -201,12 +92,11 @@ unichar_t chr; while (*input != '\0') { - chr = uni_utf8_get_char(input); - if (chr & 0x80000000) { + if (uni_utf8_get_char(input, &chr) <= 0) { /* invalid input */ return -1; } - input = uni_utf8_next_char(input); + input += uni_utf8_char_bytes(*input); buffer_append(output, &chr, sizeof(chr)); } @@ -215,24 +105,59 @@ void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output) { - void *buf; - int char_len; - - for (; *input != '\0' && len > 0; input++, len--) { - buf = buffer_append_space_unsafe(output, 6); - char_len = g_unichar_to_utf8(*input, buf); - buffer_set_used_size(output, output->used - 6 + char_len); - } + for (; *input != '\0' && len > 0; input++, len--) + uni_ucs4_to_utf8_c(*input, output); } -unsigned int uni_utf8_strlen_n(const void *input, size_t size) +void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output) { - const uint8_t *data = (const uint8_t *)input; + unsigned char first; + int bitpos; + + if (chr < 0x80) { + buffer_append_c(output, chr); + return; + } + + i_assert(chr <= 0x40000000); /* 1 << (5 * 6) */ + + if (chr < (1 << (6 + 5))) { + /* 110xxxxx */ + bitpos = 6; + first = 0x80 | 0x40; + } else if (chr < (1 << ((2*6) + 4))) { + /* 1110xxxx */ + bitpos = 2*6; + first = 0x80 | 0x40 | 0x20; + } else if (chr < (1 << ((3*6) + 3))) { + /* 11110xxx */ + bitpos = 3*6; + first = 0x80 | 0x40 | 0x20 | 0x10; + } else if (chr < (1 << ((4*6) + 2))) { + /* 111110xx */ + bitpos = 4*6; + first = 0x80 | 0x40 | 0x20 | 0x10 | 0x08; + } else { + /* 1111110x */ + bitpos = 5*6; + first = 0x80 | 0x40 | 0x20 | 0x10 | 0x08 | 0x04; + } + buffer_append_c(output, first | (chr >> bitpos)); + + do { + bitpos -= 6; + buffer_append_c(output, 0x80 | ((chr >> bitpos) & 0x3f)); + } while (bitpos > 0); +} + +unsigned int uni_utf8_strlen_n(const void *_input, size_t size) +{ + const unsigned char *input = _input; unsigned int len = 0; size_t i; - for (i = 0; i < size && data[i] != '\0'; ) { - i += uni_utf8_skip[data[i]]; + for (i = 0; i < size && input[i] != '\0'; ) { + i += uni_utf8_char_bytes(input[i]); if (i > size) break; len++;
--- a/src/lib/unichar.h Mon Jun 11 02:27:55 2007 +0300 +++ b/src/lib/unichar.h Mon Jun 11 04:37:29 2007 +0300 @@ -3,7 +3,7 @@ typedef uint32_t unichar_t; -extern const char *const uni_utf8_skip; +extern const uint8_t *const uni_utf8_non1_bytes; /* Returns number of characters in a NUL-terminated unicode string */ unsigned int uni_strlen(const unichar_t *str); @@ -12,15 +12,23 @@ int uni_utf8_to_ucs4(const char *input, buffer_t *output); /* Translates UCS-4 input to UTF-8 output. */ void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output); +void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output); -/* Returns the next UTF-8 character, or (unichar_t)-1 for invalid input and - (unichar_t)-2 for incomplete trailing character. */ -unichar_t uni_utf8_get_char(const char *input); -unichar_t uni_utf8_get_char_len(const unsigned char *input, size_t max_len); +/* Returns 1 if *chr_r is set, 0 for incomplete trailing character, + -1 for invalid input. */ +int uni_utf8_get_char(const char *input, unichar_t *chr_r); +int uni_utf8_get_char_n(const void *input, size_t max_len, unichar_t *chr_r); /* Returns UTF-8 string length with maximum input size. */ unsigned int uni_utf8_strlen_n(const void *input, size_t size); -#define uni_utf8_next_char(p) \ - ((p) + uni_utf8_skip[*(const uint8_t *)(p)]) +/* Returns the number of bytes belonging to this partial UTF-8 character. + Invalid input is returned with length 1. */ +static inline unsigned int uni_utf8_char_bytes(char chr) +{ + /* 0x00 .. 0x7f are ASCII. 0x80 .. 0xC1 are invalid. */ + if ((uint8_t)chr < (192 + 2)) + return 1; + return uni_utf8_non1_bytes[(uint8_t)chr - (192 + 2)]; +} #endif