Mercurial > illumos > illumos-gate
changeset 13264:1a29e6379e8a
616 euc code in libc not aligned with localedef
Reviewed by: roland.mainz@nexenta.com
Reviewed by: albert.lee@nexenta.com
Approved by: albert.lee@nexenta.com
author | Garrett D'Amore <garrett@nexenta.com> |
---|---|
date | Thu, 13 Jan 2011 08:38:20 -0800 |
parents | cac385f011a3 |
children | ff6d445369ca |
files | usr/src/lib/libc/port/locale/euc.c usr/src/lib/libc/port/locale/mblocal.h usr/src/lib/libc/port/locale/setrunelocale.c |
diffstat | 3 files changed, 240 insertions(+), 145 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/lib/libc/port/locale/euc.c Sat Jan 01 19:31:36 2011 -0500 +++ b/usr/src/lib/libc/port/locale/euc.c Thu Jan 13 08:38:20 2011 -0800 @@ -1,5 +1,5 @@ /* - * Copyright 2010 Nexenta Systems, Inc. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved. * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. @@ -39,23 +39,37 @@ #include <string.h> #include <wchar.h> #include <sys/types.h> +#include <sys/euc.h> #include "runetype.h" #include "mblocal.h" -#define MIN(a, b) ((a) < (b) ? (a) : (b)) +static size_t _EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD, + const char *_RESTRICT_KYWD, + size_t, mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t); +static size_t _EUC_wcrtomb_impl(char *_RESTRICT_KYWD, wchar_t, + mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t); -static size_t _EUC_mbrtowc(wchar_t *_RESTRICT_KYWD, +static size_t _EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD, + const char *_RESTRICT_KYWD, + size_t, mbstate_t *_RESTRICT_KYWD); +static size_t _EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD, const char *_RESTRICT_KYWD, size_t, mbstate_t *_RESTRICT_KYWD); -static int _EUC_mbsinit(const mbstate_t *); -static size_t _EUC_wcrtomb(char *_RESTRICT_KYWD, wchar_t, +static size_t _EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD, + const char *_RESTRICT_KYWD, + size_t, mbstate_t *_RESTRICT_KYWD); +static size_t _EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD, + const char *_RESTRICT_KYWD, + size_t, mbstate_t *_RESTRICT_KYWD); +static size_t _EUC_CN_wcrtomb(char *_RESTRICT_KYWD, wchar_t, mbstate_t *_RESTRICT_KYWD); - -typedef struct { - int count[4]; - wchar_t bits[4]; - wchar_t mask; -} _EucInfo; +static size_t _EUC_JP_wcrtomb(char *_RESTRICT_KYWD, wchar_t, + mbstate_t *_RESTRICT_KYWD); +static size_t _EUC_KR_wcrtomb(char *_RESTRICT_KYWD, wchar_t, + mbstate_t *_RESTRICT_KYWD); +static size_t _EUC_TW_wcrtomb(char *_RESTRICT_KYWD, wchar_t, + 
mbstate_t *_RESTRICT_KYWD); +static int _EUC_mbsinit(const mbstate_t *); typedef struct { wchar_t ch; @@ -63,59 +77,6 @@ int want; } _EucState; -int -_EUC_init(_RuneLocale *rl) -{ - _EucInfo *ei; - int x, new__mb_cur_max; - char *v, *e; - - if (rl->__variable == NULL) - return (EINVAL); - - v = (char *)rl->__variable; - - while (*v == ' ' || *v == '\t') - ++v; - - if ((ei = malloc(sizeof (_EucInfo))) == NULL) - return (errno == 0 ? ENOMEM : errno); - - new__mb_cur_max = 0; - for (x = 0; x < 4; ++x) { - ei->count[x] = (int)strtol(v, &e, 0); - if (v == e || !(v = e)) { - free(ei); - return (EINVAL); - } - if (new__mb_cur_max < ei->count[x]) - new__mb_cur_max = ei->count[x]; - while (*v == ' ' || *v == '\t') - ++v; - ei->bits[x] = (int)strtol(v, &e, 0); - if (v == e || !(v = e)) { - free(ei); - return (EINVAL); - } - while (*v == ' ' || *v == '\t') - ++v; - } - ei->mask = (int)strtol(v, &e, 0); - if (v == e || !(v = e)) { - free(ei); - return (EINVAL); - } - rl->__variable = ei; - rl->__variable_len = sizeof (_EucInfo); - _CurrentRuneLocale = rl; - __ctype[520] = new__mb_cur_max; - __mbrtowc = _EUC_mbrtowc; - __wcrtomb = _EUC_wcrtomb; - __mbsinit = _EUC_mbsinit; - charset_is_ascii = 0; - return (0); -} - static int _EUC_mbsinit(const mbstate_t *ps) { @@ -123,34 +84,147 @@ return (ps == NULL || ((const _EucState *)ps)->want == 0); } -#define CEI ((_EucInfo *)(_CurrentRuneLocale->__variable)) +/* + * EUC-CN uses CS0, CS1 and CS2 (4 bytes). 
+ */ +int +_EUC_CN_init(_RuneLocale *rl) +{ + __mbrtowc = _EUC_CN_mbrtowc; + __wcrtomb = _EUC_CN_wcrtomb; + __mbsinit = _EUC_mbsinit; + + _CurrentRuneLocale = rl; -#define _SS2 0x008e -#define _SS3 0x008f + __ctype[520] = 4; + charset_is_ascii = 0; + return (0); +} -#define GR_BITS 0x80808080 /* XXX: to be fixed */ +static size_t +_EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s, + size_t n, mbstate_t *_RESTRICT_KYWD ps) +{ + return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0)); +} -static int -_euc_set(uint_t c) +static size_t +_EUC_CN_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, + mbstate_t *_RESTRICT_KYWD ps) +{ + return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0)); +} + +/* + * EUC-KR uses only CS0 and CS1. + */ +int +_EUC_KR_init(_RuneLocale *rl) { + __mbrtowc = _EUC_KR_mbrtowc; + __wcrtomb = _EUC_KR_wcrtomb; + __mbsinit = _EUC_mbsinit; - c &= 0xff; - return ((c & 0x80) ? c == _SS3 ? 3 : c == _SS2 ? 2 : 1 : 0); + _CurrentRuneLocale = rl; + + __ctype[520] = 2; + charset_is_ascii = 0; + return (0); +} + +static size_t +_EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s, + size_t n, mbstate_t *_RESTRICT_KYWD ps) +{ + return (_EUC_mbrtowc_impl(pwc, s, n, ps, 0, 0, 0, 0)); } static size_t -_EUC_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s, +_EUC_KR_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, + mbstate_t *_RESTRICT_KYWD ps) +{ + return (_EUC_wcrtomb_impl(s, wc, ps, 0, 0, 0, 0)); +} + +/* + * EUC-JP uses CS0, CS1, CS2, and CS3. 
+ */ +int +_EUC_JP_init(_RuneLocale *rl) +{ + __mbrtowc = _EUC_JP_mbrtowc; + __wcrtomb = _EUC_JP_wcrtomb; + __mbsinit = _EUC_mbsinit; + + _CurrentRuneLocale = rl; + + __ctype[520] = 3; + charset_is_ascii = 0; + return (0); +} + +static size_t +_EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s, size_t n, mbstate_t *_RESTRICT_KYWD ps) { + return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 2, SS3, 3)); +} + +static size_t +_EUC_JP_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, + mbstate_t *_RESTRICT_KYWD ps) +{ + return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 2, SS3, 3)); +} + +/* + * EUC-TW uses CS0, CS1, and CS2. + */ +int +_EUC_TW_init(_RuneLocale *rl) +{ + __mbrtowc = _EUC_TW_mbrtowc; + __wcrtomb = _EUC_TW_wcrtomb; + __mbsinit = _EUC_mbsinit; + + _CurrentRuneLocale = rl; + + __ctype[520] = 4; + charset_is_ascii = 0; + return (0); +} + +static size_t +_EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s, + size_t n, mbstate_t *_RESTRICT_KYWD ps) +{ + return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0)); +} + +static size_t +_EUC_TW_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, + mbstate_t *_RESTRICT_KYWD ps) +{ + return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0)); +} + +/* + * Common EUC code. + */ + +static size_t +_EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s, + size_t n, mbstate_t *_RESTRICT_KYWD ps, + uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width) +{ _EucState *es; - int i, set, want; + int i, want; wchar_t wc; - const char *os; + unsigned char ch; es = (_EucState *)ps; - if (es->want < 0 || es->want > MB_CUR_MAX || es->set < 0 || - es->set > 3) { + if (es->want < 0 || es->want > MB_CUR_MAX) { errno = EINVAL; return ((size_t)-1); } @@ -165,58 +239,59 @@ /* Incomplete multibyte sequence */ return ((size_t)-2); - os = s; + if (es->want == 0) { + /* Fast path for plain ASCII (CS0) */ + if (((ch = (unsigned char)*s) & 0x80) == 0) { + if (pwc != NULL) + *pwc = ch; + return (ch != '\0' ? 
1 : 0); + } - if (es->want == 0) { - want = CEI->count[set = _euc_set(*s)]; - if (set == 2 || set == 3) { - --want; - if (--n == 0) { - /* Incomplete multibyte sequence */ - es->set = set; - es->want = want; - es->ch = 0; - return ((size_t)-2); - } - ++s; - if (*s == '\0') { - errno = EILSEQ; - return ((size_t)-1); - } + if (ch >= 0xa1) { + /* CS1 */ + want = 2; + } else if (ch == cs2) { + want = cs2width; + } else if (ch == cs3) { + want = cs3width; + } else { + errno = EILSEQ; + return ((size_t)-1); } - wc = (unsigned char)*s++; + + + es->want = want; + es->ch = 0; } else { - set = es->set; want = es->want; wc = es->ch; } - for (i = (es->want == 0) ? 1 : 0; i < MIN(want, n); i++) { - if (*s == '\0') { - errno = EILSEQ; - return ((size_t)-1); - } - wc = (wc << 8) | (unsigned char)*s++; + + for (i = 0; i < MIN(want, n); i++) { + wc <<= 8; + wc |= *s; + s++; } if (i < want) { /* Incomplete multibyte sequence */ - es->set = set; es->want = want - i; es->ch = wc; return ((size_t)-2); } - wc = (wc & ~CEI->mask) | CEI->bits[set]; if (pwc != NULL) *pwc = wc; es->want = 0; - return (wc == L'\0' ? 0 : s - os); + return (wc == L'\0' ? 0 : want); } static size_t -_EUC_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, mbstate_t *_RESTRICT_KYWD ps) +_EUC_wcrtomb_impl(char *_RESTRICT_KYWD s, wchar_t wc, + mbstate_t *_RESTRICT_KYWD ps, + uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width) { _EucState *es; - wchar_t m, nm; int i, len; + wchar_t nm; es = (_EucState *)ps; @@ -229,38 +304,52 @@ /* Reset to initial shift state (no-op) */ return (1); - m = wc & CEI->mask; - nm = wc & ~m; + if ((wc & ~0x7f) == 0) { + /* Fast path for plain ASCII (CS0) */ + *s = (char)wc; + return (1); + } - if (m == CEI->bits[1]) { -CodeSet1: - /* Codeset 1: The first byte must have 0x80 in it. 
*/ - i = len = CEI->count[1]; - while (i-- > 0) { - *(unsigned char *)s = (nm >> (i << 3)) | 0x80; - s++; - } + /* Determine the "length" */ + if ((unsigned)wc > 0xffffff) { + len = 4; + } else if ((unsigned)wc > 0xffff) { + len = 3; + } else if ((unsigned)wc > 0xff) { + len = 2; } else { - if (m == CEI->bits[0]) - i = len = CEI->count[0]; - else if (m == CEI->bits[2]) { - i = len = CEI->count[2]; - *(unsigned char *)s = _SS2; - s++; - --i; - /* SS2 designates G2 into GR */ - nm |= GR_BITS; - } else if (m == CEI->bits[3]) { - i = len = CEI->count[3]; - *(unsigned char *)s = _SS3; - s++; - --i; - /* SS3 designates G3 into GR */ - nm |= GR_BITS; - } else - goto CodeSet1; /* Bletch */ - while (i-- > 0) - *s++ = (nm >> (i << 3)) & 0xff; + len = 1; + } + + if (len > MB_CUR_MAX) { + errno = EILSEQ; + return ((size_t)-1); + } + + /* This first check excludes CS1, which is implicitly valid. */ + if ((wc < 0xa100) || (wc > 0xffff)) { + /* Check for valid CS2 or CS3 */ + nm = (wc >> ((len - 1) * 8)); + if (nm == cs2) { + if (len != cs2width) { + errno = EILSEQ; + return ((size_t)-1); + } + } else if (nm == cs3) { + if (len != cs3width) { + errno = EILSEQ; + return ((size_t)-1); + } + } else { + errno = EILSEQ; + return ((size_t)-1); + } + } + + /* Stash the bytes, least significant last */ + for (i = len - 1; i >= 0; i--) { + s[i] = (wc & 0xff); + wc >>= 8; } return (len); }
--- a/usr/src/lib/libc/port/locale/mblocal.h Sat Jan 01 19:31:36 2011 -0500
+++ b/usr/src/lib/libc/port/locale/mblocal.h Thu Jan 13 08:38:20 2011 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2004 Tim J. Robbins.
  * All rights reserved.
  *
@@ -36,7 +36,10 @@
 int _none_init(_RuneLocale *);
 int _ascii_init(_RuneLocale *);
 int _UTF8_init(_RuneLocale *);
-int _EUC_init(_RuneLocale *);
+int _EUC_CN_init(_RuneLocale *);
+int _EUC_JP_init(_RuneLocale *);
+int _EUC_KR_init(_RuneLocale *);
+int _EUC_TW_init(_RuneLocale *);
 int _GB18030_init(_RuneLocale *);
 int _GB2312_init(_RuneLocale *);
 int _GBK_init(_RuneLocale *);
--- a/usr/src/lib/libc/port/locale/setrunelocale.c Sat Jan 01 19:31:36 2011 -0500
+++ b/usr/src/lib/libc/port/locale/setrunelocale.c Thu Jan 13 08:38:20 2011 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 1993
  * The Regents of the University of California. All rights reserved.
  *
@@ -151,8 +151,14 @@
 ret = _ascii_init(rl);
 else if (strcmp(rl->__encoding, "UTF-8") == 0)
 ret = _UTF8_init(rl);
- else if (strcmp(rl->__encoding, "EUC") == 0)
- ret = _EUC_init(rl);
+ else if (strcmp(rl->__encoding, "EUC-CN") == 0)
+ ret = _EUC_CN_init(rl);
+ else if (strcmp(rl->__encoding, "EUC-JP") == 0)
+ ret = _EUC_JP_init(rl);
+ else if (strcmp(rl->__encoding, "EUC-KR") == 0)
+ ret = _EUC_KR_init(rl);
+ else if (strcmp(rl->__encoding, "EUC-TW") == 0)
+ ret = _EUC_TW_init(rl);
 else if (strcmp(rl->__encoding, "GB18030") == 0)
 ret = _GB18030_init(rl);
 else if (strcmp(rl->__encoding, "GB2312") == 0)
@@ -168,9 +174,6 @@
 if (ret == 0) {
 if (CachedRuneLocale != NULL) {
- /* See euc.c */
- if (strcmp(CachedRuneLocale->__encoding, "EUC") == 0)
- free(CachedRuneLocale->__variable);
 free(CachedRuneLocale);
 }
 CachedRuneLocale = _CurrentRuneLocale;