changeset 13264:1a29e6379e8a

616 euc code in libc not aligned with localedef Reviewed by: roland.mainz@nexenta.com Reviewed by: albert.lee@nexenta.com Approved by: albert.lee@nexenta.com
author Garrett D'Amore <garrett@nexenta.com>
date Thu, 13 Jan 2011 08:38:20 -0800
parents cac385f011a3
children ff6d445369ca
files usr/src/lib/libc/port/locale/euc.c usr/src/lib/libc/port/locale/mblocal.h usr/src/lib/libc/port/locale/setrunelocale.c
diffstat 3 files changed, 240 insertions(+), 145 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/lib/libc/port/locale/euc.c	Sat Jan 01 19:31:36 2011 -0500
+++ b/usr/src/lib/libc/port/locale/euc.c	Thu Jan 13 08:38:20 2011 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved.
  * Copyright (c) 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -39,23 +39,37 @@
 #include <string.h>
 #include <wchar.h>
 #include <sys/types.h>
+#include <sys/euc.h>
 #include "runetype.h"
 #include "mblocal.h"
 
-#define	MIN(a, b)	((a) < (b) ? (a) : (b))
+static size_t	_EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD,
+    const char *_RESTRICT_KYWD,
+    size_t, mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t);
+static size_t	_EUC_wcrtomb_impl(char *_RESTRICT_KYWD, wchar_t,
+    mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t);
 
-static size_t	_EUC_mbrtowc(wchar_t *_RESTRICT_KYWD,
+static size_t	_EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD,
+		    const char *_RESTRICT_KYWD,
+		    size_t, mbstate_t *_RESTRICT_KYWD);
+static size_t	_EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD,
 		    const char *_RESTRICT_KYWD,
 		    size_t, mbstate_t *_RESTRICT_KYWD);
-static int	_EUC_mbsinit(const mbstate_t *);
-static size_t	_EUC_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
+static size_t	_EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD,
+		    const char *_RESTRICT_KYWD,
+		    size_t, mbstate_t *_RESTRICT_KYWD);
+static size_t	_EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD,
+		    const char *_RESTRICT_KYWD,
+		    size_t, mbstate_t *_RESTRICT_KYWD);
+static size_t	_EUC_CN_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
 		    mbstate_t *_RESTRICT_KYWD);
-
-typedef struct {
-	int	count[4];
-	wchar_t	bits[4];
-	wchar_t	mask;
-} _EucInfo;
+static size_t	_EUC_JP_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
+		    mbstate_t *_RESTRICT_KYWD);
+static size_t	_EUC_KR_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
+		    mbstate_t *_RESTRICT_KYWD);
+static size_t	_EUC_TW_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
+		    mbstate_t *_RESTRICT_KYWD);
+static int	_EUC_mbsinit(const mbstate_t *);
 
 typedef struct {
 	wchar_t	ch;
@@ -63,59 +77,6 @@
 	int	want;
 } _EucState;
 
-int
-_EUC_init(_RuneLocale *rl)
-{
-	_EucInfo *ei;
-	int x, new__mb_cur_max;
-	char *v, *e;
-
-	if (rl->__variable == NULL)
-		return (EINVAL);
-
-	v = (char *)rl->__variable;
-
-	while (*v == ' ' || *v == '\t')
-		++v;
-
-	if ((ei = malloc(sizeof (_EucInfo))) == NULL)
-		return (errno == 0 ? ENOMEM : errno);
-
-	new__mb_cur_max = 0;
-	for (x = 0; x < 4; ++x) {
-		ei->count[x] = (int)strtol(v, &e, 0);
-		if (v == e || !(v = e)) {
-			free(ei);
-			return (EINVAL);
-		}
-		if (new__mb_cur_max < ei->count[x])
-			new__mb_cur_max = ei->count[x];
-		while (*v == ' ' || *v == '\t')
-			++v;
-		ei->bits[x] = (int)strtol(v, &e, 0);
-		if (v == e || !(v = e)) {
-			free(ei);
-			return (EINVAL);
-		}
-		while (*v == ' ' || *v == '\t')
-			++v;
-	}
-	ei->mask = (int)strtol(v, &e, 0);
-	if (v == e || !(v = e)) {
-		free(ei);
-		return (EINVAL);
-	}
-	rl->__variable = ei;
-	rl->__variable_len = sizeof (_EucInfo);
-	_CurrentRuneLocale = rl;
-	__ctype[520] = new__mb_cur_max;
-	__mbrtowc = _EUC_mbrtowc;
-	__wcrtomb = _EUC_wcrtomb;
-	__mbsinit = _EUC_mbsinit;
-	charset_is_ascii = 0;
-	return (0);
-}
-
 static int
 _EUC_mbsinit(const mbstate_t *ps)
 {
@@ -123,34 +84,147 @@
 	return (ps == NULL || ((const _EucState *)ps)->want == 0);
 }
 
-#define	CEI	((_EucInfo *)(_CurrentRuneLocale->__variable))
+/*
+ * EUC-CN uses CS0, CS1 and CS2 (4 bytes).
+ */
+int
+_EUC_CN_init(_RuneLocale *rl)
+{
+	__mbrtowc = _EUC_CN_mbrtowc;
+	__wcrtomb = _EUC_CN_wcrtomb;
+	__mbsinit = _EUC_mbsinit;
+
+	_CurrentRuneLocale = rl;
 
-#define	_SS2	0x008e
-#define	_SS3	0x008f
+	__ctype[520] = 4;
+	charset_is_ascii = 0;
+	return (0);
+}
 
-#define	GR_BITS	0x80808080 /* XXX: to be fixed */
+static size_t
+_EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
+    size_t n, mbstate_t *_RESTRICT_KYWD ps)
+{
+	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
+}
 
-static int
-_euc_set(uint_t c)
+static size_t
+_EUC_CN_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
+    mbstate_t *_RESTRICT_KYWD ps)
+{
+	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
+}
+
+/*
+ * EUC-KR uses only CS0 and CS1.
+ */
+int
+_EUC_KR_init(_RuneLocale *rl)
 {
+	__mbrtowc = _EUC_KR_mbrtowc;
+	__wcrtomb = _EUC_KR_wcrtomb;
+	__mbsinit = _EUC_mbsinit;
 
-	c &= 0xff;
-	return ((c & 0x80) ? c == _SS3 ? 3 : c == _SS2 ? 2 : 1 : 0);
+	_CurrentRuneLocale = rl;
+
+	__ctype[520] = 2;
+	charset_is_ascii = 0;
+	return (0);
+}
+
+static size_t
+_EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
+    size_t n, mbstate_t *_RESTRICT_KYWD ps)
+{
+	return (_EUC_mbrtowc_impl(pwc, s, n, ps, 0, 0, 0, 0));
 }
 
 static size_t
-_EUC_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
+_EUC_KR_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
+    mbstate_t *_RESTRICT_KYWD ps)
+{
+	return (_EUC_wcrtomb_impl(s, wc, ps, 0, 0, 0, 0));
+}
+
+/*
+ * EUC-JP uses CS0, CS1, CS2, and CS3.
+ */
+int
+_EUC_JP_init(_RuneLocale *rl)
+{
+	__mbrtowc = _EUC_JP_mbrtowc;
+	__wcrtomb = _EUC_JP_wcrtomb;
+	__mbsinit = _EUC_mbsinit;
+
+	_CurrentRuneLocale = rl;
+
+	__ctype[520] = 3;
+	charset_is_ascii = 0;
+	return (0);
+}
+
+static size_t
+_EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
     size_t n, mbstate_t *_RESTRICT_KYWD ps)
 {
+	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 2, SS3, 3));
+}
+
+static size_t
+_EUC_JP_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
+    mbstate_t *_RESTRICT_KYWD ps)
+{
+	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 2, SS3, 3));
+}
+
+/*
+ * EUC-TW uses CS0, CS1, and CS2.
+ */
+int
+_EUC_TW_init(_RuneLocale *rl)
+{
+	__mbrtowc = _EUC_TW_mbrtowc;
+	__wcrtomb = _EUC_TW_wcrtomb;
+	__mbsinit = _EUC_mbsinit;
+
+	_CurrentRuneLocale = rl;
+
+	__ctype[520] = 4;
+	charset_is_ascii = 0;
+	return (0);
+}
+
+static size_t
+_EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
+    size_t n, mbstate_t *_RESTRICT_KYWD ps)
+{
+	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
+}
+
+static size_t
+_EUC_TW_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
+    mbstate_t *_RESTRICT_KYWD ps)
+{
+	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
+}
+
+/*
+ * Common EUC code.
+ */
+
+static size_t
+_EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
+    size_t n, mbstate_t *_RESTRICT_KYWD ps,
+    uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
+{
 	_EucState *es;
-	int i, set, want;
+	int i, want;
 	wchar_t wc;
-	const char *os;
+	unsigned char ch;
 
 	es = (_EucState *)ps;
 
-	if (es->want < 0 || es->want > MB_CUR_MAX || es->set < 0 ||
-	    es->set > 3) {
+	if (es->want < 0 || es->want > MB_CUR_MAX) {
 		errno = EINVAL;
 		return ((size_t)-1);
 	}
@@ -165,58 +239,59 @@
 		/* Incomplete multibyte sequence */
 		return ((size_t)-2);
 
-	os = s;
+	if (es->want == 0) {
+		/* Fast path for plain ASCII (CS0) */
+		if (((ch = (unsigned char)*s) & 0x80) == 0) {
+			if (pwc != NULL)
+				*pwc = ch;
+			return (ch != '\0' ? 1 : 0);
+		}
 
-	if (es->want == 0) {
-		want = CEI->count[set = _euc_set(*s)];
-		if (set == 2 || set == 3) {
-			--want;
-			if (--n == 0) {
-				/* Incomplete multibyte sequence */
-				es->set = set;
-				es->want = want;
-				es->ch = 0;
-				return ((size_t)-2);
-			}
-			++s;
-			if (*s == '\0') {
-				errno = EILSEQ;
-				return ((size_t)-1);
-			}
+		if (ch >= 0xa1) {
+			/* CS1 */
+			want = 2;
+		} else if (ch == cs2) {
+			want = cs2width;
+		} else if (ch == cs3) {
+			want = cs3width;
+		} else {
+			errno = EILSEQ;
+			return ((size_t)-1);
 		}
-		wc = (unsigned char)*s++;
+
+
+		es->want = want;
+		es->ch = 0;
 	} else {
-		set = es->set;
 		want = es->want;
 		wc = es->ch;
 	}
-	for (i = (es->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
-		if (*s == '\0') {
-			errno = EILSEQ;
-			return ((size_t)-1);
-		}
-		wc = (wc << 8) | (unsigned char)*s++;
+
+	for (i = 0; i < MIN(want, n); i++) {
+		wc <<= 8;
+		wc |= *s;
+		s++;
 	}
 	if (i < want) {
 		/* Incomplete multibyte sequence */
-		es->set = set;
 		es->want = want - i;
 		es->ch = wc;
 		return ((size_t)-2);
 	}
-	wc = (wc & ~CEI->mask) | CEI->bits[set];
 	if (pwc != NULL)
 		*pwc = wc;
 	es->want = 0;
-	return (wc == L'\0' ? 0 : s - os);
+	return (wc == L'\0' ? 0 : want);
 }
 
 static size_t
-_EUC_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, mbstate_t *_RESTRICT_KYWD ps)
+_EUC_wcrtomb_impl(char *_RESTRICT_KYWD s, wchar_t wc,
+    mbstate_t *_RESTRICT_KYWD ps,
+    uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
 {
 	_EucState *es;
-	wchar_t m, nm;
 	int i, len;
+	wchar_t nm;
 
 	es = (_EucState *)ps;
 
@@ -229,38 +304,52 @@
 		/* Reset to initial shift state (no-op) */
 		return (1);
 
-	m = wc & CEI->mask;
-	nm = wc & ~m;
+	if ((wc & ~0x7f) == 0) {
+		/* Fast path for plain ASCII (CS0) */
+		*s = (char)wc;
+		return (1);
+	}
 
-	if (m == CEI->bits[1]) {
-CodeSet1:
-		/* Codeset 1: The first byte must have 0x80 in it. */
-		i = len = CEI->count[1];
-		while (i-- > 0) {
-			*(unsigned char *)s = (nm >> (i << 3)) | 0x80;
-			s++;
-		}
+	/* Determine the "length" */
+	if ((unsigned)wc > 0xffffff) {
+		len = 4;
+	} else if ((unsigned)wc > 0xffff) {
+		len = 3;
+	} else if ((unsigned)wc > 0xff) {
+		len = 2;
 	} else {
-		if (m == CEI->bits[0])
-			i = len = CEI->count[0];
-		else if (m == CEI->bits[2]) {
-			i = len = CEI->count[2];
-			*(unsigned char *)s = _SS2;
-			s++;
-			--i;
-			/* SS2 designates G2 into GR */
-			nm |= GR_BITS;
-		} else if (m == CEI->bits[3]) {
-			i = len = CEI->count[3];
-			*(unsigned char *)s = _SS3;
-			s++;
-			--i;
-			/* SS3 designates G3 into GR */
-			nm |= GR_BITS;
-		} else
-			goto CodeSet1;	/* Bletch */
-		while (i-- > 0)
-			*s++ = (nm >> (i << 3)) & 0xff;
+		len = 1;
+	}
+
+	if (len > MB_CUR_MAX) {
+		errno = EILSEQ;
+		return ((size_t)-1);
+	}
+
+	/* This first check excludes CS1, which is implicitly valid. */
+	if ((wc < 0xa100) || (wc > 0xffff)) {
+		/* Check for valid CS2 or CS3 */
+		nm = (wc >> ((len - 1) * 8));
+		if (nm == cs2) {
+			if (len != cs2width) {
+				errno = EILSEQ;
+				return ((size_t)-1);
+			}
+		} else if (nm == cs3) {
+			if (len != cs3width) {
+				errno = EILSEQ;
+				return ((size_t)-1);
+			}
+		} else {
+			errno = EILSEQ;
+			return ((size_t)-1);
+		}
+	}
+
+	/* Stash the bytes, least significant last */
+	for (i = len - 1; i >= 0; i--) {
+		s[i] = (wc & 0xff);
+		wc >>= 8;
 	}
 	return (len);
 }
--- a/usr/src/lib/libc/port/locale/mblocal.h	Sat Jan 01 19:31:36 2011 -0500
+++ b/usr/src/lib/libc/port/locale/mblocal.h	Thu Jan 13 08:38:20 2011 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2004 Tim J. Robbins.
  * All rights reserved.
  *
@@ -36,7 +36,10 @@
 int	_none_init(_RuneLocale *);
 int	_ascii_init(_RuneLocale *);
 int	_UTF8_init(_RuneLocale *);
-int	_EUC_init(_RuneLocale *);
+int	_EUC_CN_init(_RuneLocale *);
+int	_EUC_JP_init(_RuneLocale *);
+int	_EUC_KR_init(_RuneLocale *);
+int	_EUC_TW_init(_RuneLocale *);
 int	_GB18030_init(_RuneLocale *);
 int	_GB2312_init(_RuneLocale *);
 int	_GBK_init(_RuneLocale *);
--- a/usr/src/lib/libc/port/locale/setrunelocale.c	Sat Jan 01 19:31:36 2011 -0500
+++ b/usr/src/lib/libc/port/locale/setrunelocale.c	Thu Jan 13 08:38:20 2011 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 1993
  *	The Regents of the University of California.  All rights reserved.
  *
@@ -151,8 +151,14 @@
 		ret = _ascii_init(rl);
 	else if (strcmp(rl->__encoding, "UTF-8") == 0)
 		ret = _UTF8_init(rl);
-	else if (strcmp(rl->__encoding, "EUC") == 0)
-		ret = _EUC_init(rl);
+	else if (strcmp(rl->__encoding, "EUC-CN") == 0)
+		ret = _EUC_CN_init(rl);
+	else if (strcmp(rl->__encoding, "EUC-JP") == 0)
+		ret = _EUC_JP_init(rl);
+	else if (strcmp(rl->__encoding, "EUC-KR") == 0)
+		ret = _EUC_KR_init(rl);
+	else if (strcmp(rl->__encoding, "EUC-TW") == 0)
+		ret = _EUC_TW_init(rl);
 	else if (strcmp(rl->__encoding, "GB18030") == 0)
 		ret = _GB18030_init(rl);
 	else if (strcmp(rl->__encoding, "GB2312") == 0)
@@ -168,9 +174,6 @@
 
 	if (ret == 0) {
 		if (CachedRuneLocale != NULL) {
-			/* See euc.c */
-			if (strcmp(CachedRuneLocale->__encoding, "EUC") == 0)
-				free(CachedRuneLocale->__variable);
 			free(CachedRuneLocale);
 		}
 		CachedRuneLocale = _CurrentRuneLocale;