changeset 13206:e072bd4baed8

326 strxfrm could be faster 330 isprint() returns false for "legacy" code, results in bad prompt Reviewed by: gwr@nexenta.com Reviewed by: cjlove@san.rr.com Reviewed by: richlowe@richlowe.net Approved by: richlowe@richlowe.net
author Garrett D'Amore <garrett@nexenta.com>
date Mon, 11 Oct 2010 16:24:58 -0700
parents 969b97e84bdc
children 4f482eb481b3
files usr/src/cmd/localedef/collate.c usr/src/cmd/localedef/ctype.c usr/src/cmd/localedef/localedef.h usr/src/cmd/localedef/wide.c usr/src/lib/libc/port/locale/collate.c usr/src/lib/libc/port/locale/collate.h
diffstat 6 files changed, 429 insertions(+), 155 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/localedef/collate.c	Sun Oct 10 04:30:51 2010 +0100
+++ b/usr/src/cmd/localedef/collate.c	Mon Oct 11 16:24:58 2010 -0700
@@ -33,6 +33,67 @@
 #include "collate.h"
 
 /*
+ * Design notes.
+ *
+ * It will be extremely helpful to the reader if they have access to
+ * the localedef and locale file format specifications available.
+ * Latest versions of these are available from www.opengroup.org.
+ *
+ * The design for the collation code is a bit complex.  The goal is a
+ * single collation database as described in collate.h (in
+ * libc/port/locale).  However, there are some other tidbits:
+ *
+ * a) The substitution entries are now a directly indexable array.  A
+ * priority elsewhere in the table is taken as an index into the
+ * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY)
+ * set.  (The bit is cleared and the result is the index into the
+ * table.
+ *
+ * b) We eliminate duplicate entries into the substitution table.
+ * This saves a lot of space.
+ *
+ * c) The priorities for each level are "compressed", so that each
+ * sorting level has consecutively numbered priorities starting at 1.
+ * (O is reserved for the ignore priority.)  This means sort levels
+ * which only have a few distinct priorities can represent the
+ * priority level in fewer bits, which makes the strxfrm output
+ * smaller.
+ *
+ * d) We record the total number of priorities so that strxfrm can
+ * figure out how many bytes to expand a numeric priority into.
+ *
+ * e) For the UNDEFINED pass (the last pass), we record the maximum
+ * number of bits needed to uniquely prioritize these entries, so that
+ * the last pass can also use smaller strxfrm output when possible.
+ *
+ * f) Priorities with the sign bit set are verboten.  This works out
+ * because no active character set needs that bit to carry significant
+ * information once the character is in wide form.
+ *
+ * To process the entire data to make the database, we actually run
+ * multiple passes over the data.
+ *
+ * The first pass, which is done at parse time, identifies elements,
+ * substitutions, and such, and records them in priority order.  As
+ * some priorities can refer to other priorities, using forward
+ * references, we use a table of references indicating whether the
+ * priority's value has been resolved, or whether it is still a
+ * reference.
+ *
+ * The second pass walks over all the items in priority order, noting
+ * that they are used directly, and not just an indirect reference.
+ * This is done by creating a "weight" structure for the item.  The
+ * weights are stashed in an AVL tree sorted by relative "priority".
+ *
+ * The third pass walks over all the weight structures, in priority
+ * order, and assigns a new monotonically increasing (per sort level)
+ * weight value to them.  These are the values that will actually be
+ * written to the file.
+ *
+ * The fourth pass just writes the data out.
+ */
+
+/*
  * In order to resolve the priorities, we create a table of priorities.
  * Entries in the table can be in one of three states.
  *
@@ -48,8 +109,6 @@
  * this is used for forward references.  A collating-symbol can never
  * have this value.
  *
- * The "self" field is a reference to or own index in the pri list.
- *
  * The "pass" field is used during final resolution to aid in detection
  * of referencing loops.  (For example <A> depends on <B>, but <B> has its
  * priority dependent on <A>.)
@@ -60,15 +119,19 @@
 	REFER		/* priority is a reference (index) */
 } res_t;
 
+typedef struct weight {
+	int32_t		pri;
+	int		opt;
+	avl_node_t	avl;
+} weight_t;
+
 typedef struct priority {
 	res_t		res;
 	int32_t		pri;
-	int32_t		self;
 	int		pass;
 	int		lineno;
 } collpri_t;
 
-
 #define	NUM_WT	collinfo.directive_count
 
 /*
@@ -110,7 +173,7 @@
  */
 typedef struct collchar {
 	wchar_t		wc;
-	wchar_t		ref[COLL_WEIGHTS_MAX];
+	int32_t		ref[COLL_WEIGHTS_MAX];
 	avl_node_t	avl;
 } collchar_t;
 
@@ -124,6 +187,7 @@
 	int32_t		key;
 	int32_t		ref[COLLATE_STR_LEN];
 	avl_node_t	avl;
+	avl_node_t	avl_ref;
 } subst_t;
 
 static avl_tree_t	collsyms;
@@ -132,6 +196,9 @@
 static avl_tree_t	elem_by_expand;
 static avl_tree_t	collchars;
 static avl_tree_t	substs[COLL_WEIGHTS_MAX];
+static avl_tree_t	substs_ref[COLL_WEIGHTS_MAX];
+static avl_tree_t	weights[COLL_WEIGHTS_MAX];
+static int32_t		nweight[COLL_WEIGHTS_MAX];
 
 /*
  * This is state tracking for the ellipsis token.  Note that we start
@@ -152,6 +219,7 @@
  * We keep a running tally of weights.
  */
 static int nextpri = 1;
+static int nextsubst[COLL_WEIGHTS_MAX] = { 0 };
 
 /*
  * This array collects up the weights for each level.
@@ -191,7 +259,6 @@
 			prilist[i].res = UNKNOWN;
 			prilist[i].pri = 0;
 			prilist[i].pass = 0;
-			prilist[i].self = i;
 		}
 	}
 	return (numpri++);
@@ -267,6 +334,15 @@
 }
 
 static int
+weight_compare(const void *n1, const void *n2)
+{
+	int32_t	k1 = ((const weight_t *)n1)->pri;
+	int32_t	k2 = ((const weight_t *)n2)->pri;
+
+	return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
+}
+
+static int
 collsym_compare(const void *n1, const void *n2)
 {
 	const collsym_t *c1 = n1;
@@ -328,6 +404,17 @@
 	return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
 }
 
+static int
+subst_compare_ref(const void *n1, const void *n2)
+{
+	int32_t *c1 = ((subst_t *)n1)->ref;
+	int32_t *c2 = ((subst_t *)n2)->ref;
+	int rv;
+
+	rv = wcscmp((wchar_t *)c1, (wchar_t *)c2);
+	return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
+}
+
 void
 init_collate(void)
 {
@@ -347,9 +434,15 @@
 	avl_create(&collchars, collchar_compare, sizeof (collchar_t),
 	    offsetof(collchar_t, avl));
 
-	for (i = 0; i < COLL_WEIGHTS_MAX; i++)
+	for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
 		avl_create(&substs[i], subst_compare, sizeof (subst_t),
 		    offsetof(subst_t, avl));
+		avl_create(&substs_ref[i], subst_compare_ref,
+		    sizeof (subst_t), offsetof(subst_t, avl_ref));
+		avl_create(&weights[i], weight_compare, sizeof (weight_t),
+		    offsetof(weight_t, avl));
+		nweight[i] = 1;
+	}
 
 	(void) memset(&collinfo, 0, sizeof (collinfo));
 
@@ -796,37 +889,52 @@
 void
 add_order_subst(void)
 {
+	subst_t srch;
 	subst_t	*s;
 	avl_index_t where;
 	int i;
 
-	if ((s = calloc(sizeof (*s), 1)) == NULL) {
-		errf(_("out of memory"));
-		return;
+	(void) memset(&srch, 0, sizeof (srch));
+	for (i = 0; i < curr_subst; i++) {
+		srch.ref[i] = subst_weights[i];
+		subst_weights[i] = 0;
 	}
-	s->key = new_pri();
+	s = avl_find(&substs_ref[curr_weight], &srch, &where);
+
+	if (s == NULL) {
+		if ((s = calloc(sizeof (*s), 1)) == NULL) {
+			errf(_("out of memory"));
+			return;
+		}
+		s->key = new_pri();
 
-	/*
-	 * We use a self reference for our key, but we set a high bit
-	 * to indicate that this is a substitution reference.  This
-	 * will expedite table lookups later, and prevent table
-	 * lookups for situations that don't require it.  (In short,
-	 * its a big win, because we can skip a lot of binary
-	 * searching.)
-	 */
-	set_pri(s->key, (nextpri | COLLATE_SUBST_PRIORITY), RESOLVED);
+		/*
+		 * We use a self reference for our key, but we set a
+		 * high bit to indicate that this is a substitution
+		 * reference.  This will expedite table lookups later,
+		 * and prevent table lookups for situations that don't
+		 * require it.  (In short, its a big win, because we
+		 * can skip a lot of binary searching.)
+		 */
+		set_pri(s->key,
+		    (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY),
+		    RESOLVED);
+		nextsubst[curr_weight] += 1;
 
-	for (i = 0; i < curr_subst; i++) {
-		s->ref[i] = subst_weights[i];
-		subst_weights[i] = 0;
+		for (i = 0; i < curr_subst; i++) {
+			s->ref[i] = srch.ref[i];
+		}
+
+		avl_insert(&substs_ref[curr_weight], s, where);
+
+		if (avl_find(&substs[curr_weight], s, &where) != NULL) {
+			INTERR;
+			return;
+		}
+		avl_insert(&substs[curr_weight], s, where);
 	}
 	curr_subst = 0;
 
-	if (avl_find(&substs[curr_weight], s, &where) != NULL) {
-		INTERR;
-		return;
-	}
-	avl_insert(&substs[curr_weight], s, where);
 
 	/*
 	 * We are using the current (unique) priority as a search key
@@ -884,6 +992,65 @@
 }
 
 void
+add_weight(int32_t ref, int pass)
+{
+	weight_t srch;
+	weight_t *w;
+	avl_index_t where;
+
+	srch.pri = resolve_pri(ref);
+
+	/* No translation of ignores */
+	if (srch.pri == 0)
+		return;
+
+	/* Substitution priorities are not weights */
+	if (srch.pri & COLLATE_SUBST_PRIORITY)
+		return;
+
+	if (avl_find(&weights[pass], &srch, &where) != NULL)
+		return;
+
+	if ((w = calloc(sizeof (*w), 1)) == NULL) {
+		errf(_("out of memory"));
+		return;
+	}
+	w->pri = srch.pri;
+	avl_insert(&weights[pass], w, where);
+}
+
+void
+add_weights(int32_t *refs)
+{
+	int i;
+	for (i = 0; i < NUM_WT; i++) {
+		add_weight(refs[i], i);
+	}
+}
+
+int32_t
+get_weight(int32_t ref, int pass)
+{
+	weight_t	srch;
+	weight_t	*w;
+	int32_t		pri;
+
+	pri = resolve_pri(ref);
+	if (pri & COLLATE_SUBST_PRIORITY) {
+		return (pri);
+	}
+	if (pri <= 0) {
+		return (pri);
+	}
+	srch.pri = pri;
+	if ((w = avl_find(&weights[pass], &srch, NULL)) == NULL) {
+		INTERR;
+		return (-1);
+	}
+	return (w->opt);
+}
+
+void
 dump_collate(void)
 {
 	FILE			*f;
@@ -894,18 +1061,56 @@
 	collchar_t		*cc;
 	subst_t			*sb;
 	char			vers[COLLATE_STR_LEN];
-	collate_char_pri_t	char_pri[UCHAR_MAX + 1];
-	collate_large_pri_t	*large;
+	collate_char_t		chars[UCHAR_MAX + 1];
+	collate_large_t		*large;
 	collate_subst_t		*subst[COLL_WEIGHTS_MAX];
-	collate_chain_pri_t	*chain;
+	collate_chain_t		*chain;
 
-	(void) memset(&char_pri, 0, sizeof (char_pri));
+	/*
+	 * We have to run throught a preliminary pass to identify all the
+	 * weights that we use for each sorting level.
+	 */
+	for (i = 0; i < NUM_WT; i++) {
+		add_weight(pri_ignore, i);
+	}
+	for (i = 0; i < NUM_WT; i++) {
+		for (sb = avl_first(&substs[i]); sb;
+		    sb = AVL_NEXT(&substs[i], sb)) {
+			for (j = 0; sb->ref[j]; j++) {
+				add_weight(sb->ref[j], i);
+			}
+		}
+	}
+	for (ce = avl_first(&elem_by_expand);
+	    ce != NULL;
+	    ce = AVL_NEXT(&elem_by_expand, ce)) {
+		add_weights(ce->ref);
+	}
+	for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
+		add_weights(cc->ref);
+	}
+
+	/*
+	 * Now we walk the entire set of weights, removing the gaps
+	 * in the weights.  This gives us optimum usage.  The walk
+	 * occurs in priority.
+	 */
+	for (i = 0; i < NUM_WT; i++) {
+		weight_t *w;
+		for (w = avl_first(&weights[i]); w;
+		    w = AVL_NEXT(&weights[i], w)) {
+			w->opt = nweight[i];
+			nweight[i] += 1;
+		}
+	}
+
+	(void) memset(&chars, 0, sizeof (chars));
 	(void) memset(vers, 0, COLLATE_STR_LEN);
-	(void) snprintf(vers, sizeof (vers), COLLATE_VERSION);
+	(void) strlcpy(vers, COLLATE_VERSION, sizeof (vers));
 
 	/*
 	 * We need to make sure we arrange for the UNDEFINED field
-	 * to show up.
+	 * to show up.  Also, set the total weight counts.
 	 */
 	for (i = 0; i < NUM_WT; i++) {
 		if (resolve_pri(pri_undefined[i]) == -1) {
@@ -913,8 +1118,10 @@
 			/* they collate at the end of everything else */
 			collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY;
 		}
+		collinfo.pri_count[i] = nweight[i];
 	}
 
+	collinfo.pri_count[NUM_WT] = max_wide();
 	collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY;
 	collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED;
 
@@ -924,26 +1131,26 @@
 	for (i = 0; i <= UCHAR_MAX; i++) {
 		if ((cc = get_collchar(i, 0)) != NULL) {
 			for (j = 0; j < NUM_WT; j++) {
-				char_pri[i].pri[j] = resolve_pri(cc->ref[j]);
+				chars[i].pri[j] = get_weight(cc->ref[j], j);
 			}
 		} else {
 			for (j = 0; j < NUM_WT; j++) {
-				char_pri[i].pri[j] =
-				    resolve_pri(pri_undefined[j]);
+				chars[i].pri[j] =
+				    get_weight(pri_undefined[j], j);
 			}
 			/*
 			 * Per POSIX, for undefined characters, we
 			 * also have to add a last item, which is the
 			 * character code.
 			 */
-			char_pri[i].pri[NUM_WT] = i;
+			chars[i].pri[NUM_WT] = i;
 		}
 	}
 
 	/*
 	 * Substitution tables
 	 */
-	for (i = 0; i < collinfo.directive_count; i++) {
+	for (i = 0; i < NUM_WT; i++) {
 		collate_subst_t *st = NULL;
 		n = collinfo.subst_count[i] = avl_numnodes(&substs[i]);
 		if ((st = calloc(sizeof (collate_subst_t) * n, 1)) == NULL) {
@@ -957,8 +1164,11 @@
 				/* by definition these resolve! */
 				INTERR;
 			}
+			if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) {
+				INTERR;
+			}
 			for (j = 0; sb->ref[j]; j++) {
-				st[n].pri[j] = resolve_pri(sb->ref[j]);
+				st[n].pri[j] = get_weight(sb->ref[j], i);
 			}
 			n++;
 		}
@@ -967,11 +1177,12 @@
 		subst[i] = st;
 	}
 
+
 	/*
 	 * Chains, i.e. collating elements
 	 */
 	collinfo.chain_count = avl_numnodes(&elem_by_expand);
-	chain = calloc(sizeof (collate_chain_pri_t), collinfo.chain_count);
+	chain = calloc(sizeof (collate_chain_t), collinfo.chain_count);
 	if (chain == NULL) {
 		errf(_("out of memory"));
 		return;
@@ -981,7 +1192,7 @@
 	    ce = AVL_NEXT(&elem_by_expand, ce), n++) {
 		(void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN);
 		for (i = 0; i < NUM_WT; i++) {
-			chain[n].pri[i] = resolve_pri(ce->ref[i]);
+			chain[n].pri[i] = get_weight(ce->ref[i], i);
 		}
 	}
 	if (n != collinfo.chain_count)
@@ -990,8 +1201,7 @@
 	/*
 	 * Large (> UCHAR_MAX) character priorities
 	 */
-	large = calloc(sizeof (collate_large_pri_t) * avl_numnodes(&collchars),
-	    1);
+	large = calloc(sizeof (collate_large_t) * avl_numnodes(&collchars), 1);
 	if (large == NULL) {
 		errf(_("out of memory"));
 		return;
@@ -1004,7 +1214,7 @@
 		if (cc->wc <= UCHAR_MAX)
 			continue;
 		for (j = 0; j < NUM_WT; j++) {
-			if ((pri = resolve_pri(cc->ref[j])) < 0) {
+			if ((pri = get_weight(cc->ref[j], j)) < 0) {
 				undef = 1;
 			}
 			if (undef && (pri >= 0)) {
@@ -1016,7 +1226,7 @@
 		}
 		if (!undef) {
 			large[i].val = cc->wc;
-			collinfo.large_pri_count = i++;
+			collinfo.large_count = i++;
 		}
 	}
 
@@ -1028,7 +1238,7 @@
 
 	if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) ||
 	    (wr_category(&collinfo, sizeof (collinfo), f) < 0) ||
-	    (wr_category(&char_pri, sizeof (char_pri), f) < 0)) {
+	    (wr_category(&chars, sizeof (chars), f) < 0)) {
 		return;
 	}
 
@@ -1038,11 +1248,11 @@
 			return;
 		}
 	}
-	sz = sizeof (collate_chain_pri_t) * collinfo.chain_count;
+	sz = sizeof (collate_chain_t) * collinfo.chain_count;
 	if (wr_category(chain, sz, f) < 0) {
 		return;
 	}
-	sz = sizeof (collate_large_pri_t) * collinfo.large_pri_count;
+	sz = sizeof (collate_large_t) * collinfo.large_count;
 	if (wr_category(large, sz, f) < 0) {
 		return;
 	}
--- a/usr/src/cmd/localedef/ctype.c	Sun Oct 10 04:30:51 2010 +0100
+++ b/usr/src/cmd/localedef/ctype.c	Mon Oct 11 16:24:58 2010 -0700
@@ -244,6 +244,20 @@
 				ctn->ctype |= _ISXDIGIT;
 			if (strchr(" \t", (char)wc))
 				ctn->ctype |= _ISBLANK;
+
+			/*
+			 * Technically these settings are only
+			 * required for the C locale.  However, it
+			 * turns out that because of the historical
+			 * version of isprint(), we need them for all
+			 * locales as well.  Note that these are not
+			 * necessarily valid punctation characters in
+			 * the current language, but ispunct() needs
+			 * to return TRUE for them.
+			 */
+			if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
+			    (char)wc))
+				ctn->ctype |= _ISPUNCT;
 		}
 
 		/*
--- a/usr/src/cmd/localedef/localedef.h	Sun Oct 10 04:30:51 2010 +0100
+++ b/usr/src/cmd/localedef/localedef.h	Mon Oct 11 16:24:58 2010 -0700
@@ -134,6 +134,7 @@
 char *to_mb_string(const wchar_t *);
 void set_wide_encoding(const char *);
 const char *get_wide_encoding(void);
+int max_wide(void);
 
 #define	_(x)	gettext(x)
 #define	INTERR	errf(_("internal fault (%s:%d)"), __FILE__, __LINE__)
--- a/usr/src/cmd/localedef/wide.c	Sun Oct 10 04:30:51 2010 +0100
+++ b/usr/src/cmd/localedef/wide.c	Mon Oct 11 16:24:58 2010 -0700
@@ -44,6 +44,7 @@
 static int (*_towide)(wchar_t *, const char *, int) = towide_none;
 static int (*_tomb)(char *, wchar_t) = tomb_none;
 static const char *_encoding = "NONE";
+static int _nbits = 7;
 
 /*
  * Table of supported encodings.  We only bother to list the multibyte
@@ -53,45 +54,71 @@
 	const char *name;
 	/* the name that the underlying libc implemenation uses */
 	const char *cname;
+	/* the maximum number of bits required for priorities */
+	int nbits;
 	int (*towide)(wchar_t *, const char *, int);
 	int (*tomb)(char *, wchar_t);
 } mb_encodings[] = {
-	{ "UTF-8",	"UTF-8",	 towide_utf8,	tomb_utf8 },
-	{ "UTF8",	"UTF-8",	 towide_utf8,	tomb_utf8 },
-	{ "utf8",	"UTF-8",	 towide_utf8,	tomb_utf8 },
-	{ "utf-8",	"UTF-8",	towide_utf8,	tomb_utf8 },
-
-	{ "EUC-CN",	"EUC-CN",	towide_euccn,	tomb_mbs },
-	{ "eucCN",	"EUC-CN",	towide_euccn,	tomb_mbs },
+	/*
+	 * UTF8 values max out at 0x1fffff (although in theory there could
+	 * be later extensions, but it won't happen.)  This means we only need
+	 * 21 bits to be able to encode the entire range of priorities.
+	 */
+	{ "UTF-8",	"UTF-8",	21, towide_utf8, tomb_utf8 },
+	{ "UTF8",	"UTF-8",	21, towide_utf8, tomb_utf8 },
+	{ "utf8",	"UTF-8",	21, towide_utf8, tomb_utf8 },
+	{ "utf-8",	"UTF-8",	21, towide_utf8, tomb_utf8 },
 
-	{ "EUC-JP",	"EUC-JP",	towide_eucjp,	tomb_mbs },
-	{ "eucJP",	"EUC-JP",	towide_eucjp,	tomb_mbs },
-
-	{ "EUC-KR",	"EUC-KR",	towide_euckr,	tomb_mbs },
-	{ "eucKR",	"EUC-KR",	towide_euckr,	tomb_mbs },
-
-	{ "EUC-TW",	"EUC-TW",	towide_euctw,	tomb_mbs },
-	{ "eucTW",	"EUC-TW",	towide_euctw,	tomb_mbs },
+	{ "EUC-CN",	"EUC-CN",	16, towide_euccn, tomb_mbs },
+	{ "eucCN",	"EUC-CN",	16, towide_euccn, tomb_mbs },
+	/*
+	 * Becuase the 3-byte form of EUC-JP use the same leading byte,
+	 * only 17 bits required to provide unique priorities.  (The low
+	 * bit of that first byte is set.)  By setting this value low,
+	 * we can get by with only 3 bytes in the strxfrm expansion.
+	 */
+	{ "EUC-JP",	"EUC-JP",	17, towide_eucjp, tomb_mbs },
+	{ "eucJP",	"EUC-JP",	17, towide_eucjp, tomb_mbs },
 
-	{ "MS_Kanji",	"MSKanji",	towide_mskanji,	tomb_mbs },
-	{ "MSKanji",	"MSKanji",	towide_mskanji,	tomb_mbs },
-	{ "PCK",	"MSKanji",	towide_mskanji,	tomb_mbs },
-	{ "SJIS",	"MSKanji",	towide_mskanji,	tomb_mbs },
-	{ "Shift_JIS",	"MSKanji",	towide_mskanji,	tomb_mbs },
+	{ "EUC-KR",	"EUC-KR",	16, towide_euckr, tomb_mbs },
+	{ "eucKR",	"EUC-KR",	16, towide_euckr, tomb_mbs },
+	/*
+	 * EUC-TW uses 2 bytes most of the time, but 4 bytes if the
+	 * high order byte is 0x8E.  However, with 4 byte encodings,
+	 * the third byte will be A0-B0.  So we only need to consider
+	 * the lower order 24 bits for collation.
+	 */
+	{ "EUC-TW",	"EUC-TW",	24, towide_euctw, tomb_mbs },
+	{ "eucTW",	"EUC-TW",	24, towide_euctw, tomb_mbs },
 
-	{ "BIG5",	"BIG5",		towide_big5,	tomb_mbs },
-	{ "big5",	"BIG5",		towide_big5,	tomb_mbs },
-	{ "Big5",	"BIG5",		towide_big5,	tomb_mbs },
+	{ "MS_Kanji",	"MSKanji",	16, towide_mskanji, tomb_mbs },
+	{ "MSKanji",	"MSKanji",	16, towide_mskanji, tomb_mbs },
+	{ "PCK",	"MSKanji",	16, towide_mskanji, tomb_mbs },
+	{ "SJIS",	"MSKanji",	16, towide_mskanji, tomb_mbs },
+	{ "Shift_JIS",	"MSKanji",	16, towide_mskanji, tomb_mbs },
 
-	{ "GBK",	"GBK",		towide_gbk,	tomb_mbs },
+	{ "BIG5",	"BIG5",		16, towide_big5, tomb_mbs },
+	{ "big5",	"BIG5",		16, towide_big5, tomb_mbs },
+	{ "Big5",	"BIG5",		16, towide_big5, tomb_mbs },
 
-	{ "GB18030",	"GB18030",	towide_gb18030,	tomb_mbs },
+	{ "GBK",	"GBK",		16, towide_gbk,	tomb_mbs },
 
-	{ "GB2312",	"GB2312",	towide_gb2312,	tomb_mbs },
+	/*
+	 * GB18030 can get away with just 31 bits.  This is because the
+	 * high order bit is always set for 4 byte values, and the
+	 * at least one of the other bits in that 4 byte value will
+	 * be non-zero.
+	 */
+	{ "GB18030",	"GB18030",	31, towide_gb18030, tomb_mbs },
 
-	{ "ASCII",	"ASCII",	towide_none,	tomb_none },
-	{ "US-ASCII",	"ASCII",	towide_none,	tomb_none },
-	{ "646",	"ASCII",	towide_none,	tomb_none },
+	/*
+	 * This should probably be an aliase for euc-cn, or vice versa.
+	 */
+	{ "GB2312",	"GB2312",	16, towide_gb2312, tomb_mbs },
+
+	{ "ASCII",	"ASCII",	7, towide_none,	tomb_none },
+	{ "US-ASCII",	"ASCII",	7, towide_none,	tomb_none },
+	{ "646",	"ASCII",	7, towide_none,	tomb_none },
 
 	{ NULL, NULL },
 };
@@ -511,7 +538,7 @@
  *
  * Code set 0 (ASCII):				0x21-0x7E
  * Code set 1 (CNS 11643-1992 Plane 1):		0xA1A1-0xFEFE
- * Code set 2 (CNS 11643-1992 Planes 1-16):	0x8EA1A1A1-0x8EB0FEFE
+ * Code set 2:					unused
  * Code set 3:					unused
  */
 int
@@ -620,12 +647,15 @@
 	_towide = towide_none;
 	_tomb = tomb_none;
 	_encoding = "NONE";
+	_nbits = 8;
 
 	for (i = 0; mb_encodings[i].name; i++) {
 		if (strcasecmp(encoding, mb_encodings[i].name) == 0) {
 			_towide = mb_encodings[i].towide;
 			_tomb = mb_encodings[i].tomb;
 			_encoding = mb_encodings[i].cname;
+			_nbits = mb_encodings[i].nbits;
+			break;
 		}
 	}
 }
@@ -635,3 +665,9 @@
 {
 	return (_encoding);
 }
+
+int
+max_wide(void)
+{
+	return ((int)((1U << _nbits) - 1));
+}
--- a/usr/src/lib/libc/port/locale/collate.c	Sun Oct 10 04:30:51 2010 +0100
+++ b/usr/src/lib/libc/port/locale/collate.c	Mon Oct 11 16:24:58 2010 -0700
@@ -46,16 +46,23 @@
 #include "setlocale.h"
 #include "ldpart.h"
 
-static collate_subst_t			*subst_table[COLL_WEIGHTS_MAX];
-static collate_char_pri_t		*char_pri_table;
-static collate_large_pri_t		*large_pri_table;
-static collate_chain_pri_t		*chain_pri_table;
-static char				*cache = NULL;
-static size_t				cachesz;
-static char				collate_encoding[ENCODING_LEN + 1];
+/*
+ * See the comments in usr/src/cmd/localedef/collate.c for further
+ * information.  It would also be very helpful to have a copy of the
+ * POSIX standard for collation (in the locale format manual page)
+ * handy (www.opengroup.org).
+ */
+
+static collate_subst_t		*subst_table[COLL_WEIGHTS_MAX];
+static collate_char_t		*char_pri_table;
+static collate_large_t		*large_pri_table;
+static collate_chain_t		*chain_pri_table;
+static char			*cache = NULL;
+static size_t			cachesz;
+static char			collate_encoding[ENCODING_LEN + 1];
 
 /* Exposed externally to other parts of libc. */
-collate_info_t				*_collate_info;
+collate_info_t			*_collate_info;
 int _collate_load_error = 1;
 
 
@@ -126,9 +133,9 @@
 		return (_LDP_ERROR);
 	}
 
-	i = (sizeof (collate_char_pri_t) * (UCHAR_MAX + 1)) +
-	    (sizeof (collate_chain_pri_t) * chains) +
-	    (sizeof (collate_large_pri_t) * info->large_pri_count);
+	i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
+	    (sizeof (collate_chain_t) * chains) +
+	    (sizeof (collate_large_t) * info->large_count);
 	for (z = 0; z < (info->directive_count); z++) {
 		i += sizeof (collate_subst_t) * info->subst_count[z];
 	}
@@ -139,7 +146,7 @@
 	}
 
 	char_pri_table = (void *)TMP;
-	TMP += sizeof (collate_char_pri_t) * (UCHAR_MAX + 1);
+	TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
 
 	for (z = 0; z < info->directive_count; z++) {
 		if (info->subst_count[z] > 0) {
@@ -152,10 +159,10 @@
 
 	if (chains > 0) {
 		chain_pri_table = (void *)TMP;
-		TMP += chains * sizeof (collate_chain_pri_t);
+		TMP += chains * sizeof (collate_chain_t);
 	} else
 		chain_pri_table = NULL;
-	if (info->large_pri_count > 0)
+	if (info->large_count > 0)
 		large_pri_table = (void *)TMP;
 	else
 		large_pri_table = NULL;
@@ -173,49 +180,39 @@
 	return (_LDP_LOADED);
 }
 
-/*
- * Note: for performance reasons, we have expanded bsearch here.  This avoids
- * function call overhead with each comparison.
- */
-
 static int32_t *
 substsearch(const wchar_t key, int pass)
 {
-	int low;
-	int high;
-	int next, compar;
 	collate_subst_t *p;
-	collate_subst_t *tab;
 	int n = _collate_info->subst_count[pass];
 
 	if (n == 0)
 		return (NULL);
 
-	tab = subst_table[pass];
-	low = 0;
-	high = n - 1;
-	while (low <= high) {
-		next = (low + high) / 2;
-		p = tab + next;
-		compar = key - p->key;
-		if (compar == 0)
-			return (p->pri);
-		if (compar > 0)
-			low = next + 1;
-		else
-			high = next - 1;
-	}
-	return (NULL);
+	if (pass >= _collate_info->directive_count)
+		return (NULL);
+
+	if (!(key & COLLATE_SUBST_PRIORITY))
+		return (NULL);
+
+	p = subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
+	assert(p->key == key);
+	return (p->pri);
 }
 
-static collate_chain_pri_t *
+/*
+ * Note: for performance reasons, we have expanded bsearch here.  This avoids
+ * function call overhead with each comparison.
+ */
+
+static collate_chain_t *
 chainsearch(const wchar_t *key, int *len)
 {
 	int low;
 	int high;
 	int next, compar, l;
-	collate_chain_pri_t *p;
-	collate_chain_pri_t *tab;
+	collate_chain_t *p;
+	collate_chain_t *tab;
 
 	if (_collate_info->chain_count == 0)
 		return (NULL);
@@ -244,16 +241,16 @@
 	return (NULL);
 }
 
-static collate_large_pri_t *
+static collate_large_t *
 largesearch(const wchar_t key)
 {
 	int low = 0;
-	int high = _collate_info->large_pri_count - 1;
+	int high = _collate_info->large_count - 1;
 	int next, compar;
-	collate_large_pri_t *p;
-	collate_large_pri_t *tab = large_pri_table;
+	collate_large_t *p;
+	collate_large_t *tab = large_pri_table;
 
-	if (_collate_info->large_pri_count == 0)
+	if (_collate_info->large_count == 0)
 		return (NULL);
 
 	while (low <= high) {
@@ -271,15 +268,26 @@
 }
 
 void
-_collate_lookup(const wchar_t *t, int *len, int *pri, int which,
-    int **state)
+_collate_lookup(const wchar_t *t, int *len, int *pri, int which, int **state)
 {
-	collate_chain_pri_t *p2;
-	collate_large_pri_t *match;
+	collate_chain_t *p2;
+	collate_large_t *match;
+	collate_info_t *info = _collate_info;
 	int p, l;
 	int *sptr;
 
 	/*
+	 * If this is the "last" pass for the UNDEFINED, then
+	 * we just return the priority itself.
+	 */
+	if (which >= info->directive_count) {
+		*pri = *t;
+		*len = 1;
+		*state = NULL;
+		return;
+	}
+
+	/*
 	 * If we have remaining substitution data from a previous
 	 * call, consume it first.
 	 */
@@ -312,7 +320,7 @@
 		 */
 		*pri = char_pri_table[*t].pri[which];
 
-	} else if ((_collate_info->large_pri_count > 0) &&
+	} else if ((info->large_count > 0) &&
 	    ((match = largesearch(*t)) != NULL)) {
 
 		/*
@@ -324,11 +332,11 @@
 		/*
 		 * Character lacks a specific definition.
 		 */
-		if (_collate_info->directive[which] & DIRECTIVE_UNDEFINED) {
+		if (info->directive[which] & DIRECTIVE_UNDEFINED) {
 			/* Mask off sign bit to prevent ordering confusion. */
 			*pri = (*t & COLLATE_MAX_PRIORITY);
 		} else {
-			*pri = _collate_info->undef_pri[which];
+			*pri = info->undef_pri[which];
 		}
 		/* No substitutions for undefined characters! */
 		return;
@@ -346,8 +354,7 @@
 	 * to be substituted be unique for that level.  The localedef
 	 * code ensures this for us.
 	 */
-	if ((*pri & COLLATE_SUBST_PRIORITY) &&
-	    (sptr = substsearch(*pri, which)) != NULL) {
+	if ((sptr = substsearch(*pri, which)) != NULL) {
 		if ((*pri = *sptr) != 0) {
 			sptr++;
 			*state = *sptr ? sptr : NULL;
@@ -489,14 +496,21 @@
 #define	XFRM_MASK	((1 << XFRM_SHIFT) - 1)
 #define	XFRM_SEP	('.')	/* chosen to be less than XFRM_OFFSET */
 
-static void
-xfrm(unsigned char *p, int pri)
+static int
+xfrm(unsigned char *p, int pri, int pass)
 {
-	int i;
-	for (i = 0; i < XFRM_BYTES; i++) {
-		p[i] = (pri & XFRM_MASK) + XFRM_OFFSET;
+	/* we use unsigned to ensure zero fill on right shift */
+	uint32_t val = (uint32_t)_collate_info->pri_count[pass];
+	int nc = 0;
+
+	while (val) {
+		*p = (pri & XFRM_MASK) + XFRM_OFFSET;
 		pri >>= XFRM_SHIFT;
+		val >>= XFRM_SHIFT;
+		p++;
+		nc++;
 	}
+	return (nc);
 }
 
 size_t
@@ -569,10 +583,9 @@
 					pri = COLLATE_MAX_PRIORITY;
 				}
 
+				b = xfrm(buf, pri, pass);
+				want += b;
 				if (room) {
-					b = XFRM_BYTES;
-					xfrm(buf, pri);
-
 					while (b) {
 						b--;
 						if (room)
@@ -580,7 +593,6 @@
 						room--;
 					}
 				}
-				want += XFRM_BYTES;
 				need = want;
 			}
 		} else {
@@ -594,9 +606,10 @@
 					}
 					continue;
 				}
+
+				b = xfrm(buf, pri, pass);
+				want += b;
 				if (room) {
-					b = XFRM_BYTES;
-					xfrm(buf, pri);
 
 					while (b) {
 						b--;
@@ -605,7 +618,6 @@
 						room--;
 					}
 				}
-				want += XFRM_BYTES;
 				need = want;
 			}
 		}
--- a/usr/src/lib/libc/port/locale/collate.h	Sun Oct 10 04:30:51 2010 +0100
+++ b/usr/src/lib/libc/port/locale/collate.h	Mon Oct 11 16:24:58 2010 -0700
@@ -33,7 +33,7 @@
 #include <limits.h>
 
 #define	COLLATE_STR_LEN		24		/* should be 64-bit multiple */
-#define	COLLATE_VERSION		"I1.0\n"
+#define	COLLATE_VERSION		"IllumosCollate2\n"
 
 #define	COLLATE_MAX_PRIORITY	(0x7fffffff)	/* max signed value */
 #define	COLLATE_SUBST_PRIORITY	(0x40000000)	/* bit indicates subst table */
@@ -65,26 +65,27 @@
 typedef struct collate_info {
 	uint8_t directive_count;
 	uint8_t directive[COLL_WEIGHTS_MAX];
+	int32_t pri_count[COLL_WEIGHTS_MAX];
 	int32_t flags;
 	int32_t chain_count;
-	int32_t large_pri_count;
+	int32_t large_count;
 	int32_t subst_count[COLL_WEIGHTS_MAX];
 	int32_t undef_pri[COLL_WEIGHTS_MAX];
 } collate_info_t;
 
-typedef struct collate_char_pri {
+typedef struct collate_char {
 	int32_t pri[COLL_WEIGHTS_MAX];
-} collate_char_pri_t;
+} collate_char_t;
 
-typedef struct collate_chain_pri {
+typedef struct collate_chain {
 	wchar_t str[COLLATE_STR_LEN];
 	int32_t pri[COLL_WEIGHTS_MAX];
-} collate_chain_pri_t;
+} collate_chain_t;
 
-typedef struct collate_large_pri {
+typedef struct collate_large {
 	int32_t val;
-	collate_char_pri_t pri;
-} collate_large_pri_t;
+	collate_char_t pri;
+} collate_large_t;
 
 typedef struct collate_subst {
 	int32_t key;