# HG changeset patch # User Garrett D'Amore # Date 1286839498 25200 # Node ID e072bd4baed8355cb972497af1f99fb6a595b170 # Parent 969b97e84bdc8e11ec7d770f4de15793889ee4fe 326 strxfrm could be faster 330 isprint() returns false for "legacy" code, results in bad prompt Reviewed by: gwr@nexenta.com Reviewed by: cjlove@san.rr.com Reviewed by: richlowe@richlowe.net Approved by: richlowe@richlowe.net diff -r 969b97e84bdc -r e072bd4baed8 usr/src/cmd/localedef/collate.c --- a/usr/src/cmd/localedef/collate.c Sun Oct 10 04:30:51 2010 +0100 +++ b/usr/src/cmd/localedef/collate.c Mon Oct 11 16:24:58 2010 -0700 @@ -33,6 +33,67 @@ #include "collate.h" /* + * Design notes. + * + * It will be extremely helpful to the reader if they have access to + * the localedef and locale file format specifications available. + * Latest versions of these are available from www.opengroup.org. + * + * The design for the collation code is a bit complex. The goal is a + * single collation database as described in collate.h (in + * libc/port/locale). However, there are some other tidbits: + * + * a) The substitution entries are now a directly indexable array. A + * priority elsewhere in the table is taken as an index into the + * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY) + * set. (The bit is cleared and the result is the index into the + * table. + * + * b) We eliminate duplicate entries into the substitution table. + * This saves a lot of space. + * + * c) The priorities for each level are "compressed", so that each + * sorting level has consecutively numbered priorities starting at 1. + * (O is reserved for the ignore priority.) This means sort levels + * which only have a few distinct priorities can represent the + * priority level in fewer bits, which makes the strxfrm output + * smaller. + * + * d) We record the total number of priorities so that strxfrm can + * figure out how many bytes to expand a numeric priority into. + * + * e) For the UNDEFINED pass (the last pass), we record the maximum + * number of bits needed to uniquely prioritize these entries, so that + * the last pass can also use smaller strxfrm output when possible. + * + * f) Priorities with the sign bit set are verboten. This works out + * because no active character set needs that bit to carry significant + * information once the character is in wide form. + * + * To process the entire data to make the database, we actually run + * multiple passes over the data. + * + * The first pass, which is done at parse time, identifies elements, + * substitutions, and such, and records them in priority order. As + * some priorities can refer to other priorities, using forward + * references, we use a table of references indicating whether the + * priority's value has been resolved, or whether it is still a + * reference. + * + * The second pass walks over all the items in priority order, noting + * that they are used directly, and not just an indirect reference. + * This is done by creating a "weight" structure for the item. The + * weights are stashed in an AVL tree sorted by relative "priority". + * + * The third pass walks over all the weight structures, in priority + * order, and assigns a new monotonically increasing (per sort level) + * weight value to them. These are the values that will actually be + * written to the file. + * + * The fourth pass just writes the data out. + */ + +/* * In order to resolve the priorities, we create a table of priorities. * Entries in the table can be in one of three states. * @@ -48,8 +109,6 @@ * this is used for forward references. A collating-symbol can never * have this value. * - * The "self" field is a reference to or own index in the pri list. - * * The "pass" field is used during final resolution to aid in detection * of referencing loops. (For example depends on , but has its * priority dependent on .) @@ -60,15 +119,19 @@ REFER /* priority is a reference (index) */ } res_t; +typedef struct weight { + int32_t pri; + int opt; + avl_node_t avl; +} weight_t; + typedef struct priority { res_t res; int32_t pri; - int32_t self; int pass; int lineno; } collpri_t; - #define NUM_WT collinfo.directive_count /* @@ -110,7 +173,7 @@ */ typedef struct collchar { wchar_t wc; - wchar_t ref[COLL_WEIGHTS_MAX]; + int32_t ref[COLL_WEIGHTS_MAX]; avl_node_t avl; } collchar_t; @@ -124,6 +187,7 @@ int32_t key; int32_t ref[COLLATE_STR_LEN]; avl_node_t avl; + avl_node_t avl_ref; } subst_t; static avl_tree_t collsyms; @@ -132,6 +196,9 @@ static avl_tree_t elem_by_expand; static avl_tree_t collchars; static avl_tree_t substs[COLL_WEIGHTS_MAX]; +static avl_tree_t substs_ref[COLL_WEIGHTS_MAX]; +static avl_tree_t weights[COLL_WEIGHTS_MAX]; +static int32_t nweight[COLL_WEIGHTS_MAX]; /* * This is state tracking for the ellipsis token. Note that we start @@ -152,6 +219,7 @@ * We keep a running tally of weights. */ static int nextpri = 1; +static int nextsubst[COLL_WEIGHTS_MAX] = { 0 }; /* * This array collects up the weights for each level. @@ -191,7 +259,6 @@ prilist[i].res = UNKNOWN; prilist[i].pri = 0; prilist[i].pass = 0; - prilist[i].self = i; } } return (numpri++); @@ -267,6 +334,15 @@ } static int +weight_compare(const void *n1, const void *n2) +{ + int32_t k1 = ((const weight_t *)n1)->pri; + int32_t k2 = ((const weight_t *)n2)->pri; + + return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0); +} + +static int collsym_compare(const void *n1, const void *n2) { const collsym_t *c1 = n1; @@ -328,6 +404,17 @@ return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0); } +static int +subst_compare_ref(const void *n1, const void *n2) +{ + int32_t *c1 = ((subst_t *)n1)->ref; + int32_t *c2 = ((subst_t *)n2)->ref; + int rv; + + rv = wcscmp((wchar_t *)c1, (wchar_t *)c2); + return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); +} + void init_collate(void) { @@ -347,9 +434,15 @@ avl_create(&collchars, collchar_compare, sizeof (collchar_t), offsetof(collchar_t, avl)); - for (i = 0; i < COLL_WEIGHTS_MAX; i++) + for (i = 0; i < COLL_WEIGHTS_MAX; i++) { avl_create(&substs[i], subst_compare, sizeof (subst_t), offsetof(subst_t, avl)); + avl_create(&substs_ref[i], subst_compare_ref, + sizeof (subst_t), offsetof(subst_t, avl_ref)); + avl_create(&weights[i], weight_compare, sizeof (weight_t), + offsetof(weight_t, avl)); + nweight[i] = 1; + } (void) memset(&collinfo, 0, sizeof (collinfo)); @@ -796,37 +889,52 @@ void add_order_subst(void) { + subst_t srch; subst_t *s; avl_index_t where; int i; - if ((s = calloc(sizeof (*s), 1)) == NULL) { - errf(_("out of memory")); - return; + (void) memset(&srch, 0, sizeof (srch)); + for (i = 0; i < curr_subst; i++) { + srch.ref[i] = subst_weights[i]; + subst_weights[i] = 0; } - s->key = new_pri(); + s = avl_find(&substs_ref[curr_weight], &srch, &where); + + if (s == NULL) { + if ((s = calloc(sizeof (*s), 1)) == NULL) { + errf(_("out of memory")); + return; + } + s->key = new_pri(); - /* - * We use a self reference for our key, but we set a high bit - * to indicate that this is a substitution reference. This - * will expedite table lookups later, and prevent table - * lookups for situations that don't require it. (In short, - * its a big win, because we can skip a lot of binary - * searching.) - */ - set_pri(s->key, (nextpri | COLLATE_SUBST_PRIORITY), RESOLVED); + /* + * We use a self reference for our key, but we set a + * high bit to indicate that this is a substitution + * reference. This will expedite table lookups later, + * and prevent table lookups for situations that don't + * require it. (In short, its a big win, because we + * can skip a lot of binary searching.) + */ + set_pri(s->key, + (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY), + RESOLVED); + nextsubst[curr_weight] += 1; - for (i = 0; i < curr_subst; i++) { - s->ref[i] = subst_weights[i]; - subst_weights[i] = 0; + for (i = 0; i < curr_subst; i++) { + s->ref[i] = srch.ref[i]; + } + + avl_insert(&substs_ref[curr_weight], s, where); + + if (avl_find(&substs[curr_weight], s, &where) != NULL) { + INTERR; + return; + } + avl_insert(&substs[curr_weight], s, where); } curr_subst = 0; - if (avl_find(&substs[curr_weight], s, &where) != NULL) { - INTERR; - return; - } - avl_insert(&substs[curr_weight], s, where); /* * We are using the current (unique) priority as a search key @@ -884,6 +992,65 @@ } void +add_weight(int32_t ref, int pass) +{ + weight_t srch; + weight_t *w; + avl_index_t where; + + srch.pri = resolve_pri(ref); + + /* No translation of ignores */ + if (srch.pri == 0) + return; + + /* Substitution priorities are not weights */ + if (srch.pri & COLLATE_SUBST_PRIORITY) + return; + + if (avl_find(&weights[pass], &srch, &where) != NULL) + return; + + if ((w = calloc(sizeof (*w), 1)) == NULL) { + errf(_("out of memory")); + return; + } + w->pri = srch.pri; + avl_insert(&weights[pass], w, where); +} + +void +add_weights(int32_t *refs) +{ + int i; + for (i = 0; i < NUM_WT; i++) { + add_weight(refs[i], i); + } +} + +int32_t +get_weight(int32_t ref, int pass) +{ + weight_t srch; + weight_t *w; + int32_t pri; + + pri = resolve_pri(ref); + if (pri & COLLATE_SUBST_PRIORITY) { + return (pri); + } + if (pri <= 0) { + return (pri); + } + srch.pri = pri; + if ((w = avl_find(&weights[pass], &srch, NULL)) == NULL) { + INTERR; + return (-1); + } + return (w->opt); +} + +void dump_collate(void) { FILE *f; @@ -894,18 +1061,56 @@ collchar_t *cc; subst_t *sb; char vers[COLLATE_STR_LEN]; - collate_char_pri_t char_pri[UCHAR_MAX + 1]; - collate_large_pri_t *large; + collate_char_t chars[UCHAR_MAX + 1]; + collate_large_t *large; collate_subst_t *subst[COLL_WEIGHTS_MAX]; - collate_chain_pri_t *chain; + collate_chain_t *chain; - (void) memset(&char_pri, 0, sizeof (char_pri)); + /* + * We have to run throught a preliminary pass to identify all the + * weights that we use for each sorting level. + */ + for (i = 0; i < NUM_WT; i++) { + add_weight(pri_ignore, i); + } + for (i = 0; i < NUM_WT; i++) { + for (sb = avl_first(&substs[i]); sb; + sb = AVL_NEXT(&substs[i], sb)) { + for (j = 0; sb->ref[j]; j++) { + add_weight(sb->ref[j], i); + } + } + } + for (ce = avl_first(&elem_by_expand); + ce != NULL; + ce = AVL_NEXT(&elem_by_expand, ce)) { + add_weights(ce->ref); + } + for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) { + add_weights(cc->ref); + } + + /* + * Now we walk the entire set of weights, removing the gaps + * in the weights. This gives us optimum usage. The walk + * occurs in priority. + */ + for (i = 0; i < NUM_WT; i++) { + weight_t *w; + for (w = avl_first(&weights[i]); w; + w = AVL_NEXT(&weights[i], w)) { + w->opt = nweight[i]; + nweight[i] += 1; + } + } + + (void) memset(&chars, 0, sizeof (chars)); (void) memset(vers, 0, COLLATE_STR_LEN); - (void) snprintf(vers, sizeof (vers), COLLATE_VERSION); + (void) strlcpy(vers, COLLATE_VERSION, sizeof (vers)); /* * We need to make sure we arrange for the UNDEFINED field - * to show up. + * to show up. Also, set the total weight counts. */ for (i = 0; i < NUM_WT; i++) { if (resolve_pri(pri_undefined[i]) == -1) { @@ -913,8 +1118,10 @@ /* they collate at the end of everything else */ collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY; } + collinfo.pri_count[i] = nweight[i]; } + collinfo.pri_count[NUM_WT] = max_wide(); collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY; collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED; @@ -924,26 +1131,26 @@ for (i = 0; i <= UCHAR_MAX; i++) { if ((cc = get_collchar(i, 0)) != NULL) { for (j = 0; j < NUM_WT; j++) { - char_pri[i].pri[j] = resolve_pri(cc->ref[j]); + chars[i].pri[j] = get_weight(cc->ref[j], j); } } else { for (j = 0; j < NUM_WT; j++) { - char_pri[i].pri[j] = - resolve_pri(pri_undefined[j]); + chars[i].pri[j] = + get_weight(pri_undefined[j], j); } /* * Per POSIX, for undefined characters, we * also have to add a last item, which is the * character code. */ - char_pri[i].pri[NUM_WT] = i; + chars[i].pri[NUM_WT] = i; } } /* * Substitution tables */ - for (i = 0; i < collinfo.directive_count; i++) { + for (i = 0; i < NUM_WT; i++) { collate_subst_t *st = NULL; n = collinfo.subst_count[i] = avl_numnodes(&substs[i]); if ((st = calloc(sizeof (collate_subst_t) * n, 1)) == NULL) { @@ -957,8 +1164,11 @@ /* by definition these resolve! */ INTERR; } + if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) { + INTERR; + } for (j = 0; sb->ref[j]; j++) { - st[n].pri[j] = resolve_pri(sb->ref[j]); + st[n].pri[j] = get_weight(sb->ref[j], i); } n++; } @@ -967,11 +1177,12 @@ subst[i] = st; } + /* * Chains, i.e. collating elements */ collinfo.chain_count = avl_numnodes(&elem_by_expand); - chain = calloc(sizeof (collate_chain_pri_t), collinfo.chain_count); + chain = calloc(sizeof (collate_chain_t), collinfo.chain_count); if (chain == NULL) { errf(_("out of memory")); return; @@ -981,7 +1192,7 @@ ce = AVL_NEXT(&elem_by_expand, ce), n++) { (void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN); for (i = 0; i < NUM_WT; i++) { - chain[n].pri[i] = resolve_pri(ce->ref[i]); + chain[n].pri[i] = get_weight(ce->ref[i], i); } } if (n != collinfo.chain_count) @@ -990,8 +1201,7 @@ /* * Large (> UCHAR_MAX) character priorities */ - large = calloc(sizeof (collate_large_pri_t) * avl_numnodes(&collchars), - 1); + large = calloc(sizeof (collate_large_t) * avl_numnodes(&collchars), 1); if (large == NULL) { errf(_("out of memory")); return; @@ -1004,7 +1214,7 @@ if (cc->wc <= UCHAR_MAX) continue; for (j = 0; j < NUM_WT; j++) { - if ((pri = resolve_pri(cc->ref[j])) < 0) { + if ((pri = get_weight(cc->ref[j], j)) < 0) { undef = 1; } if (undef && (pri >= 0)) { @@ -1016,7 +1226,7 @@ } if (!undef) { large[i].val = cc->wc; - collinfo.large_pri_count = i++; + collinfo.large_count = i++; } } @@ -1028,7 +1238,7 @@ if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) || (wr_category(&collinfo, sizeof (collinfo), f) < 0) || - (wr_category(&char_pri, sizeof (char_pri), f) < 0)) { + (wr_category(&chars, sizeof (chars), f) < 0)) { return; } @@ -1038,11 +1248,11 @@ return; } } - sz = sizeof (collate_chain_pri_t) * collinfo.chain_count; + sz = sizeof (collate_chain_t) * collinfo.chain_count; if (wr_category(chain, sz, f) < 0) { return; } - sz = sizeof (collate_large_pri_t) * collinfo.large_pri_count; + sz = sizeof (collate_large_t) * collinfo.large_count; if (wr_category(large, sz, f) < 0) { return; } diff -r 969b97e84bdc -r e072bd4baed8 usr/src/cmd/localedef/ctype.c --- a/usr/src/cmd/localedef/ctype.c Sun Oct 10 04:30:51 2010 +0100 +++ b/usr/src/cmd/localedef/ctype.c Mon Oct 11 16:24:58 2010 -0700 @@ -244,6 +244,20 @@ ctn->ctype |= _ISXDIGIT; if (strchr(" \t", (char)wc)) ctn->ctype |= _ISBLANK; + + /* + * Technically these settings are only + * required for the C locale. However, it + * turns out that because of the historical + * version of isprint(), we need them for all + * locales as well. Note that these are not + * necessarily valid punctation characters in + * the current language, but ispunct() needs + * to return TRUE for them. + */ + if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~", + (char)wc)) + ctn->ctype |= _ISPUNCT; } /* diff -r 969b97e84bdc -r e072bd4baed8 usr/src/cmd/localedef/localedef.h --- a/usr/src/cmd/localedef/localedef.h Sun Oct 10 04:30:51 2010 +0100 +++ b/usr/src/cmd/localedef/localedef.h Mon Oct 11 16:24:58 2010 -0700 @@ -134,6 +134,7 @@ char *to_mb_string(const wchar_t *); void set_wide_encoding(const char *); const char *get_wide_encoding(void); +int max_wide(void); #define _(x) gettext(x) #define INTERR errf(_("internal fault (%s:%d)"), __FILE__, __LINE__) diff -r 969b97e84bdc -r e072bd4baed8 usr/src/cmd/localedef/wide.c --- a/usr/src/cmd/localedef/wide.c Sun Oct 10 04:30:51 2010 +0100 +++ b/usr/src/cmd/localedef/wide.c Mon Oct 11 16:24:58 2010 -0700 @@ -44,6 +44,7 @@ static int (*_towide)(wchar_t *, const char *, int) = towide_none; static int (*_tomb)(char *, wchar_t) = tomb_none; static const char *_encoding = "NONE"; +static int _nbits = 7; /* * Table of supported encodings. We only bother to list the multibyte @@ -53,45 +54,71 @@ const char *name; /* the name that the underlying libc implemenation uses */ const char *cname; + /* the maximum number of bits required for priorities */ + int nbits; int (*towide)(wchar_t *, const char *, int); int (*tomb)(char *, wchar_t); } mb_encodings[] = { - { "UTF-8", "UTF-8", towide_utf8, tomb_utf8 }, - { "UTF8", "UTF-8", towide_utf8, tomb_utf8 }, - { "utf8", "UTF-8", towide_utf8, tomb_utf8 }, - { "utf-8", "UTF-8", towide_utf8, tomb_utf8 }, - - { "EUC-CN", "EUC-CN", towide_euccn, tomb_mbs }, - { "eucCN", "EUC-CN", towide_euccn, tomb_mbs }, + /* + * UTF8 values max out at 0x1fffff (although in theory there could + * be later extensions, but it won't happen.) This means we only need + * 21 bits to be able to encode the entire range of priorities. + */ + { "UTF-8", "UTF-8", 21, towide_utf8, tomb_utf8 }, + { "UTF8", "UTF-8", 21, towide_utf8, tomb_utf8 }, + { "utf8", "UTF-8", 21, towide_utf8, tomb_utf8 }, + { "utf-8", "UTF-8", 21, towide_utf8, tomb_utf8 }, - { "EUC-JP", "EUC-JP", towide_eucjp, tomb_mbs }, - { "eucJP", "EUC-JP", towide_eucjp, tomb_mbs }, - - { "EUC-KR", "EUC-KR", towide_euckr, tomb_mbs }, - { "eucKR", "EUC-KR", towide_euckr, tomb_mbs }, - - { "EUC-TW", "EUC-TW", towide_euctw, tomb_mbs }, - { "eucTW", "EUC-TW", towide_euctw, tomb_mbs }, + { "EUC-CN", "EUC-CN", 16, towide_euccn, tomb_mbs }, + { "eucCN", "EUC-CN", 16, towide_euccn, tomb_mbs }, + /* + * Becuase the 3-byte form of EUC-JP use the same leading byte, + * only 17 bits required to provide unique priorities. (The low + * bit of that first byte is set.) By setting this value low, + * we can get by with only 3 bytes in the strxfrm expansion. + */ + { "EUC-JP", "EUC-JP", 17, towide_eucjp, tomb_mbs }, + { "eucJP", "EUC-JP", 17, towide_eucjp, tomb_mbs }, - { "MS_Kanji", "MSKanji", towide_mskanji, tomb_mbs }, - { "MSKanji", "MSKanji", towide_mskanji, tomb_mbs }, - { "PCK", "MSKanji", towide_mskanji, tomb_mbs }, - { "SJIS", "MSKanji", towide_mskanji, tomb_mbs }, - { "Shift_JIS", "MSKanji", towide_mskanji, tomb_mbs }, + { "EUC-KR", "EUC-KR", 16, towide_euckr, tomb_mbs }, + { "eucKR", "EUC-KR", 16, towide_euckr, tomb_mbs }, + /* + * EUC-TW uses 2 bytes most of the time, but 4 bytes if the + * high order byte is 0x8E. However, with 4 byte encodings, + * the third byte will be A0-B0. So we only need to consider + * the lower order 24 bits for collation. + */ + { "EUC-TW", "EUC-TW", 24, towide_euctw, tomb_mbs }, + { "eucTW", "EUC-TW", 24, towide_euctw, tomb_mbs }, - { "BIG5", "BIG5", towide_big5, tomb_mbs }, - { "big5", "BIG5", towide_big5, tomb_mbs }, - { "Big5", "BIG5", towide_big5, tomb_mbs }, + { "MS_Kanji", "MSKanji", 16, towide_mskanji, tomb_mbs }, + { "MSKanji", "MSKanji", 16, towide_mskanji, tomb_mbs }, + { "PCK", "MSKanji", 16, towide_mskanji, tomb_mbs }, + { "SJIS", "MSKanji", 16, towide_mskanji, tomb_mbs }, + { "Shift_JIS", "MSKanji", 16, towide_mskanji, tomb_mbs }, - { "GBK", "GBK", towide_gbk, tomb_mbs }, + { "BIG5", "BIG5", 16, towide_big5, tomb_mbs }, + { "big5", "BIG5", 16, towide_big5, tomb_mbs }, + { "Big5", "BIG5", 16, towide_big5, tomb_mbs }, - { "GB18030", "GB18030", towide_gb18030, tomb_mbs }, + { "GBK", "GBK", 16, towide_gbk, tomb_mbs }, - { "GB2312", "GB2312", towide_gb2312, tomb_mbs }, + /* + * GB18030 can get away with just 31 bits. This is because the + * high order bit is always set for 4 byte values, and the + * at least one of the other bits in that 4 byte value will + * be non-zero. + */ + { "GB18030", "GB18030", 31, towide_gb18030, tomb_mbs }, - { "ASCII", "ASCII", towide_none, tomb_none }, - { "US-ASCII", "ASCII", towide_none, tomb_none }, - { "646", "ASCII", towide_none, tomb_none }, + /* + * This should probably be an aliase for euc-cn, or vice versa. + */ + { "GB2312", "GB2312", 16, towide_gb2312, tomb_mbs }, + + { "ASCII", "ASCII", 7, towide_none, tomb_none }, + { "US-ASCII", "ASCII", 7, towide_none, tomb_none }, + { "646", "ASCII", 7, towide_none, tomb_none }, { NULL, NULL }, }; @@ -511,7 +538,7 @@ * * Code set 0 (ASCII): 0x21-0x7E * Code set 1 (CNS 11643-1992 Plane 1): 0xA1A1-0xFEFE - * Code set 2 (CNS 11643-1992 Planes 1-16): 0x8EA1A1A1-0x8EB0FEFE + * Code set 2: unused * Code set 3: unused */ int @@ -620,12 +647,15 @@ _towide = towide_none; _tomb = tomb_none; _encoding = "NONE"; + _nbits = 8; for (i = 0; mb_encodings[i].name; i++) { if (strcasecmp(encoding, mb_encodings[i].name) == 0) { _towide = mb_encodings[i].towide; _tomb = mb_encodings[i].tomb; _encoding = mb_encodings[i].cname; + _nbits = mb_encodings[i].nbits; + break; } } } @@ -635,3 +665,9 @@ { return (_encoding); } + +int +max_wide(void) +{ + return ((int)((1U << _nbits) - 1)); +} diff -r 969b97e84bdc -r e072bd4baed8 usr/src/lib/libc/port/locale/collate.c --- a/usr/src/lib/libc/port/locale/collate.c Sun Oct 10 04:30:51 2010 +0100 +++ b/usr/src/lib/libc/port/locale/collate.c Mon Oct 11 16:24:58 2010 -0700 @@ -46,16 +46,23 @@ #include "setlocale.h" #include "ldpart.h" -static collate_subst_t *subst_table[COLL_WEIGHTS_MAX]; -static collate_char_pri_t *char_pri_table; -static collate_large_pri_t *large_pri_table; -static collate_chain_pri_t *chain_pri_table; -static char *cache = NULL; -static size_t cachesz; -static char collate_encoding[ENCODING_LEN + 1]; +/* + * See the comments in usr/src/cmd/localedef/collate.c for further + * information. It would also be very helpful to have a copy of the + * POSIX standard for collation (in the locale format manual page) + * handy (www.opengroup.org). + */ + +static collate_subst_t *subst_table[COLL_WEIGHTS_MAX]; +static collate_char_t *char_pri_table; +static collate_large_t *large_pri_table; +static collate_chain_t *chain_pri_table; +static char *cache = NULL; +static size_t cachesz; +static char collate_encoding[ENCODING_LEN + 1]; /* Exposed externally to other parts of libc. */ -collate_info_t *_collate_info; +collate_info_t *_collate_info; int _collate_load_error = 1; @@ -126,9 +133,9 @@ return (_LDP_ERROR); } - i = (sizeof (collate_char_pri_t) * (UCHAR_MAX + 1)) + - (sizeof (collate_chain_pri_t) * chains) + - (sizeof (collate_large_pri_t) * info->large_pri_count); + i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) + + (sizeof (collate_chain_t) * chains) + + (sizeof (collate_large_t) * info->large_count); for (z = 0; z < (info->directive_count); z++) { i += sizeof (collate_subst_t) * info->subst_count[z]; } @@ -139,7 +146,7 @@ } char_pri_table = (void *)TMP; - TMP += sizeof (collate_char_pri_t) * (UCHAR_MAX + 1); + TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1); for (z = 0; z < info->directive_count; z++) { if (info->subst_count[z] > 0) { @@ -152,10 +159,10 @@ if (chains > 0) { chain_pri_table = (void *)TMP; - TMP += chains * sizeof (collate_chain_pri_t); + TMP += chains * sizeof (collate_chain_t); } else chain_pri_table = NULL; - if (info->large_pri_count > 0) + if (info->large_count > 0) large_pri_table = (void *)TMP; else large_pri_table = NULL; @@ -173,49 +180,39 @@ return (_LDP_LOADED); } -/* - * Note: for performance reasons, we have expanded bsearch here. This avoids - * function call overhead with each comparison. - */ - static int32_t * substsearch(const wchar_t key, int pass) { - int low; - int high; - int next, compar; collate_subst_t *p; - collate_subst_t *tab; int n = _collate_info->subst_count[pass]; if (n == 0) return (NULL); - tab = subst_table[pass]; - low = 0; - high = n - 1; - while (low <= high) { - next = (low + high) / 2; - p = tab + next; - compar = key - p->key; - if (compar == 0) - return (p->pri); - if (compar > 0) - low = next + 1; - else - high = next - 1; - } - return (NULL); + if (pass >= _collate_info->directive_count) + return (NULL); + + if (!(key & COLLATE_SUBST_PRIORITY)) + return (NULL); + + p = subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY); + assert(p->key == key); + return (p->pri); } -static collate_chain_pri_t * +/* + * Note: for performance reasons, we have expanded bsearch here. This avoids + * function call overhead with each comparison. + */ + +static collate_chain_t * chainsearch(const wchar_t *key, int *len) { int low; int high; int next, compar, l; - collate_chain_pri_t *p; - collate_chain_pri_t *tab; + collate_chain_t *p; + collate_chain_t *tab; if (_collate_info->chain_count == 0) return (NULL); @@ -244,16 +241,16 @@ return (NULL); } -static collate_large_pri_t * +static collate_large_t * largesearch(const wchar_t key) { int low = 0; - int high = _collate_info->large_pri_count - 1; + int high = _collate_info->large_count - 1; int next, compar; - collate_large_pri_t *p; - collate_large_pri_t *tab = large_pri_table; + collate_large_t *p; + collate_large_t *tab = large_pri_table; - if (_collate_info->large_pri_count == 0) + if (_collate_info->large_count == 0) return (NULL); while (low <= high) { @@ -271,15 +268,26 @@ } void -_collate_lookup(const wchar_t *t, int *len, int *pri, int which, - int **state) +_collate_lookup(const wchar_t *t, int *len, int *pri, int which, int **state) { - collate_chain_pri_t *p2; - collate_large_pri_t *match; + collate_chain_t *p2; + collate_large_t *match; + collate_info_t *info = _collate_info; int p, l; int *sptr; /* + * If this is the "last" pass for the UNDEFINED, then + * we just return the priority itself. + */ + if (which >= info->directive_count) { + *pri = *t; + *len = 1; + *state = NULL; + return; + } + + /* * If we have remaining substitution data from a previous * call, consume it first. */ @@ -312,7 +320,7 @@ */ *pri = char_pri_table[*t].pri[which]; - } else if ((_collate_info->large_pri_count > 0) && + } else if ((info->large_count > 0) && ((match = largesearch(*t)) != NULL)) { /* @@ -324,11 +332,11 @@ /* * Character lacks a specific definition. */ - if (_collate_info->directive[which] & DIRECTIVE_UNDEFINED) { + if (info->directive[which] & DIRECTIVE_UNDEFINED) { /* Mask off sign bit to prevent ordering confusion. */ *pri = (*t & COLLATE_MAX_PRIORITY); } else { - *pri = _collate_info->undef_pri[which]; + *pri = info->undef_pri[which]; } /* No substitutions for undefined characters! */ return; @@ -346,8 +354,7 @@ * to be substituted be unique for that level. The localedef * code ensures this for us. */ - if ((*pri & COLLATE_SUBST_PRIORITY) && - (sptr = substsearch(*pri, which)) != NULL) { + if ((sptr = substsearch(*pri, which)) != NULL) { if ((*pri = *sptr) != 0) { sptr++; *state = *sptr ? sptr : NULL; @@ -489,14 +496,21 @@ #define XFRM_MASK ((1 << XFRM_SHIFT) - 1) #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */ -static void -xfrm(unsigned char *p, int pri) +static int +xfrm(unsigned char *p, int pri, int pass) { - int i; - for (i = 0; i < XFRM_BYTES; i++) { - p[i] = (pri & XFRM_MASK) + XFRM_OFFSET; + /* we use unsigned to ensure zero fill on right shift */ + uint32_t val = (uint32_t)_collate_info->pri_count[pass]; + int nc = 0; + + while (val) { + *p = (pri & XFRM_MASK) + XFRM_OFFSET; pri >>= XFRM_SHIFT; + val >>= XFRM_SHIFT; + p++; + nc++; } + return (nc); } size_t @@ -569,10 +583,9 @@ pri = COLLATE_MAX_PRIORITY; } + b = xfrm(buf, pri, pass); + want += b; if (room) { - b = XFRM_BYTES; - xfrm(buf, pri); - while (b) { b--; if (room) @@ -580,7 +593,6 @@ room--; } } - want += XFRM_BYTES; need = want; } } else { @@ -594,9 +606,10 @@ } continue; } + + b = xfrm(buf, pri, pass); + want += b; if (room) { - b = XFRM_BYTES; - xfrm(buf, pri); while (b) { b--; @@ -605,7 +618,6 @@ room--; } } - want += XFRM_BYTES; need = want; } } diff -r 969b97e84bdc -r e072bd4baed8 usr/src/lib/libc/port/locale/collate.h --- a/usr/src/lib/libc/port/locale/collate.h Sun Oct 10 04:30:51 2010 +0100 +++ b/usr/src/lib/libc/port/locale/collate.h Mon Oct 11 16:24:58 2010 -0700 @@ -33,7 +33,7 @@ #include #define COLLATE_STR_LEN 24 /* should be 64-bit multiple */ -#define COLLATE_VERSION "I1.0\n" +#define COLLATE_VERSION "IllumosCollate2\n" #define COLLATE_MAX_PRIORITY (0x7fffffff) /* max signed value */ #define COLLATE_SUBST_PRIORITY (0x40000000) /* bit indicates subst table */ @@ -65,26 +65,27 @@ typedef struct collate_info { uint8_t directive_count; uint8_t directive[COLL_WEIGHTS_MAX]; + int32_t pri_count[COLL_WEIGHTS_MAX]; int32_t flags; int32_t chain_count; - int32_t large_pri_count; + int32_t large_count; int32_t subst_count[COLL_WEIGHTS_MAX]; int32_t undef_pri[COLL_WEIGHTS_MAX]; } collate_info_t; -typedef struct collate_char_pri { +typedef struct collate_char { int32_t pri[COLL_WEIGHTS_MAX]; -} collate_char_pri_t; +} collate_char_t; -typedef struct collate_chain_pri { +typedef struct collate_chain { wchar_t str[COLLATE_STR_LEN]; int32_t pri[COLL_WEIGHTS_MAX]; -} collate_chain_pri_t; +} collate_chain_t; -typedef struct collate_large_pri { +typedef struct collate_large { int32_t val; - collate_char_pri_t pri; -} collate_large_pri_t; + collate_char_t pri; +} collate_large_t; typedef struct collate_subst { int32_t key;