Mercurial > illumos > illumos-gate

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/common/unicode/u8_textprep.c	Fri Sep 14 10:25:36 2007 -0700
@@ -0,0 +1,2132 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+
+/*
+ * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
+ *
+ * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
+ * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
+ * the section 3C man pages.
+ * Interface stability: Committed.
+ */
+
+#include <sys/types.h>
+#ifdef	_KERNEL
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/debug.h>
+#include <sys/kmem.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#else
+#include <sys/u8_textprep.h>
+#include <strings.h>
+#endif	/* _KERNEL */
+#include <sys/byteorder.h>
+#include <sys/errno.h>
+#include <sys/u8_textprep_data.h>
+
+
+/*
+ * The maximum possible number of bytes in a UTF-8 character. The scratch
+ * buffers holding a single character are sized as U8_MB_CUR_MAX + 1 so that
+ * a terminating null byte always fits.
+ */
+#define	U8_MB_CUR_MAX			(4)
+
+/*
+ * The maximum number of bytes needed for a UTF-8 character to cover
+ * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2.
+ * u8_validate() rejects anything longer when U8_VALIDATE_UCS2_RANGE is set.
+ */
+#define	U8_MAX_BYTES_UCS2		(3)
+
+/*
+ * The maximum possible number of bytes in a Stream-Safe Text. The temporary
+ * buffer of do_composition() is sized from this value.
+ */
+#define	U8_STREAM_SAFE_TEXT_MAX		(128)
+
+/*
+ * The maximum number of characters in a combining/conjoining sequence and
+ * the actual upperbound limit of a combining/conjoining sequence, i.e.,
+ * the biggest usable index, which is one less than the maximum number.
+ */
+#define	U8_MAX_CHARS_A_SEQ		(32)
+#define	U8_UPPER_LIMIT_IN_A_SEQ		(31)
+
+/* The combining class value for Starter. */
+#define	U8_COMBINING_CLASS_STARTER	(0)
+
+/*
+ * Some Hangul related macros are defined below.
+ *
+ * The first and the last of Hangul syllables, Hangul Jamo Leading consonants,
+ * Vowels, and optional Trailing consonants in Unicode scalar values.
+ *
+ * Please note that U8_HANGUL_JAMO_T_FIRST below is 0x11A7, not the actual
+ * U+11A8. Since the trailing consonant is optional, the value has been
+ * pre-calculated by subtracting one so that a trailing consonant index of
+ * zero can stand for the no trailing consonant case. This is also why
+ * U8_HANGUL_JAMO_T() uses a strict '>' for its lower bound check.
+ *
+ * Each of the 19 modern leading consonants has a total of 588 possible
+ * syllables since Hangul has 21 modern vowels and 27 modern trailing
+ * consonants plus 1 for the no trailing consonant case, i.e., 21 x 28 = 588.
+ *
+ * Please also bear in mind that U8_HANGUL_JAMO_1ST_BYTE can be used to check
+ * whether a character could be a Hangul Jamo or not, but a match does not
+ * guarantee that it is a Hangul Jamo; it only means that it most likely
+ * is one.
+ *
+ * U8_SAVE_HANGUL_AS_UTF8() expands into three separate statements. Use it
+ * only where a list of statements is allowed, i.e., never as the unbraced
+ * body of an if, an else, or a loop.
+ */
+#define	U8_HANGUL_SYL_FIRST		(0xAC00U)
+#define	U8_HANGUL_SYL_LAST		(0xD7A3U)
+
+#define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
+#define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
+#define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
+#define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
+#define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
+#define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)
+
+#define	U8_HANGUL_V_COUNT		(21)
+#define	U8_HANGUL_VT_COUNT		(588)
+#define	U8_HANGUL_T_COUNT		(28)
+
+#define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)
+
+#define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
+	(s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \
+	(s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \
+	(s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU));
+
+#define	U8_HANGUL_JAMO_L(u) \
+	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)
+
+#define	U8_HANGUL_JAMO_V(u) \
+	((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)
+
+#define	U8_HANGUL_JAMO_T(u) \
+	((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
+
+#define	U8_HANGUL_JAMO(u) \
+	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
+
+#define	U8_HANGUL_SYLLABLE(u) \
+	((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)
+
+#define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
+	((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))
+
+#define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
+	((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
+/*
+ * The types of decomposition mappings. These are the values of the type
+ * byte, T, that may lead a byte sequence in the final decomposition table;
+ * see the comment in do_decomp() for the layouts.
+ */
+#define	U8_DECOMP_BOTH			(0xF5U)
+#define	U8_DECOMP_CANONICAL		(0xF6U)
+
+/*
+ * The indicator for 16-bit table. A b3 table id that is bigger than or
+ * equal to this value selects the 16-bit variant of the b4 table once
+ * the indicator has been subtracted from the id.
+ */
+#define	U8_16BIT_TABLE_INDICATOR	(0x8000U)
+
+/*
+ * The following are some convenience macros.
+ *
+ * U8_PUT_3BYTES_INTO_UTF32() and U8_SIMPLE_SWAP() expand into complete
+ * statements that already carry their trailing semicolons, and
+ * U8_SIMPLE_SWAP() expands into three of them. Use them only where a list of
+ * statements is allowed, i.e., never as the unbraced body of an if, an else,
+ * or a loop.
+ *
+ * U8_ASCII_TOUPPER() and U8_ASCII_TOLOWER() evaluate their argument more
+ * than once; do not pass an expression with side effects.
+ */
+#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
+	(u) = ((uint32_t)(b1) & 0x0F) << 12 | ((uint32_t)(b2) & 0x3F) << 6 | \
+		(uint32_t)(b3) & 0x3F;
+
+#define	U8_SIMPLE_SWAP(a, b, t) \
+	(t) = (a); \
+	(a) = (b); \
+	(b) = (t);
+
+#define	U8_ASCII_TOUPPER(c) \
+	(((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))
+
+#define	U8_ASCII_TOLOWER(c) \
+	(((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))
+
+#define	U8_ISASCII(c)			(((uchar_t)(c)) < 0x80U)
+/*
+ * The following macro assumes that the two characters that are to be
+ * swapped are adjacent to each other and 'a' comes before 'b'.
+ *
+ * If the assumptions are not met, then, the macro will fail.
+ *
+ * The macro is not self-contained: it uses the variables named k, u8s, u8t,
+ * start, disp, comb_class, and tc of the function that it is expanded in.
+ * It also expands into several statements and so it must not be used as
+ * the unbraced body of an if, an else, or a loop.
+ */
+#define	U8_SWAP_COMB_MARKS(a, b) \
+	for (k = 0; k < disp[(a)]; k++) \
+		u8t[k] = u8s[start[(a)] + k]; \
+	for (k = 0; k < disp[(b)]; k++) \
+		u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
+	start[(b)] = start[(a)] + disp[(b)]; \
+	for (k = 0; k < disp[(a)]; k++) \
+		u8s[start[(b)] + k] = u8t[k]; \
+	U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
+	U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc);
+
+/*
+ * The possible states during normalization. do_decomp() reports through
+ * these what it has just processed:
+ *
+ *	U8_STATE_START		anything that is not Hangul.
+ *	U8_STATE_HANGUL_L	a leading consonant Jamo.
+ *	U8_STATE_HANGUL_LV	a vowel Jamo that followed a leading
+ *				consonant, or a decomposed Hangul syllable
+ *				with no trailing consonant.
+ *	U8_STATE_HANGUL_LVT	a trailing consonant Jamo that followed
+ *				U8_STATE_HANGUL_LV, or a decomposed Hangul
+ *				syllable with a trailing consonant.
+ *	U8_STATE_HANGUL_V	a vowel Jamo with no leading consonant
+ *				in front of it.
+ *	U8_STATE_HANGUL_T	a trailing consonant Jamo that did not
+ *				follow U8_STATE_HANGUL_LV.
+ *	U8_STATE_COMBINING_MARK	never set by do_decomp(); presumably set by
+ *				its callers for non-Starters (to be
+ *				confirmed against the callers).
+ */
+typedef enum {
+	U8_STATE_START = 0,
+	U8_STATE_HANGUL_L = 1,
+	U8_STATE_HANGUL_LV = 2,
+	U8_STATE_HANGUL_LVT = 3,
+	U8_STATE_HANGUL_V = 4,
+	U8_STATE_HANGUL_T = 5,
+	U8_STATE_COMBINING_MARK = 6
+} u8_normalization_states_t;
+
+/*
+ * The three vectors below are used to check that the bytes of a given UTF-8
+ * character are valid and do not contain any malformed byte values.
+ *
+ * UTF-8 used to have a quite relaxed binary representation, but that caused
+ * some security related issues, and so the Unicode Consortium defined
+ * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it
+ * one more time at Unicode 3.2. The following three tables are based on
+ * that.
+ *
+ * u8_number_of_bytes[] is indexed by the first byte of a character and
+ * gives the byte length of the character, or U8_ILLEGAL_CHAR (I_) for
+ * a byte that cannot start a character, or U8_OUT_OF_RANGE_CHAR (O_) for
+ * a first byte at 0xF5 or bigger. The users of the table treat any
+ * negative value as not being a length.
+ *
+ * U8_ILLEGAL_NEXT_BYTE_COMMON() is the check for the third and the fourth
+ * bytes; the second byte is checked against the other two vectors instead.
+ */
+
+#define	U8_ILLEGAL_NEXT_BYTE_COMMON(c)	((c) < 0x80 || (c) > 0xBF)
+
+#define	I_				U8_ILLEGAL_CHAR
+#define	O_				U8_OUT_OF_RANGE_CHAR
+
+const int8_t u8_number_of_bytes[0x100] = {
+	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+
+/*	80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
+	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/*  	90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
+	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/*  	A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
+	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/*	B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
+	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
+	I_, I_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+
+/*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
+	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+
+/*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
+	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+
+/*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
+	4,  4,  4,  4,  4,  O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
+};
+
+#undef	I_
+#undef	O_
+
+const uint8_t u8_valid_min_2nd_byte[0x100] = {
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+/*
+ * This vector is indexed by the first byte of a character and gives the
+ * smallest value that is valid for the second byte. Only the entries for
+ * the first bytes of multi-byte characters, 0xC2 to 0xF4, are meaningful.
+ *
+ *	C0    C1    C2    C3    C4    C5    C6    C7
+ */
+	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/*	C8    C9    CA    CB    CC    CD    CE    CF    */
+	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/*	D0    D1    D2    D3    D4    D5    D6    D7    */
+	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/*	D8    D9    DA    DB    DC    DD    DE    DF    */
+	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/*
+ * The minimum for E0 is 0xa0, not 0x80, so that the non-shortest 3-byte
+ * forms are rejected.
+ *
+ *	E0    E1    E2    E3    E4    E5    E6    E7
+ */
+	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/*	E8    E9    EA    EB    EC    ED    EE    EF    */
+	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/*
+ * The minimum for F0 is 0x90, not 0x80, so that the non-shortest 4-byte
+ * forms are rejected.
+ *
+ *	F0    F1    F2    F3    F4    F5    F6    F7
+ */
+	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+};
+
+const uint8_t u8_valid_max_2nd_byte[0x100] = {
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+/*
+ * This vector is indexed by the first byte of a character and gives the
+ * biggest value that is valid for the second byte. Only the entries for
+ * the first bytes of multi-byte characters, 0xC2 to 0xF4, are meaningful.
+ *
+ *	C0    C1    C2    C3    C4    C5    C6    C7
+ */
+	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/*	C8    C9    CA    CB    CC    CD    CE    CF    */
+	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/*	D0    D1    D2    D3    D4    D5    D6    D7    */
+	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/*	D8    D9    DA    DB    DC    DD    DE    DF    */
+	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/*	E0    E1    E2    E3    E4    E5    E6    E7    */
+	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/*
+ * The maximum for ED is 0x9f, not 0xbf, so that the surrogate code points,
+ * U+D800 to U+DFFF, are rejected.
+ *
+ *	E8    E9    EA    EB    EC    ED    EE    EF
+ */
+	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
+/*
+ * The maximum for F4 is 0x8f, not 0xbf, so that anything bigger than
+ * U+10FFFF is rejected.
+ *
+ *	F0    F1    F2    F3    F4    F5    F6    F7
+ */
+	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+};
+
+
+/*
+ * The u8_validate() validates the given UTF-8 character string and
+ * calculates the byte length. It is quite similar to mblen(3C) except that
+ * this is specific to UTF-8 and Unicode and, if required, will also
+ * validate against a list of characters.
+ *
+ * The 'u8str' is the string to check and 'n' is its length in bytes.
+ * The 'flag' is a bitwise-inclusive-OR of the following:
+ *
+ *	U8_VALIDATE_ENTIRE		Validate all 'n' bytes. Without
+ *					this, only the first character is
+ *					examined.
+ *	U8_VALIDATE_CHECK_ADDITIONAL	Fail on any character that is equal
+ *					to one of the strings in 'list'.
+ *	U8_VALIDATE_UCS2_RANGE		Fail on any character that needs
+ *					more than U8_MAX_BYTES_UCS2 bytes.
+ *
+ * The 'list' is an array of pointers to null-terminated strings, each
+ * holding one character, and the array itself is terminated by a NULL
+ * pointer. It is looked at only if U8_VALIDATE_CHECK_ADDITIONAL is set;
+ * a NULL 'list' is treated as an empty list.
+ *
+ * The return value is the number of bytes that have been validated, or
+ * zero if 'u8str' is NULL. On a failure, -1 is returned and '*errno' is set
+ * to one of the following; '*errno' is left untouched otherwise:
+ *
+ *	EILSEQ	An illegal byte was found.
+ *	ERANGE	A character was out of the allowed range.
+ *	EINVAL	The last character is incomplete.
+ *	EBADF	A character matched an entry of 'list'.
+ */
+int
+u8_validate(char *u8str, size_t n, char **list, int flag, int *errno)
+{
+	uchar_t *ib;
+	uchar_t *ibtail;
+	uchar_t **p;
+	uchar_t *s1;
+	uchar_t *s2;
+	uchar_t f;
+	int sz;
+	size_t i;
+	int ret_val;
+	boolean_t no_need_to_validate_entire;
+	boolean_t check_additional;
+	boolean_t validate_ucs2_range_only;
+
+	if (! u8str)
+		return (0);
+
+	ib = (uchar_t *)u8str;
+	ibtail = ib + n;
+
+	ret_val = 0;
+
+	no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
+	check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
+	validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
+
+	while (ib < ibtail) {
+		/*
+		 * The first byte of a UTF-8 character tells how many
+		 * bytes will follow for the character. If the first byte
+		 * is an illegal byte value or out of range value, we just
+		 * return -1 with an appropriate error number.
+		 */
+		sz = u8_number_of_bytes[*ib];
+		if (sz == U8_ILLEGAL_CHAR) {
+			*errno = EILSEQ;
+			return (-1);
+		}
+
+		if (sz == U8_OUT_OF_RANGE_CHAR ||
+		    (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
+			*errno = ERANGE;
+			return (-1);
+		}
+
+		/*
+		 * If we don't have enough bytes to check on, that's also
+		 * an error. As you can see, we give illegal byte sequence
+		 * checking higher priority than EINVAL cases.
+		 */
+		if ((ibtail - ib) < sz) {
+			*errno = EINVAL;
+			return (-1);
+		}
+
+		if (sz == 1) {
+			ib++;
+			ret_val++;
+		} else {
+			/*
+			 * Check on the multi-byte UTF-8 character. For more
+			 * details on this, see comment added for the used
+			 * data structures at the beginning of the file.
+			 *
+			 * The valid range of the second byte depends on
+			 * the first byte; any byte after that only has to
+			 * be a plain continuation byte. The sz is known to
+			 * be positive at this point and so the cast is safe.
+			 */
+			f = *ib++;
+			ret_val++;
+			for (i = 1; i < (size_t)sz; i++) {
+				if (i == 1) {
+					if (*ib < u8_valid_min_2nd_byte[f] ||
+					    *ib > u8_valid_max_2nd_byte[f]) {
+						*errno = EILSEQ;
+						return (-1);
+					}
+				} else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
+					*errno = EILSEQ;
+					return (-1);
+				}
+				ib++;
+				ret_val++;
+			}
+		}
+
+		/*
+		 * The character just validated occupies the sz bytes that
+		 * end at ib. It is a match only if all of the sz bytes
+		 * are the same as the entry and the entry ends right there.
+		 * A NULL list has nothing to match against.
+		 */
+		if (check_additional && list != NULL) {
+			for (p = (uchar_t **)list, i = 0; p[i]; i++) {
+				s1 = ib - sz;
+				s2 = p[i];
+				while (s1 < ib) {
+					if (*s1 != *s2 || *s2 == '\0')
+						break;
+					s1++;
+					s2++;
+				}
+
+				if (s1 >= ib && *s2 == '\0') {
+					*errno = EBADF;
+					return (-1);
+				}
+			}
+		}
+
+		if (no_need_to_validate_entire)
+			break;
+	}
+
+	return (ret_val);
+}
+
+/*
+ * The do_case_conv() takes the multi-byte UTF-8 character of 'sz' bytes at
+ * 's', walks through the case mapping tables, and saves the uppercase
+ * (if 'is_it_toupper' is set) or the lowercase counterpart at 'u8s'. If the
+ * tables have no mapping for the character, 'u8s' gets a copy of the input
+ * bytes instead.
+ *
+ * The case conversions are simple case conversions mapping a character to
+ * another character as specified in the Unicode data. The mapped character
+ * may need a different number of bytes than the input character.
+ *
+ * The bytes at 'u8s' are always terminated with a null byte on the
+ * assumption that 'u8s' has plenty of room for that. The return value is
+ * the byte length of the character saved at 'u8s' excluding the terminating
+ * null byte.
+ */
+static size_t
+do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
+{
+	uint16_t k1 = 0;
+	uint16_t k2 = 0;
+	uint16_t k3 = 0;
+	uint16_t k4 = 0;
+	uint16_t tbl;
+	uint16_t base;
+	size_t from;
+	size_t to;
+	size_t len = 0;
+
+	/*
+	 * Copy the input over and, at the same time, spread the bytes over
+	 * the four lookup keys so that the last byte always ends up in k4.
+	 * The only expected values for sz are 2, 3, and 4, and the u8s
+	 * should point to a vector that is well beyond the size of 5 bytes.
+	 */
+	switch (sz) {
+	case 2:
+		k3 = u8s[0] = s[0];
+		k4 = u8s[1] = s[1];
+		break;
+	case 3:
+		k2 = u8s[0] = s[0];
+		k3 = u8s[1] = s[1];
+		k4 = u8s[2] = s[2];
+		break;
+	case 4:
+		k1 = u8s[0] = s[0];
+		k2 = u8s[1] = s[1];
+		k3 = u8s[2] = s[2];
+		k4 = u8s[3] = s[3];
+		break;
+	default:
+		/* Not expected; fall back to an ASCII case conversion. */
+		*u8s = is_it_toupper ? U8_ASCII_TOUPPER(*s) :
+		    U8_ASCII_TOLOWER(*s);
+		u8s[1] = '\0';
+
+		return (1);
+	}
+	u8s[sz] = '\0';
+
+	/*
+	 * Walk down the tables one key at a time. A missing entry at any
+	 * level means that there is no mapping and so the copy made
+	 * above is the answer.
+	 */
+	k1 = u8_common_b1_tbl[uv][k1];
+	if (k1 == U8_TBL_ELEMENT_NOT_DEF)
+		return ((size_t)sz);
+
+	k2 = u8_case_common_b2_tbl[uv][k1][k2];
+	if (k2 == U8_TBL_ELEMENT_NOT_DEF)
+		return ((size_t)sz);
+
+	if (! is_it_toupper) {
+		tbl = u8_tolower_b3_tbl[uv][k2][k3].tbl_id;
+		if (tbl == U8_TBL_ELEMENT_NOT_DEF)
+			return ((size_t)sz);
+
+		/* The mapped bytes are at [from, to) of the final table. */
+		from = u8_tolower_b4_tbl[uv][tbl][k4];
+		to = u8_tolower_b4_tbl[uv][tbl][k4 + 1];
+
+		/* Either there is no match or an error at the table. */
+		if (from >= to || (to - from) > U8_MB_CUR_MAX)
+			return ((size_t)sz);
+
+		base = u8_tolower_b3_tbl[uv][k2][k3].base;
+
+		while (from < to) {
+			u8s[len++] = u8_tolower_final_tbl[uv][base + from];
+			from++;
+		}
+	} else {
+		tbl = u8_toupper_b3_tbl[uv][k2][k3].tbl_id;
+		if (tbl == U8_TBL_ELEMENT_NOT_DEF)
+			return ((size_t)sz);
+
+		/* The mapped bytes are at [from, to) of the final table. */
+		from = u8_toupper_b4_tbl[uv][tbl][k4];
+		to = u8_toupper_b4_tbl[uv][tbl][k4 + 1];
+
+		/* Either there is no match or an error at the table. */
+		if (from >= to || (to - from) > U8_MB_CUR_MAX)
+			return ((size_t)sz);
+
+		base = u8_toupper_b3_tbl[uv][k2][k3].base;
+
+		while (from < to) {
+			u8s[len++] = u8_toupper_final_tbl[uv][base + from];
+			from++;
+		}
+	}
+
+	/*
+	 * If nothing has been saved, that means there is no corresponding
+	 * character.
+	 */
+	if (len == 0)
+		return ((size_t)sz);
+
+	u8s[len] = '\0';
+
+	return (len);
+}
+
+/*
+ * The case_conv_one_char() fetches the character at '*sp', which is at the
+ * offset 'i' of a string of 'n' bytes, saves its case-converted and
+ * null-terminated bytes at 'u8s', advances '*sp' past the bytes consumed,
+ * and returns the byte length that the first byte of the character called
+ * for.
+ *
+ * An illegal first byte is reported by setting '*err' to EILSEQ and then
+ * treated as a single byte character. If the string ends before the
+ * character is complete, '*err' is set to EINVAL and the bytes that are
+ * available are used as they are.
+ */
+static int
+case_conv_one_char(size_t uv, uchar_t **sp, size_t i, size_t n, uchar_t *u8s,
+	boolean_t is_it_toupper, int *err)
+{
+	uchar_t *s = *sp;
+	size_t j;
+	int sz;
+
+	sz = u8_number_of_bytes[*s];
+	if (sz < 0) {
+		*err = EILSEQ;
+		sz = 1;
+	}
+
+	if (sz == 1) {
+		/* Mainly for 7-bit ASCII; do a quick conversion here. */
+		if (is_it_toupper)
+			u8s[0] = U8_ASCII_TOUPPER(*s);
+		else
+			u8s[0] = U8_ASCII_TOLOWER(*s);
+		s++;
+		u8s[1] = '\0';
+	} else if ((i + sz) > n) {
+		*err = EINVAL;
+		for (j = 0; (i + j) < n; )
+			u8s[j++] = *s++;
+		u8s[j] = '\0';
+	} else {
+		/*
+		 * If there is no corresponding character, do_case_conv()
+		 * gives back a copy of what we have.
+		 */
+		(void) do_case_conv(uv, u8s, s, sz, is_it_toupper);
+		s += sz;
+	}
+
+	*sp = s;
+
+	return (sz);
+}
+
+/*
+ * The do_case_compare() function compares the two input strings, s1 of n1
+ * bytes and s2 of n2 bytes, one character at a time, doing case conversions
+ * if applicable, and returns the comparison result like strcmp(): zero if
+ * they are the same, a negative value if s1 sorts first, and a positive
+ * value if s2 sorts first.
+ *
+ * Problems in the input do not stop the comparison; they are reported by
+ * setting '*errno' to EILSEQ for an illegal first byte or to EINVAL for
+ * a character that is cut short by the end of its string. '*errno' is left
+ * untouched otherwise.
+ *
+ * Since, in empirical sense, most of text data are 7-bit ASCII characters,
+ * we treat the 7-bit ASCII characters as a special case trying to yield
+ * faster processing time.
+ */
+static int
+do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
+	size_t n2, boolean_t is_it_toupper, int *errno)
+{
+	int f;
+	int sz1;
+	int sz2;
+	size_t i1;
+	size_t i2;
+	uchar_t u8s1[U8_MB_CUR_MAX + 1];
+	uchar_t u8s2[U8_MB_CUR_MAX + 1];
+
+	i1 = i2 = 0;
+	while (i1 < n1 && i2 < n2) {
+		/* Get the next case-converted character of each string. */
+		sz1 = case_conv_one_char(uv, &s1, i1, n1, u8s1,
+		    is_it_toupper, errno);
+		sz2 = case_conv_one_char(uv, &s2, i2, n2, u8s2,
+		    is_it_toupper, errno);
+
+		/* Now compare the two characters. */
+		if (sz1 == 1 && sz2 == 1) {
+			if (*u8s1 > *u8s2)
+				return (1);
+			if (*u8s1 < *u8s2)
+				return (-1);
+		} else {
+			f = strcmp((const char *)u8s1, (const char *)u8s2);
+			if (f != 0)
+				return (f);
+		}
+
+		/*
+		 * They were the same. Let's move on to the next
+		 * characters then.
+		 */
+		i1 += sz1;
+		i2 += sz2;
+	}
+
+	/*
+	 * We compared until the end of either or both strings.
+	 *
+	 * If we reached to or went over the ends for the both, that means
+	 * they are the same.
+	 *
+	 * If we reached only one of the two ends, that means the other string
+	 * has something left, and that fact determines the return value.
+	 */
+	if (i1 >= n1) {
+		if (i2 >= n2)
+			return (0);
+		return (-1);
+	}
+	return (1);
+}
+
+/*
+ * The combining_class() function checks on the 'sz' bytes of the character
+ * at 's' and finds out the corresponding Unicode combining class value by
+ * walking through the combining class tables. The return value
+ * U8_COMBINING_CLASS_STARTER, i.e., zero, means it is a Starter.
+ *
+ * A single byte character, any character with no entry in the tables, and
+ * any byte length that is not possible for a UTF-8 character, including
+ * zero, are also treated as a Starter.
+ */
+static uchar_t
+combining_class(size_t uv, uchar_t *s, size_t sz)
+{
+	uint16_t b1 = 0;
+	uint16_t b2 = 0;
+	uint16_t b3 = 0;
+	uint16_t b4 = 0;
+
+	/*
+	 * Only 2, 3, and 4 byte characters are looked up. In particular,
+	 * there is no byte to look at if sz is zero.
+	 */
+	if (sz < 2 || sz > 4)
+		return (U8_COMBINING_CLASS_STARTER);
+
+	/* The last byte always goes to b4; the unused ones stay at zero. */
+	if (sz == 2) {
+		b3 = s[0];
+		b4 = s[1];
+	} else if (sz == 3) {
+		b2 = s[0];
+		b3 = s[1];
+		b4 = s[2];
+	} else {
+		b1 = s[0];
+		b2 = s[1];
+		b3 = s[2];
+		b4 = s[3];
+	}
+
+	b1 = u8_common_b1_tbl[uv][b1];
+	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
+		return (U8_COMBINING_CLASS_STARTER);
+
+	b2 = u8_combining_class_b2_tbl[uv][b1][b2];
+	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
+		return (U8_COMBINING_CLASS_STARTER);
+
+	b3 = u8_combining_class_b3_tbl[uv][b2][b3];
+	if (b3 == U8_TBL_ELEMENT_NOT_DEF)
+		return (U8_COMBINING_CLASS_STARTER);
+
+	return (u8_combining_class_b4_tbl[uv][b3][b4]);
+}
+
+/*
+ * The do_decomp() function finds out a matching decomposition, if any, for
+ * the character of 'sz' bytes at 's' and saves it at 'u8s'. If there is no
+ * match, the input bytes are copied and returned as they are. The function
+ * also checks if the character is a Hangul syllable and, if so, decomposes
+ * it into conjoining Jamos by calculation and returns.
+ *
+ * The 'uv' is the first index into every mapping table used here
+ * (presumably the Unicode version; to be confirmed against the callers).
+ * If 'canonical_decomposition' is set, only the canonical decomposition
+ * mappings are used; otherwise, the compatibility decomposition mappings
+ * are used.
+ *
+ * To save time, a single byte 7-bit ASCII character should be handled by
+ * the caller.
+ *
+ * The function returns the number of bytes saved at 'u8s' excluding the
+ * terminating null byte, which is always added. The size of 'u8s' is not
+ * passed in and nothing is checked against it here; the caller must supply
+ * a buffer that is big enough for the longest mapping. The Hangul case
+ * alone needs 10 bytes.
+ *
+ * It will also return, via 'state', whether there was a Hangul syllable
+ * decomposed or a Hangul Jamo seen, which then will be used by the caller.
+ * Note that the incoming value of 'state' is read for a vowel or
+ * a trailing consonant Jamo to tell if it follows a leading consonant or
+ * a leading consonant and a vowel, respectively.
+ */
+static size_t
+do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
+	boolean_t canonical_decomposition, u8_normalization_states_t *state)
+{
+	uint16_t b1 = 0;
+	uint16_t b2 = 0;
+	uint16_t b3 = 0;
+	uint16_t b3_tbl;
+	uint16_t b3_base;
+	uint16_t b4 = 0;
+	size_t start_id;
+	size_t end_id;
+	size_t i;
+	uint32_t u1;
+
+	if (sz == 2) {
+		b3 = u8s[0] = s[0];
+		b4 = u8s[1] = s[1];
+		u8s[2] = '\0';
+	} else if (sz == 3) {
+		/* Convert it to a Unicode scalar value. */
+		U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
+
+		/*
+		 * If this is a Hangul syllable, we decompose it into
+		 * a leading consonant, a vowel, and an optional trailing
+		 * consonant and then return.
+		 *
+		 * In here, b1, b2, and b3 temporarily hold the Unicode
+		 * scalar values of the three Jamos, not table indices.
+		 * A b3 of zero means there is no trailing consonant.
+		 */
+		if (U8_HANGUL_SYLLABLE(u1)) {
+			u1 -= U8_HANGUL_SYL_FIRST;
+
+			b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
+			b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
+			    / U8_HANGUL_T_COUNT;
+			b3 = u1 % U8_HANGUL_T_COUNT;
+
+			U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
+			U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
+			if (b3) {
+				b3 += U8_HANGUL_JAMO_T_FIRST;
+				U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
+
+				u8s[9] = '\0';
+				*state = U8_STATE_HANGUL_LVT;
+				return (9);
+			}
+
+			u8s[6] = '\0';
+			*state = U8_STATE_HANGUL_LV;
+			return (6);
+		}
+
+		b2 = u8s[0] = s[0];
+		b3 = u8s[1] = s[1];
+		b4 = u8s[2] = s[2];
+		u8s[3] = '\0';
+
+		/*
+		 * If this is a Hangul Jamo, we know there is nothing
+		 * further that we can decompose. The new state depends on
+		 * the kind of the Jamo and, for a vowel or a trailing
+		 * consonant, on the state that the caller has passed in.
+		 */
+		if (U8_HANGUL_JAMO_L(u1)) {
+			*state = U8_STATE_HANGUL_L;
+			return (3);
+		}
+
+		if (U8_HANGUL_JAMO_V(u1)) {
+			if (*state == U8_STATE_HANGUL_L)
+				*state = U8_STATE_HANGUL_LV;
+			else
+				*state = U8_STATE_HANGUL_V;
+			return (3);
+		}
+
+		if (U8_HANGUL_JAMO_T(u1)) {
+			if (*state == U8_STATE_HANGUL_LV)
+				*state = U8_STATE_HANGUL_LVT;
+			else
+				*state = U8_STATE_HANGUL_T;
+			return (3);
+		}
+	} else if (sz == 4) {
+		b1 = u8s[0] = s[0];
+		b2 = u8s[1] = s[1];
+		b3 = u8s[2] = s[2];
+		b4 = u8s[3] = s[3];
+		u8s[4] = '\0';
+	} else {
+		/*
+		 * This is a fallback and should not happen if the function
+		 * was called properly.
+		 */
+		u8s[0] = s[0];
+		u8s[1] = '\0';
+		*state = U8_STATE_START;
+		return (1);
+	}
+
+	/*
+	 * At this point, this routine does not know what it would get.
+	 * The caller should sort it out if the state isn't a Hangul one.
+	 */
+	*state = U8_STATE_START;
+
+	/*
+	 * Try to find matching decomposition mapping byte sequence. Since
+	 * the input has been copied to u8s already, returning sz at any
+	 * point below means "no mapping; use the input as it is".
+	 */
+	b1 = u8_common_b1_tbl[uv][b1];
+	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
+		return ((size_t)sz);
+
+	b2 = u8_decomp_b2_tbl[uv][b1][b2];
+	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
+		return ((size_t)sz);
+
+	b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
+	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
+		return ((size_t)sz);
+
+	/*
+	 * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
+	 * which is 0x8000, this means we couldn't fit the mappings into
+	 * the cardinality of an unsigned byte and so the 16-bit variant of
+	 * the table is used. Either way, the mapping is at the offsets
+	 * [start_id, end_id) from the base in the final table.
+	 */
+	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
+		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
+		start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
+		end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
+	} else {
+		start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
+		end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
+	}
+
+	/* This also means there wasn't any matching decomposition. */
+	if (start_id >= end_id)
+		return ((size_t)sz);
+
+	/*
+	 * The final table for decomposition mappings has three types of
+	 * byte sequences depending on whether a mapping is for compatibility
+	 * decomposition, canonical decomposition, or both like the following:
+	 *
+	 * (1) Compatibility decomposition mappings:
+	 *
+	 *	+---+---+-...-+---+
+	 *	| B0| B1| ... | Bm|
+	 *	+---+---+-...-+---+
+	 *
+	 *	The first byte, B0, is always less than 0xF5 (U8_DECOMP_BOTH).
+	 *
+	 * (2) Canonical decomposition mappings:
+	 *
+	 *	+---+---+---+-...-+---+
+	 *	| T | b0| b1| ... | bn|
+	 *	+---+---+---+-...-+---+
+	 *
+	 *	where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
+	 *
+	 * (3) Both mappings:
+	 *
+	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
+	 *	| T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
+	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
+	 *
+	 *	where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
+	 *	byte, b0 to bn are canonical mapping bytes and B0 to Bm are
+	 *	compatibility mapping bytes. As used below, D is the distance
+	 *	from the D byte itself to B0.
+	 *
+	 * Note that compatibility decomposition means doing recursive
+	 * decompositions using both compatibility decomposition mappings and
+	 * canonical decomposition mappings. On the other hand, canonical
+	 * decomposition means doing recursive decompositions using only
+	 * canonical decomposition mappings. Since the table we have has gone
+	 * through the recursions already, we do not need to do so during
+	 * runtime, i.e., the table has been completely flattened out
+	 * already.
+	 */
+
+	b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
+
+	/* Get the type, T, of the byte sequence; b1 is reused to hold it. */
+	b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
+
+	/*
+	 * If necessary, adjust start_id, end_id, or both. Note that if
+	 * this is compatibility decomposition mapping, there is no
+	 * adjustment.
+	 */
+	if (canonical_decomposition) {
+		/* Is the mapping only for compatibility decomposition? */
+		if (b1 < U8_DECOMP_BOTH)
+			return ((size_t)sz);
+
+		start_id++;
+
+		if (b1 == U8_DECOMP_BOTH) {
+			end_id = start_id +
+			    u8_decomp_final_tbl[uv][b3_base + start_id];
+			start_id++;
+		}
+	} else {
+		/*
+		 * Unless this is a compatibility decomposition mapping,
+		 * we adjust the start_id.
+		 */
+		if (b1 == U8_DECOMP_BOTH) {
+			start_id++;
+			start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
+		} else if (b1 == U8_DECOMP_CANONICAL) {
+			start_id++;
+		}
+	}
+
+	for (i = 0; start_id < end_id; start_id++)
+		u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
+	u8s[i] = '\0';
+
+	return (i);
+}
+
+/*
+ * The find_composition_start() function takes the 'sz' bytes of the
+ * character at 's', walks through the composition tables, and returns the
+ * address of the composition mappings for the character as explained in
+ * the do_composition(). NULL is returned if the character has no
+ * composition mapping or if 'sz' is not between 1 and 4.
+ */
+static uchar_t *
+find_composition_start(size_t uv, uchar_t *s, size_t sz)
+{
+	uint16_t key[U8_MB_CUR_MAX] = { 0, 0, 0, 0 };
+	uint16_t k1;
+	uint16_t k2;
+	uint16_t tbl;
+	uint16_t base;
+	size_t from;
+	size_t to;
+	size_t i;
+
+	/*
+	 * Anything but 1 to 4 bytes should not happen if the function
+	 * was called properly.
+	 */
+	if (sz < 1 || sz > U8_MB_CUR_MAX)
+		return (NULL);
+
+	/*
+	 * Right-align the bytes over the four lookup keys so that the last
+	 * byte always ends up in the last key and the unused leading keys
+	 * stay at zero.
+	 */
+	for (i = 0; i < sz; i++)
+		key[U8_MB_CUR_MAX - sz + i] = s[i];
+
+	k1 = u8_composition_b1_tbl[uv][key[0]];
+	if (k1 == U8_TBL_ELEMENT_NOT_DEF)
+		return (NULL);
+
+	k2 = u8_composition_b2_tbl[uv][k1][key[1]];
+	if (k2 == U8_TBL_ELEMENT_NOT_DEF)
+		return (NULL);
+
+	tbl = u8_composition_b3_tbl[uv][k2][key[2]].tbl_id;
+	if (tbl == U8_TBL_ELEMENT_NOT_DEF)
+		return (NULL);
+
+	/*
+	 * A table id at or above U8_16BIT_TABLE_INDICATOR selects the 16-bit
+	 * variant of the table once the indicator has been taken off.
+	 */
+	if (tbl < U8_16BIT_TABLE_INDICATOR) {
+		from = u8_composition_b4_tbl[uv][tbl][key[3]];
+		to = u8_composition_b4_tbl[uv][tbl][key[3] + 1];
+	} else {
+		tbl -= U8_16BIT_TABLE_INDICATOR;
+		from = u8_composition_b4_16bit_tbl[uv][tbl][key[3]];
+		to = u8_composition_b4_16bit_tbl[uv][tbl][key[3] + 1];
+	}
+
+	/* An empty range means there is no composition mapping. */
+	if (from >= to)
+		return (NULL);
+
+	base = u8_composition_b3_tbl[uv][k2][key[2]].base;
+
+	return ((uchar_t *)&(u8_composition_final_tbl[uv][base + from]));
+}
+
+/*
+ * The blocked() function tells whether the character at the index 'last'
+ * of this sequence is blocked or not by looking at the combining class
+ * values, 'comb_class', of the characters in front of it. It is blocked if
+ * any character at the index 1 to 'last' - 1 is a Starter or has a combining
+ * class value that is bigger than or equal to that of the character at
+ * 'last'. The character at the index 0 is not looked at.
+ */
+static boolean_t
+blocked(uchar_t *comb_class, size_t last)
+{
+	uchar_t mine = comb_class[last];
+	size_t idx = 1;
+
+	while (idx < last) {
+		uchar_t theirs = comb_class[idx++];
+
+		if (theirs == U8_COMBINING_CLASS_STARTER || theirs >= mine)
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * The do_composition() function reads the character string pointed to by
+ * 's', performs the necessary canonical composition, and then copies the
+ * result back to 's'.
+ *
+ * Arguments:
+ *	uv		passed through to find_composition_start() to select
+ *			the composition tables that are consulted.
+ *	s		the bytes to compose; overwritten with the result,
+ *			which is terminated with a null byte.
+ *	comb_class	combining class of each character of 's'.
+ *	start		byte offset into 's' of each character.
+ *	disp		byte length of each character.
+ *	last		index of the last character (not the count).
+ *	os, oslast	*os is a read position in a further byte string that
+ *			ends at 'oslast'. When the last character is a Starter,
+ *			characters read from there are composed with it for as
+ *			long as a mapping matches, and *os is then set to the
+ *			first byte that was not consumed. *os is not written
+ *			when the last character is not a Starter.
+ *
+ * The return value is the byte length of the composed string at 's', not
+ * counting the terminating null byte.
+ *
+ * The input argument 's' cannot contain more than 32 characters.
+ */
+static size_t
+do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
+	uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
+{
+	uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
+	uchar_t tc[U8_MB_CUR_MAX];
+	uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
+	size_t saved_marks_count;
+	uchar_t *p;
+	uchar_t *saved_p;
+	uchar_t *q;
+	size_t i;
+	size_t saved_i;
+	size_t j;
+	size_t k;
+	size_t l;
+	size_t C;
+	size_t saved_l;
+	size_t size;
+	uint32_t u1;
+	uint32_t u2;
+	boolean_t match_not_found = B_TRUE;
+
+	/*
+	 * This should never happen unless the callers are doing some strange
+	 * and unexpected things.
+	 *
+	 * The "last" is the index pointing to the last character not last + 1.
+	 */
+	if (last >= U8_MAX_CHARS_A_SEQ)
+		last = U8_UPPER_LIMIT_IN_A_SEQ;
+
+	for (i = l = 0; i <= last; i++) {
+		/*
+		 * The last or any non-Starters at the beginning, we don't
+		 * have any chance to do composition and so we just copy them
+		 * to the temporary buffer.
+		 *
+		 * SAVE_THE_CHAR is also the jump target for a Starter that
+		 * turns out to have no composition mapping: character 'i' is
+		 * copied unchanged to t[] and the loop moves on.
+		 */
+		if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
+SAVE_THE_CHAR:
+			p = s + start[i];
+			size = disp[i];
+			for (k = 0; k < size; k++)
+				t[l++] = *p++;
+			continue;
+		}
+
+		/*
+		 * If this could be a start of Hangul Jamos, then, we try to
+		 * conjoin them.
+		 *
+		 * The second code point is read from the three bytes that
+		 * immediately follow the first character in 's'. An L and V
+		 * pair, optionally followed by a T, is written to t[] as one
+		 * three-byte syllable. 'i' is left on the last character that
+		 * was consumed so that the i++ of the for() loop steps to
+		 * the next unprocessed one.
+		 */
+		if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
+			U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
+			    s[start[i] + 1], s[start[i] + 2]);
+			U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
+			    s[start[i] + 4], s[start[i] + 5]);
+
+			if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
+				u1 -= U8_HANGUL_JAMO_L_FIRST;
+				u2 -= U8_HANGUL_JAMO_V_FIRST;
+				u1 = U8_HANGUL_SYL_FIRST +
+				    (u1 * U8_HANGUL_V_COUNT + u2) *
+				    U8_HANGUL_T_COUNT;
+
+				i += 2;
+				if (i <= last) {
+					U8_PUT_3BYTES_INTO_UTF32(u2,
+					    s[start[i]], s[start[i] + 1],
+					    s[start[i] + 2]);
+
+					if (U8_HANGUL_JAMO_T(u2)) {
+						u1 += u2 -
+						    U8_HANGUL_JAMO_T_FIRST;
+						i++;
+					}
+				}
+
+				U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
+				i--;
+				l += 3;
+				continue;
+			}
+		}
+
+		/*
+		 * Let's then find out if this Starter has composition
+		 * mapping.
+		 */
+		p = find_composition_start(uv, s + start[i], disp[i]);
+		if (p == NULL)
+			goto SAVE_THE_CHAR;
+
+		/*
+		 * We have a Starter with composition mapping and the next
+		 * character is a non-Starter. Let's try to find out if
+		 * we can do composition.
+		 *
+		 * saved_i remembers the Starter, saved_l remembers where its
+		 * output begins in t[], and saved_p remembers the table entry
+		 * to retry with when a mark does not match.
+		 */
+
+		saved_p = p;
+		saved_i = i;
+		saved_l = l;
+		saved_marks_count = 0;
+
+TRY_THE_NEXT_MARK:
+		q = s + start[++i];
+		size = disp[i];
+
+		/*
+		 * The next for() loop compares the non-Starter pointed by
+		 * 'q' with the possible (joinable) characters pointed by 'p'.
+		 *
+		 * The composition final table entry pointed by the 'p'
+		 * looks like the following:
+		 *
+		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
+		 * | C | b0| b1| ... | bn| F | B0| B1| ... | Bm| F |
+		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
+		 *
+		 * where C is the count byte indicating the number of
+		 * mapping pairs where each pair would look like
+		 * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
+		 * character of a canonical decomposition and the B0-Bm are
+		 * the bytes of a matching composite character. The F is
+		 * a filler byte after each character as the separator.
+		 */
+
+		match_not_found = B_TRUE;
+
+		for (C = *p++; C > 0; C--) {
+			for (k = 0; k < size; p++, k++)
+				if (*p != q[k])
+					break;
+
+			/*
+			 * Have we found it? All bytes of 'q' must match and
+			 * the table character must end right there. If so,
+			 * rewind to where the Starter's output begins and
+			 * write the composite in its place.
+			 */
+			if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
+				match_not_found = B_FALSE;
+
+				l = saved_l;
+
+				while (*++p != U8_TBL_ELEMENT_FILLER)
+					t[l++] = *p;
+
+				break;
+			}
+
+			/*
+			 * We didn't find; skip to the next pair. First move
+			 * to the filler that ends the first character of this
+			 * pair (unless we already stand on it), then move to
+			 * the filler that ends the composite, and finally step
+			 * over that filler.
+			 */
+			if (*p != U8_TBL_ELEMENT_FILLER)
+				while (*++p != U8_TBL_ELEMENT_FILLER)
+					;
+			while (*++p != U8_TBL_ELEMENT_FILLER)
+				;
+			p++;
+		}
+
+		/*
+		 * If there was no match, we will need to save the combining
+		 * mark for later appending. After that, if the next one
+		 * is a non-Starter and not blocked, then, we try once
+		 * again to do composition with the next non-Starter.
+		 *
+		 * If there was no match and this was a Starter, then,
+		 * this is a new start.
+		 *
+		 * If there was a match and a composition done and we have
+		 * more to check on, then, we retrieve a new composition final
+		 * table entry for the composite and then try to do the
+		 * composition again.
+		 *
+		 * As long as 'l' still equals 'saved_l', no composite has
+		 * been written to t[] for the current Starter.
+		 */
+
+		if (match_not_found) {
+			if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
+				i--;
+				goto SAVE_THE_CHAR;
+			}
+
+			saved_marks[saved_marks_count++] = i;
+		}
+
+		if (saved_l == l) {
+			while (i < last) {
+				if (blocked(comb_class, i + 1))
+					saved_marks[saved_marks_count++] = ++i;
+				else
+					break;
+			}
+			if (i < last) {
+				p = saved_p;
+				goto TRY_THE_NEXT_MARK;
+			}
+		} else if (i < last) {
+			p = find_composition_start(uv, t + saved_l,
+			    l - saved_l);
+			if (p != NULL) {
+				saved_p = p;
+				goto TRY_THE_NEXT_MARK;
+			}
+		}
+
+		/*
+		 * There is no more composition possible.
+		 *
+		 * If there was no composition what so ever then we copy
+		 * over the original Starter and then append any non-Starters
+		 * remaining at the target string sequentially after that.
+		 *
+		 * The marks recorded in saved_marks[] are appended in the
+		 * order in which they were recorded.
+		 */
+
+		if (saved_l == l) {
+			p = s + start[saved_i];
+			size = disp[saved_i];
+			for (j = 0; j < size; j++)
+				t[l++] = *p++;
+		}
+
+		for (k = 0; k < saved_marks_count; k++) {
+			p = s + start[saved_marks[k]];
+			size = disp[saved_marks[k]];
+			for (j = 0; j < size; j++)
+				t[l++] = *p++;
+		}
+	}
+
+	/*
+	 * If the last character is a Starter and if we have a character
+	 * (possibly another Starter) that can be turned into a composite,
+	 * we do so and we do so until there is no more of composition
+	 * possible.
+	 *
+	 * The candidate characters are read from *os and never beyond
+	 * 'oslast'. Reading stops at a byte whose u8_number_of_bytes[] entry
+	 * is 1 or less, at a character that would run past 'oslast', or at
+	 * the first character that does not compose. The table entry is
+	 * walked in the same way as in the loop above, with 'q' as the table
+	 * pointer and tc[] holding the candidate character.
+	 */
+	if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
+		p = *os;
+		saved_l = l - disp[last];
+
+		while (p < oslast) {
+			size = u8_number_of_bytes[*p];
+			if (size <= 1 || (p + size) > oslast)
+				break;
+
+			saved_p = p;
+
+			for (i = 0; i < size; i++)
+				tc[i] = *p++;
+
+			q = find_composition_start(uv, t + saved_l,
+			    l - saved_l);
+			if (q == NULL) {
+				p = saved_p;
+				break;
+			}
+
+			match_not_found = B_TRUE;
+
+			for (C = *q++; C > 0; C--) {
+				for (k = 0; k < size; q++, k++)
+					if (*q != tc[k])
+						break;
+
+				if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
+					match_not_found = B_FALSE;
+
+					l = saved_l;
+
+					while (*++q != U8_TBL_ELEMENT_FILLER) {
+						/*
+						 * This is practically
+						 * impossible but we don't
+						 * want to take any chances.
+						 */
+						if (l >=
+						    U8_STREAM_SAFE_TEXT_MAX) {
+							p = saved_p;
+							goto SAFE_RETURN;
+						}
+						t[l++] = *q;
+					}
+
+					break;
+				}
+
+				if (*q != U8_TBL_ELEMENT_FILLER)
+					while (*++q != U8_TBL_ELEMENT_FILLER)
+						;
+				while (*++q != U8_TBL_ELEMENT_FILLER)
+					;
+				q++;
+			}
+
+			if (match_not_found) {
+				p = saved_p;
+				break;
+			}
+		}
+SAFE_RETURN:
+		*os = p;
+	}
+
+	/*
+	 * Now we copy over the temporary string to the target string.
+	 * Since composition always reduces the number of characters or
+	 * the number of characters stay, we don't need to worry about
+	 * the buffer overflow here.
+	 */
+	for (i = 0; i < l; i++)
+		s[i] = t[i];
+	s[l] = '\0';
+
+	return (l);
+}
+
+/*
+ * The collect_a_seq() function examines the given string, collects one
+ * sequence of characters at u8s, and returns the sequence. While it collects
+ * a sequence, it also applies case conversion, canonical or compatibility
+ * decomposition, canonical composition, or some or all of them and
+ * in that order.
+ *
+ * Arguments:
+ *	uv	Unicode version index handed to the table lookup helpers.
+ *	u8s	output buffer that receives the collected sequence.
+ *	source	*source is where reading starts; it is advanced past the
+ *		bytes that were consumed.
+ *	slast	end of the input; no byte at or after it is read.
+ *	is_it_toupper, is_it_tolower
+ *		the requested case conversion, if any.
+ *	canonical_decomposition, compatibility_decomposition,
+ *	canonical_composition
+ *		the requested normalization steps. Composition is done only
+ *		when one of the decompositions is requested as well.
+ *	errnum	set to EILSEQ when the first byte is an illegal byte and to
+ *		EINVAL when the input ends in the middle of the first
+ *		character; not written otherwise. (The parameter is
+ *		deliberately not called "errno" since that identifier is
+ *		reserved for the macro defined by <errno.h>.)
+ *	state	normalization state that is kept up to date while collecting.
+ *
+ * The collected sequence cannot be bigger than 32 characters since if
+ * it is having more than 31 characters, the sequence will be terminated
+ * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
+ * a Stream-Safe Text. The collected sequence is always terminated with
+ * a null byte. The return value is the byte length of the sequence, which
+ * can be zero, and does not count the terminating null byte.
+ */
+static size_t
+collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
+	boolean_t is_it_toupper,
+	boolean_t is_it_tolower,
+	boolean_t canonical_decomposition,
+	boolean_t compatibility_decomposition,
+	boolean_t canonical_composition,
+	int *errnum, u8_normalization_states_t *state)
+{
+	uchar_t *s;
+	int sz;
+	int saved_sz;
+	size_t i;
+	size_t j;
+	size_t k;
+	size_t l;
+	uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
+	uchar_t disp[U8_MAX_CHARS_A_SEQ];
+	uchar_t start[U8_MAX_CHARS_A_SEQ];
+	uchar_t u8t[U8_MB_CUR_MAX];
+	uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
+	uchar_t tc;
+	size_t last;
+	size_t saved_last;
+	uint32_t u1;
+
+	/*
+	 * Save the source string pointer which we will return a changed
+	 * pointer if we do processing.
+	 */
+	s = *source;
+
+	/*
+	 * The following is a fallback for just in case callers are not
+	 * checking the string boundaries before the calling.
+	 */
+	if (s >= slast) {
+		u8s[0] = '\0';
+
+		return (0);
+	}
+
+	/*
+	 * As the first thing, let's collect a character and do case
+	 * conversion if necessary.
+	 */
+
+	sz = u8_number_of_bytes[*s];
+
+	/* An illegal byte is handed back as a one byte sequence. */
+	if (sz < 0) {
+		*errnum = EILSEQ;
+
+		u8s[0] = *s++;
+		u8s[1] = '\0';
+
+		*source = s;
+
+		return (1);
+	}
+
+	if (sz == 1) {
+		if (is_it_toupper)
+			u8s[0] = U8_ASCII_TOUPPER(*s);
+		else if (is_it_tolower)
+			u8s[0] = U8_ASCII_TOLOWER(*s);
+		else
+			u8s[0] = *s;
+		s++;
+		u8s[1] = '\0';
+	} else if ((s + sz) > slast) {
+		/*
+		 * The input ends in the middle of this character; hand back
+		 * whatever bytes are left as the sequence.
+		 */
+		*errnum = EINVAL;
+
+		for (i = 0; s < slast; )
+			u8s[i++] = *s++;
+		u8s[i] = '\0';
+
+		*source = s;
+
+		return (i);
+	} else {
+		if (is_it_toupper || is_it_tolower) {
+			i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
+			s += sz;
+			sz = i;
+		} else {
+			for (i = 0; i < sz; )
+				u8s[i++] = *s++;
+			u8s[i] = '\0';
+		}
+	}
+
+	/*
+	 * And then canonical/compatibility decomposition followed by
+	 * an optional canonical composition. Please be noted that
+	 * canonical composition is done only when a decomposition is
+	 * done.
+	 *
+	 * From here on, 'saved_sz' is the number of bytes collected at u8s
+	 * and 'last' is the number of characters collected; comb_class[],
+	 * start[], and disp[] describe each collected character.
+	 */
+	if (canonical_decomposition || compatibility_decomposition) {
+		if (sz == 1) {
+			*state = U8_STATE_START;
+
+			saved_sz = 1;
+
+			comb_class[0] = 0;
+			start[0] = 0;
+			disp[0] = 1;
+
+			last = 1;
+		} else {
+			saved_sz = do_decomp(uv, u8s, u8s, sz,
+			    canonical_decomposition, state);
+
+			last = 0;
+
+			for (i = 0; i < saved_sz; ) {
+				sz = u8_number_of_bytes[u8s[i]];
+
+				comb_class[last] = combining_class(uv,
+				    u8s + i, sz);
+				start[last] = i;
+				disp[last] = sz;
+
+				last++;
+				i += sz;
+			}
+
+			/*
+			 * Decomposition yields various Hangul related
+			 * states but not on combining marks. We need to
+			 * find out at here by checking on the last
+			 * character.
+			 */
+			if (*state == U8_STATE_START) {
+				if (comb_class[last - 1])
+					*state = U8_STATE_COMBINING_MARK;
+			}
+		}
+
+		saved_last = last;
+
+		while (s < slast) {
+			sz = u8_number_of_bytes[*s];
+
+			/*
+			 * If this is an illegal character, an incomplete
+			 * character, or an 7-bit ASCII Starter character,
+			 * then we have collected a sequence; break and let
+			 * the next call deal with the two cases.
+			 *
+			 * Note that this is okay only if you are using this
+			 * function with a fixed length string, not on
+			 * a buffer with multiple calls of one chunk at a time.
+			 */
+			if (sz <= 1) {
+				break;
+			} else if ((s + sz) > slast) {
+				break;
+			} else {
+				/*
+				 * If the previous character was a Hangul Jamo
+				 * and this character is a Hangul Jamo that
+				 * can be conjoined, we collect the Jamo.
+				 */
+				if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
+					U8_PUT_3BYTES_INTO_UTF32(u1,
+					    *s, *(s + 1), *(s + 2));
+
+					if (U8_HANGUL_COMPOSABLE_L_V(*state,
+					    u1)) {
+						i = 0;
+						*state = U8_STATE_HANGUL_LV;
+						goto COLLECT_A_HANGUL;
+					}
+
+					if (U8_HANGUL_COMPOSABLE_LV_T(*state,
+					    u1)) {
+						i = 0;
+						*state = U8_STATE_HANGUL_LVT;
+						goto COLLECT_A_HANGUL;
+					}
+				}
+
+				/*
+				 * Regardless of whatever it was, if this is
+				 * a Starter, we don't collect the character
+				 * since that's a new start and we will deal
+				 * with it at the next time.
+				 */
+				i = combining_class(uv, s, sz);
+				if (i == U8_COMBINING_CLASS_STARTER)
+					break;
+
+				/*
+				 * We know the current character is a combining
+				 * mark. If the previous character wasn't
+				 * a Starter (not Hangul) or a combining mark,
+				 * then, we don't collect this combining mark.
+				 */
+				if (*state != U8_STATE_START &&
+				    *state != U8_STATE_COMBINING_MARK)
+					break;
+
+				*state = U8_STATE_COMBINING_MARK;
+COLLECT_A_HANGUL:
+				/*
+				 * If we collected a Starter and combining
+				 * marks up to 30, i.e., total 31 characters,
+				 * then, we terminate this degenerately long
+				 * combining sequence with a U+034F COMBINING
+				 * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
+				 * UTF-8 and turn this into a Stream-Safe
+				 * Text. This will be extremely rare but
+				 * possible.
+				 *
+				 * The following will also guarantee that
+				 * we are not writing more than 32 characters
+				 * plus a NULL at u8s[].
+				 */
+				if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
+TURN_STREAM_SAFE:
+					*state = U8_STATE_START;
+					comb_class[last] = 0;
+					start[last] = saved_sz;
+					disp[last] = 2;
+					last++;
+
+					u8s[saved_sz++] = 0xCD;
+					u8s[saved_sz++] = 0x8F;
+
+					break;
+				}
+
+				/*
+				 * Some combining marks also do decompose into
+				 * another combining mark or marks.
+				 *
+				 * The decomposition goes to uts[] first. If
+				 * recording its characters would reach the
+				 * upper limit, 'last' is restored from 'k'
+				 * and the sequence is closed with a CGJ
+				 * without consuming this mark.
+				 */
+				if (*state == U8_STATE_COMBINING_MARK) {
+					k = last;
+					l = sz;
+					i = do_decomp(uv, uts, s, sz,
+					    canonical_decomposition, state);
+					for (j = 0; j < i; ) {
+						sz = u8_number_of_bytes[uts[j]];
+
+						comb_class[last] =
+						    combining_class(uv,
+						    uts + j, sz);
+						start[last] = saved_sz + j;
+						disp[last] = sz;
+
+						last++;
+						if (last >=
+						    U8_UPPER_LIMIT_IN_A_SEQ) {
+							last = k;
+							goto TURN_STREAM_SAFE;
+						}
+						j += sz;
+					}
+
+					*state = U8_STATE_COMBINING_MARK;
+					sz = i;
+					s += l;
+
+					for (i = 0; i < sz; i++)
+						u8s[saved_sz++] = uts[i];
+				} else {
+					comb_class[last] = i;
+					start[last] = saved_sz;
+					disp[last] = sz;
+					last++;
+
+					for (i = 0; i < sz; i++)
+						u8s[saved_sz++] = *s++;
+				}
+
+				/*
+				 * If this is U+0345 COMBINING GREEK
+				 * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
+				 * iota subscript, and need to be converted to
+				 * uppercase letter, convert it to U+0399 GREEK
+				 * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
+				 * i.e., convert to capital adscript form as
+				 * specified in the Unicode standard.
+				 *
+				 * This is the only special case of (ambiguous)
+				 * case conversion at combining marks and
+				 * probably the standard will never have
+				 * anything similar like this in future.
+				 */
+				if (is_it_toupper && sz >= 2 &&
+				    u8s[saved_sz - 2] == 0xCD &&
+				    u8s[saved_sz - 1] == 0x85) {
+					u8s[saved_sz - 2] = 0xCE;
+					u8s[saved_sz - 1] = 0x99;
+				}
+			}
+		}
+
+		/*
+		 * Let's try to ensure a canonical ordering for the collected
+		 * combining marks. We do this only if we have collected
+		 * at least one more non-Starter. (The decomposition mapping
+		 * data tables have fully (and recursively) expanded and
+		 * canonically ordered decompositions.)
+		 *
+		 * The U8_SWAP_COMB_MARKS() convenience macro has some
+		 * assumptions and we are meeting the assumptions.
+		 *
+		 * From here on, 'last' is the index of the last collected
+		 * character and no longer the count.
+		 */
+		last--;
+		if (last >= saved_last) {
+			for (i = 0; i < last; i++)
+				for (j = last; j > i; j--)
+					if (comb_class[j] &&
+					    comb_class[j - 1] > comb_class[j]) {
+						U8_SWAP_COMB_MARKS(j - 1, j);
+					}
+		}
+
+		*source = s;
+
+		if (! canonical_composition) {
+			u8s[saved_sz] = '\0';
+			return (saved_sz);
+		}
+
+		/*
+		 * Now do the canonical composition. Note that we do this
+		 * only after a canonical or compatibility decomposition to
+		 * finish up NFC or NFKC.
+		 */
+		sz = do_composition(uv, u8s, comb_class, start, disp, last,
+		    &s, slast);
+	}
+
+	*source = s;
+
+	return ((size_t)sz);
+}
+
+/*
+ * The do_norm_compare() function does string comparison based on Unicode
+ * simple case mappings and Unicode Normalization definitions.
+ *
+ * It does so by collecting a sequence of character at a time and comparing
+ * the collected sequences from the strings.
+ *
+ * Arguments:
+ *	uv	Unicode version index handed on to collect_a_seq().
+ *	s1, s2	the two strings to compare.
+ *	n1, n2	number of bytes to look at in s1 and s2 respectively.
+ *	flag	the U8_TEXTPREP_TOUPPER, U8_TEXTPREP_TOLOWER, U8_CANON_DECOMP,
+ *		U8_COMPAT_DECOMP, and U8_CANON_COMP bits select the case
+ *		conversion and the normalization steps applied before
+ *		comparing.
+ *	errnum	handed on to collect_a_seq(), which stores an error number
+ *		there when it meets an illegal or an incomplete character.
+ *		This function itself never writes to it. (The parameter is
+ *		deliberately not called "errno" since that identifier is
+ *		reserved for the macro defined by <errno.h>.)
+ *
+ * The meanings on the return values are the same as the usual strcmp().
+ */
+static int
+do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
+	int flag, int *errnum)
+{
+	int result;
+	size_t sz1;
+	size_t sz2;
+	uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
+	uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
+	uchar_t *s1last;
+	uchar_t *s2last;
+	boolean_t is_it_toupper;
+	boolean_t is_it_tolower;
+	boolean_t canonical_decomposition;
+	boolean_t compatibility_decomposition;
+	boolean_t canonical_composition;
+	u8_normalization_states_t state;
+
+	s1last = s1 + n1;
+	s2last = s2 + n2;
+
+	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
+	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
+	canonical_decomposition = flag & U8_CANON_DECOMP;
+	compatibility_decomposition = flag & U8_COMPAT_DECOMP;
+	canonical_composition = flag & U8_CANON_COMP;
+
+	while (s1 < s1last && s2 < s2last) {
+		/*
+		 * If the current character is a 7-bit ASCII and the last
+		 * character, or, if the current character and the next
+		 * character are both some 7-bit ASCII characters then
+		 * we treat the current character as a sequence.
+		 *
+		 * In any other cases, we need to call collect_a_seq().
+		 *
+		 * The next byte is looked at only when it exists: the
+		 * left-hand side of the || below is true whenever the
+		 * current character is the last one.
+		 */
+
+		if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
+		    U8_ISASCII(*(s1 + 1)))) {
+			if (is_it_toupper)
+				u8s1[0] = U8_ASCII_TOUPPER(*s1);
+			else if (is_it_tolower)
+				u8s1[0] = U8_ASCII_TOLOWER(*s1);
+			else
+				u8s1[0] = *s1;
+			u8s1[1] = '\0';
+			sz1 = 1;
+			s1++;
+		} else {
+			state = U8_STATE_START;
+			sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
+			    is_it_toupper, is_it_tolower,
+			    canonical_decomposition,
+			    compatibility_decomposition,
+			    canonical_composition, errnum, &state);
+		}
+
+		if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
+		    U8_ISASCII(*(s2 + 1)))) {
+			if (is_it_toupper)
+				u8s2[0] = U8_ASCII_TOUPPER(*s2);
+			else if (is_it_tolower)
+				u8s2[0] = U8_ASCII_TOLOWER(*s2);
+			else
+				u8s2[0] = *s2;
+			u8s2[1] = '\0';
+			sz2 = 1;
+			s2++;
+		} else {
+			state = U8_STATE_START;
+			sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
+			    is_it_toupper, is_it_tolower,
+			    canonical_decomposition,
+			    compatibility_decomposition,
+			    canonical_composition, errnum, &state);
+		}
+
+		/*
+		 * Now compare the two characters. If they are the same,
+		 * we move on to the next character sequences.
+		 *
+		 * Two single byte sequences are compared directly; anything
+		 * else is compared as null terminated byte strings.
+		 */
+		if (sz1 == 1 && sz2 == 1) {
+			if (*u8s1 > *u8s2)
+				return (1);
+			if (*u8s1 < *u8s2)
+				return (-1);
+		} else {
+			result = strcmp((const char *)u8s1, (const char *)u8s2);
+			if (result != 0)
+				return (result);
+		}
+	}
+
+	/*
+	 * We compared until the end of either or both strings.
+	 *
+	 * If we reached to or went over the ends for the both, that means
+	 * they are the same.
+	 *
+	 * If we reached only one end, that means the other string has
+	 * something which then can be used to determine the return value.
+	 */
+	if (s1 >= s1last) {
+		if (s2 >= s2last)
+			return (0);
+		return (-1);
+	}
+	return (1);
+}
+
+/*
+ * The u8_strcmp() function compares two UTF-8 strings quite similar to
+ * the strcmp(). For the comparison, however, Unicode Normalization specific
+ * equivalency and Unicode simple case conversion mappings based equivalency
+ * can be requested and checked against.
+ *
+ * Arguments:
+ *	s1, s2	the null terminated strings to compare.
+ *	n	if not zero, the comparison is limited to the first n bytes
+ *		of each string.
+ *	flag	zero, or a combination of at most one of U8_STRCMP_CS,
+ *		U8_STRCMP_CI_UPPER, and U8_STRCMP_CI_LOWER with at most one
+ *		of U8_STRCMP_NFD, U8_STRCMP_NFC, U8_STRCMP_NFKD, and
+ *		U8_STRCMP_NFKC. When no case option is given, U8_STRCMP_CS
+ *		is assumed.
+ *	uv	the Unicode version to use.
+ *	errnum	always written: it is set to zero first and then to
+ *		ERANGE if uv is bigger than U8_UNICODE_LATEST, in which case
+ *		U8_UNICODE_LATEST is used for the comparison, or to EBADF if
+ *		the flag value is not a valid combination, in which case a
+ *		plain strcmp() or strncmp() is done. The comparison helpers
+ *		that are called may store an error number of their own.
+ *		(The parameter is deliberately not called "errno" since that
+ *		identifier is reserved for the macro defined by <errno.h>.)
+ *
+ * The meanings on the return values are the same as the usual strcmp().
+ */
+int
+u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
+		int *errnum)
+{
+	int f;
+	size_t n1;
+	size_t n2;
+
+	*errnum = 0;
+
+	/*
+	 * Check on the requested Unicode version, case conversion, and
+	 * normalization flag values.
+	 */
+
+	if (uv > U8_UNICODE_LATEST) {
+		*errnum = ERANGE;
+		uv = U8_UNICODE_LATEST;
+	}
+
+	if (flag == 0) {
+		flag = U8_STRCMP_CS;
+	} else {
+		f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
+		    U8_STRCMP_CI_LOWER);
+		if (f == 0) {
+			flag |= U8_STRCMP_CS;
+		} else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
+		    f != U8_STRCMP_CI_LOWER) {
+			*errnum = EBADF;
+			flag = U8_STRCMP_CS;
+		}
+
+		f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
+		if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
+		    f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
+			*errnum = EBADF;
+			flag = U8_STRCMP_CS;
+		}
+	}
+
+	/* A plain case sensitive comparison needs no text preparation. */
+	if (flag == U8_STRCMP_CS) {
+		return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
+	}
+
+	n1 = strlen(s1);
+	n2 = strlen(s2);
+	if (n != 0) {
+		if (n < n1)
+			n1 = n;
+		if (n < n2)
+			n2 = n;
+	}
+
+	/*
+	 * Simple case conversion can be done much faster and so we do
+	 * them separately here.
+	 */
+	if (flag == U8_STRCMP_CI_UPPER) {
+		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
+		    n1, n2, B_TRUE, errnum));
+	} else if (flag == U8_STRCMP_CI_LOWER) {
+		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
+		    n1, n2, B_FALSE, errnum));
+	}
+
+	return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
+	    flag, errnum));
+}
+
+/*
+ * The u8_textprep_str() function prepares the UTF-8 text at inarray
+ * according to the flag and writes the result to outarray.
+ *
+ * Arguments:
+ *	inarray, inlen	the input bytes and the number of them.
+ *	outarray, outlen
+ *			the output buffer and its size in bytes.
+ *	flag		U8_TEXTPREP_TOUPPER or U8_TEXTPREP_TOLOWER (not both)
+ *			for case conversion; one of U8_TEXTPREP_NFD,
+ *			U8_TEXTPREP_NFC, U8_TEXTPREP_NFKD, and
+ *			U8_TEXTPREP_NFKC for normalization;
+ *			U8_TEXTPREP_IGNORE_NULL to keep going at a null byte
+ *			instead of stopping there; U8_TEXTPREP_IGNORE_INVALID
+ *			to copy illegal or incomplete characters through
+ *			instead of failing.
+ *	unicode_version	the Unicode version to use.
+ *	errnum		receives the error number on failure. (The parameter
+ *			is deliberately not called "errno" since that
+ *			identifier is reserved for the macro defined by
+ *			<errno.h>.)
+ *
+ * The function returns (size_t)-1 right away, without touching *inlen and
+ * *outlen, with ERANGE if unicode_version is bigger than U8_UNICODE_LATEST,
+ * with EBADF if the flag value is not a valid combination, and with E2BIG if
+ * outarray is a NULL. It returns zero right away if inarray is a NULL or
+ * *inlen is zero.
+ *
+ * Otherwise, the text is processed until the input is exhausted, a null byte
+ * stops the processing, or an error happens, and then *inlen and *outlen are
+ * set to the number of bytes left unread and unwritten respectively. The
+ * errors at this stage also yield (size_t)-1: EILSEQ for an illegal byte and
+ * EINVAL for an incomplete character at the end (both only without
+ * U8_TEXTPREP_IGNORE_INVALID), and E2BIG when the output buffer is too small.
+ *
+ * On success, the return value is the number of illegal or incomplete
+ * characters that were copied through when no normalization is requested;
+ * with a normalization requested, the return value is zero.
+ */
+size_t
+u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
+	int flag, size_t unicode_version, int *errnum)
+{
+	int f;
+	int sz;
+	uchar_t *ib;
+	uchar_t *ibtail;
+	uchar_t *ob;
+	uchar_t *obtail;
+	boolean_t do_not_ignore_null;
+	boolean_t do_not_ignore_invalid;
+	boolean_t is_it_toupper;
+	boolean_t is_it_tolower;
+	boolean_t canonical_decomposition;
+	boolean_t compatibility_decomposition;
+	boolean_t canonical_composition;
+	size_t ret_val;
+	size_t i;
+	size_t j;
+	uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
+	u8_normalization_states_t state;
+
+	if (unicode_version > U8_UNICODE_LATEST) {
+		*errnum = ERANGE;
+		return ((size_t)-1);
+	}
+
+	f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
+	if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
+		*errnum = EBADF;
+		return ((size_t)-1);
+	}
+
+	/* 'f' keeps the normalization bits; it is tested again below. */
+	f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
+	if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
+	    f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
+		*errnum = EBADF;
+		return ((size_t)-1);
+	}
+
+	if (inarray == NULL || *inlen == 0)
+		return (0);
+
+	if (outarray == NULL) {
+		*errnum = E2BIG;
+		return ((size_t)-1);
+	}
+
+	ib = (uchar_t *)inarray;
+	ob = (uchar_t *)outarray;
+	ibtail = ib + *inlen;
+	obtail = ob + *outlen;
+
+	do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
+	do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
+	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
+	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
+
+	ret_val = 0;
+
+	/*
+	 * If we don't have a normalization flag set, we do the simple case
+	 * conversion based text preparation separately below. Text
+	 * preparation involving Normalization will be done in the false task
+	 * block, again, separately since it will take much more time and
+	 * resource than doing simple case conversions.
+	 */
+	if (f == 0) {
+		while (ib < ibtail) {
+			if (*ib == '\0' && do_not_ignore_null)
+				break;
+
+			sz = u8_number_of_bytes[*ib];
+
+			if (sz < 0) {
+				if (do_not_ignore_invalid) {
+					*errnum = EILSEQ;
+					ret_val = (size_t)-1;
+					break;
+				}
+
+				/*
+				 * The illegal byte is counted and then
+				 * copied through as a single byte below.
+				 */
+				sz = 1;
+				ret_val++;
+			}
+
+			if (sz == 1) {
+				if (ob >= obtail) {
+					*errnum = E2BIG;
+					ret_val = (size_t)-1;
+					break;
+				}
+
+				if (is_it_toupper)
+					*ob = U8_ASCII_TOUPPER(*ib);
+				else if (is_it_tolower)
+					*ob = U8_ASCII_TOLOWER(*ib);
+				else
+					*ob = *ib;
+				ib++;
+				ob++;
+			} else if ((ib + sz) > ibtail) {
+				if (do_not_ignore_invalid) {
+					*errnum = EINVAL;
+					ret_val = (size_t)-1;
+					break;
+				}
+
+				if ((obtail - ob) < (ibtail - ib)) {
+					*errnum = E2BIG;
+					ret_val = (size_t)-1;
+					break;
+				}
+
+				/*
+				 * We treat the remaining incomplete character
+				 * bytes as a character.
+				 */
+				ret_val++;
+
+				while (ib < ibtail)
+					*ob++ = *ib++;
+			} else {
+				if (is_it_toupper || is_it_tolower) {
+					i = do_case_conv(unicode_version, u8s,
+					    ib, sz, is_it_toupper);
+
+					if ((obtail - ob) < i) {
+						*errnum = E2BIG;
+						ret_val = (size_t)-1;
+						break;
+					}
+
+					ib += sz;
+
+					for (sz = 0; sz < i; sz++)
+						*ob++ = u8s[sz];
+				} else {
+					if ((obtail - ob) < sz) {
+						*errnum = E2BIG;
+						ret_val = (size_t)-1;
+						break;
+					}
+
+					for (i = 0; i < sz; i++)
+						*ob++ = *ib++;
+				}
+			}
+		}
+	} else {
+		canonical_decomposition = flag & U8_CANON_DECOMP;
+		compatibility_decomposition = flag & U8_COMPAT_DECOMP;
+		canonical_composition = flag & U8_CANON_COMP;
+
+		while (ib < ibtail) {
+			if (*ib == '\0' && do_not_ignore_null)
+				break;
+
+			/*
+			 * If the current character is a 7-bit ASCII
+			 * character and it is the last character, or,
+			 * if the current character is a 7-bit ASCII
+			 * character and the next character is also a 7-bit
+			 * ASCII character, then, we copy over this
+			 * character without going through collect_a_seq().
+			 *
+			 * In any other cases, we need to look further with
+			 * the collect_a_seq() function.
+			 *
+			 * The next byte is looked at only when it exists:
+			 * the left-hand side of the || below is true whenever
+			 * the current character is the last one.
+			 */
+			if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
+			    U8_ISASCII(*(ib + 1)))) {
+				if (ob >= obtail) {
+					*errnum = E2BIG;
+					ret_val = (size_t)-1;
+					break;
+				}
+
+				if (is_it_toupper)
+					*ob = U8_ASCII_TOUPPER(*ib);
+				else if (is_it_tolower)
+					*ob = U8_ASCII_TOLOWER(*ib);
+				else
+					*ob = *ib;
+				ib++;
+				ob++;
+			} else {
+				/*
+				 * Clear the error number first so that an
+				 * error reported by collect_a_seq() for this
+				 * sequence can be told from an earlier one.
+				 */
+				*errnum = 0;
+				state = U8_STATE_START;
+
+				j = collect_a_seq(unicode_version, u8s,
+				    &ib, ibtail,
+				    is_it_toupper,
+				    is_it_tolower,
+				    canonical_decomposition,
+				    compatibility_decomposition,
+				    canonical_composition,
+				    errnum, &state);
+
+				if (*errnum && do_not_ignore_invalid) {
+					ret_val = (size_t)-1;
+					break;
+				}
+
+				if ((obtail - ob) < j) {
+					*errnum = E2BIG;
+					ret_val = (size_t)-1;
+					break;
+				}
+
+				for (i = 0; i < j; i++)
+					*ob++ = u8s[i];
+			}
+		}
+	}
+
+	*inlen = ibtail - ib;
+	*outlen = obtail - ob;
+
+	return (ret_val);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/common/unicode/uconv.c	Fri Sep 14 10:25:36 2007 -0700
@@ -0,0 +1,851 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
+ * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
+ * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
+ * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
+ * the section 3C man pages.
+ * Interface stability: Committed
+ */
+
+#include <sys/types.h>
+#ifdef	_KERNEL
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/debug.h>
+#include <sys/kmem.h>
+#include <sys/sunddi.h>
+#else
+#include <sys/u8_textprep.h>
+#endif	/* _KERNEL */
+#include <sys/byteorder.h>
+#include <sys/errno.h>
+
+
+/*
+ * The max and min values of high and low surrogate pairs of UTF-16,
+ * UTF-16 bit shift value, bit mask, and starting value outside of BMP.
+ */
+#define	UCONV_U16_HI_MIN	(0xd800U)
+#define	UCONV_U16_HI_MAX	(0xdbffU)
+#define	UCONV_U16_LO_MIN	(0xdc00U)
+#define	UCONV_U16_LO_MAX	(0xdfffU)
+#define	UCONV_U16_BIT_SHIFT	(0x0400U)
+#define	UCONV_U16_BIT_MASK	(0x0fffffU)
+#define	UCONV_U16_START		(0x010000U)
+
+/* The maximum value of Unicode coding space and ASCII coding space. */
+#define	UCONV_UNICODE_MAX	(0x10ffffU)
+#define	UCONV_ASCII_MAX		(0x7fU)
+
+/* The mask values for input and output endians. */
+#define	UCONV_IN_ENDIAN_MASKS	(UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
+#define	UCONV_OUT_ENDIAN_MASKS	(UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)
+
+/* Native and reversed endian macros. */
+#ifdef	_BIG_ENDIAN
+#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_BIG_ENDIAN
+#define	UCONV_IN_REV_ENDIAN	UCONV_IN_LITTLE_ENDIAN
+#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_BIG_ENDIAN
+#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
+#else
+#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_LITTLE_ENDIAN
+#define	UCONV_IN_REV_ENDIAN	UCONV_IN_BIG_ENDIAN
+#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
+#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_BIG_ENDIAN
+#endif	/* _BIG_ENDIAN */
+
+/* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */
+#define	UCONV_BOM_NORMAL	(0xfeffU)
+#define	UCONV_BOM_SWAPPED	(0xfffeU)
+#define	UCONV_BOM_SWAPPED_32	(0xfffe0000U)
+
+/* UTF-32 boundaries based on UTF-8 character byte lengths. */
+#define	UCONV_U8_ONE_BYTE	(0x7fU)
+#define	UCONV_U8_TWO_BYTES	(0x7ffU)
+#define	UCONV_U8_THREE_BYTES	(0xffffU)
+#define	UCONV_U8_FOUR_BYTES	(0x10ffffU)
+
+/* The common minimum and maximum values at the UTF-8 character bytes. */
+#define	UCONV_U8_BYTE_MIN	(0x80U)
+#define	UCONV_U8_BYTE_MAX	(0xbfU)
+
+/*
+ * The following "6" and "0x3f" came from "10xx xxxx" bit representation of
+ * UTF-8 character bytes.
+ */
+#define	UCONV_U8_BIT_SHIFT	6
+#define	UCONV_U8_BIT_MASK	0x3f
+
+/*
+ * The following vector gives the number of bytes that follow the first byte
+ * of a UTF-8 character.  The index is the value of that first byte.
+ */
+static const uchar_t remaining_bytes_tbl[0x100] = {
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+
+/*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF */
+	0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+
+/*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF */
+	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+
+/*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF */
+	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+
+/*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF */
+	3,  3,  3,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+};
+
+/*
+ * The following is a vector of bit-masks that select the payload bits of
+ * the first byte of a UTF-8 character.  The index is the number of remaining
+ * bytes of that character, as given by remaining_bytes_tbl[] above.
+ */
+static const uchar_t masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
+
+/*
+ * The following two vectors provide the valid minimum and maximum
+ * values for the second byte of a multibyte UTF-8 character, for
+ * stricter illegal sequence checking. The index value must be the value of
+ * the first byte of the UTF-8 character.
+ */
+static const uchar_t valid_min_2nd_byte[0x100] = {
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+
+/*	C0    C1    C2    C3    C4    C5    C6    C7 */
+	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+
+/*	C8    C9    CA    CB    CC    CD    CE    CF */
+	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+
+/*	D0    D1    D2    D3    D4    D5    D6    D7 */
+	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+
+/*	D8    D9    DA    DB    DC    DD    DE    DF */
+	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+
+/*	E0    E1    E2    E3    E4    E5    E6    E7 */
+	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+
+/*	E8    E9    EA    EB    EC    ED    EE    EF */
+	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+
+/*	F0    F1    F2    F3    F4    F5    F6    F7 */
+	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
+
+	0,    0,    0,    0,    0,    0,    0,    0
+};
+
+static const uchar_t valid_max_2nd_byte[0x100] = {
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+	0,    0,    0,    0,    0,    0,    0,    0,
+
+/*	C0    C1    C2    C3    C4    C5    C6    C7 */
+	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+
+/*	C8    C9    CA    CB    CC    CD    CE    CF */
+	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+
+/*	D0    D1    D2    D3    D4    D5    D6    D7 */
+	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+
+/*	D8    D9    DA    DB    DC    DD    DE    DF */
+	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+
+/*	E0    E1    E2    E3    E4    E5    E6    E7 */
+	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+
+/*	E8    E9    EA    EB    EC    ED    EE    EF */
+	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
+
+/*	F0    F1    F2    F3    F4    F5    F6    F7 */
+	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
+
+	0,    0,    0,    0,    0,    0,    0,    0
+};
+
+/* Split flag into *in and *out endian selections; EBADF on a conflict. */
+static int
+check_endian(int flag, int *in, int *out)
+{
+	*in = flag & UCONV_IN_ENDIAN_MASKS;
+
+	/* Asking for both big and little endian input is contradictory. */
+	if (*in == UCONV_IN_ENDIAN_MASKS)
+		return (EBADF);
+	/* No input endian requested: default to the native byte order. */
+	if (*in == 0)
+		*in = UCONV_IN_NAT_ENDIAN;
+
+	*out = flag & UCONV_OUT_ENDIAN_MASKS;
+
+	/* Asking for both big and little endian output is contradictory. */
+	if (*out == UCONV_OUT_ENDIAN_MASKS)
+		return (EBADF);
+	/* No output endian requested: default to the native byte order. */
+	if (*out == 0)
+		*out = UCONV_OUT_NAT_ENDIAN;
+
+	return (0);
+}
+/* If u16s starts with a BOM, set *in to its endian and return B_TRUE. */
+static boolean_t
+check_bom16(const uint16_t *u16s, size_t u16l, int *in)
+{
+	if (u16l > 0) {
+		if (*u16s == UCONV_BOM_NORMAL) {
+			*in = UCONV_IN_NAT_ENDIAN;
+			return (B_TRUE);
+		}
+		if (*u16s == UCONV_BOM_SWAPPED) {
+			*in = UCONV_IN_REV_ENDIAN;
+			return (B_TRUE);
+		}
+	}
+	/* Empty input or no BOM at the start: *in is left untouched. */
+	return (B_FALSE);
+}
+/* If u32s starts with a BOM, set *in to its endian and return B_TRUE. */
+static boolean_t
+check_bom32(const uint32_t *u32s, size_t u32l, int *in)
+{
+	if (u32l > 0) {
+		if (*u32s == UCONV_BOM_NORMAL) {
+			*in = UCONV_IN_NAT_ENDIAN;
+			return (B_TRUE);
+		}
+		if (*u32s == UCONV_BOM_SWAPPED_32) {
+			*in = UCONV_IN_REV_ENDIAN;
+			return (B_TRUE);
+		}
+	}
+	/* Empty input or no BOM at the start: *in is left untouched. */
+	return (B_FALSE);
+}
+/* Convert UTF-16 in u16s to UTF-32 in u32s; returns 0 or an errno value. */
+int
+uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
+    uint32_t *u32s, size_t *utf32len, int flag)
+{
+	int inendian;
+	int outendian;
+	size_t u16l;
+	size_t u32l;
+	uint32_t hi;
+	uint32_t lo;
+	boolean_t do_not_ignore_null;
+
+	/*
+	 * Reject NULL input (EILSEQ) and NULL output (E2BIG) arguments, then
+	 * decode the endian bits of flag (EBADF if they are contradictory).
+	 */
+	if (u16s == NULL || utf16len == NULL)
+		return (EILSEQ);
+
+	if (u32s == NULL || utf32len == NULL)
+		return (E2BIG);
+
+	if (check_endian(flag, &inendian, &outendian) != 0)
+		return (EBADF);
+
+	/*
+	 * Initialize input and output parameter buffer indices and
+	 * temporary variables.
+	 */
+	u16l = u32l = 0;
+	hi = 0;
+	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
+
+	/*
+	 * Check on the BOM at the beginning of the input buffer if required;
+	 * if there is one, it overrides the input endian and is skipped.
+	 */
+	if ((flag & UCONV_IN_ACCEPT_BOM) &&
+	    check_bom16(u16s, *utf16len, &inendian))
+		u16l++;
+
+	/*
+	 * Reduce inendian and outendian to booleans: from here on, non-zero
+	 * means the native byte order and zero means the reversed one.
+	 */
+	inendian &= UCONV_IN_NAT_ENDIAN;
+	outendian &= UCONV_OUT_NAT_ENDIAN;
+
+	/*
+	 * If there is something in the input buffer and if necessary and
+	 * requested, save the BOM at the output buffer.
+	 */
+	if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
+		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
+		    UCONV_BOM_SWAPPED_32;
+
+	/*
+	 * Do conversion; if encounter a surrogate pair, assemble high and
+	 * low pair values to form a UTF-32 character. If a half of a pair
+	 * exists alone, then, either it is an illegal (EILSEQ) or
+	 * invalid (EINVAL) value.
+	 */
+	for (; u16l < *utf16len; u16l++) {
+		if (u16s[u16l] == 0 && do_not_ignore_null)
+			break;
+		/* Bring the code unit into native byte order if necessary. */
+		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
+		/* Hold a high surrogate in hi until its low half arrives. */
+		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
+			if (hi)
+				return (EILSEQ);
+			hi = lo;
+			continue;
+		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
+			if (! hi)
+				return (EILSEQ);
+			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
+			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
+			    + UCONV_U16_START;
+			hi = 0;
+		} else if (hi) {
+			return (EILSEQ);
+		}
+		/* Store the character; a full output buffer is E2BIG. */
+		if (u32l >= *utf32len)
+			return (E2BIG);
+
+		u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
+	}
+
+	/*
+	 * If high half didn't see low half, then, it's most likely the input
+	 * parameter is incomplete.
+	 */
+	if (hi)
+		return (EINVAL);
+
+	/*
+	 * Save the number of consumed and saved characters. They do not
+	 * include terminating NULL character (U+0000) at the end of
+	 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
+	 * the input buffer length is big enough to include the terminating
+	 * NULL character).
+	 */
+	*utf16len = u16l;
+	*utf32len = u32l;
+	/* The two lengths above are only updated on this success path. */
+	return (0);
+}
+/* Convert UTF-16 in u16s to UTF-8 in u8s; returns 0 or an errno value. */
+int
+uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
+    uchar_t *u8s, size_t *utf8len, int flag)
+{
+	int inendian;
+	int outendian;
+	size_t u16l;
+	size_t u8l;
+	uint32_t hi;
+	uint32_t lo;
+	boolean_t do_not_ignore_null;
+	/* A NULL input argument is EILSEQ; a NULL output argument is E2BIG. */
+	if (u16s == NULL || utf16len == NULL)
+		return (EILSEQ);
+
+	if (u8s == NULL || utf8len == NULL)
+		return (E2BIG);
+
+	if (check_endian(flag, &inendian, &outendian) != 0)
+		return (EBADF);
+
+	u16l = u8l = 0;
+	hi = 0;
+	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
+	/* A leading BOM, if accepted, sets inendian and is skipped. */
+	if ((flag & UCONV_IN_ACCEPT_BOM) &&
+	    check_bom16(u16s, *utf16len, &inendian))
+		u16l++;
+	/* From here on, non-zero inendian means native byte order. */
+	inendian &= UCONV_IN_NAT_ENDIAN;
+
+	for (; u16l < *utf16len; u16l++) {
+		if (u16s[u16l] == 0 && do_not_ignore_null)
+			break;
+		/* Bring the code unit into native byte order if necessary. */
+		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
+		/* Hold a high surrogate in hi until its low half arrives. */
+		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
+			if (hi)
+				return (EILSEQ);
+			hi = lo;
+			continue;
+		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
+			if (! hi)
+				return (EILSEQ);
+			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
+			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
+			    + UCONV_U16_START;
+			hi = 0;
+		} else if (hi) {
+			return (EILSEQ);
+		}
+
+		/*
+		 * Now we convert a UTF-32 character into a UTF-8 character.
+		 * Unicode coding space is between U+0000 and U+10FFFF;
+		 * anything bigger is an illegal character.
+		 */
+		if (lo <= UCONV_U8_ONE_BYTE) {
+			if (u8l >= *utf8len)
+				return (E2BIG);
+			u8s[u8l++] = (uchar_t)lo;
+		} else if (lo <= UCONV_U8_TWO_BYTES) {
+			if ((u8l + 1) >= *utf8len)
+				return (E2BIG);
+			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
+			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
+		} else if (lo <= UCONV_U8_THREE_BYTES) {
+			if ((u8l + 2) >= *utf8len)
+				return (E2BIG);
+			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
+			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
+			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
+		} else if (lo <= UCONV_U8_FOUR_BYTES) {
+			if ((u8l + 3) >= *utf8len)
+				return (E2BIG);
+			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
+			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
+			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
+			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
+		} else {
+			return (EILSEQ);
+		}
+	}
+	/* The input stopped right after a high surrogate. */
+	if (hi)
+		return (EINVAL);
+	/* Report consumed and produced counts; only done on success. */
+	*utf16len = u16l;
+	*utf8len = u8l;
+
+	return (0);
+}
+/* Convert UTF-32 in u32s to UTF-16 in u16s; returns 0 or an errno value. */
+int
+uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
+    uint16_t *u16s, size_t *utf16len, int flag)
+{
+	int inendian;
+	int outendian;
+	size_t u16l;
+	size_t u32l;
+	uint32_t hi;
+	uint32_t lo;
+	boolean_t do_not_ignore_null;
+	/* A NULL input argument is EILSEQ; a NULL output argument is E2BIG. */
+	if (u32s == NULL || utf32len == NULL)
+		return (EILSEQ);
+
+	if (u16s == NULL || utf16len == NULL)
+		return (E2BIG);
+
+	if (check_endian(flag, &inendian, &outendian) != 0)
+		return (EBADF);
+
+	u16l = u32l = 0;
+	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
+	/* A leading BOM, if accepted, sets inendian and is skipped. */
+	if ((flag & UCONV_IN_ACCEPT_BOM) &&
+	    check_bom32(u32s, *utf32len, &inendian))
+		u32l++;
+	/* From here on, non-zero means native byte order for both. */
+	inendian &= UCONV_IN_NAT_ENDIAN;
+	outendian &= UCONV_OUT_NAT_ENDIAN;
+	/* Emit a BOM first if requested and both lengths are non-zero. */
+	if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
+		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
+		    UCONV_BOM_SWAPPED;
+
+	for (; u32l < *utf32len; u32l++) {
+		if (u32s[u32l] == 0 && do_not_ignore_null)
+			break;
+		/* Bring the character into native byte order if necessary. */
+		hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
+
+		/*
+		 * Anything bigger than the Unicode coding space, i.e.,
+		 * Unicode scalar value bigger than U+10FFFF, is an illegal
+		 * character.
+		 */
+		if (hi > UCONV_UNICODE_MAX)
+			return (EILSEQ);
+		/* NOTE(review): surrogate code points are not rejected. */
+		/*
+		 * Anything bigger than U+FFFF must be converted into
+		 * a surrogate pair in UTF-16.
+		 */
+		if (hi >= UCONV_U16_START) {
+			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
+			    UCONV_U16_LO_MIN;
+			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
+			    UCONV_U16_HI_MIN;
+			/* The pair needs two output slots. */
+			if ((u16l + 1) >= *utf16len)
+				return (E2BIG);
+
+			if (outendian) {
+				u16s[u16l++] = (uint16_t)hi;
+				u16s[u16l++] = (uint16_t)lo;
+			} else {
+				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
+				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
+			}
+		} else {
+			if (u16l >= *utf16len)
+				return (E2BIG);
+			u16s[u16l++] = (outendian) ? (uint16_t)hi :
+			    BSWAP_16(((uint16_t)hi));
+		}
+	}
+	/* Report consumed and produced counts; only done on success. */
+	*utf16len = u16l;
+	*utf32len = u32l;
+
+	return (0);
+}
+/* Convert UTF-32 in u32s to UTF-8 in u8s; returns 0 or an errno value. */
+int
+uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
+    uchar_t *u8s, size_t *utf8len, int flag)
+{
+	int inendian;
+	int outendian;
+	size_t u32l;
+	size_t u8l;
+	uint32_t lo;
+	boolean_t do_not_ignore_null;
+	/* A NULL input argument is EILSEQ; a NULL output argument is E2BIG. */
+	if (u32s == NULL || utf32len == NULL)
+		return (EILSEQ);
+
+	if (u8s == NULL || utf8len == NULL)
+		return (E2BIG);
+
+	if (check_endian(flag, &inendian, &outendian) != 0)
+		return (EBADF);
+
+	u32l = u8l = 0;
+	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
+	/* A leading BOM, if accepted, sets inendian and is skipped. */
+	if ((flag & UCONV_IN_ACCEPT_BOM) &&
+	    check_bom32(u32s, *utf32len, &inendian))
+		u32l++;
+	/* From here on, non-zero inendian means native byte order. */
+	inendian &= UCONV_IN_NAT_ENDIAN;
+
+	for (; u32l < *utf32len; u32l++) {
+		if (u32s[u32l] == 0 && do_not_ignore_null)
+			break;
+		/* Bring the character into native byte order if necessary. */
+		lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
+		/* NOTE(review): surrogate values are encoded as-is. */
+		if (lo <= UCONV_U8_ONE_BYTE) {
+			if (u8l >= *utf8len)
+				return (E2BIG);
+			u8s[u8l++] = (uchar_t)lo;
+		} else if (lo <= UCONV_U8_TWO_BYTES) {
+			if ((u8l + 1) >= *utf8len)
+				return (E2BIG);
+			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
+			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
+		} else if (lo <= UCONV_U8_THREE_BYTES) {
+			if ((u8l + 2) >= *utf8len)
+				return (E2BIG);
+			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
+			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
+			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
+		} else if (lo <= UCONV_U8_FOUR_BYTES) {
+			if ((u8l + 3) >= *utf8len)
+				return (E2BIG);
+			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
+			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
+			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
+			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
+		} else {
+			return (EILSEQ);
+		}
+	}
+	/* Report consumed and produced counts; only done on success. */
+	*utf32len = u32l;
+	*utf8len = u8l;
+
+	return (0);
+}
+/* Convert UTF-8 in u8s to UTF-16 in u16s; returns 0 or an errno value. */
+int
+uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
+    uint16_t *u16s, size_t *utf16len, int flag)
+{
+	int inendian;
+	int outendian;
+	size_t u16l;
+	size_t u8l;
+	uint32_t hi;
+	uint32_t lo;
+	int remaining_bytes;
+	int first_b;
+	boolean_t do_not_ignore_null;
+	/* A NULL input argument is EILSEQ; a NULL output argument is E2BIG. */
+	if (u8s == NULL || utf8len == NULL)
+		return (EILSEQ);
+
+	if (u16s == NULL || utf16len == NULL)
+		return (E2BIG);
+
+	if (check_endian(flag, &inendian, &outendian) != 0)
+		return (EBADF);
+
+	u16l = u8l = 0;
+	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
+	/* Only outendian matters here: non-zero means native byte order. */
+	outendian &= UCONV_OUT_NAT_ENDIAN;
+	/* Emit a BOM first if requested and both lengths are non-zero. */
+	if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
+		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
+		    UCONV_BOM_SWAPPED;
+
+	for (; u8l < *utf8len; ) {
+		if (u8s[u8l] == 0 && do_not_ignore_null)
+			break;
+
+		/*
+		 * Collect a UTF-8 character and convert it to a UTF-32
+		 * character. In doing so, we screen out illegally formed
+		 * UTF-8 characters and treat such as illegal characters.
+		 * The algorithm at below also screens out anything bigger
+		 * than the U+10FFFF.
+		 *
+		 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
+		 * more details on the illegal values of UTF-8 character
+		 * bytes.
+		 */
+		hi = (uint32_t)u8s[u8l++];
+		/* Anything above ASCII starts a multibyte character. */
+		if (hi > UCONV_ASCII_MAX) {
+			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
+				return (EILSEQ);
+			/* Keep the lead byte; hi gets its payload bits. */
+			first_b = hi;
+			hi = hi & masks_tbl[remaining_bytes];
+
+			for (; remaining_bytes > 0; remaining_bytes--) {
+				/*
+				 * If we have no more bytes, the current
+				 * UTF-8 character is incomplete.
+				 */
+				if (u8l >= *utf8len)
+					return (EINVAL);
+
+				lo = (uint32_t)u8s[u8l++];
+				/* 2nd byte range depends on the lead byte. */
+				if (first_b) {
+					if (lo < valid_min_2nd_byte[first_b] ||
+					    lo > valid_max_2nd_byte[first_b])
+						return (EILSEQ);
+					first_b = 0;
+				} else if (lo < UCONV_U8_BYTE_MIN ||
+				    lo > UCONV_U8_BYTE_MAX) {
+					return (EILSEQ);
+				}
+				hi = (hi << UCONV_U8_BIT_SHIFT) |
+				    (lo & UCONV_U8_BIT_MASK);
+			}
+		}
+		/* Above U+FFFF: split into a high and low surrogate pair. */
+		if (hi >= UCONV_U16_START) {
+			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
+			    UCONV_U16_LO_MIN;
+			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
+			    UCONV_U16_HI_MIN;
+			/* The pair needs two output slots. */
+			if ((u16l + 1) >= *utf16len)
+				return (E2BIG);
+
+			if (outendian) {
+				u16s[u16l++] = (uint16_t)hi;
+				u16s[u16l++] = (uint16_t)lo;
+			} else {
+				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
+				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
+			}
+		} else {
+			if (u16l >= *utf16len)
+				return (E2BIG);
+
+			u16s[u16l++] = (outendian) ? (uint16_t)hi :
+			    BSWAP_16(((uint16_t)hi));
+		}
+	}
+	/* Report consumed and produced counts; only done on success. */
+	*utf16len = u16l;
+	*utf8len = u8l;
+
+	return (0);
+}
+/* Convert UTF-8 in u8s to UTF-32 in u32s; returns 0 or an errno value. */
+int
+uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
+    uint32_t *u32s, size_t *utf32len, int flag)
+{
+	int inendian;
+	int outendian;
+	size_t u32l;
+	size_t u8l;
+	uint32_t hi;
+	uint32_t c;
+	int remaining_bytes;
+	int first_b;
+	boolean_t do_not_ignore_null;
+	/* A NULL input argument is EILSEQ; a NULL output argument is E2BIG. */
+	if (u8s == NULL || utf8len == NULL)
+		return (EILSEQ);
+
+	if (u32s == NULL || utf32len == NULL)
+		return (E2BIG);
+
+	if (check_endian(flag, &inendian, &outendian) != 0)
+		return (EBADF);
+
+	u32l = u8l = 0;
+	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
+	/* Only outendian matters here: non-zero means native byte order. */
+	outendian &= UCONV_OUT_NAT_ENDIAN;
+	/* Emit a BOM first if requested and both lengths are non-zero. */
+	if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
+		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
+		    UCONV_BOM_SWAPPED_32;
+
+	for (; u8l < *utf8len; ) {
+		if (u8s[u8l] == 0 && do_not_ignore_null)
+			break;
+
+		hi = (uint32_t)u8s[u8l++];
+		/* Anything above ASCII starts a multibyte character. */
+		if (hi > UCONV_ASCII_MAX) {
+			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
+				return (EILSEQ);
+			/* Keep the lead byte; hi gets its payload bits. */
+			first_b = hi;
+			hi = hi & masks_tbl[remaining_bytes];
+
+			for (; remaining_bytes > 0; remaining_bytes--) {
+				if (u8l >= *utf8len)
+					return (EINVAL);
+
+				c = (uint32_t)u8s[u8l++];
+				/* 2nd byte range depends on the lead byte. */
+				if (first_b) {
+					if (c < valid_min_2nd_byte[first_b] ||
+					    c > valid_max_2nd_byte[first_b])
+						return (EILSEQ);
+					first_b = 0;
+				} else if (c < UCONV_U8_BYTE_MIN ||
+				    c > UCONV_U8_BYTE_MAX) {
+					return (EILSEQ);
+				}
+				hi = (hi << UCONV_U8_BIT_SHIFT) |
+				    (c & UCONV_U8_BIT_MASK);
+			}
+		}
+		/* Store the character; a full output buffer is E2BIG. */
+		if (u32l >= *utf32len)
+			return (E2BIG);
+
+		u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
+	}
+	/* Report consumed and produced counts; only done on success. */
+	*utf32len = u32l;
+	*utf8len = u8l;
+
+	return (0);
+}
--- a/usr/src/lib/libc/Makefile.targ	Fri Sep 14 08:32:57 2007 -0700
+++ b/usr/src/lib/libc/Makefile.targ	Fri Sep 14 10:25:36 2007 -0700
@@ -276,6 +276,10 @@
 	$(COMPILE.c) -o $@ $(SRC)/common/dtrace/$(@F:.o=.c)
 	$(POST_PROCESS_O)

+$(UNICODEOBJS:%=pics/%): $(SRC)/common/unicode/$$(@F:.o=.c)
+	$(COMPILE.c) -o $@ $(SRC)/common/unicode/$(@F:.o=.c)
+	$(POST_PROCESS_O)
+
 # DTrace rules
 pics/%.o: ../port/threads/%.d $(THREADSOBJS:%=pics/%)
 	$(COMPILE.d) -C -xlazyload -s $< -o $@ $(THREADSOBJS:%=pics/%)
--- a/usr/src/lib/libc/amd64/Makefile	Fri Sep 14 08:32:57 2007 -0700
+++ b/usr/src/lib/libc/amd64/Makefile	Fri Sep 14 10:25:36 2007 -0700
@@ -762,6 +762,10 @@
 THREADSASMOBJS=			\
 	asm_subr.o

+UNICODEOBJS=			\
+	u8_textprep.o		\
+	uconv.o
+
 UNWINDMACHOBJS=			\
 	call_frame_inst.o	\
 	eh_frame.o		\
@@ -866,6 +870,7 @@
 	$(THREADSOBJS)		\
 	$(THREADSMACHOBJS)	\
 	$(THREADSASMOBJS)	\
+	$(UNICODEOBJS)		\
 	$(UNWINDMACHOBJS)	\
 	$(UNWINDASMOBJS)	\
 	$(COMSYSOBJS)		\
@@ -974,6 +979,7 @@
 	$(TPOOLOBJS:%.o=../port/tpool/%.c)		\
 	$(THREADSOBJS:%.o=../port/threads/%.c)		\
 	$(THREADSMACHOBJS:%.o=threads/%.c)		\
+	$(UNICODEOBJS:%.o=$(SRC)/common/unicode/%.c)	\
 	$(UNWINDMACHOBJS:%.o=unwind/%.c)		\
 	$(FPOBJS:%.o=fp/%.c)				\
 	$(I386FPOBJS:%.o=../i386/fp/%.c)		\
--- a/usr/src/lib/libc/i386/Makefile.com	Fri Sep 14 08:32:57 2007 -0700
+++ b/usr/src/lib/libc/i386/Makefile.com	Fri Sep 14 10:25:36 2007 -0700
@@ -804,6 +804,10 @@
 THREADSASMOBJS=			\
 	asm_subr.o

+UNICODEOBJS=			\
+	u8_textprep.o		\
+	uconv.o
+
 UNWINDMACHOBJS=			\
 	unwind.o

@@ -904,6 +908,7 @@
 	$(THREADSOBJS)		\
 	$(THREADSMACHOBJS)	\
 	$(THREADSASMOBJS)	\
+	$(UNICODEOBJS)		\
 	$(UNWINDMACHOBJS)	\
 	$(UNWINDASMOBJS)	\
 	$(COMSYSOBJS)		\
@@ -1035,6 +1040,7 @@
 	$(TPOOLOBJS:%.o=../port/tpool/%.c)		\
 	$(THREADSOBJS:%.o=../port/threads/%.c)		\
 	$(THREADSMACHOBJS:%.o=../$(MACH)/threads/%.c)	\
+	$(UNICODEOBJS:%.o=$(SRC)/common/unicode/%.c)	\
 	$(UNWINDMACHOBJS:%.o=../port/unwind/%.c)	\
 	$(FPOBJS:%.o=../$(MACH)/fp/%.c)			\
 	$(LIBCBASE)/gen/ecvt.c				\
--- a/usr/src/lib/libc/port/mapfile-vers	Fri Sep 14 08:32:57 2007 -0700
+++ b/usr/src/lib/libc/port/mapfile-vers	Fri Sep 14 10:25:36 2007 -0700
@@ -123,6 +123,15 @@
 	timer_getoverrun;
 	timer_gettime;
 	timer_settime;
+	u8_strcmp;
+	u8_textprep_str;
+	u8_validate;
+	uconv_u16tou32;
+	uconv_u16tou8;
+	uconv_u32tou16;
+	uconv_u32tou8;
+	uconv_u8tou16;
+	uconv_u8tou32;
 	uucopy;
 	uucopystr;
 	vforkx;
--- a/usr/src/lib/libc/sparc/Makefile	Fri Sep 14 08:32:57 2007 -0700
+++ b/usr/src/lib/libc/sparc/Makefile	Fri Sep 14 10:25:36 2007 -0700
@@ -830,6 +830,10 @@
 THREADSASMOBJS=			\
 	asm_subr.o

+UNICODEOBJS=			\
+	u8_textprep.o		\
+	uconv.o
+
 UNWINDMACHOBJS=			\
 	unwind.o

@@ -932,6 +936,7 @@
 	$(THREADSOBJS)		\
 	$(THREADSMACHOBJS)	\
 	$(THREADSASMOBJS)	\
+	$(UNICODEOBJS)		\
 	$(UNWINDMACHOBJS)	\
 	$(UNWINDASMOBJS)	\
 	$(COMSYSOBJS)		\
@@ -1053,6 +1058,7 @@
 	$(TPOOLOBJS:%.o=../port/tpool/%.c)		\
 	$(THREADSOBJS:%.o=../port/threads/%.c)		\
 	$(THREADSMACHOBJS:%.o=../$(MACH)/threads/%.c)	\
+	$(UNICODEOBJS:%.o=$(SRC)/common/unicode/%.c)	\
 	$(UNWINDMACHOBJS:%.o=../port/unwind/%.c)	\
 	$(FPOBJS:%.o=../$(MACH)/fp/%.c)			\
 	$(LIBCBASE)/crt/_ftou.c				\
--- a/usr/src/lib/libc/sparcv9/Makefile	Fri Sep 14 08:32:57 2007 -0700
+++ b/usr/src/lib/libc/sparcv9/Makefile	Fri Sep 14 10:25:36 2007 -0700
@@ -777,6 +777,10 @@
 THREADSASMOBJS=			\
 	asm_subr.o

+UNICODEOBJS=			\
+	u8_textprep.o		\
+	uconv.o
+
 UNWINDMACHOBJS=			\
 	unwind.o

@@ -875,6 +879,7 @@
 	$(THREADSOBJS)		\
 	$(THREADSMACHOBJS)	\
 	$(THREADSASMOBJS)	\
+	$(UNICODEOBJS)		\
 	$(UNWINDMACHOBJS)	\
 	$(UNWINDASMOBJS)	\
 	$(COMSYSOBJS)		\
@@ -986,6 +991,7 @@
 	$(TPOOLOBJS:%.o=../port/tpool/%.c)		\
 	$(THREADSOBJS:%.o=../port/threads/%.c)		\
 	$(THREADSMACHOBJS:%.o=../$(MACH)/threads/%.c)	\
+	$(UNICODEOBJS:%.o=$(SRC)/common/unicode/%.c)	\
 	$(UNWINDMACHOBJS:%.o=../port/unwind/%.c)	\
 	$(FPOBJS:%.o=../$(MACH)/fp/%.c)			\
 	$(FPOBJS64:%.o=$(LIBCBASE)/fp/%.c)		\
--- a/usr/src/pkgdefs/SUNWhea/prototype_com	Fri Sep 14 08:32:57 2007 -0700
+++ b/usr/src/pkgdefs/SUNWhea/prototype_com	Fri Sep 14 10:25:36 2007 -0700
@@ -1293,6 +1293,7 @@
 f none usr/include/sys/turnstile.h 644 root bin
 f none usr/include/sys/types.h 644 root bin
 f none usr/include/sys/types32.h 644 root bin
+f none usr/include/sys/u8_textprep.h 644 root bin
 f none usr/include/sys/uadmin.h 644 root bin
 f none usr/include/sys/ucontext.h 644 root bin
 f none usr/include/sys/uio.h 644 root bin
--- a/usr/src/uts/common/Makefile.rules	Fri Sep 14 08:32:57 2007 -0700
+++ b/usr/src/uts/common/Makefile.rules	Fri Sep 14 10:25:36 2007 -0700
@@ -951,6 +951,10 @@
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)

+$(OBJS_DIR)/%.o:		$(COMMONBASE)/unicode/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
 $(OBJS_DIR)/%.o:		$(UTSBASE)/common/vm/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
@@ -1673,6 +1677,9 @@
 $(LINTS_DIR)/%.ln:		$(COMMONBASE)/util/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))

+$(LINTS_DIR)/%.ln:		$(COMMONBASE)/unicode/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
 $(LINTS_DIR)/%.ln:		$(UTSBASE)/common/vm/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
--- a/usr/src/uts/common/os/u8_textprep.c	Fri Sep 14 08:32:57 2007 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2126 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
-
-/*
- * UTF-8 text preparation functions (PSARC/2007/149).
- *
- * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
- * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F).
- * Interface stability: Committed.
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/sysmacros.h>
-#include <sys/systm.h>
-#include <sys/debug.h>
-#include <sys/kmem.h>
-#include <sys/ddi.h>
-#include <sys/sunddi.h>
-#include <sys/byteorder.h>
-#include <sys/errno.h>
-#include <sys/u8_textprep_data.h>
-
-
-/* The maximum possible number of bytes in a UTF-8 character. */
-#define	U8_MB_CUR_MAX			(4)
-
-/*
- * The maximum number of bytes needed for a UTF-8 character to cover
- * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2.
- */
-#define	U8_MAX_BYTES_UCS2		(3)
-
-/* The maximum possible number of bytes in a Stream-Safe Text. */
-#define	U8_STREAM_SAFE_TEXT_MAX		(128)
-
-/*
- * The maximum number of characters in a combining/conjoining sequence and
- * the actual upperbound limit of a combining/conjoining sequence.
- */
-#define	U8_MAX_CHARS_A_SEQ		(32)
-#define	U8_UPPER_LIMIT_IN_A_SEQ		(31)
-
-/* The combining class value for Starter. */
-#define	U8_COMBINING_CLASS_STARTER	(0)
-
-/*
- * Some Hangul related macros at below.
- *
- * The first and the last of Hangul syllables, Hangul Jamo Leading consonants,
- * Vowels, and optional Trailing consonants in Unicode scalar values.
- *
- * Please be noted that the U8_HANGUL_JAMO_T_FIRST is 0x11A7 at below not
- * the actual U+11A8. This is due to that the trailing consonant is optional
- * and thus we are doing a pre-calculation of subtracting one.
- *
- * Each of 19 modern leading consonants has total 588 possible syllables since
- * Hangul has 21 modern vowels and 27 modern trailing consonants plus 1 for
- * no trailing consonant case, i.e., 21 x 28 = 588.
- *
- * We also have bunch of Hangul related macros at below. Please bear in mind
- * that the U8_HANGUL_JAMO_1ST_BYTE can be used to check whether it is
- * a Hangul Jamo or not but the value does not guarantee that it is a Hangul
- * Jamo; it just guarantee that it will be most likely.
- */
-#define	U8_HANGUL_SYL_FIRST		(0xAC00U)
-#define	U8_HANGUL_SYL_LAST		(0xD7A3U)
-
-#define	U8_HANGUL_JAMO_L_FIRST		(0x1100U)
-#define	U8_HANGUL_JAMO_L_LAST		(0x1112U)
-#define	U8_HANGUL_JAMO_V_FIRST		(0x1161U)
-#define	U8_HANGUL_JAMO_V_LAST		(0x1175U)
-#define	U8_HANGUL_JAMO_T_FIRST		(0x11A7U)
-#define	U8_HANGUL_JAMO_T_LAST		(0x11C2U)
-
-#define	U8_HANGUL_V_COUNT		(21)
-#define	U8_HANGUL_VT_COUNT		(588)
-#define	U8_HANGUL_T_COUNT		(28)
-
-#define	U8_HANGUL_JAMO_1ST_BYTE		(0xE1U)
-
-#define	U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
-	(s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \
-	(s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \
-	(s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU));
-
-#define	U8_HANGUL_JAMO_L(u) \
-	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)
-
-#define	U8_HANGUL_JAMO_V(u) \
-	((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)
-
-#define	U8_HANGUL_JAMO_T(u) \
-	((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
-
-#define	U8_HANGUL_JAMO(u) \
-	((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
-
-#define	U8_HANGUL_SYLLABLE(u) \
-	((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)
-
-#define	U8_HANGUL_COMPOSABLE_L_V(s, u) \
-	((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))
-
-#define	U8_HANGUL_COMPOSABLE_LV_T(s, u) \
-	((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
-
-/* The types of decomposition mappings. */
-#define	U8_DECOMP_BOTH			(0xF5U)
-#define	U8_DECOMP_CANONICAL		(0xF6U)
-
-/* The indicator for 16-bit table. */
-#define	U8_16BIT_TABLE_INDICATOR	(0x8000U)
-
-/* The following are some convenience macros. */
-#define	U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
-	(u) = ((uint32_t)(b1) & 0x0F) << 12 | ((uint32_t)(b2) & 0x3F) << 6 | \
-		(uint32_t)(b3) & 0x3F;
-
-#define	U8_SIMPLE_SWAP(a, b, t) \
-	(t) = (a); \
-	(a) = (b); \
-	(b) = (t);
-
-#define	U8_ASCII_TOUPPER(c) \
-	(((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))
-
-#define	U8_ASCII_TOLOWER(c) \
-	(((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))
-
-#define	U8_ISASCII(c)			(((uchar_t)(c)) < 0x80U)
-/*
- * The following macro assumes that the two characters that are to be
- * swapped are adjacent to each other and 'a' comes before 'b'.
- *
- * If the assumptions are not met, then, the macro will fail.
- */
-#define	U8_SWAP_COMB_MARKS(a, b) \
-	for (k = 0; k < disp[(a)]; k++) \
-		u8t[k] = u8s[start[(a)] + k]; \
-	for (k = 0; k < disp[(b)]; k++) \
-		u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
-	start[(b)] = start[(a)] + disp[(b)]; \
-	for (k = 0; k < disp[(a)]; k++) \
-		u8s[start[(b)] + k] = u8t[k]; \
-	U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
-	U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc);
-
-/* The possible states during normalization. */
-typedef enum {
-	U8_STATE_START = 0,
-	U8_STATE_HANGUL_L = 1,
-	U8_STATE_HANGUL_LV = 2,
-	U8_STATE_HANGUL_LVT = 3,
-	U8_STATE_HANGUL_V = 4,
-	U8_STATE_HANGUL_T = 5,
-	U8_STATE_COMBINING_MARK = 6
-} u8_normalization_states_t;
-
-/*
- * The three vectors at below are used to check bytes of a given UTF-8
- * character are valid and not containing any malformed byte values.
- *
- * We used to have a quite relaxed UTF-8 binary representation but then there
- * was some security related issues and so the Unicode Consortium defined
- * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it
- * one more time at the Unicode 3.2. The following three tables are based on
- * that.
- */
-
-#define	U8_ILLEGAL_NEXT_BYTE_COMMON(c)	((c) < 0x80 || (c) > 0xBF)
-
-#define	I_				U8_ILLEGAL_CHAR
-#define	O_				U8_OUT_OF_RANGE_CHAR
-
-const int8_t u8_number_of_bytes[0x100] = {
-	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-
-/*	80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
-	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
-
-/*  	90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
-	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
-
-/*  	A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
-	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
-
-/*	B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
-	I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
-
-/*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
-	I_, I_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-
-/*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
-	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-
-/*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
-	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
-
-/*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
-	4,  4,  4,  4,  4,  O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
-};
-
-#undef	I_
-#undef	O_
-
-const uint8_t u8_valid_min_2nd_byte[0x100] = {
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-/*	C0    C1    C2    C3    C4    C5    C6    C7    */
-	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-/*	C8    C9    CA    CB    CC    CD    CE    CF    */
-	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-/*	D0    D1    D2    D3    D4    D5    D6    D7    */
-	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-/*	D8    D9    DA    DB    DC    DD    DE    DF    */
-	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-/*	E0    E1    E2    E3    E4    E5    E6    E7    */
-	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-/*	E8    E9    EA    EB    EC    ED    EE    EF    */
-	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-/*	F0    F1    F2    F3    F4    F5    F6    F7    */
-	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-};
-
-const uint8_t u8_valid_max_2nd_byte[0x100] = {
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-/*	C0    C1    C2    C3    C4    C5    C6    C7    */
-	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
-/*	C8    C9    CA    CB    CC    CD    CE    CF    */
-	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
-/*	D0    D1    D2    D3    D4    D5    D6    D7    */
-	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
-/*	D8    D9    DA    DB    DC    DD    DE    DF    */
-	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
-/*	E0    E1    E2    E3    E4    E5    E6    E7    */
-	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
-/*	E8    E9    EA    EB    EC    ED    EE    EF    */
-	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
-/*	F0    F1    F2    F3    F4    F5    F6    F7    */
-	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-};
-
-
-/*
- * The u8_validate() validates on the given UTF-8 character string and
- * calculate the byte length. It is quite similar to mblen(3C) except that
- * this will validate against the list of characters if required and
- * specific to UTF-8 and Unicode.
- */
-int
-u8_validate(char *u8str, size_t n, char **list, int flag, int *errno)
-{
-	uchar_t *ib;
-	uchar_t *ibtail;
-	uchar_t **p;
-	uchar_t *s1;
-	uchar_t *s2;
-	uchar_t f;
-	int sz;
-	size_t i;
-	int ret_val;
-	boolean_t second;
-	boolean_t no_need_to_validate_entire;
-	boolean_t check_additional;
-	boolean_t validate_ucs2_range_only;
-
-	if (! u8str)
-		return (0);
-
-	ib = (uchar_t *)u8str;
-	ibtail = ib + n;
-
-	ret_val = 0;
-
-	no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
-	check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
-	validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
-
-	while (ib < ibtail) {
-		/*
-		 * The first byte of a UTF-8 character tells how many
-		 * bytes will follow for the character. If the first byte
-		 * is an illegal byte value or out of range value, we just
-		 * return -1 with an appropriate error number.
-		 */
-		sz = u8_number_of_bytes[*ib];
-		if (sz == U8_ILLEGAL_CHAR) {
-			*errno = EILSEQ;
-			return (-1);
-		}
-
-		if (sz == U8_OUT_OF_RANGE_CHAR ||
-		    (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
-			*errno = ERANGE;
-			return (-1);
-		}
-
-		/*
-		 * If we don't have enough bytes to check on, that's also
-		 * an error. As you can see, we give illegal byte sequence
-		 * checking higher priority then EINVAL cases.
-		 */
-		if ((ibtail - ib) < sz) {
-			*errno = EINVAL;
-			return (-1);
-		}
-
-		if (sz == 1) {
-			ib++;
-			ret_val++;
-		} else {
-			/*
-			 * Check on the multi-byte UTF-8 character. For more
-			 * details on this, see comment added for the used
-			 * data structures at the beginning of the file.
-			 */
-			f = *ib++;
-			ret_val++;
-			second = B_TRUE;
-			for (i = 1; i < sz; i++) {
-				if (second) {
-					if (*ib < u8_valid_min_2nd_byte[f] ||
-					    *ib > u8_valid_max_2nd_byte[f]) {
-						*errno = EILSEQ;
-						return (-1);
-					}
-					second = B_FALSE;
-				} else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
-					*errno = EILSEQ;
-					return (-1);
-				}
-				ib++;
-				ret_val++;
-			}
-		}
-
-		if (check_additional) {
-			for (p = (uchar_t **)list, i = 0; p[i]; i++) {
-				s1 = ib - sz;
-				s2 = p[i];
-				while (s1 < ib) {
-					if (*s1 != *s2 || *s2 == '\0')
-						break;
-					s1++;
-					s2++;
-				}
-
-				if (s1 >= ib && *s2 == '\0') {
-					*errno = EBADF;
-					return (-1);
-				}
-			}
-		}
-
-		if (no_need_to_validate_entire)
-			break;
-	}
-
-	return (ret_val);
-}
-
-/*
- * The do_case_conv() looks at the mapping tables and returns found
- * bytes if any. If not found, the input bytes are returned. The function
- * always terminate the return bytes with a null character assuming that
- * there are plenty of room to do so.
- *
- * The case conversions are simple case conversions mapping a character to
- * another character as specified in the Unicode data. The byte size of
- * the mapped character could be different from that of the input character.
- *
- * The return value is the byte length of the returned character excluding
- * the terminating null byte.
- */
-static size_t
-do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
-{
-	size_t i;
-	uint16_t b1 = 0;
-	uint16_t b2 = 0;
-	uint16_t b3 = 0;
-	uint16_t b3_tbl;
-	uint16_t b3_base;
-	uint16_t b4 = 0;
-	size_t start_id;
-	size_t end_id;
-
-	/*
-	 * At this point, the only possible values for sz are 2, 3, and 4.
-	 * The u8s should point to a vector that is well beyond the size of
-	 * 5 bytes.
-	 */
-	if (sz == 2) {
-		b3 = u8s[0] = s[0];
-		b4 = u8s[1] = s[1];
-	} else if (sz == 3) {
-		b2 = u8s[0] = s[0];
-		b3 = u8s[1] = s[1];
-		b4 = u8s[2] = s[2];
-	} else if (sz == 4) {
-		b1 = u8s[0] = s[0];
-		b2 = u8s[1] = s[1];
-		b3 = u8s[2] = s[2];
-		b4 = u8s[3] = s[3];
-	} else {
-		/* This is not possible but just in case as a fallback. */
-		if (is_it_toupper)
-			*u8s = U8_ASCII_TOUPPER(*s);
-		else
-			*u8s = U8_ASCII_TOLOWER(*s);
-		u8s[1] = '\0';
-
-		return (1);
-	}
-	u8s[sz] = '\0';
-
-	/*
-	 * Let's find out if we have a corresponding character.
-	 */
-	b1 = u8_common_b1_tbl[uv][b1];
-	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
-		return ((size_t)sz);
-
-	b2 = u8_case_common_b2_tbl[uv][b1][b2];
-	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
-		return ((size_t)sz);
-
-	if (is_it_toupper) {
-		b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
-		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
-			return ((size_t)sz);
-
-		start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
-		end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
-
-		/* Either there is no match or an error at the table. */
-		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
-			return ((size_t)sz);
-
-		b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
-
-		for (i = 0; start_id < end_id; start_id++)
-			u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
-	} else {
-		b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
-		if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
-			return ((size_t)sz);
-
-		start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
-		end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
-
-		if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
-			return ((size_t)sz);
-
-		b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
-
-		for (i = 0; start_id < end_id; start_id++)
-			u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
-	}
-
-	/*
-	 * If i is still zero, that means there is no corresponding character.
-	 */
-	if (i == 0)
-		return ((size_t)sz);
-
-	u8s[i] = '\0';
-
-	return (i);
-}
-
-/*
- * The do_case_compare() function compares the two input strings, s1 and s2,
- * one character at a time doing case conversions if applicable and return
- * the comparison result as like strcmp().
- *
- * Since, in empirical sense, most of text data are 7-bit ASCII characters,
- * we treat the 7-bit ASCII characters as a special case trying to yield
- * faster processing time.
- */
-static int
-do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
-	size_t n2, boolean_t is_it_toupper, int *errno)
-{
-	int f;
-	int sz1;
-	int sz2;
-	size_t j;
-	size_t i1;
-	size_t i2;
-	uchar_t u8s1[U8_MB_CUR_MAX + 1];
-	uchar_t u8s2[U8_MB_CUR_MAX + 1];
-
-	i1 = i2 = 0;
-	while (i1 < n1 && i2 < n2) {
-		/*
-		 * Find out what would be the byte length for this UTF-8
-		 * character at string s1 and also find out if this is
-		 * an illegal start byte or not and if so, issue a proper
-		 * errno and yet treat this byte as a character.
-		 */
-		sz1 = u8_number_of_bytes[*s1];
-		if (sz1 < 0) {
-			*errno = EILSEQ;
-			sz1 = 1;
-		}
-
-		/*
-		 * For 7-bit ASCII characters mainly, we do a quick case
-		 * conversion right at here.
-		 *
-		 * If we don't have enough bytes for this character, issue
-		 * an EINVAL error and use what are available.
-		 *
-		 * If we have enough bytes, find out if there is
-		 * a corresponding uppercase character and if so, copy over
-		 * the bytes for a comparison later. If there is no
-		 * corresponding uppercase character, then, use what we have
-		 * for the comparison.
-		 */
-		if (sz1 == 1) {
-			if (is_it_toupper)
-				u8s1[0] = U8_ASCII_TOUPPER(*s1);
-			else
-				u8s1[0] = U8_ASCII_TOLOWER(*s1);
-			s1++;
-			u8s1[1] = '\0';
-		} else if ((i1 + sz1) > n1) {
-			*errno = EINVAL;
-			for (j = 0; (i1 + j) < n1; )
-				u8s1[j++] = *s1++;
-			u8s1[j] = '\0';
-		} else {
-			(void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
-			s1 += sz1;
-		}
-
-		/* Do the same for the string s2. */
-		sz2 = u8_number_of_bytes[*s2];
-		if (sz2 < 0) {
-			*errno = EILSEQ;
-			sz2 = 1;
-		}
-
-		if (sz2 == 1) {
-			if (is_it_toupper)
-				u8s2[0] = U8_ASCII_TOUPPER(*s2);
-			else
-				u8s2[0] = U8_ASCII_TOLOWER(*s2);
-			s2++;
-			u8s2[1] = '\0';
-		} else if ((i2 + sz2) > n2) {
-			*errno = EINVAL;
-			for (j = 0; (i2 + j) < n2; )
-				u8s2[j++] = *s2++;
-			u8s2[j] = '\0';
-		} else {
-			(void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
-			s2 += sz2;
-		}
-
-		/* Now compare the two characters. */
-		if (sz1 == 1 && sz2 == 1) {
-			if (*u8s1 > *u8s2)
-				return (1);
-			if (*u8s1 < *u8s2)
-				return (-1);
-		} else {
-			f = strcmp((const char *)u8s1, (const char *)u8s2);
-			if (f != 0)
-				return (f);
-		}
-
-		/*
-		 * They were the same. Let's move on to the next
-		 * characters then.
-		 */
-		i1 += sz1;
-		i2 += sz2;
-	}
-
-	/*
-	 * We compared until the end of either or both strings.
-	 *
-	 * If we reached to or went over the ends for the both, that means
-	 * they are the same.
-	 *
-	 * If we reached only one of the two ends, that means the other string
-	 * has something which then the fact can be used to determine
-	 * the return value.
-	 */
-	if (i1 >= n1) {
-		if (i2 >= n2)
-			return (0);
-		return (-1);
-	}
-	return (1);
-}
-
-/*
- * The combining_class() function checks on the given bytes and find out
- * the corresponding Unicode combining class value. The return value 0 means
- * it is a Starter. Any illegal UTF-8 character will also be treated as
- * a Starter.
- */
-static uchar_t
-combining_class(size_t uv, uchar_t *s, size_t sz)
-{
-	uint16_t b1 = 0;
-	uint16_t b2 = 0;
-	uint16_t b3 = 0;
-	uint16_t b4 = 0;
-
-	if (sz == 1 || sz > 4)
-		return (0);
-
-	if (sz == 2) {
-		b3 = s[0];
-		b4 = s[1];
-	} else if (sz == 3) {
-		b2 = s[0];
-		b3 = s[1];
-		b4 = s[2];
-	} else if (sz == 4) {
-		b1 = s[0];
-		b2 = s[1];
-		b3 = s[2];
-		b4 = s[3];
-	}
-
-	b1 = u8_common_b1_tbl[uv][b1];
-	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
-		return (0);
-
-	b2 = u8_combining_class_b2_tbl[uv][b1][b2];
-	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
-		return (0);
-
-	b3 = u8_combining_class_b3_tbl[uv][b2][b3];
-	if (b3 == U8_TBL_ELEMENT_NOT_DEF)
-		return (0);
-
-	return (u8_combining_class_b4_tbl[uv][b3][b4]);
-}
-
-/*
- * The do_decomp() function finds out a matching decomposition if any
- * and return. If there is no match, the input bytes are copied and returned.
- * The function also checks if there is a Hangul, decomposes it if necessary
- * and returns.
- *
- * To save time, a single byte 7-bit ASCII character should be handled by
- * the caller.
- *
- * The function returns the number of bytes returned sans always terminating
- * the null byte. It will also return a state that will tell if there was
- * a Hangul character decomposed which then will be used by the caller.
- */
-static size_t
-do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
-	boolean_t canonical_decomposition, u8_normalization_states_t *state)
-{
-	uint16_t b1 = 0;
-	uint16_t b2 = 0;
-	uint16_t b3 = 0;
-	uint16_t b3_tbl;
-	uint16_t b3_base;
-	uint16_t b4 = 0;
-	size_t start_id;
-	size_t end_id;
-	size_t i;
-	uint32_t u1;
-
-	if (sz == 2) {
-		b3 = u8s[0] = s[0];
-		b4 = u8s[1] = s[1];
-		u8s[2] = '\0';
-	} else if (sz == 3) {
-		/* Convert it to a Unicode scalar value. */
-		U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
-
-		/*
-		 * If this is a Hangul syllable, we decompose it into
-		 * a leading consonant, a vowel, and an optional trailing
-		 * consonant and then return.
-		 */
-		if (U8_HANGUL_SYLLABLE(u1)) {
-			u1 -= U8_HANGUL_SYL_FIRST;
-
-			b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
-			b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
-			    / U8_HANGUL_T_COUNT;
-			b3 = u1 % U8_HANGUL_T_COUNT;
-
-			U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
-			U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
-			if (b3) {
-				b3 += U8_HANGUL_JAMO_T_FIRST;
-				U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
-
-				u8s[9] = '\0';
-				*state = U8_STATE_HANGUL_LVT;
-				return (9);
-			}
-
-			u8s[6] = '\0';
-			*state = U8_STATE_HANGUL_LV;
-			return (6);
-		}
-
-		b2 = u8s[0] = s[0];
-		b3 = u8s[1] = s[1];
-		b4 = u8s[2] = s[2];
-		u8s[3] = '\0';
-
-		/*
-		 * If this is a Hangul Jamo, we know there is nothing
-		 * further that we can decompose.
-		 */
-		if (U8_HANGUL_JAMO_L(u1)) {
-			*state = U8_STATE_HANGUL_L;
-			return (3);
-		}
-
-		if (U8_HANGUL_JAMO_V(u1)) {
-			if (*state == U8_STATE_HANGUL_L)
-				*state = U8_STATE_HANGUL_LV;
-			else
-				*state = U8_STATE_HANGUL_V;
-			return (3);
-		}
-
-		if (U8_HANGUL_JAMO_T(u1)) {
-			if (*state == U8_STATE_HANGUL_LV)
-				*state = U8_STATE_HANGUL_LVT;
-			else
-				*state = U8_STATE_HANGUL_T;
-			return (3);
-		}
-	} else if (sz == 4) {
-		b1 = u8s[0] = s[0];
-		b2 = u8s[1] = s[1];
-		b3 = u8s[2] = s[2];
-		b4 = u8s[3] = s[3];
-		u8s[4] = '\0';
-	} else {
-		/*
-		 * This is a fallback and should not happen if the function
-		 * was called properly.
-		 */
-		u8s[0] = s[0];
-		u8s[1] = '\0';
-		*state = U8_STATE_START;
-		return (1);
-	}
-
-	/*
-	 * At this point, this rountine does not know what it would get.
-	 * The caller should sort it out if the state isn't a Hangul one.
-	 */
-	*state = U8_STATE_START;
-
-	/* Try to find matching decomposition mapping byte sequence. */
-	b1 = u8_common_b1_tbl[uv][b1];
-	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
-		return ((size_t)sz);
-
-	b2 = u8_decomp_b2_tbl[uv][b1][b2];
-	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
-		return ((size_t)sz);
-
-	b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
-	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
-		return ((size_t)sz);
-
-	/*
-	 * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
-	 * which is 0x8000, this means we couldn't fit the mappings into
-	 * the cardinality of a unsigned byte.
-	 */
-	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
-		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
-		start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
-		end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
-	} else {
-		start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
-		end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
-	}
-
-	/* This also means there wasn't any matching decomposition. */
-	if (start_id >= end_id)
-		return ((size_t)sz);
-
-	/*
-	 * The final table for decomposition mappings has three types of
-	 * byte sequences depending on whether a mapping is for compatibility
-	 * decomposition, canonical decomposition, or both like the following:
-	 *
-	 * (1) Compatibility decomposition mappings:
-	 *
-	 *	+---+---+-...-+---+
-	 *	| B0| B1| ... | Bm|
-	 *	+---+---+-...-+---+
-	 *
-	 *	The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH).
-	 *
-	 * (2) Canonical decomposition mappings:
-	 *
-	 *	+---+---+---+-...-+---+
-	 *	| T | b0| b1| ... | bn|
-	 *	+---+---+---+-...-+---+
-	 *
-	 *	where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
-	 *
-	 * (3) Both mappings:
-	 *
-	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
-	 *	| T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
-	 *	+---+---+---+---+-...-+---+---+---+-...-+---+
-	 *
-	 *	where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
-	 *	byte, b0 to bn are canonical mapping bytes and B0 to Bm are
-	 *	compatibility mapping bytes.
-	 *
-	 * Note that compatibility decomposition means doing recursive
-	 * decompositions using both compatibility decomposition mappings and
-	 * canonical decomposition mappings. On the other hand, canonical
-	 * decomposition means doing recursive decompositions using only
-	 * canonical decomposition mappings. Since the table we have has gone
-	 * through the recursions already, we do not need to do so during
-	 * runtime, i.e., the table has been completely flattened out
-	 * already.
-	 */
-
-	b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
-
-	/* Get the type, T, of the byte sequence. */
-	b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
-
-	/*
-	 * If necessary, adjust start_id, end_id, or both. Note that if
-	 * this is compatibility decomposition mapping, there is no
-	 * adjustment.
-	 */
-	if (canonical_decomposition) {
-		/* Is the mapping only for compatibility decomposition? */
-		if (b1 < U8_DECOMP_BOTH)
-			return ((size_t)sz);
-
-		start_id++;
-
-		if (b1 == U8_DECOMP_BOTH) {
-			end_id = start_id +
-			    u8_decomp_final_tbl[uv][b3_base + start_id];
-			start_id++;
-		}
-	} else {
-		/*
-		 * Unless this is a compatibility decomposition mapping,
-		 * we adjust the start_id.
-		 */
-		if (b1 == U8_DECOMP_BOTH) {
-			start_id++;
-			start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
-		} else if (b1 == U8_DECOMP_CANONICAL) {
-			start_id++;
-		}
-	}
-
-	for (i = 0; start_id < end_id; start_id++)
-		u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
-	u8s[i] = '\0';
-
-	return (i);
-}
-
-/*
- * The find_composition_start() function uses the character bytes given and
- * find out the matching composition mappings if any and return the address
- * to the composition mappings as explained in the do_composition().
- */
-static uchar_t *
-find_composition_start(size_t uv, uchar_t *s, size_t sz)
-{
-	uint16_t b1 = 0;
-	uint16_t b2 = 0;
-	uint16_t b3 = 0;
-	uint16_t b3_tbl;
-	uint16_t b3_base;
-	uint16_t b4 = 0;
-	size_t start_id;
-	size_t end_id;
-
-	if (sz == 1) {
-		b4 = s[0];
-	} else if (sz == 2) {
-		b3 = s[0];
-		b4 = s[1];
-	} else if (sz == 3) {
-		b2 = s[0];
-		b3 = s[1];
-		b4 = s[2];
-	} else if (sz == 4) {
-		b1 = s[0];
-		b2 = s[1];
-		b3 = s[2];
-		b4 = s[3];
-	} else {
-		/*
-		 * This is a fallback and should not happen if the function
-		 * was called properly.
-		 */
-		return (NULL);
-	}
-
-	b1 = u8_composition_b1_tbl[uv][b1];
-	if (b1 == U8_TBL_ELEMENT_NOT_DEF)
-		return (NULL);
-
-	b2 = u8_composition_b2_tbl[uv][b1][b2];
-	if (b2 == U8_TBL_ELEMENT_NOT_DEF)
-		return (NULL);
-
-	b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
-	if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
-		return (NULL);
-
-	if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
-		b3_tbl -= U8_16BIT_TABLE_INDICATOR;
-		start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
-		end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
-	} else {
-		start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
-		end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
-	}
-
-	if (start_id >= end_id)
-		return (NULL);
-
-	b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
-
-	return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
-}
-
-/*
- * The blocked() function checks on the combining class values of previous
- * characters in this sequence and return whether it is blocked or not.
- */
-static boolean_t
-blocked(uchar_t *comb_class, size_t last)
-{
-	uchar_t my_comb_class;
-	size_t i;
-
-	my_comb_class = comb_class[last];
-	for (i = 1; i < last; i++)
-		if (comb_class[i] >= my_comb_class ||
-		    comb_class[i] == U8_COMBINING_CLASS_STARTER)
-			return (B_TRUE);
-
-	return (B_FALSE);
-}
-
-/*
- * The do_composition() function reads the character string pointed to by
- * 's', does any necessary canonical composition, and then copies the result
- * back to 's'.
- *
- * The input argument 's' cannot contain more than 32 characters.
- *
- * Arguments as used by the code below:
- *
- *	uv		Unicode version index handed to
- *			find_composition_start().
- *	s		The characters to compose; overwritten with the
- *			result, which is terminated with a null byte.
- *	comb_class	comb_class[i] is the combining class of the i-th
- *			character in 's'.
- *	start		start[i] is the byte offset of the i-th character
- *			in 's'.
- *	disp		disp[i] is the byte length of the i-th character.
- *	last		Index of the last character in 's' (not the count).
- *	os, oslast	*os points at the not yet consumed source text and
- *			oslast at its end. If the result ends with a Starter,
- *			characters following *os may be consumed and folded
- *			into it; *os is then advanced past them.
- *
- * The return value is the byte length of the result, not counting the
- * terminating null byte.
- */
-static size_t
-do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
-	uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
-{
-	uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
-	uchar_t tc[U8_MB_CUR_MAX];
-	uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
-	size_t saved_marks_count;
-	uchar_t *p;
-	uchar_t *saved_p;
-	uchar_t *q;
-	size_t i;
-	size_t saved_i;
-	size_t j;
-	size_t k;
-	size_t l;
-	size_t C;
-	size_t saved_l;
-	size_t size;
-	uint32_t u1;
-	uint32_t u2;
-	boolean_t match_not_found = B_TRUE;
-
-	/*
-	 * This should never happen unless the callers are doing some strange
-	 * and unexpected things.
-	 *
-	 * The "last" is the index pointing to the last character, not
-	 * last + 1.
-	 */
-	if (last >= U8_MAX_CHARS_A_SEQ)
-		last = U8_UPPER_LIMIT_IN_A_SEQ;
-
-	/*
-	 * 'i' walks the characters of 's' and 'l' is the number of bytes
-	 * written to the temporary buffer 't' so far.
-	 *
-	 * NOTE(review): the writes to t[] in this loop are not bounds
-	 * checked; they appear to rely on the 32 character limit stated
-	 * above -- confirm against the callers.
-	 */
-	for (i = l = 0; i <= last; i++) {
-		/*
-		 * For the last character or any non-Starters at the
-		 * beginning, we don't have any chance to do composition and
-		 * so we just copy them to the temporary buffer.
-		 */
-		if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
-SAVE_THE_CHAR:
-			p = s + start[i];
-			size = disp[i];
-			for (k = 0; k < size; k++)
-				t[l++] = *p++;
-			continue;
-		}
-
-		/*
-		 * If this could be a start of Hangul Jamos, then, we try to
-		 * conjoin them.
-		 *
-		 * NOTE(review): u2 is built from the three bytes that
-		 * immediately follow this character without first checking
-		 * that the next character is three bytes long -- confirm
-		 * that this look-ahead is always safe.
-		 */
-		if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
-			U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
-			    s[start[i] + 1], s[start[i] + 2]);
-			U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
-			    s[start[i] + 4], s[start[i] + 5]);
-
-			if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
-				u1 -= U8_HANGUL_JAMO_L_FIRST;
-				u2 -= U8_HANGUL_JAMO_V_FIRST;
-				u1 = U8_HANGUL_SYL_FIRST +
-				    (u1 * U8_HANGUL_V_COUNT + u2) *
-				    U8_HANGUL_T_COUNT;
-
-				/*
-				 * Step over the L and V and, if there is a
-				 * third character and it is a T, fold it in
-				 * as well.
-				 */
-				i += 2;
-				if (i <= last) {
-					U8_PUT_3BYTES_INTO_UTF32(u2,
-					    s[start[i]], s[start[i] + 1],
-					    s[start[i] + 2]);
-
-					if (U8_HANGUL_JAMO_T(u2)) {
-						u1 += u2 -
-						    U8_HANGUL_JAMO_T_FIRST;
-						i++;
-					}
-				}
-
-				/*
-				 * The i-- compensates for the i++ of the
-				 * enclosing for() loop.
-				 */
-				U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
-				i--;
-				l += 3;
-				continue;
-			}
-		}
-
-		/*
-		 * Let's then find out if this Starter has composition
-		 * mapping.
-		 */
-		p = find_composition_start(uv, s + start[i], disp[i]);
-		if (p == NULL)
-			goto SAVE_THE_CHAR;
-
-		/*
-		 * We have a Starter with composition mapping and there is
-		 * at least one more character after it. Let's try to find
-		 * out if we can do composition.
-		 */
-
-		saved_p = p;
-		saved_i = i;
-		saved_l = l;
-		saved_marks_count = 0;
-
-TRY_THE_NEXT_MARK:
-		q = s + start[++i];
-		size = disp[i];
-
-		/*
-		 * The next for() loop compares the character pointed by
-		 * 'q' with the possible (joinable) characters pointed by 'p'.
-		 *
-		 * The composition final table entry pointed by the 'p'
-		 * looks like the following:
-		 *
-		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
-		 * | C | b0| b1| ... | bn| F | B0| B1| ... | Bm| F |
-		 * +---+---+---+-...-+---+---+---+---+-...-+---+---+
-		 *
-		 * where C is the count byte indicating the number of
-		 * mapping pairs where each pair looks like
-		 * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
-		 * character of a canonical decomposition and the B0-Bm are
-		 * the bytes of a matching composite character. The F is
-		 * a filler byte after each character as the separator.
-		 */
-
-		match_not_found = B_TRUE;
-
-		for (C = *p++; C > 0; C--) {
-			for (k = 0; k < size; p++, k++)
-				if (*p != q[k])
-					break;
-
-			/* Have we found it? */
-			if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
-				match_not_found = B_FALSE;
-
-				/*
-				 * Overwrite whatever was composed so far
-				 * with the bytes of the composite.
-				 */
-				l = saved_l;
-
-				while (*++p != U8_TBL_ELEMENT_FILLER)
-					t[l++] = *p;
-
-				break;
-			}
-
-			/* We didn't find; skip to the next pair. */
-			if (*p != U8_TBL_ELEMENT_FILLER)
-				while (*++p != U8_TBL_ELEMENT_FILLER)
-					;
-			while (*++p != U8_TBL_ELEMENT_FILLER)
-				;
-			p++;
-		}
-
-		/*
-		 * If there was no match, we will need to save the combining
-		 * mark for later appending. After that, if the next one
-		 * is a non-Starter and not blocked, then, we try once
-		 * again to do composition with the next non-Starter.
-		 *
-		 * If there was no match and this was a Starter, then,
-		 * this is a new start.
-		 *
-		 * If there was a match and a composition done and we have
-		 * more to check on, then, we retrieve a new composition final
-		 * table entry for the composite and then try to do the
-		 * composition again.
-		 */
-
-		if (match_not_found) {
-			if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
-				i--;
-				goto SAVE_THE_CHAR;
-			}
-
-			saved_marks[saved_marks_count++] = i;
-		}
-
-		/*
-		 * saved_l == l means that nothing has been written for this
-		 * Starter yet, i.e., no composition has happened so far.
-		 */
-		if (saved_l == l) {
-			while (i < last) {
-				if (blocked(comb_class, i + 1))
-					saved_marks[saved_marks_count++] = ++i;
-				else
-					break;
-			}
-			if (i < last) {
-				p = saved_p;
-				goto TRY_THE_NEXT_MARK;
-			}
-		} else if (i < last) {
-			p = find_composition_start(uv, t + saved_l,
-			    l - saved_l);
-			if (p != NULL) {
-				saved_p = p;
-				goto TRY_THE_NEXT_MARK;
-			}
-		}
-
-		/*
-		 * There is no more composition possible.
-		 *
-		 * If there was no composition whatsoever then we copy
-		 * over the original Starter and then append any non-Starters
-		 * remaining at the target string sequentially after that.
-		 */
-
-		if (saved_l == l) {
-			p = s + start[saved_i];
-			size = disp[saved_i];
-			for (j = 0; j < size; j++)
-				t[l++] = *p++;
-		}
-
-		for (k = 0; k < saved_marks_count; k++) {
-			p = s + start[saved_marks[k]];
-			size = disp[saved_marks[k]];
-			for (j = 0; j < size; j++)
-				t[l++] = *p++;
-		}
-	}
-
-	/*
-	 * If the last character is a Starter and if we have a character
-	 * (possibly another Starter) that can be turned into a composite,
-	 * we do so and we do so until there is no more composition
-	 * possible.
-	 *
-	 * The bytes of 't' from saved_l up to 'l' are treated as that last
-	 * character when looking up its composition mappings.
-	 */
-	if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
-		p = *os;
-		saved_l = l - disp[last];
-
-		while (p < oslast) {
-			/*
-			 * Stop at an illegal byte, a 7-bit ASCII character,
-			 * or a character that does not fit in what is left.
-			 */
-			size = u8_number_of_bytes[*p];
-			if (size <= 1 || (p + size) > oslast)
-				break;
-
-			saved_p = p;
-
-			for (i = 0; i < size; i++)
-				tc[i] = *p++;
-
-			q = find_composition_start(uv, t + saved_l,
-			    l - saved_l);
-			if (q == NULL) {
-				p = saved_p;
-				break;
-			}
-
-			match_not_found = B_TRUE;
-
-			/* Same table walk as above, now matching tc[]. */
-			for (C = *q++; C > 0; C--) {
-				for (k = 0; k < size; q++, k++)
-					if (*q != tc[k])
-						break;
-
-				if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
-					match_not_found = B_FALSE;
-
-					l = saved_l;
-
-					while (*++q != U8_TBL_ELEMENT_FILLER) {
-						/*
-						 * This is practically
-						 * impossible but we don't
-						 * want to take any chances.
-						 */
-						if (l >=
-						    U8_STREAM_SAFE_TEXT_MAX) {
-							p = saved_p;
-							goto SAFE_RETURN;
-						}
-						t[l++] = *q;
-					}
-
-					break;
-				}
-
-				if (*q != U8_TBL_ELEMENT_FILLER)
-					while (*++q != U8_TBL_ELEMENT_FILLER)
-						;
-				while (*++q != U8_TBL_ELEMENT_FILLER)
-					;
-				q++;
-			}
-
-			/* Leave the unmatched character for the caller. */
-			if (match_not_found) {
-				p = saved_p;
-				break;
-			}
-		}
-SAFE_RETURN:
-		*os = p;
-	}
-
-	/*
-	 * Now we copy over the temporary string to the target string.
-	 * Since composition always reduces the number of characters or
-	 * the number of characters stays the same, we don't need to worry
-	 * about the buffer overflow here.
-	 */
-	for (i = 0; i < l; i++)
-		s[i] = t[i];
-	s[l] = '\0';
-
-	return (l);
-}
-
-/*
- * The collect_a_seq() function checks on the given string, collects
- * a sequence of characters at u8s, and returns the sequence. While it
- * collects a sequence, it also applies case conversion, canonical or
- * compatibility decomposition, canonical composition, or some or all of
- * them and in that order.
- *
- * The collected sequence cannot be bigger than 32 characters since if
- * it is having more than 31 characters, the sequence will be terminated
- * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
- * a Stream-Safe Text. The collected sequence is always terminated with
- * a null byte and the return value is the byte length of the sequence,
- * which can be 0. The return value does not include the terminating
- * null byte.
- *
- * On entry *source points at the text to collect from and slast at its
- * end; on return *source has been advanced past the bytes that were
- * consumed. *errno is set to EILSEQ when the first byte is an illegal
- * byte and to EINVAL when the first character is incomplete; it is not
- * touched otherwise. *state carries the normalization state and is updated
- * as characters are collected.
- */
-static size_t
-collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
-	boolean_t is_it_toupper,
-	boolean_t is_it_tolower,
-	boolean_t canonical_decomposition,
-	boolean_t compatibility_decomposition,
-	boolean_t canonical_composition,
-	int *errno, u8_normalization_states_t *state)
-{
-	uchar_t *s;
-	int sz;
-	int saved_sz;
-	size_t i;
-	size_t j;
-	size_t k;
-	size_t l;
-	uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
-	uchar_t disp[U8_MAX_CHARS_A_SEQ];
-	uchar_t start[U8_MAX_CHARS_A_SEQ];
-	uchar_t u8t[U8_MB_CUR_MAX];
-	uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
-	uchar_t tc;
-	size_t last;
-	size_t saved_last;
-	uint32_t u1;
-
-	/*
-	 * Save the source string pointer; we will hand back a changed
-	 * pointer if we do processing.
-	 */
-	s = *source;
-
-	/*
-	 * The following is a fallback just in case callers are not
-	 * checking the string boundaries before calling.
-	 */
-	if (s >= slast) {
-		u8s[0] = '\0';
-
-		return (0);
-	}
-
-	/*
-	 * As the first thing, let's collect a character and do case
-	 * conversion if necessary.
-	 */
-
-	sz = u8_number_of_bytes[*s];
-
-	/* An illegal byte is returned by itself as a one byte sequence. */
-	if (sz < 0) {
-		*errno = EILSEQ;
-
-		u8s[0] = *s++;
-		u8s[1] = '\0';
-
-		*source = s;
-
-		return (1);
-	}
-
-	if (sz == 1) {
-		if (is_it_toupper)
-			u8s[0] = U8_ASCII_TOUPPER(*s);
-		else if (is_it_tolower)
-			u8s[0] = U8_ASCII_TOLOWER(*s);
-		else
-			u8s[0] = *s;
-		s++;
-		u8s[1] = '\0';
-	} else if ((s + sz) > slast) {
-		/*
-		 * An incomplete character at the end; the remaining bytes
-		 * are returned as they are.
-		 */
-		*errno = EINVAL;
-
-		for (i = 0; s < slast; )
-			u8s[i++] = *s++;
-		u8s[i] = '\0';
-
-		*source = s;
-
-		return (i);
-	} else {
-		if (is_it_toupper || is_it_tolower) {
-			i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
-			s += sz;
-			sz = i;
-		} else {
-			for (i = 0; i < sz; )
-				u8s[i++] = *s++;
-			u8s[i] = '\0';
-		}
-	}
-
-	/*
-	 * And then canonical/compatibility decomposition followed by
-	 * an optional canonical composition. Please note that
-	 * canonical composition is done only when a decomposition is
-	 * done.
-	 *
-	 * From here on, 'saved_sz' is the number of bytes collected at u8s
-	 * and 'last' is the number of characters collected so far.
-	 */
-	if (canonical_decomposition || compatibility_decomposition) {
-		if (sz == 1) {
-			*state = U8_STATE_START;
-
-			saved_sz = 1;
-
-			comb_class[0] = 0;
-			start[0] = 0;
-			disp[0] = 1;
-
-			last = 1;
-		} else {
-			saved_sz = do_decomp(uv, u8s, u8s, sz,
-			    canonical_decomposition, state);
-
-			last = 0;
-
-			/*
-			 * Record the combining class, byte offset, and byte
-			 * length of each character of the decomposition.
-			 */
-			for (i = 0; i < saved_sz; ) {
-				sz = u8_number_of_bytes[u8s[i]];
-
-				comb_class[last] = combining_class(uv,
-				    u8s + i, sz);
-				start[last] = i;
-				disp[last] = sz;
-
-				last++;
-				i += sz;
-			}
-
-			/*
-			 * Decomposition yields various Hangul related
-			 * states but not on combining marks. We need to
-			 * find that out here by checking on the last
-			 * character.
-			 */
-			if (*state == U8_STATE_START) {
-				if (comb_class[last - 1])
-					*state = U8_STATE_COMBINING_MARK;
-			}
-		}
-
-		saved_last = last;
-
-		while (s < slast) {
-			sz = u8_number_of_bytes[*s];
-
-			/*
-			 * If this is an illegal character, an incomplete
-			 * character, or a 7-bit ASCII Starter character,
-			 * then we have collected a sequence; break and let
-			 * the next call deal with these cases.
-			 *
-			 * Note that this is okay only if you are using this
-			 * function with a fixed length string, not on
-			 * a buffer with multiple calls of one chunk at a time.
-			 */
-			if (sz <= 1) {
-				break;
-			} else if ((s + sz) > slast) {
-				break;
-			} else {
-				/*
-				 * If the previous character was a Hangul Jamo
-				 * and this character is a Hangul Jamo that
-				 * can be conjoined, we collect the Jamo.
-				 */
-				if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
-					U8_PUT_3BYTES_INTO_UTF32(u1,
-					    *s, *(s + 1), *(s + 2));
-
-					if (U8_HANGUL_COMPOSABLE_L_V(*state,
-					    u1)) {
-						i = 0;
-						*state = U8_STATE_HANGUL_LV;
-						goto COLLECT_A_HANGUL;
-					}
-
-					if (U8_HANGUL_COMPOSABLE_LV_T(*state,
-					    u1)) {
-						i = 0;
-						*state = U8_STATE_HANGUL_LVT;
-						goto COLLECT_A_HANGUL;
-					}
-				}
-
-				/*
-				 * Regardless of whatever it was, if this is
-				 * a Starter, we don't collect the character
-				 * since that's a new start and we will deal
-				 * with it at the next time.
-				 */
-				i = combining_class(uv, s, sz);
-				if (i == U8_COMBINING_CLASS_STARTER)
-					break;
-
-				/*
-				 * We know the current character is a combining
-				 * mark. If the previous character wasn't
-				 * a Starter (not Hangul) or a combining mark,
-				 * then, we don't collect this combining mark.
-				 */
-				if (*state != U8_STATE_START &&
-				    *state != U8_STATE_COMBINING_MARK)
-					break;
-
-				*state = U8_STATE_COMBINING_MARK;
-COLLECT_A_HANGUL:
-				/*
-				 * If we collected a Starter and combining
-				 * marks up to 30, i.e., total 31 characters,
-				 * then, we terminate this degenerately long
-				 * combining sequence with a U+034F COMBINING
-				 * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
-				 * UTF-8 and turn this into a Stream-Safe
-				 * Text. This will be extremely rare but
-				 * possible.
-				 *
-				 * The following will also guarantee that
-				 * we are not writing more than 32 characters
-				 * plus a NULL at u8s[].
-				 */
-				if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
-TURN_STREAM_SAFE:
-					*state = U8_STATE_START;
-					comb_class[last] = 0;
-					start[last] = saved_sz;
-					disp[last] = 2;
-					last++;
-
-					u8s[saved_sz++] = 0xCD;
-					u8s[saved_sz++] = 0x8F;
-
-					break;
-				}
-
-				/*
-				 * Some combining marks also do decompose into
-				 * another combining mark or marks.
-				 *
-				 * 'k' remembers the character count and 'l'
-				 * the source byte length of this mark so that
-				 * the decomposition can be discarded if it
-				 * does not fit and so that 's' can be advanced
-				 * if it does.
-				 */
-				if (*state == U8_STATE_COMBINING_MARK) {
-					k = last;
-					l = sz;
-					i = do_decomp(uv, uts, s, sz,
-					    canonical_decomposition, state);
-					for (j = 0; j < i; ) {
-						sz = u8_number_of_bytes[uts[j]];
-
-						comb_class[last] =
-						    combining_class(uv,
-						    uts + j, sz);
-						start[last] = saved_sz + j;
-						disp[last] = sz;
-
-						last++;
-						if (last >=
-						    U8_UPPER_LIMIT_IN_A_SEQ) {
-							last = k;
-							goto TURN_STREAM_SAFE;
-						}
-						j += sz;
-					}
-
-					*state = U8_STATE_COMBINING_MARK;
-					sz = i;
-					s += l;
-
-					for (i = 0; i < sz; i++)
-						u8s[saved_sz++] = uts[i];
-				} else {
-					comb_class[last] = i;
-					start[last] = saved_sz;
-					disp[last] = sz;
-					last++;
-
-					for (i = 0; i < sz; i++)
-						u8s[saved_sz++] = *s++;
-				}
-
-				/*
-				 * If this is U+0345 COMBINING GREEK
-				 * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
-				 * iota subscript, and needs to be converted to
-				 * an uppercase letter, convert it to U+0399
-				 * GREEK CAPITAL LETTER IOTA (0xCE 0x99 in
-				 * UTF-8), i.e., convert to capital adscript
-				 * form as specified in the Unicode standard.
-				 *
-				 * This is the only special case of (ambiguous)
-				 * case conversion at combining marks and
-				 * probably the standard will never have
-				 * anything similar like this in future.
-				 */
-				if (is_it_toupper && sz >= 2 &&
-				    u8s[saved_sz - 2] == 0xCD &&
-				    u8s[saved_sz - 1] == 0x85) {
-					u8s[saved_sz - 2] = 0xCE;
-					u8s[saved_sz - 1] = 0x99;
-				}
-			}
-		}
-
-		/*
-		 * Let's try to ensure a canonical ordering for the collected
-		 * combining marks. We do this only if we have collected
-		 * at least one more non-Starter. (The decomposition mapping
-		 * data tables have fully (and recursively) expanded and
-		 * canonically ordered decompositions.)
-		 *
-		 * The U8_SWAP_COMB_MARKS() convenience macro has some
-		 * assumptions and we are meeting the assumptions.
-		 *
-		 * The last-- below turns 'last' from the number of characters
-		 * into the index of the last character.
-		 */
-		last--;
-		if (last >= saved_last) {
-			for (i = 0; i < last; i++)
-				for (j = last; j > i; j--)
-					if (comb_class[j] &&
-					    comb_class[j - 1] > comb_class[j]) {
-						U8_SWAP_COMB_MARKS(j - 1, j);
-					}
-		}
-
-		*source = s;
-
-		if (! canonical_composition) {
-			u8s[saved_sz] = '\0';
-			return (saved_sz);
-		}
-
-		/*
-		 * Now do the canonical composition. Note that we do this
-		 * only after a canonical or compatibility decomposition to
-		 * finish up NFC or NFKC.
-		 */
-		sz = do_composition(uv, u8s, comb_class, start, disp, last,
-		    &s, slast);
-	}
-
-	*source = s;
-
-	return ((size_t)sz);
-}
-
-/*
- * The do_norm_compare() function does string comparison based on Unicode
- * simple case mappings and Unicode Normalization definitions.
- *
- * Each pass through the loop takes one character sequence from each of
- * the two strings and compares the two sequences; the first difference
- * found decides the result.
- *
- * The meanings of the return values are the same as for the usual strcmp().
- */
-static int
-do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
-	int flag, int *errno)
-{
-	uchar_t seq1[U8_STREAM_SAFE_TEXT_MAX + 1];
-	uchar_t seq2[U8_STREAM_SAFE_TEXT_MAX + 1];
-	uchar_t *end1 = s1 + n1;
-	uchar_t *end2 = s2 + n2;
-	size_t len1;
-	size_t len2;
-	int diff;
-	boolean_t to_upper = flag & U8_TEXTPREP_TOUPPER;
-	boolean_t to_lower = flag & U8_TEXTPREP_TOLOWER;
-	boolean_t canon_decomp = flag & U8_CANON_DECOMP;
-	boolean_t compat_decomp = flag & U8_COMPAT_DECOMP;
-	boolean_t canon_comp = flag & U8_CANON_COMP;
-	u8_normalization_states_t state;
-
-	while (s1 < end1 && s2 < end2) {
-		/*
-		 * A 7-bit ASCII character that is either the last character
-		 * or is followed by another 7-bit ASCII character is
-		 * a sequence by itself. Anything else has to go through
-		 * collect_a_seq().
-		 */
-
-		if (U8_ISASCII(*s1) &&
-		    ((s1 + 1) >= end1 || U8_ISASCII(*(s1 + 1)))) {
-			if (to_upper)
-				seq1[0] = U8_ASCII_TOUPPER(*s1);
-			else if (to_lower)
-				seq1[0] = U8_ASCII_TOLOWER(*s1);
-			else
-				seq1[0] = *s1;
-			seq1[1] = '\0';
-			s1++;
-			len1 = 1;
-		} else {
-			state = U8_STATE_START;
-			len1 = collect_a_seq(uv, seq1, &s1, end1,
-			    to_upper, to_lower, canon_decomp, compat_decomp,
-			    canon_comp, errno, &state);
-		}
-
-		if (U8_ISASCII(*s2) &&
-		    ((s2 + 1) >= end2 || U8_ISASCII(*(s2 + 1)))) {
-			if (to_upper)
-				seq2[0] = U8_ASCII_TOUPPER(*s2);
-			else if (to_lower)
-				seq2[0] = U8_ASCII_TOLOWER(*s2);
-			else
-				seq2[0] = *s2;
-			seq2[1] = '\0';
-			s2++;
-			len2 = 1;
-		} else {
-			state = U8_STATE_START;
-			len2 = collect_a_seq(uv, seq2, &s2, end2,
-			    to_upper, to_lower, canon_decomp, compat_decomp,
-			    canon_comp, errno, &state);
-		}
-
-		/*
-		 * Compare what we have collected. Equal sequences mean that
-		 * we go around once more for the next pair of sequences.
-		 */
-		if (len1 != 1 || len2 != 1) {
-			diff = strcmp((const char *)seq1, (const char *)seq2);
-			if (diff != 0)
-				return (diff);
-			continue;
-		}
-
-		if (seq1[0] != seq2[0])
-			return (seq1[0] > seq2[0] ? 1 : -1);
-	}
-
-	/*
-	 * At least one of the strings has been used up.
-	 *
-	 * If the first string still has something left, it is the bigger
-	 * one. Otherwise, the strings are the same if the second string has
-	 * been used up too, and the second string is the bigger one if not.
-	 */
-	if (s1 < end1)
-		return (1);
-
-	return (s2 >= end2 ? 0 : -1);
-}
-
-/*
- * The u8_strcmp() function compares two UTF-8 strings much like strcmp()
- * does. In addition, equivalence based on Unicode Normalization and on
- * Unicode simple case conversion mappings can be requested through 'flag'.
- *
- * If 'n' is not zero, at most 'n' bytes of each string are looked at.
- *
- * The *errno is cleared first. It is set to ERANGE if 'uv' is bigger than
- * U8_UNICODE_LATEST, in which case U8_UNICODE_LATEST is used instead, and
- * to EBADF if 'flag' has a conflicting combination of case or normalization
- * values, in which case a case sensitive comparison is done instead.
- */
-int
-u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
-		int *errno)
-{
-	int case_bits;
-	int norm_bits;
-	size_t len1;
-	size_t len2;
-
-	*errno = 0;
-
-	/* Fall back to the latest version on an unknown Unicode version. */
-	if (uv > U8_UNICODE_LATEST) {
-		*errno = ERANGE;
-		uv = U8_UNICODE_LATEST;
-	}
-
-	/*
-	 * Validate the case conversion and the normalization values in
-	 * 'flag'. No flag at all means a case sensitive comparison.
-	 */
-	if (flag != 0) {
-		case_bits = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
-		    U8_STRCMP_CI_LOWER);
-		if (case_bits == 0) {
-			flag |= U8_STRCMP_CS;
-		} else if (!(case_bits == U8_STRCMP_CS ||
-		    case_bits == U8_STRCMP_CI_UPPER ||
-		    case_bits == U8_STRCMP_CI_LOWER)) {
-			*errno = EBADF;
-			flag = U8_STRCMP_CS;
-		}
-
-		norm_bits = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP |
-		    U8_CANON_COMP);
-		if (norm_bits != 0 && !(norm_bits == U8_STRCMP_NFD ||
-		    norm_bits == U8_STRCMP_NFC ||
-		    norm_bits == U8_STRCMP_NFKD ||
-		    norm_bits == U8_STRCMP_NFKC)) {
-			*errno = EBADF;
-			flag = U8_STRCMP_CS;
-		}
-	} else {
-		flag = U8_STRCMP_CS;
-	}
-
-	/* A plain case sensitive comparison needs no text preparation. */
-	if (flag == U8_STRCMP_CS) {
-		if (n == 0)
-			return (strcmp(s1, s2));
-		return (strncmp(s1, s2, n));
-	}
-
-	len1 = strlen(s1);
-	len2 = strlen(s2);
-	if (n != 0 && n < len1)
-		len1 = n;
-	if (n != 0 && n < len2)
-		len2 = n;
-
-	/*
-	 * A comparison with simple case conversion only can be done much
-	 * faster and so it has its own function.
-	 */
-	if (flag == U8_STRCMP_CI_UPPER || flag == U8_STRCMP_CI_LOWER) {
-		return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
-		    len1, len2,
-		    (flag == U8_STRCMP_CI_UPPER) ? B_TRUE : B_FALSE, errno));
-	}
-
-	return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, len1, len2,
-	    flag, errno));
-}
-
-/*
- * The u8_textprep_str() function copies the UTF-8 text at 'inarray' to
- * 'outarray' while applying the case conversion and/or the Unicode
- * Normalization requested in 'flag'.
- *
- * On entry, *inlen is the number of bytes available at 'inarray' and
- * *outlen is the number of bytes of room at 'outarray'. Once the processing
- * loop has been entered, they are updated on return to the number of input
- * bytes not consumed and to the amount of room left, respectively. No
- * terminating null byte is appended to the output by this function.
- *
- * The function returns (size_t)-1 with *errno set to:
- *
- *	ERANGE	'unicode_version' is bigger than U8_UNICODE_LATEST.
- *	EBADF	'flag' asks for both uppercase and lowercase conversion or
- *		has an unknown combination of normalization values.
- *	E2BIG	'outarray' is NULL or does not have enough room.
- *	EILSEQ	An illegal byte was found and U8_TEXTPREP_IGNORE_INVALID is
- *		not set.
- *	EINVAL	An incomplete character was found at the end of the input
- *		and U8_TEXTPREP_IGNORE_INVALID is not set.
- *
- * Otherwise, when no normalization is requested, the return value is the
- * number of illegal bytes and incomplete characters that were copied over
- * as they are; when normalization is requested, the return value stays 0.
- * If 'inarray' is NULL or *inlen is 0, the function returns 0 right away.
- */
-size_t
-u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
-	int flag, size_t unicode_version, int *errno)
-{
-	int f;
-	int sz;
-	uchar_t *ib;
-	uchar_t *ibtail;
-	uchar_t *ob;
-	uchar_t *obtail;
-	boolean_t do_not_ignore_null;
-	boolean_t do_not_ignore_invalid;
-	boolean_t is_it_toupper;
-	boolean_t is_it_tolower;
-	boolean_t canonical_decomposition;
-	boolean_t compatibility_decomposition;
-	boolean_t canonical_composition;
-	size_t ret_val;
-	size_t i;
-	size_t j;
-	uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
-	u8_normalization_states_t state;
-
-	if (unicode_version > U8_UNICODE_LATEST) {
-		*errno = ERANGE;
-		return ((size_t)-1);
-	}
-
-	/* Uppercase and lowercase conversions are mutually exclusive. */
-	f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
-	if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
-		*errno = EBADF;
-		return ((size_t)-1);
-	}
-
-	/*
-	 * From here on, 'f' holds the normalization values of the flag;
-	 * zero means that no normalization is requested.
-	 */
-	f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
-	if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
-	    f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
-		*errno = EBADF;
-		return ((size_t)-1);
-	}
-
-	if (inarray == NULL || *inlen == 0)
-		return (0);
-
-	if (outarray == NULL) {
-		*errno = E2BIG;
-		return ((size_t)-1);
-	}
-
-	ib = (uchar_t *)inarray;
-	ob = (uchar_t *)outarray;
-	ibtail = ib + *inlen;
-	obtail = ob + *outlen;
-
-	do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
-	do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
-	is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
-	is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
-
-	ret_val = 0;
-
-	/*
-	 * If we don't have a normalization flag set, we do the simple case
-	 * conversion based text preparation separately below. Text
-	 * preparation involving Normalization will be done in the else
-	 * block, again, separately since it will take much more time and
-	 * resource than doing simple case conversions.
-	 */
-	if (f == 0) {
-		while (ib < ibtail) {
-			/* A null byte ends the input unless told otherwise. */
-			if (*ib == '\0' && do_not_ignore_null)
-				break;
-
-			sz = u8_number_of_bytes[*ib];
-
-			if (sz < 0) {
-				if (do_not_ignore_invalid) {
-					*errno = EILSEQ;
-					ret_val = (size_t)-1;
-					break;
-				}
-
-				/*
-				 * Count the illegal byte and then let it be
-				 * copied over by the single byte case below.
-				 */
-				sz = 1;
-				ret_val++;
-			}
-
-			if (sz == 1) {
-				if (ob >= obtail) {
-					*errno = E2BIG;
-					ret_val = (size_t)-1;
-					break;
-				}
-
-				if (is_it_toupper)
-					*ob = U8_ASCII_TOUPPER(*ib);
-				else if (is_it_tolower)
-					*ob = U8_ASCII_TOLOWER(*ib);
-				else
-					*ob = *ib;
-				ib++;
-				ob++;
-			} else if ((ib + sz) > ibtail) {
-				if (do_not_ignore_invalid) {
-					*errno = EINVAL;
-					ret_val = (size_t)-1;
-					break;
-				}
-
-				if ((obtail - ob) < (ibtail - ib)) {
-					*errno = E2BIG;
-					ret_val = (size_t)-1;
-					break;
-				}
-
-				/*
-				 * We treat the remaining incomplete character
-				 * bytes as a character.
-				 */
-				ret_val++;
-
-				while (ib < ibtail)
-					*ob++ = *ib++;
-			} else {
-				if (is_it_toupper || is_it_tolower) {
-					/*
-					 * The converted character goes to
-					 * u8s first so that nothing is
-					 * written, and no input is consumed,
-					 * if it does not fit.
-					 */
-					i = do_case_conv(unicode_version, u8s,
-					    ib, sz, is_it_toupper);
-
-					if ((obtail - ob) < i) {
-						*errno = E2BIG;
-						ret_val = (size_t)-1;
-						break;
-					}
-
-					ib += sz;
-
-					for (sz = 0; sz < i; sz++)
-						*ob++ = u8s[sz];
-				} else {
-					if ((obtail - ob) < sz) {
-						*errno = E2BIG;
-						ret_val = (size_t)-1;
-						break;
-					}
-
-					for (i = 0; i < sz; i++)
-						*ob++ = *ib++;
-				}
-			}
-		}
-	} else {
-		canonical_decomposition = flag & U8_CANON_DECOMP;
-		compatibility_decomposition = flag & U8_COMPAT_DECOMP;
-		canonical_composition = flag & U8_CANON_COMP;
-
-		while (ib < ibtail) {
-			if (*ib == '\0' && do_not_ignore_null)
-				break;
-
-			/*
-			 * If the current character is a 7-bit ASCII
-			 * character and it is the last character, or,
-			 * if the current character is a 7-bit ASCII
-			 * character and the next character is also a 7-bit
-			 * ASCII character, then, we copy over this
-			 * character without going through collect_a_seq().
-			 *
-			 * In any other cases, we need to look further with
-			 * the collect_a_seq() function.
-			 */
-			if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
-			    ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
-				if (ob >= obtail) {
-					*errno = E2BIG;
-					ret_val = (size_t)-1;
-					break;
-				}
-
-				if (is_it_toupper)
-					*ob = U8_ASCII_TOUPPER(*ib);
-				else if (is_it_tolower)
-					*ob = U8_ASCII_TOLOWER(*ib);
-				else
-					*ob = *ib;
-				ib++;
-				ob++;
-			} else {
-				/*
-				 * The *errno is cleared first so that
-				 * a non-zero value afterwards can only have
-				 * come from this collect_a_seq() call.
-				 */
-				*errno = 0;
-				state = U8_STATE_START;
-
-				j = collect_a_seq(unicode_version, u8s,
-				    &ib, ibtail,
-				    is_it_toupper,
-				    is_it_tolower,
-				    canonical_decomposition,
-				    compatibility_decomposition,
-				    canonical_composition,
-				    errno, &state);
-
-				if (*errno && do_not_ignore_invalid) {
-					ret_val = (size_t)-1;
-					break;
-				}
-
-				if ((obtail - ob) < j) {
-					*errno = E2BIG;
-					ret_val = (size_t)-1;
-					break;
-				}
-
-				for (i = 0; i < j; i++)
-					*ob++ = u8s[i];
-			}
-		}
-	}
-
-	/* Report back what is left of the input and of the output room. */
-	*inlen = ibtail - ib;
-	*outlen = obtail - ob;
-
-	return (ret_val);
-}
--- a/usr/src/uts/common/os/uconv.c	Fri Sep 14 08:32:57 2007 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,845 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
-/*
- * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
- * Man page: /shared/sac/PSARC/2005/446/materials/uconv_functions.9f
- * Interface stability: Consolidation Private
- */
-
-#include <sys/types.h>
-#include <sys/param.h>
-#include <sys/sysmacros.h>
-#include <sys/systm.h>
-#include <sys/debug.h>
-#include <sys/kmem.h>
-#include <sys/sunddi.h>
-#include <sys/byteorder.h>
-#include <sys/errno.h>
-
-
-/*
- * The max and min values of high and low surrogate pairs of UTF-16,
- * UTF-16 bit shift value, bit mask, and starting value outside of BMP.
- */
-#define	UCONV_U16_HI_MIN	(0xd800U)
-#define	UCONV_U16_HI_MAX	(0xdbffU)
-#define	UCONV_U16_LO_MIN	(0xdc00U)
-#define	UCONV_U16_LO_MAX	(0xdfffU)
-#define	UCONV_U16_BIT_SHIFT	(0x0400U)
-#define	UCONV_U16_BIT_MASK	(0x0fffffU)
-#define	UCONV_U16_START		(0x010000U)
-
-/* The maximum value of Unicode coding space and ASCII coding space. */
-#define	UCONV_UNICODE_MAX	(0x10ffffU)
-#define	UCONV_ASCII_MAX		(0x7fU)
-
-/* The mask values for input and output endians. */
-#define	UCONV_IN_ENDIAN_MASKS	(UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
-#define	UCONV_OUT_ENDIAN_MASKS	(UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)
-
-/* Native and reversed endian macros. */
-#ifdef	_BIG_ENDIAN
-#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_BIG_ENDIAN
-#define	UCONV_IN_REV_ENDIAN	UCONV_IN_LITTLE_ENDIAN
-#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_BIG_ENDIAN
-#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
-#else
-#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_LITTLE_ENDIAN
-#define	UCONV_IN_REV_ENDIAN	UCONV_IN_BIG_ENDIAN
-#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
-#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_BIG_ENDIAN
-#endif	/* _BIG_ENDIAN */
-
-/* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */
-#define	UCONV_BOM_NORMAL	(0xfeffU)
-#define	UCONV_BOM_SWAPPED	(0xfffeU)
-#define	UCONV_BOM_SWAPPED_32	(0xfffe0000U)
-
-/* UTF-32 boundaries based on UTF-8 character byte lengths. */
-#define	UCONV_U8_ONE_BYTE	(0x7fU)
-#define	UCONV_U8_TWO_BYTES	(0x7ffU)
-#define	UCONV_U8_THREE_BYTES	(0xffffU)
-#define	UCONV_U8_FOUR_BYTES	(0x10ffffU)
-
-/* The common minimum and maximum values at the UTF-8 character bytes. */
-#define	UCONV_U8_BYTE_MIN	(0x80U)
-#define	UCONV_U8_BYTE_MAX	(0xbfU)
-
-/*
- * The following "6" and "0x3f" came from "10xx xxxx" bit representation of
- * UTF-8 character bytes.
- */
-#define	UCONV_U8_BIT_SHIFT	6
-#define	UCONV_U8_BIT_MASK	0x3f
-
-/*
- * The following vector shows remaining bytes in a UTF-8 character.
- * Index will be the first byte of the character.
- */
-static const uchar_t remaining_bytes_tbl[0x100] = {
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-
-/*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF */
-	0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-
-/*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF */
-	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
-
-/*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF */
-	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
-
-/*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF */
-	3,  3,  3,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-};
-
-/*
- * The following is a vector of bit-masks to get used bits in
- * the first byte of a UTF-8 character.  Index is remaining bytes at above of
- * the character.
- */
-static const uchar_t masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
-
-/*
- * The following two vectors are to provide valid minimum and
- * maximum values for the 2'nd byte of a multibyte UTF-8 character for
- * better illegal sequence checking. The index value must be the value of
- * the first byte of the UTF-8 character.
- */
-static const uchar_t valid_min_2nd_byte[0x100] = {
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-
-/*	C0    C1    C2    C3    C4    C5    C6    C7 */
-	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-
-/*	C8    C9    CA    CB    CC    CD    CE    CF */
-	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-
-/*	D0    D1    D2    D3    D4    D5    D6    D7 */
-	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-
-/*	D8    D9    DA    DB    DC    DD    DE    DF */
-	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-
-/*	E0    E1    E2    E3    E4    E5    E6    E7 */
-	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-
-/*	E8    E9    EA    EB    EC    ED    EE    EF */
-	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-
-/*	F0    F1    F2    F3    F4    F5    F6    F7 */
-	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
-
-	0,    0,    0,    0,    0,    0,    0,    0
-};
-
-static const uchar_t valid_max_2nd_byte[0x100] = {
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-	0,    0,    0,    0,    0,    0,    0,    0,
-
-/*	C0    C1    C2    C3    C4    C5    C6    C7 */
-	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
-
-/*	C8    C9    CA    CB    CC    CD    CE    CF */
-	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
-
-/*	D0    D1    D2    D3    D4    D5    D6    D7 */
-	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
-
-/*	D8    D9    DA    DB    DC    DD    DE    DF */
-	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
-
-/*	E0    E1    E2    E3    E4    E5    E6    E7 */
-	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
-
-/*	E8    E9    EA    EB    EC    ED    EE    EF */
-	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
-
-/*	F0    F1    F2    F3    F4    F5    F6    F7 */
-	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
-
-	0,    0,    0,    0,    0,    0,    0,    0
-};
-
-
-static int
-check_endian(int flag, int *in, int *out)
-{
-	*in = flag & UCONV_IN_ENDIAN_MASKS;
-
-	/* You cannot have both. */
-	if (*in == UCONV_IN_ENDIAN_MASKS)
-		return (EBADF);
-
-	if (*in == 0)
-		*in = UCONV_IN_NAT_ENDIAN;
-
-	*out = flag & UCONV_OUT_ENDIAN_MASKS;
-
-	/* You cannot have both. */
-	if (*out == UCONV_OUT_ENDIAN_MASKS)
-		return (EBADF);
-
-	if (*out == 0)
-		*out = UCONV_OUT_NAT_ENDIAN;
-
-	return (0);
-}
-
-static boolean_t
-check_bom16(const uint16_t *u16s, size_t u16l, int *in)
-{
-	if (u16l > 0) {
-		if (*u16s == UCONV_BOM_NORMAL) {
-			*in = UCONV_IN_NAT_ENDIAN;
-			return (B_TRUE);
-		}
-		if (*u16s == UCONV_BOM_SWAPPED) {
-			*in = UCONV_IN_REV_ENDIAN;
-			return (B_TRUE);
-		}
-	}
-
-	return (B_FALSE);
-}
-
-static boolean_t
-check_bom32(const uint32_t *u32s, size_t u32l, int *in)
-{
-	if (u32l > 0) {
-		if (*u32s == UCONV_BOM_NORMAL) {
-			*in = UCONV_IN_NAT_ENDIAN;
-			return (B_TRUE);
-		}
-		if (*u32s == UCONV_BOM_SWAPPED_32) {
-			*in = UCONV_IN_REV_ENDIAN;
-			return (B_TRUE);
-		}
-	}
-
-	return (B_FALSE);
-}
-
-int
-uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
-    uint32_t *u32s, size_t *utf32len, int flag)
-{
-	int inendian;
-	int outendian;
-	size_t u16l;
-	size_t u32l;
-	uint32_t hi;
-	uint32_t lo;
-	boolean_t do_not_ignore_null;
-
-	/*
-	 * Do preliminary validity checks on parameters and collect info on
-	 * endians.
-	 */
-	if (u16s == NULL || utf16len == NULL)
-		return (EILSEQ);
-
-	if (u32s == NULL || utf32len == NULL)
-		return (E2BIG);
-
-	if (check_endian(flag, &inendian, &outendian) != 0)
-		return (EBADF);
-
-	/*
-	 * Initialize input and output parameter buffer indices and
-	 * temporary variables.
-	 */
-	u16l = u32l = 0;
-	hi = 0;
-	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
-
-	/*
-	 * Check on the BOM at the beginning of the input buffer if required
-	 * and if there is indeed one, process it.
-	 */
-	if ((flag & UCONV_IN_ACCEPT_BOM) &&
-	    check_bom16(u16s, *utf16len, &inendian))
-		u16l++;
-
-	/*
-	 * Reset inendian and outendian so that after this point, those can be
-	 * used as condition values.
-	 */
-	inendian &= UCONV_IN_NAT_ENDIAN;
-	outendian &= UCONV_OUT_NAT_ENDIAN;
-
-	/*
-	 * If there is something in the input buffer and if necessary and
-	 * requested, save the BOM at the output buffer.
-	 */
-	if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
-		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
-			UCONV_BOM_SWAPPED_32;
-
-	/*
-	 * Do conversion; if encounter a surrogate pair, assemble high and
-	 * low pair values to form a UTF-32 character. If a half of a pair
-	 * exists alone, then, either it is an illegal (EILSEQ) or
-	 * invalid (EINVAL) value.
-	 */
-	for (; u16l < *utf16len; u16l++) {
-		if (u16s[u16l] == 0 && do_not_ignore_null)
-			break;
-
-		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
-
-		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
-			if (hi)
-				return (EILSEQ);
-			hi = lo;
-			continue;
-		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
-			if (! hi)
-				return (EILSEQ);
-			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
-				lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
-				+ UCONV_U16_START;
-			hi = 0;
-		} else if (hi) {
-			return (EILSEQ);
-		}
-
-		if (u32l >= *utf32len)
-			return (E2BIG);
-
-		u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
-	}
-
-	/*
-	 * If high half didn't see low half, then, it's most likely the input
-	 * parameter is incomplete.
-	 */
-	if (hi)
-		return (EINVAL);
-
-	/*
-	 * Save the number of consumed and saved characters. They do not
-	 * include terminating NULL character (U+0000) at the end of
-	 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
-	 * the input buffer length is big enough to include the terminating
-	 * NULL character).
-	 */
-	*utf16len = u16l;
-	*utf32len = u32l;
-
-	return (0);
-}
-
-int
-uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
-    uchar_t *u8s, size_t *utf8len, int flag)
-{
-	int inendian;
-	int outendian;
-	size_t u16l;
-	size_t u8l;
-	uint32_t hi;
-	uint32_t lo;
-	boolean_t do_not_ignore_null;
-
-	if (u16s == NULL || utf16len == NULL)
-		return (EILSEQ);
-
-	if (u8s == NULL || utf8len == NULL)
-		return (E2BIG);
-
-	if (check_endian(flag, &inendian, &outendian) != 0)
-		return (EBADF);
-
-	u16l = u8l = 0;
-	hi = 0;
-	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
-
-	if ((flag & UCONV_IN_ACCEPT_BOM) &&
-	    check_bom16(u16s, *utf16len, &inendian))
-		u16l++;
-
-	inendian &= UCONV_IN_NAT_ENDIAN;
-
-	for (; u16l < *utf16len; u16l++) {
-		if (u16s[u16l] == 0 && do_not_ignore_null)
-			break;
-
-		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
-
-		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
-			if (hi)
-				return (EILSEQ);
-			hi = lo;
-			continue;
-		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
-			if (! hi)
-				return (EILSEQ);
-			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
-				lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
-				+ UCONV_U16_START;
-			hi = 0;
-		} else if (hi) {
-			return (EILSEQ);
-		}
-
-		/*
-		 * Now we convert a UTF-32 character into a UTF-8 character.
-		 * Unicode coding space is between U+0000 and U+10FFFF;
-		 * anything bigger is an illegal character.
-		 */
-		if (lo <= UCONV_U8_ONE_BYTE) {
-			if (u8l >= *utf8len)
-				return (E2BIG);
-			u8s[u8l++] = (uchar_t)lo;
-		} else if (lo <= UCONV_U8_TWO_BYTES) {
-			if ((u8l + 1) >= *utf8len)
-				return (E2BIG);
-			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
-			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
-		} else if (lo <= UCONV_U8_THREE_BYTES) {
-			if ((u8l + 2) >= *utf8len)
-				return (E2BIG);
-			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
-			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
-			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
-		} else if (lo <= UCONV_U8_FOUR_BYTES) {
-			if ((u8l + 3) >= *utf8len)
-				return (E2BIG);
-			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
-			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
-			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
-			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
-		} else {
-			return (EILSEQ);
-		}
-	}
-
-	if (hi)
-		return (EINVAL);
-
-	*utf16len = u16l;
-	*utf8len = u8l;
-
-	return (0);
-}
-
-int
-uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
-    uint16_t *u16s, size_t *utf16len, int flag)
-{
-	int inendian;
-	int outendian;
-	size_t u16l;
-	size_t u32l;
-	uint32_t hi;
-	uint32_t lo;
-	boolean_t do_not_ignore_null;
-
-	if (u32s == NULL || utf32len == NULL)
-		return (EILSEQ);
-
-	if (u16s == NULL || utf16len == NULL)
-		return (E2BIG);
-
-	if (check_endian(flag, &inendian, &outendian) != 0)
-		return (EBADF);
-
-	u16l = u32l = 0;
-	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
-
-	if ((flag & UCONV_IN_ACCEPT_BOM) &&
-	    check_bom32(u32s, *utf32len, &inendian))
-		u32l++;
-
-	inendian &= UCONV_IN_NAT_ENDIAN;
-	outendian &= UCONV_OUT_NAT_ENDIAN;
-
-	if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
-		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
-			UCONV_BOM_SWAPPED;
-
-	for (; u32l < *utf32len; u32l++) {
-		if (u32s[u32l] == 0 && do_not_ignore_null)
-			break;
-
-		hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
-
-		/*
-		 * Anything bigger than the Unicode coding space, i.e.,
-		 * Unicode scalar value bigger than U+10FFFF, is an illegal
-		 * character.
-		 */
-		if (hi > UCONV_UNICODE_MAX)
-			return (EILSEQ);
-
-		/*
-		 * Anything bigger than U+FFFF must be converted into
-		 * a surrogate pair in UTF-16.
-		 */
-		if (hi >= UCONV_U16_START) {
-			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
-				UCONV_U16_LO_MIN;
-			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
-				UCONV_U16_HI_MIN;
-
-			if ((u16l + 1) >= *utf16len)
-				return (E2BIG);
-
-			if (outendian) {
-				u16s[u16l++] = (uint16_t)hi;
-				u16s[u16l++] = (uint16_t)lo;
-			} else {
-				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
-				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
-			}
-		} else {
-			if (u16l >= *utf16len)
-				return (E2BIG);
-			u16s[u16l++] = (outendian) ? (uint16_t)hi :
-				BSWAP_16(((uint16_t)hi));
-		}
-	}
-
-	*utf16len = u16l;
-	*utf32len = u32l;
-
-	return (0);
-}
-
-int
-uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
-    uchar_t *u8s, size_t *utf8len, int flag)
-{
-	int inendian;
-	int outendian;
-	size_t u32l;
-	size_t u8l;
-	uint32_t lo;
-	boolean_t do_not_ignore_null;
-
-	if (u32s == NULL || utf32len == NULL)
-		return (EILSEQ);
-
-	if (u8s == NULL || utf8len == NULL)
-		return (E2BIG);
-
-	if (check_endian(flag, &inendian, &outendian) != 0)
-		return (EBADF);
-
-	u32l = u8l = 0;
-	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
-
-	if ((flag & UCONV_IN_ACCEPT_BOM) &&
-	    check_bom32(u32s, *utf32len, &inendian))
-		u32l++;
-
-	inendian &= UCONV_IN_NAT_ENDIAN;
-
-	for (; u32l < *utf32len; u32l++) {
-		if (u32s[u32l] == 0 && do_not_ignore_null)
-			break;
-
-		lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
-
-		if (lo <= UCONV_U8_ONE_BYTE) {
-			if (u8l >= *utf8len)
-				return (E2BIG);
-			u8s[u8l++] = (uchar_t)lo;
-		} else if (lo <= UCONV_U8_TWO_BYTES) {
-			if ((u8l + 1) >= *utf8len)
-				return (E2BIG);
-			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
-			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
-		} else if (lo <= UCONV_U8_THREE_BYTES) {
-			if ((u8l + 2) >= *utf8len)
-				return (E2BIG);
-			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
-			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
-			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
-		} else if (lo <= UCONV_U8_FOUR_BYTES) {
-			if ((u8l + 3) >= *utf8len)
-				return (E2BIG);
-			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
-			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
-			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
-			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
-		} else {
-			return (EILSEQ);
-		}
-	}
-
-	*utf32len = u32l;
-	*utf8len = u8l;
-
-	return (0);
-}
-
-int
-uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
-    uint16_t *u16s, size_t *utf16len, int flag)
-{
-	int inendian;
-	int outendian;
-	size_t u16l;
-	size_t u8l;
-	uint32_t hi;
-	uint32_t lo;
-	int remaining_bytes;
-	int first_b;
-	boolean_t do_not_ignore_null;
-
-	if (u8s == NULL || utf8len == NULL)
-		return (EILSEQ);
-
-	if (u16s == NULL || utf16len == NULL)
-		return (E2BIG);
-
-	if (check_endian(flag, &inendian, &outendian) != 0)
-		return (EBADF);
-
-	u16l = u8l = 0;
-	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
-
-	outendian &= UCONV_OUT_NAT_ENDIAN;
-
-	if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
-		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
-			UCONV_BOM_SWAPPED;
-
-	for (; u8l < *utf8len; ) {
-		if (u8s[u8l] == 0 && do_not_ignore_null)
-			break;
-
-		/*
-		 * Collect a UTF-8 character and convert it to a UTF-32
-		 * character. In doing so, we screen out illegally formed
-		 * UTF-8 characters and treat such as illegal characters.
-		 * The algorithm at below also screens out anything bigger
-		 * than the U+10FFFF.
-		 *
-		 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
-		 * more details on the illegal values of UTF-8 character
-		 * bytes.
-		 */
-		hi = (uint32_t)u8s[u8l++];
-
-		if (hi > UCONV_ASCII_MAX) {
-			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
-				return (EILSEQ);
-
-			first_b = hi;
-			hi = hi & masks_tbl[remaining_bytes];
-
-			for (; remaining_bytes > 0; remaining_bytes--) {
-				/*
-				 * If we have no more bytes, the current
-				 * UTF-8 character is incomplete.
-				 */
-				if (u8l >= *utf8len)
-					return (EINVAL);
-
-				lo = (uint32_t)u8s[u8l++];
-
-				if (first_b) {
-					if (lo < valid_min_2nd_byte[first_b] ||
-					    lo > valid_max_2nd_byte[first_b])
-						return (EILSEQ);
-					first_b = 0;
-				} else if (lo < UCONV_U8_BYTE_MIN ||
-					lo > UCONV_U8_BYTE_MAX) {
-					return (EILSEQ);
-				}
-				hi = (hi << UCONV_U8_BIT_SHIFT) |
-					(lo & UCONV_U8_BIT_MASK);
-			}
-		}
-
-		if (hi >= UCONV_U16_START) {
-			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
-				UCONV_U16_LO_MIN;
-			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
-				UCONV_U16_HI_MIN;
-
-			if ((u16l + 1) >= *utf16len)
-				return (E2BIG);
-
-			if (outendian) {
-				u16s[u16l++] = (uint16_t)hi;
-				u16s[u16l++] = (uint16_t)lo;
-			} else {
-				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
-				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
-			}
-		} else {
-			if (u16l >= *utf16len)
-				return (E2BIG);
-
-			u16s[u16l++] = (outendian) ? (uint16_t)hi :
-				BSWAP_16(((uint16_t)hi));
-		}
-	}
-
-	*utf16len = u16l;
-	*utf8len = u8l;
-
-	return (0);
-}
-
-int
-uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
-    uint32_t *u32s, size_t *utf32len, int flag)
-{
-	int inendian;
-	int outendian;
-	size_t u32l;
-	size_t u8l;
-	uint32_t hi;
-	uint32_t c;
-	int remaining_bytes;
-	int first_b;
-	boolean_t do_not_ignore_null;
-
-	if (u8s == NULL || utf8len == NULL)
-		return (EILSEQ);
-
-	if (u32s == NULL || utf32len == NULL)
-		return (E2BIG);
-
-	if (check_endian(flag, &inendian, &outendian) != 0)
-		return (EBADF);
-
-	u32l = u8l = 0;
-	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
-
-	outendian &= UCONV_OUT_NAT_ENDIAN;
-
-	if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
-		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
-			UCONV_BOM_SWAPPED_32;
-
-	for (; u8l < *utf8len; ) {
-		if (u8s[u8l] == 0 && do_not_ignore_null)
-			break;
-
-		hi = (uint32_t)u8s[u8l++];
-
-		if (hi > UCONV_ASCII_MAX) {
-			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
-				return (EILSEQ);
-
-			first_b = hi;
-			hi = hi & masks_tbl[remaining_bytes];
-
-			for (; remaining_bytes > 0; remaining_bytes--) {
-				if (u8l >= *utf8len)
-					return (EINVAL);
-
-				c = (uint32_t)u8s[u8l++];
-
-				if (first_b) {
-					if (c < valid_min_2nd_byte[first_b] ||
-					    c > valid_max_2nd_byte[first_b])
-						return (EILSEQ);
-					first_b = 0;
-				} else if (c < UCONV_U8_BYTE_MIN ||
-					c > UCONV_U8_BYTE_MAX) {
-					return (EILSEQ);
-				}
-				hi = (hi << UCONV_U8_BIT_SHIFT) |
-					(c & UCONV_U8_BIT_MASK);
-			}
-		}
-
-		if (u32l >= *utf32len)
-			return (E2BIG);
-
-		u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
-	}
-
-	*utf32len = u32l;
-	*utf8len = u8l;
-
-	return (0);
-}
--- a/usr/src/uts/common/sys/Makefile	Fri Sep 14 08:32:57 2007 -0700
+++ b/usr/src/uts/common/sys/Makefile	Fri Sep 14 10:25:36 2007 -0700
@@ -534,6 +534,7 @@
 	turnstile.h		\
 	types.h			\
 	types32.h		\
+	u8_textprep.h		\
 	u8_textprep_data.h	\
 	uadmin.h		\
 	ucred.h			\
--- a/usr/src/uts/common/sys/sunddi.h	Fri Sep 14 08:32:57 2007 -0700
+++ b/usr/src/uts/common/sys/sunddi.h	Fri Sep 14 10:25:36 2007 -0700
@@ -59,6 +59,7 @@
 #ifdef _KERNEL
 #include <sys/ddi_obsolete.h>
 #endif
+#include <sys/u8_textprep.h>

 #ifdef	__cplusplus
 extern "C" {
@@ -441,76 +442,6 @@
 extern int ddi_strtoul(const char *, char **, int, unsigned long *);

 /*
- * Unicode encoding conversion functions and their macros.
- */
-#define	UCONV_IN_BIG_ENDIAN		0x0001
-#define	UCONV_OUT_BIG_ENDIAN		0x0002
-#define	UCONV_IN_SYSTEM_ENDIAN		0x0004
-#define	UCONV_OUT_SYSTEM_ENDIAN		0x0008
-#define	UCONV_IN_LITTLE_ENDIAN		0x0010
-#define	UCONV_OUT_LITTLE_ENDIAN		0x0020
-#define	UCONV_IGNORE_NULL		0x0040
-#define	UCONV_IN_ACCEPT_BOM		0x0080
-#define	UCONV_OUT_EMIT_BOM		0x0100
-
-extern int uconv_u16tou32(const uint16_t *, size_t *, uint32_t *, size_t *,
-	int);
-extern int uconv_u16tou8(const uint16_t *, size_t *, uchar_t *, size_t *, int);
-extern int uconv_u32tou16(const uint32_t *, size_t *, uint16_t *, size_t *,
-	int);
-extern int uconv_u32tou8(const uint32_t *, size_t *, uchar_t *, size_t *, int);
-extern int uconv_u8tou16(const uchar_t *, size_t *, uint16_t *, size_t *, int);
-extern int uconv_u8tou32(const uchar_t *, size_t *, uint32_t *, size_t *, int);
-
-/*
- * UTF-8 text preparation functions and their macros.
- *
- * Among the macros defined, U8_CANON_DECOMP, U8_COMPAT_DECOMP, and
- * U8_CANON_COMP are not public interfaces and must not be used directly
- * at the flag input argument.
- */
-#define	U8_STRCMP_CS			(0x00000001)
-#define	U8_STRCMP_CI_UPPER		(0x00000002)
-#define	U8_STRCMP_CI_LOWER		(0x00000004)
-
-#define	U8_CANON_DECOMP			(0x00000010)
-#define	U8_COMPAT_DECOMP		(0x00000020)
-#define	U8_CANON_COMP			(0x00000040)
-
-#define	U8_STRCMP_NFD			(U8_CANON_DECOMP)
-#define	U8_STRCMP_NFC			(U8_CANON_DECOMP | U8_CANON_COMP)
-#define	U8_STRCMP_NFKD			(U8_COMPAT_DECOMP)
-#define	U8_STRCMP_NFKC			(U8_COMPAT_DECOMP | U8_CANON_COMP)
-
-#define	U8_TEXTPREP_TOUPPER		(U8_STRCMP_CI_UPPER)
-#define	U8_TEXTPREP_TOLOWER		(U8_STRCMP_CI_LOWER)
-
-#define	U8_TEXTPREP_NFD			(U8_STRCMP_NFD)
-#define	U8_TEXTPREP_NFC			(U8_STRCMP_NFC)
-#define	U8_TEXTPREP_NFKD		(U8_STRCMP_NFKD)
-#define	U8_TEXTPREP_NFKC		(U8_STRCMP_NFKC)
-
-#define	U8_TEXTPREP_IGNORE_NULL		(0x00010000)
-#define	U8_TEXTPREP_IGNORE_INVALID	(0x00020000)
-#define	U8_TEXTPREP_NOWAIT		(0x00040000)
-
-#define	U8_UNICODE_320			(0)
-#define	U8_UNICODE_500			(1)
-#define	U8_UNICODE_LATEST		(U8_UNICODE_500)
-
-#define	U8_VALIDATE_ENTIRE		(0x00100000)
-#define	U8_VALIDATE_CHECK_ADDITIONAL	(0x00200000)
-#define	U8_VALIDATE_UCS2_RANGE		(0x00400000)
-
-#define	U8_ILLEGAL_CHAR			(-1)
-#define	U8_OUT_OF_RANGE_CHAR		(-2)
-
-extern int u8_validate(char *, size_t, char **, int, int *);
-extern int u8_strcmp(const char *, const char *, size_t, int, size_t, int *);
-extern size_t u8_textprep_str(char *, size_t *, char *, size_t *, int, size_t,
-	int *);
-
-/*
  * ddi_map_regs
  *
  *	Map in the register set given by rnumber.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/sys/u8_textprep.h	Fri Sep 14 10:25:36 2007 -0700
@@ -0,0 +1,113 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_U8_TEXTPREP_H
+#define	_SYS_U8_TEXTPREP_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/errno.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Unicode encoding conversion functions and their macros.
+ */
+#define	UCONV_IN_BIG_ENDIAN		0x0001
+#define	UCONV_OUT_BIG_ENDIAN		0x0002
+#define	UCONV_IN_SYSTEM_ENDIAN		0x0004
+#define	UCONV_OUT_SYSTEM_ENDIAN		0x0008
+#define	UCONV_IN_LITTLE_ENDIAN		0x0010
+#define	UCONV_OUT_LITTLE_ENDIAN		0x0020
+#define	UCONV_IGNORE_NULL		0x0040
+#define	UCONV_IN_ACCEPT_BOM		0x0080
+#define	UCONV_OUT_EMIT_BOM		0x0100
+
+extern int uconv_u16tou32(const uint16_t *, size_t *, uint32_t *, size_t *,
+	int);
+extern int uconv_u16tou8(const uint16_t *, size_t *, uchar_t *, size_t *, int);
+extern int uconv_u32tou16(const uint32_t *, size_t *, uint16_t *, size_t *,
+	int);
+extern int uconv_u32tou8(const uint32_t *, size_t *, uchar_t *, size_t *, int);
+extern int uconv_u8tou16(const uchar_t *, size_t *, uint16_t *, size_t *, int);
+extern int uconv_u8tou32(const uchar_t *, size_t *, uint32_t *, size_t *, int);
+
+/*
+ * UTF-8 text preparation functions and their macros.
+ *
+ * Among the macros defined, U8_CANON_DECOMP, U8_COMPAT_DECOMP, and
+ * U8_CANON_COMP are not public interfaces and must not be used directly
+ * in the flag input argument.
+ */
+#define	U8_STRCMP_CS			(0x00000001)
+#define	U8_STRCMP_CI_UPPER		(0x00000002)
+#define	U8_STRCMP_CI_LOWER		(0x00000004)
+
+#define	U8_CANON_DECOMP			(0x00000010)
+#define	U8_COMPAT_DECOMP		(0x00000020)
+#define	U8_CANON_COMP			(0x00000040)
+
+#define	U8_STRCMP_NFD			(U8_CANON_DECOMP)
+#define	U8_STRCMP_NFC			(U8_CANON_DECOMP | U8_CANON_COMP)
+#define	U8_STRCMP_NFKD			(U8_COMPAT_DECOMP)
+#define	U8_STRCMP_NFKC			(U8_COMPAT_DECOMP | U8_CANON_COMP)
+
+#define	U8_TEXTPREP_TOUPPER		(U8_STRCMP_CI_UPPER)
+#define	U8_TEXTPREP_TOLOWER		(U8_STRCMP_CI_LOWER)
+
+#define	U8_TEXTPREP_NFD			(U8_STRCMP_NFD)
+#define	U8_TEXTPREP_NFC			(U8_STRCMP_NFC)
+#define	U8_TEXTPREP_NFKD		(U8_STRCMP_NFKD)
+#define	U8_TEXTPREP_NFKC		(U8_STRCMP_NFKC)
+
+#define	U8_TEXTPREP_IGNORE_NULL		(0x00010000)
+#define	U8_TEXTPREP_IGNORE_INVALID	(0x00020000)
+#define	U8_TEXTPREP_NOWAIT		(0x00040000)
+
+#define	U8_UNICODE_320			(0)
+#define	U8_UNICODE_500			(1)
+#define	U8_UNICODE_LATEST		(U8_UNICODE_500)
+
+#define	U8_VALIDATE_ENTIRE		(0x00100000)
+#define	U8_VALIDATE_CHECK_ADDITIONAL	(0x00200000)
+#define	U8_VALIDATE_UCS2_RANGE		(0x00400000)
+
+#define	U8_ILLEGAL_CHAR			(-1)
+#define	U8_OUT_OF_RANGE_CHAR		(-2)
+
+extern int u8_validate(char *, size_t, char **, int, int *);
+extern int u8_strcmp(const char *, const char *, size_t, int, size_t, int *);
+extern size_t u8_textprep_str(char *, size_t *, char *, size_t *, int, size_t,
+	int *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_U8_TEXTPREP_H */