# HG changeset patch # User is # Date 1189790736 25200 # Node ID 09764a26229ef1b3e2fadfec9596ad9d33007983 # Parent a4c12419233c5ba8f90b489645e13ce0ec52ff82 6603632 PSARC/2007/458 User land UTF-8 text preparation functions 6603633 PSARC/2007/517 Uconv functions at libc diff -r a4c12419233c -r 09764a26229e usr/src/common/unicode/u8_textprep.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/common/unicode/u8_textprep.c Fri Sep 14 10:25:36 2007 -0700 @@ -0,0 +1,2132 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + + +/* + * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458). + * + * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F), + * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also + * the section 3C man pages. + * Interface stability: Committed. 
+ */ + +#include +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include +#else +#include +#include +#endif /* _KERNEL */ +#include +#include +#include + + +/* The maximum possible number of bytes in a UTF-8 character. */ +#define U8_MB_CUR_MAX (4) + +/* + * The maximum number of bytes needed for a UTF-8 character to cover + * U+0000 - U+FFFF, i.e., the coding space of the now deprecated UCS-2. + */ +#define U8_MAX_BYTES_UCS2 (3) + +/* The maximum possible number of bytes in a Stream-Safe Text. */ +#define U8_STREAM_SAFE_TEXT_MAX (128) + +/* + * The maximum number of characters in a combining/conjoining sequence and + * the actual upper bound limit of a combining/conjoining sequence. + */ +#define U8_MAX_CHARS_A_SEQ (32) +#define U8_UPPER_LIMIT_IN_A_SEQ (31) + +/* The combining class value for Starter. */ +#define U8_COMBINING_CLASS_STARTER (0) + +/* + * Hangul related macros follow. + * + * The first and the last of Hangul syllables, Hangul Jamo Leading consonants, + * Vowels, and optional Trailing consonants in Unicode scalar values. + * + * Please note that U8_HANGUL_JAMO_T_FIRST is 0x11A7 below, not the actual + * U+11A8. This is because the trailing consonant is optional and thus + * the value is pre-calculated with one subtracted; for the same reason + * U8_HANGUL_JAMO_T() uses a strict '>' comparison against it. + * + * Each of the 19 modern leading consonants has a total of 588 possible + * syllables since Hangul has 21 modern vowels and 27 modern trailing + * consonants plus 1 for the no trailing consonant case, i.e., + * 21 x 28 = 588. + * + * Please bear in mind that U8_HANGUL_JAMO_1ST_BYTE can be used to check + * whether a character may be a Hangul Jamo, but a match does not guarantee + * that it is one; it only makes it likely. + * + * U8_SAVE_HANGUL_AS_UTF8() expands to three separate statements, so it + * must not be used as the body of an unbraced if/else or loop. 
+ */ +#define U8_HANGUL_SYL_FIRST (0xAC00U) +#define U8_HANGUL_SYL_LAST (0xD7A3U) + +#define U8_HANGUL_JAMO_L_FIRST (0x1100U) +#define U8_HANGUL_JAMO_L_LAST (0x1112U) +#define U8_HANGUL_JAMO_V_FIRST (0x1161U) +#define U8_HANGUL_JAMO_V_LAST (0x1175U) +#define U8_HANGUL_JAMO_T_FIRST (0x11A7U) +#define U8_HANGUL_JAMO_T_LAST (0x11C2U) + +#define U8_HANGUL_V_COUNT (21) +#define U8_HANGUL_VT_COUNT (588) +#define U8_HANGUL_T_COUNT (28) + +#define U8_HANGUL_JAMO_1ST_BYTE (0xE1U) + +#define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \ + (s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \ + (s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \ + (s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU)); + +#define U8_HANGUL_JAMO_L(u) \ + ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST) + +#define U8_HANGUL_JAMO_V(u) \ + ((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST) + +#define U8_HANGUL_JAMO_T(u) \ + ((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) + +#define U8_HANGUL_JAMO(u) \ + ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) + +#define U8_HANGUL_SYLLABLE(u) \ + ((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST) + +#define U8_HANGUL_COMPOSABLE_L_V(s, u) \ + ((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u))) + +#define U8_HANGUL_COMPOSABLE_LV_T(s, u) \ + ((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u))) + +/* The types of decomposition mappings. */ +#define U8_DECOMP_BOTH (0xF5U) +#define U8_DECOMP_CANONICAL (0xF6U) + +/* The indicator for 16-bit table. */ +#define U8_16BIT_TABLE_INDICATOR (0x8000U) + +/* + * The following are some convenience macros. + * + * U8_PUT_3BYTES_INTO_UTF32() and U8_SIMPLE_SWAP() expand to statements + * that already end with a semicolon (U8_SIMPLE_SWAP() to three of them), + * so they must not be used as the body of an unbraced if/else or loop. + * U8_ASCII_TOUPPER() and U8_ASCII_TOLOWER() evaluate their argument more + * than once, so the argument must not have side effects. + */ +#define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \ + (u) = ((uint32_t)(b1) & 0x0F) << 12 | ((uint32_t)(b2) & 0x3F) << 6 | \ + (uint32_t)(b3) & 0x3F; + +#define U8_SIMPLE_SWAP(a, b, t) \ + (t) = (a); \ + (a) = (b); \ + (b) = (t); + +#define U8_ASCII_TOUPPER(c) \ + (((c) >= 'a' && (c) <= 'z') ? 
(c) - 'a' + 'A' : (c)) + +#define U8_ASCII_TOLOWER(c) \ + (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c)) + +#define U8_ISASCII(c) (((uchar_t)(c)) < 0x80U) +/* + * The following macro assumes that the two characters that are to be + * swapped are adjacent to each other and 'a' comes before 'b'. + * + * If the assumptions are not met, then, the macro will fail. + * + * The macro is not self-contained: it reads and writes the variables + * k, u8t, u8s, start, disp, comb_class and tc, all of which must be in + * scope at the point of use, and it expands to several statements. + */ +#define U8_SWAP_COMB_MARKS(a, b) \ + for (k = 0; k < disp[(a)]; k++) \ + u8t[k] = u8s[start[(a)] + k]; \ + for (k = 0; k < disp[(b)]; k++) \ + u8s[start[(a)] + k] = u8s[start[(b)] + k]; \ + start[(b)] = start[(a)] + disp[(b)]; \ + for (k = 0; k < disp[(a)]; k++) \ + u8s[start[(b)] + k] = u8t[k]; \ + U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \ + U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); + +/* The possible states during normalization. */ +typedef enum { + U8_STATE_START = 0, + U8_STATE_HANGUL_L = 1, + U8_STATE_HANGUL_LV = 2, + U8_STATE_HANGUL_LVT = 3, + U8_STATE_HANGUL_V = 4, + U8_STATE_HANGUL_T = 5, + U8_STATE_COMBINING_MARK = 6 +} u8_normalization_states_t; + +/* + * The three vectors at below are used to check bytes of a given UTF-8 + * character are valid and not containing any malformed byte values. + * + * We used to have a quite relaxed UTF-8 binary representation but then there + * was some security related issues and so the Unicode Consortium defined + * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it + * one more time at the Unicode 3.2. The following three tables are based on + * that. 
+ */ + +#define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF) + +#define I_ U8_ILLEGAL_CHAR +#define O_ U8_OUT_OF_RANGE_CHAR + +const int8_t u8_number_of_bytes[0x100] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + +/* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */ + I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, + +/* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */ + I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, + +/* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */ + I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, + +/* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */ + I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, + +/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ + I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + +/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + +/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + +/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ + 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, +}; + +#undef I_ +#undef O_ + +const uint8_t u8_valid_min_2nd_byte[0x100] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 
0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +/* C0 C1 C2 C3 C4 C5 C6 C7 */ + 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* C8 C9 CA CB CC CD CE CF */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* D0 D1 D2 D3 D4 D5 D6 D7 */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* D8 D9 DA DB DC DD DE DF */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* E0 E1 E2 E3 E4 E5 E6 E7 */ + 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* E8 E9 EA EB EC ED EE EF */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* F0 F1 F2 F3 F4 F5 F6 F7 */ + 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; + +const uint8_t u8_valid_max_2nd_byte[0x100] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +/* C0 C1 C2 C3 C4 C5 C6 C7 */ + 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, +/* C8 C9 CA CB CC CD CE CF */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, +/* D0 D1 D2 D3 D4 D5 D6 D7 */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, +/* D8 D9 DA DB DC DD DE DF */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, +/* E0 E1 E2 E3 E4 E5 E6 E7 */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, +/* E8 E9 EA EB EC ED EE EF */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, +/* F0 F1 F2 F3 F4 F5 F6 F7 */ + 0xbf, 
0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; + + +/* + * The u8_validate() function validates the given UTF-8 character string and + * calculates its byte length. It is quite similar to mblen(3C) except that + * it is specific to UTF-8 and Unicode and can also check the characters + * against a caller supplied list of characters. + * + * Arguments: + * u8str The bytes to validate. If it is NULL, 0 is returned. + * n The number of bytes available at u8str. + * list Consulted only when U8_VALIDATE_CHECK_ADDITIONAL is set in flag: + * a NULL-terminated vector of null-terminated byte strings. + * A validated character whose bytes are exactly equal to one of + * the entries fails with EBADF. + * NOTE(review): list itself is dereferenced without a NULL check + * when that flag is set. + * flag A bit mask. Unless U8_VALIDATE_ENTIRE is set, only the first + * character is validated. With U8_VALIDATE_UCS2_RANGE, a character + * longer than U8_MAX_BYTES_UCS2 bytes fails with ERANGE. + * errno Written only on failure: EILSEQ for an illegal first or + * following byte, ERANGE for an out of range first byte, EINVAL + * when fewer bytes remain than the first byte calls for, and + * EBADF for a character found in list. + * NOTE(review): the parameter has the same name as the errno + * identifier of errno.h; confirm no such macro is in effect here. + * + * Returns the number of bytes validated, or -1 on failure. + */ +int +u8_validate(char *u8str, size_t n, char **list, int flag, int *errno) +{ + uchar_t *ib; + uchar_t *ibtail; + uchar_t **p; + uchar_t *s1; + uchar_t *s2; + uchar_t f; + int sz; + size_t i; + int ret_val; + boolean_t second; + boolean_t no_need_to_validate_entire; + boolean_t check_additional; + boolean_t validate_ucs2_range_only; + + if (! u8str) + return (0); + + ib = (uchar_t *)u8str; + ibtail = ib + n; + + ret_val = 0; + + no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE); + check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL; + validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE; + + while (ib < ibtail) { + /* + * The first byte of a UTF-8 character tells how many + * bytes the character has in total. If the first byte + * is an illegal byte value or out of range value, we just + * return -1 with an appropriate error number. + */ + sz = u8_number_of_bytes[*ib]; + if (sz == U8_ILLEGAL_CHAR) { + *errno = EILSEQ; + return (-1); + } + + if (sz == U8_OUT_OF_RANGE_CHAR || + (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) { + *errno = ERANGE; + return (-1); + } + + /* + * If we don't have enough bytes to check on, that's also + * an error. As you can see, we give illegal byte sequence + * checking higher priority than the EINVAL cases. + */ + if ((ibtail - ib) < sz) { + *errno = EINVAL; + return (-1); + } + + if (sz == 1) { + ib++; + ret_val++; + } else { + /* + * Check on the multi-byte UTF-8 character. For more + * details on this, see the comment added for the used + * data structures at the beginning of the file. 
+ */ + f = *ib++; /* the first byte selects the valid range of the second byte */ + ret_val++; + second = B_TRUE; + for (i = 1; i < sz; i++) { + if (second) { + if (*ib < u8_valid_min_2nd_byte[f] || + *ib > u8_valid_max_2nd_byte[f]) { + *errno = EILSEQ; + return (-1); + } + second = B_FALSE; + } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) { + *errno = EILSEQ; + return (-1); + } + ib++; + ret_val++; + } + } + + if (check_additional) { /* compare the character just validated, at ib - sz, with each entry */ + for (p = (uchar_t **)list, i = 0; p[i]; i++) { + s1 = ib - sz; + s2 = p[i]; + while (s1 < ib) { + if (*s1 != *s2 || *s2 == '\0') + break; + s1++; + s2++; + } + + if (s1 >= ib && *s2 == '\0') { + *errno = EBADF; /* the whole character equals the whole entry */ + return (-1); + } + } + } + + if (no_need_to_validate_entire) + break; /* only the first character was asked for */ + } + + return (ret_val); +} + +/* + * The do_case_conv() function looks at the mapping tables and stores the + * mapped bytes at u8s if there is a mapping. If there is none, the input + * bytes are stored instead. The function always terminates the stored bytes + * with a null character, assuming that there is plenty of room to do so. + * + * The case conversions are simple case conversions mapping a character to + * another character as specified in the Unicode data. The byte size of + * the mapped character could be different from that of the input character. + * + * Arguments: + * uv The first index into each of the mapping tables. + * u8s The output buffer. + * s The input character bytes. + * sz The number of bytes of the input character at s. + * is_it_toupper Selects the u8_toupper_* tables if true and the + * u8_tolower_* tables otherwise. + * + * The return value is the byte length of the stored character excluding + * the terminating null byte. + */ +static size_t +do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper) +{ + size_t i; + uint16_t b1 = 0; + uint16_t b2 = 0; + uint16_t b3 = 0; + uint16_t b3_tbl; + uint16_t b3_base; + uint16_t b4 = 0; + size_t start_id; + size_t end_id; + + /* + * At this point, the only possible values for sz are 2, 3, and 4. + * The u8s should point to a vector that is well beyond the size of + * 5 bytes. 
+ */ + if (sz == 2) { + b3 = u8s[0] = s[0]; + b4 = u8s[1] = s[1]; + } else if (sz == 3) { + b2 = u8s[0] = s[0]; + b3 = u8s[1] = s[1]; + b4 = u8s[2] = s[2]; + } else if (sz == 4) { + b1 = u8s[0] = s[0]; + b2 = u8s[1] = s[1]; + b3 = u8s[2] = s[2]; + b4 = u8s[3] = s[3]; + } else { + /* + * This is not possible but just in case as a fallback: only + * the first byte is converted, using the ASCII rules, and + * a length of 1 is returned. + */ + if (is_it_toupper) + *u8s = U8_ASCII_TOUPPER(*s); + else + *u8s = U8_ASCII_TOLOWER(*s); + u8s[1] = '\0'; + + return (1); + } + u8s[sz] = '\0'; + + /* + * Let's find out if we have a corresponding character. + * + * The input bytes were right-aligned into b1 to b4 above, so + * a character shorter than four bytes uses 0 for the missing + * leading indices. Each lookup yields the index into the next + * table. U8_TBL_ELEMENT_NOT_DEF at any level means there is no + * mapping, in which case the input bytes already copied into u8s + * are what the caller gets. + */ + b1 = u8_common_b1_tbl[uv][b1]; + if (b1 == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + b2 = u8_case_common_b2_tbl[uv][b1][b2]; + if (b2 == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + if (is_it_toupper) { + b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id; + if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4]; + end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1]; + + /* + * The b4 table entries at [b4] and [b4 + 1] delimit the + * mapped bytes in the final table. An empty range means + * there is no match and a range longer than U8_MB_CUR_MAX + * is treated as an error at the table. + */ + if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) + return ((size_t)sz); + + b3_base = u8_toupper_b3_tbl[uv][b2][b3].base; + + for (i = 0; start_id < end_id; start_id++) + u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id]; + } else { + b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id; + if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4]; + end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1]; /* same range scheme as the toupper branch */ + + if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) + return ((size_t)sz); + + b3_base = u8_tolower_b3_tbl[uv][b2][b3].base; + + for (i = 0; start_id < end_id; start_id++) + u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id]; + } + + /* + * If i is still zero, that means there is no corresponding character. 
+ */ + if (i == 0) + return ((size_t)sz); + + u8s[i] = '\0'; + + return (i); +} + +/* + * The do_case_compare() function compares the two input strings, s1 and s2, + * one character at a time, doing case conversions if applicable, and returns + * the comparison result like strcmp() does. + * + * Since, empirically, most text data consists of 7-bit ASCII characters, + * we treat the 7-bit ASCII characters as a special case trying to yield + * faster processing time. + * + * Arguments: + * uv Passed on to do_case_conv() as the table index. + * s1, s2 The two strings to compare. + * n1, n2 The byte lengths of s1 and s2. + * is_it_toupper Both sides are converted to uppercase if true and + * to lowercase otherwise before they are compared. + * errno Set to EILSEQ when an illegal first byte is seen and to + * EINVAL when a character is cut short by the end of its + * string. The comparison still continues in both cases and + * errno is not written otherwise. + * + * Returns 0 if the strings are the same, a negative value if s1 sorts + * before s2, and a positive value if s1 sorts after s2. + */ +static int +do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, + size_t n2, boolean_t is_it_toupper, int *errno) +{ + int f; + int sz1; + int sz2; + size_t j; + size_t i1; + size_t i2; + uchar_t u8s1[U8_MB_CUR_MAX + 1]; + uchar_t u8s2[U8_MB_CUR_MAX + 1]; + + i1 = i2 = 0; + while (i1 < n1 && i2 < n2) { + /* + * Find out what would be the byte length for this UTF-8 + * character at string s1 and also find out if this is + * an illegal start byte or not and if so, issue a proper + * errno and yet treat this byte as a character. + */ + sz1 = u8_number_of_bytes[*s1]; + if (sz1 < 0) { + *errno = EILSEQ; + sz1 = 1; + } + + /* + * For 7-bit ASCII characters mainly, we do a quick case + * conversion right here. + * + * If we don't have enough bytes for this character, issue + * an EINVAL error and use what is available. + * + * If we have enough bytes, find out if there is + * a corresponding case-converted character and if so, copy + * over the bytes for a comparison later. If there is no + * corresponding character, then, use what we have for + * the comparison. + */ + if (sz1 == 1) { + if (is_it_toupper) + u8s1[0] = U8_ASCII_TOUPPER(*s1); + else + u8s1[0] = U8_ASCII_TOLOWER(*s1); + s1++; + u8s1[1] = '\0'; + } else if ((i1 + sz1) > n1) { + *errno = EINVAL; + for (j = 0; (i1 + j) < n1; ) + u8s1[j++] = *s1++; + u8s1[j] = '\0'; + } else { + (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper); + s1 += sz1; + } + + /* Do the same for the string s2. 
*/ + sz2 = u8_number_of_bytes[*s2]; + if (sz2 < 0) { + *errno = EILSEQ; + sz2 = 1; + } + + if (sz2 == 1) { + if (is_it_toupper) + u8s2[0] = U8_ASCII_TOUPPER(*s2); + else + u8s2[0] = U8_ASCII_TOLOWER(*s2); + s2++; + u8s2[1] = '\0'; + } else if ((i2 + sz2) > n2) { + *errno = EINVAL; + for (j = 0; (i2 + j) < n2; ) + u8s2[j++] = *s2++; + u8s2[j] = '\0'; + } else { + (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper); + s2 += sz2; + } + + /* + * Now compare the two characters. Two single byte characters + * are compared directly; every other combination is compared + * as null-terminated byte strings. + */ + if (sz1 == 1 && sz2 == 1) { + if (*u8s1 > *u8s2) + return (1); + if (*u8s1 < *u8s2) + return (-1); + } else { + f = strcmp((const char *)u8s1, (const char *)u8s2); + if (f != 0) + return (f); + } + + /* + * They were the same. Let's move on to the next + * characters then. + */ + i1 += sz1; + i2 += sz2; + } + + /* + * We compared until the end of either or both strings. + * + * If we reached or went over the ends of both, that means + * they are the same. + * + * If we reached only one of the two ends, that means the other + * string has something left, which determines the return value: + * the string that ended first sorts first. + */ + if (i1 >= n1) { + if (i2 >= n2) + return (0); + return (-1); + } + return (1); +} + +/* + * The combining_class() function checks on the given bytes and find out + * the corresponding Unicode combining class value. The return value 0 means + * it is a Starter. Any illegal UTF-8 character will also be treated as + * a Starter. 
+ */ +static uchar_t +combining_class(size_t uv, uchar_t *s, size_t sz) +{ + uint16_t b1 = 0; + uint16_t b2 = 0; + uint16_t b3 = 0; + uint16_t b4 = 0; + + if (sz == 1 || sz > 4) + return (0); /* a single byte or an impossible length is treated as a Starter */ + + if (sz == 2) { /* right-align the bytes into b1 to b4; a size of 0 leaves all four at 0 */ + b3 = s[0]; + b4 = s[1]; + } else if (sz == 3) { + b2 = s[0]; + b3 = s[1]; + b4 = s[2]; + } else if (sz == 4) { + b1 = s[0]; + b2 = s[1]; + b3 = s[2]; + b4 = s[3]; + } + + b1 = u8_common_b1_tbl[uv][b1]; /* each level yields the index into the next table */ + if (b1 == U8_TBL_ELEMENT_NOT_DEF) + return (0); /* not in the tables: a Starter */ + + b2 = u8_combining_class_b2_tbl[uv][b1][b2]; + if (b2 == U8_TBL_ELEMENT_NOT_DEF) + return (0); + + b3 = u8_combining_class_b3_tbl[uv][b2][b3]; + if (b3 == U8_TBL_ELEMENT_NOT_DEF) + return (0); + + return (u8_combining_class_b4_tbl[uv][b3][b4]); +} + +/* + * The do_decomp() function finds a matching decomposition, if any, and + * stores it at u8s. If there is no match, the input bytes are copied to + * u8s instead. The function also checks whether the character is a Hangul + * syllable and, if so, decomposes it arithmetically into Hangul Jamos. + * + * To save time, a single byte 7-bit ASCII character should be handled by + * the caller. + * + * Arguments: + * uv The first index into each of the mapping tables. + * u8s The output buffer; always null-terminated on return. + * NOTE(review): the mapping bytes are copied without a bound + * check, so the caller has to supply a large enough buffer. + * s The input character bytes. + * sz The number of bytes of the input character at s. + * canonical_decomposition If true, only canonical decomposition + * mappings are used; otherwise compatibility decomposition + * is done. + * state Receives the resulting normalization state. For Hangul Jamo + * V and T input, the state on entry is also read to tell + * whether the Jamo continues an L or an LV sequence. + * + * The function returns the number of bytes stored at u8s, not counting + * the terminating null byte. + */ +static size_t +do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz, + boolean_t canonical_decomposition, u8_normalization_states_t *state) +{ + uint16_t b1 = 0; + uint16_t b2 = 0; + uint16_t b3 = 0; + uint16_t b3_tbl; + uint16_t b3_base; + uint16_t b4 = 0; + size_t start_id; + size_t end_id; + size_t i; + uint32_t u1; + + if (sz == 2) { + b3 = u8s[0] = s[0]; + b4 = u8s[1] = s[1]; + u8s[2] = '\0'; + } else if (sz == 3) { + /* Convert it to a Unicode scalar value. */ + U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]); + + /* + * If this is a Hangul syllable, we decompose it into + * a leading consonant, a vowel, and an optional trailing + * consonant and then return. 
+ */ + if (U8_HANGUL_SYLLABLE(u1)) { + u1 -= U8_HANGUL_SYL_FIRST; /* zero-based syllable index */ + + b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT; /* leading consonant */ + b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT) + / U8_HANGUL_T_COUNT; /* vowel */ + b3 = u1 % U8_HANGUL_T_COUNT; /* trailing consonant index; 0 means none */ + + U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1); + U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2); + if (b3) { + b3 += U8_HANGUL_JAMO_T_FIRST; + U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3); + + u8s[9] = '\0'; + *state = U8_STATE_HANGUL_LVT; + return (9); + } + + u8s[6] = '\0'; + *state = U8_STATE_HANGUL_LV; + return (6); + } + + b2 = u8s[0] = s[0]; + b3 = u8s[1] = s[1]; + b4 = u8s[2] = s[2]; + u8s[3] = '\0'; + + /* + * If this is a Hangul Jamo, we know there is nothing + * further that we can decompose. The state on entry tells + * whether a vowel follows a leading consonant and whether + * a trailing consonant follows a leading consonant and vowel + * pair. + */ + if (U8_HANGUL_JAMO_L(u1)) { + *state = U8_STATE_HANGUL_L; + return (3); + } + + if (U8_HANGUL_JAMO_V(u1)) { + if (*state == U8_STATE_HANGUL_L) + *state = U8_STATE_HANGUL_LV; + else + *state = U8_STATE_HANGUL_V; + return (3); + } + + if (U8_HANGUL_JAMO_T(u1)) { + if (*state == U8_STATE_HANGUL_LV) + *state = U8_STATE_HANGUL_LVT; + else + *state = U8_STATE_HANGUL_T; + return (3); + } + } else if (sz == 4) { + b1 = u8s[0] = s[0]; + b2 = u8s[1] = s[1]; + b3 = u8s[2] = s[2]; + b4 = u8s[3] = s[3]; + u8s[4] = '\0'; + } else { + /* + * This is a fallback and should not happen if the function + * was called properly. Only the first byte is copied. + */ + u8s[0] = s[0]; + u8s[1] = '\0'; + *state = U8_STATE_START; + return (1); + } + + /* + * At this point, this routine does not know what it would get. + * The caller should sort it out if the state isn't a Hangul one. + */ + *state = U8_STATE_START; + + /* Try to find matching decomposition mapping byte sequence. 
*/ + b1 = u8_common_b1_tbl[uv][b1]; + if (b1 == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + b2 = u8_decomp_b2_tbl[uv][b1][b2]; + if (b2 == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id; + if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + /* + * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR + * which is 0x8000, this means we couldn't fit the mappings into + * the cardinality of an unsigned byte and the 16-bit variant of + * the b4 table is used instead. + */ + if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { + b3_tbl -= U8_16BIT_TABLE_INDICATOR; + start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4]; + end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; + } else { + start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4]; + end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1]; + } + + /* This also means there wasn't any matching decomposition. */ + if (start_id >= end_id) + return ((size_t)sz); + + /* + * The final table for decomposition mappings has three types of + * byte sequences depending on whether a mapping is for compatibility + * decomposition, canonical decomposition, or both like the following: + * + * (1) Compatibility decomposition mappings: + * + * +---+---+-...-+---+ + * | B0| B1| ... | Bm| + * +---+---+-...-+---+ + * + * The first byte, B0, is always less than 0xF5 (U8_DECOMP_BOTH). + * + * (2) Canonical decomposition mappings: + * + * +---+---+---+-...-+---+ + * | T | b0| b1| ... | bn| + * +---+---+---+-...-+---+ + * + * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL). + * + * (3) Both mappings: + * + * +---+---+---+---+-...-+---+---+---+-...-+---+ + * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm| + * +---+---+---+---+-...-+---+---+---+-...-+---+ + * + * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement + * byte, b0 to bn are canonical mapping bytes and B0 to Bm are + * compatibility mapping bytes. As used by the code below, D is + * the distance from the D byte itself to B0. 
+ * + * Note that compatibility decomposition means doing recursive + * decompositions using both compatibility decomposition mappings and + * canonical decomposition mappings. On the other hand, canonical + * decomposition means doing recursive decompositions using only + * canonical decomposition mappings. Since the table we have has gone + * through the recursions already, we do not need to do so during + * runtime, i.e., the table has been completely flattened out + * already. + */ + + b3_base = u8_decomp_b3_tbl[uv][b2][b3].base; + + /* Get the type, T, of the byte sequence. */ + b1 = u8_decomp_final_tbl[uv][b3_base + start_id]; + + /* + * If necessary, adjust start_id, end_id, or both. Note that if + * this is a compatibility decomposition mapping, there is no + * adjustment. + */ + if (canonical_decomposition) { + /* Is the mapping only for compatibility decomposition? */ + if (b1 < U8_DECOMP_BOTH) + return ((size_t)sz); + + start_id++; /* skip the type byte, T */ + + if (b1 == U8_DECOMP_BOTH) { + end_id = start_id + + u8_decomp_final_tbl[uv][b3_base + start_id]; /* stop where the compatibility bytes begin */ + start_id++; /* skip the displacement byte, D */ + } + } else { + /* + * Unless this is a compatibility decomposition mapping, + * we adjust the start_id: for the both type we jump to + * the compatibility bytes and for the canonical type we + * skip the type byte. + */ + if (b1 == U8_DECOMP_BOTH) { + start_id++; + start_id += u8_decomp_final_tbl[uv][b3_base + start_id]; + } else if (b1 == U8_DECOMP_CANONICAL) { + start_id++; + } + } + + for (i = 0; start_id < end_id; start_id++) /* NOTE(review): no bound check on u8s here */ + u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id]; + u8s[i] = '\0'; + + return (i); +} + +/* + * The find_composition_start() function uses the character bytes given and + * find out the matching composition mappings if any and return the address + * to the composition mappings as explained in the do_composition(). 
+ */ +static uchar_t * +find_composition_start(size_t uv, uchar_t *s, size_t sz) +{ + uint16_t b1 = 0; + uint16_t b2 = 0; + uint16_t b3 = 0; + uint16_t b3_tbl; + uint16_t b3_base; + uint16_t b4 = 0; + size_t start_id; + size_t end_id; + + if (sz == 1) { /* right-align the bytes into b1 to b4; a one byte character is looked up as well */ + b4 = s[0]; + } else if (sz == 2) { + b3 = s[0]; + b4 = s[1]; + } else if (sz == 3) { + b2 = s[0]; + b3 = s[1]; + b4 = s[2]; + } else if (sz == 4) { + b1 = s[0]; + b2 = s[1]; + b3 = s[2]; + b4 = s[3]; + } else { + /* + * This is a fallback and should not happen if the function + * was called properly. A size of 0 or more than 4 yields + * no mapping. + */ + return (NULL); + } + + b1 = u8_composition_b1_tbl[uv][b1]; /* each level yields the index into the next table */ + if (b1 == U8_TBL_ELEMENT_NOT_DEF) + return (NULL); + + b2 = u8_composition_b2_tbl[uv][b1][b2]; + if (b2 == U8_TBL_ELEMENT_NOT_DEF) + return (NULL); + + b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id; + if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) + return (NULL); + + if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { /* the indicator selects the 16-bit variant of the b4 table */ + b3_tbl -= U8_16BIT_TABLE_INDICATOR; + start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4]; + end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; + } else { + start_id = u8_composition_b4_tbl[uv][b3_tbl][b4]; + end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1]; + } + + if (start_id >= end_id) + return (NULL); /* an empty range: no composition mapping */ + + b3_base = u8_composition_b3_tbl[uv][b2][b3].base; + + return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id])); /* the first byte of the mapping entry */ +} + +/* + * The blocked() function checks on the combining class values of previous + * characters in this sequence and return whether it is blocked or not. 
+ */ +static boolean_t +blocked(uchar_t *comb_class, size_t last) +{ + uchar_t my_comb_class; + size_t i; + + my_comb_class = comb_class[last]; /* the class of the character being tested */ + for (i = 1; i < last; i++) /* indices 1 to last - 1 only; index 0 is not examined */ + if (comb_class[i] >= my_comb_class || + comb_class[i] == U8_COMBINING_CLASS_STARTER) + return (B_TRUE); /* an earlier Starter, or an earlier class that is not lower, blocks it */ + + return (B_FALSE); +} + +/* + * The do_composition() reads the character string pointed by 's' and + * do necessary canonical composition and then copy over the result back to + * the 's'. + * + * The input argument 's' cannot contain more than 32 characters. + */ +static size_t +do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start, + uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast) +{ + uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1]; + uchar_t tc[U8_MB_CUR_MAX]; + uint8_t saved_marks[U8_MAX_CHARS_A_SEQ]; + size_t saved_marks_count; + uchar_t *p; + uchar_t *saved_p; + uchar_t *q; + size_t i; + size_t saved_i; + size_t j; + size_t k; + size_t l; + size_t C; + size_t saved_l; + size_t size; + uint32_t u1; + uint32_t u2; + boolean_t match_not_found = B_TRUE; + + /* + * This should never happen unless the callers are doing some strange + * and unexpected things. + * + * The "last" is the index pointing to the last character not last + 1. + */ + if (last >= U8_MAX_CHARS_A_SEQ) + last = U8_UPPER_LIMIT_IN_A_SEQ; + + for (i = l = 0; i <= last; i++) { + /* + * The last or any non-Starters at the beginning, we don't + * have any chance to do composition and so we just copy them + * to the temporary buffer. + */ + if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) { +SAVE_THE_CHAR: + p = s + start[i]; + size = disp[i]; + for (k = 0; k < size; k++) + t[l++] = *p++; + continue; + } + + /* + * If this could be a start of Hangul Jamos, then, we try to + * conjoin them. 
+ */ + if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) { + U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]], + s[start[i] + 1], s[start[i] + 2]); + U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3], + s[start[i] + 4], s[start[i] + 5]); + + if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) { + u1 -= U8_HANGUL_JAMO_L_FIRST; + u2 -= U8_HANGUL_JAMO_V_FIRST; + u1 = U8_HANGUL_SYL_FIRST + + (u1 * U8_HANGUL_V_COUNT + u2) * + U8_HANGUL_T_COUNT; + + i += 2; + if (i <= last) { + U8_PUT_3BYTES_INTO_UTF32(u2, + s[start[i]], s[start[i] + 1], + s[start[i] + 2]); + + if (U8_HANGUL_JAMO_T(u2)) { + u1 += u2 - + U8_HANGUL_JAMO_T_FIRST; + i++; + } + } + + U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1); + i--; + l += 3; + continue; + } + } + + /* + * Let's then find out if this Starter has composition + * mapping. + */ + p = find_composition_start(uv, s + start[i], disp[i]); + if (p == NULL) + goto SAVE_THE_CHAR; + + /* + * We have a Starter with composition mapping and the next + * character is a non-Starter. Let's try to find out if + * we can do composition. + */ + + saved_p = p; + saved_i = i; + saved_l = l; + saved_marks_count = 0; + +TRY_THE_NEXT_MARK: + q = s + start[++i]; + size = disp[i]; + + /* + * The next for() loop compares the non-Starter pointed by + * 'q' with the possible (joinable) characters pointed by 'p'. + * + * The composition final table entry pointed by the 'p' + * looks like the following: + * + * +---+---+---+-...-+---+---+---+---+-...-+---+---+ + * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F | + * +---+---+---+-...-+---+---+---+---+-...-+---+---+ + * + * where C is the count byte indicating the number of + * mapping pairs where each pair would be look like + * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second + * character of a canonical decomposition and the B0-Bm are + * the bytes of a matching composite character. The F is + * a filler byte after each character as the separator. 
+ */ + + match_not_found = B_TRUE; + + for (C = *p++; C > 0; C--) { + for (k = 0; k < size; p++, k++) + if (*p != q[k]) + break; + + /* Have we found it? */ + if (k >= size && *p == U8_TBL_ELEMENT_FILLER) { + match_not_found = B_FALSE; + + l = saved_l; + + while (*++p != U8_TBL_ELEMENT_FILLER) + t[l++] = *p; + + break; + } + + /* We didn't find; skip to the next pair. */ + if (*p != U8_TBL_ELEMENT_FILLER) + while (*++p != U8_TBL_ELEMENT_FILLER) + ; + while (*++p != U8_TBL_ELEMENT_FILLER) + ; + p++; + } + + /* + * If there was no match, we will need to save the combining + * mark for later appending. After that, if the next one + * is a non-Starter and not blocked, then, we try once + * again to do composition with the next non-Starter. + * + * If there was no match and this was a Starter, then, + * this is a new start. + * + * If there was a match and a composition done and we have + * more to check on, then, we retrieve a new composition final + * table entry for the composite and then try to do the + * composition again. + */ + + if (match_not_found) { + if (comb_class[i] == U8_COMBINING_CLASS_STARTER) { + i--; + goto SAVE_THE_CHAR; + } + + saved_marks[saved_marks_count++] = i; + } + + if (saved_l == l) { + while (i < last) { + if (blocked(comb_class, i + 1)) + saved_marks[saved_marks_count++] = ++i; + else + break; + } + if (i < last) { + p = saved_p; + goto TRY_THE_NEXT_MARK; + } + } else if (i < last) { + p = find_composition_start(uv, t + saved_l, + l - saved_l); + if (p != NULL) { + saved_p = p; + goto TRY_THE_NEXT_MARK; + } + } + + /* + * There is no more composition possible. + * + * If there was no composition what so ever then we copy + * over the original Starter and then append any non-Starters + * remaining at the target string sequentially after that. 
+ */ + + if (saved_l == l) { + p = s + start[saved_i]; + size = disp[saved_i]; + for (j = 0; j < size; j++) + t[l++] = *p++; + } + + for (k = 0; k < saved_marks_count; k++) { + p = s + start[saved_marks[k]]; + size = disp[saved_marks[k]]; + for (j = 0; j < size; j++) + t[l++] = *p++; + } + } + + /* + * If the last character is a Starter and if we have a character + * (possibly another Starter) that can be turned into a composite, + * we do so and we do so until there is no more of composition + * possible. + */ + if (comb_class[last] == U8_COMBINING_CLASS_STARTER) { + p = *os; + saved_l = l - disp[last]; + + while (p < oslast) { + size = u8_number_of_bytes[*p]; + if (size <= 1 || (p + size) > oslast) + break; + + saved_p = p; + + for (i = 0; i < size; i++) + tc[i] = *p++; + + q = find_composition_start(uv, t + saved_l, + l - saved_l); + if (q == NULL) { + p = saved_p; + break; + } + + match_not_found = B_TRUE; + + for (C = *q++; C > 0; C--) { + for (k = 0; k < size; q++, k++) + if (*q != tc[k]) + break; + + if (k >= size && *q == U8_TBL_ELEMENT_FILLER) { + match_not_found = B_FALSE; + + l = saved_l; + + while (*++q != U8_TBL_ELEMENT_FILLER) { + /* + * This is practically + * impossible but we don't + * want to take any chances. + */ + if (l >= + U8_STREAM_SAFE_TEXT_MAX) { + p = saved_p; + goto SAFE_RETURN; + } + t[l++] = *q; + } + + break; + } + + if (*q != U8_TBL_ELEMENT_FILLER) + while (*++q != U8_TBL_ELEMENT_FILLER) + ; + while (*++q != U8_TBL_ELEMENT_FILLER) + ; + q++; + } + + if (match_not_found) { + p = saved_p; + break; + } + } +SAFE_RETURN: + *os = p; + } + + /* + * Now we copy over the temporary string to the target string. + * Since composition always reduces the number of characters or + * the number of characters stay, we don't need to worry about + * the buffer overflow here. 
+ */ + for (i = 0; i < l; i++) + s[i] = t[i]; + s[l] = '\0'; + + return (l); +} + +/* + * The collect_a_seq() function checks on the given string s, collect + * a sequence of characters at u8s, and return the sequence. While it collects + * a sequence, it also applies case conversion, canonical or compatibility + * decomposition, canonical decomposition, or some or all of them and + * in that order. + * + * The collected sequence cannot be bigger than 32 characters since if + * it is having more than 31 characters, the sequence will be terminated + * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into + * a Stream-Safe Text. The collected sequence is always terminated with + * a null byte and the return value is the byte length of the sequence + * including 0. The return value does not include the terminating + * null byte. + */ +static size_t +collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast, + boolean_t is_it_toupper, + boolean_t is_it_tolower, + boolean_t canonical_decomposition, + boolean_t compatibility_decomposition, + boolean_t canonical_composition, + int *errno, u8_normalization_states_t *state) +{ + uchar_t *s; + int sz; + int saved_sz; + size_t i; + size_t j; + size_t k; + size_t l; + uchar_t comb_class[U8_MAX_CHARS_A_SEQ]; + uchar_t disp[U8_MAX_CHARS_A_SEQ]; + uchar_t start[U8_MAX_CHARS_A_SEQ]; + uchar_t u8t[U8_MB_CUR_MAX]; + uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1]; + uchar_t tc; + size_t last; + size_t saved_last; + uint32_t u1; + + /* + * Save the source string pointer which we will return a changed + * pointer if we do processing. + */ + s = *source; + + /* + * The following is a fallback for just in case callers are not + * checking the string boundaries before the calling. + */ + if (s >= slast) { + u8s[0] = '\0'; + + return (0); + } + + /* + * As the first thing, let's collect a character and do case + * conversion if necessary. 
+ */ + + sz = u8_number_of_bytes[*s]; + + if (sz < 0) { + *errno = EILSEQ; + + u8s[0] = *s++; + u8s[1] = '\0'; + + *source = s; + + return (1); + } + + if (sz == 1) { + if (is_it_toupper) + u8s[0] = U8_ASCII_TOUPPER(*s); + else if (is_it_tolower) + u8s[0] = U8_ASCII_TOLOWER(*s); + else + u8s[0] = *s; + s++; + u8s[1] = '\0'; + } else if ((s + sz) > slast) { + *errno = EINVAL; + + for (i = 0; s < slast; ) + u8s[i++] = *s++; + u8s[i] = '\0'; + + *source = s; + + return (i); + } else { + if (is_it_toupper || is_it_tolower) { + i = do_case_conv(uv, u8s, s, sz, is_it_toupper); + s += sz; + sz = i; + } else { + for (i = 0; i < sz; ) + u8s[i++] = *s++; + u8s[i] = '\0'; + } + } + + /* + * And then canonical/compatibility decomposition followed by + * an optional canonical composition. Please be noted that + * canonical composition is done only when a decomposition is + * done. + */ + if (canonical_decomposition || compatibility_decomposition) { + if (sz == 1) { + *state = U8_STATE_START; + + saved_sz = 1; + + comb_class[0] = 0; + start[0] = 0; + disp[0] = 1; + + last = 1; + } else { + saved_sz = do_decomp(uv, u8s, u8s, sz, + canonical_decomposition, state); + + last = 0; + + for (i = 0; i < saved_sz; ) { + sz = u8_number_of_bytes[u8s[i]]; + + comb_class[last] = combining_class(uv, + u8s + i, sz); + start[last] = i; + disp[last] = sz; + + last++; + i += sz; + } + + /* + * Decomposition yields various Hangul related + * states but not on combining marks. We need to + * find out at here by checking on the last + * character. + */ + if (*state == U8_STATE_START) { + if (comb_class[last - 1]) + *state = U8_STATE_COMBINING_MARK; + } + } + + saved_last = last; + + while (s < slast) { + sz = u8_number_of_bytes[*s]; + + /* + * If this is an illegal character, an incomplete + * character, or an 7-bit ASCII Starter character, + * then we have collected a sequence; break and let + * the next call deal with the two cases. 
+ * + * Note that this is okay only if you are using this + * function with a fixed length string, not on + * a buffer with multiple calls of one chunk at a time. + */ + if (sz <= 1) { + break; + } else if ((s + sz) > slast) { + break; + } else { + /* + * If the previous character was a Hangul Jamo + * and this character is a Hangul Jamo that + * can be conjoined, we collect the Jamo. + */ + if (*s == U8_HANGUL_JAMO_1ST_BYTE) { + U8_PUT_3BYTES_INTO_UTF32(u1, + *s, *(s + 1), *(s + 2)); + + if (U8_HANGUL_COMPOSABLE_L_V(*state, + u1)) { + i = 0; + *state = U8_STATE_HANGUL_LV; + goto COLLECT_A_HANGUL; + } + + if (U8_HANGUL_COMPOSABLE_LV_T(*state, + u1)) { + i = 0; + *state = U8_STATE_HANGUL_LVT; + goto COLLECT_A_HANGUL; + } + } + + /* + * Regardless of whatever it was, if this is + * a Starter, we don't collect the character + * since that's a new start and we will deal + * with it at the next time. + */ + i = combining_class(uv, s, sz); + if (i == U8_COMBINING_CLASS_STARTER) + break; + + /* + * We know the current character is a combining + * mark. If the previous character wasn't + * a Starter (not Hangul) or a combining mark, + * then, we don't collect this combining mark. + */ + if (*state != U8_STATE_START && + *state != U8_STATE_COMBINING_MARK) + break; + + *state = U8_STATE_COMBINING_MARK; +COLLECT_A_HANGUL: + /* + * If we collected a Starter and combining + * marks up to 30, i.e., total 31 characters, + * then, we terminate this degenerately long + * combining sequence with a U+034F COMBINING + * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in + * UTF-8 and turn this into a Stream-Safe + * Text. This will be extremely rare but + * possible. + * + * The following will also guarantee that + * we are not writing more than 32 characters + * plus a NULL at u8s[]. 
+ */ + if (last >= U8_UPPER_LIMIT_IN_A_SEQ) { +TURN_STREAM_SAFE: + *state = U8_STATE_START; + comb_class[last] = 0; + start[last] = saved_sz; + disp[last] = 2; + last++; + + u8s[saved_sz++] = 0xCD; + u8s[saved_sz++] = 0x8F; + + break; + } + + /* + * Some combining marks also do decompose into + * another combining mark or marks. + */ + if (*state == U8_STATE_COMBINING_MARK) { + k = last; + l = sz; + i = do_decomp(uv, uts, s, sz, + canonical_decomposition, state); + for (j = 0; j < i; ) { + sz = u8_number_of_bytes[uts[j]]; + + comb_class[last] = + combining_class(uv, + uts + j, sz); + start[last] = saved_sz + j; + disp[last] = sz; + + last++; + if (last >= + U8_UPPER_LIMIT_IN_A_SEQ) { + last = k; + goto TURN_STREAM_SAFE; + } + j += sz; + } + + *state = U8_STATE_COMBINING_MARK; + sz = i; + s += l; + + for (i = 0; i < sz; i++) + u8s[saved_sz++] = uts[i]; + } else { + comb_class[last] = i; + start[last] = saved_sz; + disp[last] = sz; + last++; + + for (i = 0; i < sz; i++) + u8s[saved_sz++] = *s++; + } + + /* + * If this is U+0345 COMBINING GREEK + * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a., + * iota subscript, and need to be converted to + * uppercase letter, convert it to U+0399 GREEK + * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8), + * i.e., convert to capital adscript form as + * specified in the Unicode standard. + * + * This is the only special case of (ambiguous) + * case conversion at combining marks and + * probably the standard will never have + * anything similar like this in future. + */ + if (is_it_toupper && sz >= 2 && + u8s[saved_sz - 2] == 0xCD && + u8s[saved_sz - 1] == 0x85) { + u8s[saved_sz - 2] = 0xCE; + u8s[saved_sz - 1] = 0x99; + } + } + } + + /* + * Let's try to ensure a canonical ordering for the collected + * combining marks. We do this only if we have collected + * at least one more non-Starter. (The decomposition mapping + * data tables have fully (and recursively) expanded and + * canonically ordered decompositions.) 
+ * + * The U8_SWAP_COMB_MARKS() convenience macro has some + * assumptions and we are meeting the assumptions. + */ + last--; + if (last >= saved_last) { + for (i = 0; i < last; i++) + for (j = last; j > i; j--) + if (comb_class[j] && + comb_class[j - 1] > comb_class[j]) { + U8_SWAP_COMB_MARKS(j - 1, j); + } + } + + *source = s; + + if (! canonical_composition) { + u8s[saved_sz] = '\0'; + return (saved_sz); + } + + /* + * Now do the canonical composition. Note that we do this + * only after a canonical or compatibility decomposition to + * finish up NFC or NFKC. + */ + sz = do_composition(uv, u8s, comb_class, start, disp, last, + &s, slast); + } + + *source = s; + + return ((size_t)sz); +} + +/* + * The do_norm_compare() function does string comparion based on Unicode + * simple case mappings and Unicode Normalization definitions. + * + * It does so by collecting a sequence of character at a time and comparing + * the collected sequences from the strings. + * + * The meanings on the return values are the same as the usual strcmp(). 
+ */ +static int +do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2, + int flag, int *errno) +{ + int result; + size_t sz1; + size_t sz2; + uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1]; + uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1]; + uchar_t *s1last; + uchar_t *s2last; + boolean_t is_it_toupper; + boolean_t is_it_tolower; + boolean_t canonical_decomposition; + boolean_t compatibility_decomposition; + boolean_t canonical_composition; + u8_normalization_states_t state; + + s1last = s1 + n1; + s2last = s2 + n2; + + is_it_toupper = flag & U8_TEXTPREP_TOUPPER; + is_it_tolower = flag & U8_TEXTPREP_TOLOWER; + canonical_decomposition = flag & U8_CANON_DECOMP; + compatibility_decomposition = flag & U8_COMPAT_DECOMP; + canonical_composition = flag & U8_CANON_COMP; + + while (s1 < s1last && s2 < s2last) { + /* + * If the current character is a 7-bit ASCII and the last + * character, or, if the current character and the next + * character are both 7-bit ASCII characters, then + * we treat the current character as a sequence. + * + * In all other cases, we need to call collect_a_seq(). 
+ */ + + if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last || + ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) { + if (is_it_toupper) + u8s1[0] = U8_ASCII_TOUPPER(*s1); + else if (is_it_tolower) + u8s1[0] = U8_ASCII_TOLOWER(*s1); + else + u8s1[0] = *s1; + u8s1[1] = '\0'; + sz1 = 1; + s1++; + } else { + state = U8_STATE_START; + sz1 = collect_a_seq(uv, u8s1, &s1, s1last, + is_it_toupper, is_it_tolower, + canonical_decomposition, + compatibility_decomposition, + canonical_composition, errno, &state); + } + + if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last || + ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) { + if (is_it_toupper) + u8s2[0] = U8_ASCII_TOUPPER(*s2); + else if (is_it_tolower) + u8s2[0] = U8_ASCII_TOLOWER(*s2); + else + u8s2[0] = *s2; + u8s2[1] = '\0'; + sz2 = 1; + s2++; + } else { + state = U8_STATE_START; + sz2 = collect_a_seq(uv, u8s2, &s2, s2last, + is_it_toupper, is_it_tolower, + canonical_decomposition, + compatibility_decomposition, + canonical_composition, errno, &state); + } + + /* + * Now compare the two sequences. If they are the same, + * we move on to the next character sequences. + */ + if (sz1 == 1 && sz2 == 1) { + if (*u8s1 > *u8s2) + return (1); + if (*u8s1 < *u8s2) + return (-1); + } else { + result = strcmp((const char *)u8s1, (const char *)u8s2); + if (result != 0) + return (result); + } + } + + /* + * We compared until the end of either or both strings. + * + * If we reached or went past the ends of both, that means + * they are the same. + * + * If we reached only one end, that means the other string has + * something which then can be used to determine the return value. + */ + if (s1 >= s1last) { + if (s2 >= s2last) + return (0); + return (-1); + } + return (1); +} + +/* + * The u8_strcmp() function compares two UTF-8 strings quite similar to + * the strcmp(). 
For the comparison, however, Unicode Normalization specific + * equivalency and Unicode simple case conversion mappings based equivalency + * can be requested and checked against. + */ +int +u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv, + int *errno) +{ + int f; + size_t n1; + size_t n2; + + *errno = 0; + + /* + * Check the requested Unicode version, case conversion, and + * normalization flags; bad values set *errno but we still compare. + */ + + if (uv > U8_UNICODE_LATEST) { + *errno = ERANGE; + uv = U8_UNICODE_LATEST; + } + + if (flag == 0) { + flag = U8_STRCMP_CS; + } else { + f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER | + U8_STRCMP_CI_LOWER); + if (f == 0) { + flag |= U8_STRCMP_CS; + } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER && + f != U8_STRCMP_CI_LOWER) { + *errno = EBADF; + flag = U8_STRCMP_CS; + } + + f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); + if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC && + f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) { + *errno = EBADF; + flag = U8_STRCMP_CS; + } + } + + if (flag == U8_STRCMP_CS) { + return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n)); + } + + n1 = strlen(s1); + n2 = strlen(s2); + if (n != 0) { + if (n < n1) + n1 = n; + if (n < n2) + n2 = n; + } + + /* + * Simple case conversion can be done much faster and so we do + * them separately here. 
+ */ + if (flag == U8_STRCMP_CI_UPPER) { + return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, + n1, n2, B_TRUE, errno)); + } else if (flag == U8_STRCMP_CI_LOWER) { + return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, + n1, n2, B_FALSE, errno)); + } + + return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2, + flag, errno)); +} + +size_t +u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen, + int flag, size_t unicode_version, int *errno) +{ + int f; + int sz; + uchar_t *ib; + uchar_t *ibtail; + uchar_t *ob; + uchar_t *obtail; + boolean_t do_not_ignore_null; + boolean_t do_not_ignore_invalid; + boolean_t is_it_toupper; + boolean_t is_it_tolower; + boolean_t canonical_decomposition; + boolean_t compatibility_decomposition; + boolean_t canonical_composition; + size_t ret_val; + size_t i; + size_t j; + uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1]; + u8_normalization_states_t state; + + if (unicode_version > U8_UNICODE_LATEST) { + *errno = ERANGE; + return ((size_t)-1); + } + + f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER); + if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) { + *errno = EBADF; + return ((size_t)-1); + } + + f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); + if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC && + f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) { + *errno = EBADF; + return ((size_t)-1); + } + + if (inarray == NULL || *inlen == 0) + return (0); + + if (outarray == NULL) { + *errno = E2BIG; + return ((size_t)-1); + } + + ib = (uchar_t *)inarray; + ob = (uchar_t *)outarray; + ibtail = ib + *inlen; + obtail = ob + *outlen; + + do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL); + do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID); + is_it_toupper = flag & U8_TEXTPREP_TOUPPER; + is_it_tolower = flag & U8_TEXTPREP_TOLOWER; + + ret_val = 0; + + /* + * If we don't have a normalization flag set, we do the simple case + * conversion based text preparation 
separately below. Text + * preparation involving Normalization will be done in the else + * block, again, separately since it will take much more time and + * resource than doing simple case conversions. + */ + if (f == 0) { + while (ib < ibtail) { + if (*ib == '\0' && do_not_ignore_null) + break; + + sz = u8_number_of_bytes[*ib]; + + if (sz < 0) { + if (do_not_ignore_invalid) { + *errno = EILSEQ; + ret_val = (size_t)-1; + break; + } + + sz = 1; + ret_val++; + } + + if (sz == 1) { + if (ob >= obtail) { + *errno = E2BIG; + ret_val = (size_t)-1; + break; + } + + if (is_it_toupper) + *ob = U8_ASCII_TOUPPER(*ib); + else if (is_it_tolower) + *ob = U8_ASCII_TOLOWER(*ib); + else + *ob = *ib; + ib++; + ob++; + } else if ((ib + sz) > ibtail) { + if (do_not_ignore_invalid) { + *errno = EINVAL; + ret_val = (size_t)-1; + break; + } + + if ((obtail - ob) < (ibtail - ib)) { + *errno = E2BIG; + ret_val = (size_t)-1; + break; + } + + /* + * We treat the remaining incomplete character + * bytes as a character. 
+ */ + ret_val++; + + while (ib < ibtail) + *ob++ = *ib++; + } else { + if (is_it_toupper || is_it_tolower) { + i = do_case_conv(unicode_version, u8s, + ib, sz, is_it_toupper); + + if ((obtail - ob) < i) { + *errno = E2BIG; + ret_val = (size_t)-1; + break; + } + + ib += sz; + + for (sz = 0; sz < i; sz++) + *ob++ = u8s[sz]; + } else { + if ((obtail - ob) < sz) { + *errno = E2BIG; + ret_val = (size_t)-1; + break; + } + + for (i = 0; i < sz; i++) + *ob++ = *ib++; + } + } + } + } else { + canonical_decomposition = flag & U8_CANON_DECOMP; + compatibility_decomposition = flag & U8_COMPAT_DECOMP; + canonical_composition = flag & U8_CANON_COMP; + + while (ib < ibtail) { + if (*ib == '\0' && do_not_ignore_null) + break; + + /* + * If the current character is a 7-bit ASCII + * character and it is the last character, or, + * if the current character is a 7-bit ASCII + * character and the next character is also a 7-bit + * ASCII character, then, we copy over this + * character without going through collect_a_seq(). + * + * In all other cases, we need to look further with + * the collect_a_seq() function. 
+ */ + if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail || + ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) { + if (ob >= obtail) { + *errno = E2BIG; + ret_val = (size_t)-1; + break; + } + + if (is_it_toupper) + *ob = U8_ASCII_TOUPPER(*ib); + else if (is_it_tolower) + *ob = U8_ASCII_TOLOWER(*ib); + else + *ob = *ib; + ib++; + ob++; + } else { + *errno = 0; + state = U8_STATE_START; + + j = collect_a_seq(unicode_version, u8s, + &ib, ibtail, + is_it_toupper, + is_it_tolower, + canonical_decomposition, + compatibility_decomposition, + canonical_composition, + errno, &state); + + if (*errno && do_not_ignore_invalid) { + ret_val = (size_t)-1; + break; + } + + if ((obtail - ob) < j) { + *errno = E2BIG; + ret_val = (size_t)-1; + break; + } + + for (i = 0; i < j; i++) + *ob++ = u8s[i]; + } + } + } + + *inlen = ibtail - ib; + *outlen = obtail - ob; + + return (ret_val); +} diff -r a4c12419233c -r 09764a26229e usr/src/common/unicode/uconv.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/common/unicode/uconv.c Fri Sep 14 10:25:36 2007 -0700 @@ -0,0 +1,851 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32. + * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517) + * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F), + * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also + * the section 3C man pages. + * Interface stability: Committed + */ + +#include +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#else +#include +#endif /* _KERNEL */ +#include +#include + + +/* + * The max and min values of high and low surrogate pairs of UTF-16, + * UTF-16 bit shift value, bit mask, and starting value outside of BMP. + */ +#define UCONV_U16_HI_MIN (0xd800U) +#define UCONV_U16_HI_MAX (0xdbffU) +#define UCONV_U16_LO_MIN (0xdc00U) +#define UCONV_U16_LO_MAX (0xdfffU) +#define UCONV_U16_BIT_SHIFT (0x0400U) +#define UCONV_U16_BIT_MASK (0x0fffffU) +#define UCONV_U16_START (0x010000U) + +/* The maximum value of Unicode coding space and ASCII coding space. */ +#define UCONV_UNICODE_MAX (0x10ffffU) +#define UCONV_ASCII_MAX (0x7fU) + +/* The mask values for input and output endians. */ +#define UCONV_IN_ENDIAN_MASKS (UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN) +#define UCONV_OUT_ENDIAN_MASKS (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN) + +/* Native and reversed endian macros. */ +#ifdef _BIG_ENDIAN +#define UCONV_IN_NAT_ENDIAN UCONV_IN_BIG_ENDIAN +#define UCONV_IN_REV_ENDIAN UCONV_IN_LITTLE_ENDIAN +#define UCONV_OUT_NAT_ENDIAN UCONV_OUT_BIG_ENDIAN +#define UCONV_OUT_REV_ENDIAN UCONV_OUT_LITTLE_ENDIAN +#else +#define UCONV_IN_NAT_ENDIAN UCONV_IN_LITTLE_ENDIAN +#define UCONV_IN_REV_ENDIAN UCONV_IN_BIG_ENDIAN +#define UCONV_OUT_NAT_ENDIAN UCONV_OUT_LITTLE_ENDIAN +#define UCONV_OUT_REV_ENDIAN UCONV_OUT_BIG_ENDIAN +#endif /* _BIG_ENDIAN */ + +/* The Byte Order Mark (BOM) character in normal and reversed byte orderings. 
*/ +#define UCONV_BOM_NORMAL (0xfeffU) +#define UCONV_BOM_SWAPPED (0xfffeU) +#define UCONV_BOM_SWAPPED_32 (0xfffe0000U) + +/* UTF-32 boundaries based on UTF-8 character byte lengths. */ +#define UCONV_U8_ONE_BYTE (0x7fU) +#define UCONV_U8_TWO_BYTES (0x7ffU) +#define UCONV_U8_THREE_BYTES (0xffffU) +#define UCONV_U8_FOUR_BYTES (0x10ffffU) + +/* The common minimum and maximum values at the UTF-8 character bytes. */ +#define UCONV_U8_BYTE_MIN (0x80U) +#define UCONV_U8_BYTE_MAX (0xbfU) + +/* + * The following "6" and "0x3f" came from "10xx xxxx" bit representation of + * UTF-8 character bytes. + */ +#define UCONV_U8_BIT_SHIFT 6 +#define UCONV_U8_BIT_MASK 0x3f + +/* + * The following vector gives the number of remaining bytes in a UTF-8 + * character. The index is the first byte of the character. + */ +static const uchar_t remaining_bytes_tbl[0x100] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ + 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + +/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + +/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + +/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ + 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* + * The following is a vector of bit-masks to get used bits in + * the first byte of a UTF-8 character. 
The index is the remaining bytes of + * the character. + */ +static const uchar_t masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; + +/* + * The following two vectors are to provide valid minimum and + * maximum values for the 2nd byte of a multibyte UTF-8 character for + * better illegal sequence checking. The index value must be the value of + * the first byte of the UTF-8 character. + */ +static const uchar_t valid_min_2nd_byte[0x100] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + +/* C0 C1 C2 C3 C4 C5 C6 C7 */ + 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + +/* C8 C9 CA CB CC CD CE CF */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + +/* D0 D1 D2 D3 D4 D5 D6 D7 */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + +/* D8 D9 DA DB DC DD DE DF */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + +/* E0 E1 E2 E3 E4 E5 E6 E7 */ + 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + +/* E8 E9 EA EB EC ED EE EF */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + +/* F0 F1 F2 F3 F4 F5 F6 F7 */ + 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0 +}; + +static const uchar_t valid_max_2nd_byte[0x100] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 
0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + +/* C0 C1 C2 C3 C4 C5 C6 C7 */ + 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, + +/* C8 C9 CA CB CC CD CE CF */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, + +/* D0 D1 D2 D3 D4 D5 D6 D7 */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, + +/* D8 D9 DA DB DC DD DE DF */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, + +/* E0 E1 E2 E3 E4 E5 E6 E7 */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, + +/* E8 E9 EA EB EC ED EE EF */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, + +/* F0 F1 F2 F3 F4 F5 F6 F7 */ + 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0 +}; + + +static int +check_endian(int flag, int *in, int *out) +{ + *in = flag & UCONV_IN_ENDIAN_MASKS; + + /* You cannot have both big and little endian. */ + if (*in == UCONV_IN_ENDIAN_MASKS) + return (EBADF); + + if (*in == 0) + *in = UCONV_IN_NAT_ENDIAN; + + *out = flag & UCONV_OUT_ENDIAN_MASKS; + + /* You cannot have both big and little endian. 
*/ + if (*out == UCONV_OUT_ENDIAN_MASKS) + return (EBADF); + + if (*out == 0) + *out = UCONV_OUT_NAT_ENDIAN; + + return (0); +} + +static boolean_t +check_bom16(const uint16_t *u16s, size_t u16l, int *in) +{ + if (u16l > 0) { + if (*u16s == UCONV_BOM_NORMAL) { + *in = UCONV_IN_NAT_ENDIAN; + return (B_TRUE); + } + if (*u16s == UCONV_BOM_SWAPPED) { + *in = UCONV_IN_REV_ENDIAN; + return (B_TRUE); + } + } + + return (B_FALSE); +} + +static boolean_t +check_bom32(const uint32_t *u32s, size_t u32l, int *in) +{ + if (u32l > 0) { + if (*u32s == UCONV_BOM_NORMAL) { + *in = UCONV_IN_NAT_ENDIAN; + return (B_TRUE); + } + if (*u32s == UCONV_BOM_SWAPPED_32) { + *in = UCONV_IN_REV_ENDIAN; + return (B_TRUE); + } + } + + return (B_FALSE); +} + +int +uconv_u16tou32(const uint16_t *u16s, size_t *utf16len, + uint32_t *u32s, size_t *utf32len, int flag) +{ + int inendian; + int outendian; + size_t u16l; + size_t u32l; + uint32_t hi; + uint32_t lo; + boolean_t do_not_ignore_null; + + /* + * Do preliminary validity checks on parameters and collect info on + * endians. + */ + if (u16s == NULL || utf16len == NULL) + return (EILSEQ); + + if (u32s == NULL || utf32len == NULL) + return (E2BIG); + + if (check_endian(flag, &inendian, &outendian) != 0) + return (EBADF); + + /* + * Initialize input and output parameter buffer indices and + * temporary variables. + */ + u16l = u32l = 0; + hi = 0; + do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); + + /* + * Check on the BOM at the beginning of the input buffer if required + * and if there is indeed one, process it. + */ + if ((flag & UCONV_IN_ACCEPT_BOM) && + check_bom16(u16s, *utf16len, &inendian)) + u16l++; + + /* + * Reset inendian and outendian so that after this point, those can be + * used as condition values. + */ + inendian &= UCONV_IN_NAT_ENDIAN; + outendian &= UCONV_OUT_NAT_ENDIAN; + + /* + * If there is something in the input buffer and if necessary and + * requested, save the BOM at the output buffer. 
+ */ + if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) + u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : + UCONV_BOM_SWAPPED_32; + + /* + * Do conversion; if we encounter a surrogate pair, assemble high and + * low pair values to form a UTF-32 character. If a half of a pair + * exists alone, then, either it is an illegal (EILSEQ) or + * invalid (EINVAL) value. + */ + for (; u16l < *utf16len; u16l++) { + if (u16s[u16l] == 0 && do_not_ignore_null) + break; + + lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); + + if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { + if (hi) + return (EILSEQ); + hi = lo; + continue; + } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { + if (! hi) + return (EILSEQ); + lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + + lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) + + UCONV_U16_START; + hi = 0; + } else if (hi) { + return (EILSEQ); + } + + if (u32l >= *utf32len) + return (E2BIG); + + u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo); + } + + /* + * If the high half didn't see a low half, then, most likely the input + * parameter is incomplete. + */ + if (hi) + return (EINVAL); + + /* + * Save the number of consumed and saved characters. They do not + * include terminating NULL character (U+0000) at the end of + * the input buffer (even when UCONV_IGNORE_NULL isn't specified and + * the input buffer length is big enough to include the terminating + * NULL character). 
+ */ + *utf16len = u16l; + *utf32len = u32l; + + return (0); +} + +int +uconv_u16tou8(const uint16_t *u16s, size_t *utf16len, + uchar_t *u8s, size_t *utf8len, int flag) +{ + int inendian; + int outendian; + size_t u16l; + size_t u8l; + uint32_t hi; + uint32_t lo; + boolean_t do_not_ignore_null; + + if (u16s == NULL || utf16len == NULL) + return (EILSEQ); + + if (u8s == NULL || utf8len == NULL) + return (E2BIG); + + if (check_endian(flag, &inendian, &outendian) != 0) + return (EBADF); + + u16l = u8l = 0; + hi = 0; + do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); + + if ((flag & UCONV_IN_ACCEPT_BOM) && + check_bom16(u16s, *utf16len, &inendian)) + u16l++; + + inendian &= UCONV_IN_NAT_ENDIAN; + + for (; u16l < *utf16len; u16l++) { + if (u16s[u16l] == 0 && do_not_ignore_null) + break; + + lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); + + if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { + if (hi) + return (EILSEQ); + hi = lo; + continue; + } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { + if (! hi) + return (EILSEQ); + lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + + lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) + + UCONV_U16_START; + hi = 0; + } else if (hi) { + return (EILSEQ); + } + + /* + * Now we convert a UTF-32 character into a UTF-8 character. + * Unicode coding space is between U+0000 and U+10FFFF; + * anything bigger is an illegal character. 
+ */ + if (lo <= UCONV_U8_ONE_BYTE) { + if (u8l >= *utf8len) + return (E2BIG); + u8s[u8l++] = (uchar_t)lo; + } else if (lo <= UCONV_U8_TWO_BYTES) { + if ((u8l + 1) >= *utf8len) + return (E2BIG); + u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); + u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); + } else if (lo <= UCONV_U8_THREE_BYTES) { + if ((u8l + 2) >= *utf8len) + return (E2BIG); + u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); + u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); + u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); + } else if (lo <= UCONV_U8_FOUR_BYTES) { + if ((u8l + 3) >= *utf8len) + return (E2BIG); + u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); + u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); + u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); + u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); + } else { + return (EILSEQ); + } + } + + if (hi) + return (EINVAL); + + *utf16len = u16l; + *utf8len = u8l; + + return (0); +} + +int +uconv_u32tou16(const uint32_t *u32s, size_t *utf32len, + uint16_t *u16s, size_t *utf16len, int flag) +{ + int inendian; + int outendian; + size_t u16l; + size_t u32l; + uint32_t hi; + uint32_t lo; + boolean_t do_not_ignore_null; + + if (u32s == NULL || utf32len == NULL) + return (EILSEQ); + + if (u16s == NULL || utf16len == NULL) + return (E2BIG); + + if (check_endian(flag, &inendian, &outendian) != 0) + return (EBADF); + + u16l = u32l = 0; + do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); + + if ((flag & UCONV_IN_ACCEPT_BOM) && + check_bom32(u32s, *utf32len, &inendian)) + u32l++; + + inendian &= UCONV_IN_NAT_ENDIAN; + outendian &= UCONV_OUT_NAT_ENDIAN; + + if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) + u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : + UCONV_BOM_SWAPPED; + + for (; u32l < *utf32len; u32l++) { + if (u32s[u32l] == 0 && do_not_ignore_null) + break; + + hi = (inendian) ? 
u32s[u32l] : BSWAP_32(u32s[u32l]); + + /* + * Anything bigger than the Unicode coding space, i.e., + * Unicode scalar value bigger than U+10FFFF, is an illegal + * character. + */ + if (hi > UCONV_UNICODE_MAX) + return (EILSEQ); + + /* + * Anything bigger than U+FFFF must be converted into + * a surrogate pair in UTF-16. + */ + if (hi >= UCONV_U16_START) { + lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + + UCONV_U16_LO_MIN; + hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + + UCONV_U16_HI_MIN; + + if ((u16l + 1) >= *utf16len) + return (E2BIG); + + if (outendian) { + u16s[u16l++] = (uint16_t)hi; + u16s[u16l++] = (uint16_t)lo; + } else { + u16s[u16l++] = BSWAP_16(((uint16_t)hi)); + u16s[u16l++] = BSWAP_16(((uint16_t)lo)); + } + } else { + if (u16l >= *utf16len) + return (E2BIG); + u16s[u16l++] = (outendian) ? (uint16_t)hi : + BSWAP_16(((uint16_t)hi)); + } + } + + *utf16len = u16l; + *utf32len = u32l; + + return (0); +} + +int +uconv_u32tou8(const uint32_t *u32s, size_t *utf32len, + uchar_t *u8s, size_t *utf8len, int flag) +{ + int inendian; + int outendian; + size_t u32l; + size_t u8l; + uint32_t lo; + boolean_t do_not_ignore_null; + + if (u32s == NULL || utf32len == NULL) + return (EILSEQ); + + if (u8s == NULL || utf8len == NULL) + return (E2BIG); + + if (check_endian(flag, &inendian, &outendian) != 0) + return (EBADF); + + u32l = u8l = 0; + do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); + + if ((flag & UCONV_IN_ACCEPT_BOM) && + check_bom32(u32s, *utf32len, &inendian)) + u32l++; + + inendian &= UCONV_IN_NAT_ENDIAN; + + for (; u32l < *utf32len; u32l++) { + if (u32s[u32l] == 0 && do_not_ignore_null) + break; + + lo = (inendian) ? 
u32s[u32l] : BSWAP_32(u32s[u32l]); + + if (lo <= UCONV_U8_ONE_BYTE) { + if (u8l >= *utf8len) + return (E2BIG); + u8s[u8l++] = (uchar_t)lo; + } else if (lo <= UCONV_U8_TWO_BYTES) { + if ((u8l + 1) >= *utf8len) + return (E2BIG); + u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); + u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); + } else if (lo <= UCONV_U8_THREE_BYTES) { + if ((u8l + 2) >= *utf8len) + return (E2BIG); + u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); + u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); + u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); + } else if (lo <= UCONV_U8_FOUR_BYTES) { + if ((u8l + 3) >= *utf8len) + return (E2BIG); + u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); + u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); + u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); + u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); + } else { + return (EILSEQ); + } + } + + *utf32len = u32l; + *utf8len = u8l; + + return (0); +} + +int +uconv_u8tou16(const uchar_t *u8s, size_t *utf8len, + uint16_t *u16s, size_t *utf16len, int flag) +{ + int inendian; + int outendian; + size_t u16l; + size_t u8l; + uint32_t hi; + uint32_t lo; + int remaining_bytes; + int first_b; + boolean_t do_not_ignore_null; + + if (u8s == NULL || utf8len == NULL) + return (EILSEQ); + + if (u16s == NULL || utf16len == NULL) + return (E2BIG); + + if (check_endian(flag, &inendian, &outendian) != 0) + return (EBADF); + + u16l = u8l = 0; + do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); + + outendian &= UCONV_OUT_NAT_ENDIAN; + + if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) + u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : + UCONV_BOM_SWAPPED; + + for (; u8l < *utf8len; ) { + if (u8s[u8l] == 0 && do_not_ignore_null) + break; + + /* + * Collect a UTF-8 character and convert it to a UTF-32 + * character. In doing so, we screen out illegally formed + * UTF-8 characters and treat such as illegal characters. 
+ * The algorithm at below also screens out anything bigger + * than the U+10FFFF. + * + * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for + * more details on the illegal values of UTF-8 character + * bytes. + */ + hi = (uint32_t)u8s[u8l++]; + + if (hi > UCONV_ASCII_MAX) { + if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) + return (EILSEQ); + + first_b = hi; + hi = hi & masks_tbl[remaining_bytes]; + + for (; remaining_bytes > 0; remaining_bytes--) { + /* + * If we have no more bytes, the current + * UTF-8 character is incomplete. + */ + if (u8l >= *utf8len) + return (EINVAL); + + lo = (uint32_t)u8s[u8l++]; + + if (first_b) { + if (lo < valid_min_2nd_byte[first_b] || + lo > valid_max_2nd_byte[first_b]) + return (EILSEQ); + first_b = 0; + } else if (lo < UCONV_U8_BYTE_MIN || + lo > UCONV_U8_BYTE_MAX) { + return (EILSEQ); + } + hi = (hi << UCONV_U8_BIT_SHIFT) | + (lo & UCONV_U8_BIT_MASK); + } + } + + if (hi >= UCONV_U16_START) { + lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + + UCONV_U16_LO_MIN; + hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + + UCONV_U16_HI_MIN; + + if ((u16l + 1) >= *utf16len) + return (E2BIG); + + if (outendian) { + u16s[u16l++] = (uint16_t)hi; + u16s[u16l++] = (uint16_t)lo; + } else { + u16s[u16l++] = BSWAP_16(((uint16_t)hi)); + u16s[u16l++] = BSWAP_16(((uint16_t)lo)); + } + } else { + if (u16l >= *utf16len) + return (E2BIG); + + u16s[u16l++] = (outendian) ? 
(uint16_t)hi : + BSWAP_16(((uint16_t)hi)); + } + } + + *utf16len = u16l; + *utf8len = u8l; + + return (0); +} + +int +uconv_u8tou32(const uchar_t *u8s, size_t *utf8len, + uint32_t *u32s, size_t *utf32len, int flag) +{ + int inendian; + int outendian; + size_t u32l; + size_t u8l; + uint32_t hi; + uint32_t c; + int remaining_bytes; + int first_b; + boolean_t do_not_ignore_null; + + if (u8s == NULL || utf8len == NULL) + return (EILSEQ); + + if (u32s == NULL || utf32len == NULL) + return (E2BIG); + + if (check_endian(flag, &inendian, &outendian) != 0) + return (EBADF); + + u32l = u8l = 0; + do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); + + outendian &= UCONV_OUT_NAT_ENDIAN; + + if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) + u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : + UCONV_BOM_SWAPPED_32; + + for (; u8l < *utf8len; ) { + if (u8s[u8l] == 0 && do_not_ignore_null) + break; + + hi = (uint32_t)u8s[u8l++]; + + if (hi > UCONV_ASCII_MAX) { + if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) + return (EILSEQ); + + first_b = hi; + hi = hi & masks_tbl[remaining_bytes]; + + for (; remaining_bytes > 0; remaining_bytes--) { + if (u8l >= *utf8len) + return (EINVAL); + + c = (uint32_t)u8s[u8l++]; + + if (first_b) { + if (c < valid_min_2nd_byte[first_b] || + c > valid_max_2nd_byte[first_b]) + return (EILSEQ); + first_b = 0; + } else if (c < UCONV_U8_BYTE_MIN || + c > UCONV_U8_BYTE_MAX) { + return (EILSEQ); + } + hi = (hi << UCONV_U8_BIT_SHIFT) | + (c & UCONV_U8_BIT_MASK); + } + } + + if (u32l >= *utf32len) + return (E2BIG); + + u32s[u32l++] = (outendian) ? 
hi : BSWAP_32(hi); + } + + *utf32len = u32l; + *utf8len = u8l; + + return (0); +} diff -r a4c12419233c -r 09764a26229e usr/src/lib/libc/Makefile.targ --- a/usr/src/lib/libc/Makefile.targ Fri Sep 14 08:32:57 2007 -0700 +++ b/usr/src/lib/libc/Makefile.targ Fri Sep 14 10:25:36 2007 -0700 @@ -276,6 +276,10 @@ $(COMPILE.c) -o $@ $(SRC)/common/dtrace/$(@F:.o=.c) $(POST_PROCESS_O) +$(UNICODEOBJS:%=pics/%): $(SRC)/common/unicode/$$(@F:.o=.c) + $(COMPILE.c) -o $@ $(SRC)/common/unicode/$(@F:.o=.c) + $(POST_PROCESS_O) + # DTrace rules pics/%.o: ../port/threads/%.d $(THREADSOBJS:%=pics/%) $(COMPILE.d) -C -xlazyload -s $< -o $@ $(THREADSOBJS:%=pics/%) diff -r a4c12419233c -r 09764a26229e usr/src/lib/libc/amd64/Makefile --- a/usr/src/lib/libc/amd64/Makefile Fri Sep 14 08:32:57 2007 -0700 +++ b/usr/src/lib/libc/amd64/Makefile Fri Sep 14 10:25:36 2007 -0700 @@ -762,6 +762,10 @@ THREADSASMOBJS= \ asm_subr.o +UNICODEOBJS= \ + u8_textprep.o \ + uconv.o + UNWINDMACHOBJS= \ call_frame_inst.o \ eh_frame.o \ @@ -866,6 +870,7 @@ $(THREADSOBJS) \ $(THREADSMACHOBJS) \ $(THREADSASMOBJS) \ + $(UNICODEOBJS) \ $(UNWINDMACHOBJS) \ $(UNWINDASMOBJS) \ $(COMSYSOBJS) \ @@ -974,6 +979,7 @@ $(TPOOLOBJS:%.o=../port/tpool/%.c) \ $(THREADSOBJS:%.o=../port/threads/%.c) \ $(THREADSMACHOBJS:%.o=threads/%.c) \ + $(UNICODEOBJS:%.o=$(SRC)/common/unicode/%.c) \ $(UNWINDMACHOBJS:%.o=unwind/%.c) \ $(FPOBJS:%.o=fp/%.c) \ $(I386FPOBJS:%.o=../i386/fp/%.c) \ diff -r a4c12419233c -r 09764a26229e usr/src/lib/libc/i386/Makefile.com --- a/usr/src/lib/libc/i386/Makefile.com Fri Sep 14 08:32:57 2007 -0700 +++ b/usr/src/lib/libc/i386/Makefile.com Fri Sep 14 10:25:36 2007 -0700 @@ -804,6 +804,10 @@ THREADSASMOBJS= \ asm_subr.o +UNICODEOBJS= \ + u8_textprep.o \ + uconv.o + UNWINDMACHOBJS= \ unwind.o @@ -904,6 +908,7 @@ $(THREADSOBJS) \ $(THREADSMACHOBJS) \ $(THREADSASMOBJS) \ + $(UNICODEOBJS) \ $(UNWINDMACHOBJS) \ $(UNWINDASMOBJS) \ $(COMSYSOBJS) \ @@ -1035,6 +1040,7 @@ $(TPOOLOBJS:%.o=../port/tpool/%.c) \ 
$(THREADSOBJS:%.o=../port/threads/%.c) \ $(THREADSMACHOBJS:%.o=../$(MACH)/threads/%.c) \ + $(UNICODEOBJS:%.o=$(SRC)/common/unicode/%.c) \ $(UNWINDMACHOBJS:%.o=../port/unwind/%.c) \ $(FPOBJS:%.o=../$(MACH)/fp/%.c) \ $(LIBCBASE)/gen/ecvt.c \ diff -r a4c12419233c -r 09764a26229e usr/src/lib/libc/port/mapfile-vers --- a/usr/src/lib/libc/port/mapfile-vers Fri Sep 14 08:32:57 2007 -0700 +++ b/usr/src/lib/libc/port/mapfile-vers Fri Sep 14 10:25:36 2007 -0700 @@ -123,6 +123,15 @@ timer_getoverrun; timer_gettime; timer_settime; + u8_strcmp; + u8_textprep_str; + u8_validate; + uconv_u16tou32; + uconv_u16tou8; + uconv_u32tou16; + uconv_u32tou8; + uconv_u8tou16; + uconv_u8tou32; uucopy; uucopystr; vforkx; diff -r a4c12419233c -r 09764a26229e usr/src/lib/libc/sparc/Makefile --- a/usr/src/lib/libc/sparc/Makefile Fri Sep 14 08:32:57 2007 -0700 +++ b/usr/src/lib/libc/sparc/Makefile Fri Sep 14 10:25:36 2007 -0700 @@ -830,6 +830,10 @@ THREADSASMOBJS= \ asm_subr.o +UNICODEOBJS= \ + u8_textprep.o \ + uconv.o + UNWINDMACHOBJS= \ unwind.o @@ -932,6 +936,7 @@ $(THREADSOBJS) \ $(THREADSMACHOBJS) \ $(THREADSASMOBJS) \ + $(UNICODEOBJS) \ $(UNWINDMACHOBJS) \ $(UNWINDASMOBJS) \ $(COMSYSOBJS) \ @@ -1053,6 +1058,7 @@ $(TPOOLOBJS:%.o=../port/tpool/%.c) \ $(THREADSOBJS:%.o=../port/threads/%.c) \ $(THREADSMACHOBJS:%.o=../$(MACH)/threads/%.c) \ + $(UNICODEOBJS:%.o=$(SRC)/common/unicode/%.c) \ $(UNWINDMACHOBJS:%.o=../port/unwind/%.c) \ $(FPOBJS:%.o=../$(MACH)/fp/%.c) \ $(LIBCBASE)/crt/_ftou.c \ diff -r a4c12419233c -r 09764a26229e usr/src/lib/libc/sparcv9/Makefile --- a/usr/src/lib/libc/sparcv9/Makefile Fri Sep 14 08:32:57 2007 -0700 +++ b/usr/src/lib/libc/sparcv9/Makefile Fri Sep 14 10:25:36 2007 -0700 @@ -777,6 +777,10 @@ THREADSASMOBJS= \ asm_subr.o +UNICODEOBJS= \ + u8_textprep.o \ + uconv.o + UNWINDMACHOBJS= \ unwind.o @@ -875,6 +879,7 @@ $(THREADSOBJS) \ $(THREADSMACHOBJS) \ $(THREADSASMOBJS) \ + $(UNICODEOBJS) \ $(UNWINDMACHOBJS) \ $(UNWINDASMOBJS) \ $(COMSYSOBJS) \ @@ -986,6 +991,7 @@ 
$(TPOOLOBJS:%.o=../port/tpool/%.c) \ $(THREADSOBJS:%.o=../port/threads/%.c) \ $(THREADSMACHOBJS:%.o=../$(MACH)/threads/%.c) \ + $(UNICODEOBJS:%.o=$(SRC)/common/unicode/%.c) \ $(UNWINDMACHOBJS:%.o=../port/unwind/%.c) \ $(FPOBJS:%.o=../$(MACH)/fp/%.c) \ $(FPOBJS64:%.o=$(LIBCBASE)/fp/%.c) \ diff -r a4c12419233c -r 09764a26229e usr/src/pkgdefs/SUNWhea/prototype_com --- a/usr/src/pkgdefs/SUNWhea/prototype_com Fri Sep 14 08:32:57 2007 -0700 +++ b/usr/src/pkgdefs/SUNWhea/prototype_com Fri Sep 14 10:25:36 2007 -0700 @@ -1293,6 +1293,7 @@ f none usr/include/sys/turnstile.h 644 root bin f none usr/include/sys/types.h 644 root bin f none usr/include/sys/types32.h 644 root bin +f none usr/include/sys/u8_textprep.h 644 root bin f none usr/include/sys/uadmin.h 644 root bin f none usr/include/sys/ucontext.h 644 root bin f none usr/include/sys/uio.h 644 root bin diff -r a4c12419233c -r 09764a26229e usr/src/uts/common/Makefile.rules --- a/usr/src/uts/common/Makefile.rules Fri Sep 14 08:32:57 2007 -0700 +++ b/usr/src/uts/common/Makefile.rules Fri Sep 14 10:25:36 2007 -0700 @@ -951,6 +951,10 @@ $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(COMMONBASE)/unicode/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/vm/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1673,6 +1677,9 @@ $(LINTS_DIR)/%.ln: $(COMMONBASE)/util/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(COMMONBASE)/unicode/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/vm/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) diff -r a4c12419233c -r 09764a26229e usr/src/uts/common/os/u8_textprep.c --- a/usr/src/uts/common/os/u8_textprep.c Fri Sep 14 08:32:57 2007 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2126 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - - -/* - * UTF-8 text preparation functions (PSARC/2007/149). - * - * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F), - * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). - * Interface stability: Committed. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -/* The maximum possible number of bytes in a UTF-8 character. */ -#define U8_MB_CUR_MAX (4) - -/* - * The maximum number of bytes needed for a UTF-8 character to cover - * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2. - */ -#define U8_MAX_BYTES_UCS2 (3) - -/* The maximum possible number of bytes in a Stream-Safe Text. */ -#define U8_STREAM_SAFE_TEXT_MAX (128) - -/* - * The maximum number of characters in a combining/conjoining sequence and - * the actual upperbound limit of a combining/conjoining sequence. - */ -#define U8_MAX_CHARS_A_SEQ (32) -#define U8_UPPER_LIMIT_IN_A_SEQ (31) - -/* The combining class value for Starter. */ -#define U8_COMBINING_CLASS_STARTER (0) - -/* - * Some Hangul related macros at below. 
- * - * The first and the last of Hangul syllables, Hangul Jamo Leading consonants, - * Vowels, and optional Trailing consonants in Unicode scalar values. - * - * Please be noted that the U8_HANGUL_JAMO_T_FIRST is 0x11A7 at below not - * the actual U+11A8. This is due to that the trailing consonant is optional - * and thus we are doing a pre-calculation of subtracting one. - * - * Each of 19 modern leading consonants has total 588 possible syllables since - * Hangul has 21 modern vowels and 27 modern trailing consonants plus 1 for - * no trailing consonant case, i.e., 21 x 28 = 588. - * - * We also have bunch of Hangul related macros at below. Please bear in mind - * that the U8_HANGUL_JAMO_1ST_BYTE can be used to check whether it is - * a Hangul Jamo or not but the value does not guarantee that it is a Hangul - * Jamo; it just guarantee that it will be most likely. - */ -#define U8_HANGUL_SYL_FIRST (0xAC00U) -#define U8_HANGUL_SYL_LAST (0xD7A3U) - -#define U8_HANGUL_JAMO_L_FIRST (0x1100U) -#define U8_HANGUL_JAMO_L_LAST (0x1112U) -#define U8_HANGUL_JAMO_V_FIRST (0x1161U) -#define U8_HANGUL_JAMO_V_LAST (0x1175U) -#define U8_HANGUL_JAMO_T_FIRST (0x11A7U) -#define U8_HANGUL_JAMO_T_LAST (0x11C2U) - -#define U8_HANGUL_V_COUNT (21) -#define U8_HANGUL_VT_COUNT (588) -#define U8_HANGUL_T_COUNT (28) - -#define U8_HANGUL_JAMO_1ST_BYTE (0xE1U) - -#define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \ - (s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \ - (s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \ - (s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU)); - -#define U8_HANGUL_JAMO_L(u) \ - ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST) - -#define U8_HANGUL_JAMO_V(u) \ - ((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST) - -#define U8_HANGUL_JAMO_T(u) \ - ((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) - -#define U8_HANGUL_JAMO(u) \ - ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) - 
-#define U8_HANGUL_SYLLABLE(u) \ - ((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST) - -#define U8_HANGUL_COMPOSABLE_L_V(s, u) \ - ((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u))) - -#define U8_HANGUL_COMPOSABLE_LV_T(s, u) \ - ((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u))) - -/* The types of decomposition mappings. */ -#define U8_DECOMP_BOTH (0xF5U) -#define U8_DECOMP_CANONICAL (0xF6U) - -/* The indicator for 16-bit table. */ -#define U8_16BIT_TABLE_INDICATOR (0x8000U) - -/* The following are some convenience macros. */ -#define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \ - (u) = ((uint32_t)(b1) & 0x0F) << 12 | ((uint32_t)(b2) & 0x3F) << 6 | \ - (uint32_t)(b3) & 0x3F; - -#define U8_SIMPLE_SWAP(a, b, t) \ - (t) = (a); \ - (a) = (b); \ - (b) = (t); - -#define U8_ASCII_TOUPPER(c) \ - (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c)) - -#define U8_ASCII_TOLOWER(c) \ - (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c)) - -#define U8_ISASCII(c) (((uchar_t)(c)) < 0x80U) -/* - * The following macro assumes that the two characters that are to be - * swapped are adjacent to each other and 'a' comes before 'b'. - * - * If the assumptions are not met, then, the macro will fail. - */ -#define U8_SWAP_COMB_MARKS(a, b) \ - for (k = 0; k < disp[(a)]; k++) \ - u8t[k] = u8s[start[(a)] + k]; \ - for (k = 0; k < disp[(b)]; k++) \ - u8s[start[(a)] + k] = u8s[start[(b)] + k]; \ - start[(b)] = start[(a)] + disp[(b)]; \ - for (k = 0; k < disp[(a)]; k++) \ - u8s[start[(b)] + k] = u8t[k]; \ - U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \ - U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); - -/* The possible states during normalization. 
*/ -typedef enum { - U8_STATE_START = 0, - U8_STATE_HANGUL_L = 1, - U8_STATE_HANGUL_LV = 2, - U8_STATE_HANGUL_LVT = 3, - U8_STATE_HANGUL_V = 4, - U8_STATE_HANGUL_T = 5, - U8_STATE_COMBINING_MARK = 6 -} u8_normalization_states_t; - -/* - * The three vectors at below are used to check bytes of a given UTF-8 - * character are valid and not containing any malformed byte values. - * - * We used to have a quite relaxed UTF-8 binary representation but then there - * was some security related issues and so the Unicode Consortium defined - * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it - * one more time at the Unicode 3.2. The following three tables are based on - * that. - */ - -#define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF) - -#define I_ U8_ILLEGAL_CHAR -#define O_ U8_OUT_OF_RANGE_CHAR - -const int8_t u8_number_of_bytes[0x100] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - -/* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */ - I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, - -/* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */ - I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, - -/* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */ - I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, - -/* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */ - I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, - -/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ - I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - -/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - -/* E0 
E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - -/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ - 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, -}; - -#undef I_ -#undef O_ - -const uint8_t u8_valid_min_2nd_byte[0x100] = { - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, -/* C0 C1 C2 C3 C4 C5 C6 C7 */ - 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, -/* C8 C9 CA CB CC CD CE CF */ - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, -/* D0 D1 D2 D3 D4 D5 D6 D7 */ - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, -/* D8 D9 DA DB DC DD DE DF */ - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, -/* E0 E1 E2 E3 E4 E5 E6 E7 */ - 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, -/* E8 E9 EA EB EC ED EE EF */ - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, -/* F0 F1 F2 F3 F4 F5 F6 F7 */ - 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, -}; - -const uint8_t u8_valid_max_2nd_byte[0x100] = { - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 
0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, -/* C0 C1 C2 C3 C4 C5 C6 C7 */ - 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, -/* C8 C9 CA CB CC CD CE CF */ - 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, -/* D0 D1 D2 D3 D4 D5 D6 D7 */ - 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, -/* D8 D9 DA DB DC DD DE DF */ - 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, -/* E0 E1 E2 E3 E4 E5 E6 E7 */ - 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, -/* E8 E9 EA EB EC ED EE EF */ - 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, -/* F0 F1 F2 F3 F4 F5 F6 F7 */ - 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, -}; - - -/* - * The u8_validate() validates on the given UTF-8 character string and - * calculate the byte length. It is quite similar to mblen(3C) except that - * this will validate against the list of characters if required and - * specific to UTF-8 and Unicode. - */ -int -u8_validate(char *u8str, size_t n, char **list, int flag, int *errno) -{ - uchar_t *ib; - uchar_t *ibtail; - uchar_t **p; - uchar_t *s1; - uchar_t *s2; - uchar_t f; - int sz; - size_t i; - int ret_val; - boolean_t second; - boolean_t no_need_to_validate_entire; - boolean_t check_additional; - boolean_t validate_ucs2_range_only; - - if (! u8str) - return (0); - - ib = (uchar_t *)u8str; - ibtail = ib + n; - - ret_val = 0; - - no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE); - check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL; - validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE; - - while (ib < ibtail) { - /* - * The first byte of a UTF-8 character tells how many - * bytes will follow for the character. If the first byte - * is an illegal byte value or out of range value, we just - * return -1 with an appropriate error number. 
- */ - sz = u8_number_of_bytes[*ib]; - if (sz == U8_ILLEGAL_CHAR) { - *errno = EILSEQ; - return (-1); - } - - if (sz == U8_OUT_OF_RANGE_CHAR || - (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) { - *errno = ERANGE; - return (-1); - } - - /* - * If we don't have enough bytes to check on, that's also - * an error. As you can see, we give illegal byte sequence - * checking higher priority then EINVAL cases. - */ - if ((ibtail - ib) < sz) { - *errno = EINVAL; - return (-1); - } - - if (sz == 1) { - ib++; - ret_val++; - } else { - /* - * Check on the multi-byte UTF-8 character. For more - * details on this, see comment added for the used - * data structures at the beginning of the file. - */ - f = *ib++; - ret_val++; - second = B_TRUE; - for (i = 1; i < sz; i++) { - if (second) { - if (*ib < u8_valid_min_2nd_byte[f] || - *ib > u8_valid_max_2nd_byte[f]) { - *errno = EILSEQ; - return (-1); - } - second = B_FALSE; - } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) { - *errno = EILSEQ; - return (-1); - } - ib++; - ret_val++; - } - } - - if (check_additional) { - for (p = (uchar_t **)list, i = 0; p[i]; i++) { - s1 = ib - sz; - s2 = p[i]; - while (s1 < ib) { - if (*s1 != *s2 || *s2 == '\0') - break; - s1++; - s2++; - } - - if (s1 >= ib && *s2 == '\0') { - *errno = EBADF; - return (-1); - } - } - } - - if (no_need_to_validate_entire) - break; - } - - return (ret_val); -} - -/* - * The do_case_conv() looks at the mapping tables and returns found - * bytes if any. If not found, the input bytes are returned. The function - * always terminate the return bytes with a null character assuming that - * there are plenty of room to do so. - * - * The case conversions are simple case conversions mapping a character to - * another character as specified in the Unicode data. The byte size of - * the mapped character could be different from that of the input character. - * - * The return value is the byte length of the returned character excluding - * the terminating null byte. 
- */ -static size_t -do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper) -{ - size_t i; - uint16_t b1 = 0; - uint16_t b2 = 0; - uint16_t b3 = 0; - uint16_t b3_tbl; - uint16_t b3_base; - uint16_t b4 = 0; - size_t start_id; - size_t end_id; - - /* - * At this point, the only possible values for sz are 2, 3, and 4. - * The u8s should point to a vector that is well beyond the size of - * 5 bytes. - */ - if (sz == 2) { - b3 = u8s[0] = s[0]; - b4 = u8s[1] = s[1]; - } else if (sz == 3) { - b2 = u8s[0] = s[0]; - b3 = u8s[1] = s[1]; - b4 = u8s[2] = s[2]; - } else if (sz == 4) { - b1 = u8s[0] = s[0]; - b2 = u8s[1] = s[1]; - b3 = u8s[2] = s[2]; - b4 = u8s[3] = s[3]; - } else { - /* This is not possible but just in case as a fallback. */ - if (is_it_toupper) - *u8s = U8_ASCII_TOUPPER(*s); - else - *u8s = U8_ASCII_TOLOWER(*s); - u8s[1] = '\0'; - - return (1); - } - u8s[sz] = '\0'; - - /* - * Let's find out if we have a corresponding character. - */ - b1 = u8_common_b1_tbl[uv][b1]; - if (b1 == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - b2 = u8_case_common_b2_tbl[uv][b1][b2]; - if (b2 == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - if (is_it_toupper) { - b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id; - if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4]; - end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1]; - - /* Either there is no match or an error at the table. 
*/ - if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) - return ((size_t)sz); - - b3_base = u8_toupper_b3_tbl[uv][b2][b3].base; - - for (i = 0; start_id < end_id; start_id++) - u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id]; - } else { - b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id; - if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4]; - end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1]; - - if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) - return ((size_t)sz); - - b3_base = u8_tolower_b3_tbl[uv][b2][b3].base; - - for (i = 0; start_id < end_id; start_id++) - u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id]; - } - - /* - * If i is still zero, that means there is no corresponding character. - */ - if (i == 0) - return ((size_t)sz); - - u8s[i] = '\0'; - - return (i); -} - -/* - * The do_case_compare() function compares the two input strings, s1 and s2, - * one character at a time doing case conversions if applicable and return - * the comparison result as like strcmp(). - * - * Since, in empirical sense, most of text data are 7-bit ASCII characters, - * we treat the 7-bit ASCII characters as a special case trying to yield - * faster processing time. - */ -static int -do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, - size_t n2, boolean_t is_it_toupper, int *errno) -{ - int f; - int sz1; - int sz2; - size_t j; - size_t i1; - size_t i2; - uchar_t u8s1[U8_MB_CUR_MAX + 1]; - uchar_t u8s2[U8_MB_CUR_MAX + 1]; - - i1 = i2 = 0; - while (i1 < n1 && i2 < n2) { - /* - * Find out what would be the byte length for this UTF-8 - * character at string s1 and also find out if this is - * an illegal start byte or not and if so, issue a proper - * errno and yet treat this byte as a character. - */ - sz1 = u8_number_of_bytes[*s1]; - if (sz1 < 0) { - *errno = EILSEQ; - sz1 = 1; - } - - /* - * For 7-bit ASCII characters mainly, we do a quick case - * conversion right at here. 
- * - * If we don't have enough bytes for this character, issue - * an EINVAL error and use what are available. - * - * If we have enough bytes, find out if there is - * a corresponding uppercase character and if so, copy over - * the bytes for a comparison later. If there is no - * corresponding uppercase character, then, use what we have - * for the comparison. - */ - if (sz1 == 1) { - if (is_it_toupper) - u8s1[0] = U8_ASCII_TOUPPER(*s1); - else - u8s1[0] = U8_ASCII_TOLOWER(*s1); - s1++; - u8s1[1] = '\0'; - } else if ((i1 + sz1) > n1) { - *errno = EINVAL; - for (j = 0; (i1 + j) < n1; ) - u8s1[j++] = *s1++; - u8s1[j] = '\0'; - } else { - (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper); - s1 += sz1; - } - - /* Do the same for the string s2. */ - sz2 = u8_number_of_bytes[*s2]; - if (sz2 < 0) { - *errno = EILSEQ; - sz2 = 1; - } - - if (sz2 == 1) { - if (is_it_toupper) - u8s2[0] = U8_ASCII_TOUPPER(*s2); - else - u8s2[0] = U8_ASCII_TOLOWER(*s2); - s2++; - u8s2[1] = '\0'; - } else if ((i2 + sz2) > n2) { - *errno = EINVAL; - for (j = 0; (i2 + j) < n2; ) - u8s2[j++] = *s2++; - u8s2[j] = '\0'; - } else { - (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper); - s2 += sz2; - } - - /* Now compare the two characters. */ - if (sz1 == 1 && sz2 == 1) { - if (*u8s1 > *u8s2) - return (1); - if (*u8s1 < *u8s2) - return (-1); - } else { - f = strcmp((const char *)u8s1, (const char *)u8s2); - if (f != 0) - return (f); - } - - /* - * They were the same. Let's move on to the next - * characters then. - */ - i1 += sz1; - i2 += sz2; - } - - /* - * We compared until the end of either or both strings. - * - * If we reached to or went over the ends for the both, that means - * they are the same. - * - * If we reached only one of the two ends, that means the other string - * has something which then the fact can be used to determine - * the return value. 
- */ - if (i1 >= n1) { - if (i2 >= n2) - return (0); - return (-1); - } - return (1); -} - -/* - * The combining_class() function checks on the given bytes and find out - * the corresponding Unicode combining class value. The return value 0 means - * it is a Starter. Any illegal UTF-8 character will also be treated as - * a Starter. - */ -static uchar_t -combining_class(size_t uv, uchar_t *s, size_t sz) -{ - uint16_t b1 = 0; - uint16_t b2 = 0; - uint16_t b3 = 0; - uint16_t b4 = 0; - - if (sz == 1 || sz > 4) - return (0); - - if (sz == 2) { - b3 = s[0]; - b4 = s[1]; - } else if (sz == 3) { - b2 = s[0]; - b3 = s[1]; - b4 = s[2]; - } else if (sz == 4) { - b1 = s[0]; - b2 = s[1]; - b3 = s[2]; - b4 = s[3]; - } - - b1 = u8_common_b1_tbl[uv][b1]; - if (b1 == U8_TBL_ELEMENT_NOT_DEF) - return (0); - - b2 = u8_combining_class_b2_tbl[uv][b1][b2]; - if (b2 == U8_TBL_ELEMENT_NOT_DEF) - return (0); - - b3 = u8_combining_class_b3_tbl[uv][b2][b3]; - if (b3 == U8_TBL_ELEMENT_NOT_DEF) - return (0); - - return (u8_combining_class_b4_tbl[uv][b3][b4]); -} - -/* - * The do_decomp() function finds out a matching decomposition if any - * and return. If there is no match, the input bytes are copied and returned. - * The function also checks if there is a Hangul, decomposes it if necessary - * and returns. - * - * To save time, a single byte 7-bit ASCII character should be handled by - * the caller. - * - * The function returns the number of bytes returned sans always terminating - * the null byte. It will also return a state that will tell if there was - * a Hangul character decomposed which then will be used by the caller. 
- */ -static size_t -do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz, - boolean_t canonical_decomposition, u8_normalization_states_t *state) -{ - uint16_t b1 = 0; - uint16_t b2 = 0; - uint16_t b3 = 0; - uint16_t b3_tbl; - uint16_t b3_base; - uint16_t b4 = 0; - size_t start_id; - size_t end_id; - size_t i; - uint32_t u1; /* Unicode scalar value of a three-byte character */ - /* The input bytes are copied to u8s right away so that u8s already holds the result when no mapping is found. */ - if (sz == 2) { - b3 = u8s[0] = s[0]; - b4 = u8s[1] = s[1]; - u8s[2] = '\0'; - } else if (sz == 3) { - /* Convert it to a Unicode scalar value. */ - U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]); - - /* - * If this is a Hangul syllable, we decompose it into - * a leading consonant, a vowel, and an optional trailing - * consonant and then return. - */ - if (U8_HANGUL_SYLLABLE(u1)) { - u1 -= U8_HANGUL_SYL_FIRST; - /* b1 and b2 are reused for the L and V Jamo values and b3 for the trailing consonant index. */ - b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT; - b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT) - / U8_HANGUL_T_COUNT; - b3 = u1 % U8_HANGUL_T_COUNT; - - U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1); - U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2); - if (b3) { /* a zero index means there is no trailing consonant */ - b3 += U8_HANGUL_JAMO_T_FIRST; - U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3); - - u8s[9] = '\0'; - *state = U8_STATE_HANGUL_LVT; - return (9); - } - - u8s[6] = '\0'; - *state = U8_STATE_HANGUL_LV; - return (6); - } - - b2 = u8s[0] = s[0]; - b3 = u8s[1] = s[1]; - b4 = u8s[2] = s[2]; - u8s[3] = '\0'; - - /* - * If this is a Hangul Jamo, we know there is nothing - * further that we can decompose.
- */ - if (U8_HANGUL_JAMO_L(u1)) { - *state = U8_STATE_HANGUL_L; - return (3); - } - - if (U8_HANGUL_JAMO_V(u1)) { /* the new state depends on the state the caller passed in */ - if (*state == U8_STATE_HANGUL_L) - *state = U8_STATE_HANGUL_LV; - else - *state = U8_STATE_HANGUL_V; - return (3); - } - - if (U8_HANGUL_JAMO_T(u1)) { - if (*state == U8_STATE_HANGUL_LV) - *state = U8_STATE_HANGUL_LVT; - else - *state = U8_STATE_HANGUL_T; - return (3); - } - } else if (sz == 4) { - b1 = u8s[0] = s[0]; - b2 = u8s[1] = s[1]; - b3 = u8s[2] = s[2]; - b4 = u8s[3] = s[3]; - u8s[4] = '\0'; - } else { - /* - * This is a fallback and should not happen if the function - * was called properly. - */ - u8s[0] = s[0]; - u8s[1] = '\0'; - *state = U8_STATE_START; - return (1); - } - - /* - * At this point, this routine does not know what it would get. - * The caller should sort it out if the state isn't a Hangul one. - */ - *state = U8_STATE_START; - - /* - * Try to find matching decomposition mapping byte sequence. Each - * failed lookup below returns sz since u8s already holds a copy of - * the input bytes. - */ - b1 = u8_common_b1_tbl[uv][b1]; - if (b1 == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - b2 = u8_decomp_b2_tbl[uv][b1][b2]; - if (b2 == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id; - if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) - return ((size_t)sz); - - /* - * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR - * which is 0x8000, this means we couldn't fit the mappings into - * the cardinality of an unsigned byte. - */ - if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { - b3_tbl -= U8_16BIT_TABLE_INDICATOR; - start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4]; - end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; - } else { - start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4]; - end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1]; - } - - /* This also means there wasn't any matching decomposition.
*/ - if (start_id >= end_id) - return ((size_t)sz); - - /* - * The final table for decomposition mappings has three types of - * byte sequences depending on whether a mapping is for compatibility - * decomposition, canonical decomposition, or both like the following: - * - * (1) Compatibility decomposition mappings: - * - * +---+---+-...-+---+ - * | B0| B1| ... | Bm| - * +---+---+-...-+---+ - * - * The first byte, B0, is always less than 0xF5 (U8_DECOMP_BOTH). - * - * (2) Canonical decomposition mappings: - * - * +---+---+---+-...-+---+ - * | T | b0| b1| ... | bn| - * +---+---+---+-...-+---+ - * - * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL). - * - * (3) Both mappings: - * - * +---+---+---+---+-...-+---+---+---+-...-+---+ - * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm| - * +---+---+---+---+-...-+---+---+---+-...-+---+ - * - * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement - * byte, b0 to bn are canonical mapping bytes and B0 to Bm are - * compatibility mapping bytes. - * - * Note that compatibility decomposition means doing recursive - * decompositions using both compatibility decomposition mappings and - * canonical decomposition mappings. On the other hand, canonical - * decomposition means doing recursive decompositions using only - * canonical decomposition mappings. Since the table we have has gone - * through the recursions already, we do not need to do so during - * runtime, i.e., the table has been completely flattened out - * already. - */ - - b3_base = u8_decomp_b3_tbl[uv][b2][b3].base; - - /* Get the type, T, of the byte sequence. b1 is reused to hold it. */ - b1 = u8_decomp_final_tbl[uv][b3_base + start_id]; - - /* - * If necessary, adjust start_id, end_id, or both. Note that if - * this is compatibility decomposition mapping, there is no - * adjustment. - */ - if (canonical_decomposition) { - /* Is the mapping only for compatibility decomposition?
*/ - if (b1 < U8_DECOMP_BOTH) - return ((size_t)sz); - - start_id++; /* step over the type byte T */ - - if (b1 == U8_DECOMP_BOTH) { /* start_id is at D; D added to that index gives the end of the canonical bytes */ - end_id = start_id + - u8_decomp_final_tbl[uv][b3_base + start_id]; - start_id++; - } - } else { - /* - * Unless this is a compatibility decomposition mapping, - * we adjust the start_id. - */ - if (b1 == U8_DECOMP_BOTH) { /* step over T, then use D to jump to the compatibility bytes */ - start_id++; - start_id += u8_decomp_final_tbl[uv][b3_base + start_id]; - } else if (b1 == U8_DECOMP_CANONICAL) { - start_id++; - } - } - /* Copy out the selected mapping bytes, overwriting the input copy made at the top. */ - for (i = 0; start_id < end_id; start_id++) - u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id]; - u8s[i] = '\0'; - - return (i); -} - -/* - * The find_composition_start() function uses the character bytes given and - * finds out the matching composition mappings if any and returns the address - * of the composition mappings as explained in the do_composition(). It - * returns NULL when there is no mapping or when sz is not 1, 2, 3, or 4. - */ -static uchar_t * -find_composition_start(size_t uv, uchar_t *s, size_t sz) -{ - uint16_t b1 = 0; - uint16_t b2 = 0; - uint16_t b3 = 0; - uint16_t b3_tbl; - uint16_t b3_base; - uint16_t b4 = 0; - size_t start_id; - size_t end_id; - /* The bytes are right-aligned so that b4 always holds the last byte of the character. */ - if (sz == 1) { - b4 = s[0]; - } else if (sz == 2) { - b3 = s[0]; - b4 = s[1]; - } else if (sz == 3) { - b2 = s[0]; - b3 = s[1]; - b4 = s[2]; - } else if (sz == 4) { - b1 = s[0]; - b2 = s[1]; - b3 = s[2]; - b4 = s[3]; - } else { - /* - * This is a fallback and should not happen if the function - * was called properly.
- */ - return (NULL); - } - - b1 = u8_composition_b1_tbl[uv][b1]; - if (b1 == U8_TBL_ELEMENT_NOT_DEF) - return (NULL); - - b2 = u8_composition_b2_tbl[uv][b1][b2]; - if (b2 == U8_TBL_ELEMENT_NOT_DEF) - return (NULL); - - b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id; - if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) - return (NULL); - - if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { - b3_tbl -= U8_16BIT_TABLE_INDICATOR; - start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4]; - end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; - } else { - start_id = u8_composition_b4_tbl[uv][b3_tbl][b4]; - end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1]; - } - - if (start_id >= end_id) - return (NULL); - - b3_base = u8_composition_b3_tbl[uv][b2][b3].base; - - return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id])); -} - -/* - * The blocked() function checks on the combining class values of previous - * characters in this sequence and return whether it is blocked or not. - */ -static boolean_t -blocked(uchar_t *comb_class, size_t last) -{ - uchar_t my_comb_class; - size_t i; - - my_comb_class = comb_class[last]; - for (i = 1; i < last; i++) - if (comb_class[i] >= my_comb_class || - comb_class[i] == U8_COMBINING_CLASS_STARTER) - return (B_TRUE); - - return (B_FALSE); -} - -/* - * The do_composition() reads the character string pointed by 's' and - * do necessary canonical composition and then copy over the result back to - * the 's'. - * - * The input argument 's' cannot contain more than 32 characters. 
- */ -static size_t -do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start, - uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast) -{ /* For character i of the sequence, comb_class[i] is its combining class, start[i] its byte offset in s, and disp[i] its byte length. *os points at the source text that follows the sequence and oslast at the end of that text. */ - uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1]; /* temporary output; copied back to s at the end */ - uchar_t tc[U8_MB_CUR_MAX]; - uint8_t saved_marks[U8_MAX_CHARS_A_SEQ]; /* indices of marks that did not compose */ - size_t saved_marks_count; - uchar_t *p; - uchar_t *saved_p; - uchar_t *q; - size_t i; - size_t saved_i; - size_t j; - size_t k; - size_t l; /* number of bytes written to t so far */ - size_t C; - size_t saved_l; - size_t size; - uint32_t u1; - uint32_t u2; - boolean_t match_not_found = B_TRUE; - - /* - * This should never happen unless the callers are doing some strange - * and unexpected things. - * - * The "last" is the index pointing to the last character not last + 1. - */ - if (last >= U8_MAX_CHARS_A_SEQ) - last = U8_UPPER_LIMIT_IN_A_SEQ; - - for (i = l = 0; i <= last; i++) { - /* - * The last or any non-Starters at the beginning, we don't - * have any chance to do composition and so we just copy them - * to the temporary buffer. - */ - if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) { -SAVE_THE_CHAR: - p = s + start[i]; - size = disp[i]; - for (k = 0; k < size; k++) - t[l++] = *p++; - continue; - } - - /* - * If this could be a start of Hangul Jamos, then, we try to - * conjoin them.
- */ - if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) { - U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]], - s[start[i] + 1], s[start[i] + 2]); - U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3], - s[start[i] + 4], s[start[i] + 5]); - /* NOTE(review): the three bytes after this character are read as the next character without consulting disp[i + 1]; confirm this is harmless when the next character is shorter than three bytes. */ - if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) { - u1 -= U8_HANGUL_JAMO_L_FIRST; - u2 -= U8_HANGUL_JAMO_V_FIRST; - u1 = U8_HANGUL_SYL_FIRST + - (u1 * U8_HANGUL_V_COUNT + u2) * - U8_HANGUL_T_COUNT; - - i += 2; /* step over L and V; i now indexes a possible trailing consonant */ - if (i <= last) { - U8_PUT_3BYTES_INTO_UTF32(u2, - s[start[i]], s[start[i] + 1], - s[start[i] + 2]); - - if (U8_HANGUL_JAMO_T(u2)) { - u1 += u2 - - U8_HANGUL_JAMO_T_FIRST; - i++; - } - } - - U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1); - i--; /* the for loop increments i again */ - l += 3; - continue; - } - } - - /* - * Let's then find out if this Starter has composition - * mapping. - */ - p = find_composition_start(uv, s + start[i], disp[i]); - if (p == NULL) - goto SAVE_THE_CHAR; - - /* - * We have a Starter with composition mapping and the next - * character is a non-Starter. Let's try to find out if - * we can do composition. - */ - - saved_p = p; - saved_i = i; - saved_l = l; - saved_marks_count = 0; - -TRY_THE_NEXT_MARK: - q = s + start[++i]; - size = disp[i]; - - /* - * The next for() loop compares the non-Starter pointed by - * 'q' with the possible (joinable) characters pointed by 'p'. - * - * The composition final table entry pointed by the 'p' - * looks like the following: - * - * +---+---+---+-...-+---+---+---+---+-...-+---+---+ - * | C | b0| b1| ... | bn| F | B0| B1| ... | Bm| F | - * +---+---+---+-...-+---+---+---+---+-...-+---+---+ - * - * where C is the count byte indicating the number of - * mapping pairs where each pair would look like - * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second - * character of a canonical decomposition and the B0-Bm are - * the bytes of a matching composite character. The F is - * a filler byte after each character as the separator.
- */ - - match_not_found = B_TRUE; - - for (C = *p++; C > 0; C--) { - for (k = 0; k < size; p++, k++) - if (*p != q[k]) - break; - - /* Have we found it? */ - if (k >= size && *p == U8_TBL_ELEMENT_FILLER) { - match_not_found = B_FALSE; - - l = saved_l; /* the composite overwrites whatever was written for this Starter */ - - while (*++p != U8_TBL_ELEMENT_FILLER) - t[l++] = *p; - - break; - } - - /* We didn't find; skip to the next pair. */ - if (*p != U8_TBL_ELEMENT_FILLER) - while (*++p != U8_TBL_ELEMENT_FILLER) - ; - while (*++p != U8_TBL_ELEMENT_FILLER) - ; - p++; - } - - /* - * If there was no match, we will need to save the combining - * mark for later appending. After that, if the next one - * is a non-Starter and not blocked, then, we try once - * again to do composition with the next non-Starter. - * - * If there was no match and this was a Starter, then, - * this is a new start. - * - * If there was a match and a composition done and we have - * more to check on, then, we retrieve a new composition final - * table entry for the composite and then try to do the - * composition again. - */ - - if (match_not_found) { - if (comb_class[i] == U8_COMBINING_CLASS_STARTER) { - i--; - goto SAVE_THE_CHAR; - } - - saved_marks[saved_marks_count++] = i; - } - - if (saved_l == l) { /* nothing has been composed for this Starter so far */ - while (i < last) { - if (blocked(comb_class, i + 1)) - saved_marks[saved_marks_count++] = ++i; - else - break; - } - if (i < last) { - p = saved_p; - goto TRY_THE_NEXT_MARK; - } - } else if (i < last) { - p = find_composition_start(uv, t + saved_l, - l - saved_l); - if (p != NULL) { - saved_p = p; - goto TRY_THE_NEXT_MARK; - } - } - - /* - * There is no more composition possible. - * - * If there was no composition whatsoever then we copy - * over the original Starter and then append any non-Starters - * remaining at the target string sequentially after that.
- */ - - if (saved_l == l) { - p = s + start[saved_i]; - size = disp[saved_i]; - for (j = 0; j < size; j++) - t[l++] = *p++; - } - - for (k = 0; k < saved_marks_count; k++) { - p = s + start[saved_marks[k]]; - size = disp[saved_marks[k]]; - for (j = 0; j < size; j++) - t[l++] = *p++; - } - } - - /* - * If the last character is a Starter and if we have a character - * (possibly another Starter) that can be turned into a composite, - * we do so and we do so until there is no more of composition - * possible. - */ - if (comb_class[last] == U8_COMBINING_CLASS_STARTER) { - p = *os; - saved_l = l - disp[last]; - - while (p < oslast) { - size = u8_number_of_bytes[*p]; /* NOTE(review): size is a size_t, so a negative table value for an illegal byte becomes a very large number that the "size <= 1" test below does not catch; confirm that the "(p + size) > oslast" test rejects it. */ - if (size <= 1 || (p + size) > oslast) - break; - - saved_p = p; - - for (i = 0; i < size; i++) - tc[i] = *p++; - - q = find_composition_start(uv, t + saved_l, - l - saved_l); - if (q == NULL) { - p = saved_p; - break; - } - - match_not_found = B_TRUE; - - for (C = *q++; C > 0; C--) { - for (k = 0; k < size; q++, k++) - if (*q != tc[k]) - break; - - if (k >= size && *q == U8_TBL_ELEMENT_FILLER) { - match_not_found = B_FALSE; - - l = saved_l; - - while (*++q != U8_TBL_ELEMENT_FILLER) { - /* - * This is practically - * impossible but we don't - * want to take any chances. - */ - if (l >= - U8_STREAM_SAFE_TEXT_MAX) { - p = saved_p; - goto SAFE_RETURN; - } - t[l++] = *q; - } - - break; - } - - if (*q != U8_TBL_ELEMENT_FILLER) - while (*++q != U8_TBL_ELEMENT_FILLER) - ; - while (*++q != U8_TBL_ELEMENT_FILLER) - ; - q++; - } - - if (match_not_found) { - p = saved_p; - break; - } - } -SAFE_RETURN: - *os = p; /* tell the caller how far into the source text we have consumed */ - } - - /* - * Now we copy over the temporary string to the target string. - * Since composition always reduces the number of characters or - * the number of characters stays the same, we don't need to worry - * about the buffer overflow here.
- */ - for (i = 0; i < l; i++) - s[i] = t[i]; - s[l] = '\0'; - - return (l); -} - -/* - * The collect_a_seq() function checks on the given string s, collects - * a sequence of characters at u8s, and returns the sequence. While it collects - * a sequence, it also applies case conversion, canonical or compatibility - * decomposition, canonical composition, or some or all of them and - * in that order. - * - * The collected sequence cannot be bigger than 32 characters since if - * it has more than 31 characters, the sequence will be terminated - * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into - * a Stream-Safe Text. The collected sequence is always terminated with - * a null byte and the return value is the byte length of the sequence - * including 0. The return value does not include the terminating - * null byte. - */ -static size_t -collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast, - boolean_t is_it_toupper, - boolean_t is_it_tolower, - boolean_t canonical_decomposition, - boolean_t compatibility_decomposition, - boolean_t canonical_composition, - int *errno, u8_normalization_states_t *state) -{ - uchar_t *s; - int sz; - int saved_sz; - size_t i; - size_t j; - size_t k; - size_t l; - uchar_t comb_class[U8_MAX_CHARS_A_SEQ]; - uchar_t disp[U8_MAX_CHARS_A_SEQ]; - uchar_t start[U8_MAX_CHARS_A_SEQ]; - uchar_t u8t[U8_MB_CUR_MAX]; - uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1]; - uchar_t tc; - size_t last; - size_t saved_last; - uint32_t u1; - - /* - * Save the source string pointer; we hand back an updated - * pointer through 'source' once we have consumed some bytes. - */ - s = *source; - - /* - * The following is a fallback just in case callers are not - * checking the string boundaries before calling. - */ - if (s >= slast) { - u8s[0] = '\0'; - - return (0); - } - - /* - * As the first thing, let's collect a character and do case - * conversion if necessary.
- */ - - sz = u8_number_of_bytes[*s]; - - if (sz < 0) { /* an illegal byte is returned as a one-byte sequence of its own */ - *errno = EILSEQ; - - u8s[0] = *s++; - u8s[1] = '\0'; - - *source = s; - - return (1); - } - - if (sz == 1) { - if (is_it_toupper) - u8s[0] = U8_ASCII_TOUPPER(*s); - else if (is_it_tolower) - u8s[0] = U8_ASCII_TOLOWER(*s); - else - u8s[0] = *s; - s++; - u8s[1] = '\0'; - } else if ((s + sz) > slast) { /* an incomplete character: return the bytes that are left */ - *errno = EINVAL; - - for (i = 0; s < slast; ) - u8s[i++] = *s++; - u8s[i] = '\0'; - - *source = s; - - return (i); - } else { - if (is_it_toupper || is_it_tolower) { - i = do_case_conv(uv, u8s, s, sz, is_it_toupper); - s += sz; - sz = i; /* from here on sz is the byte length after case conversion */ - } else { - for (i = 0; i < sz; ) - u8s[i++] = *s++; - u8s[i] = '\0'; - } - } - - /* - * And then canonical/compatibility decomposition followed by - * an optional canonical composition. Please note that - * canonical composition is done only when a decomposition is - * done. - */ - if (canonical_decomposition || compatibility_decomposition) { /* In this block, saved_sz is the number of bytes collected at u8s and last is the number of characters collected; comb_class[], start[] and disp[] record each character's combining class, byte offset in u8s, and byte length. */ - if (sz == 1) { - *state = U8_STATE_START; - - saved_sz = 1; - - comb_class[0] = 0; - start[0] = 0; - disp[0] = 1; - - last = 1; - } else { - saved_sz = do_decomp(uv, u8s, u8s, sz, - canonical_decomposition, state); - - last = 0; - - for (i = 0; i < saved_sz; ) { - sz = u8_number_of_bytes[u8s[i]]; - - comb_class[last] = combining_class(uv, - u8s + i, sz); - start[last] = i; - disp[last] = sz; - - last++; - i += sz; - } - - /* - * Decomposition yields various Hangul related - * states but not on combining marks. We need to - * find that out here by checking on the last - * character. - */ - if (*state == U8_STATE_START) { - if (comb_class[last - 1]) - *state = U8_STATE_COMBINING_MARK; - } - } - - saved_last = last; - - while (s < slast) { - sz = u8_number_of_bytes[*s]; - - /* - * If this is an illegal character, an incomplete - * character, or a 7-bit ASCII Starter character, - * then we have collected a sequence; break and let - * the next call deal with the two cases.
- * - * Note that this is okay only if you are using this - * function with a fixed length string, not on - * a buffer with multiple calls of one chunk at a time. - */ - if (sz <= 1) { - break; - } else if ((s + sz) > slast) { - break; - } else { - /* - * If the previous character was a Hangul Jamo - * and this character is a Hangul Jamo that - * can be conjoined, we collect the Jamo. - */ - if (*s == U8_HANGUL_JAMO_1ST_BYTE) { - U8_PUT_3BYTES_INTO_UTF32(u1, - *s, *(s + 1), *(s + 2)); - - if (U8_HANGUL_COMPOSABLE_L_V(*state, - u1)) { - i = 0; /* i is stored below as this character's combining class */ - *state = U8_STATE_HANGUL_LV; - goto COLLECT_A_HANGUL; - } - - if (U8_HANGUL_COMPOSABLE_LV_T(*state, - u1)) { - i = 0; - *state = U8_STATE_HANGUL_LVT; - goto COLLECT_A_HANGUL; - } - } - - /* - * Regardless of whatever it was, if this is - * a Starter, we don't collect the character - * since that's a new start and we will deal - * with it at the next time. - */ - i = combining_class(uv, s, sz); - if (i == U8_COMBINING_CLASS_STARTER) - break; - - /* - * We know the current character is a combining - * mark. If the previous character wasn't - * a Starter (not Hangul) or a combining mark, - * then, we don't collect this combining mark. - */ - if (*state != U8_STATE_START && - *state != U8_STATE_COMBINING_MARK) - break; - - *state = U8_STATE_COMBINING_MARK; -COLLECT_A_HANGUL: - /* - * If we collected a Starter and combining - * marks up to 30, i.e., total 31 characters, - * then, we terminate this degenerately long - * combining sequence with a U+034F COMBINING - * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in - * UTF-8 and turn this into a Stream-Safe - * Text. This will be extremely rare but - * possible. - * - * The following will also guarantee that - * we are not writing more than 32 characters - * plus a NULL at u8s[].
- */ - if (last >= U8_UPPER_LIMIT_IN_A_SEQ) { -TURN_STREAM_SAFE: - *state = U8_STATE_START; - comb_class[last] = 0; - start[last] = saved_sz; - disp[last] = 2; - last++; - - u8s[saved_sz++] = 0xCD; - u8s[saved_sz++] = 0x8F; - - break; - } - - /* - * Some combining marks also do decompose into - * another combining mark or marks. - */ - if (*state == U8_STATE_COMBINING_MARK) { - k = last; /* remembered so the character can be dropped again if it does not fit */ - l = sz; /* source byte length; sz is reused for the decomposed pieces below */ - i = do_decomp(uv, uts, s, sz, - canonical_decomposition, state); - for (j = 0; j < i; ) { - sz = u8_number_of_bytes[uts[j]]; - - comb_class[last] = - combining_class(uv, - uts + j, sz); - start[last] = saved_sz + j; - disp[last] = sz; - - last++; - if (last >= - U8_UPPER_LIMIT_IN_A_SEQ) { - last = k; - goto TURN_STREAM_SAFE; - } - j += sz; - } - - *state = U8_STATE_COMBINING_MARK; - sz = i; - s += l; - - for (i = 0; i < sz; i++) - u8s[saved_sz++] = uts[i]; - } else { - comb_class[last] = i; - start[last] = saved_sz; - disp[last] = sz; - last++; - - for (i = 0; i < sz; i++) - u8s[saved_sz++] = *s++; - } - - /* - * If this is U+0345 COMBINING GREEK - * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a., - * iota subscript, and need to be converted to - * uppercase letter, convert it to U+0399 GREEK - * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8), - * i.e., convert to capital adscript form as - * specified in the Unicode standard. - * - * This is the only special case of (ambiguous) - * case conversion at combining marks and - * probably the standard will never have - * anything similar like this in future. - */ - if (is_it_toupper && sz >= 2 && - u8s[saved_sz - 2] == 0xCD && - u8s[saved_sz - 1] == 0x85) { - u8s[saved_sz - 2] = 0xCE; - u8s[saved_sz - 1] = 0x99; - } - } - } - - /* - * Let's try to ensure a canonical ordering for the collected - * combining marks. We do this only if we have collected - * at least one more non-Starter. (The decomposition mapping - * data tables have fully (and recursively) expanded and - * canonically ordered decompositions.)
- * - * The U8_SWAP_COMB_MARKS() convenience macro has some - * assumptions and we are meeting the assumptions. - */ - last--; /* from a count of characters to the index of the last character */ - if (last >= saved_last) { - for (i = 0; i < last; i++) - for (j = last; j > i; j--) - if (comb_class[j] && - comb_class[j - 1] > comb_class[j]) { - U8_SWAP_COMB_MARKS(j - 1, j); - } - } - - *source = s; - - if (! canonical_composition) { - u8s[saved_sz] = '\0'; - return (saved_sz); - } - - /* - * Now do the canonical composition. Note that we do this - * only after a canonical or compatibility decomposition to - * finish up NFC or NFKC. - */ - sz = do_composition(uv, u8s, comb_class, start, disp, last, - &s, slast); - } - - *source = s; - - return ((size_t)sz); -} - -/* - * The do_norm_compare() function does string comparison based on Unicode - * simple case mappings and Unicode Normalization definitions. - * - * It does so by collecting a sequence of characters at a time and comparing - * the collected sequences from the strings. - * - * The meanings of the return values are the same as the usual strcmp().
- */ -static int -do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2, - int flag, int *errno) -{ - int result; - size_t sz1; - size_t sz2; - uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1]; - uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1]; - uchar_t *s1last; - uchar_t *s2last; - boolean_t is_it_toupper; - boolean_t is_it_tolower; - boolean_t canonical_decomposition; - boolean_t compatibility_decomposition; - boolean_t canonical_composition; - u8_normalization_states_t state; - - s1last = s1 + n1; - s2last = s2 + n2; - - is_it_toupper = flag & U8_TEXTPREP_TOUPPER; - is_it_tolower = flag & U8_TEXTPREP_TOLOWER; - canonical_decomposition = flag & U8_CANON_DECOMP; - compatibility_decomposition = flag & U8_COMPAT_DECOMP; - canonical_composition = flag & U8_CANON_COMP; - - while (s1 < s1last && s2 < s2last) { - /* - * If the current character is a 7-bit ASCII and the last - * character, or, if the current character and the next - * character are both some 7-bit ASCII characters then - * we treat the current character as a sequence. - * - * In any other cases, we need to call collect_a_seq(). 
- */ - - if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last || - ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) { - if (is_it_toupper) - u8s1[0] = U8_ASCII_TOUPPER(*s1); - else if (is_it_tolower) - u8s1[0] = U8_ASCII_TOLOWER(*s1); - else - u8s1[0] = *s1; - u8s1[1] = '\0'; - sz1 = 1; - s1++; - } else { - state = U8_STATE_START; - sz1 = collect_a_seq(uv, u8s1, &s1, s1last, - is_it_toupper, is_it_tolower, - canonical_decomposition, - compatibility_decomposition, - canonical_composition, errno, &state); - } - - if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last || - ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) { - if (is_it_toupper) - u8s2[0] = U8_ASCII_TOUPPER(*s2); - else if (is_it_tolower) - u8s2[0] = U8_ASCII_TOLOWER(*s2); - else - u8s2[0] = *s2; - u8s2[1] = '\0'; - sz2 = 1; - s2++; - } else { - state = U8_STATE_START; - sz2 = collect_a_seq(uv, u8s2, &s2, s2last, - is_it_toupper, is_it_tolower, - canonical_decomposition, - compatibility_decomposition, - canonical_composition, errno, &state); - } - - /* - * Now compare the two characters. If they are the same, - * we move on to the next character sequences. - */ - if (sz1 == 1 && sz2 == 1) { - if (*u8s1 > *u8s2) - return (1); - if (*u8s1 < *u8s2) - return (-1); - } else { - result = strcmp((const char *)u8s1, (const char *)u8s2); - if (result != 0) - return (result); - } - } - - /* - * We compared until the end of either or both strings. - * - * If we reached to or went over the ends for the both, that means - * they are the same. - * - * If we reached only one end, that means the other string has - * something which then can be used to determine the return value. - */ - if (s1 >= s1last) { - if (s2 >= s2last) - return (0); - return (-1); - } - return (1); -} - -/* - * The u8_strcmp() function compares two UTF-8 strings quite similar to - * the strcmp(). 
For the comparison, however, Unicode Normalization specific - * equivalency and Unicode simple case conversion mappings based equivalency - * can be requested and checked against. - */ -int -u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv, - int *errno) -{ - int f; - size_t n1; - size_t n2; - - *errno = 0; - - /* - * Check on the requested Unicode version, case conversion, and - * normalization flag values. - */ - - if (uv > U8_UNICODE_LATEST) { - *errno = ERANGE; - uv = U8_UNICODE_LATEST; - } - - if (flag == 0) { - flag = U8_STRCMP_CS; - } else { - f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER | - U8_STRCMP_CI_LOWER); - if (f == 0) { - flag |= U8_STRCMP_CS; - } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER && - f != U8_STRCMP_CI_LOWER) { - *errno = EBADF; - flag = U8_STRCMP_CS; - } - - f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); - if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC && - f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) { - *errno = EBADF; - flag = U8_STRCMP_CS; - } - } - - if (flag == U8_STRCMP_CS) { - return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n)); - } - - n1 = strlen(s1); - n2 = strlen(s2); - if (n != 0) { - if (n < n1) - n1 = n; - if (n < n2) - n2 = n; - } - - /* - * Simple case conversion can be done much faster and so we do - * them separately here. 
- */ - if (flag == U8_STRCMP_CI_UPPER) { - return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, - n1, n2, B_TRUE, errno)); - } else if (flag == U8_STRCMP_CI_LOWER) { - return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, - n1, n2, B_FALSE, errno)); - } - - return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2, - flag, errno)); -} - -size_t -u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen, - int flag, size_t unicode_version, int *errno) -{ - int f; - int sz; - uchar_t *ib; - uchar_t *ibtail; - uchar_t *ob; - uchar_t *obtail; - boolean_t do_not_ignore_null; - boolean_t do_not_ignore_invalid; - boolean_t is_it_toupper; - boolean_t is_it_tolower; - boolean_t canonical_decomposition; - boolean_t compatibility_decomposition; - boolean_t canonical_composition; - size_t ret_val; - size_t i; - size_t j; - uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1]; - u8_normalization_states_t state; - - if (unicode_version > U8_UNICODE_LATEST) { - *errno = ERANGE; - return ((size_t)-1); - } - - f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER); - if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) { - *errno = EBADF; - return ((size_t)-1); - } - - f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); - if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC && - f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) { - *errno = EBADF; - return ((size_t)-1); - } - - if (inarray == NULL || *inlen == 0) - return (0); - - if (outarray == NULL) { - *errno = E2BIG; - return ((size_t)-1); - } - - ib = (uchar_t *)inarray; - ob = (uchar_t *)outarray; - ibtail = ib + *inlen; - obtail = ob + *outlen; - - do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL); - do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID); - is_it_toupper = flag & U8_TEXTPREP_TOUPPER; - is_it_tolower = flag & U8_TEXTPREP_TOLOWER; - - ret_val = 0; - - /* - * If we don't have a normalization flag set, we do the simple case - * conversion based text preparation 
separately below. Text - * preparation involving Normalization will be done in the false task - * block, again, separately since it will take much more time and - * resource than doing simple case conversions. - */ - if (f == 0) { - while (ib < ibtail) { - if (*ib == '\0' && do_not_ignore_null) - break; - - sz = u8_number_of_bytes[*ib]; - - if (sz < 0) { - if (do_not_ignore_invalid) { - *errno = EILSEQ; - ret_val = (size_t)-1; - break; - } - - sz = 1; - ret_val++; - } - - if (sz == 1) { - if (ob >= obtail) { - *errno = E2BIG; - ret_val = (size_t)-1; - break; - } - - if (is_it_toupper) - *ob = U8_ASCII_TOUPPER(*ib); - else if (is_it_tolower) - *ob = U8_ASCII_TOLOWER(*ib); - else - *ob = *ib; - ib++; - ob++; - } else if ((ib + sz) > ibtail) { - if (do_not_ignore_invalid) { - *errno = EINVAL; - ret_val = (size_t)-1; - break; - } - - if ((obtail - ob) < (ibtail - ib)) { - *errno = E2BIG; - ret_val = (size_t)-1; - break; - } - - /* - * We treat the remaining incomplete character - * bytes as a character. 
- */ - ret_val++; - - while (ib < ibtail) - *ob++ = *ib++; - } else { - if (is_it_toupper || is_it_tolower) { - i = do_case_conv(unicode_version, u8s, - ib, sz, is_it_toupper); - - if ((obtail - ob) < i) { - *errno = E2BIG; - ret_val = (size_t)-1; - break; - } - - ib += sz; - - for (sz = 0; sz < i; sz++) - *ob++ = u8s[sz]; - } else { - if ((obtail - ob) < sz) { - *errno = E2BIG; - ret_val = (size_t)-1; - break; - } - - for (i = 0; i < sz; i++) - *ob++ = *ib++; - } - } - } - } else { - canonical_decomposition = flag & U8_CANON_DECOMP; - compatibility_decomposition = flag & U8_COMPAT_DECOMP; - canonical_composition = flag & U8_CANON_COMP; - - while (ib < ibtail) { - if (*ib == '\0' && do_not_ignore_null) - break; - - /* - * If the current character is a 7-bit ASCII - * character and it is the last character, or, - * if the current character is a 7-bit ASCII - * character and the next character is also a 7-bit - * ASCII character, then, we copy over this - * character without going through collect_a_seq(). - * - * In any other cases, we need to look further with - * the collect_a_seq() function. 
- */ - if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail || - ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) { - if (ob >= obtail) { - *errno = E2BIG; - ret_val = (size_t)-1; - break; - } - - if (is_it_toupper) - *ob = U8_ASCII_TOUPPER(*ib); - else if (is_it_tolower) - *ob = U8_ASCII_TOLOWER(*ib); - else - *ob = *ib; - ib++; - ob++; - } else { - *errno = 0; - state = U8_STATE_START; - - j = collect_a_seq(unicode_version, u8s, - &ib, ibtail, - is_it_toupper, - is_it_tolower, - canonical_decomposition, - compatibility_decomposition, - canonical_composition, - errno, &state); - - if (*errno && do_not_ignore_invalid) { - ret_val = (size_t)-1; - break; - } - - if ((obtail - ob) < j) { - *errno = E2BIG; - ret_val = (size_t)-1; - break; - } - - for (i = 0; i < j; i++) - *ob++ = u8s[i]; - } - } - } - - *inlen = ibtail - ib; - *outlen = obtail - ob; - - return (ret_val); -} diff -r a4c12419233c -r 09764a26229e usr/src/uts/common/os/uconv.c --- a/usr/src/uts/common/os/uconv.c Fri Sep 14 08:32:57 2007 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,845 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 
- * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -/* - * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32. - * Man page: /shared/sac/PSARC/2005/446/materials/uconv_functions.9f - * Interface stability: Consolidation Private - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -/* - * The max and min values of high and low surrogate pairs of UTF-16, - * UTF-16 bit shift value, bit mask, and starting value outside of BMP. - */ -#define UCONV_U16_HI_MIN (0xd800U) -#define UCONV_U16_HI_MAX (0xdbffU) -#define UCONV_U16_LO_MIN (0xdc00U) -#define UCONV_U16_LO_MAX (0xdfffU) -#define UCONV_U16_BIT_SHIFT (0x0400U) -#define UCONV_U16_BIT_MASK (0x0fffffU) -#define UCONV_U16_START (0x010000U) - -/* The maximum value of Unicode coding space and ASCII coding space. */ -#define UCONV_UNICODE_MAX (0x10ffffU) -#define UCONV_ASCII_MAX (0x7fU) - -/* The mask values for input and output endians. */ -#define UCONV_IN_ENDIAN_MASKS (UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN) -#define UCONV_OUT_ENDIAN_MASKS (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN) - -/* Native and reversed endian macros. */ -#ifdef _BIG_ENDIAN -#define UCONV_IN_NAT_ENDIAN UCONV_IN_BIG_ENDIAN -#define UCONV_IN_REV_ENDIAN UCONV_IN_LITTLE_ENDIAN -#define UCONV_OUT_NAT_ENDIAN UCONV_OUT_BIG_ENDIAN -#define UCONV_OUT_REV_ENDIAN UCONV_OUT_LITTLE_ENDIAN -#else -#define UCONV_IN_NAT_ENDIAN UCONV_IN_LITTLE_ENDIAN -#define UCONV_IN_REV_ENDIAN UCONV_IN_BIG_ENDIAN -#define UCONV_OUT_NAT_ENDIAN UCONV_OUT_LITTLE_ENDIAN -#define UCONV_OUT_REV_ENDIAN UCONV_OUT_BIG_ENDIAN -#endif /* _BIG_ENDIAN */ - -/* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */ -#define UCONV_BOM_NORMAL (0xfeffU) -#define UCONV_BOM_SWAPPED (0xfffeU) -#define UCONV_BOM_SWAPPED_32 (0xfffe0000U) - -/* UTF-32 boundaries based on UTF-8 character byte lengths. 
*/ -#define UCONV_U8_ONE_BYTE (0x7fU) -#define UCONV_U8_TWO_BYTES (0x7ffU) -#define UCONV_U8_THREE_BYTES (0xffffU) -#define UCONV_U8_FOUR_BYTES (0x10ffffU) - -/* The common minimum and maximum values at the UTF-8 character bytes. */ -#define UCONV_U8_BYTE_MIN (0x80U) -#define UCONV_U8_BYTE_MAX (0xbfU) - -/* - * The following "6" and "0x3f" came from "10xx xxxx" bit representation of - * UTF-8 character bytes. - */ -#define UCONV_U8_BIT_SHIFT 6 -#define UCONV_U8_BIT_MASK 0x3f - -/* - * The following vector shows remaining bytes in a UTF-8 character. - * Index will be the first byte of the character. - */ -static const uchar_t remaining_bytes_tbl[0x100] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - -/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ - 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - -/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - -/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - -/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ - 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -/* - * The following is a vector of bit-masks to get used bits in - * the first byte of a UTF-8 character. Index is remaining bytes at above of - * the character. 
- */ -static const uchar_t masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; - -/* - * The following two vectors are to provide valid minimum and - * maximum values for the 2'nd byte of a multibyte UTF-8 character for - * better illegal sequence checking. The index value must be the value of - * the first byte of the UTF-8 character. - */ -static const uchar_t valid_min_2nd_byte[0x100] = { - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - -/* C0 C1 C2 C3 C4 C5 C6 C7 */ - 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - -/* C8 C9 CA CB CC CD CE CF */ - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - -/* D0 D1 D2 D3 D4 D5 D6 D7 */ - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - -/* D8 D9 DA DB DC DD DE DF */ - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - -/* E0 E1 E2 E3 E4 E5 E6 E7 */ - 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - -/* E8 E9 EA EB EC ED EE EF */ - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - -/* F0 F1 F2 F3 F4 F5 F6 F7 */ - 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, - - 0, 0, 0, 0, 0, 0, 0, 0 -}; - -static const uchar_t valid_max_2nd_byte[0x100] = { - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 
0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - -/* C0 C1 C2 C3 C4 C5 C6 C7 */ - 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, - -/* C8 C9 CA CB CC CD CE CF */ - 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, - -/* D0 D1 D2 D3 D4 D5 D6 D7 */ - 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, - -/* D8 D9 DA DB DC DD DE DF */ - 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, - -/* E0 E1 E2 E3 E4 E5 E6 E7 */ - 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, - -/* E8 E9 EA EB EC ED EE EF */ - 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, - -/* F0 F1 F2 F3 F4 F5 F6 F7 */ - 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, - - 0, 0, 0, 0, 0, 0, 0, 0 -}; - - -static int -check_endian(int flag, int *in, int *out) -{ - *in = flag & UCONV_IN_ENDIAN_MASKS; - - /* You cannot have both. */ - if (*in == UCONV_IN_ENDIAN_MASKS) - return (EBADF); - - if (*in == 0) - *in = UCONV_IN_NAT_ENDIAN; - - *out = flag & UCONV_OUT_ENDIAN_MASKS; - - /* You cannot have both. 
*/ - if (*out == UCONV_OUT_ENDIAN_MASKS) - return (EBADF); - - if (*out == 0) - *out = UCONV_OUT_NAT_ENDIAN; - - return (0); -} - -static boolean_t -check_bom16(const uint16_t *u16s, size_t u16l, int *in) -{ - if (u16l > 0) { - if (*u16s == UCONV_BOM_NORMAL) { - *in = UCONV_IN_NAT_ENDIAN; - return (B_TRUE); - } - if (*u16s == UCONV_BOM_SWAPPED) { - *in = UCONV_IN_REV_ENDIAN; - return (B_TRUE); - } - } - - return (B_FALSE); -} - -static boolean_t -check_bom32(const uint32_t *u32s, size_t u32l, int *in) -{ - if (u32l > 0) { - if (*u32s == UCONV_BOM_NORMAL) { - *in = UCONV_IN_NAT_ENDIAN; - return (B_TRUE); - } - if (*u32s == UCONV_BOM_SWAPPED_32) { - *in = UCONV_IN_REV_ENDIAN; - return (B_TRUE); - } - } - - return (B_FALSE); -} - -int -uconv_u16tou32(const uint16_t *u16s, size_t *utf16len, - uint32_t *u32s, size_t *utf32len, int flag) -{ - int inendian; - int outendian; - size_t u16l; - size_t u32l; - uint32_t hi; - uint32_t lo; - boolean_t do_not_ignore_null; - - /* - * Do preliminary validity checks on parameters and collect info on - * endians. - */ - if (u16s == NULL || utf16len == NULL) - return (EILSEQ); - - if (u32s == NULL || utf32len == NULL) - return (E2BIG); - - if (check_endian(flag, &inendian, &outendian) != 0) - return (EBADF); - - /* - * Initialize input and output parameter buffer indices and - * temporary variables. - */ - u16l = u32l = 0; - hi = 0; - do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); - - /* - * Check on the BOM at the beginning of the input buffer if required - * and if there is indeed one, process it. - */ - if ((flag & UCONV_IN_ACCEPT_BOM) && - check_bom16(u16s, *utf16len, &inendian)) - u16l++; - - /* - * Reset inendian and outendian so that after this point, those can be - * used as condition values. - */ - inendian &= UCONV_IN_NAT_ENDIAN; - outendian &= UCONV_OUT_NAT_ENDIAN; - - /* - * If there is something in the input buffer and if necessary and - * requested, save the BOM at the output buffer. 
- */ - if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) - u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : - UCONV_BOM_SWAPPED_32; - - /* - * Do conversion; if encounter a surrogate pair, assemble high and - * low pair values to form a UTF-32 character. If a half of a pair - * exists alone, then, either it is an illegal (EILSEQ) or - * invalid (EINVAL) value. - */ - for (; u16l < *utf16len; u16l++) { - if (u16s[u16l] == 0 && do_not_ignore_null) - break; - - lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); - - if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { - if (hi) - return (EILSEQ); - hi = lo; - continue; - } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { - if (! hi) - return (EILSEQ); - lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + - lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) - + UCONV_U16_START; - hi = 0; - } else if (hi) { - return (EILSEQ); - } - - if (u32l >= *utf32len) - return (E2BIG); - - u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo); - } - - /* - * If high half didn't see low half, then, it's most likely the input - * parameter is incomplete. - */ - if (hi) - return (EINVAL); - - /* - * Save the number of consumed and saved characters. They do not - * include terminating NULL character (U+0000) at the end of - * the input buffer (even when UCONV_IGNORE_NULL isn't specified and - * the input buffer length is big enough to include the terminating - * NULL character). 
- */ - *utf16len = u16l; - *utf32len = u32l; - - return (0); -} - -int -uconv_u16tou8(const uint16_t *u16s, size_t *utf16len, - uchar_t *u8s, size_t *utf8len, int flag) -{ - int inendian; - int outendian; - size_t u16l; - size_t u8l; - uint32_t hi; - uint32_t lo; - boolean_t do_not_ignore_null; - - if (u16s == NULL || utf16len == NULL) - return (EILSEQ); - - if (u8s == NULL || utf8len == NULL) - return (E2BIG); - - if (check_endian(flag, &inendian, &outendian) != 0) - return (EBADF); - - u16l = u8l = 0; - hi = 0; - do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); - - if ((flag & UCONV_IN_ACCEPT_BOM) && - check_bom16(u16s, *utf16len, &inendian)) - u16l++; - - inendian &= UCONV_IN_NAT_ENDIAN; - - for (; u16l < *utf16len; u16l++) { - if (u16s[u16l] == 0 && do_not_ignore_null) - break; - - lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); - - if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { - if (hi) - return (EILSEQ); - hi = lo; - continue; - } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { - if (! hi) - return (EILSEQ); - lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + - lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) - + UCONV_U16_START; - hi = 0; - } else if (hi) { - return (EILSEQ); - } - - /* - * Now we convert a UTF-32 character into a UTF-8 character. - * Unicode coding space is between U+0000 and U+10FFFF; - * anything bigger is an illegal character. 
- */ - if (lo <= UCONV_U8_ONE_BYTE) { - if (u8l >= *utf8len) - return (E2BIG); - u8s[u8l++] = (uchar_t)lo; - } else if (lo <= UCONV_U8_TWO_BYTES) { - if ((u8l + 1) >= *utf8len) - return (E2BIG); - u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); - u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); - } else if (lo <= UCONV_U8_THREE_BYTES) { - if ((u8l + 2) >= *utf8len) - return (E2BIG); - u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); - u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); - u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); - } else if (lo <= UCONV_U8_FOUR_BYTES) { - if ((u8l + 3) >= *utf8len) - return (E2BIG); - u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); - u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); - u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); - u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); - } else { - return (EILSEQ); - } - } - - if (hi) - return (EINVAL); - - *utf16len = u16l; - *utf8len = u8l; - - return (0); -} - -int -uconv_u32tou16(const uint32_t *u32s, size_t *utf32len, - uint16_t *u16s, size_t *utf16len, int flag) -{ - int inendian; - int outendian; - size_t u16l; - size_t u32l; - uint32_t hi; - uint32_t lo; - boolean_t do_not_ignore_null; - - if (u32s == NULL || utf32len == NULL) - return (EILSEQ); - - if (u16s == NULL || utf16len == NULL) - return (E2BIG); - - if (check_endian(flag, &inendian, &outendian) != 0) - return (EBADF); - - u16l = u32l = 0; - do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); - - if ((flag & UCONV_IN_ACCEPT_BOM) && - check_bom32(u32s, *utf32len, &inendian)) - u32l++; - - inendian &= UCONV_IN_NAT_ENDIAN; - outendian &= UCONV_OUT_NAT_ENDIAN; - - if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) - u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : - UCONV_BOM_SWAPPED; - - for (; u32l < *utf32len; u32l++) { - if (u32s[u32l] == 0 && do_not_ignore_null) - break; - - hi = (inendian) ? 
u32s[u32l] : BSWAP_32(u32s[u32l]); - - /* - * Anything bigger than the Unicode coding space, i.e., - * Unicode scalar value bigger than U+10FFFF, is an illegal - * character. - */ - if (hi > UCONV_UNICODE_MAX) - return (EILSEQ); - - /* - * Anything bigger than U+FFFF must be converted into - * a surrogate pair in UTF-16. - */ - if (hi >= UCONV_U16_START) { - lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + - UCONV_U16_LO_MIN; - hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + - UCONV_U16_HI_MIN; - - if ((u16l + 1) >= *utf16len) - return (E2BIG); - - if (outendian) { - u16s[u16l++] = (uint16_t)hi; - u16s[u16l++] = (uint16_t)lo; - } else { - u16s[u16l++] = BSWAP_16(((uint16_t)hi)); - u16s[u16l++] = BSWAP_16(((uint16_t)lo)); - } - } else { - if (u16l >= *utf16len) - return (E2BIG); - u16s[u16l++] = (outendian) ? (uint16_t)hi : - BSWAP_16(((uint16_t)hi)); - } - } - - *utf16len = u16l; - *utf32len = u32l; - - return (0); -} - -int -uconv_u32tou8(const uint32_t *u32s, size_t *utf32len, - uchar_t *u8s, size_t *utf8len, int flag) -{ - int inendian; - int outendian; - size_t u32l; - size_t u8l; - uint32_t lo; - boolean_t do_not_ignore_null; - - if (u32s == NULL || utf32len == NULL) - return (EILSEQ); - - if (u8s == NULL || utf8len == NULL) - return (E2BIG); - - if (check_endian(flag, &inendian, &outendian) != 0) - return (EBADF); - - u32l = u8l = 0; - do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); - - if ((flag & UCONV_IN_ACCEPT_BOM) && - check_bom32(u32s, *utf32len, &inendian)) - u32l++; - - inendian &= UCONV_IN_NAT_ENDIAN; - - for (; u32l < *utf32len; u32l++) { - if (u32s[u32l] == 0 && do_not_ignore_null) - break; - - lo = (inendian) ? 
u32s[u32l] : BSWAP_32(u32s[u32l]); - - if (lo <= UCONV_U8_ONE_BYTE) { - if (u8l >= *utf8len) - return (E2BIG); - u8s[u8l++] = (uchar_t)lo; - } else if (lo <= UCONV_U8_TWO_BYTES) { - if ((u8l + 1) >= *utf8len) - return (E2BIG); - u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); - u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); - } else if (lo <= UCONV_U8_THREE_BYTES) { - if ((u8l + 2) >= *utf8len) - return (E2BIG); - u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); - u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); - u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); - } else if (lo <= UCONV_U8_FOUR_BYTES) { - if ((u8l + 3) >= *utf8len) - return (E2BIG); - u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); - u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); - u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); - u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); - } else { - return (EILSEQ); - } - } - - *utf32len = u32l; - *utf8len = u8l; - - return (0); -} - -int -uconv_u8tou16(const uchar_t *u8s, size_t *utf8len, - uint16_t *u16s, size_t *utf16len, int flag) -{ - int inendian; - int outendian; - size_t u16l; - size_t u8l; - uint32_t hi; - uint32_t lo; - int remaining_bytes; - int first_b; - boolean_t do_not_ignore_null; - - if (u8s == NULL || utf8len == NULL) - return (EILSEQ); - - if (u16s == NULL || utf16len == NULL) - return (E2BIG); - - if (check_endian(flag, &inendian, &outendian) != 0) - return (EBADF); - - u16l = u8l = 0; - do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); - - outendian &= UCONV_OUT_NAT_ENDIAN; - - if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) - u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : - UCONV_BOM_SWAPPED; - - for (; u8l < *utf8len; ) { - if (u8s[u8l] == 0 && do_not_ignore_null) - break; - - /* - * Collect a UTF-8 character and convert it to a UTF-32 - * character. In doing so, we screen out illegally formed - * UTF-8 characters and treat such as illegal characters. 
- * The algorithm at below also screens out anything bigger - * than the U+10FFFF. - * - * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for - * more details on the illegal values of UTF-8 character - * bytes. - */ - hi = (uint32_t)u8s[u8l++]; - - if (hi > UCONV_ASCII_MAX) { - if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) - return (EILSEQ); - - first_b = hi; - hi = hi & masks_tbl[remaining_bytes]; - - for (; remaining_bytes > 0; remaining_bytes--) { - /* - * If we have no more bytes, the current - * UTF-8 character is incomplete. - */ - if (u8l >= *utf8len) - return (EINVAL); - - lo = (uint32_t)u8s[u8l++]; - - if (first_b) { - if (lo < valid_min_2nd_byte[first_b] || - lo > valid_max_2nd_byte[first_b]) - return (EILSEQ); - first_b = 0; - } else if (lo < UCONV_U8_BYTE_MIN || - lo > UCONV_U8_BYTE_MAX) { - return (EILSEQ); - } - hi = (hi << UCONV_U8_BIT_SHIFT) | - (lo & UCONV_U8_BIT_MASK); - } - } - - if (hi >= UCONV_U16_START) { - lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + - UCONV_U16_LO_MIN; - hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + - UCONV_U16_HI_MIN; - - if ((u16l + 1) >= *utf16len) - return (E2BIG); - - if (outendian) { - u16s[u16l++] = (uint16_t)hi; - u16s[u16l++] = (uint16_t)lo; - } else { - u16s[u16l++] = BSWAP_16(((uint16_t)hi)); - u16s[u16l++] = BSWAP_16(((uint16_t)lo)); - } - } else { - if (u16l >= *utf16len) - return (E2BIG); - - u16s[u16l++] = (outendian) ? 
(uint16_t)hi : - BSWAP_16(((uint16_t)hi)); - } - } - - *utf16len = u16l; - *utf8len = u8l; - - return (0); -} - -int -uconv_u8tou32(const uchar_t *u8s, size_t *utf8len, - uint32_t *u32s, size_t *utf32len, int flag) -{ - int inendian; - int outendian; - size_t u32l; - size_t u8l; - uint32_t hi; - uint32_t c; - int remaining_bytes; - int first_b; - boolean_t do_not_ignore_null; - - if (u8s == NULL || utf8len == NULL) - return (EILSEQ); - - if (u32s == NULL || utf32len == NULL) - return (E2BIG); - - if (check_endian(flag, &inendian, &outendian) != 0) - return (EBADF); - - u32l = u8l = 0; - do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); - - outendian &= UCONV_OUT_NAT_ENDIAN; - - if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) - u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : - UCONV_BOM_SWAPPED_32; - - for (; u8l < *utf8len; ) { - if (u8s[u8l] == 0 && do_not_ignore_null) - break; - - hi = (uint32_t)u8s[u8l++]; - - if (hi > UCONV_ASCII_MAX) { - if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) - return (EILSEQ); - - first_b = hi; - hi = hi & masks_tbl[remaining_bytes]; - - for (; remaining_bytes > 0; remaining_bytes--) { - if (u8l >= *utf8len) - return (EINVAL); - - c = (uint32_t)u8s[u8l++]; - - if (first_b) { - if (c < valid_min_2nd_byte[first_b] || - c > valid_max_2nd_byte[first_b]) - return (EILSEQ); - first_b = 0; - } else if (c < UCONV_U8_BYTE_MIN || - c > UCONV_U8_BYTE_MAX) { - return (EILSEQ); - } - hi = (hi << UCONV_U8_BIT_SHIFT) | - (c & UCONV_U8_BIT_MASK); - } - } - - if (u32l >= *utf32len) - return (E2BIG); - - u32s[u32l++] = (outendian) ? 
hi : BSWAP_32(hi); - } - - *utf32len = u32l; - *utf8len = u8l; - - return (0); -} diff -r a4c12419233c -r 09764a26229e usr/src/uts/common/sys/Makefile --- a/usr/src/uts/common/sys/Makefile Fri Sep 14 08:32:57 2007 -0700 +++ b/usr/src/uts/common/sys/Makefile Fri Sep 14 10:25:36 2007 -0700 @@ -534,6 +534,7 @@ turnstile.h \ types.h \ types32.h \ + u8_textprep.h \ u8_textprep_data.h \ uadmin.h \ ucred.h \ diff -r a4c12419233c -r 09764a26229e usr/src/uts/common/sys/sunddi.h --- a/usr/src/uts/common/sys/sunddi.h Fri Sep 14 08:32:57 2007 -0700 +++ b/usr/src/uts/common/sys/sunddi.h Fri Sep 14 10:25:36 2007 -0700 @@ -59,6 +59,7 @@ #ifdef _KERNEL #include #endif +#include #ifdef __cplusplus extern "C" { @@ -441,76 +442,6 @@ extern int ddi_strtoul(const char *, char **, int, unsigned long *); /* - * Unicode encoding conversion functions and their macros. - */ -#define UCONV_IN_BIG_ENDIAN 0x0001 -#define UCONV_OUT_BIG_ENDIAN 0x0002 -#define UCONV_IN_SYSTEM_ENDIAN 0x0004 -#define UCONV_OUT_SYSTEM_ENDIAN 0x0008 -#define UCONV_IN_LITTLE_ENDIAN 0x0010 -#define UCONV_OUT_LITTLE_ENDIAN 0x0020 -#define UCONV_IGNORE_NULL 0x0040 -#define UCONV_IN_ACCEPT_BOM 0x0080 -#define UCONV_OUT_EMIT_BOM 0x0100 - -extern int uconv_u16tou32(const uint16_t *, size_t *, uint32_t *, size_t *, - int); -extern int uconv_u16tou8(const uint16_t *, size_t *, uchar_t *, size_t *, int); -extern int uconv_u32tou16(const uint32_t *, size_t *, uint16_t *, size_t *, - int); -extern int uconv_u32tou8(const uint32_t *, size_t *, uchar_t *, size_t *, int); -extern int uconv_u8tou16(const uchar_t *, size_t *, uint16_t *, size_t *, int); -extern int uconv_u8tou32(const uchar_t *, size_t *, uint32_t *, size_t *, int); - -/* - * UTF-8 text preparation functions and their macros. - * - * Among the macros defined, U8_CANON_DECOMP, U8_COMPAT_DECOMP, and - * U8_CANON_COMP are not public interfaces and must not be used directly - * at the flag input argument. 
- */ -#define U8_STRCMP_CS (0x00000001) -#define U8_STRCMP_CI_UPPER (0x00000002) -#define U8_STRCMP_CI_LOWER (0x00000004) - -#define U8_CANON_DECOMP (0x00000010) -#define U8_COMPAT_DECOMP (0x00000020) -#define U8_CANON_COMP (0x00000040) - -#define U8_STRCMP_NFD (U8_CANON_DECOMP) -#define U8_STRCMP_NFC (U8_CANON_DECOMP | U8_CANON_COMP) -#define U8_STRCMP_NFKD (U8_COMPAT_DECOMP) -#define U8_STRCMP_NFKC (U8_COMPAT_DECOMP | U8_CANON_COMP) - -#define U8_TEXTPREP_TOUPPER (U8_STRCMP_CI_UPPER) -#define U8_TEXTPREP_TOLOWER (U8_STRCMP_CI_LOWER) - -#define U8_TEXTPREP_NFD (U8_STRCMP_NFD) -#define U8_TEXTPREP_NFC (U8_STRCMP_NFC) -#define U8_TEXTPREP_NFKD (U8_STRCMP_NFKD) -#define U8_TEXTPREP_NFKC (U8_STRCMP_NFKC) - -#define U8_TEXTPREP_IGNORE_NULL (0x00010000) -#define U8_TEXTPREP_IGNORE_INVALID (0x00020000) -#define U8_TEXTPREP_NOWAIT (0x00040000) - -#define U8_UNICODE_320 (0) -#define U8_UNICODE_500 (1) -#define U8_UNICODE_LATEST (U8_UNICODE_500) - -#define U8_VALIDATE_ENTIRE (0x00100000) -#define U8_VALIDATE_CHECK_ADDITIONAL (0x00200000) -#define U8_VALIDATE_UCS2_RANGE (0x00400000) - -#define U8_ILLEGAL_CHAR (-1) -#define U8_OUT_OF_RANGE_CHAR (-2) - -extern int u8_validate(char *, size_t, char **, int, int *); -extern int u8_strcmp(const char *, const char *, size_t, int, size_t, int *); -extern size_t u8_textprep_str(char *, size_t *, char *, size_t *, int, size_t, - int *); - -/* * ddi_map_regs * * Map in the register set given by rnumber. diff -r a4c12419233c -r 09764a26229e usr/src/uts/common/sys/u8_textprep.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/common/sys/u8_textprep.h Fri Sep 14 10:25:36 2007 -0700 @@ -0,0 +1,113 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_U8_TEXTPREP_H +#define _SYS_U8_TEXTPREP_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Unicode encoding conversion functions and their macros. + */ +#define UCONV_IN_BIG_ENDIAN 0x0001 +#define UCONV_OUT_BIG_ENDIAN 0x0002 +#define UCONV_IN_SYSTEM_ENDIAN 0x0004 +#define UCONV_OUT_SYSTEM_ENDIAN 0x0008 +#define UCONV_IN_LITTLE_ENDIAN 0x0010 +#define UCONV_OUT_LITTLE_ENDIAN 0x0020 +#define UCONV_IGNORE_NULL 0x0040 +#define UCONV_IN_ACCEPT_BOM 0x0080 +#define UCONV_OUT_EMIT_BOM 0x0100 + +extern int uconv_u16tou32(const uint16_t *, size_t *, uint32_t *, size_t *, + int); +extern int uconv_u16tou8(const uint16_t *, size_t *, uchar_t *, size_t *, int); +extern int uconv_u32tou16(const uint32_t *, size_t *, uint16_t *, size_t *, + int); +extern int uconv_u32tou8(const uint32_t *, size_t *, uchar_t *, size_t *, int); +extern int uconv_u8tou16(const uchar_t *, size_t *, uint16_t *, size_t *, int); +extern int uconv_u8tou32(const uchar_t *, size_t *, uint32_t *, size_t *, int); + +/* + * UTF-8 text preparation functions and their macros. 
+ * + * Among the macros defined, U8_CANON_DECOMP, U8_COMPAT_DECOMP, and + * U8_CANON_COMP are not public interfaces and must not be used directly + * at the flag input argument. + */ +#define U8_STRCMP_CS (0x00000001) +#define U8_STRCMP_CI_UPPER (0x00000002) +#define U8_STRCMP_CI_LOWER (0x00000004) + +#define U8_CANON_DECOMP (0x00000010) +#define U8_COMPAT_DECOMP (0x00000020) +#define U8_CANON_COMP (0x00000040) + +#define U8_STRCMP_NFD (U8_CANON_DECOMP) +#define U8_STRCMP_NFC (U8_CANON_DECOMP | U8_CANON_COMP) +#define U8_STRCMP_NFKD (U8_COMPAT_DECOMP) +#define U8_STRCMP_NFKC (U8_COMPAT_DECOMP | U8_CANON_COMP) + +#define U8_TEXTPREP_TOUPPER (U8_STRCMP_CI_UPPER) +#define U8_TEXTPREP_TOLOWER (U8_STRCMP_CI_LOWER) + +#define U8_TEXTPREP_NFD (U8_STRCMP_NFD) +#define U8_TEXTPREP_NFC (U8_STRCMP_NFC) +#define U8_TEXTPREP_NFKD (U8_STRCMP_NFKD) +#define U8_TEXTPREP_NFKC (U8_STRCMP_NFKC) + +#define U8_TEXTPREP_IGNORE_NULL (0x00010000) +#define U8_TEXTPREP_IGNORE_INVALID (0x00020000) +#define U8_TEXTPREP_NOWAIT (0x00040000) + +#define U8_UNICODE_320 (0) +#define U8_UNICODE_500 (1) +#define U8_UNICODE_LATEST (U8_UNICODE_500) + +#define U8_VALIDATE_ENTIRE (0x00100000) +#define U8_VALIDATE_CHECK_ADDITIONAL (0x00200000) +#define U8_VALIDATE_UCS2_RANGE (0x00400000) + +#define U8_ILLEGAL_CHAR (-1) +#define U8_OUT_OF_RANGE_CHAR (-2) + +extern int u8_validate(char *, size_t, char **, int, int *); +extern int u8_strcmp(const char *, const char *, size_t, int, size_t, int *); +extern size_t u8_textprep_str(char *, size_t *, char *, size_t *, int, size_t, + int *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_U8_TEXTPREP_H */