changeset 13399:a1d28d03839f

992 towlower/towupper are broken Reviewed by: Garrett D'Amore <garrett@nexenta.com> Approved by: Gordon Ross <gwr@nexenta.com>
author Yuri Pankov <yuri.pankov@gmail.com>
date Thu, 12 May 2011 03:21:34 +0400
parents fa0b6e3a91f5
children 71e59c2d8715
files usr/src/cmd/localedef/Makefile usr/src/cmd/localedef/ctype.c usr/src/cmd/localedef/data/ctype.sh
diffstat 3 files changed, 61 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/localedef/Makefile	Mon Jul 11 16:32:19 2011 -0400
+++ b/usr/src/cmd/localedef/Makefile	Thu May 12 03:21:34 2011 +0400
@@ -31,7 +31,7 @@
 YFLAGS		= -d -b parser
 CLEANFILES	= $(OBJS) parser.tab.c parser.tab.h
 CLEANFILES	+= \
-		UTF-8.cm \
+		UTF-8.cm UTF-8.ct \
 		8859-1.cm 8859-2.cm 8859-3.cm 8859-4.cm \
 		8859-5.cm 8859-5.cm 8859-6.cm 8859-7.cm \
 		8859-8.cm 8859-9.cm 8859-9.cm 8859-10.cm \
@@ -179,6 +179,8 @@
 		vi_VN \
 		zh_CN zh_HK zh_MO zh_SG zh_TW
 
+UTF8SRCS	= $(UTF_8_LOCALES:%=data/%.UTF-8.src)
+
 LOCNAMES	= \
 		$(ISO8859_1_LOCALES:%=%.ISO8859-1) \
 		$(ISO8859_2_LOCALES:%=%.ISO8859-2) \
@@ -251,8 +253,12 @@
 
 include ../Makefile.targ
 
-locale/%.UTF-8/stamp:		data/%.UTF-8.src UTF-8.cm locale $(PROG)
-	./$(PROG) -U -i $< -f UTF-8.cm $(@D)
+# Drop the LC_CTYPE section of each UTF-8 locale source and append the
+# shared UTF-8.ct (built by data/ctype.sh) before feeding it to $(PROG).
+locale/%.UTF-8/stamp:		data/%.UTF-8.src UTF-8.cm \
+				UTF-8.ct locale $(PROG)
+	$(SED) '/^LC_CTYPE/,/^END LC_CTYPE/d;$$r UTF-8.ct' $< | \
+		./$(PROG) -U -f UTF-8.cm $(@D)
 	$(TOUCH) $@
 locale/%.ISO8859-1/stamp:	data/%.UTF-8.src 8859-1.cm locale $(PROG)
 	./$(PROG) -U -i $< -f 8859-1.cm $(@D)
@@ -295,6 +301,9 @@
 UTF-8.cm: data/UTF-8.cm
 	$(LN) -sf data/UTF-8.cm  $@
 
+UTF-8.ct: $(UTF8SRCS)
+	$(SH) data/ctype.sh $(UTF8SRCS) > $@
+
 %.cm: data/%.TXT UTF-8.cm
 	$(RM) $@
 	$(PERL) data/convert_map.pl $< > $@
--- a/usr/src/cmd/localedef/ctype.c	Mon Jul 11 16:32:19 2011 -0400
+++ b/usr/src/cmd/localedef/ctype.c	Thu May 12 03:21:34 2011 +0400
@@ -10,7 +10,7 @@
  */
 
 /*
- * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
+ * Copyright 2010,2011 Nexenta Systems, Inc.  All rights reserved.
  */
 
 /*
@@ -321,8 +321,8 @@
 			ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
 			last_ct = ctn;
 		}
-		if (ctn->toupper == 0) {
-			last_up = NULL;
+		if (ctn->tolower == 0) {
+			last_lo = NULL;
 		} else if ((last_lo != NULL) &&
 		    (last_lo->tolower + 1 == ctn->tolower)) {
 			lo[rl.maplower_ext_nranges-1].max = wc;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/localedef/data/ctype.sh	Thu May 12 03:21:34 2011 +0400
@@ -0,0 +1,46 @@
+#! /usr/bin/sh
+#
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
+#
+
+# Merge the LC_CTYPE classes of all .UTF-8.src files given as arguments into
+# one LC_CTYPE section (written to stdout) for localedef to compile, so every
+# UTF-8 locale shares the same case mappings, character classes, etc. This is
+# not a general-purpose parser, just good enough for the stock CLDR files.
+
+printf "\nLC_CTYPE\n"
+
+for i in upper lower alpha space cntrl graph print punct digit xdigit blank \
+	toupper tolower; do
+	# A sed range can't begin and end on one line, so one-line classes are
+	# doubled to look like multi-line ones; sort -u drops the extra copy.
+	sed -E "/^$i.*>$/ {
+		s,$,;/,
+		h
+		s,^$i(.*>);/$,\1,
+		H
+		x
+	}" "$@" |\
+	sed -E -n "/^$i/,/(>|\))$/ {
+		s,^$i,,
+		s,(>|\))$,\1;/,
+		/^$/d
+		p
+	}" |\
+	sort -u |\
+	sed -E "1 s,^,$i,;$ s,(>|\));/,\1,"
+done
+
+printf "\nEND LC_CTYPE\n"