changeset 10271:7c80b70bb8de

6858457 Remove Solaris support for UltraSPARC-AT10 processor
author Jason Beloro <Jason.Beloro@Sun.COM>
date Thu, 06 Aug 2009 17:39:39 -0700
parents 2dc261f74cc2
children a0669934e974
files usr/src/cmd/mdb/common/kmdb/kctl/kctl_main.c usr/src/cmd/mdb/common/kmdb/kmdb_kvm.c usr/src/cmd/picl/plugins/inc/picldefs.h usr/src/cmd/picl/plugins/sun4v/snmp/snmpplugin.c usr/src/cmd/picl/plugins/sun4v/snmp/snmpplugin.h usr/src/common/atomic/sparcv9/atomic.s usr/src/common/elfcap/elfcap.c usr/src/lib/libc/sparc_hwcap1/common/gen/memcpy.s usr/src/lib/libc/sparc_hwcap1/common/gen/memset.s usr/src/lib/libc/sparc_hwcap1/common/gen/misc.s usr/src/lib/libc/sparc_hwcap1/common/gen/strcpy.s usr/src/lib/libc/sparc_hwcap1/common/gen/strlen.s usr/src/lib/libc/sparc_hwcap1/sparc/Makefile usr/src/lib/libc/sparc_hwcap1/sparcv9/Makefile usr/src/lib/libdisasm/sparc/dis_sparc_fmt.c usr/src/lib/libdisasm/sparc/instr.c usr/src/lib/libprtdiag/common/display_sun4v.c usr/src/pkgdefs/Makefile usr/src/pkgdefs/SUNWusat10.v/Makefile usr/src/pkgdefs/SUNWusat10.v/pkginfo.tmpl usr/src/pkgdefs/SUNWusat10.v/prototype_com usr/src/pkgdefs/SUNWusat10.v/prototype_sparc usr/src/uts/common/io/mem.c usr/src/uts/common/sys/auxv_SPARC.h usr/src/uts/common/vm/hat.h usr/src/uts/common/vm/page.h usr/src/uts/common/vm/page_retire.c usr/src/uts/common/vm/seg_kmem.c usr/src/uts/common/vm/vm_page.c usr/src/uts/sfmmu/ml/sfmmu_asm.s usr/src/uts/sfmmu/ml/sfmmu_kdi.s usr/src/uts/sfmmu/vm/hat_sfmmu.c usr/src/uts/sfmmu/vm/hat_sfmmu.h usr/src/uts/sparc/fpu/fpu_simulator.c usr/src/uts/sparc/sys/fpu/fpu_simulator.h usr/src/uts/sun4/os/startup.c usr/src/uts/sun4/vm/sfmmu.c usr/src/uts/sun4/vm/vm_dep.h usr/src/uts/sun4u/sys/pte.h usr/src/uts/sun4u/vm/mach_sfmmu.h usr/src/uts/sun4v/Makefile.files usr/src/uts/sun4v/Makefile.sun4v.shared usr/src/uts/sun4v/cpu/rock.c usr/src/uts/sun4v/cpu/rock_asm.s usr/src/uts/sun4v/cpu/rock_copy.s usr/src/uts/sun4v/io/px/px_lib4v.c usr/src/uts/sun4v/io/px/px_lib4v.h usr/src/uts/sun4v/ml/hcall.s usr/src/uts/sun4v/ml/mach_interrupt.s usr/src/uts/sun4v/ml/mach_offsets.in usr/src/uts/sun4v/ml/trap_table.s usr/src/uts/sun4v/os/error.c usr/src/uts/sun4v/os/fillsysinfo.c usr/src/uts/sun4v/os/mach_cpu_states.c usr/src/uts/sun4v/pcbe/rock_pcbe.c usr/src/uts/sun4v/rock/Makefile usr/src/uts/sun4v/rock_pcbe/Makefile usr/src/uts/sun4v/sys/error.h usr/src/uts/sun4v/sys/hsvc.h usr/src/uts/sun4v/sys/hypervisor_api.h usr/src/uts/sun4v/sys/machcpuvar.h usr/src/uts/sun4v/sys/machsystm.h usr/src/uts/sun4v/sys/mmu.h usr/src/uts/sun4v/sys/pte.h usr/src/uts/sun4v/sys/rock_hypervisor_api.h usr/src/uts/sun4v/sys/rockasi.h usr/src/uts/sun4v/vm/mach_sfmmu.c usr/src/uts/sun4v/vm/mach_sfmmu.h usr/src/uts/sun4v/vm/mach_sfmmu_asm.s usr/src/uts/sun4v/vm/mach_vm_dep.c
diffstat 70 files changed, 213 insertions(+), 13783 deletions(-)
--- a/usr/src/cmd/mdb/common/kmdb/kctl/kctl_main.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/cmd/mdb/common/kmdb/kctl/kctl_main.c	Thu Aug 06 17:39:39 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -41,7 +41,6 @@
 #include <sys/kdi_impl.h>
 #include <sys/ctf_api.h>
 #include <vm/seg_kmem.h>
-#include <vm/hat.h>
 
 kctl_t kctl;
 
@@ -153,9 +152,8 @@
 	if (hat_getpfnum(kas.a_hat, addr) != PFN_INVALID)
 		return (EAGAIN);
 
-	/* Set HAT_ATTR_TEXT to override soft execute mode */
-	if (segkmem_xalloc(NULL, addr, sz, VM_NOSLEEP, HAT_ATTR_TEXT,
-	    segkmem_page_create, NULL) == NULL)
+	if (segkmem_xalloc(NULL, addr, sz, VM_NOSLEEP, 0, segkmem_page_create,
+	    NULL) == NULL)
 		return (ENOMEM);
 
 	return (0);
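
A note on the hunk above: the deleted comment tied HAT_ATTR_TEXT to overriding "soft execute" mode, which this changeset implies was an UltraSPARC-AT10 mechanism, so the remaining targets can allocate debugger text with no HAT attribute at all. A hypothetical illustration of the gating idea follows; the flag value and helper are invented for the sketch and are not the Solaris VM API beyond the segkmem_xalloc() call shown above.

	/*
	 * Hypothetical sketch.  On a CPU with "soft execute" mode, kernel
	 * mappings default to non-executable, so debugger text must request
	 * an explicit execute attribute; other CPUs need no attribute.
	 */
	#define	HAT_ATTR_NONE	0
	#define	HAT_ATTR_TEXT	0x1	/* illustrative value only */

	static unsigned int
	kmdb_text_attr(int cpu_has_soft_execute)
	{
		return (cpu_has_soft_execute ? HAT_ATTR_TEXT : HAT_ATTR_NONE);
	}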
--- a/usr/src/cmd/mdb/common/kmdb/kmdb_kvm.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/cmd/mdb/common/kmdb/kmdb_kvm.c	Thu Aug 06 17:39:39 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -43,7 +43,6 @@
 
 #include <strings.h>
 #include <dlfcn.h>
-#include <sys/kdi_impl.h>
 #include <sys/isa_defs.h>
 #include <sys/kobj.h>
 #include <sys/kobj_impl.h>
@@ -218,7 +217,6 @@
 kmt_writer(void *buf, size_t nbytes, uint64_t addr)
 {
 	kmt_bcopy(buf, (void *)(uintptr_t)addr, nbytes);
-	mdb.m_kdi->kdi_flush_caches();
 	return (nbytes);
 }
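
The dropped kdi_flush_caches() call synchronized the instruction stream after the debugger patched kernel text; the diff implies (it does not say) that only the UltraSPARC-AT10 path needed this, leaving kmt_writer() a plain copy. A hypothetical sketch of the general pattern, with invented names that are not the kmdb interfaces:

	#include <stddef.h>

	/*
	 * Hypothetical illustration.  On a CPU whose instruction cache does
	 * not snoop stores, a debugger must flush after patching text; the
	 * hunk above removes exactly such a flush.
	 */
	typedef struct kdi_ops {
		void (*flush_icache)(void);	/* assumed hook */
	} kdi_ops_t;

	static size_t
	debugger_write(kdi_ops_t *kdi, void *dst, const void *src,
	    size_t nbytes)
	{
		unsigned char *d = dst;
		const unsigned char *s = src;
		size_t i;

		for (i = 0; i < nbytes; i++)	/* patch the target text */
			d[i] = s[i];
		if (kdi->flush_icache != NULL)	/* non-snooping CPUs only */
			kdi->flush_icache();
		return (nbytes);
	}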
 
--- a/usr/src/cmd/picl/plugins/inc/picldefs.h	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/cmd/picl/plugins/inc/picldefs.h	Thu Aug 06 17:39:39 2009 -0700
@@ -129,8 +129,6 @@
 #define	PICL_CLASS_SENSOR		"sensor"
 #define	PICL_CLASS_STACK		"stack"
 #define	PICL_CLASS_UNKNOWN		"unknown"
-#define	PICL_CLASS_HUMIDITY_SENSOR	"humidity-sensor"
-#define	PICL_CLASS_HUMIDITY_INDICATOR	"humidity-indicator"
 
 /*
  * Solaris driver property names
@@ -243,7 +241,6 @@
 #define	PICL_PROP_BASE_UNITS			"BaseUnits"
 #define	PICL_PROP_EXPONENT			"Exponent"
 #define	PICL_PROP_RATE_UNITS			"RateUnits"
-#define	PICL_PROP_HUMIDITY			"Humidity"
 
 /*
  * Various threshold property names
--- a/usr/src/cmd/picl/plugins/sun4v/snmp/snmpplugin.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/cmd/picl/plugins/sun4v/snmp/snmpplugin.c	Thu Aug 06 17:39:39 2009 -0700
@@ -865,10 +865,6 @@
 				ADD_NODE(PICL_CLASS_RPM_SENSOR)
 				add_prop(nodeh, &proph, node_name, row,
 				    PP_SPEED, snmp_syserr_p);
-			} else if (sensor_type == SSST_HUMIDITY) {
-				ADD_NODE(PICL_CLASS_HUMIDITY_SENSOR)
-				add_prop(nodeh, &proph, node_name, row,
-				    PP_HUMIDITY, snmp_syserr_p);
 			} else {
 				ADD_NODE(PICL_CLASS_SENSOR)
 				add_prop(nodeh, &proph, node_name, row,
@@ -906,8 +902,6 @@
 				ADD_NODE(PICL_CLASS_RPM_INDICATOR)
 			} else if (sensor_type == SSST_PRESENCE) {
 				ADD_NODE(PICL_CLASS_PRESENCE_INDICATOR)
-			} else if (sensor_type == SSST_HUMIDITY) {
-				ADD_NODE(PICL_CLASS_HUMIDITY_INDICATOR)
 			} else {
 				ADD_NODE(PICL_CLASS_INDICATOR)
 			}
--- a/usr/src/cmd/picl/plugins/sun4v/snmp/snmpplugin.h	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/cmd/picl/plugins/sun4v/snmp/snmpplugin.h	Thu Aug 06 17:39:39 2009 -0700
@@ -111,8 +111,7 @@
 	PP_MFG_NAME,
 	PP_MODEL_NAME,
 	PP_DESCRIPTION,
-	PP_LABEL,
-	PP_HUMIDITY
+	PP_LABEL
 } sp_propid_t;
 
 /*
--- a/usr/src/common/atomic/sparcv9/atomic.s	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/common/atomic/sparcv9/atomic.s	Thu Aug 06 17:39:39 2009 -0700
@@ -82,13 +82,6 @@
 	mov	tmp2, %o7	/* restore callee's return address */	; \
 label/**/1:
 
-#ifdef	ATOMIC_SIMPLE_BO_ENABLE
-/*
- * For some processors, simple limit has proved benefical
- */
-#define ATOMIC_BACKOFF_CPU(val, limit, ncpu, cas_cnt, label)		\
-	set	1 << ATOMIC_BO_ENABLE_SHIFT, limit
-#else
 /*
  * For the kernel, we take into consideration of cas failures
  * and also scale the backoff limit w.r.t. the number of cpus.
@@ -111,7 +104,6 @@
 	mov	%g0, cas_cnt						; \
 	mov	1, val							; \
 label/**/1:
-#endif	/* ATOMIC_SIMPLE_BO_ENABLE */
 #endif	/* ATOMIC_BO_ENABLE_SHIFT */
 
 #else	/* _KERNEL */
@@ -137,18 +129,11 @@
  * The cas_cnt counts the cas instruction failure and is
  * initialized to 0.
  */
-#ifdef	ATOMIC_SIMPLE_BO_ENABLE
-#define ATOMIC_BACKOFF_INIT(val, ncpu, cas_cnt)	\
-	mov	1, val
-
-#else /* If not defined ATOMIC_SIMPLE_BO_ENABLE */
 #define ATOMIC_BACKOFF_INIT(val, ncpu, cas_cnt)	\
 	mov	1, val				; \
 	mov	%g0, ncpu			; \
 	mov	%g0, cas_cnt
 
-#endif	/* ATOMIC_SIMPLE_BO_ENABLE */
-
 #define ATOMIC_BACKOFF_BRANCH(cr, backoff, loop) \
 	bne,a,pn cr, backoff
 
--- a/usr/src/common/elfcap/elfcap.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/common/elfcap/elfcap.c	Thu Aug 06 17:39:39 2009 -0700
@@ -150,10 +150,7 @@
 		AV_SPARC_FMAF, STRDESC("AV_SPARC_FMAF"),
 		STRDESC("FMAF"), STRDESC("fmaf"),
 	},
-	{						/* 0x00000200 */
-		AV_SPARC_FMAU, STRDESC("AV_SPARC_FMAU"),
-		STRDESC("FMAU"), STRDESC("fmau"),
-	},
+	RESERVED_ELFCAP_DESC,				/* 0x00000200 */
 	{						/* 0x00000400 */
 		AV_SPARC_VIS3, STRDESC("AV_SPARC_VIS3"),
 		STRDESC("VIS3"), STRDESC("vis3"),
--- a/usr/src/lib/libc/sparc_hwcap1/common/gen/memcpy.s	Thu Aug 06 17:19:00 2009 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1704 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-	.file	"memcpy.s"
-
-/*
- * memcpy(s1, s2, len)
- *
- * Copy s2 to s1, always copy n bytes.
- * Note: this C code does not work for overlapped copies.
- *       Memmove() and bcopy() do.
- *
- * Added entry __align_cpy_1 is generally for use of the compilers.
- *
- * Fast assembler language version of the following C-program for memcpy
- * which represents the `standard' for the C-library.
- *
- *	void * 
- *	memcpy(void *s, const void *s0, size_t n)
- *	{
- *		if (n != 0) {
- *	   	    char *s1 = s;
- *		    const char *s2 = s0;
- *		    do {
- *			*s1++ = *s2++;
- *		    } while (--n != 0);
- *		}
- *		return (s);
- *	}
- */
-
-#include <sys/asm_linkage.h>
-#include <sys/sun4asi.h>
-#include <sys/trap.h>
-
-#ifdef	__sparcv9
-#define	SAVESIZE	(8 * 1)
-#define	STACK_OFFSET	(STACK_BIAS + MINFRAME)
-#else
-#define	SAVESIZE	(8 * 3)
-#define	STACK_OFFSET	(STACK_BIAS + MINFRAME + 4)
-#endif
-
-#define	scratch_offset	0
-#define	g4_offset	8
-#define	g5_offset	16
-
-#define	ICACHE_LINE_SIZE	64
-#define	BLOCK_SIZE	64
-#define	FPRS_FEF	0x4
-#define	PF_FAR		2048
-#define	PF_NEAR		1024
-
-#define SHORTCOPY	3
-#define	SMALL_MAX	39
-#define	MEDIUM_MAX	255
-#define MED_WMAX	256	/* max copy for medium word-aligned case */
-#define MED_MAX		256	/* max copy for medium longword-aligned case */
-
-#ifndef BSTORE_SIZE
-#define BSTORE_SIZE	256	/* min copy size for block store */
-#endif
-
-/*
- * The LDDs will use the below ASI for performance
- * This ASI minimizes cache pollution.
- */
-#define	ASI_CACHE_SPARING	0xf4
-#define	ASI_CACHE_SPARING_PRIMARY	0xf4
-
-	ANSI_PRAGMA_WEAK(memmove,function)
-	ANSI_PRAGMA_WEAK(memcpy,function)
-
-	ENTRY(memmove)
-	cmp	%o1, %o0	! if from address is >= to use forward copy
-	bgeu	%ncc, .forcpy	! else use backward if ...
-	sub	%o0, %o1, %o4	! get difference of two addresses
-	cmp	%o2, %o4	! compare size and difference of addresses
-	bleu	%ncc, .forcpy	! if size is bigger, do overlapped copy
-	nop
-
-	!
-	! an overlapped copy that must be done "backwards"
-	!
-.ovbc:
-	mov	%o0, %g1		! save dest address for return val
-	add     %o1, %o2, %o1	   ! get to end of source space
-	add     %o0, %o2, %o0	   ! get to end of destination space
-
-	cmp	%o2, 24
-	bgeu,pn	%ncc, .dbalign
-	nop
-	cmp	%o2, 4
-	blt,pn	%ncc, .byte
-	sub	%o2, 3, %o2
-.byte4loop:
-	ldub	[%o1-1], %o3		! load last byte
-	stb	%o3, [%o0-1]		! store last byte
-	sub	%o1, 4, %o1
-	ldub	[%o1+2], %o3		! load 2nd from last byte
-	stb	%o3, [%o0-2]		! store 2nd from last byte
-	sub	%o0, 4, %o0
-	ldub	[%o1+1], %o3		! load 3rd from last byte
-	stb	%o3, [%o0+1]		! store 3rd from last byte
-	subcc	%o2, 4, %o2
-	ldub	[%o1], %o3		! load 4th from last byte
-	bgu,pt	%ncc, .byte4loop
-	stb	%o3, [%o0]		! store 4th from last byte
-.byte:
-	addcc	%o2, 3, %o2
-	bz,pt	%ncc, .exit
-.byteloop:
-	dec	%o1			! decrement src address
-	ldub	[%o1], %o3		! read a byte
-	dec	%o0			! decrement dst address
-	deccc	%o2			! decrement count
-	bgu,pt	%ncc, .byteloop		! loop until done
-	stb	%o3, [%o0]		! write byte
-.exit:
-	retl
-	mov	%g1, %o0
-
-	.align	16
-.dbalign:
-	andcc   %o0, 7, %o5		! bytes till DST 8 byte aligned
-	bz,pt	%ncc, .dbmed
-	sub	%o2, %o5, %o2		! update count
-.dbalign1:
-	dec	%o1			! decrement src address
-	ldub	[%o1], %o3		! read a byte
-	dec	%o0			! decrement dst address
-	deccc	%o5			! decrement count
-	bgu,pt	%ncc, .dbalign1		! loop until done
-	stb	%o3, [%o0]		! store a byte
-
-! check for src long word alignment
-.dbmed:
-	mov	%asi, %g5		! save curr %asi
-	wr	%g0, ASI_CACHE_SPARING, %asi
-	andcc	%o1, 7, %g0		! chk src long word alignment
-	bnz,pn	%ncc, .dbbck
-	nop
-!
-! Following code is for overlapping copies where src and dest
-! are long word aligned
-!
-	cmp	%o2, 4095
-	blt,pn	%ncc, .dbmedl32enter	! go to no prefetch code
-	nop
-	prefetch [%o1 - (1 * BLOCK_SIZE)], #n_reads
-	sub	%o2, 63, %o2		! adjust length to allow cc test
-					! for end of loop
-	prefetch [%o1 - (2 * BLOCK_SIZE)], #n_reads
-	prefetch [%o1 - (3 * BLOCK_SIZE)], #n_reads
-	prefetch [%o1 - (4 * BLOCK_SIZE)], #n_reads
-.dbmedl64:
-	prefetch [%o1 - (5 * BLOCK_SIZE)], #n_reads
-	ldxa	[%o1-8]%asi, %o3	! load
-	subcc	%o2, 64, %o2		! decrement length count
-	stx	%o3, [%o0-8]		! and store
-	ldxa	[%o1-16]%asi, %o3	! a block of 64 bytes
-	sub	%o1, 64, %o1		! decrease src ptr by 64
-	stx	%o3, [%o0-16]
-	sub	%o0, 64, %o0		! decrease dst ptr by 64
-	ldxa	[%o1+40]%asi, %o3
-	ldxa	[%o1+32]%asi, %o4
-	ldxa	[%o1+24]%asi, %o5
-	stx	%o3, [%o0+40]
-	stx	%o4, [%o0+32]
-	stx	%o5, [%o0+24]
-	ldxa	[%o1+16]%asi, %o3
-	ldxa	[%o1+8]%asi, %o4
-	stx	%o3, [%o0+16]
-	stx	%o4, [%o0+8]
-	ldxa	[%o1]%asi, %o5
-	bgu,pt	%ncc, .dbmedl64		! repeat if at least 64 bytes left
-	stx	%o5, [%o0]
-	add	%o2, 63, %o2		! restore offset adjustment
-.dbmedl32enter:
-	subcc	%o2, 31, %o2		! adjust length to allow cc test
-					! for end of loop
-	ble,pt  %ncc, .dbmedl31		! skip big loop if less than 32
-	nop
-.dbmedl32:
-	ldx	[%o1-8], %o4		! load
-	subcc	%o2, 32, %o2		! decrement length count
-	stx	%o4, [%o0-8]		! and store
-	ldx	[%o1-16], %o3		! a block of 32 bytes
-	sub	%o1, 32, %o1		! decrease src ptr by 32
-	stx	%o3, [%o0-16]
-	ldx	[%o1+8], %o4
-	sub	%o0, 32, %o0		! decrease dst ptr by 32
-	stx	%o4, [%o0+8]
-	ldx	[%o1], %o3
-	bgu,pt	%ncc, .dbmedl32		! repeat if at least 32 bytes left
-	stx	%o3, [%o0]
-.dbmedl31:
-	addcc	%o2, 16, %o2		! adjust remaining count
-	ble,pt	%ncc, .dbmedl15		! skip if 15 or fewer bytes left
-	nop				!
-	ldx	[%o1-8], %o4		! load and store 16 bytes
-	sub	%o1, 16, %o1		! decrease src ptr by 16
-	stx	%o4, [%o0-8]		!
-	sub	%o2, 16, %o2		! decrease count by 16
-	ldx	[%o1], %o3		!
-	sub	%o0, 16, %o0		! decrease dst ptr by 16
-	stx	%o3, [%o0]
-.dbmedl15:
-	addcc	%o2, 15, %o2		! restore count
-	bz,pt	%ncc, .dbexit		! exit if finished
-	nop
-	cmp	%o2, 8
-	blt,pt	%ncc, .dbremain		! skip if 7 or fewer bytes left
-	nop
-	ldx	[%o1-8], %o4		! load 8 bytes
-	sub	%o1, 8, %o1		! decrease src ptr by 8
-	stx	%o4, [%o0-8]		! and store 8 bytes
-	subcc	%o2, 8, %o2		! decrease count by 8
-	bnz	%ncc, .dbremain		! exit if finished
-	sub	%o0, 8, %o0		! decrease dst ptr by 8
-	mov	%g5, %asi		! restore %asi
-	retl
-	mov	%g1, %o0
-
-!
-! Following code is for overlapping copies where src and dest
-! are not long word aligned
-!
-	.align	16
-.dbbck:
-	rd	%fprs, %o3		! o3 = fprs
- 
-	! if fprs.fef == 0, set it. Checking it, requires 2 instructions.
-	! So set it anyway, without checking.
-	wr      %g0, FPRS_FEF, %fprs	 ! fprs.fef = 1
-
-	alignaddr %o1, %g0, %o5		! align src
-	ldda	[%o5]%asi, %d0		! get first 8 byte block
-	andn	%o2, 7, %o4		! prepare src ptr for finishup code
-	cmp	%o2, 32
-	blt,pn	%ncc, .dbmv8
-	sub	%o1, %o4, %o1		!
-	cmp	%o2, 4095		! check for short memmoves
-	blt,pn	%ncc, .dbmv32enter	! go to no prefetch code
-.dbmv64:
-	ldda	[%o5-8]%asi, %d2	! load 8 bytes
-	ldda	[%o5-16]%asi, %d4	! load 8 bytes
-	sub	%o5, 64, %o5		!
-	ldda	[%o5+40]%asi, %d6	! load 8 bytes
-	sub	%o0, 64, %o0		!
-	ldda	[%o5+32]%asi, %d8	! load 8 bytes
-	sub	%o2, 64, %o2		! 64 less bytes to copy
-	ldda	[%o5+24]%asi, %d18	! load 8 bytes
-	cmp	%o2, 64			! do we have < 64 bytes remaining
-	ldda	[%o5+16]%asi, %d28	! load 8 bytes
-	ldda	[%o5+8]%asi, %d30	! load 8 bytes
-	prefetch [%o5 - (5 * BLOCK_SIZE)], #n_reads
-	faligndata %d2, %d0, %d10	! extract 8 bytes out
-	ldda	[%o5]%asi, %d0		! load 8 bytes
-	std	%d10, [%o0+56]		! store the current 8 bytes
-	faligndata %d4, %d2, %d12	! extract 8 bytes out
-	std	%d12, [%o0+48]		! store the current 8 bytes
-	faligndata %d6, %d4, %d14	! extract 8 bytes out
-	std	%d14, [%o0+40]		! store the current 8 bytes
-	faligndata %d8, %d6, %d16	! extract 8 bytes out
-	std	%d16, [%o0+32]		! store the current 8 bytes
-	faligndata %d18, %d8, %d20	! extract 8 bytes out
-	std	%d20, [%o0+24]		! store the current 8 bytes
-	faligndata %d28, %d18, %d22	! extract 8 bytes out
-	std	%d22, [%o0+16]		! store the current 8 bytes
-	faligndata %d30, %d28, %d24	! extract 8 bytes out
-	std	%d24, [%o0+8]		! store the current 8 bytes
-	faligndata %d0, %d30, %d26	! extract 8 bytes out
-	bgeu,pt	%ncc, .dbmv64
-	std	%d26, [%o0]		! store the current 8 bytes
-
-	cmp	%o2, 32
-	blt,pn	%ncc, .dbmvx
-	nop
-.dbmv32:
-	ldda	[%o5-8]%asi, %d2	! load 8 bytes
-.dbmv32enter:
-	ldda	[%o5-16]%asi, %d4	! load 8 bytes
-	sub	%o5, 32, %o5		!
-	ldda	[%o5+8]%asi, %d6	! load 8 bytes
-	sub	%o0, 32, %o0		! 
-	faligndata %d2, %d0, %d10	! extract 8 bytes out
-	ldda	[%o5]%asi, %d0		! load 8 bytes
-	sub     %o2,32, %o2		! 32 less bytes to copy
-	std	%d10, [%o0+24]		! store the current 8 bytes
-	cmp	%o2, 32			! do we have < 32 bytes remaining
-	faligndata %d4, %d2, %d12	! extract 8 bytes out
-	std	%d12, [%o0+16]		! store the current 8 bytes
-	faligndata %d6, %d4, %d14	! extract 8 bytes out
-	std	%d14, [%o0+8]		! store the current 8 bytes
-	faligndata %d0, %d6, %d16	! extract 8 bytes out
-	bgeu,pt	%ncc, .dbmv32
-	std	%d16, [%o0]		! store the current 8 bytes
-.dbmvx:
-	cmp	%o2, 8			! do we have < 8 bytes remaining
-	blt,pt	%ncc, .dbmvfinish	! if yes, skip to finish up code
-	nop
-.dbmv8:
-	ldda	[%o5-8]%asi, %d2
-	sub	%o0, 8, %o0		! since we are at the end
-					! when we first enter the loop
-	sub     %o2, 8, %o2		! 8 less bytes to copy
-	sub	%o5, 8, %o5
-	cmp	%o2, 8			! do we have < 8 bytes remaining
-	faligndata %d2, %d0, %d8	! extract 8 bytes out
-	std	%d8, [%o0]		! store the current 8 bytes
-	bgeu,pt	%ncc, .dbmv8
-	fmovd	%d2, %d0
-.dbmvfinish:
-	and	%o3, 0x4, %o3	   ! fprs.du = fprs.dl = 0
-	tst	%o2
-	bz,pt	%ncc, .dbexit
-	wr	%o3, %g0, %fprs	 ! fprs = o3   restore fprs
-
-.dbremain:
-	cmp	%o2, 4
-	blt,pn	%ncc, .dbbyte
-	nop
-	ldub	[%o1-1], %o3		! load last byte
-	stb	%o3, [%o0-1]		! store last byte
-	sub	%o1, 4, %o1
-	ldub	[%o1+2], %o3		! load 2nd from last byte
-	stb	%o3, [%o0-2]		! store 2nd from last byte
-	sub	%o0, 4, %o0
-	ldub	[%o1+1], %o3		! load 3rd from last byte
-	stb	%o3, [%o0+1]		! store 3rd from last byte
-	subcc	%o2, 4, %o2
-	ldub	[%o1], %o3		! load 4th from last byte
-	stb	%o3, [%o0]		! store 4th from last byte	
-	bz,pt	%ncc, .dbexit
-.dbbyte:
-	dec	%o1			! decrement src address
-	ldub	[%o1], %o3		! read a byte
-	dec	%o0			! decrement dst address
-	deccc	%o2			! decrement count
-	bgu,pt	%ncc, .dbbyte		! loop until done
-	stb	%o3, [%o0]		! write byte
-.dbexit:
-	mov	%g5, %asi		! restore %asi
-	retl
-	mov     %g1, %o0
-	SET_SIZE(memmove)
-
-	.align ICACHE_LINE_SIZE
-	ENTRY(memcpy)
-	ENTRY(__align_cpy_1)
-					! adjust instruction alignment
-	nop				! Do not remove, these nops affect
-	nop				! icache alignment and performance
-.forcpy:
-	cmp	%o2, SMALL_MAX		! check for not small case
-	bgu,pn	%ncc, .medium		! go to larger cases
-	mov	%o0, %g1		! save %o0
-	cmp	%o2, SHORTCOPY		! check for really short case
-	ble,pt	%ncc, .smallleft	!
-	or	%o0, %o1, %o3		! prepare alignment check
-	andcc	%o3, 0x3, %g0		! test for alignment
-	bz,pt	%ncc, .smallword	! branch to word aligned case
-	sub	%o2, 3, %o2		! adjust count to allow cc zero test
-.smallnotalign4:
-	ldub	[%o1], %o3		! read byte
-	subcc	%o2, 4, %o2		! reduce count by 4
-	stb	%o3, [%o0]		! write byte
-	ldub	[%o1+1], %o3		! repeat for a total of 4 bytes
-	add	%o1, 4, %o1		! advance SRC by 4
-	stb	%o3, [%o0+1]
-	ldub	[%o1-2], %o3
-	add	%o0, 4, %o0		! advance DST by 4
-	stb	%o3, [%o0-2]
-	ldub	[%o1-1], %o3
-	bgu,pt	%ncc, .smallnotalign4	! loop til 3 or fewer bytes remain
-	stb	%o3, [%o0-1]
-	add	%o2, 3, %o2		! restore count
-.smallleft:
-	tst	%o2
-	bz,pt	%ncc, .smallexit
-	nop
-.smallleft3:				! 1, 2, or 3 bytes remain
-	ldub	[%o1], %o3		! load one byte
-	deccc	%o2			! reduce count for cc test
-	bz,pt	%ncc, .smallexit
-	stb	%o3, [%o0]		! store one byte
-	ldub	[%o1+1], %o3		! load second byte
-	deccc	%o2
-	bz,pt	%ncc, .smallexit
-	stb	%o3, [%o0+1]		! store second byte
-	ldub	[%o1+2], %o3		! load third byte
-	stb	%o3, [%o0+2]		! store third byte
-	retl
-	mov	%g1, %o0		! restore %o0
-
-	.align	16
-	nop				! affects loop icache alignment
-.smallwords:
-	lduw	[%o1], %o3		! read word
-.smallwordx:
-	subcc	%o2, 8, %o2		! update count
-	stw	%o3, [%o0]		! write word
-	add	%o1, 8, %o1		! update SRC
-	lduw	[%o1-4], %o3		! read word
-	add	%o0, 8, %o0		! update DST
-	bgu,pt	%ncc, .smallwords	! loop until done
-	stw	%o3, [%o0-4]		! write word
-	addcc	%o2, 7, %o2		! restore count
-	bz,pt	%ncc, .smallexit	! check for completion
-	nop
-	cmp	%o2, 4			! check for 4 or more bytes left
-	blt	.smallleft3		! if not, go to finish up
-	nop
-	lduw	[%o1], %o3
-	add	%o1, 4, %o1
-	subcc	%o2, 4, %o2
-	stw	%o3, [%o0]
-	add	%o0, 4, %o0
-	bnz,pt	%ncc, .smallleft3
-	nop
-	retl
-	mov	%g1, %o0		! restore %o0
-
-.smallword:
-	subcc	%o2, 4, %o2		! update count
-	bgu,pt	%ncc, .smallwordx
-	lduw	[%o1], %o3		! read word
-	addcc	%o2, 3, %o2		! restore count
-	bz,pt	%ncc, .smallexit
-	stw	%o3, [%o0]		! write word
-	deccc	%o2			! reduce count for cc test
-	ldub	[%o1+4], %o3		! load one byte
-	bz,pt	%ncc, .smallexit
-	stb	%o3, [%o0+4]		! store one byte
-	ldub	[%o1+5], %o3		! load second byte
-	deccc	%o2
-	bz,pt	%ncc, .smallexit
-	stb	%o3, [%o0+5]		! store second byte
-	ldub	[%o1+6], %o3		! load third byte
-	stb	%o3, [%o0+6]		! store third byte
-.smallexit:
-	retl
-	mov	%g1, %o0		! restore %o0
-	.align 16
-.medium:
-	neg	%o0, %o5
-	neg	%o1, %o3	
-	andcc	%o5, 7, %o5	! bytes till DST 8 byte aligned
-	and	%o3, 7, %o3	! bytes till SRC 8 byte aligned
-	cmp	%o5, %o3
-	bne	%ncc, continue
-	sub	%o5, %o3, %o3	! -(bytes till SRC aligned after DST aligned)
-				! o3={-7, -6, ... 7}  o3>0 => SRC overaligned
-	! src and dst are aligned.
-	mov	%o3, %g5		! save %o3
-	andcc	%o1, 7, %o3		! is src buf  aligned on a 8 byte bound
-	brz,pt	%o3, src_dst_aligned_on_8		
-	mov	%o3, %o5
-	mov	8, %o4
-	sub 	%o4, %o3, %o3
-	cmp	%o3, %o2
-	bg,a,pn	%ncc, 1f
-	mov	%o2, %o3	
-1:
-	! %o3 has the bytes to be written in partial store.
-	sub	%o2, %o3, %o2
-	prefetch	[%o1],2
-
-7:
-	deccc	%o3			! byte clearing loop
-	ldub	[%o1], %o4		! load one byte
-	stb	%o4, [%o0]
-	inc	%o1			! increment src
-	bgu,pt	%ncc, 7b
-	inc	%o0			! increment dst
-
-	mov	%g5, %o3		! restore %o3
-src_dst_aligned_on_8:
-	! check  if we are copying 1k or more bytes
-	cmp	%o2, 511
-	bgu,pt	%ncc, copying_ge_512
-	nop
-	ba	.medlword
-	nop
-
-continue:
-	andcc	%o5, 7, %o5	! bytes till DST 8 byte aligned
-	bz	%ncc, 2f
-	nop
-
-	sub	%o2, %o5, %o2	! update count
-
-1:
-	ldub	[%o1], %o4
-	deccc	%o5
-	inc	%o1
-	stb	%o4, [%o0]
-	bgu,pt	%ncc, 1b
-	inc	%o0
-
-	! Now DST is 8-byte aligned.  o0, o1, o2 are current.
-
-2:
-	andcc	%o1, 0x3, %g0		! test alignment
-	bnz,pt	%ncc, .mediumsetup	! branch to skip aligned cases
-					! if src, dst not aligned
-	prefetch [%o1 + (1 * BLOCK_SIZE)], #n_reads
-
-/*
- * Handle all cases where src and dest are aligned on word
- * or long word boundaries.  Use unrolled loops for better
- * performance.  This option wins over standard large data
- * move when source and destination is in cache for medium
- * to short data moves.
- */
-	andcc	%o1, 0x7, %g0		! test word alignment
-	bz,pt	%ncc, src_dst_lword_aligned	! branch to long word aligned case
-	prefetch [%o1 + (2 * BLOCK_SIZE)], #n_reads
-	cmp	%o2, MED_WMAX		! limit to store buffer size
-	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
-	nop
-	subcc	%o2, 15, %o2		! adjust length to allow cc test
-					! for end of loop
-	ble,pt	%ncc, .medw15		! skip big loop if less than 16
-	prefetch [%o1 + (3 * BLOCK_SIZE)], #n_reads
-/*
- * no need to put prefetch in loop as prefetches have
- * already been issued for maximum loop size
- */
-.medw16:
-	ld	[%o1], %o4		! load
-	subcc	%o2, 16, %o2		! decrement length count
-	stw	%o4, [%o0]		! and store
-	ld	[%o1+4], %o3		! a block of 16 bytes
-	add	%o1, 16, %o1		! increase src ptr by 16
-	stw	%o3, [%o0+4]
-	ld	[%o1-8], %o4
-	add	%o0, 16, %o0		! increase dst ptr by 16
-	stw	%o4, [%o0-8]
-	ld	[%o1-4], %o3
-	bgu,pt	%ncc, .medw16		! repeat if at least 16 bytes left
-	stw	%o3, [%o0-4]
-.medw15:
-	addcc	%o2, 15, %o2		! restore count
-	bz,pt	%ncc, .medwexit		! exit if finished
-	nop
-	cmp	%o2, 8
-	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
-	nop				!
-	ld	[%o1], %o4		! load 4 bytes
-	subcc	%o2, 8, %o2		! decrease count by 8
-	stw	%o4, [%o0]		! and store 4 bytes
-	add	%o1, 8, %o1		! increase src ptr by 8
-	ld	[%o1-4], %o3		! load 4 bytes
-	add	%o0, 8, %o0		! increase dst ptr by 8
-	stw	%o3, [%o0-4]		! and store 4 bytes
-	bz	%ncc, .medwexit		! exit if finished
-	nop
-.medw7:					! count is ge 1, less than 8
-	cmp	%o2, 3			! check for 4 bytes left
-	ble,pt	%ncc, .medw3		! skip if 3 or fewer bytes left
-	nop				!
-	ld	[%o1], %o4		! load 4 bytes
-	sub	%o2, 4, %o2		! decrease count by 4
-	add	%o1, 4, %o1		! increase src ptr by 4
-	stw	%o4, [%o0]		! and store 4 bytes
-	add	%o0, 4, %o0		! increase dst ptr by 4
-	tst	%o2			! check for zero bytes left
-	bz	%ncc, .medwexit		! exit if finished
-	nop
-.medw3:					! count is known to be 1, 2, or 3
-	deccc	%o2			! reduce count by one
-	ldub	[%o1], %o3		! load one byte
-	bz,pt	%ncc, .medwexit		! exit if last byte
-	stb	%o3, [%o0]		! store one byte
-	ldub	[%o1+1], %o3		! load second byte
-	deccc	%o2			! reduce count by one
-	bz,pt	%ncc, .medwexit		! exit if last byte
-	stb	%o3, [%o0+1]		! store second byte
-	ldub	[%o1+2], %o3		! load third byte
-	stb	%o3, [%o0+2]		! store third byte
-.medwexit:
-	retl
-	mov	%g1, %o0		! restore %o0
-	
-/*
- * Special case for handling when src and dest are both long word aligned
- * and total data to move is between SMALL_MAX and MED_MAX bytes
- */
-
-	.align 16
-	nop
-src_dst_lword_aligned:
-.medlword:				! long word aligned
-	cmp	%o2, MED_MAX		! limit to store buffer size
-	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
-	nop
-	subcc	%o2, 31, %o2		! adjust length to allow cc test
-					! for end of loop
-	ble,pt	%ncc, .medl31		! skip big loop if less than 32
-	prefetch [%o1 + (3 * BLOCK_SIZE)], #n_reads ! into the l2 cache
-/*
- * no need to put prefetch in loop as prefetches have
- * already been issued for maximum loop size
- */
-.medl32:
-	ldx	[%o1], %o4		! load
-	subcc	%o2, 32, %o2		! decrement length count
-	stx	%o4, [%o0]		! and store
-	ldx	[%o1+8], %o3		! a block of 32 bytes
-	add	%o1, 32, %o1		! increase src ptr by 32
-	stx	%o3, [%o0+8]
-	ldx	[%o1-16], %o4
-	add	%o0, 32, %o0		! increase dst ptr by 32
-	stx	%o4, [%o0-16]
-	ldx	[%o1-8], %o3
-	bgu,pt	%ncc, .medl32		! repeat if at least 32 bytes left
-	stx	%o3, [%o0-8]
-.medl31:
-	addcc	%o2, 16, %o2		! adjust remaining count
-	ble,pt	%ncc, .medl15		! skip if 15 or fewer bytes left
-	nop				!
-	ldx	[%o1], %o4		! load and store 16 bytes
-	add	%o1, 16, %o1		! increase src ptr by 16
-	stx	%o4, [%o0]		!
-	sub	%o2, 16, %o2		! decrease count by 16
-	ldx	[%o1-8], %o3		!
-	add	%o0, 16, %o0		! increase dst ptr by 16
-	stx	%o3, [%o0-8]
-.medl15:
-	addcc	%o2, 15, %o2		! restore count
-	bz,pt	%ncc, .medwexit		! exit if finished
-	nop
-	cmp	%o2, 8
-	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
-	nop
-	ldx	[%o1], %o4		! load 8 bytes
-	add	%o1, 8, %o1		! increase src ptr by 8
-	stx	%o4, [%o0]		! and store 8 bytes
-	subcc	%o2, 8, %o2		! decrease count by 8
-	bz	%ncc, .medwexit		! exit if finished
-	add	%o0, 8, %o0		! increase dst ptr by 8
-	ba	.medw7
-	nop
-
-	.align 16
-	nop
-	nop
-	nop
-unaligned_src_dst:
-
-.mediumsetup:
-	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
-.mediumrejoin:
-	rd	%fprs, %o4		! check for unused fp
-
-	add	%o1, 8, %o1		! prepare to round SRC upward
-
-	sethi	%hi(0x1234567f), %o5	! For GSR.MASK 
-	or	%o5, 0x67f, %o5
-	andcc	%o4, FPRS_FEF, %o4	! test FEF, fprs.du = fprs.dl = 0
-	bz,a	%ncc, 3f
-	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
-3:
-	cmp	%o2, MEDIUM_MAX
-	bmask	%o5, %g0, %g0
-
-	! Compute o5 (number of bytes that need copying using the main loop).
-	! First, compute for the medium case.
-	! Then, if large case, o5 is replaced by count for block alignment.
-	! Be careful not to read past end of SRC
-	! Currently, o2 is the actual count remaining
-	!	    o3 is how much sooner we'll cross the alignment boundary
-	!		in SRC compared to in DST
-	!
-	! Examples:  Let # denote bytes that should not be accessed
-	!	    Let x denote a byte already copied to align DST
-	!	    Let . and - denote bytes not yet copied
-	!	    Let | denote double alignment boundaries
-	!
-	!	    DST:  ######xx|........|--------|..######   o2 = 18
-	!			  o0
-	!
-	!  o3 = -3:  SRC:  ###xx...|.....---|-----..#|########   o5 = 8
-	!			  o1
-	!
-	!  o3 =  0:  SRC:  ######xx|........|--------|..######   o5 = 16-8 = 8
-	!				   o1
-	!
-	!  o3 = +1:  SRC:  #######x|x.......|.-------|-..#####   o5 = 16-8 = 8
-	!				   o1
-
-	mov	%asi, %g5		! save curr %asi
-	wr	%g0, ASI_CACHE_SPARING, %asi
-
-	or	%g0, -8, %o5
-	alignaddr %o1, %g0, %o1		! set GSR.ALIGN and align o1
-
-	movrlz	%o3, %g0, %o5		! subtract 8 from o2+o3 only if o3>=0
-	add	%o5, %o2, %o5
-	add	%o5, %o3, %o5
-
-	bleu	%ncc, 4f
-	andn	%o5, 7, %o5		! 8 byte aligned count
-	neg	%o0, %o5		! 'large' case
-	and	%o5, BLOCK_SIZE-1, %o5  ! bytes till DST block aligned
-4:	
-	brgez,a	%o3, .beginmedloop
-	ldda	[%o1-8]%asi, %d0
-
-	add	%o1, %o3, %o1		! back up o1
-5:
-	ldda	[%o1]ASI_FL8_P, %d2
-	inc	%o1
-	andcc	%o1, 7, %g0
-	bnz	%ncc, 5b
-	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2
-
-.beginmedloop:
-	tst	%o5
-	bz	%ncc, .endmedloop
-	sub	%o2, %o5, %o2		! update count for later
-
-	! Main loop to write out doubles.  Note: o5 & 7 == 0
-	
-	ldd	[%o1], %d2
-	subcc	%o5, 8, %o5		! update local count
-	bz,pn	%ncc, 1f
-	add	%o1, 8, %o1		! update SRC
-
-.medloop:
-	faligndata %d0, %d2, %d4
-	ldda	[%o1]%asi, %d0
-	subcc	%o5, 8, %o5		! update local count
-	add	%o1, 16, %o1		! update SRC
-	std	%d4, [%o0]
-	bz,pn	%ncc, 2f
-	faligndata %d2, %d0, %d6
-	ldda	[%o1 - 8]%asi, %d2
-	subcc	%o5, 8, %o5		! update local count
-	std	%d6, [%o0 + 8]
-	bnz,pt	%ncc, .medloop
-	add	%o0, 16, %o0		! update DST
-
-1:	
-	faligndata %d0, %d2, %d4
-	fmovd	%d2, %d0
-	std	%d4, [%o0]
-	ba	.endmedloop
-	add	%o0, 8, %o0
-	
-2:
-	std	%d6, [%o0 + 8]
-	sub	%o1, 8, %o1
-	add	%o0, 16, %o0
-	
-
-.endmedloop:
-	! Currently, o1 is pointing to the next double-aligned byte in SRC
-	! The 8 bytes starting at [o1-8] are available in d0
-	! At least one, and possibly all, of these need to be written.
-
-	cmp	%o2, BLOCK_SIZE	
-	bgu	%ncc, .large		! otherwise, less than 16 bytes left
-	
-#if 1
-
-	/* This code will use partial stores.  */
-
-	mov	%g0, %o5
-	and	%o3, 7, %o3		! Number of bytes needed to completely
-					! fill %d0 with good (unwritten) data.
-
-	subcc	%o2, 8, %o2		! update count (maybe too much)
-	movl	%ncc, %o2, %o5		
-	addcc	%o3, %o5, %o5		! extra bytes we can stuff into %d0
-	sub	%o3, %o5, %o3		! update o3 (# bad bytes in %d0)
-
-	bz	%ncc, 2f
-	alignaddr %o3, %g0, %g0		! set GSR.ALIGN
-	
-1:
-	deccc	%o5
-	ldda	[%o1]ASI_FL8_P, %d2
-	inc	%o1
-	bgu	%ncc, 1b
-	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2
-
-2:
-	not     %o3
-	faligndata %d0, %d0, %d0	! shift bytes to the left
-	and	%o3, 7, %o3		! last byte to be stored in [%o0+%o3]
-	edge8n	%g0, %o3, %o5
-	stda	%d0, [%o0]%o5, ASI_PST8_P
-	brlez	%o2, .exit_memcpy
-	add	%o0, %o3, %o0		! update DST to last stored byte
-3:	
-	inc	%o0
-	deccc	%o2
-	ldub	[%o1], %o3
-	stb	%o3, [%o0]
-	bgu	%ncc, 3b
-	inc	%o1
-
-#else
-
-	andcc	%o3, 7, %o5		! Number of bytes needed to completely
-					! fill %d0 with good (unwritten) data.
-	bz	%ncc, 2f
-	sub	%o5, 8, %o3		! -(number of good bytes in %d0)
-	cmp	%o2, 8
-	bl,a	%ncc, 3f		! Not enough bytes to fill %d0
-	add	%o1, %o3, %o1 		! Back up %o1
-
-1:
-	deccc	%o5
-	ldda	[%o1]ASI_FL8_P, %d2
-	inc	%o1
-	bgu	%ncc, 1b
-	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2
-
-2:	
-	subcc	%o2, 8, %o2
-	std	%d0, [%o0]
-	bz	%ncc, .exit_memcpy
-	add	%o0, 8, %o0
-3:	
-	ldub	[%o1], %o3
-	deccc	%o2
-	inc	%o1
-	stb	%o3, [%o0]
-	bgu	%ncc, 3b
-	inc	%o0
-#endif	
-
-.exit_memcpy:
-        wr      %o4, %g0, %fprs		! fprs = o4   restore fprs
-	mov	%g5, %asi		! restore %asi
-	retl
-        mov     %g1, %o0
-
-	.align ICACHE_LINE_SIZE
-.large:
-	! The following test for BSTORE_SIZE is used to decide whether
-	! to store data with a block store or with individual stores.
-	! The block store wins when the amount of data is so large
-	! that it is causes other application data to be moved out
-	! of the L1 or L2 cache.
-	! On a Panther, block store can lose more often because block
-	! store forces the stored data to be removed from the L3 cache.
-	!
-	sethi	%hi(BSTORE_SIZE),%o5
-	or	%o5,%lo(BSTORE_SIZE),%o5
-	cmp	%o2, %o5
-	bgu	%ncc, .xlarge		
-
-	! %o0 I/O DST is 64-byte aligned
-	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
-	! %d0 I/O already loaded with SRC data from [%o1-8]
-	! %o2 I/O count (number of bytes that need to be written)
-	! %o3 I   Not written.  If zero, then SRC is double aligned.
-	! %o4 I   Not written.  Holds fprs.
-	! %o5   O The number of doubles that remain to be written.
-
-	! Load the rest of the current block 
-	! Recall that %o1 is further into SRC than %o0 is into DST
-
-	prefetch [%o0 + (0 * BLOCK_SIZE)], #n_writes
-	prefetch [%o0 + (1 * BLOCK_SIZE)], #n_writes
-	prefetch [%o0 + (2 * BLOCK_SIZE)], #n_writes
-	ldda	[%o1]%asi, %d2
-	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
-	ldda	[%o1 + 0x8]%asi, %d4
-	faligndata %d0, %d2, %d16
-	ldda	[%o1 + 0x10]%asi, %d6
-	faligndata %d2, %d4, %d18
-	ldda	[%o1 + 0x18]%asi, %d8
-	faligndata %d4, %d6, %d20
-	ldda	[%o1 + 0x20]%asi, %d10
-	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
-	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
-	faligndata %d6, %d8, %d22
-	ldda	[%o1 + 0x28]%asi, %d12
-	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0  (needed lter)
-	faligndata %d8, %d10, %d24
-	ldda	[%o1 + 0x30]%asi, %d14
-	faligndata %d10, %d12, %d26
-	ldda	[%o1 + 0x38]%asi, %d0
-	sub	%o2, BLOCK_SIZE, %o2	! update count
-	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
-	add	%o1, BLOCK_SIZE, %o1		! update SRC
-
-	! Main loop.  Write previous block.  Load rest of current block.
-	! Some bytes will be loaded that won't yet be written.
-1:	
-	ldda	[%o1]%asi, %d2
-	faligndata %d12, %d14, %d28
-	ldda	[%o1 + 0x8]%asi, %d4
-	faligndata %d14, %d0, %d30
-	std	%d16, [%o0]
-	std	%d18, [%o0+8]
-	std	%d20, [%o0+16]
-	std	%d22, [%o0+24]
-	std	%d24, [%o0+32]
-	std	%d26, [%o0+40]
-	std	%d28, [%o0+48]
-	std	%d30, [%o0+56]
-	sub	%o2, BLOCK_SIZE, %o2		! update count
-	prefetch [%o0 + (6 * BLOCK_SIZE)], #n_writes
-	prefetch [%o0 + (3 * BLOCK_SIZE)], #n_writes
-	add	%o0, BLOCK_SIZE, %o0		! update DST
-	ldda	[%o1 + 0x10]%asi, %d6
-	faligndata %d0, %d2, %d16
-	ldda	[%o1 + 0x18]%asi, %d8
-	faligndata %d2, %d4, %d18
-	ldda	[%o1 + 0x20]%asi, %d10
-	faligndata %d4, %d6, %d20
-	ldda	[%o1 + 0x28]%asi, %d12
-	faligndata %d6, %d8, %d22
-	ldda	[%o1 + 0x30]%asi, %d14
-	faligndata %d8, %d10, %d24
-	ldda	[%o1 + 0x38]%asi, %d0
-	faligndata %d10, %d12, %d26
-	cmp	%o2, BLOCK_SIZE + 8
-	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
-	bgu,pt	%ncc, 1b
-	add	%o1, BLOCK_SIZE, %o1	! update SRC
-	faligndata %d12, %d14, %d28
-	faligndata %d14, %d0, %d30
-	stda	%d16, [%o0]ASI_BLK_P	! store 64 bytes, bypass cache
-	cmp	%o2, BLOCK_SIZE
-	bne	%ncc, 2f		! exactly 1 block remaining?
-	add	%o0, BLOCK_SIZE, %o0	! update DST
-	brz,a	%o3, 3f			! is SRC double aligned?
-	ldd	[%o1], %d2
-
-2:	
-	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8 
-	add	%o5, %o3, %o5
-
-	ba	.beginmedloop
-	andn	%o5, 7, %o5		! 8 byte aligned count
-
-	! This is when there is exactly 1 block remaining and SRC is aligned
-3:
-	!  %d0 was loaded in the last iteration of the loop above, and
-	!  %d2 was loaded in the branch delay slot that got us here.
-	ldd	[%o1 + 0x08], %d4
-	ldd	[%o1 + 0x10], %d6
-	ldd	[%o1 + 0x18], %d8
-	ldd	[%o1 + 0x20], %d10
-	ldd	[%o1 + 0x28], %d12
-	ldd	[%o1 + 0x30], %d14
-	stda	%d0, [%o0]ASI_BLK_P
-
-	ba	.exit_memcpy
-	 nop
-
-
-	.align 16
-	! two nops here causes loop starting at 1f below to be
-	! on a cache line boundary, improving performance
-	nop
-	nop
-xlarge:
-.xlarge:
-	/*
-	set	4096, %l2
-	subcc	%o2, %l2, %g0
-	bge	%ncc, size_ge_4k
-	nop
-	*/
-	! %o0 I/O DST is 64-byte aligned
-	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
-	! %d0 I/O already loaded with SRC data from [%o1-8]
-	! %o2 I/O count (number of bytes that need to be written)
-	! %o3 I   Not written.  If zero, then SRC is double aligned.
-	! %o4 I   Not written.  Holds fprs.
-	! %o5   O The number of doubles that remain to be written.
-
-	! Load the rest of the current block 
-	! Recall that %o1 is further into SRC than %o0 is into DST
-
-	! prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
-	! executed in delay slot for branch to .xlarge
-	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
-	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
-	ldda	[%o1]%asi, %d2
-	prefetch [%o1 + (6 * BLOCK_SIZE)], #one_read
-	ldda	[%o1 + 0x8]%asi, %d4
-	faligndata %d0, %d2, %d16
-	ldda	[%o1 + 0x10]%asi, %d6
-	faligndata %d2, %d4, %d18
-	ldda	[%o1 + 0x18]%asi, %d8
-	faligndata %d4, %d6, %d20
-	ldda	[%o1 + 0x20]%asi, %d10
-	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
-	faligndata %d6, %d8, %d22
-	ldda	[%o1 + 0x28]%asi, %d12
-	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0  (needed later)
-	faligndata %d8, %d10, %d24
-	ldda	[%o1 + 0x30]%asi, %d14
-	faligndata %d10, %d12, %d26
-	ldda	[%o1 + 0x38]%asi, %d0
-	sub	%o2, BLOCK_SIZE, %o2	! update count
-	prefetch [%o1 + (7 * BLOCK_SIZE)], #one_read
-	add	%o1, BLOCK_SIZE, %o1	! update SRC
-
-	! This point is 32-byte aligned since 24 instructions appear since
-	! the previous alignment directive.
-	
-
-	! Main loop.  Write previous block.  Load rest of current block.
-	! Some bytes will be loaded that won't yet be written.
-1:
-	ldda	[%o1]%asi, %d2
-	faligndata %d12, %d14, %d28
-	ldda	[%o1 + 0x8]%asi, %d4
-	faligndata %d14, %d0, %d30
-	stda	%d16, [%o0]ASI_BLK_P
-	sub	%o2, BLOCK_SIZE, %o2		! update count
-	ldda	[%o1 + 0x10]%asi, %d6
-	faligndata %d0, %d2, %d16
-	ldda	[%o1 + 0x18]%asi, %d8
-	faligndata %d2, %d4, %d18
-	ldda	[%o1 + 0x20]%asi, %d10
-	faligndata %d4, %d6, %d20
-	ldda	[%o1 + 0x28]%asi, %d12
-	faligndata %d6, %d8, %d22
-	ldda	[%o1 + 0x30]%asi, %d14
-	faligndata %d8, %d10, %d24
-	ldda	[%o1 + 0x38]%asi, %d0
-	faligndata %d10, %d12, %d26
-	! offset of 8*BLK+8 bytes works best over range of (src-dst) mod 1K
-	prefetch [%o1 + (8 * BLOCK_SIZE) + 8], #one_read
-	add	%o0, BLOCK_SIZE, %o0		! update DST
-	cmp	%o2, BLOCK_SIZE + 8
-	! second prefetch important to correct for occasional dropped
-	! initial prefetches, 5*BLK works best over range of (src-dst) mod 1K
-	! strong prefetch prevents drops on Panther, but Jaguar and earlier
-	! US-III models treat strong prefetches as weak prefetchs
-	! to avoid regressions on customer hardware, we retain the prefetch
-	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
-	bgu,pt	%ncc, 1b
-	add	%o1, BLOCK_SIZE, %o1	! update SRC
-
-	faligndata %d12, %d14, %d28
-	faligndata %d14, %d0, %d30
-	stda	%d16, [%o0]ASI_BLK_P	! store 64 bytes, bypass cache
-	cmp	%o2, BLOCK_SIZE		
-	bne	%ncc, 2f		! exactly 1 block remaining?
-	add	%o0, BLOCK_SIZE, %o0	! update DST
-	brz,a	%o3, 3f			! is SRC double aligned?
-	ldd	[%o1], %d2
-
-2:	
-	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8 
-	add	%o5, %o3, %o5
-
-
-	ba	.beginmedloop
-	andn	%o5, 7, %o5		! 8 byte aligned count
-
-
-	! This is when there is exactly 1 block remaining and SRC is aligned
-3:
-	!  %d0 was loaded in the last iteration of the loop above, and
-	!  %d2 was loaded in the branch delay slot that got us here.
-	ldd	[%o1 + 0x08], %d4
-	ldd	[%o1 + 0x10], %d6
-	ldd	[%o1 + 0x18], %d8
-	ldd	[%o1 + 0x20], %d10
-	ldd	[%o1 + 0x28], %d12
-	ldd	[%o1 + 0x30], %d14
-	stda	%d0, [%o0]ASI_BLK_P
-
-	ba	.exit_memcpy
-	 nop
-
-copying_ge_512:
-	mov	%o0, %o5	! save dst address for return value.
-	! both src and dst are aligned to 8 byte boundary.
-	save	%sp, -SA(STACK_OFFSET + SAVESIZE), %sp
-	mov	%i0, %o0
-	mov	%i1, %o1
-	mov	%i2, %o2
-	mov	%i3, %o3
-	mov	%i5, %o5
-#ifndef	__sparcv9
-	stx	%g4, [%sp + STACK_OFFSET + g4_offset]
-	stx	%g5, [%sp + STACK_OFFSET + g5_offset]
-#endif
-	rd	%fprs, %g5		! check for unused fp
-	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
-	bz,a	%ncc, 1f
-	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
-1:
-	!predfetch src buf
-	sub     %o1,1,%o3
-	andn    %o3,0x7f,%l1
-	add     %l1,128,%l1
-	prefetch [%l1],2		!prefetch next 128b
-	prefetch [%l1+64],2
-	prefetch [%l1+(2*64)],2		!cont from above
-	prefetch [%l1+(3*64)],2
-	!predfetch dst buf
-	sub     %o5,1,%o3
-	andn    %o3,0x7f,%l1
-	add     %l1,128,%l1
-	prefetch [%l1],2		!prefetch next 128b
-	prefetch [%l1+64],2
-	prefetch [%l1+(2*64)],2		!cont from above
-	prefetch [%l1+(3*64)],2
-
-	andcc   %o5,0x7f,%o3	    !o3=0 , means it is already 128 align
-	brz,pn  %o3,aligned_on_128
-	sub     %o3,128,%o3
-
-	add     %o2,%o3,%o2
-align_to_128:
-	ldxa	[%o1]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o1,8,%o1		! increment src pointer
-	stxa    %o4,[%o5]ASI_CACHE_SPARING_PRIMARY
-	addcc   %o3,8,%o3
-	bl,pt   %ncc,align_to_128
-	add     %o5,8,%o5		! increment dst pointer
-
-aligned_on_128:
-	andcc	%o5,0x1ff,%o3	!%o3=0 when it is 512 b aligned.
-	brnz,pn	%o3, 4f
-	mov	%o2,%l4		!l4=count from 512 align
-	set	4096, %l2
-	subcc	%o2, %l2, %g0
-	bge,pn	%ncc, stingray_optimized_copy
-	nop
-4:
-
-	sub	%o5,8,%l6	!should be in current 512 chunk
-	andn 	%l6,0x1ff,%o3	!%o3=aligned 512b addr
-	add 	%o3,0x200,%o3	!%o3=next aligned 512b addr to start
-				! stingray_optimized_copy
-	sub 	%o3,%o5,%o3	!o3=how many byte in the current remaining chunk
-	sub	%o2,%o3,%l4	!l4=count from 512 align
-	/*
-	 * if l4 is < 4096 do interleave_128_copy only.
-	 */
-	set	4096, %l2
-	subcc	%l4, %l2, %g0
-	bge,pn	%ncc,6f
-	nop
-	mov	%g0, %l4
-	add	%o5, %o2, %l1
-	ba	interleave_128_copy
-	nop
-6:
-	mov	%o3, %o2
-	subcc 	%o3,256,%g0	! if it is > 256 bytes , could use the
-				! interleave_128_copy
-	bl,pn	%ncc,copy_word	! o.w use copy_word to finish the 512 byte
-				! alignment.
-	!%o1=64 bytes data
-	!%o5=next 8 byte addr to write
-	!%o2=new count i.e how many bytes to write
-	add     %o5,%o2,%l1	!cal the last byte to write %l1
-	ba	interleave_128_copy
-	nop
-
-	.align	64
-interleave_128_copy:
-	! %l1 has the addr of the dest. buffer at or beyond which no write
-	! is to be done.
-	! %l4 has the number of bytes to zero using stingray_optimized_bzero
-	!prefetch src
-	!prefetch src 
-
-	add	%o1, 256, %o3
-	prefetch [%o3], 2	!1st 64 byte line of next 256 byte block
-	add	%o1, 384, %o3
-	prefetch [%o3], 2	!3rd 64 byte line of next 256 byte block
-	add	%o1, 320, %o3
-	prefetch [%o3], 2	!2nd 64 byte line of next 256 byte block
-	add	%o1, 448, %o3
-	prefetch [%o3], 2	!4th 64 byte line of next 256 byte block
-
-	!prefetch dst 
-
-	add	%o5, 256, %o3
-	prefetch [%o3], 2	!1st 64 byte line of next 256 byte block
-	add	%o5, 384, %o3
-	prefetch [%o3], 2	!3rd 64 byte line of next 256 byte block
-	add	%o5, 320, %o3
-	prefetch [%o3], 2	!2nd 64 byte line of next 256 byte block
-	add	%o5, 448, %o3
-	prefetch [%o3], 2	!4th 64 byte line of next 256 byte block
-
-	ldxa	[%o1]ASI_CACHE_SPARING_PRIMARY, %o4
-	stxa     %o4,[%o5]ASI_CACHE_SPARING_PRIMARY	!1st 64 byte line
-	add	%o1, 128, %o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, 128, %o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY	!3rd 64 byte line
-	add     %o1, (1 * 8), %o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add	%o5, (1 * 8), %o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (1 * 8 + 128), %o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (1 * 8 + 128), %o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (2 * 8),%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (2 * 8),%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (2 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (2 * 8 + 128) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (3 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (3 * 8) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (3 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (3 * 8 + 128) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (4 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (4 * 8) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (4 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (4 * 8 + 128) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (5 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (5 * 8) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (5 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (5 * 8 + 128) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (6 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (6 * 8) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (6 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (6 * 8 + 128) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (7 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (7 * 8) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (7 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (7 * 8 + 128) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (8 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (8 * 8) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (8 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (8 * 8 + 128) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (9 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (9 * 8) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (9 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (9 * 8 + 128) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (10 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (10 * 8) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (10 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (10 * 8 + 128) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (11 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (11 * 8) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (11 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (11 * 8 + 128) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (12 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (12 * 8) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (12 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (12 * 8 + 128) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (13 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (13 * 8) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (13 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (13 * 8 + 128) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (14 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (14 * 8) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (14 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (14 * 8 + 128) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (15 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (15 * 8) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add     %o1, (15 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o5, (15 * 8 + 128) ,%o3
-	stxa     %o4,[%o3]ASI_CACHE_SPARING_PRIMARY
-	add	%o1, 256, %o1
-	! check if the next 256 byte copy will not exceed the number of
-	! bytes remaining to be copied.
-	! %l2 points to the dest buffer after copying 256 bytes more.
-	! %l1 points to dest. buffer at or beyond which no writes should be done.
-	add     %o5,512,%l2
-			
-	subcc   %l1,%l2,%g0
-	bge,pt  %ncc,interleave_128_copy
-	add     %o5,256,%o5
-
-copy_word:
-	and     %o2,255,%o3
-	and     %o3,7,%o2
-
-	! Set the remaining doubles
-	subcc   %o3, 8, %o3		! Can we store any doubles?
-	bl,pn  %ncc, 6f
-	and	%o2, 7, %o2		! calc bytes left after doubles
-
-	!prefetch src 
-
-	mov	%o1, %o4
-	prefetch [%o4], 2	!1st 64 byte line of next 256 byte block
-	add	%o1, 128, %o4
-	prefetch [%o4], 2	!3rd 64 byte line of next 256 byte block
-	add	%o1, 64, %o4
-	prefetch [%o4], 2	!2nd 64 byte line of next 256 byte block
-	add	%o1, 192, %o4
-	prefetch [%o4], 2	!4th 64 byte line of next 256 byte block
-
-	!prefetch dst 
-
-	mov	%o5, %o4
-	prefetch [%o4], 2	!1st 64 byte line of next 256 byte block
-	add	%o5, 128, %o4
-	prefetch [%o4], 2	!3rd 64 byte line of next 256 byte block
-	add	%o5, 64, %o4
-	prefetch [%o4], 2	!2nd 64 byte line of next 256 byte block
-	add	%o5, 192, %o4
-	prefetch [%o4], 2	!4th 64 byte line of next 256 byte block
-
-5:	
-	ldxa	[%o1]ASI_CACHE_SPARING_PRIMARY, %o4
-	add     %o1, 8, %o1      
-	stxa	%o4, [%o5]ASI_CACHE_SPARING_PRIMARY
-	subcc   %o3, 8, %o3
-	bge,pt	%ncc, 5b
-	add     %o5, 8, %o5      
-6:
-	! Set the remaining bytes
-	brz	%o2,  can_we_do_stingray_optimized_copy
-	nop
-	
-	! Terminate the copy with a partial store.
-	! The data should be at d0
-	ldxa	[%o1]ASI_CACHE_SPARING_PRIMARY, %o4
-	stx	%o4, [%sp + STACK_OFFSET + scratch_offset]
-	ldd	[%sp + STACK_OFFSET + scratch_offset], %d0
-	
-	dec     %o2		     ! needed to get the mask right
-	edge8n	%g0, %o2, %o4
-	stda	%d0, [%o5]%o4, ASI_PST8_P
-can_we_do_stingray_optimized_copy:
-	mov	%l4, %o2
-	brnz,pn	%o2, stingray_optimized_copy
-	nop
-	
-exit:	
-	brnz	%g5, 1f
-	nop
-	wr	%g5, %g0, %fprs
-1:
-#ifndef	__sparcv9
-	ldx	[%sp + STACK_OFFSET + g4_offset], %g4
-	ldx	[%sp + STACK_OFFSET + g5_offset], %g5
-#endif
-	ret				! %o0 was preserved
-	restore
-
-
-stingray_optimized_copy:
-!%o5 = next memory addr which is 512 b align
-!%l4 = remaining byte from 512 align.
-
-	add	%o5, %l4, %o2
-
-	prefetch [%o1+0],2
-	prefetch [%o1+(64*1)],2
-	prefetch [%o1+(64*2)],2
-	prefetch [%o1+(64*3)],2
-	prefetch [%o1+(64*4)],2
-	prefetch [%o1+(64*5)],2
-	prefetch [%o1+(64*6)],2
-	prefetch [%o1+(64*7)],2
-	prefetch [%o1+(64*8)],2
-	prefetch [%o1+(64*9)],2
-	prefetch [%o1+(64*10)],2
-	prefetch [%o1+(64*11)],2
-	prefetch [%o1+(64*12)],2
-	prefetch [%o1+(64*13)],2
-	prefetch [%o1+(64*14)],2
-	prefetch [%o1+(64*15)],2
-
-	prefetch [%o5+0],2
-	prefetch [%o5+(64*1)],2
-	prefetch [%o5+(64*2)],2
-	prefetch [%o5+(64*3)],2
-	prefetch [%o5+(64*4)],2
-	prefetch [%o5+(64*5)],2
-	prefetch [%o5+(64*6)],2
-	prefetch [%o5+(64*7)],2
-	prefetch [%o5+(64*8)],2
-	prefetch [%o5+(64*9)],2
-	prefetch [%o5+(64*10)],2
-	prefetch [%o5+(64*11)],2
-	prefetch [%o5+(64*12)],2
-	prefetch [%o5+(64*13)],2
-	prefetch [%o5+(64*14)],2
-	prefetch [%o5+(64*15)],2
-	
-	ba      myloop2
-	srl	%l4, 12, %l4
-	
-	! Local register usage:
-	!
-	! %l1 address at short distance ahead of current %o1 for prefetching
-	!     into L1 cache. 
-	! %l2 address at far ahead of current %o1 for prefetching into L2 cache.
-	! %l3 save %o5 at start of inner loop. 
-	! %l4 Number of 4k blocks to copy
-	! %g1 save %o1 at start of inner loop. 
-	! %l5 iteration counter to make buddy loop execute 2 times. 
-	! %l6 iteration counter to make inner loop execute 32 times. 
-	! %l7 address at far ahead of current %o5 for prefetching destination
-	!     into L2 cache.
-	       
-.align 64
-myloop2:
-	set      2,%l5	! %l5 is the loop count for the buddy loop, for 2 buddy lines.
-	add      %o5, 0, %l3 
-	add      %o1, 0, %g1 
-buddyloop:
-	set      PF_FAR, %g4	! Prefetch far ahead. CHANGE FAR PREFETCH HERE.
-	add      %o1, %g4, %l2	! For prefetching far ahead, set %l2 far ahead
-				! of %o1
-	add      %o1, PF_NEAR, %l1	! For prefetching into L1 D$, set %l1 a
-					! little ahead of %o1
-	add      %o5, %g4, %l7	! For prefetching far ahead, set %l7 far ahead
-				! of %o5
-
-	add      %l2, %g4, %g4	! %g4 is now double far ahead of the source
-				! address in %o1.
-	prefetch [%g4+%g0],2	! Prefetch ahead by several pages to get TLB
-				! entry in advance.
-	set      2*PF_FAR, %g4	! Prefetch double far ahead.  SET DOUBLE FAR
-				! PREFETCH HERE.
-	add      %o5, %g4, %g4	! %g4 is now double far ahead of the dest
-				! address in %o5.
-	prefetch [%g4+%g0],2	! Prefetch ahead by 2 pages to get TLB entry
-				! in advance.
-
-	set      4,%l6		! %l6 = loop count for the inner loop,
-				! for 4 x 8 = 32 lines.
-	set      0, %g4
-	
-	! Each iteration of the inner loop below copies 8 sequential lines.
-	! This loop is iterated 4 times, to move a total of 32 lines,
-	! all of which have the same value of PA[9], so we increment the base
-	! address by 1024 bytes in each iteration, which varies PA[10].				     */ 
-innerloop:	  
-	/* ---- copy line 1 of 8. ---- */
-	prefetch [%l2+%g4],2
-	prefetch [%l7+%g4],2
-	prefetch [%l1+%g4],1
-
-	ldd     [%o1],%d0
-	ldd     [%o1+8],%d2
-	ldd     [%o1+16],%d4
-	ldd     [%o1+24],%d6
-	ldd     [%o1+32],%d8
-	ldd     [%o1+40],%d10
-	ldd     [%o1+48],%d12
-	ldd     [%o1+56],%d14
-	stda    %d0,[%o5]ASI_BLK_P
-	add     %g4, 64, %g4
-	add     %o5, 64, %o5
-	add     %o1, 64, %o1       /* increment %o1 for the next source line.   */
-
-	/* ---- copy line 2 of 8. ---- */
-	prefetch [%l2+%g4],2
-	prefetch [%l7+%g4],2
-	prefetch [%l1+%g4],1
-
-	ldd     [%o1],%d0
-	ldd     [%o1+8],%d2
-	ldd     [%o1+16],%d4
-	ldd     [%o1+24],%d6
-	ldd     [%o1+32],%d8
-	ldd     [%o1+40],%d10
-	ldd     [%o1+48],%d12
-	ldd     [%o1+56],%d14
-	stda    %d0,[%o5]ASI_BLK_P
-	add     %g4, 64, %g4
-	add     %o5, 64, %o5
-	add     %o1, 64, %o1       /* increment %o1 for the next source line.   */
-
-	/* ---- copy line 3 of 8. ---- */
-	prefetch [%l2+%g4],2
-	prefetch [%l7+%g4],2
-	prefetch [%l1+%g4],1
-
-	ldd     [%o1],%d0
-	ldd     [%o1+8],%d2
-	ldd     [%o1+16],%d4
-	ldd     [%o1+24],%d6
-	ldd     [%o1+32],%d8
-	ldd     [%o1+40],%d10
-	ldd     [%o1+48],%d12
-	ldd     [%o1+56],%d14
-	stda    %d0,[%o5]ASI_BLK_P
-	add     %g4, 64, %g4
-	add     %o5, 64, %o5
-	add     %o1, 64, %o1       /* increment %o1 for the next source line.   */
-
-	/* ---- copy line 4 of 8. ---- */
-	prefetch [%l2+%g4],2
-	prefetch [%l7+%g4],2
-	prefetch [%l1+%g4],1
-
-	ldd     [%o1],%d0
-	ldd     [%o1+8],%d2
-	ldd     [%o1+16],%d4
-	ldd     [%o1+24],%d6
-	ldd     [%o1+32],%d8
-	ldd     [%o1+40],%d10
-	ldd     [%o1+48],%d12
-	ldd     [%o1+56],%d14
-	stda    %d0,[%o5]ASI_BLK_P
-	add     %g4, 64, %g4
-	add     %o5, 64, %o5
-	add     %o1, 64, %o1       /* increment %o1 for the next source line.   */
-
-	/* ---- copy line 5 of 8. ---- */
-	prefetch [%l2+%g4],2
-	prefetch [%l7+%g4],2
-	prefetch [%l1+%g4],1
-
-	ldd     [%o1],%d0
-	ldd     [%o1+8],%d2
-	ldd     [%o1+16],%d4
-	ldd     [%o1+24],%d6
-	ldd     [%o1+32],%d8
-	ldd     [%o1+40],%d10
-	ldd     [%o1+48],%d12
-	ldd     [%o1+56],%d14
-	stda    %d0,[%o5]ASI_BLK_P
-	add     %g4, 64, %g4
-	add     %o5, 64, %o5
-	add     %o1, 64, %o1       /* increment %o1 for the next source line.   */
-
-	/* ---- copy line 6 of 8. ---- */
-	prefetch [%l2+%g4],2
-	prefetch [%l7+%g4],2
-	prefetch [%l1+%g4],1
-
-	ldd     [%o1],%d0
-	ldd     [%o1+8],%d2
-	ldd     [%o1+16],%d4
-	ldd     [%o1+24],%d6
-	ldd     [%o1+32],%d8
-	ldd     [%o1+40],%d10
-	ldd     [%o1+48],%d12
-	ldd     [%o1+56],%d14
-	stda    %d0,[%o5]ASI_BLK_P
-	add     %g4, 64, %g4
-	add     %o5, 64, %o5
-	add     %o1, 64, %o1       /* increment %o1 for the next source line.   */
-
-	/* ---- copy line 7 of 8. ---- */
-	prefetch [%l2+%g4],2
-	prefetch [%l7+%g4],2
-	prefetch [%l1+%g4],1
-
-	ldd     [%o1],%d0
-	ldd     [%o1+8],%d2
-	ldd     [%o1+16],%d4
-	ldd     [%o1+24],%d6
-	ldd     [%o1+32],%d8
-	ldd     [%o1+40],%d10
-	ldd     [%o1+48],%d12
-	ldd     [%o1+56],%d14
-	stda    %d0,[%o5]ASI_BLK_P
-	add     %g4, 64, %g4
-	add     %o5, 64, %o5
-	add     %o1, 64, %o1       /* increment %o1 for the next source line.   */
-
-	/* ---- copy line 8 of 8. ---- */
-	prefetch [%l2+%g4],2
-	prefetch [%l7+%g4],2
-	prefetch [%l1+%g4],1
-
-	ldd     [%o1],%d0
-	ldd     [%o1+8],%d2
-	ldd     [%o1+16],%d4
-	ldd     [%o1+24],%d6
-	ldd     [%o1+32],%d8
-	ldd     [%o1+40],%d10
-	ldd     [%o1+48],%d12
-	ldd     [%o1+56],%d14
-	stda    %d0,[%o5]ASI_BLK_P
-
-	subcc   %l6,1,%l6	  /* Decrement the inner loop counter.	 */
-	
-	! Now increment by 64 + 512 so we don't toggle PA[9]
-	add     %g4, 576, %g4
-	add     %o5, 576, %o5
-
-	bg,pt   %icc,innerloop
-	add     %o1, 576, %o1	! increment %o1 for the next source line.
-	! END OF INNER LOOP
-
-
-	subcc   %l5,1,%l5
-	add     %l3, 512, %o5	! increment %o5 to first buddy line of dest.
-	bg,pt   %icc,buddyloop
-	add     %g1, 512 ,%o1	! Set %o1 to the first of the odd buddy lines.
-
-	subcc   %l4, 1, %l4
-	add     %o5, 3584, %o5	! Advance both base addresses to 4k above where
-				! they started.
-	add     %o1, 3584, %o1	! They were already incremented by 512,
-				! so just add 3584.
-
-	bg,pt   %icc,myloop2
-	nop
-
-	/****larryalg_end_here*************/
-
-	sub	%o2,%o5,%o2	!how many bytes are left
-	brz,pn	%o2,complete_write
-	mov	%g0,%l4
-	add     %o5,%o2,%l1	     !calc the last byte to write into %l1
-	subcc	%o2,256,%g0
-	bge,pt	%ncc,interleave_128_copy
-	mov	%g0,%l4
-	
-	ba	copy_word
-	nop
-
-
-complete_write: 
-	ba      exit
-	nop
-
-
-	
-	SET_SIZE(memcpy)
-	SET_SIZE(__align_cpy_1)
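
The loop removed above deserves a gloss: each 4 KB block is copied in two
"buddy" passes, and every block-store within a pass shares the same value of
physical address bit 9, so a pass never toggles that bank-select bit. A
minimal C sketch of just the traversal order, using the constants from the
comments (interleaved_copy_4k is an illustrative name, not from the source):

	#include <stddef.h>
	#include <string.h>

	#define	CHUNK	512	/* 8 cache lines; PA[9] picks the 512B half */
	#define	PAGE	4096

	static void
	interleaved_copy_4k(char *dst, const char *src, size_t npages)
	{
		for (size_t p = 0; p < npages; p++) {
			/* pass 0 touches only PA[9] == 0, pass 1 only 1 */
			for (int buddy = 0; buddy < 2; buddy++) {
				size_t base = p * PAGE + buddy * CHUNK;
				for (int i = 0; i < 4; i++) {	/* 4 x 8 lines */
					/* +1024 per step varies PA[10] */
					size_t off = base + i * 2 * CHUNK;
					memcpy(dst + off, src + off, CHUNK);
				}
			}
		}
	}
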
--- a/usr/src/lib/libc/sparc_hwcap1/common/gen/memset.s	Thu Aug 06 17:19:00 2009 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,767 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-
-	.file	"memset.s"
-/*
- * void *memset(sp, c, n)
- *
- * Set an array of n chars starting at sp to the character c.
- * Return sp.
- *
- * Fast assembler language version of the following C-program for memset
- * which represents the `standard' for the C-library.
- *
- *	void *
- *	memset(void *sp1, int c, size_t n)
- *	{
- *	    if (n != 0) {
- *		char *sp = sp1;
- *		do {
- *		    *sp++ = (char)c;
- *		} while (--n != 0);
- *	    }
- *	    return (sp1);
- *	}
- */
-
-#include <sys/asm_linkage.h>
-#include <sys/sun4asi.h>
-
-	ANSI_PRAGMA_WEAK(memset,function)
-
-#define	SAVESIZE	(8 * 1)
-#ifdef	__sparcv9
-#define	STACK_OFFSET	(STACK_BIAS + 0)
-#else
-#define	STACK_OFFSET	(STACK_BIAS + 0 + 0)
-#endif
-#define	scratch_offset	0
-
-#define ASI_CACHE_SPARING_PRIMARY 0xf4
-#define	ALIGN8(X)	(((X) + 7) & ~7)
-#define	ICACHE_LINE_SIZE	64
-#define	FPRS_FEF	0x4
-#define	PF_FAR		2048
-
-	.section        ".text"
-	.align ICACHE_LINE_SIZE
-
-	/*
-	 * Optimizations done:
-	 *
-	 * No stores in the delay slot of branch instructions.
-	 * Conditional stores where possible.
-	 * Prefetch before doing stxa.
-	 * Bank-interleaved writing.
-	 */
-
-	ENTRY(memset)
-	add	%sp, -SA(STACK_OFFSET + SAVESIZE), %sp
-	mov	%o0, %o5		! copy sp1 before using it
-	/*
-	 * If 0 bytes to xfer return
-	 */
-	brnz	%o2, continue
-	nop
-	retl
-	add	%sp, SA(STACK_OFFSET + SAVESIZE), %sp
-continue:
-	/*
-	 * If the count is multiple of 8 and buffer is aligned to 8
-	 * we don't have to look at fprs
-	 */
-	or	%o5, %o2, %o3
-	and	%o3, 7, %o3
-        brnz	%o3, check_fprs
-	mov	4, %g1
-	prefetch	[%o5],2
-	ba	skip_rd_fprs
-	nop
-	
-check_fprs:
-        rd      %fprs, %g1              ! g1 = fprs
-skip_rd_fprs:
-	prefetch	[%o5],2
-	andcc	%g1, 0x4, %g1		! fprs.du = fprs.dl = 0
-	bnz	%ncc, 1f		! Is fprs.fef == 1
-	nop
-        wr      %g0, FPRS_FEF, %fprs	! fprs.fef = 1
-1:
-	and	%o1, 0xff, %o1		! o1 is (char)c
-	sll     %o1, 8, %o3
-        or      %o1, %o3, %o1		! now o1 has 2 bytes of c
-        sll     %o1, 16, %o3
-        or      %o1, %o3, %o1		! now o1 has 4 bytes of c
-	sllx	%o1, 32, %o3
-	or	%o1, %o3, %o1		! now o1 has 8 bytes of c
-	stx	%o1, [%sp + STACK_OFFSET + scratch_offset]
-	ldd	[%sp + STACK_OFFSET + scratch_offset], %d0
-	cmp	%o2, 8
-	bge,pt	%ncc, xfer_8_or_more
-	mov	%o0, %o5
-	/*
-	 * Do a partial store of %o2 bytes
-	 */
-        andcc	%o5, 7, %o3		! is sp1 aligned on an 8-byte boundary
-        brz,pt	%o3, aligned_on_8		
-        sub	%o5, %o3, %o5		! align the  destination buffer.
-	mov	%o3, %o1
-	mov	8, %o4
-	sub 	%o4, %o3, %o3
-	cmp	%o3, %o2
-	bg,a,pn	%ncc, 1f
-	mov	%o2, %o3	
-1:
-	! %o3 has the bytes to be written in partial store.
-	sub	%o2, %o3, %o2
-	dec	%o3
-	prefetch	[%o5],2
-	edge8n	%g0, %o3, %o4
-	srl	%o4, %o1, %o4
-	stda	%d0, [%o5]%o4, ASI_PST8_P
-	brz	%o2, simple_ret
-	add	%o5, 8, %o5
-aligned_on_8:
-	prefetch	[%o5],2
-        dec     %o2                     ! needed to get the mask right
-	edge8n	%g0, %o2, %o4
-	stda	%d0, [%o5]%o4, ASI_PST8_P
-	brnz	%g1, 1f			! was fprs.fef == 1
-	nop
-        wr	%g1, %g0, %fprs         ! fprs = g1  restore fprs
-1:
-	retl
-	add	%sp, SA(STACK_OFFSET + SAVESIZE), %sp
-
-xfer_8_or_more:
-        andcc	%o5, 7, %o3		! is sp1 aligned on an 8-byte boundary
-        brz,pt	%o3, blkchk		
-        sub	%o5, %o3, %o5		! align the  destination buffer.
-        sub	%o3, 8, %o3		! -(bytes till double aligned)
-        add	%o2, %o3, %o2		! update o2 with new count
-	xor	%o3, 0xff, %o3
-	and	%o3, 7, %o3
-	prefetch	[%o5],2
-	edge8ln	%g0, %o3, %o4
-	stda	%d0, [%o5]%o4, ASI_PST8_P
-	add	%o5, 8, %o5
-
-
-	! Now sp1 is double aligned (sp1 is found in %o5)
-blkchk:
-	cmp     %o2, 767		! if large count use Block ld/st
-	bg,pt	%ncc,blkwr
-	nop
-
-	
-	and	%o2, 24, %o3		! o3 is {0, 8, 16, 24}
-
-	brz	%o3, skip_dw_loop
-	nop
-
-1:	subcc	%o3, 8, %o3		! double-word loop
-	stx	%o1, [%o5]
-	bgu,pt %ncc, 1b
-	add	%o5, 8, %o5
-skip_dw_loop:
-	andncc	%o2, 31, %o4		! o4 has 32 byte aligned count
-	brz,pn	%o4, 3f
-	nop
-	ba	loop_32byte
-	nop
-
-	.align	ICACHE_LINE_SIZE
-
-loop_32byte:
-	subcc	%o4, 32, %o4		! main loop, 32 bytes per iteration
-	stx	%o1, [%o5]
-	stx	%o1, [%o5 + 8]
-	stx	%o1, [%o5 + 16]
-	stx	%o1, [%o5 + 24]
-	bne,pt  %ncc, loop_32byte
-	add	%o5, 32, %o5
-3:	
-	and	%o2, 7, %o2		! o2 has the remaining bytes (<8)
-	brz	%o2, skip_partial_copy
-	nop
-
-	! Terminate the copy with a partial store.
-	! The data should be at d0
-	prefetch	[%o5],2
-        dec     %o2                     ! needed to get the mask right
-	edge8n	%g0, %o2, %o4
-	stda	%d0, [%o5]%o4, ASI_PST8_P
-
-skip_partial_copy:
-simple_ret:
-	brz,a	%g1, 1f			! was fprs.fef == 0
-        wr	%g1, %g0, %fprs         ! fprs = g1  restore fprs
-1:
-	retl
-	add	%sp, SA(STACK_OFFSET + SAVESIZE), %sp
-
-blkwr:
-        sub     %o5,1,%o3
-        andn    %o3,0x7f,%o4
-        add     %o4,128,%o4
-        prefetch [%o4],2		!prefetch next 128b
-        prefetch [%o4+64],2
-        prefetch [%o4+(2*64)],2		!cont from above
-        prefetch [%o4+(3*64)],2
-
-        andcc   %o5,0x7f,%o3            !o3=0 means it is already 128-byte aligned
-        brz,pn  %o3,alreadyalign128
-        sub     %o3,128,%o3
-
-        add     %o2,%o3,%o2
-align128:
-        stxa    %o1,[%o5]ASI_CACHE_SPARING_PRIMARY
-        addcc   %o3,8,%o3
-        bl,pt   %ncc,align128
-        add     %o5,8,%o5
-
-
-
-alreadyalign128:
-	andcc	%o5,0x1ff,%o3	!%o3=0 when it is 512 b aligned.
-	brnz,pn	%o3, 4f
-	mov	%o2,%g5		!g5=count from 512 align
-	set	4096, %o4
-	subcc	%o2, %o4, %g0
-	bge,pn	%ncc, larry_alg
-	nop
-4:
-
-	sub	%o5,8,%o4	!should be in the current 512-byte chunk
-	andn 	%o4,0x1ff,%o3	!%o3=aligned 512b addr
-	add 	%o3,0x200,%o3	!%o3=next aligned 512b addr, which starts the larry processing
-	sub 	%o3,%o5,%o3	!%o3=how many bytes remain in the current chunk
-	sub	%o2,%o3,%g5	!%g5=count from the 512-byte aligned point
-	/*
-	 * if g5 is < 4096 do start_128 only.
-	 */
-	set	4096, %o4
-	subcc	%g5, %o4, %g0
-	bge,pn	%ncc,6f
-	nop
-	mov	%g0, %g5
-	add	%o5, %o2, %o4
-	ba	start_128
-	nop
-6:
-	mov	%o3, %o2
-	subcc 	%o3,256,%g0	!if it is > 256 bytes, use the st-interleave alg to write
-	bl,pn	%ncc,storeword	!otherwise use storeword to finish the 512-byte alignment.
-        !%o1=64 bytes of data
-        !%o5=next 8 byte addr to write
-        !%o2=new count, i.e. how many bytes to write
-        add     %o5,%o2,%o4             !calc the last byte to write into %o4
-	ba	start_128
-	nop
-
-	.align	64
-start_128:
-	add	%o5, 256, %o3
-	prefetch [%o3], 2	!1st 64 byte line of next 256 byte block
-	add	%o5, 384, %o3
-	prefetch [%o3], 2	!3rd 64 byte line of next 256 byte block
-	add	%o5, 320, %o3
-	prefetch [%o3], 2	!2nd 64 byte line of next 256 byte block
-	add	%o5, 448, %o3
-	prefetch [%o3], 2	!4th 64 byte line of next 256 byte block
-	mov	%o5, %o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY	!1st 64 byte line
-        add     %o5,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY	!3rd 64 byte line
-        add     %o5,8,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(2 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128 ,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(3 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(4 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(5 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(6 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(7 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(8 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(9 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(10 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(11 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(12 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(13 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(14 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(15 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,512,%o3  !%o3=final byte of the next 256-byte block, to check if another lies ahead
-        subcc   %o4,%o3,%g0   !%o4=final byte location;%o3=final byte of next 256 byte block
-        bge,pt  %ncc,start_128    !branch taken means next 256 byte block is still within the limit.
-        add     %o5,256,%o5
-
-!need to connect the rest of the program
-storeword:
-        and     %o2,255,%o3
-        and     %o3,7,%o2
-
-	! Set the remaining doubles
-	subcc   %o3, 8, %o3		! Can we store any doubles?
-	bl,pn  %ncc, 6f
-	and	%o2, 7, %o2		! calc bytes left after doubles
-
-5:	
-	stxa	%o1, [%o5]ASI_CACHE_SPARING_PRIMARY
-	subcc   %o3, 8, %o3
-	bge,pt	%ncc, 5b
-        add     %o5, 8, %o5      
-6:
-	! Set the remaining bytes
-	brz	%o2,  check_larry_alg		! safe to check all 64-bits
-	
-	! Terminate the copy with a partial store.
-	! The data should be at d0
-        dec     %o2                     ! needed to get the mask right
-	edge8n	%g0, %o2, %o4
-	stda	%d0, [%o5]%o4, ASI_PST8_P
-check_larry_alg:
-	mov	%g5, %o2
-	brnz,pn	%o2, larry_alg
-	nop
-	
-.exit:	
-	brz,a	%g1, 1f			! was fprs.fef == 0
-        wr	%g1, %g0, %fprs         ! fprs = g1  restore fprs
-1:
-        retl				! %o0 was preserved
-	add	%sp, SA(STACK_OFFSET + SAVESIZE), %sp
-
-larry_alg:
-	add	%sp, SA(STACK_OFFSET + SAVESIZE), %sp
-	save	%sp, -SA(MINFRAME), %sp
-	mov	%i0, %o0
-	mov	%i1, %o1
-	mov	%i2, %o2
-	mov	%i3, %o3
-	mov	%i5, %o5
-!%o5 = next memory addr, which is 512-byte aligned
-!%g5 = remaining bytes from the 512-byte aligned point.
-init:
-	set     4096,%g6
-
-        prefetch [%o5+0],2
-        prefetch [%o5+(64*1)],2
-        prefetch [%o5+(64*2)],2
-        prefetch [%o5+(64*3)],2
-        prefetch [%o5+(64*4)],2
-        prefetch [%o5+(64*5)],2
-        prefetch [%o5+(64*6)],2
-        prefetch [%o5+(64*7)],2
-        prefetch [%o5+(64*8)],2
-        prefetch [%o5+(64*9)],2
-        prefetch [%o5+(64*10)],2
-        prefetch [%o5+(64*11)],2
-        prefetch [%o5+(64*12)],2
-        prefetch [%o5+(64*13)],2
-        prefetch [%o5+(64*14)],2
-        prefetch [%o5+(64*15)],2
-        ba      myloop2
-	add     %o5,%g5,%g5
-        /* Local register usage:
-           %l3   save %o5 at start of inner loop.
-           %l5   iteration counter to make buddy loop execute 2 times.
-           %l6   iteration counter to make inner loop execute 32 times.
-           %l7   address far ahead of the current %o5, for prefetching the destination into the L2 cache.
-	 */
-
-	.align 64
-myloop2:
-	/* Section 1 */
-        set      2,%l5    /* %l5 is the loop count for the buddy loop, for 2 buddy lines.  */
-        add      %o5, 0, %l3
-buddyloop:
-        set      PF_FAR, %l4        /* Prefetch far ahead.             CHANGE FAR PREFETCH HERE.     <<==== */
-        add      %o5, %l4, %l7      /* For prefetching far ahead, set %l7 far ahead of %o5           */
-
-        set      2*PF_FAR, %l4      /* Prefetch double far ahead.  SET DOUBLE FAR PREFETCH HERE.     <<==== */
-        add      %o5, %l4, %l4      /* %l4 is now double far ahead of the dest address in %o5.       */
-        prefetch [%l4+%g0],2        /* Prefetch ahead by 2 pages to get TLB entry in advance.        */
-
-        set      4,%l6             /* %l6 = loop count for the inner loop, for 4 x 8 = 32 lines.     */
-        set      0, %l4
-
-
-/* Each iteration of the inner loop below writes 8 sequential lines.  This loop is iterated 4 times,
-   to move a total of 32 lines, all of which have the same value of PA[9], so we increment the base
-   address by 1024 bytes in each iteration, which varies PA[10].                                     */
-innerloop:
-	add	%o5, PF_FAR, %o3
-	prefetch [%o3],2
-	add	%o3, 64, %o3
-	prefetch [%o3],2
-	add	%o3, 64, %o3
-	prefetch [%o3],2
-	add	%o3, 64, %o3
-	prefetch [%o3],2
-	add	%o3, 64, %o3
-	prefetch [%o3],2
-	add	%o3, 64, %o3
-	prefetch [%o3],2 
-	add	%o3, 64, %o3
-	prefetch [%o3],2
-	add	%o3, 64, %o3
-	prefetch [%o3],2
-
-	mov	%o5, %o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY	!1st 64 byte line
-        add     %o5,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY	!3rd 64 byte line
-        add     %o5,8,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(2 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128 ,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(3 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(4 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(5 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(6 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(7 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(8 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(9 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(10 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(11 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(12 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(13 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(14 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(15 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-
-        add     %o5,256,%o5
-
-	mov	%o5, %o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY	!1st 64 byte line
-        add     %o5,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY	!3rd 64 byte line
-        add     %o5,8,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(2 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128 ,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(3 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(4 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(5 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(6 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(7 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(8 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(9 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(10 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(11 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(12 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(13 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(14 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(15 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-
-        subcc   %l6,1,%l6    /* Decrement the inner loop counter.         */
-
-        /* -------- Now increment by 256 + 512 so we don't toggle PA[9] -------- */
-        add     %o5, 768, %o5
-
-        bg,pt   %ncc,innerloop
-        nop
-/* ------------------------ END OF INNER LOOP -------------------------- */
-
-        subcc   %l5,1,%l5
-        add     %l3, 512, %o5       /* increment %o5 to first buddy line of dest.   */
-        bg,pt   %ncc,buddyloop
-	nop
-        add     %o5, 3584, %o5      /* Advance both base addresses to 4k above where they started. */
-                                        !%o5=next 4096 block.
-	add %o5,%g6,%i5
-	subcc %g5,%i5,%g0
-        bge,pt   %ncc,myloop2
-        nop
-
-
-	/****larryalg_end_here*************/
-
-	sub	%g5,%o5,%o2	!how many bytes are left
-	brz,pn	%o2,complete_write
-	mov	%g0,%g5
-	add     %o5,%o2,%o4             !calc the last byte to write into %o4
-	subcc	%o2,256,%g0
-	bge,pt	%ncc,memset_128
-	mov	%g0,%g5
-	
-	ba	memset_storeword
-	nop
-
-
-complete_write: 
-	brz,a	%g1, 1f			! was fprs.fef == 0
-        wr	%g1, %g0, %fprs         ! fprs = g1  restore fprs
-1:
-        ret				! %o0 was preserved
-	restore
-
-	.align	64
-memset_128:
-	add	%o5, 256, %o3
-	prefetch [%o3], 2	!1st 64 byte line of next 256 byte block
-	add	%o5, 384, %o3
-	prefetch [%o3], 2	!3rd 64 byte line of next 256 byte block
-	add	%o5, 320, %o3
-	prefetch [%o3], 2	!2nd 64 byte line of next 256 byte block
-	add	%o5, 448, %o3
-	prefetch [%o3], 2	!4th 64 byte line of next 256 byte block
-	mov	%o5, %o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY	!1st 64 byte line
-        add     %o5,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY	!3rd 64 byte line
-        add     %o5,8,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(2 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128 ,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(3 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(4 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(5 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(6 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(7 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(8 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(9 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(10 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(11 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(12 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(13 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(14 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,(15 * 8),%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-        add     %o5,512,%l4  !%l4=final byte of the next 256-byte block, to check if another lies ahead
-        add     %o3,128,%o3
-        stxa     %o1,[%o3]ASI_CACHE_SPARING_PRIMARY
-!this branch condition is not needed when handling bytes before the 4096b
-!boundary, because we only issue once, so %l6 holds invalid data.
-!the branch is really for handling bytes after 4096b, where there can be
-!multiple 256-byte blocks to work on.
-
-        subcc   %o4,%l4,%g0   !%o4=final byte location;%l4=final byte of next 256 byte block
-        bge,pt  %ncc,memset_128    !branch taken means next 256 byte block is still within the limit.
-        add     %o5,256,%o5
-
-!need to connect the rest of the program
-memset_storeword:
-        and     %o2,255,%o3
-        and     %o3,7,%o2
-
-	! Set the remaining doubles
-	subcc   %o3, 8, %o3		! Can we store any doubles?
-	bl,pn  %ncc, 6f
-	and	%o2, 7, %o2		! calc bytes left after doubles
-
-5:	
-	stxa	%o1, [%o5]ASI_CACHE_SPARING_PRIMARY
-	subcc   %o3, 8, %o3
-	bge,pt	%ncc, 5b
-        add     %o5, 8, %o5      
-6:
-	! Set the remaining bytes
-	brz	%o2,  complete_write		! safe to check all 64-bits
-	
-	! Terminate the copy with a partial store.
-	! The data should be at d0
-        dec     %o2                     ! needed to get the mask right
-	edge8n	%g0, %o2, %o4
-	stda	%d0, [%o5]%o4, ASI_PST8_P
-	
-	brz,a	%g1, 1f			! was fprs.fef == 0
-        wr	%g1, %g0, %fprs         ! fprs = g1  restore fprs
-1:
-        ret				! %o0 was preserved
-	restore
-
-
-	SET_SIZE(memset)
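
Two ideas above translate directly to C: the fill byte is replicated across a
64-bit pattern with three shift/or steps, and the unaligned head and tail are
handled separately (the job done above by a single edge8n-masked ASI_PST8_P
partial store). A sketch, with plain aligned stores standing in for the
cache-sparing stxa (memset_sketch is an illustrative name):

	#include <stdint.h>
	#include <stddef.h>
	#include <string.h>

	static void *
	memset_sketch(void *buf, int c, size_t len)
	{
		uint64_t pat = (uint8_t)c;
		uint8_t *p = buf;

		pat |= pat << 8;	/* 2 bytes of c */
		pat |= pat << 16;	/* 4 bytes of c */
		pat |= pat << 32;	/* 8 bytes of c */

		while (((uintptr_t)p & 7) != 0 && len != 0) {	/* head */
			*p++ = (uint8_t)c;
			len--;
		}
		for (; len >= 8; len -= 8, p += 8)	/* aligned dwords */
			memcpy(p, &pat, 8);	/* aliasing-safe stand-in */
		while (len != 0) {			/* 1-7 tail bytes */
			*p++ = (uint8_t)c;
			len--;
		}
		return (buf);
	}
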
--- a/usr/src/lib/libc/sparc_hwcap1/common/gen/misc.s	Thu Aug 06 17:19:00 2009 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,33 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/asm_linkage.h>
-
-	ENTRY(_rock_pause)
-	membar	#Halt
-	retl
-	nop
-	SET_SIZE(_rock_pause)
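
membar #Halt is the Rock-only barrier variant that parked the issuing
hardware strand; the Makefile changes below stop defining SMT_PAUSE_FUNCTION
to this routine. A hedged sketch of how such a pause hook is consumed by a
spin loop (spin_acquire is illustrative; only _rock_pause and the -D wiring
come from the source):

	#include <stdatomic.h>

	extern void _rock_pause(void);		/* the routine removed above */
	#define	SMT_PAUSE()	_rock_pause()	/* -DSMT_PAUSE_FUNCTION=_rock_pause */

	static void
	spin_acquire(atomic_flag *lock)
	{
		while (atomic_flag_test_and_set_explicit(lock,
		    memory_order_acquire))
			SMT_PAUSE();	/* park the strand, don't busy-issue */
	}
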
--- a/usr/src/lib/libc/sparc_hwcap1/common/gen/strcpy.s	Thu Aug 06 17:19:00 2009 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,340 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-	.file	"strcpy.s"
-
-/*
- * strcpy(s1, s2)
- *
- * Copy string s2 to s1.  s1 must be large enough. Return s1.
- *
- * Fast assembler language version of the following C-program strcpy
- * which represents the `standard' for the C-library.
- *
- *	char *
- *	strcpy(s1, s2)
- *	register char *s1;
- *	register const char *s2;
- *	{
- *		char *os1 = s1;
- *
- *		while(*s1++ = *s2++)
- *			;
- *		return(os1);
- *	}
- *
- */
-
-#include <sys/asm_linkage.h>
-
-	! This implementation of strcpy works by first checking the
-	! source alignment and copying byte, halfword, or word
-	! quantities until the source ptr is aligned at an extended
-	! word boundary.  Once this has occurred, the string is copied,
-	! checking for zero bytes, depending upon its dst ptr alignment.
-	! (methods for xword, word, half-word, and byte copies are present)
-
-#ifdef	__sparcv9
-#define	SAVESIZE	(8 * 3)
-#define	STACK_OFFSET	(STACK_BIAS + MINFRAME)
-#else
-#define	SAVESIZE	(8 * 5)
-#define	STACK_OFFSET	(STACK_BIAS + MINFRAME + 4)
-#endif
-
-#define LABEL_ADDRESS(label, reg)	 \
-	.pushlocals			;\
-0:	rd %pc, reg		  	;\
-	add reg, (label) - 0b, reg	;\
-	.poplocals
-
-offset_table:
-	.word	.storexword - offset_table	! Offset 0 => xword aligned
-	.word	.storebyte1241 - offset_table	! Offset 1 or 5
-	.word	.storehalfword - offset_table	! Offset 2 or 6
-	.word	.storebyte1421 - offset_table	! Offset 3 or 7
-	.word	.storeword - offset_table	! Offset 4
-
-	.align	64
-#ifdef	__sparcv9
-	.skip	20
-#else
-	.skip	12
-#endif
-
-	ENTRY(strcpy)
-	add	%sp, -SA(STACK_OFFSET + SAVESIZE), %sp
-#ifndef	__sparcv9
-	stx	%g4, [%sp + STACK_OFFSET + 24]
-	stx	%g5, [%sp + STACK_OFFSET + 32]
-#endif
-	sethi	%hi(0x01010101), %o4		! 0x01010000
-	sub	%o1, %o0, %o3		! src - dst
-	or	%o4, %lo(0x01010101), %o4	! 0x01010101
-	andcc	%o1, 7, %g5		! dword aligned ?
-	sllx	%o4, 32, %o5			! 0x01010101 << 32
-	mov	%o0, %o2		! save dst
-	or	%o4, %o5, %o4			! 0x0101010101010101
-
-	bz,pt	%ncc, .srcaligned	! yup
-	sllx	%o4, 7, %o5			! 0x8080808080808080
-
-	sub	%g0, %g5, %g4		! count = -off
-	ldx	[%o1 + %g4], %o1	! val = *(addr + -off)
-	mov	-1, %g1			! mask = -1
-	sllx	%g5, 3, %g4		! shift = off * 8
-	srlx	%g1, %g4, %g1		! -1 >> ((addr & 7) * 8)
-	orn	%o1, %g1, %o1		! val |= ~mask
-
-	andn	%o5, %o1, %g4		! ~val & 0x80
-	sub	%o1, %o4, %g1		! val - 0x01
-	andcc	%g4, %g1, %g4		! ~val & 0x80 & (val - 0x01)
-
-	sllx	%g5, 3, %g4
-	add	%o2, 8, %o2		! .zerobyte expects address = address + 8
-	bnz,a,pn	%xcc, .zerobyte ! Zero byte in the first xword
-	  sllx	%o1, %g4, %o1		! and data to be left justified
-
-	sub	%o2, 8, %o2
-	mov	8, %g4
-	sub	%g4, %g5, %g1		! Bytes to be written
-	sub	%g1, 1, %g4
-
-1:	stub	%o1, [%o2 + %g4]
-	dec	%g4
-	brgez,pt	%g4, 1b
-	srlx	%o1, 8, %o1
-
-	add	%o2, %g1, %o2		! Move ptr by #bytes written
-
-.srcaligned:
-	!! Check if the first dword contains zero after src is aligned
-	ldx	[%o2 + %o3], %o1	! x = src[]
-	andn	%o5, %o1, %g1		! ~x & 0x8080808080808080
-	sub	%o1, %o4, %g4		! x - 0x0101010101010101
-	andcc	%g4, %g1, %g0		! ((x - 0x0101010101010101) & ~x & 0x8080808080808080)
-	bnz,a,pn	%xcc, .zerobyte	! x has zero byte, handle end cases
-	  add	%o2, 8, %o2		! src += 8, dst += 8
-
-	!! Determine the destination offset and branch
-	!! to appropriate location
-	and	%o2, 3, %g4
-	and	%o2, 4, %g1
-	or	%g1, %g4, %g1
-	movrnz	%g4, 0, %g1
-	movrnz	%g1, 4, %g4
-
-	!! %g4 contains the index of the jump address
-	!! Load the address from the table.
-	LABEL_ADDRESS(offset_table, %g1)
-	sllx	%g4, 2, %g4
-	lduw	[%g1 + %g4], %g4
-	jmp	%g1 + %g4
-	add	%o2, 8, %o2		! src += 8, dst += 8
-
-.storexword:
-	stx	%o1, [%o2 - 8]		! store word to dst (address pre-incremented)
-
-1:
-	ldx	[%o2 + %o3], %o1	! src dword
-	add	%o2, 8, %o2		! src += 8, dst += 8
-	andn	%o5, %o1, %g1		! ~dword & 0x8080808080808080
-	sub	%o1, %o4, %g4		! dword - 0x0101010101010101
-	andcc	%g4, %g1, %g0		! ((dword - 0x0101010101010101) & ~dword & 0x8080808080808080)
-	bz,a,pt	%xcc, 1b		! no zero byte if magic expression == 0
-	  stx	%o1, [%o2 - 8]		! store word to dst (address pre-incremented)
-
-	ba,a	.zerobyte
-
-.storebyte1421:
-	!! Offset 3 or 7
-	srlx	%o1, 56, %g1		! %g1<7:0> = first byte; word aligned now
-	stb	%g1, [%o2 - 8]		! store first byte
-	srlx	%o1, 24, %g1		! %g1<31:0> = bytes 2, 3, 4, 5
-	stw	%g1, [%o2 - 7]		! store bytes 2, 3, 4, 5
-	srlx	%o1, 8, %g1		! %g1<15:0> = bytes 6, 7
-	sth	%g1, [%o2 - 3]		! store bytes 6, 7
-
-	stx	%l0, [%sp + STACK_OFFSET + 0]
-	and	%o2, 7, %g1
-	stx	%l1, [%sp + STACK_OFFSET + 8]
-	cmp	%g1, 3
-	stx	%l2, [%sp + STACK_OFFSET + 16]
-
-	move	%ncc, 40, %l0
-	move	%ncc, 24, %l1
-	move	%ncc, -11, %l2
-
-	movne	%ncc, 8, %l0
-	movne	%ncc, 56, %l1
-	movne	%ncc, -15, %l2
-
-	ba	.dstaligned
-	mov	%o1, %g5
-
-.storebyte1241:
-	!! Offset 1 or 5
-	srlx	%o1, 56, %g1		! %g1<7:0> = first byte; word aligned now
-	stb	%g1, [%o2 - 8]		! store first byte
-	srlx	%o1, 40, %g1		! %g1<15:0> = bytes 2, 3
-	sth	%g1, [%o2 - 7]		! store bytes 2, 3
-	srlx	%o1, 8, %g1		! %g1<31:0> = bytes 4, 5, 6, 7
-	stw	%g1, [%o2 - 5]		! store bytes 4, 5, 6, 7
-
-	stx	%l0, [%sp + STACK_OFFSET + 0]
-	and	%o2, 7, %g1
-	stx	%l1, [%sp + STACK_OFFSET + 8]
-	cmp	%g1, 1
-	stx	%l2, [%sp + STACK_OFFSET + 16]
-
-	move	%ncc, 56, %l0
-	move	%ncc, 8, %l1
-	move	%ncc, -9, %l2
-
-	movne	%ncc, 24, %l0
-	movne	%ncc, 40, %l1
-	movne	%ncc, -13, %l2
-
-	ba	.dstaligned
-	mov	%o1, %g5
-
-.storehalfword:
-	srlx	%o1, 48, %g1		! get first and second byte
-	sth	%g1, [%o2 - 8]		! store first and second byte; word aligned now
-	srlx	%o1, 16, %g1		! %g1<31:0> = bytes 3, 4, 5, 6
-	stw	%g1, [%o2 - 6]		! store bytes 3, 4, 5, 6
-
-	stx	%l0, [%sp + STACK_OFFSET + 0]
-	and	%o2, 7, %g1
-	stx	%l1, [%sp + STACK_OFFSET + 8]
-	cmp	%g1, 2
-	stx	%l2, [%sp + STACK_OFFSET + 16]
-
-	move	%ncc, 48, %l0
-	move	%ncc, 16, %l1
-	move	%ncc, -10, %l2
-
-	movne	%ncc, 16, %l0
-	movne	%ncc, 48, %l1
-	movne	%ncc, -14, %l2
-
-	ba	.dstaligned
-	mov	%o1, %g5
-
-.storeword:
-	srlx	%o1, 32, %g1		! get bytes 1,2,3,4
-	stw	%g1, [%o2 - 8]		! store bytes 1,2,3,4 (address is pre-incremented)
-
-	stx	%l0, [%sp + STACK_OFFSET + 0]
-	mov	32, %l0			! Num of bits to be shifted left
-	stx	%l1, [%sp + STACK_OFFSET + 8]
-	mov	32, %l1			! Num of bits to be shifted right
-	stx	%l2, [%sp + STACK_OFFSET + 16]
-	mov	-12, %l2		! -offset
-	mov	%o1, %g5
-
-	nop	! Do not delete. Used for alignment.
-.dstaligned:
-	ldx	[%o2 + %o3], %o1	! x = src[]
-	add	%o2, 8, %o2		! src += 8, dst += 8
-	andn	%o5, %o1, %g1		! ~x & 0x8080808080808080
-	sub	%o1, %o4, %g4		! x - 0x0101010101010101
-	andcc	%g4, %g1, %g0		! ((x - 0x0101010101010101) & ~x & 0x8080808080808080)
-	bnz,a,pn %xcc, .finishup	! x has zero byte, handle end cases
-	  stb	%g5, [%o2 - 9]
-
-	sllx	%g5, %l0, %g5
-	srlx	%o1, %l1, %g4
-	or	%g5, %g4, %g5
-
-	stx	%g5, [%o2 + %l2]
-	ba	.dstaligned
-	mov	%o1, %g5
-
-.finishup:
-	cmp	%l0, 56
-	be,pn	%ncc, .zerobyte_restore
-	andcc	%o2, 1, %g0
-	bnz,a	%ncc, 1f
-	  srlx	%g5, 8, %g5
-
-1:	srlx	%l1, 4, %g4	! g4 contains 1, 2 or 3
-	sub	%g4, 1, %g4	! multiple of 16
-	sllx	%g4, 4, %g4	! How many bits to shift
-	srlx	%g5, %g4, %l0
-	add	%o2, %l2, %g1
-
-2:	sth	%l0, [%g1]
-	sub	%g4, 16, %g4
-	add	%g1, 2, %g1
-	brgez,a,pt	%g4, 2b
-	  srlx	%g5, %g4, %l0
-
-.zerobyte_restore:
-	ldx	[%sp + STACK_OFFSET + 0], %l0
-	andn	%o5, %o1, %o3		! ~val & 0x80
-	ldx	[%sp + STACK_OFFSET + 8], %l1
-	sub	%o1, %o4, %g1		! val - 0x01
-	ldx	[%sp + STACK_OFFSET + 16], %l2
-
-	ba	1f
-	andcc	%o3, %g1, %o3		! ~val & 0x80 & (val - 0x01)
-
-.zerobyte:
-	!! %o5:	0x8080808080808080
-	!! %o4: 0x0101010101010101
-	!! %o1: Left-justified dword that contains the 0 byte
-	!! %o2: Address to be written + 8
-
-	andn	%o5, %o1, %o3		! ~val & 0x80
-	sub	%o1, %o4, %g1		! val - 0x01
-	andcc	%o3, %g1, %o3		! ~val & 0x80 & (val - 0x01)
-
-1:	srlx	%o3, 7, %o3		! shift 0x80 -> 0x01
-	andn	%o3, %o1, %o3		! mask off leading 0x01 bytes
-	lzd	%o3, %o4		! 7, 15, ... 63
-
-	mov	64, %o5			! Calc # of bytes to be discarded
-	inc	%o4			! Include the zero byte too
-	sub	%o5, %o4, %o5		! after the null byte
-	sub	%o2, 8, %o2		! Adjust address which is +8 here.
-	srlx	%o1, %o5, %o1		! Discard them
-
-	srlx	%o4, 3, %o4		! Bits to bytes to be written
-	dec	%o4			! dec 1 to use it as offset
-
-2:	stub	%o1, [%o2 + %o4]
-	dec	%o4
-	brgez,pt %o4, 2b
-	srlx	%o1, 8, %o1
-
-#ifndef	__sparcv9
-	ldx	[%sp + STACK_OFFSET + 24], %g4
-	ldx	[%sp + STACK_OFFSET + 32], %g5
-#endif
-	retl				! done with leaf function
-	add	%sp, SA(STACK_OFFSET + SAVESIZE), %sp
-	SET_SIZE(strcpy)
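
The jump through offset_table above is a plain dispatch on destination
alignment once the source is 8-byte aligned; in C it amounts to a switch on
dst & 7 (pick_path and the enum are illustrative names for the five .store*
targets):

	#include <stdint.h>

	enum store_path { XWORD, BYTE1241, HALFWORD, BYTE1421, WORD };

	static enum store_path
	pick_path(uintptr_t dst)
	{
		switch (dst & 7) {
		case 0:		return (XWORD);		/* stx */
		case 1: case 5:	return (BYTE1241);	/* stb, sth, stw */
		case 2: case 6:	return (HALFWORD);	/* sth, stw */
		case 3: case 7:	return (BYTE1421);	/* stb, stw, sth */
		default:	return (WORD);		/* offset 4: stw */
		}
	}
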
--- a/usr/src/lib/libc/sparc_hwcap1/common/gen/strlen.s	Thu Aug 06 17:19:00 2009 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,127 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-	.file	"strlen.s"
-
-/*
- * strlen(s)
- *
- * Given string s, return length (not including the terminating null).
- *
- * Fast assembler language version of the following C-program strlen
- * which represents the `standard' for the C-library.
- *
- *	size_t
- *	strlen(s)
- *	register const char *s;
- *	{
- *		register const char *s0 = s + 1;
- *
- *		while (*s++ != '\0')
- *			;
- *		return (s - s0);
- *	}
- */
-
-#include <sys/asm_linkage.h>
-
-	/*
-	 * There are two key optimizations in the routine below.
-	 * First, all memory accesses are 8 bytes wide.  The time
-	 * for long strings is dominated by the latency of load
-	 * instructions in the inner loop, and going 8 bytes at
-	 * a time means 1/8th as much latency.
-	 *
-	 * Scanning an 8 byte word for a '\0' is made fast by
-	 * this formula (due to Alan Mycroft):
-	 *     ~x & 0x8080808080808080 & (x - 0x0101010101010101)
-	 * The result of this formula is non-zero iff there's
-	 * a '\0' somewhere in x.
-	 *
-	 * Second, the cost of short strings is dominated by the
-	 * cost of figuring out which byte out of the last 8
-	 * contained the '\0' that terminated the string.  We use
-	 * properties of the formula above to convert scanning the
-	 * word for '\0' into a single LZD instruction.
-	 */
-	.align	64
-	.skip	4*4	! force .strlen_findnull to align to 64 bytes
-	ENTRY_NP(strlen)
-	and	%o0, 7, %o3			! off = addr & 7
-	sethi	%hi(0x01010101), %o4		! 0x01010000
-
-	sub	%g0, %o3, %o2			! count = -off
-	or	%o4, %lo(0x01010101), %o4	! 0x01010101
-
-	ldx	[%o0 + %o2], %o1		! val = *(addr + count)
-	sllx	%o4, 32, %o5			! 0x01010101 << 32
-
-	mov	-1, %g1				! mask = -1
-	sllx	%o3, 3, %o3			! shift = off * 8
-
-	or	%o4, %o5, %o4			! 0x0101010101010101
-	srlx	%g1, %o3, %g1			! -1 >> ((addr & 7) * 8)
-
-	sllx	%o4, 7, %o5			! 0x8080808080808080
-	orn	%o1, %g1, %o1			! val |= ~mask
-.strlen_findnull:
-	!! %o0 - base address
-	!! %o1 - xword from memory
-	!! %o2 - index
-	!! %o3 - result of test for '\0'
-	!! %o4 - constant 0x0101.0101.0101.0101
-	!! %o5 - constant 0x8080.8080.8080.8080
-	!! %g1 - scratch
-	andn	%o5, %o1, %o3		! ~val & 0x80
-	sub	%o1, %o4, %g1		! val - 0x01
-	andcc	%o3, %g1, %o3		! ~val & 0x80 & (val - 0x01)
-	inc	8, %o2
-	bz,a,pt	%xcc, .strlen_findnull
-	  ldx	[%o0 + %o2], %o1
-
-	/*
-	 * The result of Mycroft's formula is a pattern of 0x80 and
-	 * 0x00 bytes.  There's a 0x80 at every byte position where
-	 * there was a '\0' character, but a string of 0x01 bytes
-	 * immediately preceding a '\0' becomes a corresponding
-	 * string of 0x80 bytes.  (e.g. 0x0101010101010100 becomes
-	 * 0x8080808080808080).  We need one final step to discount
-	 * any leading 0x01 bytes, and then LZD can tell us how many
-	 * characters there were before the terminating '\0'.
-	 */
-	!! %o1 - last data word
-	!! %o2 - length+8, plus 1-8 extra
-	!! %o3 - xword with 0x80 for each 0x00 byte and leading 0x01
-	sub	%o2, 8, %o2		! subtract off '\0' and last 8
-	srlx	%o3, 7, %o3		! shift 0x80 -> 0x01
-	andn	%o3, %o1, %o3		! mask off leading 0x01 bytes
-	lzd	%o3, %o3		! 7, 15, ... 63
-	srlx	%o3, 3, %o3		! 0 ... 7
-
-	retl
-	add	%o2, %o3, %o0		! add back bytes before '\0'
-
-	SET_SIZE(strlen)
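
Both documented optimizations carry over to C almost verbatim. A sketch for a
big-endian machine with an 8-byte-aligned argument (strlen_sketch is an
illustrative name; __builtin_clzll stands in for the LZD instruction):

	#include <stdint.h>
	#include <stddef.h>

	#define	ONES	0x0101010101010101ULL
	#define	HIGHS	0x8080808080808080ULL

	static size_t
	strlen_sketch(const char *s)	/* big-endian, s 8-byte aligned */
	{
		const uint64_t *w = (const uint64_t *)(const void *)s;
		size_t n = 0;
		uint64_t x, m;

		for (;;) {
			x = *w++;
			m = ~x & HIGHS & (x - ONES);	/* != 0 iff '\0' in x */
			if (m != 0)
				break;
			n += 8;
		}
		m = (m >> 7) & ~x;	/* drop false hits on 0x01 bytes */
		return (n + (size_t)__builtin_clzll(m) / 8);
	}
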
--- a/usr/src/lib/libc/sparc_hwcap1/sparc/Makefile	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/lib/libc/sparc_hwcap1/sparc/Makefile	Thu Aug 06 17:39:39 2009 -0700
@@ -28,8 +28,7 @@
 
 LIBRARY=	libc_hwcap1.a
 
-EXTN_CPPFLAGS=	-DSMT_PAUSE_FUNCTION=_rock_pause \
-		-I$(SRC)/uts/sun4v -I$(ROOT)/usr/platform/sun4v/include
+EXTN_CPPFLAGS=	-I$(SRC)/uts/sun4v -I$(ROOT)/usr/platform/sun4v/include
 EXTN_ASFLAGS=	-xarch=v8plusd
 EXTN_DYNFLAGS=	-M mapfile
 
@@ -40,10 +39,10 @@
 
 PRFOBJS=		\
 	memcpy.o	\
+	memmove.o	\
 	memset.o	\
 	strlen.o	\
 	strcpy.o	\
-	misc.o
 
 MAPFILE_AUX =	mapfile-vers-aux
 
--- a/usr/src/lib/libc/sparc_hwcap1/sparcv9/Makefile	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/lib/libc/sparc_hwcap1/sparcv9/Makefile	Thu Aug 06 17:39:39 2009 -0700
@@ -27,8 +27,7 @@
 
 LIBRARY=	libc_hwcap1.a
 
-EXTN_CPPFLAGS=	-DSMT_PAUSE_FUNCTION=_rock_pause \
-		-I$(SRC)/uts/sun4v -I$(ROOT)/usr/platform/sun4v/include
+EXTN_CPPFLAGS=	-I$(SRC)/uts/sun4v -I$(ROOT)/usr/platform/sun4v/include
 EXTN_ASFLAGS=	-xarch=v9d
 EXTN_DYNFLAGS=	-M mapfile
 
@@ -39,10 +38,10 @@
 
 PRFOBJS=		\
 	memcpy.o	\
+	memmove.o	\
 	memset.o	\
 	strlen.o	\
 	strcpy.o	\
-	misc.o
 
 MAPFILE_AUX =	mapfile-vers-aux
 
--- a/usr/src/lib/libdisasm/sparc/dis_sparc_fmt.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/lib/libdisasm/sparc/dis_sparc_fmt.c	Thu Aug 06 17:39:39 2009 -0700
@@ -20,12 +20,12 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
- * Copyright 2008 Jason King.  All rights reserved.
+ * Copyright 2009 Jason King.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -389,15 +389,15 @@
 	uint32_t op3:6;
 	uint32_t rs1:5;
 	uint32_t i:1;
-	uint32_t undef:5;
-	uint32_t cmask:4;
+	uint32_t undef:6;
+	uint32_t cmask:3;
 	uint32_t mmask:4;
 } formatmbr_t;
 #elif defined(_BIT_FIELDS_LTOH)
 typedef struct formatmbr {
 	uint32_t mmask:4;
-	uint32_t cmask:4;
-	uint32_t undef:5;
+	uint32_t cmask:3;
+	uint32_t undef:6;
 	uint32_t i:1;
 	uint32_t rs1:5;
 	uint32_t op3:6;
@@ -566,8 +566,8 @@
 	"#LoadLoad", "#StoreLoad", "#LoadStore", "#StoreStore"
 };
 
-static const char *membar_cmask[4] = {
-	"#Lookaside", "#MemIssue", "#Sync", "#Halt"
+static const char *membar_cmask[3] = {
+	"#Lookaside", "#MemIssue", "#Sync"
 };
 
 /* v8 ancillary state register names */
@@ -592,15 +592,15 @@
 	"%pcr",		"%pic",		"%dcr",	"%gsr",
 	"%softint_set",	"%softint_clr",	"%softint",	"%tick_cmpr",
 	"%stick",	"%stick_cmpr",	NULL,	NULL,
-	"%cps",		NULL,		NULL,	NULL
+	NULL,		NULL,		NULL,	NULL
 };
 /*
  * on v9, only certain registers are valid for reading or writing;
  * these are bitmasks corresponding to which registers are valid in which
- * case
+ * case. Any access to %dcr is illegal.
  */
-static const uint32_t v9_asr_rdmask = 0x13cb007d;
-static const uint32_t v9_asr_wrmask = 0x13fb004d;
+static const uint32_t v9_asr_rdmask = 0x03cb007d;
+static const uint32_t v9_asr_wrmask = 0x03fb004d;
 
 /* privileged register names on v9 */
 /* TODO: compat - NULL to %priv_nn */
@@ -617,7 +617,7 @@
 
 /* hyper privileged register names on v9 */
 static const char *v9_hprivreg_names[32] = {
-	"%hpstate",	 "%htstate",	"%hrstba",  "%hintp",
+	"%hpstate",	 "%htstate",	NULL,  "%hintp",
 	NULL,	"%htba",	 "%hver",  NULL,
 	NULL,	NULL,	NULL,	NULL,
 	NULL,	NULL,	NULL,	NULL,
@@ -629,8 +629,8 @@
 
 static const uint32_t v9_pr_rdmask = 0x80017fff;
 static const uint32_t v9_pr_wrmask = 0x00017fff;
-static const uint32_t v9_hpr_rdmask = 0x8000006f;
-static const uint32_t v9_hpr_wrmask = 0x8000006f;
+static const uint32_t v9_hpr_rdmask = 0x8000006b;
+static const uint32_t v9_hpr_wrmask = 0x8000006b;
 
 static const char *prefetch_str[32] = {
 	"#n_reads", "#one_read",
@@ -784,7 +784,6 @@
 	int32_t disp;
 	uint32_t flags = inp->in_data.in_def.in_flags;
 	int octal = ((dhp->dh_flags & DIS_OCTAL) != 0);
-	int chkpt = 0;
 
 	if ((dhp->dh_debug & DIS_DEBUG_PRTFMT) != 0) {
 		prt_field("op", f->f2.op, 2);
@@ -822,13 +821,6 @@
 		flags = FLG_RS1(REG_NONE)|FLG_DISP(DISP19);
 	}
 
-	if (f->f2b.op2 == 0x01 && f->f2b.a == 1 &&
-	    f->f2b.p == 0 && f->f2b.cond == 0x8 && f->f2b.cc == 0x01) {
-		name = "chkpt";
-		flags = FLG_RS1(REG_NONE)|FLG_DISP(DISP19);
-		chkpt = 1;
-	}
-
 
 	switch (FLG_DISP_VAL(flags)) {
 	case DISP22:
@@ -867,11 +859,7 @@
 		}
 	}
 
-	if (!chkpt) {
-		(void) snprintf(buf, sizeof (buf), "%s%s%s", name, annul, pred);
-	} else {
-		(void) snprintf(buf, sizeof (buf), "%s", name);
-	}
+	(void) snprintf(buf, sizeof (buf), "%s%s%s", name, annul, pred);
 	prt_name(dhp, buf, 1);
 
 
@@ -884,19 +872,11 @@
 		break;
 
 	case DISP19:
-		if (!chkpt) {
-			bprintf(dhp,
-			    (octal != 0) ? "%s, %s0%-5lo <" :
-			    "%s, %s0x%-04lx <",
-			    r,
-			    (disp < 0) ? "-" : "+",
-			    (disp < 0) ? (-disp) : disp);
-		} else {
-			bprintf(dhp,
-			    (octal != 0) ? "%s0%-5lo <" : "%s0x%-04lx <",
-			    (disp < 0) ? "-" : "+",
-			    (disp < 0) ? (-disp) : disp);
-		}
+		bprintf(dhp,
+		    (octal != 0) ? "%s, %s0%-5lo <" :
+		    "%s, %s0x%-04lx <", r,
+		    (disp < 0) ? "-" : "+",
+		    (disp < 0) ? (-disp) : disp);
 		break;
 
 	case DISP16:
@@ -1328,7 +1308,7 @@
 
 			first = 0;
 
-			for (i = 0; i < 5; ++i) {
+			for (i = 0; i < 4; ++i) {
 				if ((f->fmb.cmask & (1L << i)) != 0) {
 					bprintf(dhp, "%s%s",
 					    (first != 0) ? "|" : "",
@@ -1503,7 +1483,6 @@
 
 	int v9 = ((dhp->dh_flags & (DIS_SPARC_V9|DIS_SPARC_V9_SGI)) != 0);
 	int p_rs1, p_t;
-	char failstr[8] = "fail";
 
 	if (f->ftcc.undef != 0)
 		return (-1);
@@ -1530,26 +1509,13 @@
 		    (p_rs1 != 0) ? " + " : "",
 		    (p_t != 0) ? reg_names[f->f3.rs2] : "");
 	} else {
-		if ((p_rs1 == 0) && (f->ftcc.immtrap == 0xF)) {
-		(void) strlcat(failstr,
-		    (const char *)&(inp->in_data.in_def.in_name[1]),
-		    sizeof (failstr));
-
-		prt_name(dhp, failstr, 1);
-		bprintf(dhp, "%s%s%s",
-		    (v9 != 0) ? icc_names[f->ftcc2.cc] : "",
-		    (p_rs1 != 0) ? reg_names[f->ftcc2.rs1] : "",
-		    (p_rs1 != 0) ? " + " : "");
-		} else {
 		bprintf(dhp, "%-9s %s%s%s%s0x%x", inp->in_data.in_def.in_name,
 		    (v9 != 0) ? icc_names[f->ftcc2.cc] : "",
 		    (v9 != 0) ? ", " : "",
 		    (p_rs1 != 0) ? reg_names[f->ftcc2.rs1] : "",
 		    (p_rs1 != 0) ? " + " : "",
 		    f->ftcc.immtrap);
-		}
 	}
-
 	return (0);
 }
 
@@ -1894,17 +1860,9 @@
 		return (0);
 
 	case 0x3b:
-		if (f->f3.rd == 1) {
-			/* flusha */
-			prt_name(dhp, "flusha", 1);
-			prt_address(dhp, instr, 0);
-			(void) strlcat(dhp->dh_buf, " ", dhp->dh_buflen);
-			prt_asi(dhp, instr);
-		} else {
-			/* flush */
-			prt_name(dhp, name, 1);
-			prt_address(dhp, instr, 0);
-		}
+		/* flush */
+		prt_name(dhp, name, 1);
+		prt_address(dhp, instr, 0);
 		return (0);
 
 	case 0x3c:
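
For context on the bitfield fix near the top of this file: V9 membar encodes
mmask in bits 3:0 and cmask in bits 6:4 of the instruction word, so cmask is
only three bits wide; the fourth bit named #Halt existed only as the Rock
extension being removed. A sketch of the corrected decode (print_membar_masks
is an illustrative helper, not libdisasm code):

	#include <stdint.h>
	#include <stdio.h>

	static void
	print_membar_masks(uint32_t instr)
	{
		uint32_t mmask = instr & 0xf;		/* bits 3:0 */
		uint32_t cmask = (instr >> 4) & 0x7;	/* bits 6:4 */
		static const char *mm[4] = {
			"#LoadLoad", "#StoreLoad", "#LoadStore", "#StoreStore"
		};
		static const char *cm[3] = {
			"#Lookaside", "#MemIssue", "#Sync"
		};
		int i;

		for (i = 0; i < 4; i++)
			if (mmask & (1u << i))
				(void) printf("%s ", mm[i]);
		for (i = 0; i < 3; i++)
			if (cmask & (1u << i))
				(void) printf("%s ", cm[i]);
		(void) printf("\n");
	}
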
--- a/usr/src/lib/libdisasm/sparc/instr.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/lib/libdisasm/sparc/instr.c	Thu Aug 06 17:39:39 2009 -0700
@@ -20,12 +20,12 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
- * Copyright 2007 Jason King.  All rights reserved.
+ * Copyright 2009 Jason King.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -155,12 +155,12 @@
 };
 
 static const inst_t BPr_table_def[16] = {
-	INST("brnr",  V9, FLG_PRED|FLG_DISP(DISP16)|FLG_RS1(REG_INT)),
+	INVALID,
 	INST("brz",   V9|V9S, FLG_PRED|FLG_DISP(DISP16)|FLG_RS1(REG_INT)),
 	INST("brlez", V9|V9S, FLG_PRED|FLG_DISP(DISP16)|FLG_RS1(REG_INT)),
 	INST("brlz",  V9|V9S, FLG_PRED|FLG_DISP(DISP16)|FLG_RS1(REG_INT)),
 
-	INST("brr",  V9, FLG_PRED|FLG_DISP(DISP16)|FLG_RS1(REG_INT)),
+	INVALID,
 	INST("brnz",  V9|V9S, FLG_PRED|FLG_DISP(DISP16)|FLG_RS1(REG_INT)),
 	INST("brgz",  V9|V9S, FLG_PRED|FLG_DISP(DISP16)|FLG_RS1(REG_INT)),
 	INST("brgez", V9|V9S, FLG_PRED|FLG_DISP(DISP16)|FLG_RS1(REG_INT)),
@@ -483,10 +483,7 @@
 
 	/* 0x10 */
 	INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID,
-	INVALID, INVALID, INVALID, INVALID, INVALID, INVALID,
-	INST("commit", V9, 0),
-	INVALID
-
+	INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID
 };
 
 static const table_t tr_table = {
@@ -637,12 +634,7 @@
 	INST("fsqrtq", VALL,
 		FLG_P1(REG_NONE)|FLG_P2(REG_FPQ)|FLG_NOIMM|FLG_P3(REG_FPQ)),
 
-	INVALID,
-	INST("frsqrt1xs", V9,
-		FLG_P1(REG_NONE)|FLG_P2(REG_FPQ)|FLG_NOIMM|FLG_P3(REG_FPQ)),
-	INST("frsqrt1xd", VALL,
-		FLG_P1(REG_NONE)|FLG_P2(REG_FPD)|FLG_NOIMM|FLG_P3(REG_FPD)),
-	INVALID,
+	INVALID, INVALID, INVALID, INVALID,
 
 	/* 0x30 */
 	INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID,
@@ -683,31 +675,11 @@
 		FLG_P1(REG_FPQ)|FLG_P2(REG_FPQ)|FLG_NOIMM|FLG_P3(REG_FPQ)),
 
 	/* 0x050 */
-	INVALID,
-	INST("fnadds", V9S,
-		FLG_P1(REG_FP)|FLG_P2(REG_FP)|FLG_NOIMM|FLG_P3(REG_FP)),
-	INST("fnaddd", V9S,
-		FLG_P1(REG_FPD)|FLG_P2(REG_FPD)|FLG_NOIMM|FLG_P3(REG_FPD)),
-	INVALID, INVALID, INVALID, INVALID, INVALID,
-	INVALID,
-	INST("fnmuls", V9S,
-		FLG_P1(REG_FP)|FLG_P2(REG_FP)|FLG_NOIMM|FLG_P3(REG_FP)),
-	INST("fnmuld", V9S,
-		FLG_P1(REG_FPD)|FLG_P2(REG_FPD)|FLG_NOIMM|FLG_P3(REG_FPD)),
-	INVALID, INVALID, INVALID, INVALID, INVALID,
+	INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID,
+	INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID,
 
 	/* 0x060 */
-	INVALID,
-	INST("fhadds", V9,
-		FLG_P1(REG_FP)|FLG_P2(REG_FP)|FLG_NOIMM|FLG_P3(REG_FP)),
-	INST("fhaddd", V9,
-		FLG_P1(REG_FPD)|FLG_P2(REG_FPD)|FLG_NOIMM|FLG_P3(REG_FPD)),
-	INVALID, INVALID,
-	INST("fhsubs", V9S,
-		FLG_P1(REG_FP)|FLG_P2(REG_FP)|FLG_NOIMM|FLG_P3(REG_FP)),
-	INST("fhsubd", V9S,
-		FLG_P1(REG_FPD)|FLG_P2(REG_FPD)|FLG_NOIMM|FLG_P3(REG_FPD)),
-	INVALID,
+	INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID,
 
 	/* 0x068 */
 	INVALID,
@@ -723,16 +695,8 @@
 	INVALID,
 
 	/* 0x070 */
-	INVALID,
-	INST("fnhadds", V9S,
-		FLG_P1(REG_FP)|FLG_P2(REG_FP)|FLG_NOIMM|FLG_P3(REG_FP)),
-	INST("fnhaddd", V9S,
-		FLG_P1(REG_FPD)|FLG_P2(REG_FPD)|FLG_NOIMM|FLG_P3(REG_FPD)),
-	INVALID, INVALID, INVALID, INVALID, INVALID,
-	INVALID,
-	INST("fnsmuld", V9S,
-		FLG_P1(REG_FP)|FLG_P2(REG_FP)|FLG_NOIMM|FLG_P3(REG_FPD)),
-	INVALID, INVALID, INVALID, INVALID, INVALID, INVALID,
+	INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID,
+	INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID,
 
 	/* 0x080 */
 	INVALID,
@@ -1600,40 +1564,6 @@
 	.tbl_inp   = fused_table_def
 };
 
-static const inst_t unfused_table_def[16] = {
-	/* 0x0 */
-	INVALID,
-	INST("fumadds", V9, FLG_P1(REG_FP)),
-	INST("fumaddd", V9, FLG_P1(REG_FPD)),
-	INVALID,
-
-	/* 0x4 */
-	INVALID,
-	INST("fumsubs", V9, FLG_P1(REG_FP)),
-	INST("fumsubd", V9, FLG_P1(REG_FPD)),
-	INVALID,
-
-	/* 0x8 */
-	INVALID,
-	INST("fnumsubs", V9, FLG_P1(REG_FP)),
-	INST("fnumsubd", V9, FLG_P1(REG_FPD)),
-	INVALID,
-
-	/* 0xc */
-	INVALID,
-	INST("fnumadds", V9, FLG_P1(REG_FP)),
-	INST("fnumaddd", V9, FLG_P1(REG_FPD)),
-	INVALID
-};
-
-static const table_t unfused_table = {
-	.tbl_field = 8,
-	.tbl_len   = 4,
-	.tbl_ovp   = NULL,
-	.tbl_fmt   = fmt_fused,
-	.tbl_inp   = unfused_table_def
-};
-
 static const inst_t alu_table_def[64] = {
 	/* 0x00 */
 	INST("add",		VALL,	0),
@@ -1722,7 +1652,7 @@
 	INST("save",		VALL,	0),
 	INST("restore",		VALL,	0),
 	TABLE(tr_table,		V9|V9S),
-	TABLE(unfused_table,	V9|V9S)
+	INVALID
 };
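
The tables trimmed here drive a simple recursive decode: a slot is either a
leaf mnemonic or a nested table selected by a bit field of the instruction
word. A sketch of that dispatch shape (field layout simplified; the real
inst_t and table_t carry more members):

	#include <stdint.h>

	typedef struct table table_t;

	typedef struct inst {
		const char	*in_name;	/* NULL for an INVALID slot */
		const table_t	*in_tbl;	/* non-NULL: nested TABLE() */
	} inst_t;

	struct table {
		int		tbl_field;	/* low bit of selector field */
		int		tbl_len;	/* selector width in bits */
		const inst_t	*tbl_inp;	/* 2^tbl_len entries */
	};

	static const char *
	decode(const table_t *t, uint32_t instr)
	{
		for (;;) {
			uint32_t idx = (instr >> t->tbl_field) &
			    ((1u << t->tbl_len) - 1);
			const inst_t *in = &t->tbl_inp[idx];

			if (in->in_tbl != NULL) {	/* keep walking */
				t = in->in_tbl;
				continue;
			}
			return (in->in_name != NULL ? in->in_name : "invalid");
		}
	}
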
 
 
--- a/usr/src/lib/libprtdiag/common/display_sun4v.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/lib/libprtdiag/common/display_sun4v.c	Thu Aug 06 17:39:39 2009 -0700
@@ -106,8 +106,6 @@
 static void sun4v_env_print_current_indicators();
 static void sun4v_env_print_voltage_sensors();
 static void sun4v_env_print_voltage_indicators();
-static void sun4v_env_print_humidity_sensors();
-static void sun4v_env_print_humidity_indicators();
 static void sun4v_env_print_LEDs();
 static void sun4v_print_fru_status();
 static int is_fru_absent(picl_nodehdl_t);
@@ -1101,16 +1099,6 @@
 
 	class_node_found = 0;
 	all_status_ok = 1;
-	sun4v_env_print_humidity_sensors();
-	exit_code |= (!all_status_ok);
-
-	class_node_found = 0;
-	all_status_ok = 1;
-	sun4v_env_print_humidity_indicators();
-	exit_code |= (!all_status_ok);
-
-	class_node_found = 0;
-	all_status_ok = 1;
 	sun4v_env_print_LEDs();
 	exit_code |= (!all_status_ok);
 
@@ -1737,68 +1725,6 @@
 }
 
 static void
-sun4v_env_print_humidity_sensors()
-{
-	char *fmt = "%-34s %-14s %-10s\n";
-	(void) picl_walk_tree_by_class(phyplatformh,
-	    PICL_CLASS_HUMIDITY_SENSOR,
-	    (void *)PICL_PROP_HUMIDITY,
-	    sun4v_env_print_sensor_callback);
-	if (!class_node_found)
-		return;
-	log_printf("\nHumidity sensors:\n");
-	if (syserrlog == 0) {
-		(void) picl_walk_tree_by_class(phyplatformh,
-		    PICL_CLASS_HUMIDITY_SENSOR,
-		    PICL_PROP_HUMIDITY, sun4v_env_print_sensor_callback);
-		if (all_status_ok) {
-			log_printf("All humidity sensors are OK.\n");
-			return;
-		}
-	}
-	log_printf("-------------------------------------------------"
-	    "-----------\n");
-	log_printf(fmt, "Location", "Sensor", "Status", 0);
-	log_printf("-------------------------------------------------"
-	    "-----------\n");
-	(void) picl_walk_tree_by_class(phyplatformh,
-	    PICL_CLASS_HUMIDITY_SENSOR,
-	    (void *)PICL_PROP_HUMIDITY,
-	    sun4v_env_print_sensor_callback);
-}
-
-static void
-sun4v_env_print_humidity_indicators()
-{
-	char *fmt = "%-34s %-14s %-8s\n";
-	(void) picl_walk_tree_by_class(phyplatformh,
-	    PICL_CLASS_HUMIDITY_INDICATOR,
-	    (void *)PICL_PROP_CONDITION,
-	    sun4v_env_print_indicator_callback);
-	if (!class_node_found)
-		return;
-	log_printf("\nHumidity indicators:\n");
-	if (syserrlog == 0) {
-		(void) picl_walk_tree_by_class(phyplatformh,
-		    PICL_CLASS_HUMIDITY_INDICATOR, (void *)PICL_PROP_CONDITION,
-		    sun4v_env_print_indicator_callback);
-		if (all_status_ok) {
-			log_printf("All humidity indicators are OK.\n");
-			return;
-		}
-	}
-	log_printf("-------------------------------------------------"
-	    "-----------\n");
-	log_printf(fmt, "Location", "Indicator", "Condition", 0);
-	log_printf("-------------------------------------------------"
-	    "-----------\n");
-	(void) picl_walk_tree_by_class(phyplatformh,
-	    PICL_CLASS_HUMIDITY_INDICATOR,
-	    (void *)PICL_PROP_CONDITION,
-	    sun4v_env_print_indicator_callback);
-}
-
-static void
 sun4v_env_print_LEDs()
 {
 	char *fmt = "%-34s %-14s %-8s\n";
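
The removed humidity routines all follow the same libpicl pattern: walk every
node of a PICL class under the physical-platform node and let a callback
print one row per node. A minimal sketch of that pattern (print_node and
walk_class are illustrative; the libpicl calls themselves are real):

	#include <picl.h>
	#include <stdio.h>

	static int
	print_node(picl_nodehdl_t hdl, void *args)
	{
		char name[PICL_PROPNAMELEN_MAX];

		if (picl_get_propval_by_name(hdl, "name", name,
		    sizeof (name)) == PICL_SUCCESS)
			(void) printf("%s: %s\n", (const char *)args, name);
		return (PICL_WALK_CONTINUE);
	}

	static void
	walk_class(picl_nodehdl_t rooth, char *class)
	{
		(void) picl_walk_tree_by_class(rooth, class, (void *)class,
		    print_node);
	}
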
--- a/usr/src/pkgdefs/Makefile	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/pkgdefs/Makefile	Thu Aug 06 17:39:39 2009 -0700
@@ -96,7 +96,6 @@
 	SUNWssad  \
 	SUNWstc.u \
 	SUNWus.u \
-	SUNWusat10.v \
 	SUNWust1.v \
 	SUNWust2.v \
 	SUNWwbsd
--- a/usr/src/pkgdefs/SUNWusat10.v/Makefile	Thu Aug 06 17:19:00 2009 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,35 +0,0 @@
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-
-#
-# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
-# Use is subject to license terms.
-#
-
-include ../Makefile.com
-
-.KEEP_STATE:
-
-all: $(FILES)
-
-install: all pkg
-
-include ../Makefile.targ
--- a/usr/src/pkgdefs/SUNWusat10.v/pkginfo.tmpl	Thu Aug 06 17:19:00 2009 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,55 +0,0 @@
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-
-#
-# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
-# Use is subject to license terms.
-#
-# This required package information file describes characteristics of the
-# package, such as package abbreviation, full package name, package version,
-# and package architecture.
-#
-PKG="SUNWusat10"
-NAME="UltraSPARC-AT10 (Root)"
-ARCH="sparc.sun4v"
-VERSION="ONVERS,REV=0.0.0"
-SUNW_PRODNAME="SunOS"
-SUNW_PRODVERS="RELEASE/VERSION"
-SUNW_PKGTYPE="root"
-MAXINST="1000"
-CATEGORY="system"
-DESC="UltraSPARC-AT10 core kernel software"
-VENDOR="Sun Microsystems, Inc."
-HOTLINE="Please contact your local service provider"
-EMAIL=""
-CLASSES="none"
-BASEDIR=/
-SUNW_PKGVERS="1.0"
-SUNW_PKG_ALLZONES="true"
-SUNW_PKG_HOLLOW="true"
-SUNW_PKG_THISZONE="false"
-#VSTOCK="<reserved by Release Engineering for package part #>"
-#ISTATES="<developer defined>"
-#RSTATES='<developer defined>'
-#ULIMIT="<developer defined>"
-#ORDER="<developer defined>"
-#PSTAMP="<developer defined>"
-#INTONLY="<developer defined>"
--- a/usr/src/pkgdefs/SUNWusat10.v/prototype_com	Thu Aug 06 17:19:00 2009 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,52 +0,0 @@
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-
-#
-# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
-# Use is subject to license terms.
-#
-# This required package information file contains a list of package contents.
-# The 'pkgmk' command uses this file to identify the contents of a package
-# and their location on the development machine when building the package.
-# Can be created via a text editor or through use of the 'pkgproto' command.
-
-#!search <pathname pathname ...>	# where to find pkg objects
-#!include <filename>			# include another 'prototype' file
-#!default <mode> <owner> <group>	# default used if not specified on entry
-#!<param>=<value>			# puts parameter in pkg environment
-
-# packaging files
-i pkginfo
-i copyright
-#
-# source locations relative to the prototype file
-#
-# SUNWusat10.v
-#
-d none platform 755 root sys
-d none platform/sun4v 755 root sys
-d none platform/sun4v/kernel 755 root sys
-d none platform/sun4v/kernel/cpu 755 root sys
-d none platform/sun4v/kernel/cpu/sparcv9 755 root sys
-f none platform/sun4v/kernel/cpu/sparcv9/SUNW,UltraSPARC-AT10 755 root sys
-d none platform/sun4v/kernel/pcbe 755 root sys
-d none platform/sun4v/kernel/pcbe/sparcv9 755 root sys
-f none platform/sun4v/kernel/pcbe/sparcv9/pcbe.SUNW,UltraSPARC-AT10 755 root sys
--- a/usr/src/pkgdefs/SUNWusat10.v/prototype_sparc	Thu Aug 06 17:19:00 2009 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,30 +0,0 @@
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-
-#
-# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
-# Use is subject to license terms.
-#
-
-# Include ISA independent files (prototype_com)
-!include prototype_com
-
-# List files which are SPARC specific here
--- a/usr/src/uts/common/io/mem.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/common/io/mem.c	Thu Aug 06 17:39:39 2009 -0700
@@ -230,9 +230,6 @@
 	    flags, name, valuep, lengthp, 0));
 }
 
-extern void mach_sync_icache_pa(caddr_t, size_t);
-#pragma weak mach_sync_icache_pa
-
 static int
 mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio,
     page_t *pp)
@@ -271,18 +268,9 @@
 				error = EFAULT;
 		} else
 			error = EIO;
-	} else {
+	} else
 		error = uiomove(va + pageoff, nbytes, rw, uio);
 
-		/*
-		 * In case this has changed executable code,
-		 * non-coherent I-caches must be flushed.
-		 */
-		if (rw != UIO_READ && &mach_sync_icache_pa != NULL) {
-			mach_sync_icache_pa((caddr_t)ptob(pfn), PAGESIZE);
-		}
-	}
-
 	if (devload)
 		hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK);
 	else if (pp)
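
The block deleted above relied on the weak-symbol hook idiom: #pragma weak makes the extern resolve to address zero when no platform module supplies a definition, so generic code can test the address before calling. A compilable sketch of the idiom, with a hypothetical hook name standing in for mach_sync_icache_pa:

#include <stddef.h>

#pragma weak plat_icache_hook

/* hypothetical hook; a platform module may or may not define it */
extern void plat_icache_hook(char *, size_t);

static void
generic_write_path(char *va, size_t len)
{
	/*
	 * With no definition linked in, the weak reference resolves
	 * to address 0 and the hook is simply skipped.
	 */
	if (&plat_icache_hook != NULL)
		plat_icache_hook(va, len);
}
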
--- a/usr/src/uts/common/sys/auxv_SPARC.h	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/common/sys/auxv_SPARC.h	Thu Aug 06 17:39:39 2009 -0700
@@ -45,7 +45,6 @@
 #define	AV_SPARC_VIS2	0x0040	/* VIS2 instruction set supported */
 #define	AV_SPARC_ASI_BLK_INIT	0x0080	/* ASI_BLK_INIT_xxx ASI */
 #define	AV_SPARC_FMAF	0x0100	/* Fused Multiply-Add */
-#define	AV_SPARC_FMAU	0x0200  /* Unfused Multiply-Add */
 #define	AV_SPARC_VIS3	0x0400  /* VIS3 instruction set extensions */
 #define	AV_SPARC_HPC	0x0800  /* High Performance Computing insns */
 #define	AV_SPARC_RANDOM	0x1000  /* random instruction */
@@ -57,7 +56,7 @@
 #define	FMT_AV_SPARC	\
 	"\20" \
 	"\21cspare"	\
-	"\20ima\17fjfmau\16trans\15random\14hpc\13vis3\12fmau\11fmaf" 	\
+	"\20ima\17fjfmau\16trans\15random\14hpc\13vis3\12-\11fmaf" 	\
 	"\10ASIBlkInit\7vis2\6vis\5popc\4v8plus\3fsmuld\2div32\1mul32"
 
 /*
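
FMT_AV_SPARC is a BSD-style "%b" bit-format string: the leading \20 selects hexadecimal output, and each following entry is a 1-based bit position (\1 names the 0x1 bit) followed by that bit's name, which is why the retired fmau slot at bit 10 becomes the placeholder "-" above. A user-land sketch of the decoding convention, not the kernel's own formatter:

#include <stdio.h>

static void
print_bits(unsigned int value, const char *fmt)
{
	int any = 0;
	int c;

	(void) printf("%x<", value);	/* fmt[0] is the radix; assume \20 */
	fmt++;
	while ((c = *fmt++) != '\0') {
		if (value & (1u << (c - 1))) {	/* c is a 1-based bit number */
			if (any++)
				(void) putchar(',');
			while (*fmt > ' ')	/* name runs to the next bit byte */
				(void) putchar(*fmt++);
		} else {
			while (*fmt > ' ')
				fmt++;
		}
	}
	(void) printf(">\n");
}

int
main(void)
{
	/* AV_SPARC_MUL32 | AV_SPARC_FMAF, per the defines above */
	print_bits(0x101, "\20\11fmaf\1mul32");
	return (0);
}
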
--- a/usr/src/uts/common/vm/hat.h	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/common/vm/hat.h	Thu Aug 06 17:39:39 2009 -0700
@@ -425,25 +425,6 @@
 #define	HAT_STRUCTURE_LE	0x2000
 #define	HAT_ENDIAN_MASK		0x3000
 
-/*
- * Attributes for non-coherent I-cache support.
- *
- * We detect if an I-cache has been filled by first resetting
- * execute permission in a tte entry. This forces a trap when
- * an instruction fetch first occurs in that page. In "soft
- * execute mode", the hardware execute permission is cleared
- * and a different software execution bit is set in the tte.
- *
- * HAT_ATTR_TEXT: set this flag to avoid the extra trap associated
- * with soft execute mode. Same meaning as HAT_LOAD_TEXT.
- *
- * HAT_ATTR_NOSOFTEXEC: set this flag when installing a permanent
- * mapping, or installing a mapping that will never be
- * freed. Overrides soft execute mode.
- */
-#define	HAT_ATTR_TEXT		0x4000
-#define	HAT_ATTR_NOSOFTEXEC	0x8000
-
 /* flags for hat_softlock */
 #define	HAT_COW			0x0001
 
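
For context, the soft-execute handshake retired here worked roughly as the deleted comment describes. A compilable sketch with illustrative flag values; the real bits are TTE fields and the real trap path lived in sfmmu_asm.s:

#include <stdint.h>

#define	EXECPRM		0x1	/* hardware execute permission (illustrative) */
#define	SOFTEXEC	0x2	/* software execute bit (illustrative) */

/* On load: hide hardware execute so the first ifetch traps. */
static uint64_t
softexec_load(uint64_t tte)
{
	tte &= ~(uint64_t)EXECPRM;
	return (tte | SOFTEXEC);
}

/*
 * In the ifetch trap path: a soft-exec TTE means the page really is
 * executable, so flush the I-cache (omitted here) and grant hardware
 * execute; anything else is a genuine protection fault.
 */
static int
softexec_trap(uint64_t *ttep)
{
	if ((*ttep & SOFTEXEC) == 0)
		return (-1);		/* real protection violation */
	*ttep |= EXECPRM;
	return (0);
}
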
--- a/usr/src/uts/common/vm/page.h	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/common/vm/page.h	Thu Aug 06 17:39:39 2009 -0700
@@ -780,7 +780,7 @@
 int	page_reclaim_mem(pgcnt_t, pgcnt_t, int);
 
 void page_set_props(page_t *, uint_t);
-void page_clr_all_props(page_t *, int);
+void page_clr_all_props(page_t *);
 int page_clear_lck_cow(page_t *, int);
 
 kmutex_t	*page_vnode_mutex(struct vnode *);
--- a/usr/src/uts/common/vm/page_retire.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/common/vm/page_retire.c	Thu Aug 06 17:39:39 2009 -0700
@@ -535,7 +535,7 @@
 	ASSERT(!hat_page_is_mapped(pp));
 	ASSERT(!pp->p_vnode);
 
-	page_clr_all_props(pp, 0);
+	page_clr_all_props(pp);
 	pagescrub(pp, 0, MMU_PAGESIZE);
 
 	pp->p_next = NULL;
--- a/usr/src/uts/common/vm/seg_kmem.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/common/vm/seg_kmem.c	Thu Aug 06 17:39:39 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -883,15 +883,6 @@
 	else
 		allocflag = 0;
 
-	/*
-	 * Support for non-coherent I-cache.
-	 * Set HAT_LOAD_TEXT to override soft execute.
-	 */
-	if (attr & HAT_ATTR_TEXT) {
-		attr &= ~HAT_ATTR_TEXT;
-		allocflag |= HAT_LOAD_TEXT;
-	}
-
 	while (ppl != NULL) {
 		page_t *pp = ppl;
 		page_sub(&ppl, pp);
--- a/usr/src/uts/common/vm/vm_page.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/common/vm/vm_page.c	Thu Aug 06 17:39:39 2009 -0700
@@ -620,7 +620,7 @@
 		 * initialize other fields in the page_t
 		 */
 		PP_SETFREE(pp);
-		page_clr_all_props(pp, 0);
+		page_clr_all_props(pp);
 		PP_SETAGED(pp);
 		pp->p_offset = (u_offset_t)-1;
 		pp->p_next = pp;
@@ -2662,7 +2662,7 @@
 	PP_SETFREE(pp);
 	ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) ||
 	    !hat_ismod(pp));
-	page_clr_all_props(pp, 0);
+	page_clr_all_props(pp);
 	ASSERT(!hat_page_getshare(pp));
 
 	/*
@@ -2803,7 +2803,7 @@
 		ASSERT(tpp->p_szc == szc);
 
 		PP_SETFREE(tpp);
-		page_clr_all_props(tpp, 0);
+		page_clr_all_props(tpp);
 		PP_SETAGED(tpp);
 		tpp->p_offset = (u_offset_t)-1;
 		ASSERT(tpp->p_next == tpp);
@@ -3149,7 +3149,7 @@
 		ASSERT(tpp->p_szc == szc);
 
 		PP_SETFREE(tpp);
-		page_clr_all_props(tpp, 0);
+		page_clr_all_props(tpp);
 		PP_SETAGED(tpp);
 		ASSERT(tpp->p_next == tpp);
 		ASSERT(tpp->p_prev == tpp);
@@ -3525,7 +3525,7 @@
 		page_vpsub(&vp->v_pages, pp);
 
 	pp->p_hash = NULL;
-	page_clr_all_props(pp, 1);
+	page_clr_all_props(pp);
 	PP_CLRSWAP(pp);
 	pp->p_vnode = NULL;
 	pp->p_offset = (u_offset_t)-1;
@@ -4542,7 +4542,7 @@
 	old->p_vnode = NULL;
 	PP_CLRSWAP(old);
 	old->p_offset = (u_offset_t)-1;
-	page_clr_all_props(old, 1);
+	page_clr_all_props(old);
 
 	/*
 	 * Wake up processes waiting for this page.  The page's
@@ -4888,7 +4888,7 @@
 
 	for (i = 0; i < npgs; i++) {
 		ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO));
-		page_clr_all_props(repl, 0);
+		page_clr_all_props(repl);
 		page_set_props(repl, ppattr);
 		page_relocate_hash(repl, targ);
 
@@ -4899,7 +4899,7 @@
 		 * page_relocate_hash(), they no longer
 		 * have any meaning.
 		 */
-		page_clr_all_props(targ, 0);
+		page_clr_all_props(targ);
 		ASSERT(targ->p_next == targ);
 		ASSERT(targ->p_prev == targ);
 		page_list_concat(&pl, &targ);
@@ -4983,7 +4983,7 @@
 		pp = pplist;
 		if (pp->p_szc == 0) {
 			page_sub(&pplist, pp);
-			page_clr_all_props(pp, 0);
+			page_clr_all_props(pp);
 			PP_SETFREE(pp);
 			PP_SETAGED(pp);
 			page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
@@ -4997,7 +4997,7 @@
 			do {
 				ASSERT(PAGE_EXCL(tpp));
 				ASSERT(!hat_page_is_mapped(tpp));
-				page_clr_all_props(tpp, 0);
+				page_clr_all_props(tpp);
 				PP_SETFREE(tpp);
 				PP_SETAGED(tpp);
 			} while ((tpp = tpp->p_next) != pp);
@@ -6110,25 +6110,9 @@
 	pp->p_nrm |= (uchar_t)flags;
 }
 
-extern void mach_sync_icache_pp(page_t *);
-#pragma weak mach_sync_icache_pp
-
-/*
- * Flush I-cache if the page is being reassigned.  The hashout flag is
- * set when a page has been removed from a hash chain (i.e. vnode
- * pages). If the page stays on the hash chain there is a chance it
- * will be re-used, therefore there is no need to flush the
- * I-cache. However, if the page is being removed from a hash chain
- * then the page can be used for any new purpose, and the I-cache must
- * be flushed.
- */
-/* ARGSUSED */
 void
-page_clr_all_props(page_t *pp, int hashout)
+page_clr_all_props(page_t *pp)
 {
-	if (&mach_sync_icache_pp != NULL && hashout) {
-		mach_sync_icache_pp(pp);
-	}
 	pp->p_nrm = 0;
 }
 
--- a/usr/src/uts/sfmmu/ml/sfmmu_asm.s	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sfmmu/ml/sfmmu_asm.s	Thu Aug 06 17:39:39 2009 -0700
@@ -248,7 +248,6 @@
 	 */								;\
 	sllx	tagtarget, TTARGET_VA_SHIFT, tagtarget			;\
 	ldxa	[ttepa]ASI_MEM, tte					;\
-	TTE_CLR_SOFTEXEC_ML(tte)					;\
 	srlx	tagtarget, TTARGET_VA_SHIFT, tagtarget			;\
 	sethi	%hi(TSBTAG_INVALID), tmp2				;\
 	add	tsbep, TSBE_TAG, tmp1					;\
@@ -371,7 +370,6 @@
 #define	TSB_UPDATE(tsbep, tteva, tagtarget, tmp1, tmp2, label)		\
 	/* can't rd tteva after locking tsb because it can tlb miss */	;\
 	ldx	[tteva], tteva			/* load tte */		;\
-	TTE_CLR_SOFTEXEC_ML(tteva)					;\
 	TSB_LOCK_ENTRY(tsbep, tmp1, tmp2, label)			;\
 	sethi	%hi(TSBTAG_INVALID), tmp2				;\
 	add	tsbep, TSBE_TAG, tmp1					;\
@@ -946,11 +944,6 @@
 {
 }
 
-void
-sfmmu_patch_pgsz_reg(void)
-{
-}
-
 /* ARGSUSED */
 void
 sfmmu_load_tsbe(struct tsbe *tsbep, uint64_t vaddr, tte_t *ttep, int phys)
@@ -1441,19 +1434,6 @@
 #endif /* sun4u */
 	SET_SIZE(sfmmu_patch_shctx)
 
-	ENTRY_NP(sfmmu_patch_pgsz_reg)
-#ifdef sun4u
-	retl
-	  nop
-#else /* sun4u */
-	set	sfmmu_pgsz_load_mmustate_patch, %o0
-	MAKE_NOP_INSTR(%o1)
-	st	%o1, [%o0]
-	retl
-	flush	%o0
-#endif /* sun4u */
-	SET_SIZE(sfmmu_patch_pgsz_reg)
-
 	/*
 	 * Routine that loads an entry into a tsb using virtual addresses.
 	 * Locking is required since all cpus can use the same TSB.
@@ -2408,13 +2388,6 @@
 	ba,a,pt	%xcc, label/**/8					;\
 label/**/6:								;\
 	GET_SCDSHMERMAP(tsbarea, hmeblkpa, hatid, hmemisc)		;\
-	/*                                  				;\
-	 * hmemisc is set to 1 if this is a shared mapping. It will	;\
-	 * be cleared by CHECK_SHARED_PGSZ if this pagesize is not	;\
-	 * allowed, in order to limit the number of entries in the	;\
-	 * pagesize register.						;\
-	 */								;\
-	CHECK_SHARED_PGSZ(tsbarea, tte, hatid, hmemisc, label/**/9)	;\
 	ldn	[tsbarea + (TSBMISS_SCRATCH + TSBMISS_HMEBP)], hatid 	;\
 label/**/7:								;\
 	set	TTE_SUSPEND, hatid					;\
@@ -3295,37 +3268,8 @@
 	stub    %g1, [%g6 + TSBMISS_URTTEFLAGS]
 
 	SAVE_CTX1(%g7, %g2, %g1, tsb_shmel)	
-	ba	tsb_validtte
 #endif /* sun4u && !UTSB_PHYS */
 
-tsb_ism_validtte:
-#ifdef sun4v
-	/*
-	 * Check pagesize against bitmap for Rock page size register,
-	 * for ism mappings.
-	 *
-	 * %g1, %g2 = scratch
-	 * %g3 = tte
-	 * g4 = tte pa
-	 * g5 = tte va
-	 * g6 = tsbmiss area
-	 * %g7 = tt
-	 */
-	ldub    [%g6 + TSBMISS_URTTEFLAGS], %g1
-	and     %g1, HAT_CHKCTX1_FLAG, %g2
-	/*
-	 * Clear the HAT_CHKCTX1_FLAG in %g2 if this shared pagesize is not allowed
-	 * to limit the number of entries in the pagesize search register.
-	 */
-	CHECK_SHARED_PGSZ(%g6, %g3, %g7, %g2, ism_chk_pgsz)
-	andn	%g1, HAT_CHKCTX1_FLAG, %g1
-	or      %g1, %g2, %g1
-	stub    %g1, [%g6 + TSBMISS_URTTEFLAGS]
-	brz     %g2, tsb_validtte
-	  rdpr  %tt, %g7
-	SAVE_CTX1(%g7, %g1, %g2, tsb_shctxl)
-#endif /* sun4v */
-
 tsb_validtte:
 	/*
 	 * g3 = tte
@@ -3355,11 +3299,9 @@
 	ba,pt	%xcc, tsb_update_tl1
 	  nop
 4:
-	/*
-	 * ITLB translation was found but execute permission is
-	 * disabled. If we have software execute permission (soft exec
-	 * bit is set), then enable hardware execute permission.
-	 * Otherwise continue with a protection violation.
+	/*
+	 * On an ITLB miss, check the exec bit; if it is not set,
+	 * treat the TTE as invalid.
+	 */
 	cmp     %g7, T_INSTR_MMU_MISS
 	be,pn	%icc, 5f
@@ -3368,11 +3310,9 @@
 	bne,pt %icc, 3f
 	  andcc   %g3, TTE_EXECPRM_INT, %g0	/* check execute bit is set */
 5:
-	bnz,pn %icc, 3f
-	  TTE_CHK_SOFTEXEC_ML(%g3)		/* check soft execute */
 	bz,pn %icc, tsb_protfault
 	  nop
-	TTE_SET_EXEC_ML(%g3, %g4, %g7, tsb_lset_exec)
+
 3:
 	/*
 	 * Set reference bit if not already set
@@ -3415,7 +3355,6 @@
 #endif /* sun4v */
 
 tsb_update_tl1:
-	TTE_CLR_SOFTEXEC_ML(%g3)
 	srlx	%g2, TTARGET_CTX_SHIFT, %g7
 	brz,pn	%g7, tsb_kernel
 #ifdef sun4v
@@ -3658,7 +3597,10 @@
 	ldub    [%g6 + TSBMISS_URTTEFLAGS], %g5
 	or      %g5, HAT_CHKCTX1_FLAG, %g5
 	stub    %g5, [%g6 + TSBMISS_URTTEFLAGS]
+	rdpr    %tt, %g5
+	SAVE_CTX1(%g5, %g3, %g1, tsb_shctxl)
 #endif /* defined(sun4v) || defined(UTSB_PHYS) */
+
 	/*
 	 * ISM pages are always locked down.
 	 * If we can't find the tte then pagefault
@@ -3690,7 +3632,7 @@
 	/* NOT REACHED */
 	
 tsb_ism_32M_found:
-	brlz,a,pt %g3, tsb_ism_validtte
+	brlz,a,pt %g3, tsb_validtte
 	  rdpr	%tt, %g7
 	ba,pt	%xcc, tsb_ism_4M
 	  nop
@@ -3708,7 +3650,7 @@
 	    tsb_ism_4M)
 
 tsb_ism_256M_found:
-	brlz,a,pt %g3, tsb_ism_validtte
+	brlz,a,pt %g3, tsb_validtte
 	  rdpr	%tt, %g7
 
 tsb_ism_4M:
@@ -3721,7 +3663,7 @@
 	/* NOT REACHED */
 
 tsb_ism_4M_found:
-	brlz,a,pt %g3, tsb_ism_validtte
+	brlz,a,pt %g3, tsb_validtte
 	  rdpr	%tt, %g7
 
 tsb_ism_8K:
@@ -3735,7 +3677,7 @@
 	/* NOT REACHED */
 
 tsb_ism_8K_found:
-	brlz,a,pt %g3, tsb_ism_validtte
+	brlz,a,pt %g3, tsb_validtte
 	  rdpr	%tt, %g7
 
 tsb_pagefault:
--- a/usr/src/uts/sfmmu/ml/sfmmu_kdi.s	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sfmmu/ml/sfmmu_kdi.s	Thu Aug 06 17:39:39 2009 -0700
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -328,21 +328,7 @@
 	ldxa	[%g2]ASI_MEM, %g1
 	brgez,a	%g1, 4f
 	clr	%g1
-4:
-	/*
-	 * If soft execute bit is set, make sure HW execute permission
-	 * is also set. But, clear soft execute bit before giving tte to
-	 * the caller.
-	 */
-	TTE_CHK_SOFTEXEC_ML(%g1)
-	bz,pt	%icc, 6f
-	  andcc %g1, TTE_EXECPRM_INT, %g0
-	bnz,pt	%icc, 7f
-	  nop
-	TTE_SET_EXEC_ML(%g1, %g2, %g4, kdi_trap_vatotte)
-7:
-	TTE_CLR_SOFTEXEC_ML(%g1)
-	ba,a	6f
+4:	ba,a	6f
 
 5:	add	%g3, 1, %g3
 	set	mmu_hashcnt, %g4
--- a/usr/src/uts/sfmmu/vm/hat_sfmmu.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.c	Thu Aug 06 17:39:39 2009 -0700
@@ -184,14 +184,6 @@
 #define	HAT_TMPNC	0x4
 
 /*
- * This flag is set to 0 via the MD in platforms that do not support
- * I-cache coherency in hardware. Used to enable "soft exec" mode.
- * The MD "coherency" property is optional, and defaults to 1 (because
- * coherent I-cache is the norm.)
- */
-uint_t	icache_is_coherent = 1;
-
-/*
  * Flag to allow the creation of non-cacheable translations
  * to system memory. It is off by default. At the moment this
  * flag is used by the ecache error injector. The error injector
@@ -227,7 +219,6 @@
 uint_t	disable_ism_large_pages = (1 << TTE512K);
 uint_t	disable_auto_data_large_pages = 0;
 uint_t	disable_auto_text_large_pages = 0;
-uint_t	disable_shctx_large_pages = 0;
 
 /*
  * Private sfmmu data structures for hat management
@@ -294,14 +285,6 @@
 /* Internal variable, set by MD if the HW supports shctx feature */
 int shctx_on = 0;
 
-/* Internal variable, set by MD if the HW supports the search order register */
-int pgsz_search_on = 0;
-/*
- * External /etc/system tunable, for controlling search order register
- * support.
- */
-int disable_pgsz_search = 0;
-
 #ifdef DEBUG
 static void check_scd_sfmmu_list(sfmmu_t **, sfmmu_t *, int);
 #endif
@@ -481,6 +464,7 @@
 			pfn_t, int);
 static void	sfmmu_tlb_demap(caddr_t, sfmmu_t *, struct hme_blk *, int, int);
 static void	sfmmu_tlb_range_demap(demap_range_t *);
+static void	sfmmu_invalidate_ctx(sfmmu_t *);
 static void	sfmmu_sync_mmustate(sfmmu_t *);
 
 static void 	sfmmu_tsbinfo_setup_phys(struct tsb_info *, pfn_t);
@@ -589,7 +573,7 @@
 uint64_t	mmu_saved_gnum = 0;	/* to init incoming MMUs' gnums */
 
 #define	DEFAULT_NUM_CTXS_PER_MMU 8192
-uint_t	nctxs = DEFAULT_NUM_CTXS_PER_MMU;
+static uint_t	nctxs = DEFAULT_NUM_CTXS_PER_MMU;
 
 int		cache;			/* describes system cache */
 
@@ -743,7 +727,11 @@
 static void	sfmmu_mlist_reloc_enter(page_t *, page_t *,
 				kmutex_t **, kmutex_t **);
 static void	sfmmu_mlist_reloc_exit(kmutex_t *, kmutex_t *);
-static hatlock_t *sfmmu_hat_tryenter(sfmmu_t *);
+static hatlock_t *sfmmu_hat_enter(sfmmu_t *);
+static hatlock_t *sfmmu_hat_tryenter(sfmmu_t *);
+static void	sfmmu_hat_exit(hatlock_t *);
 static void	sfmmu_hat_lock_all(void);
 static void	sfmmu_hat_unlock_all(void);
 static void	sfmmu_ismhat_enter(sfmmu_t *, int);
@@ -1067,14 +1055,12 @@
 	disable_ism_large_pages |= disable_large_pages;
 	disable_auto_data_large_pages = disable_large_pages;
 	disable_auto_text_large_pages = disable_large_pages;
-	disable_shctx_large_pages |= disable_large_pages;
 
 	/*
 	 * Initialize mmu-specific large page sizes.
 	 */
 	if (&mmu_large_pages_disabled) {
 		disable_large_pages |= mmu_large_pages_disabled(HAT_LOAD);
-		disable_shctx_large_pages |= disable_large_pages;
 		disable_ism_large_pages |=
 		    mmu_large_pages_disabled(HAT_LOAD_SHARE);
 		disable_auto_data_large_pages |=
@@ -1413,14 +1399,6 @@
 		shctx_on = 0;
 	}
 
-	/*
-	 * If support for page size search is disabled via /etc/system
-	 * set pgsz_search_on to 0 here.
-	 */
-	if (pgsz_search_on && disable_pgsz_search) {
-		pgsz_search_on = 0;
-	}
-
 	if (shctx_on) {
 		srd_buckets = kmem_zalloc(SFMMU_MAX_SRD_BUCKETS *
 		    sizeof (srd_buckets[0]), KM_SLEEP);
@@ -1595,11 +1573,6 @@
 	sfmmup->sfmmu_scdp = NULL;
 	sfmmup->sfmmu_scd_link.next = NULL;
 	sfmmup->sfmmu_scd_link.prev = NULL;
-
-	if (&mmu_set_pgsz_order && sfmmup !=  ksfmmup) {
-		mmu_set_pgsz_order(sfmmup, 0);
-		sfmmu_init_pgsz_hv(sfmmup);
-	}
 	return (sfmmup);
 }
 
@@ -2082,8 +2055,6 @@
 				newhat->sfmmu_scdismttecnt[i] =
 				    hat->sfmmu_scdismttecnt[i];
 			}
-		} else if (&mmu_set_pgsz_order) {
-			mmu_set_pgsz_order(newhat, 0);
 		}
 
 		sfmmu_check_page_sizes(newhat, 1);
@@ -2579,7 +2550,7 @@
 void
 sfmmu_memtte(tte_t *ttep, pfn_t pfn, uint_t attr, int tte_sz)
 {
-	ASSERT((attr & ~(SFMMU_LOAD_ALLATTR | HAT_ATTR_NOSOFTEXEC)) == 0);
+	ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR));
 
 	ttep->tte_inthi = MAKE_TTE_INTHI(pfn, attr, tte_sz, 0 /* hmenum */);
 	ttep->tte_intlo = MAKE_TTE_INTLO(pfn, attr, tte_sz, 0 /* hmenum */);
@@ -2593,18 +2564,6 @@
 	if (TTE_IS_NFO(ttep) && TTE_IS_EXECUTABLE(ttep)) {
 		panic("sfmmu_memtte: can't set both NFO and EXEC bits");
 	}
-
-	/*
-	 * Disable hardware execute permission to force a fault if
-	 * this page is executed, so we can detect the execution.  Set
-	 * the soft exec bit to remember that this TTE has execute
-	 * permission.
-	 */
-	if (TTE_IS_EXECUTABLE(ttep) && (attr & HAT_ATTR_NOSOFTEXEC) == 0 &&
-	    icache_is_coherent == 0) {
-		TTE_CLR_EXEC(ttep);
-		TTE_SET_SOFTEXEC(ttep);
-	}
 }
 
 /*
@@ -3095,26 +3054,9 @@
 			    (void *)hmeblkp);
 		}
 		ASSERT(TTE_CSZ(&tteold) == TTE_CSZ(ttep));
-
-		if (TTE_IS_EXECUTABLE(&tteold) && TTE_IS_SOFTEXEC(ttep)) {
-			TTE_SET_EXEC(ttep);
-		}
 	}
 
 	if (pp) {
-		/*
-		 * If we know that this page will be executed, because
-		 * it was in the past (PP_ISEXEC is already true), or
-		 * if the caller says it will likely be executed
-		 * (HAT_LOAD_TEXT is true), then there is no need to
-		 * dynamically detect execution with a soft exec
-		 * fault. Enable hardware execute permission now.
-		 */
-		if ((PP_ISEXEC(pp) || (flags & HAT_LOAD_TEXT)) &&
-		    TTE_IS_SOFTEXEC(ttep)) {
-			TTE_SET_EXEC(ttep);
-		}
-
 		if (size == TTE8K) {
 #ifdef VAC
 			/*
@@ -3138,12 +3080,6 @@
 				sfmmu_page_exit(pmtx);
 			}
 
-			if (TTE_EXECUTED(ttep)) {
-				pmtx = sfmmu_page_enter(pp);
-				PP_SETEXEC(pp);
-				sfmmu_page_exit(pmtx);
-			}
-
 		} else if (sfmmu_pagearray_setup(vaddr, pps, ttep, remap)) {
 			/*
 			 * sfmmu_pagearray_setup failed so return
@@ -3151,9 +3087,6 @@
 			sfmmu_mlist_exit(pml);
 			return (1);
 		}
-
-	} else if (TTE_IS_SOFTEXEC(ttep)) {
-		TTE_SET_EXEC(ttep);
 	}
 
 	/*
@@ -3227,17 +3160,11 @@
 			if (!(sfmmup->sfmmu_tteflags & tteflag)) {
 				hatlockp = sfmmu_hat_enter(sfmmup);
 				sfmmup->sfmmu_tteflags |= tteflag;
-				if (&mmu_set_pgsz_order) {
-					mmu_set_pgsz_order(sfmmup, 1);
-				}
 				sfmmu_hat_exit(hatlockp);
 			}
 		} else if (!(sfmmup->sfmmu_rtteflags & tteflag)) {
 			hatlockp = sfmmu_hat_enter(sfmmup);
 			sfmmup->sfmmu_rtteflags |= tteflag;
-			if (&mmu_set_pgsz_order && sfmmup !=  ksfmmup) {
-				mmu_set_pgsz_order(sfmmup, 1);
-			}
 			sfmmu_hat_exit(hatlockp);
 		}
 		/*
@@ -3284,8 +3211,7 @@
 		 * ref bit in tteload.
 		 */
 		ASSERT(TTE_IS_REF(ttep));
-		if (TTE_IS_MOD(&tteold) || (TTE_EXECUTED(&tteold) &&
-		    !TTE_IS_EXECUTABLE(ttep))) {
+		if (TTE_IS_MOD(&tteold)) {
 			sfmmu_ttesync(sfmmup, vaddr, &tteold, pp);
 		}
 		/*
@@ -3416,12 +3342,6 @@
 			sfmmu_page_exit(pmtx);
 		}
 
-		if (TTE_EXECUTED(ttep)) {
-			pmtx = sfmmu_page_enter(pp);
-			PP_SETEXEC(pp);
-			sfmmu_page_exit(pmtx);
-		}
-
 		/*
 		 * If this is a remap we skip vac & contiguity checks.
 		 */
@@ -5052,11 +4972,9 @@
 				continue;
 			}
 
-			if ((tteflags.tte_intlo & TTE_HWWR_INT) ||
-			    (TTE_EXECUTED(&tte) &&
-			    !TTE_IS_EXECUTABLE(&ttemod))) {
+			if (tteflags.tte_intlo & TTE_HWWR_INT) {
 				/*
-				 * need to sync if clearing modify/exec bit.
+				 * need to sync if clearing the modify bit.
 				 */
 				sfmmu_ttesync(sfmmup, addr, &tte, pp);
 			}
@@ -5109,14 +5027,6 @@
 		ttevalue.tte_intlo = MAKE_TTEATTR_INTLO(attr);
 		ttemaskp->tte_inthi = TTEINTHI_ATTR;
 		ttemaskp->tte_intlo = TTEINTLO_ATTR;
-		if (!icache_is_coherent) {
-			if (!(attr & PROT_EXEC)) {
-				TTE_SET_SOFTEXEC(ttemaskp);
-			} else {
-				TTE_CLR_EXEC(ttemaskp);
-				TTE_SET_SOFTEXEC(&ttevalue);
-			}
-		}
 		break;
 	case SFMMU_SETATTR:
 		ASSERT(!(attr & ~HAT_PROT_MASK));
@@ -5171,9 +5081,6 @@
 	if (TTE_IS_EXECUTABLE(ttep)) {
 		attr |= PROT_EXEC;
 	}
-	if (TTE_IS_SOFTEXEC(ttep)) {
-		attr |= PROT_EXEC;
-	}
 	if (!TTE_IS_PRIVILEGED(ttep)) {
 		attr |= PROT_USER;
 	}
@@ -5390,11 +5297,6 @@
 
 			ttemod = tte;
 			TTE_SET_LOFLAGS(&ttemod, tteflags, pprot);
-			ASSERT(TTE_IS_SOFTEXEC(&tte) ==
-			    TTE_IS_SOFTEXEC(&ttemod));
-			ASSERT(TTE_IS_EXECUTABLE(&tte) ==
-			    TTE_IS_EXECUTABLE(&ttemod));
-
 #if defined(SF_ERRATA_57)
 			if (check_exec && addr < errata57_limit)
 				ttemod.tte_exec_perm = 0;
@@ -6094,8 +5996,7 @@
 				continue;
 			}
 
-			if (!(flags & HAT_UNLOAD_NOSYNC) ||
-			    (pp != NULL && TTE_EXECUTED(&tte))) {
+			if (!(flags & HAT_UNLOAD_NOSYNC)) {
 				sfmmu_ttesync(sfmmup, addr, &tte, pp);
 			}
 
@@ -6435,49 +6336,37 @@
 sfmmu_ttesync(struct hat *sfmmup, caddr_t addr, tte_t *ttep, page_t *pp)
 {
 	uint_t rm = 0;
-	int sz = TTE_CSZ(ttep);
+	int	sz;
 	pgcnt_t	npgs;
 
 	ASSERT(TTE_IS_VALID(ttep));
 
-	if (!TTE_IS_NOSYNC(ttep)) {
-
-		if (TTE_IS_REF(ttep))
-			rm |= P_REF;
-
-		if (TTE_IS_MOD(ttep))
-			rm |= P_MOD;
-
-		if (rm != 0) {
-			if (sfmmup != NULL && sfmmup->sfmmu_rmstat) {
-				int i;
-				caddr_t	vaddr = addr;
-
-				for (i = 0; i < TTEPAGES(sz); i++) {
-					hat_setstat(sfmmup->sfmmu_as, vaddr,
-					    MMU_PAGESIZE, rm);
-					vaddr += MMU_PAGESIZE;
-				}
-			}
-		}
-	}
-
-	if (!pp)
+	if (TTE_IS_NOSYNC(ttep)) {
 		return;
-
-	/*
-	 * If software says this page is executable, and the page was
-	 * in fact executed (indicated by hardware exec permission
-	 * being enabled), then set P_EXEC on the page to remember
-	 * that it was executed. The I$ will be flushed when the page
-	 * is reassigned.
-	 */
-	if (TTE_EXECUTED(ttep)) {
-		rm |= P_EXEC;
-	} else if (rm == 0) {
+	}
+
+	if (TTE_IS_REF(ttep)) {
+		rm = P_REF;
+	}
+	if (TTE_IS_MOD(ttep)) {
+		rm |= P_MOD;
+	}
+
+	if (rm == 0) {
 		return;
 	}
 
+	sz = TTE_CSZ(ttep);
+	if (sfmmup != NULL && sfmmup->sfmmu_rmstat) {
+		int i;
+		caddr_t	vaddr = addr;
+
+		for (i = 0; i < TTEPAGES(sz); i++, vaddr += MMU_PAGESIZE) {
+			hat_setstat(sfmmup->sfmmu_as, vaddr, MMU_PAGESIZE, rm);
+		}
+	}
+
 	/*
 	 * XXX I want to use cas to update nrm bits but they
 	 * currently belong in common/vm and not in hat where
@@ -6485,6 +6374,8 @@
 	 * The nrm bits are protected by the same mutex as
 	 * the one that protects the page's mapping list.
 	 */
+	if (!pp)
+		return;
 	ASSERT(sfmmu_mlist_held(pp));
 	/*
 	 * If the tte is for a large page, we need to sync all the
@@ -6503,8 +6394,7 @@
 		ASSERT(pp);
 		ASSERT(sfmmu_mlist_held(pp));
 		if (((rm & P_REF) != 0 && !PP_ISREF(pp)) ||
-		    ((rm & P_MOD) != 0 && !PP_ISMOD(pp)) ||
-		    ((rm & P_EXEC) != 0 && !PP_ISEXEC(pp)))
+		    ((rm & P_MOD) != 0 && !PP_ISMOD(pp)))
 			hat_page_setattr(pp, rm);
 
 		/*
@@ -6826,7 +6716,6 @@
 	kmutex_t	*low, *high;
 	spgcnt_t	npages, i;
 	page_t		*pl = NULL;
-	uint_t		ppattr;
 	int		old_pil;
 	cpuset_t	cpuset;
 	int		cap_cpus;
@@ -6977,9 +6866,8 @@
 		 * Copy attributes.  VAC consistency was handled above,
 		 * if required.
 		 */
-		ppattr = hat_page_getattr(tpp, (P_MOD | P_REF | P_RO));
-		page_clr_all_props(rpp, 0);
-		page_set_props(rpp, ppattr);
+		rpp->p_nrm = tpp->p_nrm;
+		tpp->p_nrm = 0;
 		rpp->p_index = tpp->p_index;
 		tpp->p_index = 0;
 #ifdef VAC
@@ -7791,7 +7679,7 @@
 	noshuffle = flag & P_NSH;
 	flag &= ~P_NSH;
 
-	ASSERT(!(flag & ~(P_MOD | P_REF | P_RO | P_EXEC)));
+	ASSERT(!(flag & ~(P_MOD | P_REF | P_RO)));
 
 	/*
 	 * nothing to do if attribute already set
@@ -8480,8 +8368,6 @@
 	int		j;
 	sf_scd_t	*scdp;
 	uchar_t		rid;
-	hatlock_t 	*hatlockp;
-	int		ismnotinscd = 0;
 
 	ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY));
 	scdp = sfmmup->sfmmu_scdp;
@@ -8502,21 +8388,9 @@
 				/* ISM is not in SCD */
 				npgs +=
 				    ism_map[j].imap_ismhat->sfmmu_ttecnt[szc];
-				ismnotinscd = 1;
-			}
-		}
-	}
-
-	if (&mmu_set_pgsz_order) {
-		hatlockp = sfmmu_hat_enter(sfmmup);
-		if (ismnotinscd) {
-			SFMMU_FLAGS_SET(sfmmup, HAT_ISMNOTINSCD);
-		} else {
-			SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMNOTINSCD);
-		}
-		sfmmu_hat_exit(hatlockp);
-	}
-
+			}
+		}
+	}
 	sfmmup->sfmmu_ismttecnt[szc] = npgs;
 	sfmmup->sfmmu_scdismttecnt[szc] = npgs_scd;
 	return (npgs);
@@ -8850,11 +8724,6 @@
 		sfmmu_hat_exit(hatlockp);
 	}
 
-	if (&mmu_set_pgsz_order) {
-		hatlockp = sfmmu_hat_enter(sfmmup);
-		mmu_set_pgsz_order(sfmmup, 1);
-		sfmmu_hat_exit(hatlockp);
-	}
 	sfmmu_ismhat_exit(sfmmup, 0);
 
 	/*
@@ -9050,11 +8919,6 @@
 			(void) ism_tsb_entries(sfmmup, i);
 	}
 
-	if (&mmu_set_pgsz_order) {
-		hatlockp = sfmmu_hat_enter(sfmmup);
-		mmu_set_pgsz_order(sfmmup, 1);
-		sfmmu_hat_exit(hatlockp);
-	}
 	sfmmu_ismhat_exit(sfmmup, 0);
 
 	/*
@@ -11027,7 +10891,7 @@
 	mutex_exit(low);
 }
 
-hatlock_t *
+static hatlock_t *
 sfmmu_hat_enter(sfmmu_t *sfmmup)
 {
 	hatlock_t	*hatlockp;
@@ -11054,7 +10918,7 @@
 	return (NULL);
 }
 
-void
+static void
 sfmmu_hat_exit(hatlock_t *hatlockp)
 {
 	if (hatlockp != NULL)
@@ -12197,13 +12061,8 @@
 		 * then we flush the shared TSBs, if we find a private hat,
 		 * which is part of an SCD, but where the region
 		 * is not part of the SCD then we flush the private TSBs.
-		 *
-		 * If the Rock page size register is present, then SCDs
-		 * may contain both shared and private pages, so we cannot
-		 * use this optimization to avoid flushing private TSBs.
-		 */
-		if (pgsz_search_on == 0 &&
-		    !sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL &&
+		 */
+		if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL &&
 		    !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) {
 			scdp = sfmmup->sfmmu_scdp;
 			if (SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) {
@@ -12332,13 +12191,8 @@
 		 * which is part of an SCD, but where the region
 		 * corresponding to this va is not part of the SCD then we
 		 * flush the private TSBs.
-		 *
-		 * If the Rock page size register is present, then SCDs
-		 * may contain both shared and private pages, so we cannot
-		 * use this optimization to avoid flushing private TSBs.
-		 */
-		if (pgsz_search_on == 0 &&
-		    !sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL &&
+		 */
+		if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL &&
 		    !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD) &&
 		    !SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) {
 			if (!find_ism_rid(sfmmup, ism_sfmmup, va,
@@ -12648,7 +12502,7 @@
  * A per-process (PP) lock is used to synchronize ctx allocations in
  * resume() and ctx invalidations here.
  */
-void
+static void
 sfmmu_invalidate_ctx(sfmmu_t *sfmmup)
 {
 	cpuset_t cpuset;
@@ -14174,9 +14028,6 @@
 			if (tteflag && !(sfmmup->sfmmu_rtteflags & tteflag)) {
 				hatlockp = sfmmu_hat_enter(sfmmup);
 				sfmmup->sfmmu_rtteflags |= tteflag;
-				if (&mmu_set_pgsz_order) {
-					mmu_set_pgsz_order(sfmmup, 1);
-				}
 				sfmmu_hat_exit(hatlockp);
 			}
 			hatlockp = sfmmu_hat_enter(sfmmup);
@@ -15232,9 +15083,6 @@
 		ASSERT(sfmmup->sfmmu_ttecnt[i] >= scdp->scd_rttecnt[i]);
 		atomic_add_long(&sfmmup->sfmmu_ttecnt[i],
 		    -sfmmup->sfmmu_scdrttecnt[i]);
-		if (!sfmmup->sfmmu_ttecnt[i]) {
-			sfmmup->sfmmu_tteflags &= ~(1 << i);
-		}
 	}
 	/* update tsb0 inflation count */
 	if (old_scdp != NULL) {
@@ -15245,9 +15093,6 @@
 	    scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt);
 	sfmmup->sfmmu_tsb0_4minflcnt -= scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt;
 
-	if (&mmu_set_pgsz_order) {
-		mmu_set_pgsz_order(sfmmup, 0);
-	}
 	sfmmu_hat_exit(hatlockp);
 
 	if (old_scdp != NULL) {
@@ -15307,7 +15152,7 @@
 	for (scdp = srdp->srd_scdp; scdp != NULL;
 	    scdp = scdp->scd_next) {
 		SF_RGNMAP_EQUAL(&scdp->scd_region_map,
-		    &sfmmup->sfmmu_region_map, SFMMU_RGNMAP_WORDS, ret);
+		    &sfmmup->sfmmu_region_map, ret);
 		if (ret == 1) {
 			SF_SCD_INCR_REF(scdp);
 			mutex_exit(&srdp->srd_scd_mutex);
@@ -15455,10 +15300,6 @@
 		    scdp->scd_rttecnt[i]);
 		atomic_add_long(&sfmmup->sfmmu_ttecnt[i],
 		    sfmmup->sfmmu_scdrttecnt[i]);
-		if (sfmmup->sfmmu_ttecnt[i] &&
-		    (sfmmup->sfmmu_tteflags & (1 << i)) == 0) {
-			sfmmup->sfmmu_tteflags |= (1 << i);
-		}
 		sfmmup->sfmmu_scdrttecnt[i] = 0;
 		/* update ismttecnt to include SCD ism before hat leaves SCD */
 		sfmmup->sfmmu_ismttecnt[i] += sfmmup->sfmmu_scdismttecnt[i];
@@ -15472,9 +15313,6 @@
 	}
 	sfmmup->sfmmu_scdp = NULL;
 
-	if (&mmu_set_pgsz_order) {
-		mmu_set_pgsz_order(sfmmup, 0);
-	}
 	sfmmu_hat_exit(hatlockp);
 
 	/*
@@ -15520,8 +15358,7 @@
 	 * It is possible that the scd has been freed and reallocated with a
 	 * different region map while we've been waiting for the srd_scd_mutex.
 	 */
-	SF_RGNMAP_EQUAL(scd_rmap, &sp->scd_region_map,
-	    SFMMU_RGNMAP_WORDS, ret);
+	SF_RGNMAP_EQUAL(scd_rmap, &sp->scd_region_map, ret);
 	if (ret != 1) {
 		mutex_exit(&srdp->srd_scd_mutex);
 		return;
--- a/usr/src/uts/sfmmu/vm/hat_sfmmu.h	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.h	Thu Aug 06 17:39:39 2009 -0700
@@ -112,7 +112,6 @@
 #define	P_TNC	0x10		/* non-caching is temporary bit */
 #define	P_KPMS	0x20		/* kpm mapped small (vac alias prevention) */
 #define	P_KPMC	0x40		/* kpm conflict page (vac alias prevention) */
-#define	P_EXEC	0x80		/* execution reference (I-cache filled) */
 
 #define	PP_GENERIC_ATTR(pp)	((pp)->p_nrm & (P_MOD | P_REF | P_RO))
 #define	PP_ISMOD(pp)		((pp)->p_nrm & P_MOD)
@@ -125,7 +124,6 @@
 #endif
 #define	PP_ISKPMS(pp)		((pp)->p_nrm & P_KPMS)
 #define	PP_ISKPMC(pp)		((pp)->p_nrm & P_KPMC)
-#define	PP_ISEXEC(pp)		((pp)->p_nrm & P_EXEC)
 
 #define	PP_SETMOD(pp)		((pp)->p_nrm |= P_MOD)
 #define	PP_SETREF(pp)		((pp)->p_nrm |= P_REF)
@@ -138,7 +136,6 @@
 #endif
 #define	PP_SETKPMS(pp)		((pp)->p_nrm |= P_KPMS)
 #define	PP_SETKPMC(pp)		((pp)->p_nrm |= P_KPMC)
-#define	PP_SETEXEC(pp)		((pp)->p_nrm |= P_EXEC)
 
 #define	PP_CLRMOD(pp)		((pp)->p_nrm &= ~P_MOD)
 #define	PP_CLRREF(pp)		((pp)->p_nrm &= ~P_REF)
@@ -150,17 +147,6 @@
 #endif
 #define	PP_CLRKPMS(pp)		((pp)->p_nrm &= ~P_KPMS)
 #define	PP_CLRKPMC(pp)		((pp)->p_nrm &= ~P_KPMC)
-#define	PP_CLREXEC(pp)		((pp)->p_nrm &= ~P_EXEC)
-
-/*
- * Support for non-coherent I-cache. If the MD property "coherency"
- * is set to 0, it means that the I-cache must be flushed in
- * software. Use the "soft exec" bit in the TTE to detect when a page
- * has been executed, so that it can be flushed before it is re-used
- * for another program.
- */
-#define	TTE_EXECUTED(ttep)						\
-	(TTE_IS_EXECUTABLE(ttep) && TTE_IS_SOFTEXEC(ttep))
 
 /*
  * All shared memory segments attached with the SHM_SHARE_MMU flag (ISM)
@@ -337,15 +323,15 @@
 }
 
 /*
- * Returns 1 if region map1 and map2 are equal.
+ * Returns 1 if map1 and map2 are equal.
  */
-#define	SF_RGNMAP_EQUAL(map1, map2, words, rval)	{	\
+#define	SF_RGNMAP_EQUAL(map1, map2, rval)	{		\
 	int _i;							\
-	for (_i = 0; _i < words; _i++) {			\
+	for (_i = 0; _i < SFMMU_RGNMAP_WORDS; _i++) {		\
 		if ((map1)->bitmap[_i] != (map2)->bitmap[_i])	\
 			break;					\
 	}							\
-	if (_i < words)					\
+	if (_i < SFMMU_RGNMAP_WORDS)				\
 		rval = 0;					\
 	else							\
 		rval = 1;					\
@@ -609,13 +595,9 @@
 
 extern uint_t		max_mmu_ctxdoms;
 extern mmu_ctx_t	**mmu_ctxs_tbl;
-extern uint_t		nctxs;
 
 extern void	sfmmu_cpu_init(cpu_t *);
 extern void	sfmmu_cpu_cleanup(cpu_t *);
-extern void	sfmmu_invalidate_ctx(sfmmu_t *);
-extern hatlock_t *sfmmu_hat_enter(sfmmu_t *);
-extern void	sfmmu_hat_exit(hatlock_t *);
 
 /*
  * The following structure is used to get MMU context domain information for
@@ -652,6 +634,7 @@
 	uint64_t	cnum:16;
 } sfmmu_ctx_t;
 
+
 /*
  * The platform dependent hat structure.
  * tte counts should be protected by cas.
@@ -713,11 +696,7 @@
 	sf_rgn_link_t	*sfmmu_hmeregion_links[SFMMU_L1_HMERLINKS];
 	sf_rgn_link_t	sfmmu_scd_link;	/* link to scd or pending queue */
 #ifdef sun4v
-	/* ttecnt for Rock pagesize register management */
-	ulong_t		sfmmu_mmuttecnt[MMU_PAGE_SIZES];
 	struct hv_tsb_block sfmmu_hvblock;
-	struct hv_pgsz_order sfmmu_pgsz_order; /*  pagesize search order */
-	uint8_t		sfmmu_pgsz_map; /* bit map to control shared pgsz use */
 #endif
 	/*
 	 * sfmmu_ctxs is a variable length array of max_mmu_ctxdoms # of
@@ -763,8 +742,6 @@
 
 extern int disable_shctx;
 extern int shctx_on;
-extern int pgsz_search_on;
-extern int disable_pgsz_search;
 
 /*
  * bit mask for managing vac conflicts on large pages.
@@ -878,7 +855,6 @@
 #define	HAT_CTX1_FLAG   	0x100 /* ISM imap hatflag for ctx1 */
 #define	HAT_JOIN_SCD		0x200 /* region is joining scd */
 #define	HAT_ALLCTX_INVALID	0x400 /* all per-MMU ctxs are invalidated */
-#define	HAT_ISMNOTINSCD		0x800 /* Not all ISM segs are in the SCD */
 
 #define	SFMMU_LGPGS_INUSE(sfmmup)					\
 	(((sfmmup)->sfmmu_tteflags | (sfmmup)->sfmmu_rtteflags) ||	\
@@ -1822,8 +1798,7 @@
 	uintptr_t		scratch[3];
 	ulong_t		shmermap[SFMMU_HMERGNMAP_WORDS];	/* 8 bytes */
 	ulong_t		scd_shmermap[SFMMU_HMERGNMAP_WORDS];	/* 8 bytes */
-	uint8_t		pgsz_bitmap;		 /* limits ctx1 page sizes */
-	uint8_t		pad[47];		 /* pad to 64 bytes */
+	uint8_t		pad[48];			/* pad to 64 bytes */
 };
 
 /*
@@ -2354,17 +2329,11 @@
 #pragma weak mmu_large_pages_disabled
 #pragma weak mmu_set_ctx_page_sizes
 #pragma weak mmu_check_page_sizes
-#pragma weak mmu_set_pgsz_order
-#pragma weak sfmmu_init_pgsz_hv
-#pragma weak mmu_enable_pgsz_search
 
 extern void mmu_init_scd(sf_scd_t *);
 extern uint_t mmu_large_pages_disabled(uint_t);
 extern void mmu_set_ctx_page_sizes(sfmmu_t *);
 extern void mmu_check_page_sizes(sfmmu_t *, uint64_t *);
-extern void mmu_set_pgsz_order(sfmmu_t *, int);
-extern void sfmmu_init_pgsz_hv(sfmmu_t *);
-extern void mmu_enable_pgsz_search();
 
 extern sfmmu_t 		*ksfmmup;
 extern caddr_t		ktsb_base;
@@ -2406,15 +2375,12 @@
 extern uint_t		disable_ism_large_pages;
 extern uint_t		disable_auto_data_large_pages;
 extern uint_t		disable_auto_text_large_pages;
-extern uint_t		disable_shctx_large_pages;
-
-extern void		sfmmu_patch_shctx(void);
-extern void		sfmmu_patch_pgsz_reg(void);
 
 /* kpm externals */
 extern pfn_t		sfmmu_kpm_vatopfn(caddr_t);
 extern void		sfmmu_kpm_patch_tlbm(void);
 extern void		sfmmu_kpm_patch_tsbm(void);
+extern void		sfmmu_patch_shctx(void);
 extern void		sfmmu_kpm_load_tsb(caddr_t, tte_t *, int);
 extern void		sfmmu_kpm_unload_tsb(caddr_t, int);
 extern void		sfmmu_kpm_tsbmtl(short *, uint_t *, int);
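
The simplified SF_RGNMAP_EQUAL() above now hard-codes SFMMU_RGNMAP_WORDS instead of taking a word count from the caller. A standalone equivalent of what the macro's loop computes; the type name and word count here are stand-ins:

#include <string.h>

#define	RGNMAP_WORDS	4	/* stand-in for SFMMU_RGNMAP_WORDS */

typedef struct sf_region_map_sketch {
	unsigned long bitmap[RGNMAP_WORDS];
} sf_region_map_sketch_t;

/* Returns 1 if map1 and map2 are equal, as the macro does. */
static int
rgnmap_equal(const sf_region_map_sketch_t *map1,
    const sf_region_map_sketch_t *map2)
{
	/* word-by-word compare, collapsed into one memcmp */
	return (memcmp(map1->bitmap, map2->bitmap,
	    sizeof (map1->bitmap)) == 0);
}
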
--- a/usr/src/uts/sparc/fpu/fpu_simulator.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sparc/fpu/fpu_simulator.c	Thu Aug 06 17:39:39 2009 -0700
@@ -112,14 +112,6 @@
 	{ "fpu_sim_fnmaddd",		KSTAT_DATA_UINT64},
 	{ "fpu_sim_fnmsubs",		KSTAT_DATA_UINT64},
 	{ "fpu_sim_fnmsubd",		KSTAT_DATA_UINT64},
-	{ "fpu_sim_fumadds",		KSTAT_DATA_UINT64},
-	{ "fpu_sim_fumaddd",		KSTAT_DATA_UINT64},
-	{ "fpu_sim_fumsubs",		KSTAT_DATA_UINT64},
-	{ "fpu_sim_fumsubd",		KSTAT_DATA_UINT64},
-	{ "fpu_sim_fnumadds",		KSTAT_DATA_UINT64},
-	{ "fpu_sim_fnumaddd",		KSTAT_DATA_UINT64},
-	{ "fpu_sim_fnumsubs",		KSTAT_DATA_UINT64},
-	{ "fpu_sim_fnumsubd",		KSTAT_DATA_UINT64},
 	{ "fpu_sim_invalid",		KSTAT_DATA_UINT64},
 };
 
@@ -185,14 +177,12 @@
 	enum fcc_type	cc;
 	uint32_t	nfcc;		/* fcc number field. */
 	uint64_t	lusr;
-	uint_t		fmau_mul_exceptions;
 
 	nrs1 = inst.rs1;
 	nrs2 = inst.rs2;
 	nrd = inst.rd;
 	fsr = *pfsr;
 	pfpsd->fp_current_exceptions = 0;	/* Init current exceptions. */
-	fmau_mul_exceptions = 0;
 	pfpsd->fp_fsrtem    = fsr.tem;		/* Obtain fsr's tem */
 	/*
 	 * Obtain rounding direction and precision
@@ -200,7 +190,7 @@
 	pfpsd->fp_direction = GSR_IM(gsr) ? GSR_IRND(gsr) : fsr.rnd;
 	pfpsd->fp_precision = fsr.rnp;
 
-	if (inst.op3 == 0x37) { /* FMA-fused opcode */
+	if (inst.op3 == 0x37) { /* IMPDEP2B FMA-fused opcode */
 		fp_fma_inst_type *fma_inst;
 		uint32_t	nrs3;
 		unpacked	us3;
@@ -263,121 +253,6 @@
 			FPUINFO_KSTAT_PREC(fma_inst->sz, fpu_sim_fnmsubs,
 			    fpu_sim_fnmsubd, fpu_sim_invalid);
 		}
-	} else if (inst.op3 == fmau) { /* FMA-unfused opcode */
-		fp_fma_inst_type *fmau_inst;
-		uint32_t	nrs3;
-		unpacked	us3;
-		unpacked	ust;
-		/*
-		 * For FMA-unfused, if either the multiply part or the add
-		 * part raises an exception whose trap is enabled, we trap
-		 * with cexc indicating only that exception and aexc un-
-		 * changed.  If neither part raises an exception whose trap
-		 * is enabled, the instruction completes with cexc indicating
-		 * just those exceptions that occurred in the add part and
-		 * aexc accumulating all exceptions that occurred in either
-		 * part.  We use fmau_mul_exceptions to keep track of the
-		 * exceptions that occurred in the multiply part while we
-		 * simulate the add part.
-		 */
-		fmau_inst = (fp_fma_inst_type *) &inst;
-		nrs2 = fmau_inst->rs2;
-		nrs3 = fmau_inst->rs3;
-		switch (fmau_inst->var) {
-		case fmadd:
-			_fp_unpack(pfpsd, &us1, nrs1, fmau_inst->sz);
-			_fp_unpack(pfpsd, &us2, nrs2, fmau_inst->sz);
-			_fp_mul(pfpsd, &us1, &us2, &ust);
-			_fp_pack(pfpsd, &ust, nrd, fmau_inst->sz);
-			if ((pfpsd->fp_current_exceptions & fsr.tem) == 0) {
-				fmau_mul_exceptions =
-				    pfpsd->fp_current_exceptions;
-				pfpsd->fp_current_exceptions = 0;
-				_fp_unpack(pfpsd, &us3, nrs3, fmau_inst->sz);
-				_fp_unpack(pfpsd, &ust, nrd, fmau_inst->sz);
-				_fp_add(pfpsd, &ust, &us3, &ud);
-				/* ensure QSNaN1 has precedence over QNaN3 */
-				if ((us3.fpclass == fp_quiet) &&
-				    ((us1.fpclass == fp_signaling) ||
-				    (us2.fpclass == fp_signaling)))
-					ud = ust;
-				_fp_pack(pfpsd, &ud, nrd, fmau_inst->sz);
-			}
-			FPUINFO_KSTAT_PREC(fmau_inst->sz, fpu_sim_fumadds,
-			    fpu_sim_fumaddd, fpu_sim_invalid);
-			break;
-		case fmsub:
-			_fp_unpack(pfpsd, &us1, nrs1, fmau_inst->sz);
-			_fp_unpack(pfpsd, &us2, nrs2, fmau_inst->sz);
-			_fp_mul(pfpsd, &us1, &us2, &ust);
-			_fp_pack(pfpsd, &ust, nrd, fmau_inst->sz);
-			if ((pfpsd->fp_current_exceptions & fsr.tem) == 0) {
-				fmau_mul_exceptions =
-				    pfpsd->fp_current_exceptions;
-				pfpsd->fp_current_exceptions = 0;
-				_fp_unpack(pfpsd, &us3, nrs3, fmau_inst->sz);
-				_fp_unpack(pfpsd, &ust, nrd, fmau_inst->sz);
-				_fp_sub(pfpsd, &ust, &us3, &ud);
-				/* ensure QSNaN1 has precedence over QNaN3 */
-				if ((us3.fpclass == fp_quiet) &&
-				    ((us1.fpclass == fp_signaling) ||
-				    (us2.fpclass == fp_signaling)))
-					ud = ust;
-				_fp_pack(pfpsd, &ud, nrd, fmau_inst->sz);
-			}
-			FPUINFO_KSTAT_PREC(fmau_inst->sz, fpu_sim_fumsubs,
-			    fpu_sim_fumsubd, fpu_sim_invalid);
-			break;
-		case fnmadd:
-			_fp_unpack(pfpsd, &us1, nrs1, fmau_inst->sz);
-			_fp_unpack(pfpsd, &us2, nrs2, fmau_inst->sz);
-			_fp_mul(pfpsd, &us1, &us2, &ust);
-			_fp_pack(pfpsd, &ust, nrd, fmau_inst->sz);
-			if ((pfpsd->fp_current_exceptions & fsr.tem) == 0) {
-				fmau_mul_exceptions =
-				    pfpsd->fp_current_exceptions;
-				pfpsd->fp_current_exceptions = 0;
-				_fp_unpack(pfpsd, &us3, nrs3, fmau_inst->sz);
-				_fp_unpack(pfpsd, &ust, nrd, fmau_inst->sz);
-				if (ust.fpclass != fp_quiet &&
-				    ust.fpclass != fp_signaling)
-					ust.sign ^= 1;
-				_fp_sub(pfpsd, &ust, &us3, &ud);
-				/* ensure QSNaN1 has precedence over QNaN3 */
-				if ((us3.fpclass == fp_quiet) &&
-				    ((us1.fpclass == fp_signaling) ||
-				    (us2.fpclass == fp_signaling)))
-					ud = ust;
-				_fp_pack(pfpsd, &ud, nrd, fmau_inst->sz);
-			}
-			FPUINFO_KSTAT_PREC(fmau_inst->sz, fpu_sim_fnumadds,
-			    fpu_sim_fnumaddd, fpu_sim_invalid);
-			break;
-		case fnmsub:
-			_fp_unpack(pfpsd, &us1, nrs1, fmau_inst->sz);
-			_fp_unpack(pfpsd, &us2, nrs2, fmau_inst->sz);
-			_fp_mul(pfpsd, &us1, &us2, &ust);
-			_fp_pack(pfpsd, &ust, nrd, fmau_inst->sz);
-			if ((pfpsd->fp_current_exceptions & fsr.tem) == 0) {
-				fmau_mul_exceptions =
-				    pfpsd->fp_current_exceptions;
-				pfpsd->fp_current_exceptions = 0;
-				_fp_unpack(pfpsd, &us3, nrs3, fmau_inst->sz);
-				_fp_unpack(pfpsd, &ust, nrd, fmau_inst->sz);
-				if (ust.fpclass != fp_quiet &&
-				    ust.fpclass != fp_signaling)
-					ust.sign ^= 1;
-				_fp_add(pfpsd, &ust, &us3, &ud);
-				/* ensure QSNaN1 has precedence over QNaN3 */
-				if ((us3.fpclass == fp_quiet) &&
-				    ((us1.fpclass == fp_signaling) ||
-				    (us2.fpclass == fp_signaling)))
-					ud = ust;
-				_fp_pack(pfpsd, &ud, nrd, fmau_inst->sz);
-			}
-			FPUINFO_KSTAT_PREC(fmau_inst->sz, fpu_sim_fnumsubs,
-			    fpu_sim_fnumsubd, fpu_sim_invalid);
-		}
 	} else {
 		nfcc = nrd & 0x3;
 		if (inst.op3 == 0x35) {		/* fpop2 */
@@ -645,7 +520,7 @@
 		*pfsr = fsr;
 		return (ftt_ieee);
 	} else {	/* Just set accrued exception field. */
-		fsr.aexc |= pfpsd->fp_current_exceptions | fmau_mul_exceptions;
+		fsr.aexc |= pfpsd->fp_current_exceptions;
 	}
 	*pfsr = fsr;
 	return (ftt_none);
@@ -697,7 +572,7 @@
 			return (ftt);
 	} else if ((fp.inst.hibits == 2) &&
 	    ((fp.inst.op3 == 0x34) || (fp.inst.op3 == 0x35) ||
-	    (fp.inst.op3 == 0x37) || (fp.inst.op3 == 0x3f))) {
+	    (fp.inst.op3 == 0x37))) {
 		ftt =  _fp_fpu_simulator(pfpsd, fp.inst, pfsr, gsr);
 		if (ftt == ftt_none || ftt == ftt_ieee) {
 			pregs->r_pc = pregs->r_npc;
@@ -776,7 +651,7 @@
 
 	if ((fp.inst.hibits == 2) &&
 	    ((fp.inst.op3 == 0x34) || (fp.inst.op3 == 0x35) ||
-	    (fp.inst.op3 == 0x37) || (fp.inst.op3 == 0x3f))) {
+	    (fp.inst.op3 == 0x37))) {
 		ftt = _fp_fpu_simulator(pfpsd, fp.inst, (fsr_type *)&tfsr, gsr);
 		/* Do not retry emulated instruction. */
 		pregs->r_pc = pregs->r_npc;
@@ -816,7 +691,7 @@
 		return (ftt);
 	if ((fp.inst.hibits == 2) &&		/* fpops */
 	    ((fp.inst.op3 == 0x34) || (fp.inst.op3 == 0x35) ||
-	    (fp.inst.op3 == 0x37) || (fp.inst.op3 == 0x3f))) {
+	    (fp.inst.op3 == 0x37))) {
 		ftt = _fp_fpu_simulator(pfpsd, fp.inst, (fsr_type *)&tfsr, gsr);
 		/* Do not retry emulated instruction. */
 		pfpu->fpu_fsr = tfsr;
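
The bookkeeping deleted above followed the rule spelled out in the removed block comment: if either half of the unfused FMA raises a trap-enabled exception, the simulator traps with cexc naming only that half and aexc unchanged; otherwise cexc reports just the add half while aexc accumulates both. A compilable sketch of that rule, with illustrative names and types rather than the simulator's own:

typedef unsigned int excmask_t;

struct fsr_sketch {
	excmask_t tem;	/* trap-enable mask */
	excmask_t cexc;	/* current exceptions */
	excmask_t aexc;	/* accrued exceptions */
};

static int
fmau_sketch(struct fsr_sketch *fsr, excmask_t mul_exc, excmask_t add_exc)
{
	if (mul_exc & fsr->tem) {
		fsr->cexc = mul_exc;	/* trap: cexc names the multiply only */
		return (1);
	}
	if (add_exc & fsr->tem) {
		fsr->cexc = add_exc;	/* trap: cexc names the add only */
		return (1);
	}
	fsr->cexc = add_exc;		/* complete: cexc is the add part */
	fsr->aexc |= mul_exc | add_exc;	/* aexc accumulates both parts */
	return (0);
}
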
--- a/usr/src/uts/sparc/sys/fpu/fpu_simulator.h	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sparc/sys/fpu/fpu_simulator.h	Thu Aug 06 17:39:39 2009 -0700
@@ -204,7 +204,7 @@
 	ft_op_38	= 0x38,
 	fp_op_39 = 0x39, fp_op_3a = 0x3a, fp_op_3b = 0x3b,
 	fp_op_3c	= 0x3c,
-	fp_op_3d = 0x3d, fp_op_3e = 0x3e, fmau = 0x3f
+	fp_op_3d = 0x3d, fp_op_3e = 0x3e, fp_op_3f = 0x3f
 };
 
 typedef			/* FPU instruction. */
@@ -219,14 +219,14 @@
 	uint32_t		rs2	: 5;	/* Second operand. */
 } fp_inst_type;
 
-enum fp_op_fma_var {	/* FMA-fused/unfused instr. variations */
+enum fp_op_fma_var {	/* IMPDEP2B FMA-fused instr. variations */
 	fmadd	=	0,
 	fmsub	=	1,
 	fnmsub	=	2,
 	fnmadd	=	3
 };
 
-typedef		/* FPU FMA-fused/unfused instructions. */
+typedef		/* IMPDEP2B FPU FMA-fused instruction. */
 	struct {
 	uint32_t		hibits	: 2;	/* Top two bits. */
 	uint32_t		rd	: 5;	/* Destination. */
@@ -330,14 +330,6 @@
 	struct kstat_named		fpu_sim_fnmaddd;
 	struct kstat_named		fpu_sim_fnmsubs;
 	struct kstat_named		fpu_sim_fnmsubd;
-	struct kstat_named		fpu_sim_fumadds;
-	struct kstat_named		fpu_sim_fumaddd;
-	struct kstat_named		fpu_sim_fumsubs;
-	struct kstat_named		fpu_sim_fumsubd;
-	struct kstat_named		fpu_sim_fnumadds;
-	struct kstat_named		fpu_sim_fnumaddd;
-	struct kstat_named		fpu_sim_fnumsubs;
-	struct kstat_named		fpu_sim_fnumsubd;
 	struct kstat_named		fpu_sim_invalid;
 };
 
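
With the fmau opcode gone, the simulator keys FMA handling off op3 == 0x37 alone. A sketch of that decode using the SPARC format-3 field order the header's bitfield structs mirror; big-endian bitfield layout is assumed, and the type name is illustrative:

#include <stdint.h>

typedef struct {
	uint32_t hibits	: 2;	/* top two bits; 2 for FPops */
	uint32_t rd	: 5;	/* destination register */
	uint32_t op3	: 6;	/* opcode field tested below */
	uint32_t rs1	: 5;	/* first operand */
	uint32_t opcode	: 9;	/* opf / variant bits */
	uint32_t rs2	: 5;	/* second operand */
} fp_inst_sketch_t;

/* Only the IMPDEP2B fused-FMA opcode remains a simulation candidate. */
static int
is_fused_fma(fp_inst_sketch_t inst)
{
	return (inst.hibits == 2 && inst.op3 == 0x37);
}
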
--- a/usr/src/uts/sun4/os/startup.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4/os/startup.c	Thu Aug 06 17:39:39 2009 -0700
@@ -896,7 +896,7 @@
 	PRM_DEBUG(kmem64_pabase);
 	PRM_DEBUG(kmem64_szc);
 	sfmmu_memtte(&tte, kmem64_pabase >> MMU_PAGESHIFT,
-	    PROC_DATA | HAT_NOSYNC | HAT_ATTR_NOSOFTEXEC, kmem64_szc);
+	    PROC_DATA | HAT_NOSYNC, kmem64_szc);
 	PRM_DEBUG(tte.ll);
 	(void) sprintf(b, kmem64_obp_str,
 	    kmem64_base, kmem64_end, TTE_PAGEMASK(kmem64_szc), tte.ll);
@@ -2850,8 +2850,6 @@
 	"h# %p constant KCONTEXT "
 	"h# %p constant KHATID "
 	"h# %x constant ASI_MEM "
-	"h# %x constant SOFTEXEC "
-	"h# %x constant EXECPRM "
 
 	": PHYS-X@ ( phys -- data ) "
 	"   ASI_MEM spacex@ "
@@ -2954,11 +2952,7 @@
 	"         ?dup  if                    ( addr sfmmup hmeblkp ) "
 	"            nip swap HBLK_TO_TTEP    ( ttep ) "
 	"            dup TTE_IS_VALID  if     ( valid-ttep ) "
-	"               PHYS-X@               ( tte-data ) "
-	"               dup SOFTEXEC and 0> if  ( tte-data ) "
-	"                 SOFTEXEC - EXECPRM or ( tte-data ) "
-	"               then                    ( tte-data ) "
-	"               true                  ( tte-data true ) "
+	"               PHYS-X@ true          ( tte-data true ) "
 	"            else                     ( invalid-tte ) "
 	"               drop false            ( false ) "
 	"            then                     ( false | tte-data true ) "
@@ -3009,9 +3003,7 @@
 	    KHMEHASH_SZ,
 	    KCONTEXT,
 	    KHATID,
-	    ASI_MEM,
-	    icache_is_coherent ? 0 : TTE_SOFTEXEC_INT,
-	    TTE_EXECPRM_INT);
+	    ASI_MEM);
 	prom_interpret(bp, 0, 0, 0, 0, 0);
 
 	kobj_free(bp, MMU_PAGESIZE);
--- a/usr/src/uts/sun4/vm/sfmmu.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4/vm/sfmmu.c	Thu Aug 06 17:39:39 2009 -0700
@@ -199,10 +199,6 @@
 		sfmmu_patch_shctx();
 	}
 
-	if (&mmu_enable_pgsz_search) {
-		mmu_enable_pgsz_search();
-	}
-
 	/*
 	 * The 8K-indexed kernel TSB space is used to hold
 	 * translations below...
--- a/usr/src/uts/sun4/vm/vm_dep.h	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4/vm/vm_dep.h	Thu Aug 06 17:39:39 2009 -0700
@@ -859,16 +859,6 @@
 extern size_t ndata_maxsize(struct memlist *);
 extern size_t ndata_spare(struct memlist *, size_t, size_t);
 
-/*
- * Platform specific support for non-coherent I-cache and soft exec
- */
-extern uint_t	icache_is_coherent;
-extern uint_t	force_sync_icache_after_bcopy;
-extern uint_t	force_sync_icache_after_dma;
-
-extern void	mach_setup_icache(uint_t);
-#pragma weak	mach_setup_icache
-
 #ifdef	__cplusplus
 }
 #endif
--- a/usr/src/uts/sun4u/sys/pte.h	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4u/sys/pte.h	Thu Aug 06 17:39:39 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -354,23 +354,6 @@
 
 #endif /* !_ASM */
 
-/*
- * There is no support for non-coherent I-cache in sun4u
- */
-#define	TTE_SOFTEXEC_INT	0x00000000
-#ifndef _ASM
-#ifdef lint
-/* fix lint warnings about constant conditionals and empty if */
-#define	TTE_IS_SOFTEXEC(ttep)	TTE_IS_EXECUTABLE(ttep)
-#define	TTE_SET_SOFTEXEC(ttep)	TTE_SET_EXEC(ttep)
-#define	TTE_CLR_SOFTEXEC(ttep)	TTE_CLR_EXEC(ttep)
-#else
-#define	TTE_IS_SOFTEXEC(ttep)	(0)
-#define	TTE_SET_SOFTEXEC(ttep)
-#define	TTE_CLR_SOFTEXEC(ttep)
-#endif	/* lint */
-#endif /* !_ASM */
-
 #ifdef	__cplusplus
 }
 #endif
--- a/usr/src/uts/sun4u/vm/mach_sfmmu.h	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4u/vm/mach_sfmmu.h	Thu Aug 06 17:39:39 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -112,12 +112,6 @@
 	/* END CSTYLED */
 
 /*
- * This macro is to control the pagesizes used for shared context on
- * Rock systems.
- */
-#define	CHECK_SHARED_PGSZ(tsbarea, tte, tmp, use_shctx, label)
-
-/*
  * This macro is used in the MMU code to check if TL should be lowered from
  * 2 to 1 to pop trapstat's state.  See the block comment in trapstat.c
  * for details.
@@ -267,12 +261,6 @@
 	/* CSTYLED */							\
 label/**/1:
 
-/*
- * No support for non-coherent I-cache in sun4u
- */
-#define	TTE_SET_EXEC_ML(tte, ttepa, tmp1, label)
-#define	TTE_CLR_SOFTEXEC_ML(tte)
-#define	TTE_CHK_SOFTEXEC_ML(tte)	andcc tte, 0, %g0
 
 /*
  * TTE_SET_REF_ML is a macro that updates the reference bit if it is
--- a/usr/src/uts/sun4v/Makefile.files	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/Makefile.files	Thu Aug 06 17:39:39 2009 -0700
@@ -179,7 +179,6 @@
 #
 NI_PCBE_OBJS	= niagara_pcbe.o
 N2_PCBE_OBJS	= niagara2_pcbe.o
-RK_PCBE_OBJS	= rock_pcbe.o
 
 #
 #			cpu modules
@@ -190,7 +189,6 @@
 NIAGARACPU_OBJS += niagara_asm.o atomic.o
 NIAGARA2CPU_OBJS = niagara2.o niagara_copy.o common_asm.o niagara_perfctr.o
 NIAGARA2CPU_OBJS += niagara2_asm.o atomic.o
-ROCKCPU_OBJS = rock.o rock_copy.o common_asm.o rock_asm.o atomic.o
 
 #
 #			platform module
--- a/usr/src/uts/sun4v/Makefile.sun4v.shared	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/Makefile.sun4v.shared	Thu Aug 06 17:39:39 2009 -0700
@@ -433,9 +433,9 @@
 #
 #	cpu modules
 #
-CPU_KMODS	+= generic niagara niagara2 vfalls rock
+CPU_KMODS	+= generic niagara niagara2 vfalls
 
-LINT_CPU_KMODS	+= generic rock
+LINT_CPU_KMODS	+= generic
 
 #
 #	Performance Counter BackEnd Modules (/usr/kernel/pcbe):
@@ -443,4 +443,3 @@
 PCBE_KMODS	+= niagara_pcbe
 PCBE_KMODS	+= niagara2_pcbe
 PCBE_KMODS	+= vfalls_pcbe
-PCBE_KMODS	+= rock_pcbe
--- a/usr/src/uts/sun4v/cpu/rock.c	Thu Aug 06 17:19:00 2009 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1014 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/types.h>
-#include <sys/systm.h>
-#include <sys/archsystm.h>
-#include <sys/machparam.h>
-#include <sys/machsystm.h>
-#include <sys/cpu.h>
-#include <sys/elf_SPARC.h>
-#include <vm/page.h>
-#include <vm/vm_dep.h>
-#include <sys/cpuvar.h>
-#include <sys/async.h>
-#include <sys/cmn_err.h>
-#include <sys/debug.h>
-#include <sys/dditypes.h>
-#include <sys/sunddi.h>
-#include <sys/cpu_module.h>
-#include <sys/prom_debug.h>
-#include <sys/vmsystm.h>
-#include <sys/prom_plat.h>
-#include <sys/sysmacros.h>
-#include <sys/intreg.h>
-#include <sys/machtrap.h>
-#include <sys/ontrap.h>
-#include <sys/ivintr.h>
-#include <sys/atomic.h>
-#include <sys/panic.h>
-#include <sys/dtrace.h>
-#include <vm/seg_spt.h>
-#include <sys/hypervisor_api.h>
-#include <sys/rock_hypervisor_api.h>
-#include <sys/hsvc.h>
-#include <vm/hat_sfmmu.h>
-#include <sys/mutex_impl.h>
-
-uint_t root_phys_addr_lo_mask = 0xffffffffU;
-uint8_t	enable_tm = 1;
-
-char cpu_module_name[] = "SUNW,UltraSPARC-AT10";
-boolean_t	hsvc_tm_available = B_TRUE;
-
-static	hsvc_info_t rock_tm_hsvc = {
-	HSVC_REV_1,		/* HSVC rev num */
-	NULL,			/* Private */
-	HSVC_GROUP_TM,		/* Requested API Group */
-	ROCK_HSVC_MAJOR,	/* Requested Major */
-	ROCK_HSVC_MINOR,	/* Requested Minor */
-	cpu_module_name		/* Module name */
-};
-
-boolean_t	hsvc_mmu_ext_available = B_TRUE;
-
-static	hsvc_info_t rock_mmu_ext_hsvc = {
-	HSVC_REV_1,		/* HSVC rev num */
-	NULL,			/* Private */
-	HSVC_GROUP_RKMMU_EXT,	/* Requested API Group */
-	ROCK_HSVC_MAJOR,	/* Requested Major */
-	ROCK_HSVC_MINOR,	/* Requested Minor */
-	cpu_module_name		/* Module name */
-};
-
-static void encode_pgsz_order(uint64_t, int, int, uint16_t *, uchar_t *);
-static void set_pgsz_order(uchar_t, uchar_t, uint64_t *, int *, int *,
-    sfmmu_t *);
-
-extern	void rock_mutex_delay(void);
-
-/*
- * External /etc/system tunable for controlling whether shared or private pages
- * come first in the pagesize order register.
- */
-int pgsz_order_shared_first = 1;
-
-#define	MCOREID_MASK	0x1E
-#define	MCOREID_SHIFT	1
-
-static uint_t mmu_disable_large_pages = ((1 << TTE512K) | (1 << TTE32M) |
-		(1 << TTE2G) | (1 << TTE16G));
-static uint_t mmu_disable_ism_large_pages = ((1 << TTE512K) | (1 << TTE32M) |
-	(1 << TTE2G) | (1 << TTE16G));
-static uint_t mmu_disable_auto_data_large_pages = ((1 << TTE512K) |
-	(1 << TTE32M) | (1 << TTE2G) | (1 << TTE16G));
-static uint_t mmu_disable_auto_text_large_pages = ((1 << TTE512K) |
-	(1 << TTE32M) | (1 << TTE2G) | (1 << TTE16G));
-
-void
-cpu_setup(void)
-{
-	extern int	cpc_has_overflow_intr;
-	uint64_t	sup_minor;
-	int		status;
-
-	/*
-	 * The setup common to all CPU modules is done in the cpu_setup_common
-	 * routine.
-	 */
-	cpu_setup_common(NULL);
-
-	/*
-	 * Rock's max nctxs is 64K. Set it accordingly.
-	 */
-	nctxs = MAX_NCTXS;
-
-	/*
-	 * Rock I$ is non-coherent.
-	 */
-	mach_setup_icache(0);
-
-#ifdef DEBUG
-	/*
-	 * These should always be present on Rock
-	 */
-	if (cpu_hwcap_flags == 0)
-		cmn_err(CE_WARN, "hwcap-list missing from MD");
-#endif
-	cpu_hwcap_flags |= AV_SPARC_ASI_CACHE_SPARING;
-
-	cache |= (CACHE_PTAG | CACHE_IOCOHERENT);
-
-	if (use_page_coloring) {
-		do_pg_coloring = 1;
-	}
-
-	/*
-	 * Rock generates an hpriv performance event trap instead of a pic
-	 * overflow trap. To get the guest's attention, the hv in turn
-	 * generates a pic overflow trap. Therefore enable support for that.
-	 */
-	cpc_has_overflow_intr = 1;
-
-	/*
-	 * Enable 4M pages for OOB.
-	 */
-	max_uheap_lpsize = MMU_PAGESIZE4M;
-	max_ustack_lpsize = MMU_PAGESIZE4M;
-	max_privmap_lpsize = MMU_PAGESIZE4M;
-
-	/*
-	 * hv_tm_enable is a part of TM group. We need to
-	 * negotiate that API group before we can use it.
-	 */
-	status = hsvc_register(&rock_tm_hsvc, &sup_minor);
-	if ((status != 0) || (sup_minor < (uint64_t)ROCK_HSVC_MINOR)) {
-		cmn_err(CE_WARN, "%s cannot negotiate hypervisor services: "
-		    "major: 0x%lx minor: 0x%lx group: 0x%x errno: %d",
-		    cpu_module_name, rock_tm_hsvc.hsvc_major,
-		    rock_tm_hsvc.hsvc_minor, HSVC_GROUP_TM, status);
-		hsvc_tm_available = B_FALSE;
-	}
-
-	/*
-	 * Negotiate API group for rock mmu extensions.
-	 */
-	status = hsvc_register(&rock_mmu_ext_hsvc, &sup_minor);
-	if ((status != 0) || (sup_minor <
-	    (uint64_t)ROCK_HSVC_MINOR)) {
-		cmn_err(CE_WARN, "%s cannot negotiate hypervisor services: "
-		    "major: 0x%lx minor: 0x%lx group: 0x%x errno: %d",
-		    cpu_module_name, rock_mmu_ext_hsvc.hsvc_major,
-		    rock_mmu_ext_hsvc.hsvc_minor, HSVC_GROUP_RKMMU_EXT,
-		    status);
-		hsvc_mmu_ext_available = B_FALSE;
-	}
-}
-
-/*
- * Set the magic constants of the implementation.
- */
-void
-cpu_fiximp(struct cpu_node *cpunode)
-{
-	/*
-	 * The Cache node is optional in MD. Therefore, if it
-	 * does not exist, use hardcoded values.
-	 */
-#ifdef DEBUG
-	/*
-	 * ...that said, we do want this info to come from the MD.
-	 */
-	if (cpunode->ecache_size == 0 || cpunode->ecache_linesize == 0 ||
-	    cpunode->ecache_associativity == 0) {
-		cmn_err(CE_WARN, "ecache info missing from MD");
-	}
-#endif
-	if (cpunode->ecache_size == 0)
-		cpunode->ecache_size = 2 * 1024 * 1024;
-	if (cpunode->ecache_linesize == 0)
-		cpunode->ecache_linesize = 64;
-	if (cpunode->ecache_associativity == 0)
-		cpunode->ecache_associativity = 8;
-}
-
-void
-dtrace_flush_sec(uintptr_t addr)
-{
-	pfn_t pfn;
-	proc_t *procp = ttoproc(curthread);
-	page_t *pp;
-	caddr_t va;
-
-	pfn = hat_getpfnum(procp->p_as->a_hat, (void *)addr);
-	if (pfn != -1) {
-		ASSERT(pf_is_memory(pfn));
-		pp = page_numtopp_noreclaim(pfn, SE_SHARED);
-		if (pp != NULL) {
-			va = ppmapin(pp, PROT_READ | PROT_WRITE, (void *)addr);
-			/* sparc needs 8-byte align */
-			doflush((caddr_t)((uintptr_t)va & -8l));
-			ppmapout(va);
-			page_unlock(pp);
-		}
-	}
-}
-
-void
-cpu_map_exec_units(struct cpu *cp)
-{
-	ASSERT(MUTEX_HELD(&cpu_lock));
-
-	/*
-	 * The cpu_ipipe and cpu_fpu fields are initialized based on
-	 * the execution unit sharing information from the MD. They
-	 * default to the CPU id in the absence of such information.
-	 */
-	cp->cpu_m.cpu_ipipe = cpunodes[cp->cpu_id].exec_unit_mapping;
-	if (cp->cpu_m.cpu_ipipe == NO_EU_MAPPING_FOUND)
-		cp->cpu_m.cpu_ipipe = (id_t)(cp->cpu_id);
-
-	cp->cpu_m.cpu_fpu = cpunodes[cp->cpu_id].fpu_mapping;
-	if (cp->cpu_m.cpu_fpu == NO_EU_MAPPING_FOUND)
-		cp->cpu_m.cpu_fpu = (id_t)(cp->cpu_id);
-
-	cp->cpu_m.cpu_core = (cp->cpu_id & MCOREID_MASK) >> MCOREID_SHIFT;
-
-	/*
-	 * The cpu_chip field is initialized based on the information
-	 * in the MD and assumes that all cpus within a chip
-	 * share the same L2 cache. If no such info is available, we
-	 * set the cpu to CPU_CHIPID_INVALID.
-	 */
-	cp->cpu_m.cpu_mpipe = cpunodes[cp->cpu_id].l2_cache_mapping;
-	if (cp->cpu_m.cpu_mpipe == NO_L2_CACHE_MAPPING_FOUND)
-		cp->cpu_m.cpu_mpipe = CPU_L2_CACHEID_INVALID;
-
-	cp->cpu_m.cpu_chip = cpunodes[cp->cpu_id].l2_cache_mapping;
-	if (cp->cpu_m.cpu_chip == NO_L2_CACHE_MAPPING_FOUND)
-		cp->cpu_m.cpu_chip = CPU_CHIPID_INVALID;
-}
-
-void
-cpu_init_private(struct cpu *cp)
-{
-	cpu_map_exec_units(cp);
-	mutex_delay = rock_mutex_delay;
-}
-
-/*ARGSUSED*/
-void
-cpu_uninit_private(struct cpu *cp)
-{
-}
-
-/*
- * cpu_feature_init
- *
- * This function is called once per strand.
- */
-void
-cpu_feature_init(void)
-{
-	static	int	set_mutex_backoff_tunables = 0;
-	/*
-	 * Set constants for mutex_backoff only once.
-	 * On Rock, setting this to 8 gives the best performance,
-	 * even for multi-chip systems.
-	 */
-	if (! set_mutex_backoff_tunables) {
-		mutex_backoff_base = 1;
-		mutex_cap_factor = 8;
-		set_mutex_backoff_tunables = 1;
-	}
-
-	/*
-	 * Enable or disable for each cpu if hypervisor API is negotiated.
-	 */
-	if (hsvc_tm_available == B_TRUE)
-		(void) hv_tm_enable((uint64_t)enable_tm);
-}
-
-/*
- * Flush specified address range from I$ via hv_mem_iflush interface
- * Note that the hypervisor interface expects physical address range
- * and can flush less than the requested size.
- */
-
-void
-rock_sync_icache(caddr_t addr, size_t size)
-{
-	uint64_t pa, i, flushlen, flushed;
-
-	if (!force_sync_icache_after_bcopy)
-		/*
-		 * Do not clear the I-cache after bcopy.
-		 * The default value is 0. This flag may be
-		 * set via /etc/system.
-		 */
-		return;
-
-	if (!tba_taken_over)
-		/*
-		 * Very early in boot, va_to_pa() will try to call back
-		 * into OBP.  Very *very* early in boot, this will fail
-		 * because we haven't set up the OBP callback handler.
-		 * (Without this check, kmdb boot will fail.)
-		 */
-		return;
-
-	for (i = 0; i < size; i += flushed) {
-		pa = va_to_pa(addr + i);
-		ASSERT(pa != -1);
-
-		/*
-		 * Only flush the required length up to a PAGESIZE.
-		 */
-
-		flushlen = MIN((size - i), (PAGESIZE - (pa & MMU_PAGEOFFSET)));
-
-		/*
-		 * Flush I$ up to the page boundary. This call should never
-		 * fail. If it does, we panic the system as I$ may contain
-		 * stale instructions, which can result in silent data
-		 * corruption.
-		 */
-
-		if (hv_mem_iflush(pa, flushlen, &flushed) != H_EOK) {
-			cmn_err(CE_PANIC, "Flushing the Icache failed");
-		}
-
-	}
-}
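
The loop above never lets one flush cross a page boundary, and it advances by however many bytes the hypervisor reports it actually flushed. The length arithmetic can be exercised on its own; a minimal user-space sketch, assuming a physically contiguous buffer and the 8K sun4v base page (the real code re-translates addr + i with va_to_pa() each pass and steps by the hypervisor's flushed count rather than flushlen):

    #include <stdio.h>
    #include <stdint.h>

    #define	PAGESIZE	8192		/* sun4v base page */
    #define	MMU_PAGEOFFSET	(PAGESIZE - 1)
    #define	MIN(a, b)	((a) < (b) ? (a) : (b))

    int
    main(void)
    {
    	uint64_t pa = 0x10001f00;	/* 0x100 short of a page boundary */
    	uint64_t size = 0x3000;
    	uint64_t i, flushlen;

    	for (i = 0; i < size; i += flushlen) {
    		uint64_t cur = pa + i;

    		/* Clamp each flush to the end of the current page. */
    		flushlen = MIN(size - i, PAGESIZE - (cur & MMU_PAGEOFFSET));
    		printf("flush pa=0x%jx len=0x%jx\n", (uintmax_t)cur,
    		    (uintmax_t)flushlen);
    	}
    	return (0);
    }

This prints flushes of 0x100, 0x2000, and 0xf00 bytes.
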
-
-/*
- * There are no Hypervisor trapstat(1m) interfaces for Rock
- * If trapstat(1m) wants to do its thing, it will have to
- * take over all TLB miss handling.
- */
-int
-cpu_trapstat_conf(int cmd)
-{
-	int status;
-
-	switch (cmd) {
-	case CPU_TSTATCONF_INIT:
-	case CPU_TSTATCONF_FINI:
-	case CPU_TSTATCONF_ENABLE:
-	case CPU_TSTATCONF_DISABLE:
-		status = ENOTSUP;
-		break;
-	default:
-		status = EINVAL;
-		break;
-	}
-	return (status);
-}
-
-/*ARGSUSED*/
-void
-cpu_trapstat_data(void *buf, uint_t tstat_pgszs)
-{
-}
-
-#define	MAX_PAGE_COLORS		(1 << MAX_PAGE_COLORS_SHIFT)
-#define	MAX_PAGE_COLORS_SHIFT	(5)
-
-/*ARGSUSED*/
-uint_t
-page_pfn_2_color_cpu(pfn_t pfn, uchar_t szc, void *cookie)
-{
-	uint_t	color;
-
-	pfn = PFN_BASE(pfn, szc);
-	color = pfn ^ (pfn >> 20);
-	color = color ^ (color >> 10);
-	return ((color ^ (color >> 5)) & 0x1f);
-}
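
The color computation folds the PFN down to five bits by XORing successive 5-bit fields together, so pages that collide in the hash cache share a color. A standalone sketch of the same folding (illustration only, and ignoring the PFN_BASE() normalization for large pages):

    #include <assert.h>
    #include <stdint.h>

    /* Fold a page frame number into a 5-bit hash-cache color, as above. */
    static unsigned int
    pfn_to_color(uint64_t pfn)
    {
    	uint64_t color;

    	color = pfn ^ (pfn >> 20);
    	color = color ^ (color >> 10);
    	return ((unsigned int)(color ^ (color >> 5)) & 0x1f);
    }

    int
    main(void)
    {
    	/* PFNs that differ only in bit 5 land in different colors... */
    	assert(pfn_to_color(0x00) != pfn_to_color(0x20));
    	/* ...while pfn bit 5 and pfn bit 25 fold to the same color bit. */
    	assert(pfn_to_color(0x20) == pfn_to_color((uint64_t)1 << 25));
    	return (0);
    }
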
-
-/*
- * This macro rotates the value "x" n steps to the right within a
- * field of "n + m" bits.
- * ASSERT(x < (1 << (n + m)));
- */
-#define	ROTATE_BITS(x, n, m) (((x) >> (n)) | (((x) & ((1 << (n)) - 1)) << m))
-
-
-uchar_t clr2sqnclr_table[MMU_PAGE_SIZES][MAX_PAGE_COLORS];
-
-/*
- * on Rock, the hash cache index is calculated as follows:
- * pa[47:43]^pa[42:38]^pa[37:33]^pa[32:28]^
- * 	pa[27:23]^pa[22:18]^pa[17:13].pa[12:6]
- * That is, every 5 bits is folded and XORd together. Page sizes
- * differ by 3 bits, which is a factor of 8. This function computes
- * the next sequential color by rotating by 3 steps within a field of 5 bits
- * for every page size.
- */
-void
-clr2sqnclr_table_init()
-{
-	uchar_t szc;
-	uint_t  color;
-	uint_t  rot = 0;
-
-	for (szc = 0; szc < MMU_PAGE_SIZES; szc++) {
-		rot = (szc * 3) % MAX_PAGE_COLORS_SHIFT;
-		for (color = 0; color < MAX_PAGE_COLORS; color++) {
-			clr2sqnclr_table[szc][color] =
-			    ROTATE_BITS(color, rot,
-			    (MAX_PAGE_COLORS_SHIFT - rot));
-		}
-	}
-}
-
-uint_t
-clr2sqnclr(uchar_t szc, uint_t color)
-{
-	ASSERT(szc < MMU_PAGE_SIZES);
-	ASSERT(color < MAX_PAGE_COLORS);
-
-	return (clr2sqnclr_table[szc][color]);
-}
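
ROTATE_BITS is a plain right-rotate inside the 5-bit color field, and the table just precomputes it per page size with a rotation step of (szc * 3) mod 5. In isolation, assuming six page sizes as on sun4v:

    #include <stdio.h>

    #define	MAX_PAGE_COLORS_SHIFT	5
    #define	ROTATE_BITS(x, n, m) (((x) >> (n)) | (((x) & ((1 << (n)) - 1)) << m))

    int
    main(void)
    {
    	unsigned int szc, rot;

    	for (szc = 0; szc < 6; szc++) {
    		rot = (szc * 3) % MAX_PAGE_COLORS_SHIFT;
    		printf("szc=%u rot=%u: color 0x01 -> 0x%02x\n", szc, rot,
    		    ROTATE_BITS(0x01, rot, MAX_PAGE_COLORS_SHIFT - rot));
    	}
    	return (0);
    }

Color 0x01 maps to 0x01, 0x04, 0x10, 0x02, 0x08, 0x01 for szc 0 through 5; since 15 is a multiple of 5, the rotation wraps and szc 0 and szc 5 get the same mapping.
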
-
-#if MMU_PAGE_SIZES > 8
-#error MMU_PAGE_SIZES can be at most 8
-#endif
-
-uint_t
-page_get_nsz_color_mask_cpu(uchar_t szc, uint_t mask)
-{
-	static uint_t rock_color_masks[7] = {0x18, 6, 0x11, 0xc, 3, 0x18, 6};
-
-	ASSERT(szc < MMU_PAGE_SIZES - 1);
-	return (mask & rock_color_masks[szc]);
-}
-
-/*ARGSUSED*/
-uint_t
-page_get_nsz_color_cpu(uchar_t szc, uint_t color)
-{
-	return (color);
-}
-
-uint_t
-page_get_color_shift_cpu(uchar_t szc, uchar_t nszc)
-{
-	ASSERT(nszc >= szc);
-	return (0);
-}
-
-/*ARGSUSED*/
-pfn_t
-page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color,
-    uint_t ceq_mask, uint_t color_mask, void *cookie)
-{
-	uint_t	sqn_ceq_mask = clr2sqnclr(szc, ceq_mask);
-	uint_t	sqn_color = clr2sqnclr(szc, color);
-	uint_t	pfn_shift = PNUM_SHIFT(szc);
-	pfn_t	cpfn, npfn, base_pfn = pfn & (~(pfn_t)color_mask << pfn_shift);
-	uint_t  base_sqn_color, nsqn_color, wrap = 0;
-
-	ASSERT((color & ~ceq_mask) == 0);
-
-	base_sqn_color = clr2sqnclr(szc,
-	    page_pfn_2_color_cpu(base_pfn, szc, NULL)) ^ sqn_color;
-	nsqn_color = base_sqn_color;
-
-	cpfn = (pfn_t)-1L;
-	do {
-		npfn = base_pfn | (nsqn_color << pfn_shift);
-
-		ASSERT(((page_pfn_2_color_cpu(npfn, szc, NULL) ^ color) &
-		    ceq_mask) == 0);
-
-		if (npfn > pfn && npfn < cpfn)
-			cpfn = npfn;
-
-		nsqn_color = INC_MASKED(nsqn_color, sqn_ceq_mask, color_mask);
-		if (nsqn_color != base_sqn_color)
-			continue;
-
-		if (cpfn != (pfn_t)-1L)
-			break;
-
-		base_pfn += ((pfn_t)color_mask + 1) << pfn_shift;
-
-		base_sqn_color = clr2sqnclr(szc,
-		    page_pfn_2_color_cpu(base_pfn, szc, NULL)) ^ sqn_color;
-		nsqn_color = base_sqn_color;
-		wrap++;
-
-	} while (nsqn_color != base_sqn_color || wrap < 2);
-
-	ASSERT(cpfn != (pfn_t)-1L);
-
-	return (cpfn);
-}
-
-void
-page_coloring_init_cpu()
-{
-	int i;
-	uint_t colors = 1 << MAX_PAGE_COLORS_SHIFT;
-
-	for (i = 0; i < mmu_page_sizes; i++) {
-		hw_page_array[i].hp_colors = colors;
-	}
-
-	/*
-	 * initialise conversion table between page colors and
-	 * sequential colors
-	 */
-	clr2sqnclr_table_init();
-
-}
-
-/*
- * group colorequiv colors on Rock by low order bits of the color first
- */
-void
-page_set_colorequiv_arr_cpu(void)
-{
-	static uint_t nequiv_shades_log2[MMU_PAGE_SIZES] = {0, 3, 0, 0, 0, 0};
-
-	if (colorequiv > 1) {
-		int i;
-		uint_t sv_a = lowbit(colorequiv) - 1;
-
-		if (sv_a > 15)
-			sv_a = 15;
-
-		for (i = 0; i < MMU_PAGE_SIZES; i++) {
-			uint_t colors;
-			uint_t a = sv_a;
-
-			if ((colors = hw_page_array[i].hp_colors) <= 1)
-				continue;
-			while ((colors >> a) == 0)
-				a--;
-			if (a > (colorequivszc[i] & 0xf) +
-			    (colorequivszc[i] >> 4)) {
-				if (a <= nequiv_shades_log2[i]) {
-					colorequivszc[i] = (uchar_t)a;
-				} else {
-					colorequivszc[i] =
-					    ((a - nequiv_shades_log2[i]) << 4) |
-					    nequiv_shades_log2[i];
-				}
-			}
-		}
-	}
-}
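
The colorequivszc[] byte above packs two 4-bit fields: the code keeps nequiv_shades_log2[i] in the low nibble and the remainder (a - nequiv_shades_log2[i]) in the high nibble, reading the total back as low + high. A sketch of just that packing (the helper name is hypothetical; the split shown is only what the code above computes):

    #include <stdio.h>

    /* Pack a color-equivalence exponent "a" around a per-size shade count. */
    static unsigned char
    pack_equiv(unsigned int a, unsigned int shades)
    {
    	if (a <= shades)
    		return ((unsigned char)a);	/* all in the low nibble */
    	return ((unsigned char)(((a - shades) << 4) | shades));
    }

    int
    main(void)
    {
    	/* szc 1 (64K) keeps 3 shade bits on Rock; a=4 packs as 0x13. */
    	printf("a=4 shades=3 -> 0x%02x\n", pack_equiv(4, 3));
    	return (0);
    }
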
-
-/*
- * Calculate the page sizes needed to program Rock TLB page size register.
- * The invctx parameter is a flag which indicates that it will be necessary to
- * synchronize by invalidating contexts if the sfmmu pagesize register is
- * updated.
- */
-void
-mmu_set_pgsz_order(sfmmu_t *sfmmup, int invctx)
-{
-	uchar_t private_pgsz_mask;
-	uchar_t shared_pgsz_mask;
-	uint16_t pgsz_order_hv[MAX_PGSZ_SEARCH_ORDER];
-	uint64_t pgsz_order = 0;
-	uchar_t pgsz_map = 0;
-	int private_pgsz_num = 0;
-	int shared_pgsz_num = 0;
-	int tot_pgsz_num;
-	sf_scd_t *scdp;
-	int ret;
-	int i;
-
-	/*
-	 * The hatlock must be held in all cases except when the sfmmu is
-	 * being initialized by hat_alloc() or we are calling hat_dup(); in
-	 * these cases no other thread will be using the sfmmu yet.
-	 */
-
-	ASSERT(!invctx || sfmmu_hat_lock_held(sfmmup));
-
-	if (pgsz_search_on == 0)
-		return;
-
-	/* Always enable 8K private mappings */
-	private_pgsz_mask = 1 << TTE8K;
-
-	/* Enable 64K private mappings unless specifically disabled */
-	if (!(disable_large_pages & (1 << TTE64K))) {
-		private_pgsz_mask |= 1 << TTE64K;
-	}
-
-	/*
-	 * First check for ISM segments not in an SCD. The algorithm for
-	 * creating an SCD is to create one when a (D)ISM segment is attached
-	 * unless the process's shared segments are a subset of an SCD which
-	 * already exists.
-	 *
-	 * This situation also arises when we attach to more than the maximum
-	 * number of (D)ISM segments defined in the region bit map
-	 * (currently 64).
-	 *
-	 * We have set mmu_disable_ism_large_pages to force ISM segments to use
-	 * only 4M and 256M pages.
-	 */
-	if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMNOTINSCD)) {
-		private_pgsz_mask |= 1 << TTE4M;
-		if (SFMMU_FLAGS_ISSET(sfmmup, HAT_256M_ISM)) {
-			private_pgsz_mask |= 1 << TTE256M;
-		}
-	}
-
-	/* Now check for regions not included in the SCD. */
-	if ((scdp = sfmmup->sfmmu_scdp) != NULL) {
-		SF_RGNMAP_EQUAL(&scdp->scd_hmeregion_map,
-		    &sfmmup->sfmmu_hmeregion_map,
-		    SFMMU_HMERGNMAP_WORDS, ret);
-		if (!ret) {
-			private_pgsz_mask |= sfmmup->sfmmu_rtteflags;
-		}
-	} else {
-		private_pgsz_mask |= sfmmup->sfmmu_rtteflags;
-	}
-
-	private_pgsz_mask |= sfmmup->sfmmu_tteflags;
-
-	/*
-	 * If the process is part of an SCD then enable 4M and 256M shared
-	 * page sizes - unless these are specifically disabled. If the 4M
-	 * shared page size is specifically disabled and the process has (D)ISM
-	 * segments attached or 4M regions then enable the private 4M page size.
-	 * If the 256M shared page size is disabled and the process has a 256M
-	 * page size region then enable the 256M private page size. The trap
-	 * handler looks at the shared page sizes enabled and if a shared
-	 * mapping does not correspond to one of these sizes then it is treated
-	 * as a private mapping.
-	 *
-	 * The SCD includes the process's main text segment and (D)ISM segments
-	 * but we only enable the 4M shared page size so an 8K main text
-	 * segment will be treated as private due to the trap handler support.
-	 *
-	 * Note that for simplicity the ordering of the shared page sizes is
-	 * hard coded.
-	 */
-	shared_pgsz_mask = 0;
-	if (sfmmup->sfmmu_scdp != NULL) {
-		if (!(disable_shctx_large_pages  & (1 << TTE4M))) {
-			shared_pgsz_mask |= 1 << TTE4M;
-		} else if (sfmmup->sfmmu_iblk != NULL ||
-		    (sfmmup->sfmmu_rtteflags &
-		    (1 << TTE4M))) {
-			private_pgsz_mask |= 1 << TTE4M;
-		}
-
-		if (SFMMU_FLAGS_ISSET(sfmmup, HAT_256M_ISM) ||
-		    (sfmmup->sfmmu_rtteflags & (1 << TTE256M))) {
-			if (!(disable_shctx_large_pages  & (1 << TTE256M))) {
-				shared_pgsz_mask |= 1 << TTE256M;
-			} else {
-				private_pgsz_mask |= 1 << TTE256M;
-			}
-		}
-	}
-
-	set_pgsz_order(private_pgsz_mask, shared_pgsz_mask, &pgsz_order,
-	    &private_pgsz_num, &shared_pgsz_num, sfmmup);
-
-	encode_pgsz_order(pgsz_order, private_pgsz_num, shared_pgsz_num,
-	    pgsz_order_hv, &pgsz_map);
-
-	tot_pgsz_num = private_pgsz_num + shared_pgsz_num;
-	ASSERT(tot_pgsz_num <= MAX_PGSZ_SEARCH_ORDER);
-
-	for (i = 0; i < tot_pgsz_num; i++) {
-		if (pgsz_order_hv[i] != sfmmup->sfmmu_pgsz_order_hv[i])
-			break;
-	}
-
-	/*
-	 * If either we've reached the maximum number of page sizes or the
-	 * next element is 0, indicating the end of the list, then the
-	 * entries and their number in both arrays are the same and we return.
-	 */
-	if ((i == tot_pgsz_num) && (i == MAX_PGSZ_SEARCH_ORDER ||
-	    sfmmup->sfmmu_pgsz_order_hv[i] == 0)) {
-		ASSERT(pgsz_map == sfmmup->sfmmu_pgsz_map);
-		return;
-	}
-
-	/* Otherwise update the sw page size register setting */
-	if (invctx) {
-		sfmmu_invalidate_ctx(sfmmup);
-	}
-
-	for (i = 0; i < tot_pgsz_num; i++) {
-		sfmmup->sfmmu_pgsz_order_hv[i] = pgsz_order_hv[i];
-	}
-
-	/* Disable next entry in search list to mark the end */
-	if (i < MAX_PGSZ_SEARCH_ORDER) {
-		sfmmup->sfmmu_pgsz_order_hv[i] = 0;
-	}
-	sfmmup->sfmmu_pgsz_map = pgsz_map;
-}
-
-/*
- * Encode the Rock TLB page size register.
- *
- * Input:
- *        pgsz_order, ordered list of page sizes, private and shared; the order
- *        between these depends on the pgsz_order_shared_first config variable.
- *        private_pgsz_num, number of private page sizes.
- *        shared_pgsz_num, number of shared page sizes.
- * Output:
- *        pgsz_order_hv contains the encoded pagesize search order for the hv
- *	  pgsz_map field contains the page size bit map used by the trap
- *        handler to prevent unauthorized shared page sizes being used.
- *        handler to prevent unauthorized shared page sizes from being used.
-
-static void
-encode_pgsz_order(uint64_t pgsz_order, int private_pgsz_num,
-    int shared_pgsz_num, uint16_t *pgsz_order_hv, uchar_t *pgsz_map)
-{
-	int i;
-	int tot_pgsz_num;
-	uint16_t pgsz_entry;
-	uint16_t first_entry_mask, second_entry_mask;
-	int	first_pgsz_num;
-
-	ASSERT(private_pgsz_num < MMU_PAGE_SIZES);
-	ASSERT(shared_pgsz_num < MMU_PAGE_SIZES);
-	ASSERT(private_pgsz_num > 0);
-
-	if (pgsz_order_shared_first) {
-		first_entry_mask = TLB_PGSZ_CONTEXT1_ENABLE;
-		second_entry_mask = TLB_PGSZ_ENABLE;
-		first_pgsz_num = shared_pgsz_num;
-	} else {
-		first_entry_mask = TLB_PGSZ_ENABLE;
-		second_entry_mask = TLB_PGSZ_CONTEXT1_ENABLE;
-		first_pgsz_num = private_pgsz_num;
-	}
-
-	tot_pgsz_num = private_pgsz_num + shared_pgsz_num;
-	for (i = 0; i < tot_pgsz_num; i++) {
-		pgsz_entry = pgsz_order & TTE_SZ_BITS;
-		if (i < first_pgsz_num) {
-			if (pgsz_order_shared_first) {
-				*pgsz_map |= (1 << pgsz_entry);
-			}
-			pgsz_entry |= first_entry_mask;
-		} else {
-			if (!pgsz_order_shared_first) {
-				*pgsz_map |= (1 << pgsz_entry);
-			}
-			pgsz_entry |= second_entry_mask;
-		}
-		pgsz_order >>= 4;
-		pgsz_order_hv[i] = pgsz_entry;
-	}
-}
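
pgsz_order packs one page-size code per 4-bit nibble, least significant nibble searched first; the loop above peels off a nibble per iteration and tags it with the private (TLB_PGSZ_ENABLE) or shared (TLB_PGSZ_CONTEXT1_ENABLE) enable mask. A sketch of the unpacking, using hypothetical stand-in values for the enable bits (the real values live in the Rock hypervisor headers):

    #include <stdio.h>
    #include <stdint.h>

    #define	TTE_SZ_BITS			0xf	/* low nibble: page size code */
    #define	TLB_PGSZ_ENABLE			0x100	/* stand-in value */
    #define	TLB_PGSZ_CONTEXT1_ENABLE	0x300	/* stand-in value */

    int
    main(void)
    {
    	/* Search 8K (code 0) first, then 64K (1), then 4M (3). */
    	uint64_t pgsz_order = 0x0 | (0x1 << 4) | (0x3 << 8);
    	int i;

    	for (i = 0; i < 3; i++) {
    		uint16_t entry = (pgsz_order & TTE_SZ_BITS) | TLB_PGSZ_ENABLE;

    		printf("slot %d: hv entry 0x%03x\n", i, entry);
    		pgsz_order >>= 4;
    	}
    	return (0);
    }
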
-
-/*
- * The function returns the mmu-specific values for the
- * hat's disable_large_pages, disable_ism_large_pages, and
- * disable_auto_data_large_pages and
- * disable_text_data_large_pages variables.
- */
-uint_t
-mmu_large_pages_disabled(uint_t flag)
-{
-	uint_t pages_disable = 0;
-
-	if (flag == HAT_LOAD) {
-		pages_disable =  mmu_disable_large_pages;
-	} else if (flag == HAT_LOAD_SHARE) {
-		pages_disable = mmu_disable_ism_large_pages;
-	} else if (flag == HAT_AUTO_DATA) {
-		pages_disable = mmu_disable_auto_data_large_pages;
-	} else if (flag == HAT_AUTO_TEXT) {
-		pages_disable = mmu_disable_auto_text_large_pages;
-	}
-	return (pages_disable);
-}
-
-/*
- * Uses private and shared page size bitmaps to produce an ordered list
- * of page sizes and counts to be passed to encode_pgsz_order().
- *
- * Input:
- *        private_pgsz_mask, bit map of private page sizes.
- *        shared_pgsz_mask,  bit map of shared page sizes.
- *	  sfmmup, pointer to hat structure.
- *
- * Output:
- *        pgsz_order, ordered list of page sizes.
- *        private_pgsz_num, number of private page sizes in pgsz_order.
- *        shared_pgsz_num, number of shared page sizes in pgsz_order.
- */
-static void
-set_pgsz_order(uchar_t private_pgsz_mask, uchar_t shared_pgsz_mask,
-    uint64_t *pgsz_order, int *private_pgsz_num, int *shared_pgsz_num,
-    sfmmu_t *sfmmup)
-{
-	int64_t sortcnt[MMU_PAGE_SIZES];
-	int8_t tmp_pgsz[MMU_PAGE_SIZES];
-	ulong_t tmp;
-	uint8_t i, j, max;
-
-	*private_pgsz_num = 0;
-	*shared_pgsz_num = 0;
-	*pgsz_order = 0;
-
-	/* Sort pages by area mapped */
-	for (i = 0; i < mmu_page_sizes; i++) {
-		tmp = sfmmup->sfmmu_ttecnt[i] + sfmmup->sfmmu_ismttecnt[i];
-		sortcnt[i] = tmp << TTE_PAGE_SHIFT(i);
-	}
-
-	for (j = 0; j < mmu_page_sizes; j++) {
-		for (i = mmu_page_sizes - 1, max = 0; i > 0; i--) {
-			if (sortcnt[i] > sortcnt[max])
-				max = i;
-		}
-		tmp_pgsz[j] = max;
-		sortcnt[max] = -1;
-	}
-
-	/* Add shared page sizes to page order if these come first */
-	if (pgsz_order_shared_first) {
-		if (shared_pgsz_mask & (1 << TTE256M)) {
-			*pgsz_order =  TTE256M;
-			(*shared_pgsz_num)++;
-		}
-		if (shared_pgsz_mask & (1 << TTE4M)) {
-			*pgsz_order |= (TTE4M << (*shared_pgsz_num * 4));
-			(*shared_pgsz_num)++;
-		}
-	}
-
-
-	/* Add private page sizes to page order */
-	for (i = 0; i < mmu_page_sizes; i++) {
-		if (private_pgsz_mask & (1 << tmp_pgsz[i])) {
-			*pgsz_order |= (tmp_pgsz[i] <<
-			    ((*private_pgsz_num + *shared_pgsz_num) * 4));
-			(*private_pgsz_num)++;
-		}
-	}
-
-	/* Add shared page sizes to page order if these come last */
-	if (!pgsz_order_shared_first) {
-		if (shared_pgsz_mask & (1 << TTE256M)) {
-			*pgsz_order |=  (TTE256M <<
-			    ((*private_pgsz_num + *shared_pgsz_num) * 4));
-			(*shared_pgsz_num)++;
-		}
-		if (shared_pgsz_mask & (1 << TTE4M)) {
-			*pgsz_order |= (TTE4M <<
-			    ((*private_pgsz_num + *shared_pgsz_num) * 4));
-			(*shared_pgsz_num)++;
-		}
-	}
-
-	ASSERT(*pgsz_order);
-	ASSERT(*private_pgsz_num);
-	ASSERT((*private_pgsz_num + *shared_pgsz_num)
-	    <= MAX_PGSZ_SEARCH_ORDER);
-}
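
The "sort pages by area mapped" pass is a selection sort keyed on ttecnt scaled up by page size, so a single large mapping outweighs many small ones. Standalone, with illustrative counts and the sun4v page shifts:

    #include <stdio.h>
    #include <stdint.h>

    #define	NSIZES	6

    /* log2 of each page size: 8K, 64K, 512K, 4M, 32M, 256M. */
    static const int pageshift[NSIZES] = { 13, 16, 19, 22, 25, 28 };

    int
    main(void)
    {
    	uint64_t ttecnt[NSIZES] = { 1000, 50, 0, 3, 0, 1 };
    	int64_t sortcnt[NSIZES];
    	int8_t tmp_pgsz[NSIZES];
    	int i, j, max;

    	for (i = 0; i < NSIZES; i++)
    		sortcnt[i] = (int64_t)(ttecnt[i] << pageshift[i]);

    	/* Selection sort, largest mapped area first. */
    	for (j = 0; j < NSIZES; j++) {
    		for (i = NSIZES - 1, max = 0; i > 0; i--) {
    			if (sortcnt[i] > sortcnt[max])
    				max = i;
    		}
    		tmp_pgsz[j] = (int8_t)max;
    		sortcnt[max] = -1;
    	}

    	for (j = 0; j < NSIZES; j++)
    		printf("rank %d: szc %d\n", j, tmp_pgsz[j]);
    	return (0);
    }

With these counts the ranking comes out 5, 3, 0, 1 and then the empty sizes: one 256M page (256M mapped) beats a thousand 8K pages (about 8M mapped).
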
-
-/*
- * This routine is called without holding the hat lock to determine
- * whether the process's optimal page size order has changed significantly
- * since the page size register was last set. If it has changed we get the
- * hat lock and call mmu_set_pgsz_order() to update the effective pagesize
- * order.
- */
-void
-mmu_check_page_sizes(sfmmu_t *sfmmup, uint64_t *ttecnt)
-{
-	int64_t sortcnt[MMU_PAGE_SIZES];
-	int8_t tmp_pgsz[MMU_PAGE_SIZES];
-	ulong_t tmp;
-	int8_t i, j, max;
-	uint_t pgsz;
-	uint16_t *pgsz_order_hv;
-	int page_order_changed;
-	hatlock_t *hatlockp;
-	int pgsz_count = 0;
-
-	ASSERT(!sfmmu_hat_lock_held(sfmmup));
-
-	if (pgsz_search_on == 0)
-		return;
-
-	/*
-	 * Check if ttecnt has changed significantly, since the last time we
-	 * were called. If the shared page sizes have changed then this is
-	 * handled by mmu_set_pgsz_order() being called directly when we join
-	 * the SCD.
-	 */
-	for (i = 0; i < mmu_page_sizes; i++) {
-		if (ttecnt[i] > (sfmmup->sfmmu_mmuttecnt[i] << 1) ||
-		    ttecnt[i] < (sfmmup->sfmmu_mmuttecnt[i] >> 1))
-			break;
-	}
-
-	if (i == mmu_page_sizes) {
-		return;
-	}
-
-	/* Sort pages by area mapped */
-	for (i = 0; i < mmu_page_sizes; i++) {
-		tmp = ttecnt[i];
-		sortcnt[i] = tmp << TTE_PAGE_SHIFT(i);
-	}
-
-	for (j = 0; j < mmu_page_sizes; j++) {
-		for (i = mmu_page_sizes - 1, max = 0; i > 0; i--) {
-			if (sortcnt[i] > sortcnt[max])
-				max = i;
-		}
-		tmp_pgsz[j] = max;
-		sortcnt[max] = -1;
-	}
-
-	/*
-	 * Check if the order of the private page sizes has changed. We call
-	 * mmu_set_pgsz_order() directly if additional page sizes are used,
-	 * so we can assume that the number of entries is unchanged.
-	 */
-	pgsz_order_hv = sfmmup->sfmmu_pgsz_order_hv;
-	if (pgsz_order_shared_first) {
-		/* skip over shared pgsz entries */
-		while ((pgsz_order_hv[pgsz_count] & TLB_PGSZ_CONTEXT1_ENABLE) ==
-		    TLB_PGSZ_CONTEXT1_ENABLE) {
-			pgsz_count++;
-		}
-	}
-
-	i = 0;
-	page_order_changed = 0;
-	while ((pgsz_order_hv[pgsz_count] & TLB_PGSZ_ENABLE) &&
-	    !(pgsz_order_hv[pgsz_count] & TLB_PGSZ_CONTEXT1) &&
-	    (pgsz_count < MAX_PGSZ_SEARCH_ORDER)) {
-		pgsz = (pgsz_order_hv[pgsz_count] & TTE_SZ_BITS);
-		ASSERT(pgsz < MMU_PAGE_SIZES);
-
-		if (pgsz != tmp_pgsz[i]) {
-			page_order_changed = 1;
-			break;
-		}
-		pgsz_count++;
-		i++;
-	}
-
-	if (page_order_changed) {
-		hatlockp = sfmmu_hat_enter(sfmmup);
-		/* Save old values of ttecnt */
-		for (i = 0; i < mmu_page_sizes; i++) {
-			sfmmup->sfmmu_mmuttecnt[i] = ttecnt[i];
-		}
-		mmu_set_pgsz_order(sfmmup, 1);
-		sfmmu_hat_exit(hatlockp);
-	}
-}
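
The significance test at the top of this routine is a factor-of-two hysteresis per page size: re-rank only once a count has more than doubled or fallen below half of its last snapshot. As a predicate:

    #include <assert.h>
    #include <stdint.h>

    /* Mirror of the ttecnt check above. */
    static int
    ttecnt_changed(uint64_t cur, uint64_t prev)
    {
    	return (cur > (prev << 1) || cur < (prev >> 1));
    }

    int
    main(void)
    {
    	assert(!ttecnt_changed(150, 100));	/* within the 2x band */
    	assert(ttecnt_changed(201, 100));	/* more than doubled */
    	assert(ttecnt_changed(49, 100));	/* less than half */
    	return (0);
    }
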
-
-/*
- * If the mmu extension API is supported and pgsz_search_on is set,
- * patch out the instruction to branch over the hypervisor call in
- * sfmmu_load_mmustate().
- */
-void
-mmu_enable_pgsz_search()
-{
-	if ((hsvc_mmu_ext_available == B_TRUE) && pgsz_search_on) {
-		/* patch in hcall to set pgsz order */
-		sfmmu_patch_pgsz_reg();
-	}
-}
--- a/usr/src/uts/sun4v/cpu/rock_asm.s	Thu Aug 06 17:19:00 2009 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,486 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/asm_linkage.h>
-#include <sys/hypervisor_api.h>		/* For FAST_TRAP */
-#include <sys/rock_hypervisor_api.h>
-#include <sys/sun4asi.h>	/* ASI_BLK_P */
-#include <sys/machthread.h>	/* THREAD_REG */
-#include <sys/fsr.h>		/* FPRS_FEF, FPRS_DU */
-#include <vm/hat_sfmmu.h>	/* TSBTAG_INVALID */
-
-#if defined(lint)
-#include <sys/types.h>
-
-void
-cpu_smt_pause(void)
-{}
-
-void
-fp_zero(void)
-{}
-
-/*ARGSUSED*/
-uint64_t
-hv_rk_perf_count_init(uint64_t counter)
-{ return (0); }
-
-/*ARGSUSED*/
-uint64_t
-hv_rk_perf_count_release(uint64_t counter)
-{ return (0); }
-
-/*ARGSUSED*/
-uint64_t
-hv_rk_perf_count_set(uint64_t counter, uint64_t value)
-{ return (0); }
-
-/*ARGSUSED*/
-uint64_t
-hv_rk_perf_count_get(uint64_t counter, uint64_t *value)
-{ return (0); }
-
-/*ARGSUSED*/
-uint64_t
-hv_rk_perf_count_start(uint64_t counter, uint64_t value)
-{ return (0); }
-
-/*ARGSUSED*/
-uint64_t
-hv_rk_perf_count_overflow(uint64_t counter, uint64_t *ovf_cnt)
-{ return (0); }
-
-/*ARGSUSED*/
-uint64_t
-hv_rk_perf_count_stop(uint64_t counter)
-{ return (0); }
-
-/*ARGSUSED*/
-uint64_t
-hv_rk_perf_sample_init(uint64_t sampler, uint64_t ringbuf_pa)
-{ return (0); }
-
-/*ARGSUSED*/
-uint64_t
-hv_rk_perf_sample_release(uint64_t sampler)
-{ return (0); }
-
-/*ARGSUSED*/
-uint64_t
-hv_rk_perf_sample_start(uint64_t sampler, uint64_t freq,
-	 		uint64_t list_size, uint64_t valist_pa)
-{ return (0); }
-
-/*ARGSUSED*/
-uint64_t
-hv_rk_perf_sample_config(uint64_t sampler, uint64_t reg_va, uint64_t reg_value)
-{ return (0); }
-
-/*ARGSUSED*/
-uint64_t
-hv_rk_perf_sample_pending(uint64_t sampler, uint64_t *pend_cnt)
-{ return (0); }
-
-/*ARGSUSED*/
-uint64_t
-hv_rk_perf_sample_stop(uint64_t sampler)
-{ return (0); }
-
-/*ARGSUSED*/
-void
-cpu_inv_tsb(caddr_t tsb_base, uint_t tsb_bytes)
-{}
-
-void
-cpu_atomic_delay(void)
-{}
-
-void
-rock_mutex_delay(void)
-{}
-#else	/* lint */
-
-/*
- * Called from various spin loops to prevent this strand from
- * stealing too many cycles from its sibling, who is presumably
- * doing useful work.
- *
- * With a 2.1 GHz clock, 100 membar #Halt instructions plus
- * the call/return overhead will take approximately 500 nanoseconds.
- * That is a suitable time for a PAUSE, as it is roughly equal to
- * two memory accesses.
- */
-	ENTRY_NP(cpu_smt_pause)
-	mov	10, %o0
-1:	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	subcc	%o0, 1, %o0
-	bg,pt	%xcc, 1b
-	membar	#Halt
-	retl
-	membar	#Halt
-	SET_SIZE(cpu_smt_pause)
-
-/*
- * fp_zero() - clear all fp data registers and the fsr
- */
-
-.global	fp_zero_zero
-.align 8
-fp_zero_zero:
-	.xword	0
-
-	ENTRY_NP(fp_zero)
-	sethi	%hi(fp_zero_zero), %o0
-	ldx	[%o0 + %lo(fp_zero_zero)], %fsr
-	movxtod %g0, %d0
-	fzero   %d2
-	movxtod %g0, %d4
-	fzero   %d6
-	movxtod %g0, %d8
-	fzero   %d10
-	movxtod %g0, %d12
-	fzero   %d14
-	movxtod %g0, %d16
-	fzero   %d18
-	movxtod %g0, %d20
-	fzero   %d22
-	movxtod %g0, %d24
-	fzero   %d26
-	movxtod %g0, %d28
-	fzero   %d30
-	movxtod %g0, %d32
-	fzero   %d34
-	movxtod %g0, %d36
-	fzero   %d38
-	movxtod %g0, %d40
-	fzero   %d42
-	movxtod %g0, %d44
-	fzero   %d46
-	movxtod %g0, %d48
-	fzero   %d50
-	movxtod %g0, %d52
-	fzero   %d54
-	movxtod %g0, %d56
-	fzero   %d58
-	movxtod %g0, %d60
-	retl
-	fzero   %d62
-	SET_SIZE(fp_zero)
-
-	/* hcalls for performance counters */
-
-	/*
-	 * uint64_t hv_rk_perf_count_init(uint64_t counter);
-	 */
-	ENTRY(hv_rk_perf_count_init)
-	mov	HV_RK_PERF_COUNT_INIT, %o5
-	ta	FAST_TRAP
-	retl
-	  nop
-	SET_SIZE(hv_rk_perf_count_init)
-
-	/*
-	 * uint64_t hv_rk_perf_count_release(uint64_t counter);
-	 */
-	ENTRY(hv_rk_perf_count_release)
-	mov	HV_RK_PERF_COUNT_RELEASE, %o5
-	ta	FAST_TRAP
-	retl
-	  nop
-	SET_SIZE(hv_rk_perf_count_release)
-
-	/*
-	 * uint64_t hv_rk_perf_count_set(uint64_t counter, uint64_t value)
-	 */
-	ENTRY(hv_rk_perf_count_set)
-	mov	HV_RK_PERF_COUNT_SET, %o5
-	ta	FAST_TRAP
-	retl
-	  nop
-	SET_SIZE(hv_rk_perf_count_set)
-
-	/*
-	 * uint64_t hv_rk_perf_count_get(uint64_t counter, uint64_t *value)
-	 */
-	ENTRY(hv_rk_perf_count_get)
-	mov	HV_RK_PERF_COUNT_GET, %o5
-	mov	%o1, %o2	! Save the address
-	ta	FAST_TRAP
-	retl
-	  stx	%o1, [%o2]	! Value is returned in %o1 by the HV
-	SET_SIZE(hv_rk_perf_count_get)
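
The stub above parks the caller's pointer in %o2 before trapping because the hypervisor hands the counter value back in %o1; from C the routine is just a status-returning call with an out parameter. A sketch of a typical caller, with a fake stub standing in for the assembly so it links (H_EOK is the standard sun4v success status; read_counter is a hypothetical helper):

    #include <stdint.h>
    #include <stdio.h>

    #define	H_EOK	0	/* hypervisor call succeeded */

    /* Stand-in for the assembly stub, for illustration only. */
    static uint64_t
    hv_rk_perf_count_get(uint64_t counter, uint64_t *value)
    {
    	*value = counter * 42;	/* fake counter value */
    	return (H_EOK);
    }

    /* Read a counter, or return -1 if the hcall fails. */
    static int
    read_counter(uint64_t counter, uint64_t *valp)
    {
    	if (hv_rk_perf_count_get(counter, valp) != H_EOK)
    		return (-1);
    	return (0);
    }

    int
    main(void)
    {
    	uint64_t val;

    	if (read_counter(1, &val) == 0)
    		printf("counter 1 = %llu\n", (unsigned long long)val);
    	return (0);
    }
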
-
-	/*
-	 * uint64_t hv_rk_perf_count_start(uint64_t counter, uint64_t value)
-	 */
-	ENTRY(hv_rk_perf_count_start)
-	mov	HV_RK_PERF_COUNT_START, %o5
-	ta	FAST_TRAP
-	retl
-	  nop
-	SET_SIZE(hv_rk_perf_count_start)
-
-	/*
-	 * uint64_t hv_rk_perf_count_overflow(uint64_t counter,
-	 * 						uint64_t *ovf_cnt)
-	 */
-	ENTRY(hv_rk_perf_count_overflow)
-	mov	%o1, %o2
-	mov	HV_RK_PERF_COUNT_OVERFLOW, %o5
-	ta	FAST_TRAP
-	retl
-	  stx	%o1, [%o2]
-	SET_SIZE(hv_rk_perf_count_overflow)
-
-	/*
-	 * uint64_t hv_rk_perf_count_stop(uint64_t counter)
-	 */
-	ENTRY(hv_rk_perf_count_stop)
-	mov	HV_RK_PERF_COUNT_STOP, %o5
-	ta	FAST_TRAP
-	retl
-	  nop
-	SET_SIZE(hv_rk_perf_count_stop)
-
-	/*
-	 * uint64_t hv_rk_perf_sample_init(uint64_t sampler,
-						uint64_t ringbuf_pa)
-	 */
-	ENTRY(hv_rk_perf_sample_init)
-	mov	HV_RK_PERF_SAMPLE_INIT, %o5
-	ta	FAST_TRAP
-	retl
-	  nop
-	SET_SIZE(hv_rk_perf_sample_init)
-
-	/*
-	 * uint64_t hv_rk_perf_sample_release(uint64_t sampler)
-	 */
-	ENTRY(hv_rk_perf_sample_release)
-	mov	HV_RK_PERF_SAMPLE_RELEASE, %o5
-	ta	FAST_TRAP
-	retl
-	  nop
-	SET_SIZE(hv_rk_perf_sample_release)
-
-	/*
-	 * uint64_t hv_rk_perf_sample_config(uint64_t sampler, uint64_t reg_va,
-	 *					uint64_t reg_value)
-	 */
-	ENTRY(hv_rk_perf_sample_config)
-	mov	HV_RK_PERF_SAMPLE_CONFIG, %o5
-	ta	FAST_TRAP
-	retl
-	  nop
-	SET_SIZE(hv_rk_perf_sample_config)
-
-	/*
-	 * uint64_t hv_rk_perf_sample_start(uint64_t sampler, uint64_t freq,
-	 *			uint64_t list_size, uint64_t valist_pa)
-	 */
-	ENTRY(hv_rk_perf_sample_start)
-	mov	HV_RK_PERF_SAMPLE_START, %o5
-	ta	FAST_TRAP
-	retl
-	  nop
-	SET_SIZE(hv_rk_perf_sample_start)
-
-	/*
-	 * uint64_t hv_rk_perf_sample_pending(uint64_t sampler, 
-	 *					uint64_t *pend_cnt)
-	 */
-	ENTRY(hv_rk_perf_sample_pending)
-	mov	%o1, %o2
-	mov	HV_RK_PERF_SAMPLE_PENDING, %o5
-	ta	FAST_TRAP
-	retl
-	  stx	%o1, [%o2]
-	SET_SIZE(hv_rk_perf_sample_pending)
-
-	/*
-	 * uint64_t hv_rk_perf_sample_stop(uint64_t sampler)
-	 */
-	ENTRY(hv_rk_perf_sample_stop)
-	mov	HV_RK_PERF_SAMPLE_STOP, %o5
-	ta	FAST_TRAP
-	retl
-	  nop
-	SET_SIZE(hv_rk_perf_sample_stop)
-
-/*
- * Invalidate all of the entries within the TSB, by setting the inv bit
- * in the tte_tag field of each tsbe.
- *
- * We take advantage of the fact that the TSBs are page aligned and a
- * multiple of PAGESIZE in size to use the block-store ASIs.
- *
- * See TSB_LOCK_ENTRY and the miss handlers for how this works in practice
- * (in short, we set all bits in the upper word of the tag, and we give the
- * invalid bit precedence over other tag bits in both places).
- */
-
-#define	VIS_BLOCKSIZE	64
-#include "assym.h"	/* T_PREEMPT */
-
-	ENTRY(cpu_inv_tsb)
-
-	! Get space for aligned block of saved fp regs.
-	save	%sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp
-
-	! kpreempt_disable();
-	ldsb	[THREAD_REG + T_PREEMPT], %l3
-	inc	%l3
-	stb	%l3, [THREAD_REG + T_PREEMPT]
-
-	! See if fpu was in use.  If it was, we need to save off the
-	! floating point registers to the stack.
-	rd	%fprs, %l0			! %l0 = cached copy of fprs
-	mov	%g0, %l2
-
-	btst	FPRS_FEF, %l0
-	bz,pt	%icc, 4f
-	nop
-
-	! If upper half fp registers are in use, save them as they will be
-	! used below.
-	btst	FPRS_DU, %l0
-	bz,pt	%icc, 4f
-	nop
-
-	! save in-use fpregs on stack
-
-	add	%fp, STACK_BIAS - 65, %l1	! get stack frame for fp regs
-	and	%l1, -VIS_BLOCKSIZE, %l1	! block align frame
-	stda	%d32, [%l1]ASI_BLK_P		! %l1 = addr of saved fp regs
-
-	! Set a flag saying fp regs are saved.
-	mov	1, %l2
-
-	! enable fp
-
-4:	membar	#StoreStore|#StoreLoad|#LoadStore
-	wr	%g0, FPRS_FEF|FPRS_DU, %fprs
-	wr	%g0, ASI_BLK_P, %asi
-
-	! load up FP registers with invalid TSB tag.
-	set	TSBTAG_INVALID, %l3
-	movxtod	%l3, %d32
-	movxtod	%l3, %d36
-	movxtod	%l3, %d40	! Invalidate context
-	movxtod	%l3, %d44
-	movxtod	%g0, %d34
-	movxtod	%g0, %d38
-	movxtod	%g0, %d42	! Zero in TTE
-	movxtod	%g0, %d46
-
-	ba,pt	%xcc, .cpu_inv_doblock
-	mov	(4*VIS_BLOCKSIZE), %i4	! we do 4 stda's each loop below
-
-.cpu_inv_blkstart:
-	stda	%d32, [%i0+128]%asi
-	stda	%d32, [%i0+64]%asi
-	stda	%d32, [%i0]%asi
-
-	add	%i0, %i4, %i0
-	sub	%i1, %i4, %i1
-
-.cpu_inv_doblock:
-	cmp	%i1, (4*VIS_BLOCKSIZE)	! check for completion
-	bgeu,a	%icc, .cpu_inv_blkstart
-	  stda	%d32, [%i0+192]%asi
-
-.cpu_inv_finish:
-	membar	#Sync
-	brz,a	%l2, .cpu_inv_finished
-	  wr	%l0, 0, %fprs		! restore fprs
-
-	! restore fpregs from stack
-	ldda    [%l1]ASI_BLK_P, %d32
-
-	membar	#Sync
-	wr	%l0, 0, %fprs		! restore fprs
-
-.cpu_inv_finished:
-	! kpreempt_enable();
-	ldsb	[THREAD_REG + T_PREEMPT], %l3
-	dec	%l3
-	stb	%l3, [THREAD_REG + T_PREEMPT]
-	ret
-	restore
-	SET_SIZE(cpu_inv_tsb)
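
Functionally the block-store loop writes {invalid tag, zero data} into every 16-byte TSB entry, four entries per 64-byte VIS block; the FP registers are only a vehicle for the wide stores. A plain C rendering of the effect (entry layout per hat_sfmmu.h; the invalid-tag value here is a hypothetical stand-in):

    #include <stdint.h>

    /* Stand-in pattern; the real TSBTAG_INVALID comes from hat_sfmmu.h. */
    #define	TSBTAG_INVALID	((uint64_t)1 << 63)

    struct tsbe {
    	uint64_t tte_tag;	/* the invalid bit lives in the tag */
    	uint64_t tte_data;
    };

    /* Slow-path equivalent; tsb_bytes is a multiple of sizeof (struct tsbe). */
    static void
    inv_tsb_slow(char *tsb_base, unsigned int tsb_bytes)
    {
    	struct tsbe *ent = (struct tsbe *)tsb_base;
    	struct tsbe *end = (struct tsbe *)(tsb_base + tsb_bytes);

    	for (; ent < end; ent++) {
    		ent->tte_tag = TSBTAG_INVALID;
    		ent->tte_data = 0;
    	}
    }

    int
    main(void)
    {
    	static char tsb[4096];

    	inv_tsb_slow(tsb, sizeof (tsb));
    	return (0);
    }
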
-
-/*
- * This is CPU specific delay routine for atomic backoff.
- * It is used in case of Rock CPU. The rd instruction uses
- * less resources than casx on these CPUs.
- */
-	.align	32
-	ENTRY(cpu_atomic_delay)
-	rd	%ccr, %g0
-	rd	%ccr, %g0
-	retl
-	rd	%ccr, %g0
-	SET_SIZE(cpu_atomic_delay)
-
-/*
- * Delay lasting ~100 nanoseconds on a 2.1 GHz clock. Membars
- * should be linear and not in a loop to avoid impact
- * on the sibling strand (BR pipeline is shared by
- * two sibling strands).
- */
-	.align	64
-	ENTRY(rock_mutex_delay)
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	membar	#Halt
-	retl
-	membar	#Halt
-	SET_SIZE(rock_mutex_delay)
-#endif /* lint */
--- a/usr/src/uts/sun4v/cpu/rock_copy.s	Thu Aug 06 17:19:00 2009 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,4941 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/param.h>
-#include <sys/errno.h>
-#include <sys/asm_linkage.h>
-#include <sys/vtrace.h>
-#include <sys/machthread.h>
-#include <sys/clock.h>
-#include <sys/asi.h>
-#include <sys/fsr.h>
-#include <sys/privregs.h>
-#include <sys/rockasi.h>
-
-#if !defined(lint)
-#include "assym.h"
-#endif	/* lint */
-
-/*
- * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
- * to "break even" using FP/VIS-accelerated memory operations.
- * The FPBLK code assumes a minimum number of bytes are available
- * to be moved on entry.  Check that code carefully before 
- * reducing VIS_COPY_THRESHOLD below 256.
- */
-/*
- * This shadows sys/machsystm.h which can't be included due to
- * the lack of _ASM guards in include files it references.
- * If you change it here, change it there too.
- */
-#define VIS_COPY_THRESHOLD 256
-
-/*
- * TEST for very short copies
- * Be aware that the maximum unroll for the short unaligned case
- * is SHORTCOPY+1
- */
-#define SHORTCOPY 3
-#define CHKSIZE  39
-
-/*
- * Indicates that we're to trampoline to the error handler.
- * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
- * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
- */
-#define	FPUSED_FLAG	1
-#define	TRAMP_FLAG	2
-#define	KCOPY_FLAG	4
-#define	FPSAVED_FLAG	8
-#define	MASK_FLAGS	0xf
-
-/*
- * LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault
- * handler was set
- */
-#define	LOFAULT_SET 2
-
-/*
- * Number of outstanding prefetches.
- * Testing with 1200 MHz Cheetah+ and Jaguar gives best results with
- * two prefetches, one with a reach of 8*BLOCK_SIZE+8 and one with a
- * reach of 5*BLOCK_SIZE.  The double prefetch gives a typical improvement
- * of 5% for large copies as compared to a single prefetch.  The reason
- * for the improvement is that with Cheetah and Jaguar, some prefetches
- * are dropped due to the prefetch queue being full.  The second prefetch
- * reduces the number of cache lines that are dropped. 
- * Do not remove the double prefetch or change either FIRST_PREFETCH
- * or SECOND_PREFETCH without extensive performance tests to prove
- * there is no loss of performance.
- * XXX: For ROCK, the prefetch depth can be up to 16, but we stick
- *      with 8 for now pending more clarity on this.
- */
-#define	FIRST_PREFETCH	8
-#define	SECOND_PREFETCH	5
-
-#define	VIS_BLOCKSIZE		64
-
-/*
- * Size of stack frame in order to accommodate a 64-byte aligned
- * floating-point register save area and 2 64-bit temp locations.
- * All copy functions use two quadrants of fp registers; to assure a
- * block-aligned two block buffer in which to save we must reserve
- * three blocks on stack.  Not all functions preserve %fprs on stack
- * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
- *
- *    _______________________________________ <-- %fp + STACK_BIAS
- *    | We may need to preserve 2 quadrants |
- *    | of fp regs, but since we do so with |
- *    | BST/BLD we need room in which to    |
- *    | align to VIS_BLOCKSIZE bytes.  So   |
- *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
- *    |-------------------------------------|
- *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
- *    |-------------------------------------|
- *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
- *    ---------------------------------------
- */
-#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
-#define SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)
-#define SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)
-#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
-#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
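
With VIS_BLOCKSIZE at 64 the frame works out to 3 * 64 + 16 = 208 bytes, and the %gsr slot is the deepest thing in it. The arithmetic can be restated as compile-time checks (C11 _Static_assert, as a sketch):

    #define	VIS_BLOCKSIZE		64
    #define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
    #define	SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)
    #define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
    #define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)

    _Static_assert(HWCOPYFRAMESIZE == 208, "frame is 3 blocks + 16 bytes");
    _Static_assert(SAVED_GSR_OFFSET == HWCOPYFRAMESIZE,
        "the %gsr slot sits at the bottom of the frame");
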
-
-#define	ICACHE_LINE_SIZE	64
-
-#define	MEDIUM_MAX	255
-#define	MED_WMAX	256 /* max copy for medium word-aligned case */
-#define	MED_MAX		256 /* max copy for medium longword-aligned case */
-
-#define	PAGE_MASK	8191
-#define	ST_CACHE_ALIGN	127
-
-#ifndef	BSTORE_SIZE
-#define	BSTORE_SIZE	256	/* min copy size for block store */
-#endif
-
-/*
- * Common macros used by the various versions of the block copy
- * routines in this file.
- */
-
-/*
- * In FP copies if we do not have preserved data to restore over
- * the fp regs we used then we must zero those regs to avoid
- * exposing portions of the data to later threads (data security).
- *
- * Copy functions use either quadrants 1 and 3 or 2 and 4.
- *
- * FZEROQ3Q4: Zero quadrants 3 and 4, ie %d32 - %d46 and %d48 - %d62
- *
- */
-#define	FZEROQ3Q4		\
-	movxtod	%g0, %d32	;\
-	movxtod	%g0, %d34	;\
-	fsrc1	%d0, %d36	;\
-	fsrc1	%d0, %d38	;\
-	fsrc1	%d0, %d40	;\
-	fsrc1	%d0, %d42	;\
-	fsrc1	%d0, %d44	;\
-	fsrc1	%d0, %d46	;\
-	fsrc1	%d0, %d48	;\
-	fsrc1	%d0, %d50	;\
-	fsrc1	%d0, %d52	;\
-	fsrc1	%d0, %d54	;\
-	fsrc1	%d0, %d56	;\
-	fsrc1	%d0, %d58	;\
-	fsrc1	%d0, %d60	;\
-	fsrc1	%d0, %d62
-
-
-/*
- * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
- * Used to save and restore in-use fp registers when we want to use FP
- * and find fp already in use and copy size still large enough to justify
- * the additional overhead of this save and restore.
- *
- * A membar #Sync is needed before save to sync fp ops initiated before
- * the call to the copy function (by whoever has fp in use); for example
- * an earlier block load to the quadrant we are about to save may still be
- * "in flight".  A membar #Sync is required at the end of the save to
- * sync our block store (the copy code is about to begin ldd's to the
- * first quadrant).  Note, however, that since Cheetah pipeline block load
- * is blocking we can omit the initial membar before saving fp state (they're
- * commented below in case of future porting to a chip that does not block
- * on block load).
- *
- * Similarly: a membar #Sync before restore allows the block stores of
- * the copy operation to complete before we fill the quadrants with their
- * original data, and a membar #Sync after restore lets the block loads
- * of the restore complete before we return to whoever has the fp regs
- * in use.  To avoid repeated membar #Sync we make it the responsibility
- * of the copy code to membar #Sync immediately after copy is complete
- * and before using the BLD_*_FROMSTACK macro.
- */
-#if !defined(lint)
-#define BST_FPQ3Q4_TOSTACK(tmp1)				\
-	/* membar #Sync	*/					;\
-	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
-	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
-	stda	%d32, [tmp1]ASI_BLK_P				;\
-	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
-	stda	%d48, [tmp1]ASI_BLK_P				;\
-	membar	#Sync
-
-#define	BLD_FPQ3Q4_FROMSTACK(tmp1)				\
-	/* membar #Sync - provided at copy completion */	;\
-	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
-	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
-	ldda	[tmp1]ASI_BLK_P, %d32				;\
-	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
-	ldda	[tmp1]ASI_BLK_P, %d48				;\
-	membar	#Sync
-#endif
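
The macros reserve three blocks but save only two; the block-minus-one back-off (SAVED_FPREGS_ADJUST = 127) is slack that lets them round the pointer down to a 64-byte boundary whatever the alignment of %fp. The pointer arithmetic in isolation (STACK_BIAS folded into fp for simplicity):

    #include <assert.h>
    #include <stdint.h>

    #define	VIS_BLOCKSIZE		64
    #define	SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)

    /* Back off 127 bytes from the biased frame pointer, then align down. */
    static uintptr_t
    align_save_area(uintptr_t fp)
    {
    	return ((fp - SAVED_FPREGS_ADJUST) & ~(uintptr_t)(VIS_BLOCKSIZE - 1));
    }

    int
    main(void)
    {
    	uintptr_t fp;

    	for (fp = 0x1000; fp < 0x1040; fp += 8) {
    		uintptr_t a = align_save_area(fp);

    		assert((a & (VIS_BLOCKSIZE - 1)) == 0);	/* block aligned */
    		assert(a + 2 * VIS_BLOCKSIZE <= fp);	/* two blocks fit */
    		assert(a >= fp - 3 * VIS_BLOCKSIZE);	/* inside the frame */
    	}
    	return (0);
    }
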
-
-/*
- * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
- * prevent preemption if there is no t_lwp to save FP state to on context
- * switch) before commencing a FP copy, and reallow it on completion or
- * in error trampoline paths when we were using FP copy.
- *
- * Both macros may call other functions, so be aware that all outputs are
- * forfeit after using these macros.  For this reason we do not pass registers
- * to use - we just use any outputs we want.
- *
- * For fpRAS we need to perform the fpRAS mechanism test on the same
- * CPU as we use for the copy operation, both so that we validate the
- * CPU we perform the copy on and so that we know which CPU failed
- * if a failure is detected.  Hence we need to be bound to "our" CPU.
- * This could be achieved through disabling preemption (and we do it that
- * way for threads with no t_lwp) but for larger copies this may hold
- * higher priority threads off of cpu for too long (eg, realtime).  So we
- * make use of the lightweight t_nomigrate mechanism where we can (ie, when
- * we have a t_lwp).
- *
- * Pseudo code:
- *
- * FP_NOMIGRATE:
- *
- * if (curthread->t_lwp) {
- *	thread_nomigrate();
- * } else {
- *	kpreempt_disable();
- * }
- *
- * FP_ALLOWMIGRATE:
- *
- * if (curthread->t_lwp) {
- *	thread_allowmigrate();
- * } else {
- *	kpreempt_enable();
- * }
- */
-
-#define	FP_NOMIGRATE(label1, label2)				\
-	ldn	[THREAD_REG + T_LWP], %o0			;\
-	brz,a,pn %o0, label1/**/f				;\
-	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
-	call	thread_nomigrate				;\
-	  nop							;\
-	ba	label2/**/f					;\
-	  nop							;\
-label1:								;\
-	inc	%o1						;\
-	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
-label2:
-
-#define	FP_ALLOWMIGRATE(label1, label2)			\
-	ldn	[THREAD_REG + T_LWP], %o0			;\
-	brz,a,pn %o0, label1/**/f				;\
-	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
-	call thread_allowmigrate				;\
-	  nop							;\
-	ba	label2/**/f					;\
-	  nop							;\
-label1:								;\
-	dec	%o1						;\
-	brnz,pn	%o1, label2/**/f				;\
-	  stb	%o1, [THREAD_REG + T_PREEMPT]			;\
-	ldn	[THREAD_REG + T_CPU], %o0			;\
-	ldub	[%o0 + CPU_KPRUNRUN], %o0			;\
-	brz,pt	%o0, label2/**/f				;\
-	  nop							;\
-	call	kpreempt					;\
-	  rdpr	%pil, %o0					;\
-label2:
-
-/*
- * Copy a block of storage, returning an error code if `from' or
- * `to' takes a kernel pagefault which cannot be resolved.
- * Returns errno value on pagefault error, 0 if all ok
- */
-
-#if defined(lint)
-
-/* ARGSUSED */
-int
-kcopy(const void *from, void *to, size_t count)
-{ return(0); }
-
-#else	/* lint */
-
-	.seg	".text"
-	.align	4
-
-	ENTRY(kcopy)
-
-	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
-	sethi	%hi(.copyerr_no_fp_used), %o4
-	or	%o4, %lo(.copyerr_no_fp_used), %o4
-	stn	%o4, [THREAD_REG + T_LOFAULT]	! set t_lofault
-	or	%o5, KCOPY_FLAG, %o5
-	membar	#Sync				! sync error barrier
-	ba,pt	%ncc, .forcpy			! common code
-	 nop
-
-
-/*
- * We got here because of a fault in .copyerr_fp_used.  We can't safely
- * restore fp state, so we panic.
- */
-fp_panic_msg:
-	.asciz	"Unable to restore fp state after copy operation"
-
-	.align	4
-.copyerr2:
-	set	fp_panic_msg, %o0
-	call	panic
-	  nop
-
-/*
- * We got here because of a fault during a small kcopy or bcopy.
- * No floating point registers were used in this copy.
- * Errno value is in %g1.
- */
-.copyerr_no_fp_used:
-	btst	TRAMP_FLAG, %o5
-	membar	#Sync
-	andn	%o5, TRAMP_FLAG, %o5
-	bnz,pn	%ncc, 3f
-	  stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
-	retl
-	  mov	%g1, %o0
-3:
-	jmp	%o5				! goto real handler
-	  mov	%g0, %o0			! clear the return value
-
-/*
- * We got here because of a fault during a small kcopy or bcopy.
- * floating point registers were used in this copy.
- * Errno value is in %g1.
- */
-.copyerr_fp_used:
-	set	.copyerr2, %l0
-	membar	#Sync				! sync error barrier
-	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
-	btst	FPUSED_FLAG, %l6
-	bz	%ncc, 1f
-	  and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0
-
-	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
-	wr	%o2, 0, %gsr
-
-	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
-	! No need to restore regs if they were not saved
-	btst	FPSAVED_FLAG, %l6
-	bz	%ncc, 4f
-	  nop
-
-	BLD_FPQ3Q4_FROMSTACK(%o2)
-
-	ba,pt	%ncc, 1f
-	  wr	%o3, 0, %fprs		! restore fprs
-
-4:
-	FZEROQ3Q4
-	wr	%o3, 0, %fprs		! restore fprs
-
-	!
-	! Need to cater for the different expectations of kcopy
-	! and bcopy. kcopy will *always* set a t_lofault handler
-	! If it fires, we're expected to just return the error code
-	! and *not* to invoke any existing error handler. As far as
-	! bcopy is concerned, we only set t_lofault if there was an
-	! existing lofault handler. In that case we're expected to
-	! invoke the previously existing handler after resetting the
-	! t_lofault value.
-	!
-1:
-	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
-	membar	#Sync				! sync error barrier
-	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
-	FP_ALLOWMIGRATE(5, 6)
-
-	btst	TRAMP_FLAG, %l0
-	bnz,pn	%ncc, 3f
-	  nop
-	ret
-	  restore	%g1, 0, %o0
-
-3:
-	!
-	! We're here via bcopy. There *must* have been an error handler
-	! in place otherwise we would have died a nasty death already.
-	!
-	jmp	%l6				! goto real handler
-	  restore	%g0, 0, %o0		! dispose of copy window
-
-	SET_SIZE(kcopy)
-#endif	/* lint */
-
-#define	ALIGN8(X)	(((X) + 7) & ~7)
-#define	ICACHE_LINE_SIZE	64
-#define	PF_FAR		2048
-#define	PF_NEAR		1024
-#define	SMALL_MAX	39
-/*
- * Copy a block of storage - must not overlap (from + len <= to).
- * Registers: l6 - saved t_lofault
- * (for short copies, o5 - saved t_lofault)
- *
- * Copy a page of memory.
- * Assumes double word alignment and a count >= 256.
- */
-#if defined(lint)
-
-/* ARGSUSED */
-void
-bcopy(const void *from, void *to, size_t count)
-{}
-#else	/* lint */
-
-	.align ICACHE_LINE_SIZE
-	ENTRY(bcopy)
-	ENTRY(__align_cpy_1)
-	ldn	[THREAD_REG + T_LOFAULT], %o5	! save t_lofault
-	tst	%o5
-	bz,pt	%icc, .forcpy
-	  nop
-	sethi	%hi(.copyerr_no_fp_used), %o4
-	or	%o4, %lo(.copyerr_no_fp_used), %o4
-	membar	#Sync				! sync error barrier
-	stn	%o4, [THREAD_REG + T_LOFAULT]	! install new vector
-	or	%o5, TRAMP_FLAG, %o5		! error should trampoline
-.forcpy:
-	cmp	%o2, SMALL_MAX		! check for not small case
-	bgu,pn	%ncc, .medium_bcopy		! go to larger cases
-	cmp	%o2, SHORTCOPY		! check for really short case
-	ble,pt	%ncc, .smallleft_bcopy	!
-	or	%o1, %o0, %o3		! prepare alignment check
-	andcc	%o3, 0x3, %g0		! test for alignment
-	bz,pt	%ncc, .smallword_bcopy	! branch to word aligned case
-	sub	%o2, 3, %o2		! adjust count to allow cc zero test
-.smallnotalign4_bcopy:
-	ldub	[%o0], %o3		! read byte
-	subcc	%o2, 4, %o2		! reduce count by 4
-	stb	%o3, [%o1]		! write byte
-	ldub	[%o0+1], %o3		! repeat for a total of 4 bytes
-	add	%o0, 4, %o0		! advance SRC by 4
-	stb	%o3, [%o1+1]
-	ldub	[%o0-2], %o3
-	add	%o1, 4, %o1		! advance DST by 4
-	stb	%o3, [%o1-2]
-	ldub	[%o0-1], %o3
-	bgu,pt	%ncc, .smallnotalign4_bcopy	! loop til 3 or fewer bytes remain
-	stb	%o3, [%o1-1]
-	add	%o2, 3, %o2		! restore count
-.smallleft_bcopy:
-	tst	%o2
-	bz,pt	%ncc, .smallexit_bcopy
-	nop
-.smallleft3_bcopy:				! 1, 2, or 3 bytes remain
-	ldub	[%o0], %o3		! load one byte
-	deccc	%o2			! reduce count for cc test
-	bz,pt	%ncc, .smallexit_bcopy
-	stb	%o3, [%o1]		! store one byte
-	ldub	[%o0+1], %o3		! load second byte
-	deccc	%o2
-	bz,pt	%ncc, .smallexit_bcopy
-	stb	%o3, [%o1+1]		! store second byte
-	ldub	[%o0+2], %o3		! load third byte
-	stb	%o3, [%o1+2]		! store third byte
-	membar	#Sync				! sync error barrier
-	andn	%o5, TRAMP_FLAG, %o5
-	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
-	retl
-	clr	%o0
-
-	.align	16
-	nop				! affects loop icache alignment
-.smallwords_bcopy:
-	lduw	[%o0], %o3		! read word
-.smallwordx_bcopy:
-	subcc	%o2, 8, %o2		! update count
-	stw	%o3, [%o1]		! write word
-	add	%o0, 8, %o0		! update SRC
-	lduw	[%o0-4], %o3		! read word
-	add	%o1, 8, %o1		! update DST
-	bgu,pt	%ncc, .smallwords_bcopy	! loop until done
-	stw	%o3, [%o1-4]		! write word
-	addcc	%o2, 7, %o2		! restore count
-	bz,pt	%ncc, .smallexit_bcopy	! check for completion
-	nop
-	cmp	%o2, 4			! check for 4 or more bytes left
-	blt	.smallleft3_bcopy		! if not, go to finish up
-	nop
-	lduw	[%o0], %o3
-	add	%o0, 4, %o0
-	subcc	%o2, 4, %o2
-	stw	%o3, [%o1]
-	add	%o1, 4, %o1
-	bnz,pt	%ncc, .smallleft3_bcopy
-	nop
-	membar	#Sync				! sync error barrier
-	andn	%o5, TRAMP_FLAG, %o5
-	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
-	retl
-	clr	%o0
-
-.smallword_bcopy:
-	subcc	%o2, 4, %o2		! update count
-	bgu,pt	%ncc, .smallwordx_bcopy
-	lduw	[%o0], %o3		! read word
-	addcc	%o2, 3, %o2		! restore count
-	bz,pt	%ncc, .smallexit_bcopy
-	stw	%o3, [%o1]		! write word
-	deccc	%o2			! reduce count for cc test
-	ldub	[%o0+4], %o3		! load one byte
-	bz,pt	%ncc, .smallexit_bcopy
-	stb	%o3, [%o1+4]		! store one byte
-	ldub	[%o0+5], %o3		! load second byte
-	deccc	%o2
-	bz,pt	%ncc, .smallexit_bcopy
-	stb	%o3, [%o1+5]		! store second byte
-	ldub	[%o0+6], %o3		! load third byte
-	stb	%o3, [%o1+6]		! store third byte
-.smallexit_bcopy:
-	membar	#Sync				! sync error barrier
-	andn	%o5, TRAMP_FLAG, %o5
-	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
-	retl
-	clr	%o0
-	.align 16
-.medium_bcopy:
-	neg	%o1, %g5
-	neg	%o0, %o3	
-	andcc	%g5, 7, %g5	! bytes till DST 8 byte aligned
-	and	%o3, 7, %o3	! bytes till SRC 8 byte aligned
-	cmp	%g5, %o3
-	bne	%ncc, continue
-	sub	%g5, %o3, %o3	! -(bytes till SRC aligned after DST aligned)
-				! o3={-7, -6, ... 7}  o3>0 => SRC overaligned
-	! src and dst are aligned.
-	mov	%o3, %g1		! save %o3
-	andcc	%o0, 7, %o3		! is src buf aligned on an 8-byte boundary
-	brz,pt	%o3, src_dst_aligned_on_8		
-	nop
-	mov	%o3, %g5
-	mov	8, %o4
-	sub 	%o4, %o3, %o3
-	cmp	%o3, %o2
-	bg,a,pn	%ncc, 1f
-	mov	%o2, %o3	
-1:
-	! %o3 has the number of bytes to be copied byte-by-byte.
-	sub	%o2, %o3, %o2
-	prefetch	[%o0],2
-7:
-	deccc	%o3			! byte copy loop
-	ldub	[%o0], %o4		! load one byte
-	stb	%o4, [%o1]
-	inc	%o1			! increment dst
-	bgu,pt	%ncc, 7b
-	inc	%o0			! increment src
-	mov	%g1, %o3		! restore %o3
-src_dst_aligned_on_8:
-	! check if we are copying 512 or more bytes
-	cmp	%o2, 511
-	bgu,pt	%ncc, copying_ge_512
-	nop
-	ba	.medlword_bcopy
-	nop
-
-continue:
-	andcc	%g5, 7, %g5	! bytes till DST 8 byte aligned
-	bz	%ncc, 2f
-	nop
-
-	sub	%o2, %g5, %o2	! update count
-
-1:
-	ldub	[%o0], %o4
-	deccc	%g5
-	inc	%o0
-	stb	%o4, [%o1]
-	bgu,pt	%ncc, 1b
-	inc	%o1
-
-	! Now DST is 8-byte aligned.  dst, from, o2 are current.
-
-2:
-	andcc	%o0, 0x3, %g0		! test alignment
-	bnz,pt	%ncc, .mediumsetup_bcopy	! branch to skip aligned cases
-					! if src, dst not aligned
-	prefetch [%o0 + (1 * VIS_BLOCKSIZE)], #n_reads
-
-/*
- * Handle all cases where src and dest are aligned on word
- * or long word boundaries.  Use unrolled loops for better
- * performance.  This option wins over standard large data
- * move when source and destination is in cache for medium
- * to short data moves.
- */
-	andcc	%o0, 0x7, %g0		! test word alignment
-	bz,pt	%ncc, src_dst_lword_aligned	! branch to long word aligned case
-	prefetch [%o0 + (2 * VIS_BLOCKSIZE)], #n_reads
-	cmp	%o2, MED_WMAX		! limit to store buffer size
-	bgu,pt	%ncc, .mediumrejoin_bcopy	! otherwise rejoin main loop
-	nop
-	subcc	%o2, 15, %o2		! adjust length to allow cc test
-					! for end of loop
-	ble,pt	%ncc, .medw15_bcopy		! skip big loop if less than 16
-	prefetch [%o0 + (3 * VIS_BLOCKSIZE)], #n_reads
-/*
- * no need to put prefetch in loop as prefetches have
- * already been issued for maximum loop size
- */
-.medw16_bcopy:
-	ld	[%o0], %o4		! load
-	subcc	%o2, 16, %o2		! decrement length count
-	stw	%o4, [%o1]		! and store
-	ld	[%o0+4], %o3		! a block of 16 bytes
-	add	%o0, 16, %o0		! increase src ptr by 16
-	stw	%o3, [%o1+4]
-	ld	[%o0-8], %o4
-	add	%o1, 16, %o1		! increase dst ptr by 16
-	stw	%o4, [%o1-8]
-	ld	[%o0-4], %o3
-	bgu,pt	%ncc, .medw16_bcopy		! repeat if at least 16 bytes left
-	stw	%o3, [%o1-4]
-.medw15_bcopy:
-	addcc	%o2, 15, %o2		! restore count
-	bz,pt	%ncc, .medwexit_bcopy		! exit if finished
-	nop
-	cmp	%o2, 8
-	blt,pt	%ncc, .medw7_bcopy		! skip if 7 or fewer bytes left
-	nop				!
-	ld	[%o0], %o4		! load 4 bytes
-	subcc	%o2, 8, %o2		! decrease count by 8
-	stw	%o4, [%o1]		! and store 4 bytes
-	add	%o0, 8, %o0		! increase src ptr by 8
-	ld	[%o0-4], %o3		! load 4 bytes
-	add	%o1, 8, %o1		! increase dst ptr by 8
-	stw	%o3, [%o1-4]		! and store 4 bytes
-	bz	%ncc, .medwexit_bcopy		! exit if finished
-	nop
-.medw7_bcopy:					! count is ge 1, less than 8
-	cmp	%o2, 3			! check for 4 bytes left
-	ble,pt	%ncc, .medw3_bcopy		! skip if 3 or fewer bytes left
-	nop				!
-	ld	[%o0], %o4		! load 4 bytes
-	sub	%o2, 4, %o2		! decrease count by 4
-	add	%o0, 4, %o0		! increase src ptr by 4
-	stw	%o4, [%o1]		! and store 4 bytes
-	add	%o1, 4, %o1		! increase dst ptr by 4
-	tst	%o2			! check for zero bytes left
-	bz	%ncc, .medwexit_bcopy		! exit if finished
-	nop
-.medw3_bcopy:					! count is known to be 1, 2, or 3
-	deccc	%o2			! reduce count by one
-	ldub	[%o0], %o3		! load one byte
-	bz,pt	%ncc, .medwexit_bcopy		! exit if last byte
-	stb	%o3, [%o1]		! store one byte
-	ldub	[%o0+1], %o3		! load second byte
-	deccc	%o2			! reduce count by one
-	bz,pt	%ncc, .medwexit_bcopy		! exit if last byte
-	stb	%o3, [%o1+1]		! store second byte
-	ldub	[%o0+2], %o3		! load third byte
-	stb	%o3, [%o1+2]		! store third byte
-.medwexit_bcopy:
-	membar	#Sync				! sync error barrier
-	andn	%o5, TRAMP_FLAG, %o5
-	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
-	retl
-	clr	%o0
-	
-/*
- * Special case for handling when src and dest are both long word aligned
- * and total data to move is between SMALL_MAX and MED_MAX bytes
- */
-
-	.align 16
-	nop
-src_dst_lword_aligned:
-.medlword_bcopy:				! long word aligned
-	cmp	%o2, MED_MAX		! limit to store buffer size
-	bgu,pt	%ncc, .mediumrejoin_bcopy	! otherwise rejoin main loop
-	nop
-	subcc	%o2, 31, %o2		! adjust length to allow cc test
-					! for end of loop
-	ble,pt	%ncc, .medl31_bcopy		! skip big loop if less than 32
-	prefetch [%o0 + (3 * VIS_BLOCKSIZE)], #n_reads ! into the l2 cache
-/*
- * no need to put prefetch in loop as prefetches have
- * already been issued for maximum loop size
- */
-.medl32_bcopy:
-	ldx	[%o0], %o4		! load
-	subcc	%o2, 32, %o2		! decrement length count
-	stx	%o4, [%o1]		! and store
-	ldx	[%o0+8], %o3		! a block of 32 bytes
-	add	%o0, 32, %o0		! increase src ptr by 32
-	stx	%o3, [%o1+8]
-	ldx	[%o0-16], %o4
-	add	%o1, 32, %o1		! increase dst ptr by 32
-	stx	%o4, [%o1-16]
-	ldx	[%o0-8], %o3
-	bgu,pt	%ncc, .medl32_bcopy		! repeat if at least 32 bytes left
-	stx	%o3, [%o1-8]
-.medl31_bcopy:
-	addcc	%o2, 16, %o2		! adjust remaining count
-	ble,pt	%ncc, .medl15_bcopy		! skip if 15 or fewer bytes left
-	nop				!
-	ldx	[%o0], %o4		! load and store 16 bytes
-	add	%o0, 16, %o0		! increase src ptr by 16
-	stx	%o4, [%o1]		!
-	sub	%o2, 16, %o2		! decrease count by 16
-	ldx	[%o0-8], %o3		!
-	add	%o1, 16, %o1		! increase dst ptr by 16
-	stx	%o3, [%o1-8]
-.medl15_bcopy:
-	addcc	%o2, 15, %o2		! restore count
-	bz,pt	%ncc, .medwexit_bcopy		! exit if finished
-	nop
-	cmp	%o2, 8
-	blt,pt	%ncc, .medw7_bcopy		! skip if 7 or fewer bytes left
-	nop
-	ldx	[%o0], %o4		! load 8 bytes
-	add	%o0, 8, %o0		! increase src ptr by 8
-	stx	%o4, [%o1]		! and store 8 bytes
-	subcc	%o2, 8, %o2		! decrease count by 8
-	bz	%ncc, .medwexit_bcopy		! exit if finished
-	add	%o1, 8, %o1		! increase dst ptr by 8
-	ba	.medw7_bcopy
-	nop
-
-	.align 16
-	nop
-	nop
-	nop
-unaligned_src_dst:
-
-.mediumsetup_bcopy:
-	prefetch [%o0 + (2 * VIS_BLOCKSIZE)], #one_read
-.mediumrejoin_bcopy:
-	! %o5 has the saved T_LOFAULT when we come here.
-	! We set a new error handler if the T_LOFAULT was set earlier OR
-	! KCOPY_FLAG is set.
-	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
-	mov	%i5, %l6
-	andn	%l6, TRAMP_FLAG, %o2
-	brz,pt	%o2, 1f
-	  nop
-	! We enter here if KCOPY_FLAG was set OR
-	! T_LOFAULT was set earlier.
-	! We only change the error handler pointer here.
-	! The flags TRAMP_FLAG and KCOPY_FLAG are left as they are in %l6.
-	sethi	%hi(.copyerr_fp_used), %o2
-	or	%o2, %lo(.copyerr_fp_used), %o2
-	membar	#Sync				! sync error barrier
-	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
-1:
-	FP_NOMIGRATE(6, 7)
-	mov	%i0, %o0
-	mov	%i1, %o1
-	mov	%i2, %o2
-	mov	%i3, %o3
-	mov	%i5, %o5
-	rd	%fprs, %o4		! check for unused fp
-	st	%o4, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
-
-	! FPU enabled ?  If not, enable it.
-	btst	FPRS_FEF, %o4
-	bz,a,pt	%icc, continue_bcopy
-	  wr	%g0, FPRS_FEF, %fprs
-
-	! save the FP registers even if DU is not set.
-
-	BST_FPQ3Q4_TOSTACK(%o4)
-	or	%l6, FPSAVED_FLAG, %l6
-	
-continue_bcopy:
-	rd	%gsr, %o4
-	stx	%o4, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
-	or	%l6, FPUSED_FLAG, %l6
-
-	add	%o0, 8, %o0		! prepare to round SRC upward
-
-	sethi	%hi(0x1234567f), %o5	! For GSR.MASK 
-	or	%o5, 0x67f, %o5
-
-	cmp	%o2, MEDIUM_MAX
-	bmask	%o5, %g0, %g0
-
-	! Compute o5 (number of bytes that need copying using the main loop).
-	! First, compute for the medium case.
-	! Then, if large case, o5 is replaced by count for block alignment.
-	! Be careful not to read past end of SRC
-	! Currently, o2 is the actual count remaining
-	!	    o3 is how much sooner we'll cross the alignment boundary
-	!		in SRC compared to in DST
-	!
-	! Examples:  Let # denote bytes that should not be accessed
-	!	    Let x denote a byte already copied to align DST
-	!	    Let . and - denote bytes not yet copied
-	!	    Let | denote double alignment boundaries
-	!
-	!	    DST:  ######xx|........|--------|..######   o2 = 18
-	!			  dst
-	!
-	!  o3 = -3:  SRC:  ###xx...|.....---|-----..#|########   o5 = 8
-	!			  from
-	!
-	!  o3 =  0:  SRC:  ######xx|........|--------|..######   o5 = 16-8 = 8
-	!				   from
-	!
-	!  o3 = +1:  SRC:  #######x|x.......|.-------|-..#####   o5 = 16-8 = 8
-	!				   from
-
-	mov	%asi, %g1		! save curr %asi
-	wr	%g0, ASI_CACHE_SPARING_P, %asi
-
-	or	%g0, -8, %o5
-	alignaddr %o0, %g0, %o0		! set GSR.ALIGN and align from
-
-	movrlz	%o3, %g0, %o5		! subtract 8 from o2+o3 only if o3>=0
-	add	%o5, %o2, %o5
-	add	%o5, %o3, %o5
-
-	bleu	%ncc, 4f
-	andn	%o5, 7, %o5		! 8 byte aligned count
-	neg	%o1, %o5		! 'large' case
-	and	%o5, VIS_BLOCKSIZE-1, %o5  ! bytes till DST block aligned
-4:	
-	brgez,a	%o3, .beginmedloop_bcopy
-	ldda	[%o0-8]%asi, %d32
-
-	add	%o0, %o3, %o0		! back up from
-5:
-	ldda	[%o0]ASI_FL8_P, %d34
-	inc	%o0
-	andcc	%o0, 7, %g0
-	bnz	%ncc, 5b
-	bshuffle %d32, %d34, %d32		! shifts d32 left 1 byte and or's in d34
-
-.beginmedloop_bcopy:
-	tst	%o5
-	bz	%ncc, .endmedloop_bcopy
-	sub	%o2, %o5, %o2		! update count for later
-
-	! Main loop to write out doubles.  Note: o5 & 7 == 0
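-	! The loop is software pipelined: %d32 and %d34 alternate as the
-	! previous and current 8-byte quantities, so each faligndata can
-	! merge the pair that straddles the source misalignment while the
-	! next load is already in flight.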
-	
-	ldd	[%o0], %d34
-	subcc	%o5, 8, %o5		! update local count
-	bz,pn	%ncc, 1f
-	add	%o0, 8, %o0		! update SRC
-
-.medloop_bcopy:
-	faligndata %d32, %d34, %d36
-	ldda	[%o0]%asi, %d32
-	subcc	%o5, 8, %o5		! update local count
-	add	%o0, 16, %o0		! update SRC
-	std	%d36, [%o1]
-	bz,pn	%ncc, 2f
-	faligndata %d34, %d32, %d38
-	ldda	[%o0 - 8]%asi, %d34
-	subcc	%o5, 8, %o5		! update local count
-	std	%d38, [%o1 + 8]
-	bnz,pt	%ncc, .medloop_bcopy
-	add	%o1, 16, %o1		! update DST
-
-1:	
-	faligndata %d32, %d34, %d36
-	fmovd	%d34, %d32
-	std	%d36, [%o1]
-	ba	.endmedloop_bcopy
-	add	%o1, 8, %o1
-	
-2:
-	std	%d38, [%o1 + 8]
-	sub	%o0, 8, %o0
-	add	%o1, 16, %o1
-	
-
-.endmedloop_bcopy:
-	! Currently, from is pointing to the next double-aligned byte in SRC
-	! The 8 bytes starting at [from-8] are available in d32
-	! At least one, and possibly all, of these need to be written.
-
-	cmp	%o2, VIS_BLOCKSIZE	
-	bgu	%ncc, .large_bcopy	! otherwise, at most VIS_BLOCKSIZE bytes left
-	
-#if 1
-
-	/* This code will use partial stores.  */
-
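-	! edge8n computes a byte mask for the final (possibly partial)
-	! 8-byte store, and the stda ASI_PST8_P below writes only the
-	! bytes selected by that mask, so we never store past the end of
-	! the destination buffer.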
-	mov	%g0, %o5
-	and	%o3, 7, %o3		! Number of bytes needed to completely
-					! fill %d32 with good (unwritten) data.
-
-	subcc	%o2, 8, %o2		! update count (maybe too much)
-	movl	%ncc, %o2, %o5		
-	addcc	%o3, %o5, %o5		! extra bytes we can stuff into %d32
-	sub	%o3, %o5, %o3		! update o3 (# bad bytes in %d32)
-
-	bz	%ncc, 2f
-	alignaddr %o3, %g0, %g0		! set GSR.ALIGN
-	
-1:
-	deccc	%o5
-	ldda	[%o0]ASI_FL8_P, %d34
-	inc	%o0
-	bgu	%ncc, 1b
-	bshuffle %d32, %d34, %d32		! shifts d32 left 1 byte and or's in d34
-
-2:
-	not     %o3
-	faligndata %d32, %d32, %d32	! shift bytes to the left
-	and	%o3, 7, %o3		! last byte to be stored in [%o1+%o3]
-	edge8n	%g0, %o3, %o5
-	stda	%d32, [%o1]%o5, ASI_PST8_P
-	brlez	%o2, exit_bcopy		
-	add	%o1, %o3, %o1		! update DST to last stored byte
-3:	
-	inc	%o1
-	deccc	%o2
-	ldub	[%o0], %o3
-	stb	%o3, [%o1]
-	bgu	%ncc, 3b
-	inc	%o0
-
-#else
-
-	andcc	%o3, 7, %o5		! Number of bytes needed to completely
-					! fill %d32 with good (unwritten) data.
-	bz	%ncc, 2f
-	sub	%o5, 8, %o3		! -(number of good bytes in %d32)
-	cmp	%o2, 8
-	bl,a	%ncc, 3f		! Not enough bytes to fill %d32
-	add	%o0, %o3, %o0 		! Back up %o0
-
-1:
-	deccc	%o5
-	ldda	[%o0]ASI_FL8_P, %d34
-	inc	%o0
-	bgu	%ncc, 1b
-	bshuffle %d32, %d34, %d32		! shifts d32 left 1 byte and or's in d34
-
-2:	
-	subcc	%o2, 8, %o2
-	std	%d32, [%o1]
-	bz	%ncc, exit_bcopy
-	add	%o1, 8, %o1
-3:	
-	ldub	[%o0], %o3
-	deccc	%o2
-	inc	%o0
-	stb	%o3, [%o1]
-	bgu	%ncc, 3b
-	inc	%o1
-#endif	
-
-exit_bcopy:
-	membar	#Sync
-
-	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
-	wr	%o2, 0, %gsr
-
-	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
-	! No need to restore regs if they were not saved
-	btst	FPSAVED_FLAG, %l6
-	bz	%ncc, 4f
-	  nop
-
-	BLD_FPQ3Q4_FROMSTACK(%o2)
-
-	ba,pt	%ncc, 5f
-	  wr	%o3, 0, %fprs		! restore fprs
-4:
-	FZEROQ3Q4
-	wr	%o3, 0, %fprs		! restore fprs
-5:
-	membar	#Sync				! sync error barrier
-	andn	%l6, MASK_FLAGS, %l6
-	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
-
-	mov	%g1, %asi		! restore %asi
-	FP_ALLOWMIGRATE(6, 7)
-	ret
-	  restore	%g0, 0, %o0
-
-
-	.align ICACHE_LINE_SIZE
-.large_bcopy:
-	! The following test for BSTORE_SIZE is used to decide whether
-	! to store data with a block store or with individual stores.
-	! The block store wins when the amount of data is so large
- * that it causes other application data to be moved out
- * of the L1 or L2 cache.
- * On a Panther, block store can lose more often because it
- * forces the stored data to be removed from the L3 cache.
-	!
-	sethi	%hi(BSTORE_SIZE),%o5
-	or	%o5,%lo(BSTORE_SIZE),%o5
-	cmp	%o2, %o5
-	bgu	%ncc, .xlarge_bcopy		
-
-	! %o1 I/O DST is 64-byte aligned
-	! %o0 I/O 8-byte aligned (and we've set GSR.ALIGN)
-	! %d32 I/O already loaded with SRC data from [%o0-8]
-	! %o2 I/O count (number of bytes that need to be written)
-	! %o3 I   Not written.  If zero, then SRC is double aligned.
-	! %o4 I   Not written.  Holds fprs.
-	! %o5   O The number of doubles that remain to be written.
-
-	! Load the rest of the current block 
-	! Recall that %o0 is further into SRC than %o1 is into DST
-
-	prefetch [%o1 + (0 * VIS_BLOCKSIZE)], #n_writes
-	prefetch [%o1 + (1 * VIS_BLOCKSIZE)], #n_writes
-	prefetch [%o1 + (2 * VIS_BLOCKSIZE)], #n_writes
-	ldda	[%o0]%asi, %d34
-	prefetch [%o0 + (3 * VIS_BLOCKSIZE)], #one_read
-	ldda	[%o0 + 0x8]%asi, %d36
-	faligndata %d32, %d34, %d48
-	ldda	[%o0 + 0x10]%asi, %d38
-	faligndata %d34, %d36, %d50
-	ldda	[%o0 + 0x18]%asi, %d40
-	faligndata %d36, %d38, %d52
-	ldda	[%o0 + 0x20]%asi, %d42
-	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
-	prefetch [%o0 + (4 * VIS_BLOCKSIZE)], #one_read
-	faligndata %d38, %d40, %d54
-	ldda	[%o0 + 0x28]%asi, %d44
-	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0  (needed later)
-	faligndata %d40, %d42, %d56
-	ldda	[%o0 + 0x30]%asi, %d46
-	faligndata %d42, %d44, %d58
-	ldda	[%o0 + 0x38]%asi, %d32
-	sub	%o2, VIS_BLOCKSIZE, %o2	! update count
-	prefetch [%o0 + (5 * VIS_BLOCKSIZE)], #one_read
-	add	%o0, VIS_BLOCKSIZE, %o0		! update SRC
-
-	! Main loop.  Write previous block.  Load rest of current block.
-	! Some bytes will be loaded that won't yet be written.
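-	! Registers %d48-%d62 hold the fully aligned previous block being
-	! stored while %d32-%d46 are refilled with the next block, letting
-	! the loads and stores of successive blocks overlap.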
-1:	
-	ldda	[%o0]%asi, %d34
-	faligndata %d44, %d46, %d60
-	ldda	[%o0 + 0x8]%asi, %d36
-	faligndata %d46, %d32, %d62
-	std	%d48, [%o1]
-	std	%d50, [%o1+8]
-	std	%d52, [%o1+16]
-	std	%d54, [%o1+24]
-	std	%d56, [%o1+32]
-	std	%d58, [%o1+40]
-	std	%d60, [%o1+48]
-	std	%d62, [%o1+56]
-	sub	%o2, VIS_BLOCKSIZE, %o2		! update count
-	prefetch [%o1 + (6 * VIS_BLOCKSIZE)], #n_writes
-	prefetch [%o1 + (3 * VIS_BLOCKSIZE)], #n_writes
-	add	%o1, VIS_BLOCKSIZE, %o1		! update DST
-	ldda	[%o0 + 0x10]%asi, %d38
-	faligndata %d32, %d34, %d48
-	ldda	[%o0 + 0x18]%asi, %d40
-	faligndata %d34, %d36, %d50
-	ldda	[%o0 + 0x20]%asi, %d42
-	faligndata %d36, %d38, %d52
-	ldda	[%o0 + 0x28]%asi, %d44
-	faligndata %d38, %d40, %d54
-	ldda	[%o0 + 0x30]%asi, %d46
-	faligndata %d40, %d42, %d56
-	ldda	[%o0 + 0x38]%asi, %d32
-	faligndata %d42, %d44, %d58
-	cmp	%o2, VIS_BLOCKSIZE + 8
-	prefetch [%o0 + (5 * VIS_BLOCKSIZE)], #one_read
-	bgu,pt	%ncc, 1b
-	add	%o0, VIS_BLOCKSIZE, %o0	! update SRC
-	faligndata %d44, %d46, %d60
-	faligndata %d46, %d32, %d62
-	stda	%d48, [%o1]ASI_BLK_P	! store 64 bytes, bypass cache
-	cmp	%o2, VIS_BLOCKSIZE
-	bne	%ncc, 2f		! exactly 1 block remaining?
-	add	%o1, VIS_BLOCKSIZE, %o1	! update DST
-	brz,a	%o3, 3f			! is SRC double aligned?
-	ldd	[%o0], %d34
-
-2:	
-	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8 
-	add	%o5, %o3, %o5
-
-	ba	.beginmedloop_bcopy
-	andn	%o5, 7, %o5		! 8 byte aligned count
-
-	! This is when there is exactly 1 block remaining and SRC is aligned
-3:
-	!  %d32 was loaded in the last iteration of the loop above, and
-	!  %d34 was loaded in the branch delay slot that got us here.
-	ldd	[%o0 + 0x08], %d36
-	ldd	[%o0 + 0x10], %d38
-	ldd	[%o0 + 0x18], %d40
-	ldd	[%o0 + 0x20], %d42
-	ldd	[%o0 + 0x28], %d44
-	ldd	[%o0 + 0x30], %d46
-	stda	%d32, [%o1]ASI_BLK_P
-
-	ba	exit_bcopy
-	nop
-
-	.align 16
-	! two nops here cause the loop starting at 1f below to be
-	! on a cache line boundary, improving performance
-	nop
-	nop
-xlarge:
-.xlarge_bcopy:
-	/*
-	set	4096, %l2
-	subcc	%o2, %l2, %g0
-	bge	%ncc, size_ge_4k
-	nop
-	*/
-	! %o1 I/O DST is 64-byte aligned
-	! %o0 I/O 8-byte aligned (and we've set GSR.ALIGN)
-	! %d32 I/O already loaded with SRC data from [%o0-8]
-	! %o2 I/O count (number of bytes that need to be written)
-	! %o3 I   Not written.  If zero, then SRC is double aligned.
-	! %o4 I   Not written.  Holds fprs.
-	! %o5   O The number of doubles that remain to be written.
-
-	! Load the rest of the current block 
-	! Recall that %o0 is further into SRC than %o1 is into DST
-
-	! prefetch [%o0 + (3 * VIS_BLOCKSIZE)], #one_read
-	! executed in delay slot for branch to .xlarge
-	prefetch [%o0 + (4 * VIS_BLOCKSIZE)], #one_read
-	prefetch [%o0 + (5 * VIS_BLOCKSIZE)], #one_read
-	ldda	[%o0]%asi, %d34
-	prefetch [%o0 + (6 * VIS_BLOCKSIZE)], #one_read
-	ldda	[%o0 + 0x8]%asi, %d36
-	faligndata %d32, %d34, %d48
-	ldda	[%o0 + 0x10]%asi, %d38
-	faligndata %d34, %d36, %d50
-	ldda	[%o0 + 0x18]%asi, %d40
-	faligndata %d36, %d38, %d52
-	ldda	[%o0 + 0x20]%asi, %d42
-	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
-	faligndata %d38, %d40, %d54
-	ldda	[%o0 + 0x28]%asi, %d44
-	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0  (needed later)
-	faligndata %d40, %d42, %d56
-	ldda	[%o0 + 0x30]%asi, %d46
-	faligndata %d42, %d44, %d58
-	ldda	[%o0 + 0x38]%asi, %d32
-	sub	%o2, VIS_BLOCKSIZE, %o2	! update count
-	prefetch [%o0 + (7 * VIS_BLOCKSIZE)], #one_read
-	add	%o0, VIS_BLOCKSIZE, %o0	! update SRC
-
-	! This point is 32-byte aligned, since 24 instructions have been
-	! issued since the previous alignment directive.
-	
-
-	! Main loop.  Write previous block.  Load rest of current block.
-	! Some bytes will be loaded that won't yet be written.
-1:
-	ldda	[%o0]%asi, %d34
-	faligndata %d44, %d46, %d60
-	ldda	[%o0 + 0x8]%asi, %d36
-	faligndata %d46, %d32, %d62
-	stda	%d48, [%o1]ASI_BLK_P
-	sub	%o2, VIS_BLOCKSIZE, %o2		! update count
-	ldda	[%o0 + 0x10]%asi, %d38
-	faligndata %d32, %d34, %d48
-	ldda	[%o0 + 0x18]%asi, %d40
-	faligndata %d34, %d36, %d50
-	ldda	[%o0 + 0x20]%asi, %d42
-	faligndata %d36, %d38, %d52
-	ldda	[%o0 + 0x28]%asi, %d44
-	faligndata %d38, %d40, %d54
-	ldda	[%o0 + 0x30]%asi, %d46
-	faligndata %d40, %d42, %d56
-	ldda	[%o0 + 0x38]%asi, %d32
-	faligndata %d42, %d44, %d58
-	! offset of 8*BLK+8 bytes works best over range of (src-dst) mod 1K
-	prefetch [%o0 + (8 * VIS_BLOCKSIZE) + 8], #one_read
-	add	%o1, VIS_BLOCKSIZE, %o1		! update DST
-	cmp	%o2, VIS_BLOCKSIZE + 8
-	! second prefetch important to correct for occasional dropped
-	! initial prefetches, 5*BLK works best over range of (src-dst) mod 1K
-	! strong prefetch prevents drops on Panther, but Jaguar and earlier
-	! US-III models treat strong prefetches as weak prefetches;
-	! to avoid regressions on customer hardware, we retain the prefetch
-	prefetch [%o0 + (5 * VIS_BLOCKSIZE)], #one_read
-	bgu,pt	%ncc, 1b
-	add	%o0, VIS_BLOCKSIZE, %o0	! update SRC
-
-	faligndata %d44, %d46, %d60
-	faligndata %d46, %d32, %d62
-	stda	%d48, [%o1]ASI_BLK_P	! store 64 bytes, bypass cache
-	cmp	%o2, VIS_BLOCKSIZE		
-	bne	%ncc, 2f		! exactly 1 block remaining?
-	add	%o1, VIS_BLOCKSIZE, %o1	! update DST
-	brz,a	%o3, 3f			! is SRC double aligned?
-	ldd	[%o0], %d34
-
-2:	
-	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8 
-	add	%o5, %o3, %o5
-
-
-	ba	.beginmedloop_bcopy
-	andn	%o5, 7, %o5		! 8 byte aligned count
-
-
-	! This is when there is exactly 1 block remaining and SRC is aligned
-3:
-	!  %d32 was loaded in the last iteration of the loop above, and
-	!  %d34 was loaded in the branch delay slot that got us here.
-	ldd	[%o0 + 0x08], %d36
-	ldd	[%o0 + 0x10], %d38
-	ldd	[%o0 + 0x18], %d40
-	ldd	[%o0 + 0x20], %d42
-	ldd	[%o0 + 0x28], %d44
-	ldd	[%o0 + 0x30], %d46
-	stda	%d32, [%o1]ASI_BLK_P
-
-	ba	exit_bcopy
-	nop
-
-copying_ge_512:
-	! both src and dst are aligned to 8 byte boundary
-	! and the number of bytes to copy is 512 or more.
-	! %o5 has the saved T_LOFAULT when we come here.
-	! We set a new error handler if the T_LOFAULT was set earlier OR
-	! KCOPY_FLAG is set.
-	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
-	mov	%i5, %l6
-	andn	%l6, TRAMP_FLAG, %o2
-	brz,pt	%o2, 1f
-	  nop
-	! We enter here if KCOPY_FLAG was set OR
-	! T_LOFAULT was set earlier.
-	! We only change the error handler pointer here.
-	! The flags TRAMP_FLAG and KCOPY_FLAG are left as they are in %l6.
-	sethi	%hi(.copyerr_fp_used), %o2
-	or	%o2, %lo(.copyerr_fp_used), %o2
-	membar	#Sync				! sync error barrier
-	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
-1:
-	FP_NOMIGRATE(6, 7)
-	mov	%i0, %o0
-	mov	%i1, %o1
-	mov	%i2, %o2
-	mov	%i3, %o3
-	mov	%i5, %o5
-	rd	%fprs, %o5		! check for unused fp
-	st	%o5, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
-
-	! FPU enabled ?  If not, enable it.
-	btst	FPRS_FEF, %o5
-	bz,a,pt	%icc, 1f
-	  wr	%g0, FPRS_FEF, %fprs
-
-
-	! save the FP registers even if DU is not set.
-
-	BST_FPQ3Q4_TOSTACK(%o5)
-	or	%l6, FPSAVED_FLAG, %l6
-1:
-	rd	%gsr, %o5
-	stx	%o5, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
-	or	%l6, FPUSED_FLAG, %l6
-	!prefetch 256 bytes from nearest 128 byte aligned src buf
-	sub     %o0,1,%o3
-	andn    %o3,0x7f,%l1
-	add     %l1,128,%l1
-	prefetch [%l1],2
-	prefetch [%l1+64],2
-	prefetch [%l1+(2*64)],2
-	prefetch [%l1+(3*64)],2
-	!prefetch 256 bytes from nearest 128 byte aligned dst buf
-	sub     %o1,1,%o3
-	andn    %o3,0x7f,%l1
-	add     %l1,128,%l1
-	prefetch [%l1],2
-	prefetch [%l1+64],2
-	prefetch [%l1+(2*64)],2
-	prefetch [%l1+(3*64)],2
-
-	andcc   %o1,0x7f,%o3	    !Check if dst buffer is 128 byte aligned
-	brz,pn  %o3,aligned_on_128
-	sub     %o3,128,%o3
-
-	add     %o2,%o3,%o2
-align_to_128:
-	ldxa	[%o0]ASI_CACHE_SPARING_P, %o4
-	add     %o0,8,%o0		! increment src pointer
-	stxa    %o4,[%o1]ASI_CACHE_SPARING_P
-	addcc   %o3,8,%o3
-	bl,pt   %ncc,align_to_128
-	add     %o1,8,%o1		! increment dst pointer
-
-aligned_on_128:
-	andcc	%o1,0x1ff,%o3	!Check if dst buffer is 512 byte aligned.
-	brnz,pn	%o3, 4f
-	mov	%o2,%l4		!l4=number of bytes to copy
-	! buffers are now 512 byte aligned.
-	! if we have 4096 or more bytes to copy we will use the
-	! stingray_optimized_copy
-	set	4096, %l2
-	subcc	%o2, %l2, %g0
-	bge,pn	%ncc, stingray_optimized_copy
-	nop
-4:
-	! determine how many bytes are left to be copied after the buffers
-	! are aligned to 512 byte boundary.
-	! if we have 4096 or more then we can perform stingray_optimized_copy
-	! register l4 will contain the number of bytes to copy after buffers
-	! are aligned to 512 byte boundary. l4 is set to 0 if we have less
-	! than 4096 bytes to copy after aligning buffers to 512 byte.
-	sub	%o1,8,%o5	! should be in current 512 chunk
-	andn 	%o5,0x1ff,%o3	! %o3=aligned 512b addr
-	add 	%o3,0x200,%o3	! %o3=next aligned 512b addr
-	sub 	%o3,%o1,%o3	! %o3=how many bytes to copy for 512 byte
-				! alignment
-	sub	%o2,%o3,%l4	! l4=bytes to copy after aligning buffers to 512
-	! if l4 is < 4096 do interleave128_copy only.
-	set	4096, %l2
-	subcc	%l4, %l2, %g0
-	bge,pn	%ncc,6f
-	nop
-	mov	%g0, %l4
-	add	%o1, %o2, %l1
-	ba	interleave128_copy
-	nop
-6:
-	mov	%o3, %o2
-	subcc 	%o3,256,%g0	!use interleave128_copy if 256 or more
-	bl,pn	%ncc,copy_word	!otherwise use copy_word to finish the 512 byte alignment
-	!%o2=new count, i.e. how many bytes to write
-	add     %o1,%o2,%l1	     !calculate the last dest byte to write, %l1
-	ba	interleave128_copy
-	nop
-
-	.align	64
-interleave128_copy:
-	! %l1 has the addr of the dest. buffer at or beyond which no write
-	! is to be done.
-	! %l4 has the number of bytes to copy using stingray_optimized_copy
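-	! Each pass below moves 256 bytes as thirty-two 8-byte loads and
-	! stores, pairing every offset with the copy 128 bytes ahead so
-	! that two 128-byte cache lines are serviced in parallel.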
-	!prefetch src 
-
-	add	%o0, 256, %o3
-	prefetch [%o3], 2	!1st 64 byte line of next 256 byte block
-	add	%o0, 384, %o3
-	prefetch [%o3], 2	!3rd 64 byte line of next 256 byte block
-	add	%o0, 320, %o3
-	prefetch [%o3], 2	!2nd 64 byte line of next 256 byte block
-	add	%o0, 448, %o3
-	prefetch [%o3], 2	!4th 64 byte line of next 256 byte block
-
-	!prefetch dst 
-
-	add	%o1, 256, %o3
-	prefetch [%o3], 2	!1st 64 byte line of next 256 byte block
-	add	%o1, 384, %o3
-	prefetch [%o3], 2	!3rd 64 byte line of next 256 byte block
-	add	%o1, 320, %o3
-	prefetch [%o3], 2	!2nd 64 byte line of next 256 byte block
-	add	%o1, 448, %o3
-	prefetch [%o3], 2	!4th 64 byte line of next 256 byte block
-
-	ldxa	[%o0]ASI_CACHE_SPARING_P, %o4
-	stxa     %o4,[%o1]ASI_CACHE_SPARING_P
-	add	%o0, 128, %o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, 128, %o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (1 * 8), %o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add	%o1, (1 * 8), %o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (1 * 8 + 128), %o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (1 * 8 + 128), %o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (2 * 8),%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (2 * 8),%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (2 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (2 * 8 + 128) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (3 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (3 * 8) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (3 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (3 * 8 + 128) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (4 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (4 * 8) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (4 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (4 * 8 + 128) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (5 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (5 * 8) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (5 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (5 * 8 + 128) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (6 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (6 * 8) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (6 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (6 * 8 + 128) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (7 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (7 * 8) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (7 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (7 * 8 + 128) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (8 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (8 * 8) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (8 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (8 * 8 + 128) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (9 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (9 * 8) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (9 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (9 * 8 + 128) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (10 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (10 * 8) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (10 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (10 * 8 + 128) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (11 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (11 * 8) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (11 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (11 * 8 + 128) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (12 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (12 * 8) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (12 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (12 * 8 + 128) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (13 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (13 * 8) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (13 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (13 * 8 + 128) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (14 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (14 * 8) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (14 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (14 * 8 + 128) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (15 * 8) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (15 * 8) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add     %o0, (15 * 8 + 128) ,%o3
-	ldxa	[%o3]ASI_CACHE_SPARING_P, %o4
-	add     %o1, (15 * 8 + 128) ,%o3
-	stxa    %o4,[%o3]ASI_CACHE_SPARING_P	
-	add	%o0, 256, %o0
-
-	! check that one more 256 byte copy pass will not write at or
-	! beyond the limit.
-	! %l2 = current dst + 512, the exclusive end of the next pass's
-	! writes (dst itself advances by 256 in the delay slot below).
-	! %l1 points to dest. buffer at or beyond which no writes should be done.
-	add     %o1,512,%l2
-	subcc   %l1,%l2,%g0
-	bge,pt  %ncc,interleave128_copy
-	add     %o1,256,%o1
-
-copy_word:
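-	! The two masks below leave %o3 = count mod 256 (bytes left in the
-	! last 256-byte chunk) and %o2 = count mod 8 (tail bytes after the
-	! last full double word).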
-	and     %o2,255,%o3
-	and     %o3,7,%o2
-
-	! Set the remaining doubles
-	subcc   %o3, 8, %o3		! Can we store any doubles?
-	bl,pn  %ncc, 6f
-	and	%o2, 7, %o2		! calc bytes left after doubles
-
-	!prefetch src 
-
-	mov	%o0, %o4
-	prefetch [%o4], 2	!1st 64 byte line of next 256 byte block
-	add	%o0, 128, %o4
-	prefetch [%o4], 2	!3rd 64 byte line of next 256 byte block
-	add	%o0, 64, %o4
-	prefetch [%o4], 2	!2nd 64 byte line of next 256 byte block
-	add	%o0, 192, %o4
-	prefetch [%o4], 2	!4th 64 byte line of next 256 byte block
-
-	!prefetch dst 
-
-	mov	%o1, %o4
-	prefetch [%o4], 2	!1st 64 byte line of next 256 byte block
-	add	%o1, 128, %o4
-	prefetch [%o4], 2	!3rd 64 byte line of next 256 byte block
-	add	%o1, 64, %o4
-	prefetch [%o4], 2	!2nd 64 byte line of next 256 byte block
-	add	%o1, 192, %o4
-	prefetch [%o4], 2	!4th 64 byte line of next 256 byte block
-
-5:	
-	ldxa	[%o0]ASI_CACHE_SPARING_P, %o4
-	add     %o0, 8, %o0      
-	stxa	%o4, [%o1]ASI_CACHE_SPARING_P
-	subcc   %o3, 8, %o3
-	bge,pt	%ncc, 5b
-	add     %o1, 8, %o1      
-6:
-	! Set the remaining bytes
-	brz	%o2,  can_we_do_stingray_optimized_copy
-	nop
-	
-7:
-	deccc	%o2			! byte copying loop
-	ldub	[%o0], %o4		! load one byte
-	stb	%o4, [%o1]
-	inc	%o1			! increment dst
-	bgu,pt	%ncc, 7b
-	inc	%o0			! increment src
-
-can_we_do_stingray_optimized_copy:
-	! %l4 contains the number of bytes to be copied
-	mov	%l4, %o2
-	brnz,pn	%o2, stingray_optimized_copy
-	nop
-	
-exit:	
-	membar	#Sync
-
-	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o5	! restore gsr
-	wr	%o5, 0, %gsr
-
-	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
-	! No need to restore regs if they were not saved
-	btst	FPSAVED_FLAG, %l6
-	bz	%ncc, 4f
-	  nop
-
-	BLD_FPQ3Q4_FROMSTACK(%o5)
-
-	ba,pt	%ncc, 5f
-	  wr	%o3, 0, %fprs		! restore fprs
-4:
-	FZEROQ3Q4
-	wr	%o3, 0, %fprs		! restore fprs
-5:
-	membar	#Sync				! sync error barrier
-	andn	%l6, MASK_FLAGS, %l6
-	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
-	FP_ALLOWMIGRATE(6, 7)
-	ret
-	  restore	%g0, 0, %o0
-
-
-stingray_optimized_copy:
-	 ! This code tries to maximize bandwidth by being clever about
-	 ! accessing the two cache lines that are BUDDY PAIRS in the L3 cache.  
-	 ! THIS VERSION IS OPTIMIZED FOR THE CASE OF SWAPPING PA BITS 6 and 9. 
-	 ! To keep this code simple, we assume the addresses given are aligned
-	 ! at least on a 128-byte boundary, and the length is assumed to be
-	 ! a multiple of 4k bytes.
-	 ! THIS VERSION USES BLKSTORES, AND PREFETCHES BOTH SOURCE AND
-	 ! DESTINATION DATA.
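-	 ! Concretely, the buddy loop (%l5) makes two passes starting 512
-	 ! bytes apart (PA[9] differs between the passes), and each pass
-	 ! copies the 32 lines that share one value of PA[9].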
-
-	add	%o1, %l4, %o2
-
-	!save original value of %o0 so we can restore it.
-	or      %g0,%o0,%l2
-	
-	wr      %g0,ASI_BLK_P,%asi
-
-	prefetch [%o0+0],2
-	prefetch [%o0+(64*1)],2
-	prefetch [%o0+(64*2)],2
-	prefetch [%o0+(64*3)],2
-	prefetch [%o0+(64*4)],2
-	prefetch [%o0+(64*5)],2
-	prefetch [%o0+(64*6)],2
-	prefetch [%o0+(64*7)],2
-	prefetch [%o0+(64*8)],2
-	prefetch [%o0+(64*9)],2
-	prefetch [%o0+(64*10)],2
-	prefetch [%o0+(64*11)],2
-	prefetch [%o0+(64*12)],2
-	prefetch [%o0+(64*13)],2
-	prefetch [%o0+(64*14)],2
-	prefetch [%o0+(64*15)],2
-
-	prefetch [%o1+0],2
-	prefetch [%o1+(64*1)],2
-	prefetch [%o1+(64*2)],2
-	prefetch [%o1+(64*3)],2
-	prefetch [%o1+(64*4)],2
-	prefetch [%o1+(64*5)],2
-	prefetch [%o1+(64*6)],2
-	prefetch [%o1+(64*7)],2
-	prefetch [%o1+(64*8)],2
-	prefetch [%o1+(64*9)],2
-	prefetch [%o1+(64*10)],2
-	prefetch [%o1+(64*11)],2
-	prefetch [%o1+(64*12)],2
-	prefetch [%o1+(64*13)],2
-	prefetch [%o1+(64*14)],2
-	prefetch [%o1+(64*15)],2
-	
-	ba      stingray_optimized_4k_copy_loop
-	srl	%l4, 12, %l4
-	
-	! Local register usage:
-	! %l1   address a short distance ahead of current src buf for prefetching
-	!	into L1 cache.
-	! %l2   address far ahead of current src buf for prefetching
-	!	into L2 cache.
-	! %l3   save %o1 at start of inner loop. 
-	! %l4	Number of 4k blocks to copy
-	! %g1   save src buf pointer at start of inner loop. 
-	! %l5   iteration counter to make buddy loop execute 2 times. 
-	! %o5   iteration counter to make inner loop execute 4 times. 
-	! %l7   address far ahead of current dst buf for prefetching dest
-	!	into L2 cache.
-	       
-	.align 64
-stingray_optimized_4k_copy_loop:
-	set      2, %l5		! %l5 is the loop count for the buddy loop
-	add      %o1, 0, %l3 
-	add      %o0, 0, %g1 
-buddyloop_bcopy:
-	set      PF_FAR, %g5
-	add      %o0, %g5, %l2	! Set %l2 to far ahead of src buffer to prefetch
-	!  For prefetching into L1 D$, set %l1 a little ahead of src buffer
-	add      %o0, PF_NEAR, %l1
-	add      %o1, %g5, %l7	! Set %l7 to far ahead of dst buffer to prefetch
-
-	add      %l2, %g5, %g5	! %g5 is now double far ahead of the src buffer
-	prefetch [%g5+%g0],2	! Prefetch ahead to get TLB entry in advance.
-	set      2*PF_FAR, %g5
-	add      %o1, %g5, %g5	! %g5 is now double far ahead of the dst buffer
-	prefetch [%g5+%g0],2	! Prefetch ahead to get TLB entry in advance.
-
-	set      4,%o5		! %o5 = loop count for the inner loop
-	set      0, %g5
-	
-	! Each iteration of the inner loop below copies 8 sequential lines.
-	! This loop is iterated 4 times, to move a total of 32 lines, all of
-	! which have the same value of PA[9], so we increment the base 
-	! address by 1024 bytes in each iteration, which varies PA[10].
-innerloop_bcopy:	  
-	! copy line 1 of 8
-	prefetch [%l2+%g5],2
-	prefetch [%l7+%g5],2
-	prefetch [%l1+%g5],1
-
-	ldd     [%o0],%d32
-	ldd     [%o0+8],%d34
-	ldd     [%o0+16],%d36
-	ldd     [%o0+24],%d38
-	ldd     [%o0+32],%d40
-	ldd     [%o0+40],%d42
-	ldd     [%o0+48],%d44
-	ldd     [%o0+56],%d46
-	stda    %d32,[%o1+0] %asi
-	add     %g5, 64, %g5
-	add     %o1, 64, %o1
-	add     %o0, 64, %o0
-
-	! copy line 2 of 8
-	prefetch [%l2+%g5],2
-	prefetch [%l7+%g5],2
-	prefetch [%l1+%g5],1
-
-	ldd     [%o0],%d32
-	ldd     [%o0+8],%d34
-	ldd     [%o0+16],%d36
-	ldd     [%o0+24],%d38
-	ldd     [%o0+32],%d40
-	ldd     [%o0+40],%d42
-	ldd     [%o0+48],%d44
-	ldd     [%o0+56],%d46
-	stda    %d32,[%o1+0] %asi
-	add     %g5, 64, %g5
-	add     %o1, 64, %o1
-	add     %o0, 64, %o0
-
-	! copy line 3 of 8
-	prefetch [%l2+%g5],2
-	prefetch [%l7+%g5],2
-	prefetch [%l1+%g5],1
-
-	ldd     [%o0],%d32
-	ldd     [%o0+8],%d34
-	ldd     [%o0+16],%d36
-	ldd     [%o0+24],%d38
-	ldd     [%o0+32],%d40
-	ldd     [%o0+40],%d42
-	ldd     [%o0+48],%d44
-	ldd     [%o0+56],%d46
-	stda    %d32,[%o1+0] %asi
-	add     %g5, 64, %g5
-	add     %o1, 64, %o1
-	add     %o0, 64, %o0
-
-	! copy line 4 of 8
-	prefetch [%l2+%g5],2
-	prefetch [%l7+%g5],2
-	prefetch [%l1+%g5],1
-
-	ldd     [%o0],%d32
-	ldd     [%o0+8],%d34
-	ldd     [%o0+16],%d36
-	ldd     [%o0+24],%d38
-	ldd     [%o0+32],%d40
-	ldd     [%o0+40],%d42
-	ldd     [%o0+48],%d44
-	ldd     [%o0+56],%d46
-	stda    %d32,[%o1+0] %asi
-	add     %g5, 64, %g5
-	add     %o1, 64, %o1
-	add     %o0, 64, %o0
-
-	! copy line 5 of 8
-	prefetch [%l2+%g5],2
-	prefetch [%l7+%g5],2
-	prefetch [%l1+%g5],1
-
-	ldd     [%o0],%d32
-	ldd     [%o0+8],%d34
-	ldd     [%o0+16],%d36
-	ldd     [%o0+24],%d38
-	ldd     [%o0+32],%d40
-	ldd     [%o0+40],%d42
-	ldd     [%o0+48],%d44
-	ldd     [%o0+56],%d46
-	stda    %d32,[%o1+0] %asi
-	add     %g5, 64, %g5
-	add     %o1, 64, %o1
-	add     %o0, 64, %o0
-
-	! copy line 6 of 8
-	prefetch [%l2+%g5],2
-	prefetch [%l7+%g5],2
-	prefetch [%l1+%g5],1
-
-	ldd     [%o0],%d32
-	ldd     [%o0+8],%d34
-	ldd     [%o0+16],%d36
-	ldd     [%o0+24],%d38
-	ldd     [%o0+32],%d40
-	ldd     [%o0+40],%d42
-	ldd     [%o0+48],%d44
-	ldd     [%o0+56],%d46
-	stda    %d32,[%o1+0] %asi
-	add     %g5, 64, %g5
-	add     %o1, 64, %o1
-	add     %o0, 64, %o0
-
-	! copy line 7 of 8
-	prefetch [%l2+%g5],2
-	prefetch [%l7+%g5],2
-	prefetch [%l1+%g5],1
-
-	ldd     [%o0],%d32
-	ldd     [%o0+8],%d34
-	ldd     [%o0+16],%d36
-	ldd     [%o0+24],%d38
-	ldd     [%o0+32],%d40
-	ldd     [%o0+40],%d42
-	ldd     [%o0+48],%d44
-	ldd     [%o0+56],%d46
-	stda    %d32,[%o1+0] %asi
-	add     %g5, 64, %g5
-	add     %o1, 64, %o1
-	add     %o0, 64, %o0
-
-	! copy line 8 of 8
-	prefetch [%l2+%g5],2
-	prefetch [%l7+%g5],2
-	prefetch [%l1+%g5],1
-
-	ldd     [%o0],%d32
-	ldd     [%o0+8],%d34
-	ldd     [%o0+16],%d36
-	ldd     [%o0+24],%d38
-	ldd     [%o0+32],%d40
-	ldd     [%o0+40],%d42
-	ldd     [%o0+48],%d44
-	ldd     [%o0+56],%d46
-	stda    %d32,[%o1+0] %asi
-
-	subcc   %o5,1,%o5	! Decrement the inner loop counter.
-	
-	! Now increment by 64 + 512 so we don't toggle PA[9]
-
-	add     %g5, 576, %g5
-	add     %o1, 576, %o1	! increment dst buffer
-
-	bg,pt   %icc,innerloop_bcopy
-	add     %o0, 576, %o0	! increment src buffer
-	! END OF INNER LOOP
-
-
-	subcc   %l5,1,%l5
-	add     %l3, 512, %o1	! increment dst buf to the first buddy line
-	bg,pt   %icc,buddyloop_bcopy
-	add     %g1, 512, %o0	! increment src buf to the first buddy line
-
-	subcc   %l4, 1, %l4
-	add     %o1, 3584, %o1	! Advance src and dst buffers by 4k
-	add     %o0, 3584, %o0	! They were already incremented by 512,
-				! so just add 3584.
-
-	bg,pt   %icc,stingray_optimized_4k_copy_loop
-	nop
-
-	! End of stingray_optimized_copy
-	! if we have 256 or more bytes to copy we use interleave128_copy
-	! else we use copy_word
-
-	sub	%o2,%o1,%o2	! bytes remaining to be copied
-	brz,pn	%o2,exit
-	mov	%g0,%l4
-	add     %o1,%o2,%l1	!calculate the last dest byte to write, %l1
-	subcc	%o2,256,%g0
-	bge,pt	%ncc,interleave128_copy
-	mov	%g0, %l4
-	
-	ba	copy_word
-	nop
-	
-	SET_SIZE(bcopy)
-	SET_SIZE(__align_cpy_1)
-#endif	/* lint */
-
-#define	REALSRC	%i0
-#define	DST	%i1
-#define	CNT	%i2
-#define	SRC	%i3
-#define	TMP	%i5
-
-/*
- * Block copy with possibly overlapped operands.
- */
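-/*
- * If the regions cannot overlap (count <= |from - to|) this simply
- * tail-calls bcopy.  Otherwise it copies forward when from > to and
- * backward when from < to, so source bytes are never overwritten
- * before they have been read.
- */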
-
-#if defined(lint)
-
-/*ARGSUSED*/
-void
-ovbcopy(const void *from, void *to, size_t count)
-{}
-
-#else	/* lint */
-
-	ENTRY(ovbcopy)
-	tst	%o2			! check count
-	bgu,a	%ncc, 1f		! branch if there is work to do
-	  subcc	%o0, %o1, %o3		! difference of from and to address
-
-	retl				! return; nothing to do or bad arguments
-	  nop
-1:
-	bneg,a	%ncc, 2f
-	  neg	%o3			! if < 0, make it positive
-2:	cmp	%o2, %o3		! cmp size and abs(from - to)
-	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
-	  .empty				!   no overlap
-	  cmp	%o0, %o1		! compare from and to addresses
-	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
-	  nop
-	!
-	! Copy forwards.
-	!
-.ov_fwd:
-	ldub	[%o0], %o3		! read from address
-	inc	%o0			! inc from address
-	stb	%o3, [%o1]		! write to address
-	deccc	%o2			! dec count
-	bgu	%ncc, .ov_fwd		! loop till done
-	  inc	%o1			! inc to address
-
-	retl				! return
-	  nop
-	!
-	! Copy backwards.
-	!
-.ov_bkwd:
-	deccc	%o2			! dec count
-	ldub	[%o0 + %o2], %o3	! get byte at end of src
-	bgu	%ncc, .ov_bkwd		! loop till done
-	  stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
-
-	retl				! return
-	  nop
-
-	SET_SIZE(ovbcopy)
-
-#endif	/* lint */
-
-
-/*
- * hwblkpagecopy()
- *
- * Copies exactly one page.  This routine assumes the caller (ppcopy)
- * has already disabled kernel preemption and has checked
- * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
- */
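-/*
- * Note that after the block-store loop the routine calls
- * rock_sync_icache() on the destination page, so any cached
- * instruction text stays coherent with the freshly copied data.
- */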
-#ifdef lint
-/*ARGSUSED*/
-void
-hwblkpagecopy(const void *src, void *dst)
-{ }
-#else /* lint */
-	ENTRY(hwblkpagecopy)
-	! get another window w/space for three aligned blocks of saved fpregs
-	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
-
-	! %i0 - source address (arg)
-	! %i1 - destination address (arg)
-	! %i2 - length of region (not arg)
-	! %l0 - saved fprs
-	! %l1 - pointer to saved fpregs
-
-	rd	%fprs, %l0		! check for unused fp
-
-	! FPU enabled ?  If not, enable it.
-	btst	FPRS_FEF, %l0
-	bz,a,pt	%icc, 1f
-	  wr	%g0, FPRS_FEF, %fprs
-
-	! save the FP registers even if DU is not set.
-
-	BST_FPQ3Q4_TOSTACK(%l1)
-
-1:	set	PAGESIZE, CNT
-	mov	%i1, %o0		! store destination address for flushing
-	mov	REALSRC, SRC
-
-	prefetch [SRC], #one_read
-	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
-	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
-	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
-	ldd	[SRC], %d32
-#if FIRST_PREFETCH > 4
-	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
-#endif
-	ldd	[SRC + 0x08], %d34
-#if FIRST_PREFETCH > 5
-	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
-#endif
-	ldd	[SRC + 0x10], %d36
-#if FIRST_PREFETCH > 6
-	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
-#endif
-	faligndata %d32, %d34, %d48
-	ldd	[SRC + 0x18], %d38
-#if FIRST_PREFETCH > 7
-	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
-#endif
-	faligndata %d34, %d36, %d50
-	ldd	[SRC + 0x20], %d40
-	faligndata %d36, %d38, %d52
-	ldd	[SRC + 0x28], %d42
-	faligndata %d38, %d40, %d54
-	ldd	[SRC + 0x30], %d44
-	faligndata %d40, %d42, %d56
-	ldd	[SRC + 0x38], %d46
-	faligndata %d42, %d44, %d58
-	ldd	[SRC + VIS_BLOCKSIZE], %d32
-	sub	CNT, VIS_BLOCKSIZE, CNT
-	add	SRC, VIS_BLOCKSIZE, SRC
-	ba,a,pt	%ncc, 2f
-	  nop
-	.align	ICACHE_LINE_SIZE
-2:
-	ldd	[SRC + 0x08], %d34
-	faligndata %d44, %d46, %d60
-	ldd	[SRC + 0x10], %d36
-	faligndata %d46, %d32, %d62
-	stda	%d48, [DST]ASI_BLK_P
-	ldd	[SRC + 0x18], %d38
-	faligndata %d32, %d34, %d48
-	ldd	[SRC + 0x20], %d40
-	faligndata %d34, %d36, %d50
-	ldd	[SRC + 0x28], %d42
-	faligndata %d36, %d38, %d52
-	ldd	[SRC + 0x30], %d44
-	faligndata %d38, %d40, %d54
-	ldd	[SRC + 0x38], %d46
-	faligndata %d40, %d42, %d56
-	ldd	[SRC + VIS_BLOCKSIZE], %d32
-	faligndata %d42, %d44, %d58
-	prefetch [SRC + ((FIRST_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
-	sub	CNT, VIS_BLOCKSIZE, CNT
-	add	DST, VIS_BLOCKSIZE, DST
-	cmp	CNT, VIS_BLOCKSIZE + 8
-	prefetch [SRC + ((SECOND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
-	bgu,pt	%ncc, 2b
-	  add	SRC, VIS_BLOCKSIZE, SRC
-
-	! trailing block
-	ldd	[SRC + 0x08], %d34
-	faligndata %d44, %d46, %d60
-	ldd	[SRC + 0x10], %d36
-	faligndata %d46, %d32, %d62
-	stda	%d48, [DST]ASI_BLK_P
-	ldd	[SRC + 0x18], %d38
-	ldd	[SRC + 0x20], %d40
-	ldd	[SRC + 0x28], %d42
-	ldd	[SRC + 0x30], %d44
-	ldd	[SRC + 0x38], %d46
-	sub	CNT, VIS_BLOCKSIZE, CNT
-	add	DST, VIS_BLOCKSIZE, DST
-	add	SRC, VIS_BLOCKSIZE, SRC
-	stda	%d32, [DST]ASI_BLK_P
-
-	set	PAGESIZE, %o1
-	call	rock_sync_icache
-	nop
-
-	membar	#Sync
-
-	btst	FPRS_FEF, %l0
-	bz,pt	%icc, 2f
-	  nop
-
-	BLD_FPQ3Q4_FROMSTACK(%l3)
-	ba	3f
-	  nop
-
-2:	FZEROQ3Q4
-
-3:	wr	%l0, 0, %fprs		! restore fprs
-	ret
-	  restore	%g0, 0, %o0
-
-	SET_SIZE(hwblkpagecopy)
-#endif	/* lint */
-
-
-/*
- * Transfer data to and from user space -
- * Note that these routines can cause faults
- * It is assumed that the kernel has nothing at
- * less than KERNELBASE in the virtual address space.
- *
- * Note that copyin(9F) and copyout(9F) are part of the
- * DDI/DKI which specifies that they return '-1' on "errors."
- *
- * Sigh.
- *
- * So there are two extremely similar routines - xcopyin() and xcopyout()
- * which return the errno that we've faithfully computed.  This
- * allows other callers (e.g. uiomove(9F)) to work correctly.
- * Given that these are used pretty heavily, we expand the calling
- * sequences inline for all flavours (rather than making wrappers).
- *
- * There are also stub routines for xcopyout_little and xcopyin_little,
- * which currently are intended to handle requests of <= 16 bytes from
- * do_unaligned. Future enhancement to make them handle 8k pages efficiently
- * is left as an exercise...
- */
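-/*
- * Illustrative (hypothetical) callers, showing why both flavors exist:
- *
- *	if (copyout(kaddr, uaddr, len) != 0)
- *		return (EFAULT);	(copyout returned -1; caller picks errno)
- *
- *	if ((error = xcopyout(kaddr, uaddr, len)) != 0)
- *		return (error);		(xcopyout returned the errno itself)
- */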
-
-/*
- * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
- *	
- * General theory of operation:
- *
- * The only difference between copy{in,out} and
- * xcopy{in,out} is in the error handling routine they invoke
- * when a memory access error occurs. xcopyOP returns the errno
- * while copyOP returns -1 (see above). copy{in,out}_noerr set
- * a special flag (by ORing the TRAMP_FLAG into the fault handler address)
- * if they are called with a fault handler already in place. That flag
- * causes the default handlers to trampoline to the previous handler
- * upon an error.
- *
- * None of the copyops routines grab a window until it's decided that
- * we need to do a HW block copy operation. This saves a window
- * spill/fill when we're called during socket ops. The typical IO
- * path won't cause spill/fill traps.
- *
- * This code uses a set of 4 limits for the maximum size that will
- * be copied given a particular input/output address alignment.
- * If the value for a particular limit is zero, the copy will be performed
- * by the plain copy loops rather than FPBLK.
- *
- * See the description of bcopy above for more details of the
- * data copying algorithm and the default limits.
- *
- */
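-/*
- * Concretely, the limits are hw_copy_limit_1/2/4/8, indexed by the
- * mutual (byte/halfword/word/longword) alignment of source and
- * destination; the dispatch code below loads the matching limit
- * before choosing between the small-copy and FP-block paths.
- */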
-
-/*
- * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
- */
-
-#if defined(lint)
-
-
-#else	/* lint */
-/*
- * We save the arguments in the following registers in case of a fault:
- *	kaddr - %l1
- *	uaddr - %l2
- *	count - %l3
- */
-#define SAVE_SRC	%l1
-#define SAVE_DST	%l2
-#define SAVE_COUNT	%l3
-
-#define SM_SAVE_SRC		%g4
-#define SM_SAVE_DST		%g5
-#define SM_SAVE_COUNT		%o5
-#define ERRNO		%l5
-
-
-#define REAL_LOFAULT	%l4
-/*
- * Generic copyio fault handler.  This is the first line of defense when a
- * fault occurs in (x)copyin/(x)copyout.  In order for this to function
- * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
- * This allows us to share common code for all the flavors of the copy
- * operations, including the _noerr versions.
- *
- * Note that this function will restore the original input parameters before
- * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
- * member of the t_copyop structure, if needed.
- */
-	ENTRY(copyio_fault)
-	membar	#Sync
-	mov	%g1,ERRNO			! save errno in ERRNO
-	btst	FPUSED_FLAG, %l6
-	bz	%ncc, 1f
-	  nop
-
-	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
-	wr	%o2, 0, %gsr    	! restore gsr
-
-	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
-	btst	FPRS_FEF, %o3
-	bz,pt	%icc, 4f
-	  nop
-
-	BLD_FPQ3Q4_FROMSTACK(%o2)
-
-	ba,pt	%ncc, 1f
-	  wr	%o3, 0, %fprs   	! restore fprs
-
-4:
-	FZEROQ3Q4
-	wr	%o3, 0, %fprs   	! restore fprs
-
-1:
-	andn	%l6, FPUSED_FLAG, %l6
-	membar	#Sync
-	stn	%l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
-	FP_ALLOWMIGRATE(5, 6)
-
-	mov	SAVE_SRC, %i0
-	mov	SAVE_DST, %i1
-	jmp	REAL_LOFAULT
-	  mov	SAVE_COUNT, %i2
-
-	SET_SIZE(copyio_fault)
-
-
-#endif
-
-#if defined(lint)
-
-/*ARGSUSED*/
-int
-copyout(const void *kaddr, void *uaddr, size_t count)
-{ return (0); }
-
-#else	/* lint */
-
-	ENTRY(copyout)
-
-	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
-	bleu,pt	%ncc, .copyout_small		! go to small copy cases
-	  xor	%o0, %o1, %o3			! are src, dst alignable?
-	btst	7, %o3				!
-	bz,pt	%ncc, .copyout_8		! check for longword alignment
-	  nop
-	btst	1, %o3				! 
-	bz,pt	%ncc, .copyout_2		! check for half-word
-	  nop
-	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
-	tst	%o3
-	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .copyout_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
-	  nop
-.copyout_2:
-	btst	3, %o3				!
-	bz,pt	%ncc, .copyout_4		! check for word alignment
-	  nop
-	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
-	tst	%o3
-	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .copyout_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
-	  nop
-.copyout_4:
-	! already checked longword, must be word aligned
-	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
-	tst	%o3
-	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .copyout_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
-	  nop
-.copyout_8:
-	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
-	tst	%o3
-	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .copyout_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
-	  nop
-
-	.align	16
-	nop				! instruction alignment
-					! see discussion at start of file
-.copyout_small:
-	sethi	%hi(.sm_copyout_err), %o5	! .sm_copyout_err is lofault
-	or	%o5, %lo(.sm_copyout_err), %o5
-	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
-	membar	#Sync				! sync error barrier
-	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
-.sm_do_copyout:
-	mov	%o0, SM_SAVE_SRC
-	mov	%o1, SM_SAVE_DST
-	cmp	%o2, SHORTCOPY		! check for really short case
-	bleu,pt	%ncc, .co_sm_left	!
-	  mov	%o2, SM_SAVE_COUNT
-	cmp	%o2, CHKSIZE		! check for medium length cases
-	bgu,pn	%ncc, .co_med		!
-	  or	%o0, %o1, %o3		! prepare alignment check
-	andcc	%o3, 0x3, %g0		! test for alignment
-	bz,pt	%ncc, .co_sm_word	! branch to word aligned case
-.co_sm_movebytes:
-	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
-.co_sm_notalign4:
-	ldub	[%o0], %o3		! read byte
-	subcc	%o2, 4, %o2		! reduce count by 4
-	stba	%o3, [%o1]ASI_USER	! write byte
-	inc	%o1			! advance DST by 1
-	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
-	add	%o0, 4, %o0		! advance SRC by 4
-	stba	%o3, [%o1]ASI_USER
-	inc	%o1			! advance DST by 1
-	ldub	[%o0 - 2], %o3
-	stba	%o3, [%o1]ASI_USER
-	inc	%o1			! advance DST by 1
-	ldub	[%o0 - 1], %o3
-	stba	%o3, [%o1]ASI_USER
-	bgt,pt	%ncc, .co_sm_notalign4	! loop til 3 or fewer bytes remain
-	  inc	%o1			! advance DST by 1
-	add	%o2, 3, %o2		! restore count
-.co_sm_left:
-	tst	%o2
-	bz,pt	%ncc, .co_sm_exit	! check for zero length
-	  nop
-	ldub	[%o0], %o3		! load one byte
-	deccc	%o2			! reduce count for cc test
-	bz,pt	%ncc, .co_sm_exit
-	  stba	%o3,[%o1]ASI_USER	! store one byte
-	ldub	[%o0 + 1], %o3		! load second byte
-	deccc	%o2
-	inc	%o1
-	bz,pt	%ncc, .co_sm_exit
-	  stba	%o3,[%o1]ASI_USER	! store second byte
-	ldub	[%o0 + 2], %o3		! load third byte
-	inc	%o1
-	stba	%o3,[%o1]ASI_USER	! store third byte
-	membar	#Sync				! sync error barrier
-	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
-	retl
-	  mov	%g0, %o0		! return 0
-	.align	16
-.co_sm_words:
-	lduw	[%o0], %o3		! read word
-.co_sm_wordx:
-	subcc	%o2, 8, %o2		! update count
-	stwa	%o3, [%o1]ASI_USER	! write word
-	add	%o0, 8, %o0		! update SRC
-	lduw	[%o0 - 4], %o3		! read word
-	add	%o1, 4, %o1		! update DST
-	stwa	%o3, [%o1]ASI_USER	! write word
-	bgt,pt	%ncc, .co_sm_words	! loop til done
-	  add	%o1, 4, %o1		! update DST
-	addcc	%o2, 7, %o2		! restore count
-	bz,pt	%ncc, .co_sm_exit
-	  nop
-	deccc	%o2
-	bz,pt	%ncc, .co_sm_byte
-.co_sm_half:
-	  subcc	%o2, 2, %o2		! reduce count by 2
-	lduh	[%o0], %o3		! read half word
-	add	%o0, 2, %o0		! advance SRC by 2
-	stha	%o3, [%o1]ASI_USER	! write half word
-	bgt,pt	%ncc, .co_sm_half	! loop til done
-	  add	%o1, 2, %o1		! advance DST by 2
-	addcc	%o2, 1, %o2		! restore count
-	bz,pt	%ncc, .co_sm_exit
-	  nop
-.co_sm_byte:
-	ldub	[%o0], %o3
-	stba	%o3, [%o1]ASI_USER
-	membar	#Sync				! sync error barrier
-	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
-	retl
-	  mov	%g0, %o0		! return 0
-	.align 16
-.co_sm_word:
-	subcc	%o2, 4, %o2		! update count
-	bgt,pt	%ncc, .co_sm_wordx
-	  lduw	[%o0], %o3		! read word
-	addcc	%o2, 3, %o2		! restore count
-	bz,pt	%ncc, .co_sm_exit
-	  stwa	%o3, [%o1]ASI_USER	! write word
-	deccc	%o2			! reduce count for cc test
-	ldub	[%o0 + 4], %o3		! load one byte
-	add	%o1, 4, %o1
-	bz,pt	%ncc, .co_sm_exit
-	  stba	%o3, [%o1]ASI_USER	! store one byte
-	ldub	[%o0 + 5], %o3		! load second byte
-	deccc	%o2
-	inc	%o1
-	bz,pt	%ncc, .co_sm_exit
-	  stba	%o3, [%o1]ASI_USER	! store second byte
-	ldub	[%o0 + 6], %o3		! load third byte
-	inc	%o1
-	stba	%o3, [%o1]ASI_USER	! store third byte
-.co_sm_exit:
-	  membar	#Sync				! sync error barrier
-	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
-	retl
-	  mov	%g0, %o0		! return 0
-
-	.align 16
-.co_med:
-	xor	%o0, %o1, %o3		! setup alignment check
-	btst	1, %o3
-	bnz,pt	%ncc, .co_sm_movebytes	! unaligned
-	  nop
-	btst	3, %o3
-	bnz,pt	%ncc, .co_med_half	! halfword aligned
-	  nop
-	btst	7, %o3
-	bnz,pt	%ncc, .co_med_word	! word aligned
-	  nop
-.co_med_long:
-	btst	3, %o0			! check for
-	bz,pt	%ncc, .co_med_long1	! word alignment
-	  nop
-.co_med_long0:
-	ldub	[%o0], %o3		! load one byte
-	inc	%o0
-	stba	%o3,[%o1]ASI_USER	! store byte
-	inc	%o1
-	btst	3, %o0
-	bnz,pt	%ncc, .co_med_long0
-	  dec	%o2
-.co_med_long1:			! word aligned
-	btst	7, %o0			! check for long word
-	bz,pt	%ncc, .co_med_long2
-	  nop
-	lduw	[%o0], %o3		! load word
-	add	%o0, 4, %o0		! advance SRC by 4
-	stwa	%o3, [%o1]ASI_USER	! store word
-	add	%o1, 4, %o1		! advance DST by 4
-	sub	%o2, 4, %o2		! reduce count by 4
-!
-!  Now long word aligned and have at least 32 bytes to move
-!
-.co_med_long2:
-	sub	%o2, 31, %o2		! adjust count to allow cc zero test
-	sub	%o1, 8, %o1		! adjust pointer to allow store in
-					! branch delay slot instead of add
-.co_med_lmove:
-	add	%o1, 8, %o1		! advance DST by 8
-	ldx	[%o0], %o3		! read long word
-	subcc	%o2, 32, %o2		! reduce count by 32
-	stxa	%o3, [%o1]ASI_USER	! write long word
-	add	%o1, 8, %o1		! advance DST by 8
-	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
-	add	%o0, 32, %o0		! advance SRC by 32
-	stxa	%o3, [%o1]ASI_USER
-	ldx	[%o0 - 16], %o3
-	add	%o1, 8, %o1		! advance DST by 8
-	stxa	%o3, [%o1]ASI_USER
-	ldx	[%o0 - 8], %o3
-	add	%o1, 8, %o1		! advance DST by 8
-	bgt,pt	%ncc, .co_med_lmove	! loop til 31 or fewer bytes left
-	  stxa	%o3, [%o1]ASI_USER
-	add	%o1, 8, %o1		! advance DST by 8
-	addcc	%o2, 24, %o2		! restore count to long word offset
-	ble,pt	%ncc, .co_med_lextra	! check for more long words to move
-	  nop
-.co_med_lword:
-	ldx	[%o0], %o3		! read long word
-	subcc	%o2, 8, %o2		! reduce count by 8
-	stxa	%o3, [%o1]ASI_USER	! write long word
-	add	%o0, 8, %o0		! advance SRC by 8
-	bgt,pt	%ncc, .co_med_lword	! loop til 7 or fewer bytes left
-	  add	%o1, 8, %o1		! advance DST by 8
-.co_med_lextra:
-	addcc	%o2, 7, %o2		! restore rest of count
-	bz,pt	%ncc, .co_sm_exit	! if zero, then done
-	  deccc	%o2
-	bz,pt	%ncc, .co_sm_byte
-	  nop
-	ba,pt	%ncc, .co_sm_half
-	  nop
-
-	.align 16
-	nop				! instruction alignment
-					! see discussion at start of file
-.co_med_word:
-	btst	3, %o0			! check for
-	bz,pt	%ncc, .co_med_word1	! word alignment
-	  nop
-.co_med_word0:
-	ldub	[%o0], %o3		! load one byte
-	inc	%o0
-	stba	%o3,[%o1]ASI_USER	! store byte
-	inc	%o1
-	btst	3, %o0
-	bnz,pt	%ncc, .co_med_word0
-	  dec	%o2
-!
-!  Now word aligned and have at least 36 bytes to move
-!
-.co_med_word1:
-	sub	%o2, 15, %o2		! adjust count to allow cc zero test
-.co_med_wmove:
-	lduw	[%o0], %o3		! read word
-	subcc	%o2, 16, %o2		! reduce count by 16
-	stwa	%o3, [%o1]ASI_USER	! write word
-	add	%o1, 4, %o1		! advance DST by 4
-	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
-	add	%o0, 16, %o0		! advance SRC by 16
-	stwa	%o3, [%o1]ASI_USER
-	add	%o1, 4, %o1		! advance DST by 4
-	lduw	[%o0 - 8], %o3
-	stwa	%o3, [%o1]ASI_USER
-	add	%o1, 4, %o1		! advance DST by 4
-	lduw	[%o0 - 4], %o3
-	stwa	%o3, [%o1]ASI_USER
-	bgt,pt	%ncc, .co_med_wmove	! loop til 15 or fewer bytes left
-	  add	%o1, 4, %o1		! advance DST by 4
-	addcc	%o2, 12, %o2		! restore count to word offset
-	ble,pt	%ncc, .co_med_wextra	! check for more words to move
-	  nop
-.co_med_word2:
-	lduw	[%o0], %o3		! read word
-	subcc	%o2, 4, %o2		! reduce count by 4
-	stwa	%o3, [%o1]ASI_USER	! write word
-	add	%o0, 4, %o0		! advance SRC by 4
-	bgt,pt	%ncc, .co_med_word2	! loop til 3 or fewer bytes left
-	  add	%o1, 4, %o1		! advance DST by 4
-.co_med_wextra:
-	addcc	%o2, 3, %o2		! restore rest of count
-	bz,pt	%ncc, .co_sm_exit	! if zero, then done
-	  deccc	%o2
-	bz,pt	%ncc, .co_sm_byte
-	  nop
-	ba,pt	%ncc, .co_sm_half
-	  nop
-
-	.align 16
-	nop				! instruction alignment
-	nop				! see discussion at start of file
-	nop
-.co_med_half:
-	btst	1, %o0			! check for
-	bz,pt	%ncc, .co_med_half1	! half word alignment
-	  nop
-	ldub	[%o0], %o3		! load one byte
-	inc	%o0
-	stba	%o3,[%o1]ASI_USER	! store byte
-	inc	%o1
-	dec	%o2
-!
-!  Now half word aligned and have at least 38 bytes to move
-!
-.co_med_half1:
-	sub	%o2, 7, %o2		! adjust count to allow cc zero test
-.co_med_hmove:
-	lduh	[%o0], %o3		! read half word
-	subcc	%o2, 8, %o2		! reduce count by 8
-	stha	%o3, [%o1]ASI_USER	! write half word
-	add	%o1, 2, %o1		! advance DST by 2
-	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
-	add	%o0, 8, %o0		! advance SRC by 8
-	stha	%o3, [%o1]ASI_USER
-	add	%o1, 2, %o1		! advance DST by 2
-	lduh	[%o0 - 4], %o3
-	stha	%o3, [%o1]ASI_USER
-	add	%o1, 2, %o1		! advance DST by 2
-	lduh	[%o0 - 2], %o3
-	stha	%o3, [%o1]ASI_USER
-	bgt,pt	%ncc, .co_med_hmove	! loop til 7 or fewer bytes left
-	  add	%o1, 2, %o1		! advance DST by 2
-	addcc	%o2, 7, %o2		! restore count
-	bz,pt	%ncc, .co_sm_exit
-	  deccc	%o2
-	bz,pt	%ncc, .co_sm_byte
-	  nop
-	ba,pt	%ncc, .co_sm_half
-	  nop
-
-/*
- * We got here because of a fault during short copyout.
- * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
- */
-.sm_copyout_err:
-	membar	#Sync
-	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
-	mov	SM_SAVE_SRC, %o0
-	mov	SM_SAVE_DST, %o1
-	mov	SM_SAVE_COUNT, %o2
-	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
-	tst	%o3
-	bz,pt	%ncc, 3f			! if not, return error
-	  nop
-	ldn	[%o3 + CP_COPYOUT], %o5		! if handler, invoke it with
-	jmp	%o5				! original arguments
-	  nop
-3:
-	retl
-	  or	%g0, -1, %o0		! return error value
-
-	SET_SIZE(copyout)
-
-/*
- * The _more entry points are not intended to be used directly by
- * any caller from outside this file.  They are provided to allow
- * profiling and dtrace of the portions of the copy code that use
- * the floating point registers.
- * This entry is particularly important as DTRACE (at least as of
- * 4/2004) does not support leaf functions.
- */
-
-	ENTRY(copyout_more)
-.copyout_more:
-	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
-	set	.copyout_err, REAL_LOFAULT
-
-/*
- * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
- */
-.do_copyout:
-        set     copyio_fault, %l7		! copyio_fault is lofault val
-
-	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
-	membar	#Sync				! sync error barrier
-	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
-
-	mov	%i0, SAVE_SRC
-	mov	%i1, SAVE_DST
-	mov	%i2, SAVE_COUNT
-
-	FP_NOMIGRATE(6, 7)
-
-	rd	%fprs, %o2		! check for unused fp
-	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
-
-	! FPU enabled ?  If not, enable it.
-	btst	FPRS_FEF, %o2
-	bz,a,pt	%icc, .do_blockcopyout
-	  wr	%g0, FPRS_FEF, %fprs
-
-	! save the FP registers even if DU is not set.
-
-	BST_FPQ3Q4_TOSTACK(%o2)
-
-.do_blockcopyout:
-	rd	%gsr, %o2
-	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
-	or	%l6, FPUSED_FLAG, %l6
-
-	andcc	DST, VIS_BLOCKSIZE - 1, TMP
-	mov	ASI_USER, %asi
-	bz,pt	%ncc, 2f
-	  neg	TMP
-	add	TMP, VIS_BLOCKSIZE, TMP
-
-	! TMP = bytes required to align DST on FP_BLOCK boundary
-	! Using SRC as a tmp here
-	cmp	TMP, 3
-	bleu,pt	%ncc, 1f
-	  sub	CNT,TMP,CNT		! adjust main count
-	sub	TMP, 3, TMP		! adjust for end of loop test
-.co_blkalign:
-	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
-	stba	SRC, [DST]%asi
-	subcc	TMP, 4, TMP
-	ldub	[REALSRC + 1], SRC
-	add	REALSRC, 4, REALSRC
-	stba	SRC, [DST + 1]%asi
-	ldub	[REALSRC - 2], SRC
-	add	DST, 4, DST
-	stba	SRC, [DST - 2]%asi
-	ldub	[REALSRC - 1], SRC
-	bgu,pt	%ncc, .co_blkalign
-	  stba	SRC, [DST - 1]%asi
-
-	addcc	TMP, 3, TMP		! restore count adjustment
-	bz,pt	%ncc, 2f		! no bytes left?
-	  nop
-1:	ldub	[REALSRC], SRC
-	inc	REALSRC
-	inc	DST
-	deccc	TMP
-	bgu	%ncc, 1b
-	  stba	SRC, [DST - 1]%asi
-
-2:
-	andn	REALSRC, 0x7, SRC
-	alignaddr REALSRC, %g0, %g0
-
-	! SRC - 8-byte aligned
-	! DST - 64-byte aligned
-	prefetch [SRC], #one_read
-	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
-	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
-	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
-	ldd	[SRC], %d32
-#if FIRST_PREFETCH > 4
-	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
-#endif
-	ldd	[SRC + 0x08], %d34
-#if FIRST_PREFETCH > 5
-	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
-#endif
-	ldd	[SRC + 0x10], %d36
-#if FIRST_PREFETCH > 6
-	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
-#endif
-	faligndata %d32, %d34, %d48
-	ldd	[SRC + 0x18], %d38
-#if FIRST_PREFETCH > 7
-	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
-#endif
-	faligndata %d34, %d36, %d50
-	ldd	[SRC + 0x20], %d40
-	faligndata %d36, %d38, %d52
-	ldd	[SRC + 0x28], %d42
-	faligndata %d38, %d40, %d54
-	ldd	[SRC + 0x30], %d44
-	faligndata %d40, %d42, %d56
-	ldd	[SRC + 0x38], %d46
-	faligndata %d42, %d44, %d58
-	ldd	[SRC + VIS_BLOCKSIZE], %d32
-	sub	CNT, VIS_BLOCKSIZE, CNT
-	add	SRC, VIS_BLOCKSIZE, SRC
-	add	REALSRC, VIS_BLOCKSIZE, REALSRC
-	ba,a,pt	%ncc, 1f
-	  nop
-	.align	ICACHE_LINE_SIZE
-1:
-	ldd	[SRC + 0x08], %d34
-	faligndata %d44, %d46, %d60
-	ldd	[SRC + 0x10], %d36
-	faligndata %d46, %d32, %d62
-	stda	%d48, [DST]ASI_BLK_AIUS
-	ldd	[SRC + 0x18], %d38
-	faligndata %d32, %d34, %d48
-	ldd	[SRC + 0x20], %d40
-	faligndata %d34, %d36, %d50
-	ldd	[SRC + 0x28], %d42
-	faligndata %d36, %d38, %d52
-	ldd	[SRC + 0x30], %d44
-	faligndata %d38, %d40, %d54
-	ldd	[SRC + 0x38], %d46
-	faligndata %d40, %d42, %d56
-	sub	CNT, VIS_BLOCKSIZE, CNT
-	ldd	[SRC + VIS_BLOCKSIZE], %d32
-	faligndata %d42, %d44, %d58
-	prefetch [SRC + ((FIRST_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
-	add	DST, VIS_BLOCKSIZE, DST
-	prefetch [SRC + ((SECOND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
-	add	REALSRC, VIS_BLOCKSIZE, REALSRC
-	cmp	CNT, VIS_BLOCKSIZE + 8
-	bgu,pt	%ncc, 1b
-	  add	SRC, VIS_BLOCKSIZE, SRC
-
-	! only if REALSRC & 0x7 is 0
-	cmp	CNT, VIS_BLOCKSIZE
-	bne	%ncc, 3f
-	  andcc	REALSRC, 0x7, %g0
-	bz,pt	%ncc, 2f
-	  nop
-3:
-	faligndata %d44, %d46, %d60
-	faligndata %d46, %d32, %d62
-	stda	%d48, [DST]ASI_BLK_AIUS
-	add	DST, VIS_BLOCKSIZE, DST
-	ba,pt	%ncc, 3f
-	  nop
-2:
-	ldd	[SRC + 0x08], %d34
-	faligndata %d44, %d46, %d60
-	ldd	[SRC + 0x10], %d36
-	faligndata %d46, %d32, %d62
-	stda	%d48, [DST]ASI_BLK_AIUS
-	ldd	[SRC + 0x18], %d38
-	ldd	[SRC + 0x20], %d40
-	ldd	[SRC + 0x28], %d42
-	ldd	[SRC + 0x30], %d44
-	ldd	[SRC + 0x38], %d46
-	sub	CNT, VIS_BLOCKSIZE, CNT
-	add	DST, VIS_BLOCKSIZE, DST
-	add	SRC, VIS_BLOCKSIZE, SRC
-	add	REALSRC, VIS_BLOCKSIZE, REALSRC
-	stda	%d32, [DST]ASI_BLK_AIUS
-	add	DST, VIS_BLOCKSIZE, DST
-	ba,a,pt	%ncc, 4f
-	  nop
-
-3:	tst	CNT
-	bz,a	%ncc, 4f
-	  nop
-
-5:	ldub	[REALSRC], TMP
-	inc	REALSRC
-	inc	DST
-	deccc	CNT
-	bgu	%ncc, 5b
-	  stba	TMP, [DST - 1]%asi
-4:
-
-.copyout_exit:
-	membar	#Sync
-
-	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
-	wr	%o2, 0, %gsr		! restore gsr
-
-	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
-	btst	FPRS_FEF, %o3
-	bz,pt	%icc, 4f
-	  nop
-
-	BLD_FPQ3Q4_FROMSTACK(%o2)
-
-	ba,pt	%ncc, 1f
-	  wr	%o3, 0, %fprs		! restore fprs
-
-4:
-	FZEROQ3Q4
-	wr	%o3, 0, %fprs		! restore fprs
-
-1:
-	membar	#Sync
-	andn	%l6, FPUSED_FLAG, %l6
-	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
-	FP_ALLOWMIGRATE(5, 6)
-	ret
-	  restore	%g0, 0, %o0
-
-/*
- * We got here because of a fault during copyout.
- * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
- */
-.copyout_err:
-	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
-	tst	%o4
-	bz,pt	%ncc, 2f			! if not, return error
-	  nop
-	ldn	[%o4 + CP_COPYOUT], %g2		! if handler, invoke it with
-	jmp	%g2				! original arguments
-	  restore %g0, 0, %g0			! dispose of copy window
-2:
-        ret
-	  restore %g0, -1, %o0			! return error value
-
-
-	SET_SIZE(copyout_more)
-
-#endif	/* lint */
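
The copyout/copyout_more pair above is a software-pipelined VIS stream: the
source is read as aligned 8-byte doublewords one load ahead of the stores,
and faligndata extracts each output doubleword from a pair of neighbouring
loads at the source's misalignment offset.  A minimal C model of that
extraction (big-endian SPARC byte order assumed; like the assembly, it reads
one doubleword past the copied range, so that doubleword must be readable):

    #include <stddef.h>
    #include <stdint.h>

    /* dst is 64-byte aligned; blocks counts 64-byte output blocks. */
    static void
    vis_blockcopy_model(const uint8_t *src, uint64_t *dst, size_t blocks)
    {
            unsigned off = (uintptr_t)src & 7;      /* alignaddr offset */
            const uint64_t *p =
                (const uint64_t *)((uintptr_t)src & ~(uintptr_t)7);
            uint64_t prev = *p++;                   /* primed like %d32 */

            while (blocks-- != 0) {
                    int i;
                    for (i = 0; i < 8; i++) {       /* one 64-byte block */
                            uint64_t cur = *p++;
                            /* faligndata: 8 bytes at 'off' in prev:cur */
                            dst[i] = (off == 0) ? prev :
                                (prev << (8 * off)) | (cur >> (64 - 8 * off));
                            prev = cur;
                    }
                    dst += 8;
            }
    }
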
-
-
-#ifdef	lint
-
-/*ARGSUSED*/
-int
-xcopyout(const void *kaddr, void *uaddr, size_t count)
-{ return (0); }
-
-#else	/* lint */
-
-	ENTRY(xcopyout)
-	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
-	bleu,pt	%ncc, .xcopyout_small		! go to small copy cases
-	  xor	%o0, %o1, %o3			! are src, dst alignable?
-	btst	7, %o3				!
-	bz,pt	%ncc, .xcopyout_8		! check for longword alignment
-	  nop
-	btst	1, %o3				! 
-	bz,pt	%ncc, .xcopyout_2		! check for half-word
-	  nop
-	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
-	tst	%o3
-	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .xcopyout_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
-	  nop
-.xcopyout_2:
-	btst	3, %o3				!
-	bz,pt	%ncc, .xcopyout_4		! check for word alignment
-	  nop
-	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
-	tst	%o3
-	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .xcopyout_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
-	  nop
-.xcopyout_4:
-	! already checked longword, must be word aligned
-	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
-	tst	%o3
-	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .xcopyout_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
-	  nop
-.xcopyout_8:
-	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
-	tst	%o3
-	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .xcopyout_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
-	  nop
-
-.xcopyout_small:
-	sethi	%hi(.sm_xcopyout_err), %o5	! .sm_xcopyout_err is lofault
-	or	%o5, %lo(.sm_xcopyout_err), %o5
-	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
-	membar	#Sync				! sync error barrier
-	ba,pt	%ncc, .sm_do_copyout		! common code
-	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
-
-.xcopyout_more:
-	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
-	sethi	%hi(.xcopyout_err), REAL_LOFAULT
-	ba,pt	%ncc, .do_copyout		! common code
-	  or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
-
-/*
- * We got here because of fault during xcopyout
- * Errno value is in ERRNO
- */
-.xcopyout_err:
-	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
-	tst	%o4
-	bz,pt	%ncc, 2f			! if not, return error
-	  nop
-	ldn	[%o4 + CP_XCOPYOUT], %g2	! if handler, invoke it with
-	jmp	%g2				! original arguments
-	  restore %g0, 0, %g0			! dispose of copy window
-2:
-        ret
-	  restore ERRNO, 0, %o0			! return errno value
-
-.sm_xcopyout_err:
-
-	membar	#Sync
-	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
-	mov	SM_SAVE_SRC, %o0
-	mov	SM_SAVE_DST, %o1
-	mov	SM_SAVE_COUNT, %o2
-	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
-	tst	%o3
-	bz,pt	%ncc, 3f			! if not, return error
-	  nop
-	ldn	[%o3 + CP_XCOPYOUT], %o5	! if handler, invoke it with
-	jmp	%o5				! original arguments
-	  nop
-3:
-	retl
-	  or	%g1, 0, %o0		! return errno value
-
-	SET_SIZE(xcopyout)
-
-#endif	/* lint */
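
Every copy entry point in this file (copyout, xcopyout, copyin, xcopyin, and
the _noerr variants) repeats the dispatch just shown: lengths at or below
VIS_COPY_THRESHOLD take the leaf small-copy path; otherwise the mutual
alignment of source and destination selects one of the hw_copy_limit_{1,2,4,8}
tunables, a zero tunable or an in-limit length still takes the small path, and
only larger requests enter the FP/VIS _more path.  A minimal C sketch of the
decision follows; small_copy() and large_copy() are hypothetical stand-ins for
the *_small/*_more labels, and the threshold value is a placeholder for the
constant defined earlier in the file:

    #include <stddef.h>
    #include <stdint.h>

    #define VIS_COPY_THRESHOLD 256          /* placeholder value */

    extern uint32_t hw_copy_limit_1, hw_copy_limit_2,
        hw_copy_limit_4, hw_copy_limit_8;   /* tunables; 0 disables HW copy */
    extern int small_copy(const void *, void *, size_t);   /* hypothetical */
    extern int large_copy(const void *, void *, size_t);   /* hypothetical */

    static int
    copy_dispatch(const void *src, void *dst, size_t len)
    {
            uintptr_t diff = (uintptr_t)src ^ (uintptr_t)dst;
            uint32_t limit;

            if (len <= VIS_COPY_THRESHOLD)
                    return (small_copy(src, dst, len));     /* leaf case */
            if ((diff & 7) == 0)
                    limit = hw_copy_limit_8;        /* longword alignable */
            else if ((diff & 1) != 0)
                    limit = hw_copy_limit_1;        /* byte only */
            else if ((diff & 3) != 0)
                    limit = hw_copy_limit_2;        /* halfword alignable */
            else
                    limit = hw_copy_limit_4;        /* word alignable */
            if (limit == 0 || len <= limit)         /* 0 disables HW copy */
                    return (small_copy(src, dst, len));
            return (large_copy(src, dst, len));     /* FP/VIS block path */
    }
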
-	
-#ifdef	lint
-
-/*ARGSUSED*/
-int
-xcopyout_little(const void *kaddr, void *uaddr, size_t count)
-{ return (0); }
-
-#else	/* lint */
-
-	ENTRY(xcopyout_little)
-	sethi	%hi(.xcopyio_err), %o5
-	or	%o5, %lo(.xcopyio_err), %o5
-	ldn	[THREAD_REG + T_LOFAULT], %o4
-	membar	#Sync				! sync error barrier
-	stn	%o5, [THREAD_REG + T_LOFAULT]
-	mov	%o4, %o5
-
-	subcc	%g0, %o2, %o3
-	add	%o0, %o2, %o0
-	bz,pn	%ncc, 2f		! check for zero bytes
-	  sub	%o2, 1, %o4
-	add	%o0, %o4, %o0		! start w/last byte
-	add	%o1, %o2, %o1
-	ldub	[%o0 + %o3], %o4
-
-1:	stba	%o4, [%o1 + %o3]ASI_AIUSL
-	inccc	%o3
-	sub	%o0, 2, %o0		! get next byte
-	bcc,a,pt %ncc, 1b
-	  ldub	[%o0 + %o3], %o4
-
-2:
-	membar	#Sync				! sync error barrier
-	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
-	retl
-	  mov	%g0, %o0		! return (0)
-
-	SET_SIZE(xcopyout_little)
-
-#endif	/* lint */
-
-/*
- * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
- */
-
-#if defined(lint)
-
-/*ARGSUSED*/
-int
-copyin(const void *uaddr, void *kaddr, size_t count)
-{ return (0); }
-
-#else	/* lint */
-
-	ENTRY(copyin)
-	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
-	bleu,pt	%ncc, .copyin_small		! go to small copy cases
-	  xor	%o0, %o1, %o3			! are src, dst alignable?
-	btst	7, %o3				!
-	bz,pt	%ncc, .copyin_8			! check for longword alignment
-	  nop
-	btst	1, %o3				! 
-	bz,pt	%ncc, .copyin_2			! check for half-word
-	  nop
-	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
-	tst	%o3
-	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .copyin_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
-	  nop
-.copyin_2:
-	btst	3, %o3				!
-	bz,pt	%ncc, .copyin_4			! check for word alignment
-	  nop
-	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
-	tst	%o3
-	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .copyin_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
-	  nop
-.copyin_4:
-	! already checked longword, must be word aligned
-	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
-	tst	%o3
-	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .copyin_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
-	  nop
-.copyin_8:
-	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
-	tst	%o3
-	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .copyin_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
-	  nop
-
-	.align	16
-	nop				! instruction alignment
-					! see discussion at start of file
-.copyin_small:
-	sethi	%hi(.sm_copyin_err), %o5	! .sm_copyin_err is lofault 
-	or	%o5, %lo(.sm_copyin_err), %o5
-	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault, no tramp
-	membar	#Sync				! sync error barrier
-	stn	%o5, [THREAD_REG + T_LOFAULT]
-.sm_do_copyin:
-	mov	%o0, SM_SAVE_SRC
-	mov	%o1, SM_SAVE_DST
-	cmp	%o2, SHORTCOPY		! check for really short case
-	bleu,pt	%ncc, .ci_sm_left	!
-	  mov	%o2, SM_SAVE_COUNT
-	cmp	%o2, CHKSIZE		! check for medium length cases
-	bgu,pn	%ncc, .ci_med		!
-	  or	%o0, %o1, %o3		! prepare alignment check
-	andcc	%o3, 0x3, %g0		! test for alignment
-	bz,pt	%ncc, .ci_sm_word	! branch to word aligned case
-.ci_sm_movebytes:
-	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
-.ci_sm_notalign4:
-	lduba	[%o0]ASI_USER, %o3	! read byte
-	subcc	%o2, 4, %o2		! reduce count by 4
-	stb	%o3, [%o1]		! write byte
-	add	%o0, 1, %o0		! advance SRC by 1
-	lduba	[%o0]ASI_USER, %o3	! repeat for a total of 4 bytes
-	add	%o0, 1, %o0		! advance SRC by 1
-	stb	%o3, [%o1 + 1]
-	add	%o1, 4, %o1		! advance DST by 4
-	lduba	[%o0]ASI_USER, %o3
-	add	%o0, 1, %o0		! advance SRC by 1
-	stb	%o3, [%o1 - 2]
-	lduba	[%o0]ASI_USER, %o3
-	add	%o0, 1, %o0		! advance SRC by 1
-	bgt,pt	%ncc, .ci_sm_notalign4	! loop til 3 or fewer bytes remain
-	  stb	%o3, [%o1 - 1]
-	add	%o2, 3, %o2		! restore count
-.ci_sm_left:
-	tst	%o2
-	bz,pt	%ncc, .ci_sm_exit
-	  nop
-	lduba	[%o0]ASI_USER, %o3		! load one byte
-	deccc	%o2			! reduce count for cc test
-	bz,pt	%ncc, .ci_sm_exit
-	  stb	%o3,[%o1]		! store one byte
-	inc	%o0
-	lduba	[%o0]ASI_USER, %o3	! load second byte
-	deccc	%o2
-	bz,pt	%ncc, .ci_sm_exit
-	  stb	%o3,[%o1 + 1]		! store second byte
-	inc	%o0
-	lduba	[%o0]ASI_USER, %o3	! load third byte
-	stb	%o3,[%o1 + 2]		! store third byte
-	membar	#Sync				! sync error barrier
-	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
-	retl
-	  mov	%g0, %o0		! return 0
-	.align	16
-.ci_sm_words:
-	lduwa	[%o0]ASI_USER, %o3		! read word
-.ci_sm_wordx:
-	subcc	%o2, 8, %o2		! update count
-	stw	%o3, [%o1]		! write word
-	add	%o0, 4, %o0		! update SRC
-	add	%o1, 8, %o1		! update DST
-	lduwa	[%o0]ASI_USER, %o3	! read word
-	add	%o0, 4, %o0		! update SRC
-	bgt,pt	%ncc, .ci_sm_words	! loop til done
-	  stw	%o3, [%o1 - 4]		! write word
-	addcc	%o2, 7, %o2		! restore count
-	bz,pt	%ncc, .ci_sm_exit
-	  nop
-	deccc	%o2
-	bz,pt	%ncc, .ci_sm_byte
-.ci_sm_half:
-	  subcc	%o2, 2, %o2		! reduce count by 2
-	lduha	[%o0]ASI_USER, %o3	! read half word
-	add	%o0, 2, %o0		! advance SRC by 2
-	add	%o1, 2, %o1		! advance DST by 2
-	bgt,pt	%ncc, .ci_sm_half	! loop til done
-	  sth	%o3, [%o1 - 2]		! write half word
-	addcc	%o2, 1, %o2		! restore count
-	bz,pt	%ncc, .ci_sm_exit
-	  nop
-.ci_sm_byte:
-	lduba	[%o0]ASI_USER, %o3
-	stb	%o3, [%o1]
-	membar	#Sync				! sync error barrier
-	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
-	retl
-	  mov	%g0, %o0		! return 0
-	.align	16
-.ci_sm_word:
-	subcc	%o2, 4, %o2		! update count
-	bgt,pt	%ncc, .ci_sm_wordx
-	  lduwa	[%o0]ASI_USER, %o3		! read word
-	addcc	%o2, 3, %o2		! restore count
-	bz,pt	%ncc, .ci_sm_exit
-	  stw	%o3, [%o1]		! write word
-	deccc	%o2			! reduce count for cc test
-	add	%o0, 4, %o0
-	lduba	[%o0]ASI_USER, %o3	! load one byte
-	bz,pt	%ncc, .ci_sm_exit
-	  stb	%o3, [%o1 + 4]		! store one byte
-	inc	%o0
-	lduba	[%o0]ASI_USER, %o3	! load second byte
-	deccc	%o2
-	bz,pt	%ncc, .ci_sm_exit
-	  stb	%o3, [%o1 + 5]		! store second byte
-	inc	%o0
-	lduba	[%o0]ASI_USER, %o3	! load third byte
-	stb	%o3, [%o1 + 6]		! store third byte
-.ci_sm_exit:
-	membar	#Sync				! sync error barrier
-	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
-	retl
-	  mov	%g0, %o0		! return 0
-
-	.align 16
-.ci_med:
-	xor	%o0, %o1, %o3		! setup alignment check
-	btst	1, %o3
-	bnz,pt	%ncc, .ci_sm_movebytes	! unaligned
-	  nop
-	btst	3, %o3
-	bnz,pt	%ncc, .ci_med_half	! halfword aligned
-	  nop
-	btst	7, %o3
-	bnz,pt	%ncc, .ci_med_word	! word aligned
-	  nop
-.ci_med_long:
-	btst	3, %o0			! check for
-	bz,pt	%ncc, .ci_med_long1	! word alignment
-	  nop
-.ci_med_long0:
-	lduba	[%o0]ASI_USER, %o3		! load one byte
-	inc	%o0
-	stb	%o3,[%o1]		! store byte
-	inc	%o1
-	btst	3, %o0
-	bnz,pt	%ncc, .ci_med_long0
-	  dec	%o2
-.ci_med_long1:			! word aligned
-	btst	7, %o0			! check for long word
-	bz,pt	%ncc, .ci_med_long2
-	  nop
-	lduwa	[%o0]ASI_USER, %o3	! load word
-	add	%o0, 4, %o0		! advance SRC by 4
-	stw	%o3, [%o1]		! store word
-	add	%o1, 4, %o1		! advance DST by 4
-	sub	%o2, 4, %o2		! reduce count by 4
-!
-!  Now long word aligned and have at least 32 bytes to move
-!
-.ci_med_long2:
-	sub	%o2, 31, %o2		! adjust count to allow cc zero test
-.ci_med_lmove:
-	ldxa	[%o0]ASI_USER, %o3	! read long word
-	subcc	%o2, 32, %o2		! reduce count by 32
-	stx	%o3, [%o1]		! write long word
-	add	%o0, 8, %o0		! advance SRC by 8
-	ldxa	[%o0]ASI_USER, %o3	! repeat for a total of 4 long words
-	add	%o0, 8, %o0		! advance SRC by 8
-	stx	%o3, [%o1 + 8]
-	add	%o1, 32, %o1		! advance DST by 32
-	ldxa	[%o0]ASI_USER, %o3
-	add	%o0, 8, %o0		! advance SRC by 8
-	stx	%o3, [%o1 - 16]
-	ldxa	[%o0]ASI_USER, %o3
-	add	%o0, 8, %o0		! advance SRC by 8
-	bgt,pt	%ncc, .ci_med_lmove	! loop til 31 or fewer bytes left
-	  stx	%o3, [%o1 - 8]
-	addcc	%o2, 24, %o2		! restore count to long word offset
-	ble,pt	%ncc, .ci_med_lextra	! check for more long words to move
-	  nop
-.ci_med_lword:
-	ldxa	[%o0]ASI_USER, %o3	! read long word
-	subcc	%o2, 8, %o2		! reduce count by 8
-	stx	%o3, [%o1]		! write long word
-	add	%o0, 8, %o0		! advance SRC by 8
-	bgt,pt	%ncc, .ci_med_lword	! loop til 7 or fewer bytes left
-	  add	%o1, 8, %o1		! advance DST by 8
-.ci_med_lextra:
-	addcc	%o2, 7, %o2		! restore rest of count
-	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
-	  deccc	%o2
-	bz,pt	%ncc, .ci_sm_byte
-	  nop
-	ba,pt	%ncc, .ci_sm_half
-	  nop
-
-	.align 16
-	nop				! instruction alignment
-					! see discussion at start of file
-.ci_med_word:
-	btst	3, %o0			! check for
-	bz,pt	%ncc, .ci_med_word1	! word alignment
-	  nop
-.ci_med_word0:
-	lduba	[%o0]ASI_USER, %o3	! load one byte
-	inc	%o0
-	stb	%o3,[%o1]		! store byte
-	inc	%o1
-	btst	3, %o0
-	bnz,pt	%ncc, .ci_med_word0
-	  dec	%o2
-!
-!  Now word aligned and have at least 36 bytes to move
-!
-.ci_med_word1:
-	sub	%o2, 15, %o2		! adjust count to allow cc zero test
-.ci_med_wmove:
-	lduwa	[%o0]ASI_USER, %o3	! read word
-	subcc	%o2, 16, %o2		! reduce count by 16
-	stw	%o3, [%o1]		! write word
-	add	%o0, 4, %o0		! advance SRC by 4
-	lduwa	[%o0]ASI_USER, %o3	! repeat for a total of 4 words
-	add	%o0, 4, %o0		! advance SRC by 4
-	stw	%o3, [%o1 + 4]
-	add	%o1, 16, %o1		! advance DST by 16
-	lduwa	[%o0]ASI_USER, %o3
-	add	%o0, 4, %o0		! advance SRC by 4
-	stw	%o3, [%o1 - 8]
-	lduwa	[%o0]ASI_USER, %o3
-	add	%o0, 4, %o0		! advance SRC by 4
-	bgt,pt	%ncc, .ci_med_wmove	! loop til 15 or fewer bytes left
-	  stw	%o3, [%o1 - 4]
-	addcc	%o2, 12, %o2		! restore count to word offset
-	ble,pt	%ncc, .ci_med_wextra	! check for more words to move
-	  nop
-.ci_med_word2:
-	lduwa	[%o0]ASI_USER, %o3	! read word
-	subcc	%o2, 4, %o2		! reduce count by 4
-	stw	%o3, [%o1]		! write word
-	add	%o0, 4, %o0		! advance SRC by 4
-	bgt,pt	%ncc, .ci_med_word2	! loop til 3 or fewer bytes left
-	  add	%o1, 4, %o1		! advance DST by 4
-.ci_med_wextra:
-	addcc	%o2, 3, %o2		! restore rest of count
-	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
-	  deccc	%o2
-	bz,pt	%ncc, .ci_sm_byte
-	  nop
-	ba,pt	%ncc, .ci_sm_half
-	  nop
-
-	.align 16
-	nop				! instruction alignment
-					! see discussion at start of file
-.ci_med_half:
-	btst	1, %o0			! check for
-	bz,pt	%ncc, .ci_med_half1	! half word alignment
-	  nop
-	lduba	[%o0]ASI_USER, %o3	! load one byte
-	inc	%o0
-	stb	%o3,[%o1]		! store byte
-	inc	%o1
-	dec	%o2
-!
-!  Now half word aligned and have at least 38 bytes to move
-!
-.ci_med_half1:
-	sub	%o2, 7, %o2		! adjust count to allow cc zero test
-.ci_med_hmove:
-	lduha	[%o0]ASI_USER, %o3	! read half word
-	subcc	%o2, 8, %o2		! reduce count by 8
-	sth	%o3, [%o1]		! write half word
-	add	%o0, 2, %o0		! advance SRC by 2
-	lduha	[%o0]ASI_USER, %o3	! repeat for a total of 4 halfwords
-	add	%o0, 2, %o0		! advance SRC by 2
-	sth	%o3, [%o1 + 2]
-	add	%o1, 8, %o1		! advance DST by 8
-	lduha	[%o0]ASI_USER, %o3
-	add	%o0, 2, %o0		! advance SRC by 2
-	sth	%o3, [%o1 - 4]
-	lduha	[%o0]ASI_USER, %o3
-	add	%o0, 2, %o0		! advance SRC by 2
-	bgt,pt	%ncc, .ci_med_hmove	! loop til 7 or fewer bytes left
-	  sth	%o3, [%o1 - 2]
-	addcc	%o2, 7, %o2		! restore count
-	bz,pt	%ncc, .ci_sm_exit
-	  deccc	%o2
-	bz,pt	%ncc, .ci_sm_byte
-	  nop
-	ba,pt	%ncc, .ci_sm_half
-	  nop
-
-.sm_copyin_err:
-	membar	#Sync
-	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
-	mov	SM_SAVE_SRC, %o0
-	mov	SM_SAVE_DST, %o1
-	mov	SM_SAVE_COUNT, %o2
-	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
-	tst	%o3
-	bz,pt	%ncc, 3f			! if not, return error
-	  nop
-	ldn	[%o3 + CP_COPYIN], %o5		! if handler, invoke it with
-	jmp	%o5				! original arguments
-	  nop
-3:
-	retl
-	  or	%g0, -1, %o0		! return error value
-
-	SET_SIZE(copyin)
-
-
-/*
- * The _more entry points are not intended to be used directly by
- * any caller from outside this file.  They are provided to allow
- * profiling and dtrace of the portions of the copy code that use
- * the floating point registers.
- * This entry is particularly important as DTRACE (at least as of
- * 4/2004) does not support leaf functions.
- */
-
-	ENTRY(copyin_more)
-.copyin_more:
-	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
-	set	.copyin_err, REAL_LOFAULT
-
-/*
- * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
- */
-.do_copyin:
-	set	copyio_fault, %l7		! copyio_fault is lofault val
-
-	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
-	membar	#Sync				! sync error barrier
-	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
-
-	mov	%i0, SAVE_SRC
-	mov	%i1, SAVE_DST
-	mov	%i2, SAVE_COUNT
-
-	FP_NOMIGRATE(6, 7)
-
-	rd	%fprs, %o2		! check for unused fp
-	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
-
-	! FPU enabled ?  If not, enable it.
-	btst	FPRS_FEF, %o2
-	bz,a,pt	%icc, .do_blockcopyin
-	  wr	%g0, FPRS_FEF, %fprs
-
-	! save the FP registers even if DU is not set.
-
-	BST_FPQ3Q4_TOSTACK(%o2)
-
-.do_blockcopyin:
-	rd	%gsr, %o2
-	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
-	or	%l6, FPUSED_FLAG, %l6
-
-	andcc	DST, VIS_BLOCKSIZE - 1, TMP
-	mov	ASI_USER, %asi
-	bz,pt	%ncc, 2f
-	  neg	TMP
-	add	TMP, VIS_BLOCKSIZE, TMP
-
-	! TMP = bytes required to align DST on FP_BLOCK boundary
-	! Using SRC as a tmp here
-	cmp	TMP, 3
-	bleu,pt	%ncc, 1f
-	  sub	CNT,TMP,CNT		! adjust main count
-	sub	TMP, 3, TMP		! adjust for end of loop test
-.ci_blkalign:
-	lduba	[REALSRC]%asi, SRC	! move 4 bytes per loop iteration
-	stb	SRC, [DST]
-	subcc	TMP, 4, TMP
-	lduba	[REALSRC + 1]%asi, SRC
-	add	REALSRC, 4, REALSRC
-	stb	SRC, [DST + 1]
-	lduba	[REALSRC - 2]%asi, SRC
-	add	DST, 4, DST
-	stb	SRC, [DST - 2]
-	lduba	[REALSRC - 1]%asi, SRC
-	bgu,pt	%ncc, .ci_blkalign
-	  stb	SRC, [DST - 1]
-
-	addcc	TMP, 3, TMP		! restore count adjustment
-	bz,pt	%ncc, 2f		! no bytes left?
-	  nop
-1:	lduba	[REALSRC]%asi, SRC
-	inc	REALSRC
-	inc	DST
-	deccc	TMP
-	bgu	%ncc, 1b
-	  stb	SRC, [DST - 1]
-
-2:
-	andn	REALSRC, 0x7, SRC
-	alignaddr REALSRC, %g0, %g0
-
-	! SRC - 8-byte aligned
-	! DST - 64-byte aligned
-	prefetcha [SRC]%asi, #one_read
-	prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #one_read
-	prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #one_read
-	prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #one_read
-	ldda	[SRC]%asi, %d32
-#if FIRST_PREFETCH > 4
-	prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
-#endif
-	ldda	[SRC + 0x08]%asi, %d34
-#if FIRST_PREFETCH > 5
-	prefetcha [SRC + (5 * VIS_BLOCKSIZE)]%asi, #one_read
-#endif
-	ldda	[SRC + 0x10]%asi, %d36
-#if FIRST_PREFETCH > 6
-	prefetcha [SRC + (6 * VIS_BLOCKSIZE)]%asi, #one_read
-#endif
-	faligndata %d32, %d34, %d48
-	ldda	[SRC + 0x18]%asi, %d38
-#if FIRST_PREFETCH > 7
-	prefetcha [SRC + (7 * VIS_BLOCKSIZE)]%asi, #one_read
-#endif
-	faligndata %d34, %d36, %d50
-	ldda	[SRC + 0x20]%asi, %d40
-	faligndata %d36, %d38, %d52
-	ldda	[SRC + 0x28]%asi, %d42
-	faligndata %d38, %d40, %d54
-	ldda	[SRC + 0x30]%asi, %d44
-	faligndata %d40, %d42, %d56
-	ldda	[SRC + 0x38]%asi, %d46
-	faligndata %d42, %d44, %d58
-	ldda	[SRC + VIS_BLOCKSIZE]%asi, %d32
-	sub	CNT, VIS_BLOCKSIZE, CNT
-	add	SRC, VIS_BLOCKSIZE, SRC
-	add	REALSRC, VIS_BLOCKSIZE, REALSRC
-	ba,a,pt	%ncc, 1f
-	  nop
-	.align	ICACHE_LINE_SIZE
-1:
-	ldda	[SRC + 0x08]%asi, %d34
-	faligndata %d44, %d46, %d60
-	ldda	[SRC + 0x10]%asi, %d36
-	faligndata %d46, %d32, %d62
-	stda	%d48, [DST]ASI_BLK_P
-	ldda	[SRC + 0x18]%asi, %d38
-	faligndata %d32, %d34, %d48
-	ldda	[SRC + 0x20]%asi, %d40
-	faligndata %d34, %d36, %d50
-	ldda	[SRC + 0x28]%asi, %d42
-	faligndata %d36, %d38, %d52
-	ldda	[SRC + 0x30]%asi, %d44
-	faligndata %d38, %d40, %d54
-	ldda	[SRC + 0x38]%asi, %d46
-	faligndata %d40, %d42, %d56
-	sub	CNT, VIS_BLOCKSIZE, CNT
-	ldda	[SRC + VIS_BLOCKSIZE]%asi, %d32
-	faligndata %d42, %d44, %d58
-	prefetcha [SRC + ((FIRST_PREFETCH) * VIS_BLOCKSIZE) + 8]%asi, #one_read
-	add	DST, VIS_BLOCKSIZE, DST
-	prefetcha [SRC + ((SECOND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
-	add	REALSRC, VIS_BLOCKSIZE, REALSRC
-	cmp	CNT, VIS_BLOCKSIZE + 8
-	bgu,pt	%ncc, 1b
-	  add	SRC, VIS_BLOCKSIZE, SRC
-
-	! only if REALSRC & 0x7 is 0
-	cmp	CNT, VIS_BLOCKSIZE
-	bne	%ncc, 3f
-	  andcc	REALSRC, 0x7, %g0
-	bz,pt	%ncc, 2f
-	  nop
-3:	
-	faligndata %d44, %d46, %d60
-	faligndata %d46, %d32, %d62
-	stda	%d48, [DST]ASI_BLK_P
-	add	DST, VIS_BLOCKSIZE, DST
-	ba,pt	%ncc, 3f
-	  nop
-2:
-	ldda	[SRC + 0x08]%asi, %d34
-	faligndata %d44, %d46, %d60
-	ldda	[SRC + 0x10]%asi, %d36
-	faligndata %d46, %d32, %d62
-	stda	%d48, [DST]ASI_BLK_P
-	ldda	[SRC + 0x18]%asi, %d38
-	ldda	[SRC + 0x20]%asi, %d40
-	ldda	[SRC + 0x28]%asi, %d42
-	ldda	[SRC + 0x30]%asi, %d44
-	ldda	[SRC + 0x38]%asi, %d46
-	sub	CNT, VIS_BLOCKSIZE, CNT
-	add	DST, VIS_BLOCKSIZE, DST
-	add	SRC, VIS_BLOCKSIZE, SRC
-	add	REALSRC, VIS_BLOCKSIZE, REALSRC
-	stda	%d32, [DST]ASI_BLK_P
-	add	DST, VIS_BLOCKSIZE, DST
-	ba,a,pt	%ncc, 4f
-	  nop
-
-3:	tst	CNT
-	bz,a	%ncc, 4f
-	  nop
-
-5:	lduba	[REALSRC]ASI_USER, TMP
-	inc	REALSRC
-	inc	DST
-	deccc	CNT
-	bgu	%ncc, 5b
-	  stb	TMP, [DST - 1]
-4:
-
-.copyin_exit:
-	membar	#Sync
-
-	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
-	wr	%o2, 0, %gsr
-
-	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
-	btst	FPRS_FEF, %o3
-	bz,pt	%icc, 4f
-	  nop
-
-	BLD_FPQ3Q4_FROMSTACK(%o2)
-
-	ba,pt	%ncc, 1f
-	  wr	%o3, 0, %fprs		! restore fprs
-
-4:
-	FZEROQ3Q4
-	wr	%o3, 0, %fprs		! restore fprs
-
-1:
-	membar	#Sync				! sync error barrier
-	andn	%l6, FPUSED_FLAG, %l6
-	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
-	FP_ALLOWMIGRATE(5, 6)
-	ret
-	  restore	%g0, 0, %o0
-/*
- * We got here because of a fault during copyin
- * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
- */
-.copyin_err:
-	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
-	tst	%o4
-	bz,pt	%ncc, 2f			! if not, return error
-	nop
-	ldn	[%o4 + CP_COPYIN], %g2		! if handler, invoke it with
-	jmp	%g2				! original arguments
-	restore %g0, 0, %g0			! dispose of copy window
-2:
-	ret
-	restore %g0, -1, %o0			! return error value
-
-
-	SET_SIZE(copyin_more)
-
-#endif	/* lint */
-
-#ifdef	lint
-
-/*ARGSUSED*/
-int
-xcopyin(const void *uaddr, void *kaddr, size_t count)
-{ return (0); }
-
-#else	/* lint */
-
-	ENTRY(xcopyin)
-
-	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
-	bleu,pt	%ncc, .xcopyin_small		! go to small copy cases
-	  xor	%o0, %o1, %o3			! are src, dst alignable?
-	btst	7, %o3				!
-	bz,pt	%ncc, .xcopyin_8		! check for longword alignment
-	  nop
-	btst	1, %o3				! 
-	bz,pt	%ncc, .xcopyin_2		! check for half-word
-	  nop
-	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
-	tst	%o3
-	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .xcopyin_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
-	  nop
-.xcopyin_2:
-	btst	3, %o3				!
-	bz,pt	%ncc, .xcopyin_4		! check for word alignment
-	  nop
-	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
-	tst	%o3
-	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .xcopyin_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
-	  nop
-.xcopyin_4:
-	! already checked longword, must be word aligned
-	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
-	tst	%o3
-	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .xcopyin_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
-	  nop
-.xcopyin_8:
-	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
-	tst	%o3
-	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .xcopyin_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
-	  nop
-
-.xcopyin_small:
-	sethi	%hi(.sm_xcopyin_err), %o5  ! .sm_xcopyin_err is lofault value
-	or	%o5, %lo(.sm_xcopyin_err), %o5
-	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault
-	membar	#Sync				! sync error barrier
-	ba,pt	%ncc, .sm_do_copyin		! common code
-	  stn	%o5, [THREAD_REG + T_LOFAULT]
-	
-.xcopyin_more:
-	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
-	sethi	%hi(.xcopyin_err), REAL_LOFAULT	! .xcopyin_err is lofault value
-	ba,pt	%ncc, .do_copyin
-	  or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
-
-/*
- * We got here because of fault during xcopyin
- * Errno value is in ERRNO
- */
-.xcopyin_err:
-	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
-	tst	%o4
-	bz,pt	%ncc, 2f			! if not, return error
-	  nop
-	ldn	[%o4 + CP_XCOPYIN], %g2		! if handler, invoke it with
-	jmp	%g2				! original arguments
-	  restore %g0, 0, %g0			! dispose of copy window
-2:
-        ret
-	  restore ERRNO, 0, %o0			! return errno value
-
-.sm_xcopyin_err:
-
-	membar	#Sync
-	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
-	mov	SM_SAVE_SRC, %o0
-	mov	SM_SAVE_DST, %o1
-	mov	SM_SAVE_COUNT, %o2
-	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
-	tst	%o3
-	bz,pt	%ncc, 3f			! if not, return error
-	  nop
-	ldn	[%o3 + CP_XCOPYIN], %o5		! if handler, invoke it with
-	jmp	%o5				! original arguments
-	  nop
-3:
-	retl
-	  or	%g1, 0, %o0		! return errno value
-
-	SET_SIZE(xcopyin)
-
-#endif	/* lint */
-
-#ifdef	lint
-
-/*ARGSUSED*/
-int
-xcopyin_little(const void *uaddr, void *kaddr, size_t count)
-{ return (0); }
-
-#else	/* lint */
-
-	ENTRY(xcopyin_little)
-	sethi	%hi(.xcopyio_err), %o5
-	or	%o5, %lo(.xcopyio_err), %o5
-	ldn	[THREAD_REG + T_LOFAULT], %o4
-	membar	#Sync				! sync error barrier
-	stn	%o5, [THREAD_REG + T_LOFAULT]	
-	mov	%o4, %o5
-
-	subcc	%g0, %o2, %o3
-	add	%o0, %o2, %o0
-	bz,pn	%ncc, 2f		! check for zero bytes
-	  sub	%o2, 1, %o4
-	add	%o0, %o4, %o0		! start w/last byte	
-	add	%o1, %o2, %o1
-	lduba	[%o0 + %o3]ASI_AIUSL, %o4
-
-1:	stb	%o4, [%o1 + %o3]
-	inccc	%o3
-	sub	%o0, 2, %o0		! get next byte
-	bcc,a,pt %ncc, 1b
-	  lduba	[%o0 + %o3]ASI_AIUSL, %o4
-
-2:
-	membar	#Sync				! sync error barrier
-	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
-	retl
-	  mov	%g0, %o0		! return (0)
-
-.xcopyio_err:
-	membar	#Sync				! sync error barrier
-	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
-	retl
-	  mov	%g1, %o0
-
-	SET_SIZE(xcopyin_little)
-
-#endif	/* lint */
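
xcopyin_little above (like xcopyout_little earlier) walks the user buffer from
its last byte down while filling the kernel buffer from its first byte up, so
the result is a byte-reversed copy performed through the little-endian user
ASI; a fault lands in .xcopyio_err, which returns the errno left in %g1.  A
minimal C sketch of the loop, with user_peek_byte() as a hypothetical stand-in
for the lduba [...]ASI_AIUSL access (returning the byte, or a negative errno
if the access faults):

    #include <stddef.h>

    extern int user_peek_byte(const char *uaddr);   /* hypothetical accessor */

    static int
    xcopyin_little_model(const char *uaddr, char *kaddr, size_t count)
    {
            size_t i;

            for (i = 0; i < count; i++) {
                    int b = user_peek_byte(uaddr + count - 1 - i);

                    if (b < 0)
                            return (-b);    /* errno, as %g1 is returned */
                    kaddr[i] = (char)b;     /* dst filled front to back */
            }
            return (0);
    }
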
-
-
-/*
- * Copy a block of storage - must not overlap (from + len <= to).
- * No fault handler installed (to be called under on_fault())
- */
-#if defined(lint)
-
-/* ARGSUSED */
-void
-copyin_noerr(const void *ufrom, void *kto, size_t count)
-{}
-
-#else	/* lint */
-	ENTRY(copyin_noerr)
-
-	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
-	bleu,pt	%ncc, .copyin_ne_small		! go to small copy cases
-	  xor	%o0, %o1, %o3			! are src, dst alignable?
-	btst	7, %o3				!
-	bz,pt	%ncc, .copyin_ne_8		! check for longword alignment
-	  nop
-	btst	1, %o3				! 
-	bz,pt	%ncc, .copyin_ne_2		! check for half-word
-	  nop
-	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
-	tst	%o3
-	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
-	  nop
-.copyin_ne_2:
-	btst	3, %o3				!
-	bz,pt	%ncc, .copyin_ne_4		! check for word alignment
-	  nop
-	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
-	tst	%o3
-	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
-	  nop
-.copyin_ne_4:
-	! already checked longword, must be word aligned
-	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
-	tst	%o3
-	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
-	  nop
-.copyin_ne_8:
-	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
-	tst	%o3
-	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
-	  nop
-
-.copyin_ne_small:
-	ldn	[THREAD_REG + T_LOFAULT], %o4
-	tst	%o4
-	bz,pn	%ncc, .sm_do_copyin
-	  nop
-	sethi	%hi(.sm_copyio_noerr), %o5
-	or	%o5, %lo(.sm_copyio_noerr), %o5
-	membar	#Sync				! sync error barrier
-	ba,pt	%ncc, .sm_do_copyin
-	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault
-
-.copyin_noerr_more:
-	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
-	sethi	%hi(.copyio_noerr), REAL_LOFAULT
-	ba,pt	%ncc, .do_copyin
-	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
-
-.copyio_noerr:
-	jmp	%l6
-	  restore %g0,0,%g0
-
-.sm_copyio_noerr:
-	membar	#Sync
-	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore t_lofault
-	jmp	%o4
-	  nop
-
-	SET_SIZE(copyin_noerr)
-#endif /* lint */
-
-/*
- * Copy a block of storage - must not overlap (from + len <= to).
- * No fault handler installed (to be called under on_fault())
- */
-
-#if defined(lint)
-
-/* ARGSUSED */
-void
-copyout_noerr(const void *kfrom, void *uto, size_t count)
-{}
-
-#else	/* lint */
-	ENTRY(copyout_noerr)
-
-	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
-	bleu,pt	%ncc, .copyout_ne_small		! go to small copy cases
-	  xor	%o0, %o1, %o3			! are src, dst alignable?
-	btst	7, %o3				!
-	bz,pt	%ncc, .copyout_ne_8		! check for longword alignment
-	  nop
-	btst	1, %o3				! 
-	bz,pt	%ncc, .copyout_ne_2		! check for half-word
-	  nop
-	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
-	tst	%o3
-	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
-	  nop
-.copyout_ne_2:
-	btst	3, %o3				!
-	bz,pt	%ncc, .copyout_ne_4		! check for word alignment
-	  nop
-	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
-	tst	%o3
-	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
-	  nop
-.copyout_ne_4:
-	! already checked longword, must be word aligned
-	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
-	tst	%o3
-	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
-	  nop
-.copyout_ne_8:
-	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
-	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
-	tst	%o3
-	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
-	  cmp	%o2, %o3			! if length <= limit
-	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
-	  nop
-	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
-	  nop
-
-.copyout_ne_small:
-	ldn	[THREAD_REG + T_LOFAULT], %o4
-	tst	%o4
-	bz,pn	%ncc, .sm_do_copyout
-	  nop
-	sethi	%hi(.sm_copyio_noerr), %o5
-	or	%o5, %lo(.sm_copyio_noerr), %o5
-	membar	#Sync				! sync error barrier
-	ba,pt	%ncc, .sm_do_copyout
-	stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault
-
-.copyout_noerr_more:
-	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
-	sethi	%hi(.copyio_noerr), REAL_LOFAULT
-	ba,pt	%ncc, .do_copyout
-	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
-
-	SET_SIZE(copyout_noerr)
-#endif /* lint */
-
-
-/*
- * hwblkclr - clears block-aligned, block-multiple-sized regions that are
- * at least 256 bytes long, using block stores.  If the criteria for
- * using this routine are not met it punts to bzero.  This Rock variant
- * synchronizes the I$ itself and returns 0 on both paths.
- * Caller is responsible for ensuring use_hw_bzero is true and that
- * kpreempt_disable() has been called.
- */
-#ifdef lint
-/*ARGSUSED*/
-int
-hwblkclr(void *addr, size_t len)
-{ 
-	return(0);
-}
-#else /* lint */
-	! %i0 - start address
-	! %i1 - length of region (multiple of 64)
-	! %l0 - saved fprs
-	! %l1 - pointer to saved %d32 block
-	! %l2 - saved curthread->t_lwp
-
-
-	ENTRY(hwblkclr)
-	! get another window w/space for one aligned block of saved fpregs
-	save	%sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp
-
-#ifdef ROCK_CR_6654578
-	! Address aligned to 128 bytes?
-	andcc	%i0, ST_CACHE_ALIGN, %g0
-	bnz,pn  %ncc, .normal_hwblkclr
-	 nop
-	! multiple of 8k len, call page_hwblkclr
-	set	PAGE_MASK, %i3
-	andcc	%i1, %i3, %g0
-	bnz,pn	%ncc, .normal_hwblkclr
-	 nop
-	mov     %i0, %o0
-	call page_hwblkclr
-	 mov     %i1, %o1
-	ret
-	restore %g0, 0, %o0     ! I$ sync not required
-
-.normal_hwblkclr:
-#endif
-	! Must be block-aligned
-	andcc	%i0, (VIS_BLOCKSIZE-1), %g0
-	bnz,pn	%ncc, 1f
-	  nop
-
-	! ... and must be 256 bytes or more
-	cmp	%i1, 256
-	blu,pn	%ncc, 1f
-	  nop
-
-	! ... and length must be a multiple of VIS_BLOCKSIZE
-	andcc	%i1, (VIS_BLOCKSIZE-1), %g0
-	bz,pn	%ncc, 2f
-	  nop
-
-1:	! punt, call bzero but notify the caller that bzero was used
-	mov	%i0, %o0
-	call	bzero
-	mov	%i1, %o1
-	! call rock_sync_icache
-	mov     %i0, %o0
-	call	rock_sync_icache
-	mov     %i0, %o0
-	ret
-	  restore	%g0, 0, %o0 !  did not use block operations
-
-2:	mov	%g0, %l3		! clear flag to say fp regs not saved
-	rd	%fprs, %l0		! check for unused fp
-
-	! FPU enabled ?  If not, enable it.
-	btst	FPRS_FEF, %l0
-	bz,a,pt	%icc, 1f
-	  wr	%g0, FPRS_FEF, %fprs
-
-	! save the FP registers even if DU is not set.
-
-	membar	#Sync
-	add	%fp, STACK_BIAS - 65, %l1
-	and	%l1, -VIS_BLOCKSIZE, %l1
-	stda	%d32, [%l1]ASI_BLK_P
-        ! Set a flag saying fp regs are saved.
-	mov	1, %l3
-
-        ! Only need to wait here for the above save to complete
-	membar	#StoreStore|#StoreLoad|#LoadStore
-
-1:	wr	%g0, ASI_BLK_P, %asi
-
-	! Clear block
-	movxtod	%g0, %d32
-	movxtod	%g0, %d34
-	fsrc1	%d32, %d36
-	fsrc1	%d32, %d38
-	fsrc1	%d32, %d40
-	fsrc1	%d32, %d42
-	fsrc1	%d32, %d44
-	fsrc1	%d32, %d46
-
-	mov	256, %i3
-	ba,pt	%ncc, .pz_doblock
-	  nop
-
-.pz_blkstart:	
-      ! stda	%d32, [%i0 + 192]%asi  ! in dly slot of branch that got us here
-#ifdef ROCK_CR_6654578
-	prefetcha [%i0 + VIS_COPY_THRESHOLD + 128]%asi, #n_writes
-#endif
-	stda	%d32, [%i0 + 128]%asi
-#ifdef ROCK_CR_6654578
-	prefetcha [%i0 + VIS_COPY_THRESHOLD + 64]%asi, #n_writes
-#endif
-	stda	%d32, [%i0 + 64]%asi
-#ifdef ROCK_CR_6654578
-	prefetcha [%i0 + VIS_COPY_THRESHOLD + 0]%asi, #n_writes
-#endif
-	stda	%d32, [%i0]%asi
-.pz_zinst:
-	add	%i0, %i3, %i0
-	sub	%i1, %i3, %i1
-.pz_doblock:
-#ifdef ROCK_CR_6654578
-	prefetcha [%i0 + VIS_COPY_THRESHOLD + 192]%asi, #n_writes
-#endif
-	cmp	%i1, 256
-	bgeu,a	%ncc, .pz_blkstart
-	  stda	%d32, [%i0 + 192]%asi
-
-	cmp	%i1, 64
-	blu	%ncc, .pz_finish
-	
-	  andn	%i1, (64-1), %i3
-	srl	%i3, 4, %i2		! using blocks, 1 instr / 16 words
-	set	.pz_zinst, %i4
-	sub	%i4, %i2, %i4
-	jmp	%i4
-	  nop
-
-.pz_finish:
-	brz,a	%l3, .pz_finished
-	  wr	%l0, 0, %fprs		! restore fprs
-
-	! restore fpregs from stack
-	ldda	[%l1]ASI_BLK_P, %d32
-	wr	%l0, 0, %fprs		! restore fprs
-
-.pz_finished:
-	membar	#Sync
-	ret
-	  restore	%g0, 0, %o0		! return (bzero or not)
-
-	SET_SIZE(hwblkclr)
-#endif	/* lint */
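
hwblkclr commits to the block-store path only after three checks (block-aligned
start, at least 256 bytes, block-multiple length); any failure punts to bzero,
after which this Rock variant synchronizes the I-cache itself and still returns
0 (when ROCK_CR_6654578 is built in, 128-byte-aligned regions whose length is a
multiple of the page size are first diverted to page_hwblkclr).  A minimal C
sketch of the guards, assuming VIS_BLOCKSIZE is the 64-byte granule of the stda
block stores and that rock_sync_icache() takes the start address as set up
above:

    #include <stddef.h>
    #include <stdint.h>

    #define VIS_BLOCKSIZE   64              /* assumed block-store granule */

    extern void bzero(void *, size_t);
    extern void rock_sync_icache(void *);   /* prototype assumed */

    static int
    hwblkclr_guards(void *addr, size_t len)
    {
            if (((uintptr_t)addr & (VIS_BLOCKSIZE - 1)) != 0 || /* aligned? */
                len < 256 ||                        /* 256 bytes or more? */
                (len & (VIS_BLOCKSIZE - 1)) != 0) { /* block multiple? */
                    bzero(addr, len);               /* punt */
                    rock_sync_icache(addr);
                    return (0);
            }
            /* ... FP block-store clearing loop elided ... */
            return (0);
    }
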
-
-#ifdef lint
-/*ARGSUSED*/
-void
-hw_pa_bcopy32(uint64_t src, uint64_t dst)
-{}
-#else /*!lint */
-	/*
-	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
-	 * using physical addresses.
-	 */
-	ENTRY_NP(hw_pa_bcopy32)
-	rdpr	%pstate, %g1
-	andn	%g1, PSTATE_IE, %g2
-	wrpr	%g0, %g2, %pstate
-
-	rdpr	%pstate, %g0
-	ldxa	[%o0]ASI_MEM, %o2
-	add	%o0, 8, %o0
-	ldxa	[%o0]ASI_MEM, %o3
-	add	%o0, 8, %o0
-	ldxa	[%o0]ASI_MEM, %o4
-	add	%o0, 8, %o0
-	ldxa	[%o0]ASI_MEM, %o5
-	stxa	%o2, [%o1]ASI_MEM
-	add	%o1, 8, %o1
-	stxa	%o3, [%o1]ASI_MEM
-	add	%o1, 8, %o1
-	stxa	%o4, [%o1]ASI_MEM
-	add	%o1, 8, %o1
-	stxa	%o5, [%o1]ASI_MEM
-
-	retl
-	  wrpr	  %g0, %g1, %pstate
-
-	SET_SIZE(hw_pa_bcopy32)
-
-#endif /* lint */
-
-
-/*
- * Zero a block of storage.
- *
- * uzero is used by the kernel to zero a block in user address space.
- */
-
-
-#if defined(lint)
-
-/* ARGSUSED */
-int
-kzero(void *addr, size_t count)
-{ return(0); }
-
-/* ARGSUSED */
-void
-uzero(void *addr, size_t count)
-{}
-
-#else	/* lint */
-
-	ENTRY(uzero)
-	!
-	! Set a new lofault handler only if we came in with one
-	! already specified.
-	!
-	wr	%g0, ASI_USER, %asi
-	ldn	[THREAD_REG + T_LOFAULT], %o5
-	tst	%o5
-	bz,pt	%ncc, .do_zero
-	sethi	%hi(.zeroerr), %o2
-	or	%o2, %lo(.zeroerr), %o2
-	membar	#Sync
-	ba,pt	%ncc, .do_zero
-	stn	%o2, [THREAD_REG + T_LOFAULT]
-
-	ENTRY(kzero)
-	!
-	! Always set a lofault handler
-	!
-	wr	%g0, ASI_P, %asi
-	ldn	[THREAD_REG + T_LOFAULT], %o5
-	sethi	%hi(.zeroerr), %o2
-	or	%o5, LOFAULT_SET, %o5
-	or	%o2, %lo(.zeroerr), %o2
-	membar	#Sync
-	ba,pt	%ncc, .do_zero
-	stn	%o2, [THREAD_REG + T_LOFAULT]
-
-/*
- * We got here because of a fault during kzero or if
- * uzero or bzero was called with t_lofault non-zero.
- * Otherwise we've already run screaming from the room.
- * Errno value is in %g1. Note that we're here iff
- * we did set t_lofault.
- */
-.zeroerr:
-	!
-	! Undo asi register setting. Just set it to be the
-        ! kernel default without checking.
-	!
-	wr	%g0, ASI_P, %asi
-
-	!
-	! We did set t_lofault. It may well have been zero coming in.
-	!
-1:
-	tst	%o5
-	membar #Sync
-	bne,pn	%ncc, 3f		
-	andncc	%o5, LOFAULT_SET, %o5
-2:
-	!
-	! Old handler was zero. Just return the error.
-	!
-	retl				! return
-	mov	%g1, %o0		! error code from %g1
-3:
-	!
-	! We're here because %o5 was non-zero. It was non-zero
-	! because either LOFAULT_SET was present, a previous fault
-	! handler was present or both. In all cases we need to reset
-	! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
-	! before we either simply return the error or we invoke the
-	! previously specified handler.
-	!
-	be	%ncc, 2b
-	stn	%o5, [THREAD_REG + T_LOFAULT]
-	jmp	%o5			! goto real handler
-	  nop
-	SET_SIZE(kzero)
-	SET_SIZE(uzero)
-
-#endif	/* lint */
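
The .zeroerr path above implements a small protocol: kzero always installs the
handler and ORs LOFAULT_SET into the saved t_lofault (even when it was zero),
while uzero and bzero install it only if a handler was already present.  On a
fault the marker is stripped, t_lofault is restored, and control either
returns the errno from %g1 or jumps to the prior handler.  A minimal C sketch
of the unwinding decision; the flag value, the t_lofault slot, and
jump_to_handler() (standing in for the final jmp %o5) are assumptions:

    #include <stdint.h>

    #define LOFAULT_SET     2               /* assumed flag value */

    extern uintptr_t *t_lofault_slot;       /* THREAD_REG + T_LOFAULT */
    extern void jump_to_handler(uintptr_t); /* hypothetical: jmp %o5 */

    static int
    zeroerr_model(uintptr_t saved, int err)
    {
            if (saved == 0)
                    return (err);   /* no handler was installed */
            saved &= ~(uintptr_t)LOFAULT_SET;
            *t_lofault_slot = saved;        /* restore, marker stripped */
            if (saved == 0)
                    return (err);   /* only LOFAULT_SET was present */
            jump_to_handler(saved); /* chain to the prior handler */
            return (err);           /* not reached */
    }
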
-
-/*
- * Zero a block of storage.
- */
-
-#if defined(lint)
-
-/* ARGSUSED */
-void
-bzero(void *addr, size_t count)
-{}
-
-#else	/* lint */
-
-	ENTRY(bzero)
-
-	wr	%g0, ASI_P, %asi
-	ldn	[THREAD_REG + T_LOFAULT], %o5	! save old vector
-	tst	%o5
-	bz,pt	%ncc, .do_zero
-	sethi	%hi(.zeroerr), %o2
-	or	%o2, %lo(.zeroerr), %o2
-	membar	#Sync				! sync error barrier
-	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
-
-.do_zero:
-	/*
-	 * If 0 bytes to xfer return
-	 */
-	brnz	%o1, continue_bzero
-	nop
-	ba	.bzero_exit
-	nop
-continue_bzero:
-	prefetch	[%o0],2
-	cmp	%o1, 8
-	bge,pt	%ncc, xfer_8_or_more
-	nop
-
-.byteclr:
-	deccc	%o1			! byte clearing loop
-	stba	%g0, [%o0]%asi
-	bgu,pt	%ncc, .byteclr
-	inc	%o0
-	ba	.bzero_exit
-	nop
-
-xfer_8_or_more:
-	andcc	%o0, 7, %o3		! is addr aligned on an 8 byte boundary
-	brz,pt	%o3, blkchk		
-	sub	%o3, 8, %o3		! -(bytes till double aligned)
-	add	%o1, %o3, %o1		! update o1 with new count
-1:
-	stba	%g0, [%o0]%asi
-	inccc	%o3
-	bl,pt	%ncc, 1b
-	inc	%o0
-
-	! Now addr is double word aligned
-blkchk:
-	cmp     %o1, 767		! if large count use Block ld/st
-	bg,pt	%ncc,blkwr
-	nop
-	and	%o1, 24, %o3		! o3 is {0, 8, 16, 24}
-	brz	%o3, skip_dw_loop
-	nop
-1:	subcc	%o3, 8, %o3		! double-word loop
-	stxa	%g0, [%o0]%asi
-	bgu,pt %ncc, 1b
-	add	%o0, 8, %o0
-skip_dw_loop:
-	andncc	%o1, 31, %o4		! o4 has 32 byte aligned count
-	brz,pn	%o4, 3f
-	nop
-	ba	loop_32byte
-	nop
-
-	.align	ICACHE_LINE_SIZE
-
-loop_32byte:
-	subcc	%o4, 32, %o4		! main loop, 32 bytes per iteration
-	stxa	%g0, [%o0]%asi
-	stxa	%g0, [%o0 + 8]%asi
-	stxa	%g0, [%o0 + 16]%asi
-	stxa	%g0, [%o0 + 24]%asi
-	bne,pt  %ncc, loop_32byte
-	add	%o0, 32, %o0
-3:	
-	and	%o1, 7, %o1		! o1 has the remaining bytes (<8)
-	brnz	%o1, .byteclr
-	nop
-	ba	.bzero_exit
-	nop
-blkwr:
-	sub     %o0,1,%o3
-	andn    %o3,0x7f,%o4
-	add     %o4,128,%o4
-	prefetch [%o4],2		!prefetch next 128b
-	prefetch [%o4+64],2
-	prefetch [%o4+(2*64)],2	
-	prefetch [%o4+(3*64)],2
-
-	andcc   %o0,0x7f,%o3	    ! o3 == 0 means addr is already 128-byte aligned
-	brz,pn  %o3,aligned_on_128_bzero
-	sub     %o3,128,%o3
-
-	add     %o1,%o3,%o1
-align_to_128_bzero:
-	stxa    %g0,[%o0]%asi
-	addcc   %o3,8,%o3
-	bl,pt   %ncc,align_to_128_bzero
-	add     %o0,8,%o0
-
-
-
-aligned_on_128_bzero:
-	! if the addr is 512-byte aligned and the bytes to zero
-	! are greater than or equal to 4096, do a stingray_optimized_bzero
-	andcc	%o0,0x1ff,%o3	! Is addr 512 byte aligned ?
-	brnz,pn	%o3, 4f
-	mov	%o1,%g5
-	set	4096, %g4
-	subcc	%o1, %g4, %g0
-	bge,pn	%ncc, stingray_optimized_bzero
-	nop
-4:
-	! addr (dest. buffer) is not aligned to 512 bytes.
-	! if the number of bytes to zero is less than 4096 after
-	! aligning the addr to 512 bytes, then do interleave128_bzero.
-
-	sub	%o0,8,%o4
-	andn 	%o4,0x1ff,%o3
-	add 	%o3,0x200,%o3	! o3 = addr aligned to 512 bytes
-	sub 	%o3,%o0,%o3	! o3 = number of bytes to zero to align addr to 512
-	sub	%o1,%o3,%g5	! g5 = bytes to zero from the 512-byte aligned addr
-	set	4096, %g4
-	subcc	%g5, %g4, %g0
-	bge,pn	%ncc,6f
-	nop
-	! clear %g5 to indicate that there is no need to do
-	! stingray_optimized_bzero
-	mov	%g0, %g5
-	add	%o0, %o1, %o4
-	ba	interleave128_bzero
-	nop
-6:
-	! %g5 contains the number of bytes to zero after 512-byte alignment.
-	! We zero the bytes in the dest. buffer until it is 512-byte aligned
-	! and then call stingray_optimized_bzero.
-	! If the number of bytes to zero (until 512 alignment) is less than
-	! 256 we call bzero_word, else we call interleave128_bzero.
-	mov	%o3, %o1
-	subcc 	%o3,256,%g0
-	bl,pn	%ncc,bzero_word
-	add     %o0,%o1,%o4	     ! calc the last byte to write in %o4
-	ba	interleave128_bzero
-	nop
-
-	.align	64
-interleave128_bzero:
-	! %o0 has the dest. buffer addr
-	! %o1 has the number of bytes to zero
-	! %o4 has the addr of the dest. buffer at or beyond which no write
-	! is to be done.
-	! %g5 has the number of bytes to zero using stingray_optimized_bzero
-
-	add	%o0, 256, %o3
-	prefetch [%o3], 2	!1st 64 byte line of next 256 byte block
-	add	%o0, 384, %o3
-	prefetch [%o3], 2	!3rd 64 byte line of next 256 byte block
-	add	%o0, 320, %o3
-	prefetch [%o3], 2	!2nd 64 byte line of next 256 byte block
-	add	%o0, 448, %o3
-	prefetch [%o3], 2	!4th 64 byte line of next 256 byte block
-	mov	%o0, %o3
-	stxa     %g0,[%o3]%asi	!1st 64 byte line
-	add     %o0,128,%o3
-	stxa     %g0,[%o3]%asi	!3rd 64 byte line
-	add     %o0,8,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(2 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128 ,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(3 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(4 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(5 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(6 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(7 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(8 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(9 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(10 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(11 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(12 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(13 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(14 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(15 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	! check that the next 256-byte store pass will not exceed the number
-	! of bytes remaining to be zeroed.
-	! %g4 points to the dest. buffer after storing 256 bytes more.
-	! %o4 points to the dest. buffer at or beyond which no writes should be done.
-	add     %o0,512,%g4
-	subcc   %o4,%g4,%g0
-	bge,pt  %ncc,interleave128_bzero
-	add     %o0,256,%o0
-
-bzero_word:
-	and     %o1,255,%o3
-	and     %o3,7,%o1
-
-	! Set the remaining doubles
-	subcc   %o3, 8, %o3		! Can we store any doubles?
-	bl,pn  %ncc, 6f
-	and	%o1, 7, %o1		! calc bytes left after doubles
-
-5:	
-	stxa	%g0, [%o0]%asi
-	subcc   %o3, 8, %o3
-	bge,pt	%ncc, 5b
-	add     %o0, 8, %o0      
-6:
-	! Set the remaining bytes
-	brz	%o1,  can_we_do_stingray_optimized_bzero
-	
-7:
-	deccc	%o1			! byte clearing loop
-	stba	%g0, [%o0]%asi
-	bgu,pt	%ncc, 7b
-	inc	%o0
-can_we_do_stingray_optimized_bzero:
-	mov	%g5, %o1
-	brnz,pn	%o1, stingray_optimized_bzero
-	nop
-	
-	ba	.bzero_exit
-	nop
-
-stingray_optimized_bzero:
-	save	%sp, -SA(MINFRAME), %sp
-	mov	%i0, %o0
-	mov	%i1, %o1
-	mov	%i2, %o2
-	mov	%i3, %o3
-	mov	%i5, %o5
-init:
-	set     4096,%o2
-
-	prefetch [%o0+0],2
-	prefetch [%o0+(64*1)],2
-	prefetch [%o0+(64*2)],2
-	prefetch [%o0+(64*3)],2
-	prefetch [%o0+(64*4)],2
-	prefetch [%o0+(64*5)],2
-	prefetch [%o0+(64*6)],2
-	prefetch [%o0+(64*7)],2
-	prefetch [%o0+(64*8)],2
-	prefetch [%o0+(64*9)],2
-	prefetch [%o0+(64*10)],2
-	prefetch [%o0+(64*11)],2
-	prefetch [%o0+(64*12)],2
-	prefetch [%o0+(64*13)],2
-	prefetch [%o0+(64*14)],2
-	prefetch [%o0+(64*15)],2
-	ba      stingray_optimized_4k_zero_loop
-	add     %o0,%g5,%g5
-	! Local register usage:
-	! %o3   address ahead of current dest. buffer for prefetching
-	!	into L1 cache.
-	! %l3   dest. buffer at start of inner loop.
-	! %l5   iteration counter to make buddy loop execute 2 times.
-	! %l6   iteration counter to make inner loop execute 4 times.
-	! %l7   address far ahead of current dest. buffer for prefetching
-	!	into L2 cache.
-
-	.align 64
-stingray_optimized_4k_zero_loop:
-	set      2,%l5
-	add      %o0, 0, %l3
-bzero_buddyloop:
-	set      PF_FAR, %g4
-	add      %o0, %g4, %l7
-
-	!  Prefetch ahead by 2 pages to get TLB entry in advance.
-	set      2*PF_FAR, %g4
-	add      %o0, %g4, %g4
-	prefetch [%g4+%g0],2
-
-	set      4,%l6
-	set      0, %g4
-
-	! Each iteration of the inner loop below writes 8 sequential lines.
-	! This loop is iterated 4 times, to move a total of 32 lines, all of
-	! which have the same value of PA[9], so we increment the base 
-	! address by 1024 bytes in each iteration, which varies PA[10].
-bzero_innerloop:
-	add	%o0, PF_FAR, %o3
-	prefetch [%o3],2
-	add	%o3, 64, %o3
-	prefetch [%o3],2
-	add	%o3, 64, %o3
-	prefetch [%o3],2
-	add	%o3, 64, %o3
-	prefetch [%o3],2
-	add	%o3, 64, %o3
-	prefetch [%o3],2
-	add	%o3, 64, %o3
-	prefetch [%o3],2 
-	add	%o3, 64, %o3
-	prefetch [%o3],2
-	add	%o3, 64, %o3
-	prefetch [%o3],2
-
-	mov	%o0, %o3
-	stxa     %g0,[%o3]%asi	!1st 64 byte line
-	add     %o0,128,%o3
-	stxa     %g0,[%o3]%asi	!3rd 64 byte line
-	add     %o0,8,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(2 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128 ,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(3 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(4 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(5 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(6 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(7 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(8 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(9 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(10 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(11 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(12 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(13 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(14 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(15 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-
-	add     %o0,256,%o0
-
-	mov	%o0, %o3
-	stxa     %g0,[%o3]%asi	!1st 64 byte line
-	add     %o0,128,%o3
-	stxa     %g0,[%o3]%asi	!3rd 64 byte line
-	add     %o0,8,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(2 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128 ,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(3 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(4 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(5 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(6 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(7 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(8 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(9 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(10 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(11 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(12 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(13 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(14 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-	add     %o0,(15 * 8),%o3
-	stxa     %g0,[%o3]%asi
-	add     %o3,128,%o3
-	stxa     %g0,[%o3]%asi
-
-	subcc   %l6,1,%l6	! Decrement the inner loop counter.
-
-	! Now increment by 256 + 512 so we don't toggle PA[9]
-	add     %o0, 768, %o0
-
-	bg,pt   %ncc,bzero_innerloop
-	nop
-	! END OF INNER LOOP
-
-	subcc   %l5,1,%l5
-	add     %l3, 512, %o0	! increment %o0 to first buddy line of dest.
-	bg,pt   %ncc, bzero_buddyloop
-	nop
-	add     %o0, 3584, %o0	! Advance both base addresses by 4k
-	add %o0,%o2,%i5
-	subcc %g5,%i5,%g0
-	bge,pt   %ncc,stingray_optimized_4k_zero_loop
-	nop
-
-	! stingray_optimized_bzero_ends_here
-
-	mov	%o0, %i0
-	mov	%o1, %i1
-	mov	%o2, %i2
-	mov	%o3, %i3
-	mov	%o5, %i5
-	restore
-	sub	%g5,%o0,%o1	! how many bytes left
-	brz,pn	%o1,.bzero_exit
-	mov	%g0,%g5
-	add     %o0,%o1,%o4	! calc the last byte to write in %o4
-	subcc	%o1,256,%g0
-	bge,pt	%ncc,interleave128_bzero
-	mov	%g0,%g5
-	
-	ba	bzero_word
-	nop
-
-.bzero_exit:
-	!
-	! We're just concerned with whether t_lofault was set
-	! when we came in. We end up here from either kzero()
-	! or bzero(). kzero() *always* sets a lofault handler.
-	! It ors LOFAULT_SET into %o5 to indicate it has done
-	! this even if the value of %o5 is otherwise zero.
-	! bzero() sets a lofault handler *only* if one was
-	! previously set. Accordingly we need to examine
-	! %o5 and if it is non-zero be sure to clear LOFAULT_SET
-	! before resetting the error handler.
-	!
-	tst	%o5
-	bz	%ncc, 1f
-	andn	%o5, LOFAULT_SET, %o5
-	membar	#Sync				! sync error barrier
-	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
-1:
-	retl
-	clr	%o0			! return (0)
-
-	SET_SIZE(bzero)
-#endif	/* lint */
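
As an aside for readers unfamiliar with the lofault convention described in
the .bzero_exit comment above, here is a minimal user-level C sketch of that
restore logic. The LOFAULT_SET value and the variable standing in for
THREAD_REG + T_LOFAULT are illustrative assumptions, not the kernel's
definitions.

#include <stdio.h>
#include <stdint.h>

#define	LOFAULT_SET	0x1		/* assumed marker value */

static uintptr_t t_lofault;		/* stands in for curthread->t_lofault */

static void
bzero_exit(uintptr_t saved)		/* 'saved' plays the role of %o5 */
{
	if (saved != 0) {
		/* strip the marker before restoring the old handler */
		t_lofault = saved & ~(uintptr_t)LOFAULT_SET;
	}
}

int
main(void)
{
	bzero_exit(0x1000 | LOFAULT_SET);	/* the kzero() case */
	printf("restored t_lofault = 0x%lx\n", (unsigned long)t_lofault);
	return (0);
}
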
-
-#ifdef ROCK_CR_6654578
-/* This code tries to maximize bandwidth by being clever about accessing
- * the two cache lines that are BUDDY PAIRS in the L3 cache.  When line 0
- * of a pair is accessed, it will take hundreds of cycles to get the line
- * from memory, which brings in a 128-byte line to L3.  Until the line is
- * installed in L3, any other access to that line (such as buddy line 1)
- * is blocked.  For best throughput, we access many lines that are the first
- * of their buddy pairs, and only after many such accesses have been made,
- * we access the sequence of second buddy pair lines.  Hopefully the second
- * set of accesses comes after the L3 lines are installed, so the accesses
- * hit in L3 without being delayed.  This should yield better throughput.
- * To keep this code simple, we assume the addresses given are aligned at
- * least on a 128 byte boundary, and the length is assumed to be a multiple
- * of 8k bytes.
- */
-
-#ifdef lint
-/*ARGSUSED*/
-int
-page_hwblkclr(void *addr, size_t len)
-{ 
-	return(0);
-}
-#else /* lint */
-	ENTRY(page_hwblkclr)
-	save	%sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp
-
-	! %i0 address
-	! %i1 len
-	
-	rd      %fprs, %l0
-	mov     %g0, %l2		! clear flag to say fp regs not saved
-
-	! FPU enabled ?  If not, enable it.
-	btst    FPRS_FEF, %l0
-	bz,a,pt   %icc, 1f
-	  wr      %g0, FPRS_FEF, %fprs
-
-        ! save in-use fpregs on stack
-
-        add     %fp, STACK_BIAS - 65, %l1       ! get stack frame for fp regs
-        and     %l1, -VIS_BLOCKSIZE, %l1        ! block align frame
-        stda    %d32, [%l1]ASI_BLK_P            ! %l1 = addr of saved fp regs
-
-        ! Set a flag saying fp regs are saved.
-        mov     1, %l2
-
-        ! enable fp
-
-1:      membar  #StoreStore|#StoreLoad|#LoadStore
-
-        movxtod %g0, %d32
-        movxtod %g0, %d34
-        movxtod %g0, %d36
-        movxtod %g0, %d38
-        movxtod %g0, %d40
-        movxtod %g0, %d42
-        movxtod %g0, %d44
-        movxtod %g0, %d46
-
-        ba      myloop2
-        srl     %i1,12,%i1
-.align 64
-myloop2:
-        mov      2,%l5
-        mov      %i0, %l3 
-buddyloop:
-        set      4096, %l4    
-        add      %i0, %l4, %l4
-        prefetcha [%l4]ASI_BLK_P, #n_writes
-        mov      32,%l6
-innerloop:          
-
-        subcc   %l6,1,%l6
-        stda    %d32,[%i0]ASI_BLK_P
-        bg,pt   %icc,innerloop
-        add     %i0, 128, %i0
-
-        subcc   %l5,1,%l5
-        add     %l3, 64, %i0
-        bg,pt   %icc,buddyloop
-	nop
-	subcc	%i1,1,%i1
-        add     %i0, 4032, %i0
-        bg,pt   %icc,myloop2
-        nop
-
-        brz,a   %l2, 2f
-          wr    %l0, 0, %fprs           ! restore fprs
-
-        ! restore fpregs from stack
-        ldda    [%l1]ASI_BLK_P, %d32
-
-        wr      %l0, 0, %fprs           ! restore fprs
-2:
-        membar  #Sync
-
-        ret
-        restore  %g0, 0, %o0
-
-	SET_SIZE(page_hwblkclr)
-#endif	/* lint */
-#endif	/* ROCK_CR_6654578 */
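
For readers skimming the ROCK_CR_6654578 comment above, here is a hedged
user-level model of the buddy-pair ordering it describes. The 64-byte store
granule, 128-byte L3 buddy stride, and 4 KB chunking follow the assumptions
stated in that comment; memset() stands in for the block-store ASIs.

#include <string.h>
#include <stddef.h>
#include <stdint.h>

#define	L2_LINE		64	/* assumed store granule */
#define	BUDDY_STRIDE	128	/* two L2 lines form one L3 buddy pair */

static void
buddy_ordered_zero(uint8_t *dst, size_t len)	/* len: multiple of 8 KB */
{
	size_t chunk, off;

	for (chunk = 0; chunk < len; chunk += 4096) {
		/* Phase 1: first line of every buddy pair in the chunk. */
		for (off = 0; off < 4096; off += BUDDY_STRIDE)
			(void) memset(dst + chunk + off, 0, L2_LINE);
		/* Phase 2: second lines, ideally hitting L3 by now. */
		for (off = L2_LINE; off < 4096; off += BUDDY_STRIDE)
			(void) memset(dst + chunk + off, 0, L2_LINE);
	}
}

int
main(void)
{
	static uint8_t buf[8192];

	buddy_ordered_zero(buf, sizeof (buf));
	return (0);
}
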
-
-#if defined(lint)
-
-int use_hw_bcopy = 1;
-int use_hw_bzero = 1;
-uint_t hw_copy_limit_1 = 0x100;
-uint_t hw_copy_limit_2 = 0x200;
-uint_t hw_copy_limit_4 = 0x400;
-uint_t hw_copy_limit_8 = 0x400;
-
-#else /* !lint */
-
-	DGDEF(use_hw_bcopy)
-	.word	1
-	DGDEF(use_hw_bzero)
-	.word	1
-	DGDEF(hw_copy_limit_1)
-	.word	0x100
-	DGDEF(hw_copy_limit_2)
-	.word	0x200
-	DGDEF(hw_copy_limit_4)
-	.word	0x400
-	DGDEF(hw_copy_limit_8)
-	.word	0x400
-
-
-	.align	64
-	.section ".text"
-#endif /* !lint */
--- a/usr/src/uts/sun4v/io/px/px_lib4v.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/io/px/px_lib4v.c	Thu Aug 06 17:39:39 2009 -0700
@@ -41,8 +41,6 @@
 #include <sys/hotplug/pci/pcihp.h>
 #include "px_lib4v.h"
 #include "px_err.h"
-#include <vm/vm_dep.h>
-#include <vm/hat_sfmmu.h>
 
 /* mask for the ranges property in calculating the real PFN range */
 uint_t px_ranges_phi_mask = ((1 << 28) -1);
@@ -547,9 +545,6 @@
 	else
 		sync_dir = HVIO_DMA_SYNC_DIR_TO_DEV;
 
-	if (force_sync_icache_after_dma == 0 && !icache_is_coherent)
-		sync_dir |= HVIO_DMA_SYNC_DIR_NO_ICACHE_FLUSH;
-
 	off += mp->dmai_offset;
 	pg_off = off & MMU_PAGEOFFSET;
 
@@ -560,27 +555,12 @@
 	end = MMU_BTOPR(off + len - 1);
 	for (idx = MMU_BTOP(off); idx < end; idx++,
 	    len -= bytes_synced, pg_off = 0) {
-		size_t bytes_to_sync =  MIN(len, MMU_PAGESIZE - pg_off);
-
-		while (hvio_dma_sync(hdl,
-		    MMU_PTOB(PX_GET_MP_PFN(mp, idx)) + pg_off,
-		    bytes_to_sync, sync_dir, &bytes_synced) != H_EOK) {
-
-			if (!(sync_dir & HVIO_DMA_SYNC_DIR_NO_ICACHE_FLUSH)) {
-				bytes_synced = 0;
-				break;
-			}
+		size_t bytes_to_sync =
+		    MIN(len, MMU_PAGESIZE - pg_off);
 
-			/*
-			 * Some versions of firmware do not support
-			 * this sync_dir flag. If the call fails clear
-			 * the flag and retry the call. Also, set the
-			 * global so that we dont set the sync_dir
-			 * flag again.
-			 */
-			sync_dir &= ~HVIO_DMA_SYNC_DIR_NO_ICACHE_FLUSH;
-			force_sync_icache_after_dma = 1;
-		}
+		if (hvio_dma_sync(hdl, MMU_PTOB(PX_GET_MP_PFN(mp, idx)) +
+		    pg_off, bytes_to_sync, sync_dir, &bytes_synced) != H_EOK)
+			break;
 
 		DBG(DBG_LIB_DMA, dip, "px_lib_dma_sync: Called hvio_dma_sync "
 		    "ra = %p bytes to sync = %x bytes synced %x\n",
--- a/usr/src/uts/sun4v/io/px/px_lib4v.h	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/io/px/px_lib4v.h	Thu Aug 06 17:39:39 2009 -0700
@@ -97,8 +97,7 @@
 
 #define	PX_VPCI_MINOR_VER_0	0x0ull
 #define	PX_VPCI_MINOR_VER_1	0x1ull
-#define	PX_VPCI_MINOR_VER_2	0x2ull
-#define	PX_VPCI_MINOR_VER	PX_VPCI_MINOR_VER_2
+#define	PX_VPCI_MINOR_VER	PX_VPCI_MINOR_VER_1
 
 extern uint64_t hvio_config_get(devhandle_t dev_hdl, pci_device_t bdf,
     pci_config_offset_t off, pci_config_size_t size, pci_cfg_data_t *data_p);
--- a/usr/src/uts/sun4v/ml/hcall.s	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/ml/hcall.s	Thu Aug 06 17:39:39 2009 -0700
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -321,20 +321,10 @@
 
 /*ARGSUSED*/
 uint64_t
-hv_mem_iflush(uint64_t real_addr, uint64_t length, uint64_t *flushed_len)
-{ return (0); }
-
-/*ARGSUSED*/
-uint64_t
-hv_mem_iflush_all()
-{ return (0); }
-
-/*ARGSUSED*/
-uint64_t
 hv_tm_enable(uint64_t enable)
 { return (0); }
 
-/*ARGSUSED*/
+/*ARGSUSED*/	
 uint64_t
 hv_mach_set_watchdog(uint64_t timeout, uint64_t *time_remaining)
 { return (0); }
@@ -742,34 +732,7 @@
 	SET_SIZE(hv_mem_sync)
 
 	/*
-	 * HV_MEM_IFLUSH
-	 * 	arg0 memory real address
-	 * 	arg1 flush length
-	 *	ret0 status
-	 *	ret1 flushed length
-	 *
-	 */
-	ENTRY(hv_mem_iflush)
-	mov	%o2, %o4
-	mov	HV_MEM_IFLUSH, %o5
-	ta	FAST_TRAP
-	retl
-	stx	%o1, [%o4]
-	SET_SIZE(hv_mem_iflush)
-
-	/*
-	 * HV_MEM_IFLUSH_ALL
-	 *	ret0 status
-	 */
-	ENTRY(hv_mem_iflush_all)
-	mov	HV_MEM_IFLUSH_ALL, %o5
-	ta	FAST_TRAP
-	retl
-	nop
-	SET_SIZE(hv_mem_iflush_all)
-
-	/*
-	 * uint64_t hv_rk_tm_enable(uint64_t enable)
+	 * uint64_t hv_tm_enable(uint64_t enable)
 	 */
 	ENTRY(hv_tm_enable)
 	mov	HV_TM_ENABLE, %o5
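
The deleted iflush stubs followed the common sun4v hypercall shape visible
above: the trap returns a status in %o0 and a secondary result in %o1, which
the wrapper stores through its pointer argument. A hypothetical user-level
model of that contract (fake_hv_mem_iflush() is invented here purely for
illustration):

#include <stdio.h>
#include <stdint.h>

#define	H_EOK	0		/* hypervisor success status */

static uint64_t
fake_hv_mem_iflush(uint64_t ra, uint64_t len, uint64_t *flushed_len)
{
	(void) ra;
	*flushed_len = len;	/* pretend the whole range was flushed */
	return (H_EOK);
}

int
main(void)
{
	uint64_t done = 0;

	if (fake_hv_mem_iflush(0x10000000ULL, 8192, &done) == H_EOK)
		printf("flushed %llu bytes\n", (unsigned long long)done);
	return (0);
}
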
--- a/usr/src/uts/sun4v/ml/mach_interrupt.s	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/ml/mach_interrupt.s	Thu Aug 06 17:39:39 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -41,20 +41,7 @@
 #include <sys/error.h>
 #include <sys/mmu.h>
 #include <vm/hat_sfmmu.h>
-
 #define	INTR_REPORT_SIZE	64
-#define	ERRH_ASI_SHIFT		56		/* bits[63:56]; see errh_er_t */
-#define	NRE_ASI			0x00000001	/* ASI observed in attr field */
-#define	NRE_CTX			0x00000002	/* ASI equals ASI_MMU_CTX */
-#define	CRP_OBSERVED		(NRE_ASI | NRE_CTX)
-
-#define	OR_MCPU_NRE_ERROR(reg1,reg2,val)	\
-	add	reg1, CPU_MCPU, reg2;		\
-	add	reg2, MCPU_NRE_ERROR, reg2;	\
-	ldxa	[reg2]ASI_MEM, reg1;		\
-	or	reg1, val, reg1;		\
-	stxa	reg1, [reg2]ASI_MEM
-	
 
 #ifdef TRAPTRACE
 #include <sys/traptrace.h>
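
The deleted macros kept a small per-CPU scoreboard: one bit recorded that the
error report's attr field had its ASI bit set, another that the ASI in
question was ASI_MMU_CTX, and a context-register-parity (CRP) event was
confirmed only when both were seen. A user-level model, with the flag values
copied from the removed definitions:

#include <stdio.h>
#include <stdint.h>

#define	NRE_ASI		0x00000001	/* ASI observed in attr field */
#define	NRE_CTX		0x00000002	/* ASI equals ASI_MMU_CTX */
#define	CRP_OBSERVED	(NRE_ASI | NRE_CTX)

int
main(void)
{
	uint64_t cpu_nre_error = 0;	/* models cpu_m.cpu_nre_error */

	cpu_nre_error |= NRE_ASI;	/* OR_MCPU_NRE_ERROR(.., NRE_ASI) */
	cpu_nre_error |= NRE_CTX;	/* OR_MCPU_NRE_ERROR(.., NRE_CTX) */

	if (cpu_nre_error == CRP_OBSERVED)
		printf("CRP confirmed: force a context reload\n");
	return (0);
}
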
@@ -533,10 +520,6 @@
 
 	CPU_PADDR(%g1, %g4)			! %g1 = cpu struct paddr
 
-	add	%g1, CPU_MCPU, %g4
-	add	%g4, MCPU_NRE_ERROR, %g4	! &CPU->cpu_m.cpu_nre_error
-	stxa	%g0, [%g4]ASI_MEM		! clear cpu_nre_error
-
 2:	set	CPU_NRQ_BASE_OFF, %g4
 	ldxa	[%g1 + %g4]ASI_MEM, %g4		! %g4 = queue base PA
 	add	%g6, %g4, %g4			! %g4 = PA of ER in Q		
@@ -548,7 +531,7 @@
 	bne,pn	%xcc, 1f			! first 8 byte is not 0
 	nop
 
-	/* BEGIN: move 64 bytes from queue to buf */
+	/* Now we can move 64 bytes from queue to buf */
 	set	0, %g5
 	ldxa	[%g4 + %g5]ASI_MEM, %g1
 	stxa	%g1, [%g7 + %g5]ASI_MEM		! byte 0 - 7	
@@ -558,14 +541,7 @@
 	add	%g5, 8, %g5
 	ldxa	[%g4 + %g5]ASI_MEM, %g1
 	stxa	%g1, [%g7 + %g5]ASI_MEM		! byte 16 - 23
-	/* Check for sun4v ASI */
-	and	%g1, ERRH_ATTR_ASI, %g1		! isolate ASI bit
-	cmp	%g1, ERRH_ATTR_ASI
-	bne,pt	%xcc, 3f
-	  nop
-	CPU_PADDR(%g1, %g5)
-	OR_MCPU_NRE_ERROR(%g1, %g5, NRE_ASI)	! cpu_nre_error |= NRE_ASI
-3:	set	24, %g5
+	add	%g5, 8, %g5
 	ldxa	[%g4 + %g5]ASI_MEM, %g1
 	stxa	%g1, [%g7 + %g5]ASI_MEM		! byte 24 - 31
 	add	%g5, 8, %g5
@@ -574,20 +550,12 @@
 	add	%g5, 8, %g5
 	ldxa	[%g4 + %g5]ASI_MEM, %g1
 	stxa	%g1, [%g7 + %g5]ASI_MEM		! byte 40 - 47
-	/* Check for ASI==ASI_MMU_CTX */
-	srlx	%g1, ERRH_ASI_SHIFT, %g1	! isolate the ASI field
-	cmp	%g1, ASI_MMU_CTX		! ASI=0x21 for CRP
-	bne,pt	%xcc, 4f
-	  nop
-	CPU_PADDR(%g1, %g5)
-	OR_MCPU_NRE_ERROR(%g1, %g5, NRE_CTX)	! cpu_nre_error |= NRE_CTX
-4:	set	48, %g5
+	add	%g5, 8, %g5
 	ldxa	[%g4 + %g5]ASI_MEM, %g1
 	stxa	%g1, [%g7 + %g5]ASI_MEM		! byte 48 - 55
 	add	%g5, 8, %g5
 	ldxa	[%g4 + %g5]ASI_MEM, %g1
 	stxa	%g1, [%g7 + %g5]ASI_MEM		! byte 56 - 63
-	/* END: move 64 bytes from queue to buf */
 
 	set	CPU_NRQ_SIZE, %g5		! %g5 = queue size
 	sub	%g5, 1, %g5			! %g5 = queue size mask
@@ -608,36 +576,6 @@
 	membar	#Sync
 
 	/*
-	 * For CRP, force a hat reload as if the context were stolen
-	 * by storing INVALID_CONTEXT in the secondary and nulling TSB.
-	 * Primary will be reset by usr_rtt for user-mode traps, or 
-	 * has been reset in iae_crp or dae_crp for kernel-mode.
-	 */
-	CPU_PADDR(%g1, %g5)
-	add	%g1, CPU_MCPU, %g5
-	add	%g5, MCPU_NRE_ERROR, %g5	! &CPU->cpu_m.cpu_nre_error
-	ldxa	[%g5]ASI_MEM, %g4
-	cmp	%g4, CRP_OBSERVED		! confirm CRP
-	bne,pt	%xcc, 5f
-	  nop
-	mov	INVALID_CONTEXT, %g5		! force hat reload of context
-	mov	MMU_SCONTEXT, %g7
-	sethi	%hi(FLUSH_ADDR), %g4
-	stxa	%g5, [%g7]ASI_MMU_CTX		! set secondary context reg
-	flush	%g4
-	mov	%o0, %g4
-	mov	%o1, %g5
-	mov	%o5, %g7
-	mov	%g0, %o0
-	mov	%g0, %o1
-	mov	MMU_TSB_CTXNON0, %o5
-	ta      FAST_TRAP			! null TSB
-	  nop
-	mov	%g4, %o0
-	mov	%g5, %o1
-	mov	%g7, %o5
-
-	/*
 	 * Call sys_trap. %g2 is TL(arg2), %g3 is head and tail
 	 * offset(arg3).
 	 * %g3 looks like following:
@@ -648,7 +586,7 @@
 	 *
 	 * Run at PIL 14 unless we're already at PIL 15.
 	 */
-5:	sllx	%g3, 32, %g3			! %g3.h = tail offset
+	sllx	%g3, 32, %g3			! %g3.h = tail offset
 	or	%g3, %g2, %g3			! %g3.l = head offset
 	rdpr	%tl, %g2			! %g2 = current tl
 
--- a/usr/src/uts/sun4v/ml/mach_offsets.in	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/ml/mach_offsets.in	Thu Aug 06 17:39:39 2009 -0700
@@ -98,7 +98,6 @@
 	cpu_nrq_base_pa		MCPU_NRQ_BASE
 	cpu_nrq_size		MCPU_NRQ_SIZE
 	cpu_tstat_flags		MCPU_TSTAT_FLAGS
-	cpu_nre_error		MCPU_NRE_ERROR
 
 \#define	CPU_MPCB_PA	(CPU_MCPU + MCPU_MPCB_PA)
 \#define	CPU_KWBUF_FULL	(CPU_MCPU + MCPU_KWBUF_FULL)
@@ -145,8 +144,6 @@
 	sfmmu_cext
 	sfmmu_ctx_lock
 	sfmmu_ctxs
-	sfmmu_pgsz_order
-	sfmmu_pgsz_map
 
 sf_scd SCD_SIZE
 	scd_sfmmup
@@ -184,7 +181,6 @@
 	scratch		TSBMISS_SCRATCH
 	shmermap	TSBMISS_SHMERMAP
 	scd_shmermap	TSBMISS_SCDSHMERMAP
-	pgsz_bitmap	TSBMISS_PGSZ_BITMAP
 
 \#define	TSB_TAGACC	(0 * TSBMISS_SCRATCH_INCR)
 \#define	TSBMISS_HMEBP	(1 * TSBMISS_SCRATCH_INCR)
@@ -252,9 +248,6 @@
 	hv_tsb_info_pa
 	hv_tsb_info_cnt
 
-hv_pgsz_order
-	hv_pgsz_order_pa
-
 cpu_node	CPU_NODE_SIZE
 	nodeid
 	clock_freq
--- a/usr/src/uts/sun4v/ml/trap_table.s	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/ml/trap_table.s	Thu Aug 06 17:39:39 2009 -0700
@@ -1396,10 +1396,6 @@
  * (0=kernel, 1=invalid, or 2=user) rather than context ID)
  */
 	ALTENTRY(exec_fault)
-	set	icache_is_coherent, %g6		/* check soft exec mode */
-	ld	[%g6], %g6
-	brz,pn	%g6, sfmmu_slow_immu_miss
-	  nop
 	TRACE_TSBHIT(TT_MMU_EXEC)
 	MMU_FAULT_STATUS_AREA(%g4)
 	ldx	[%g4 + MMFSA_I_ADDR], %g2	/* g2 = address */
--- a/usr/src/uts/sun4v/os/error.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/os/error.c	Thu Aug 06 17:39:39 2009 -0700
@@ -38,8 +38,6 @@
 #include <sys/error.h>
 #include <sys/fm/util.h>
 #include <sys/ivintr.h>
-#include <sys/machasi.h>
-#include <sys/mmu.h>
 #include <sys/archsystm.h>
 
 #define	MAX_CE_FLTS		10
@@ -212,7 +210,6 @@
 	int expected = DDI_FM_ERR_UNEXPECTED;
 	uint64_t exec_mode;
 	uint8_t u_spill_fill;
-	int u_kill = 1;
 
 	mcpup = &(CPU->cpu_m);
 
@@ -278,33 +275,8 @@
 				break;
 			}
 			/*
-			 * Context Register Parity - for reload of secondary
-			 * context register, see nonresumable_error.
-			 */
-			if ((errh_flt.errh_er.attr & ERRH_ATTR_ASI) &&
-			    (errh_flt.errh_er.asi == ASI_MMU_CTX)) {
-
-				if (aflt->flt_tl)	/* TL>0, so panic */
-					break;
-
-				/* Panic on unknown context registers */
-				if (errh_flt.errh_er.addr < MMU_PCONTEXT0 ||
-				    errh_flt.errh_er.addr + errh_flt.errh_er.sz
-				    > MMU_SCONTEXT1 + sizeof (uint64_t)) {
-					cmn_err(CE_WARN, "Parity error on "
-					    "unknown context register\n");
-					aflt->flt_panic = 1;
-					break;
-				}
-
-				u_kill = 0;		/* do not terminate */
-				break;
-			}
-			/*
-			 * All other PR_NRE fall through in order to
-			 * check for protection.  The list can include
-			 * ERRH_ATTR_FRF, ERRH_ATTR_IRF, ERRH_ATTR_MEM,
-			 * and ERRH_ATTR_PIO.
+			 * Fall through; precise faults also need to be
+			 * checked to see if they were protected.
 			 */
 			/*FALLTHRU*/
 
@@ -344,7 +316,7 @@
 			 * for fatal errors.
 			 */
 			if (aflt->flt_class == BUS_FAULT) {
-				aflt->flt_addr = errh_flt.errh_er.addr;
+				aflt->flt_addr = errh_flt.errh_er.ra;
 				errh_cpu_run_bus_error_handlers(aflt,
 				    expected);
 			}
@@ -393,13 +365,13 @@
 			errh_page_retire(&errh_flt, PR_UE);
 
 		/*
-		 * If we queued an error for a thread that should terminate
-		 * and it was in user mode or protected by t_lofault, set AST
-		 * flag so the queue will be drained before returning to user
-		 * mode.  Note that user threads can be killed via pcb_flags.
+		 * If we queued an error and it was in user mode, or
+		 * protected by t_lofault, or u_spill_fill is set, we
+		 * set the AST flag so the queue will be drained before
+		 * returning to user mode.
 		 */
-		if (u_kill && (!aflt->flt_priv ||
-		    aflt->flt_prot == AFLT_PROT_COPY || u_spill_fill)) {
+		if (!aflt->flt_priv || aflt->flt_prot == AFLT_PROT_COPY ||
+		    u_spill_fill) {
 			int pcb_flag = 0;
 
 			if (aflt->flt_class == CPU_FAULT)
@@ -550,7 +522,7 @@
 			 * If we are going to panic, scrub the page first
 			 */
 			if (errh_fltp->cmn_asyncflt.flt_panic)
-				mem_scrub(errh_fltp->errh_er.addr,
+				mem_scrub(errh_fltp->errh_er.ra,
 				    errh_fltp->errh_er.sz);
 		}
 		break;
@@ -606,7 +578,7 @@
 static void
 errh_page_retire(errh_async_flt_t *errh_fltp, uchar_t flag)
 {
-	uint64_t flt_real_addr_start = errh_fltp->errh_er.addr;
+	uint64_t flt_real_addr_start = errh_fltp->errh_er.ra;
 	uint64_t flt_real_addr_end = flt_real_addr_start +
 	    errh_fltp->errh_er.sz - 1;
 	int64_t current_addr;
--- a/usr/src/uts/sun4v/os/fillsysinfo.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/os/fillsysinfo.c	Thu Aug 06 17:39:39 2009 -0700
@@ -41,7 +41,6 @@
 #include <sys/cmp.h>
 #include <sys/async.h>
 #include <vm/page.h>
-#include <vm/vm_dep.h>
 #include <vm/hat_sfmmu.h>
 #include <sys/sysmacros.h>
 #include <sys/mach_descrip.h>
@@ -66,7 +65,6 @@
 static uint64_t get_mmu_tsbs(md_t *, mde_cookie_t);
 static uint64_t	get_mmu_shcontexts(md_t *, mde_cookie_t);
 static uint64_t get_cpu_pagesizes(md_t *, mde_cookie_t);
-static int check_mmu_pgsz_search(md_t *, mde_cookie_t);
 static char *construct_isalist(md_t *, mde_cookie_t, char **);
 static void init_md_broken(md_t *, mde_cookie_t *);
 static int get_l2_cache_info(md_t *, mde_cookie_t, uint64_t *, uint64_t *,
@@ -356,68 +354,13 @@
 			}
 			md_free_scan_dag(mdp, &node);
 		}
+
+
 		md_free_scan_dag(mdp, &eunit);
 	}
 }
 
 /*
- * Setup instruction cache coherency.  The "memory-coherent" property
- * is optional.  Default for Icache_coherency is 1 (I$ is coherent).
- * If we find an Icache with coherency == 0, then enable non-coherent
- * Icache support.
- */
-void
-setup_icache_coherency(md_t *mdp)
-{
-	int ncache;
-	mde_cookie_t *cachelist;
-	int i;
-
-	ncache = md_alloc_scan_dag(mdp, md_root_node(mdp), "cache",
-	    "fwd", &cachelist);
-
-	/*
-	 * The "cache" node is optional in MD, therefore ncaches can be 0.
-	 */
-	if (ncache < 1) {
-		return;
-	}
-
-	for (i = 0; i < ncache; i++) {
-		uint64_t cache_level;
-		uint64_t memory_coherent;
-		uint8_t *type;
-		int typelen;
-
-		if (md_get_prop_val(mdp, cachelist[i], "level",
-		    &cache_level))
-			continue;
-
-		if (cache_level != 1)
-			continue;
-
-		if (md_get_prop_data(mdp, cachelist[i], "type",
-		    &type, &typelen))
-			continue;
-
-		if (strcmp((char *)type, "instn") != 0)
-			continue;
-
-		if (md_get_prop_val(mdp, cachelist[i], "memory-coherent",
-		    &memory_coherent))
-			continue;
-
-		if (memory_coherent != 0)
-			continue;
-
-		mach_setup_icache(memory_coherent);
-		break;
-	}
-
-	md_free_scan_dag(mdp, &cachelist);
-}
-
-/*
  * All the common setup of sun4v CPU modules is done by this routine.
  */
 void
@@ -461,11 +404,6 @@
 		shctx_on = 1;
 	}
 
-	/*
-	 *  Get and check page search register properties.
-	 */
-	pgsz_search_on = check_mmu_pgsz_search(mdp, cpulist[0]);
-
 	for (i = 0; i < nocpus; i++)
 		fill_cpu(mdp, cpulist[i]);
 
@@ -474,7 +412,6 @@
 
 	setup_chip_mappings(mdp);
 	setup_exec_unit_mappings(mdp);
-	setup_icache_coherency(mdp);
 
 	/*
 	 * If MD is broken then append the passed ISA set,
@@ -1116,50 +1053,3 @@
 
 	md_free_scan_dag(mdp, &platlist);
 }
-
-/*
- * This routine gets the MD properties associated with the TLB search order API
- * and compares these against the expected values for a processor which supports
- * this API. The return value is used to determine whether to use the API.
- */
-static int
-check_mmu_pgsz_search(md_t *mdp, mde_cookie_t cpu_node_cookie)
-{
-
-	uint64_t mmu_search_nshared_contexts;
-	uint64_t mmu_max_search_order;
-	uint64_t mmu_non_priv_search_unified;
-	uint64_t mmu_search_page_size_list;
-
-	if (md_get_prop_val(mdp, cpu_node_cookie,
-	    "mmu-search-#shared-contexts", &mmu_search_nshared_contexts))
-		mmu_search_nshared_contexts = 0;
-
-	if (mmu_search_nshared_contexts == 0 ||
-	    mmu_search_nshared_contexts != NSEARCH_SHCONTEXTS)
-		return (0);
-
-	if (md_get_prop_val(mdp, cpu_node_cookie, "mmu-max-search-order",
-	    &mmu_max_search_order))
-		mmu_max_search_order = 0;
-
-	if (mmu_max_search_order == 0 || mmu_max_search_order !=
-	    MAX_PGSZ_SEARCH_ORDER)
-		return (0);
-
-	if (md_get_prop_val(mdp, cpu_node_cookie,
-	    "mmu-non-priv-search-unified", &mmu_non_priv_search_unified))
-		mmu_non_priv_search_unified = -1;
-
-	if (mmu_non_priv_search_unified != 1) {
-		return (0);
-	}
-
-	if (md_get_prop_val(mdp, cpu_node_cookie,
-	    "mmu-search-page-size-list", &mmu_search_page_size_list)) {
-		mmu_search_page_size_list = 0;
-		return (0);
-	}
-
-	return (1);
-}
--- a/usr/src/uts/sun4v/os/mach_cpu_states.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/os/mach_cpu_states.c	Thu Aug 06 17:39:39 2009 -0700
@@ -1075,20 +1075,7 @@
 void
 kdi_flush_caches(void)
 {
-	/*
-	 * May not be implemented by all sun4v architectures.
-	 *
-	 * Cannot use hsvc_version to see if the group is already
-	 * negotiated or not because, this function is called by
-	 * KMDB when it is at the console prompt which is running
-	 * at highest PIL. hsvc_version grabs an adaptive mutex and
-	 * this is a no-no at this PIL level.
-	 */
-	if (hsvc_kdi_mem_iflush_negotiated) {
-		uint64_t	status = hv_mem_iflush_all();
-		if (status != H_EOK)
-			cmn_err(CE_PANIC, "Flushing all I$ entries failed");
-	}
+	/* Not required on sun4v architecture. */
 }
 
 /*ARGSUSED*/
@@ -1101,16 +1088,6 @@
 void
 cpu_kdi_init(kdi_t *kdi)
 {
-	/*
-	 * Any API negotiation this early in the boot will be unsuccessful.
-	 * Therefore firmware for Sun4v platforms that have incoherent I$ is
-	 * assumed to support pre-negotiated MEM_IFLUSH APIs. Successful
-	 * invocation of MEM_IFLUSH_ALL is a test for its availability.
-	 * Set a flag if successful, indicating its availability.
-	 */
-	if (hv_mem_iflush_all() == 0)
-		hsvc_kdi_mem_iflush_negotiated = B_TRUE;
-
 	kdi->kdi_flush_caches = kdi_flush_caches;
 	kdi->mkdi_cpu_init = kdi_cpu_init;
 	kdi->mkdi_cpu_ready_iter = kdi_cpu_ready_iter;
--- a/usr/src/uts/sun4v/pcbe/rock_pcbe.c	Thu Aug 06 17:19:00 2009 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2316 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Rock Performance Counter Back End
- */
-
-#include <sys/cpuvar.h>
-#include <sys/systm.h>
-#include <sys/cmn_err.h>
-#include <sys/cpc_impl.h>
-#include <sys/cpc_pcbe.h>
-#include <sys/modctl.h>
-#include <sys/machsystm.h>
-#include <sys/sdt.h>
-#include <sys/hypervisor_api.h>
-#include <sys/rock_hypervisor_api.h>
-#include <sys/hsvc.h>
-
-#define	NT_END			0xFF
-
-/* Counter Types */
-#define	NUM_PCBE_COUNTERS	6
-#define	RK_PERF_CYC		0x0100
-#define	RK_PERF_INSTR		0x0200
-#define	RK_PERF_L2		0x0400
-#define	RK_PERF_MMU		0x0800
-#define	RK_PERF_YANK		0x2000
-#define	RK_PERF_SIBLK		0x4000
-#define	RK_PERF_LVLK		0x8000
-#define	RK_PERF_SPEC		0x1000	/* Reserved */
-
-#define	NORMAL_COUNTER		0x1
-#define	SYNTHETIC_COUNTER	0x2
-
-/* ASI_PERF_MMU_CNT_FILTER TXN bits */
-#define	ASI_PERF_MMU_CNT_FILTER_UTLB_HITS	0x1
-#define	ASI_PERF_MMU_CNT_FILTER_UTLB_MISS	0x2
-#define	ASI_PERF_MMU_CNT_FILTER_DATA_ACCESS	0x8
-#define	ASI_PERF_MMU_CNT_FILTER_INSTR_ACCESS	0x10
-#define	ASI_PERF_MMU_CNT_FILTER_EA_VIRTUAL	0x20
-#define	ASI_PERF_MMU_CNT_FILTER_EA_REAL		0x40
-
-#define	MMU_ALL_TXNS		(ASI_PERF_MMU_CNT_FILTER_UTLB_HITS | \
-				ASI_PERF_MMU_CNT_FILTER_UTLB_MISS | \
-				ASI_PERF_MMU_CNT_FILTER_DATA_ACCESS | \
-				ASI_PERF_MMU_CNT_FILTER_INSTR_ACCESS | \
-				ASI_PERF_MMU_CNT_FILTER_EA_VIRTUAL | \
-				ASI_PERF_MMU_CNT_FILTER_EA_REAL)
-
-#define	MMU_ITLB_MISS		(ASI_PERF_MMU_CNT_FILTER_EA_REAL | \
-				ASI_PERF_MMU_CNT_FILTER_EA_VIRTUAL | \
-				ASI_PERF_MMU_CNT_FILTER_INSTR_ACCESS | \
-				ASI_PERF_MMU_CNT_FILTER_UTLB_MISS | \
-				ASI_PERF_MMU_CNT_FILTER_UTLB_HITS)
-
-#define	MMU_DTLB_MISS		(ASI_PERF_MMU_CNT_FILTER_EA_REAL | \
-				ASI_PERF_MMU_CNT_FILTER_EA_VIRTUAL | \
-				ASI_PERF_MMU_CNT_FILTER_DATA_ACCESS | \
-				ASI_PERF_MMU_CNT_FILTER_UTLB_MISS | \
-				ASI_PERF_MMU_CNT_FILTER_UTLB_HITS)
-
-#define	MMU_UTLB_MISS		(ASI_PERF_MMU_CNT_FILTER_EA_REAL | \
-				ASI_PERF_MMU_CNT_FILTER_EA_VIRTUAL | \
-				ASI_PERF_MMU_CNT_FILTER_INSTR_ACCESS | \
-				ASI_PERF_MMU_CNT_FILTER_DATA_ACCESS | \
-				ASI_PERF_MMU_CNT_FILTER_UTLB_MISS)
-
-#define	MMU_UTLB_HIT		(ASI_PERF_MMU_CNT_FILTER_EA_REAL | \
-				ASI_PERF_MMU_CNT_FILTER_EA_VIRTUAL | \
-				ASI_PERF_MMU_CNT_FILTER_INSTR_ACCESS | \
-				ASI_PERF_MMU_CNT_FILTER_DATA_ACCESS | \
-				ASI_PERF_MMU_CNT_FILTER_UTLB_HITS)
-
-#define	MMU_ITLB_MISS_UTLB_HIT	(ASI_PERF_MMU_CNT_FILTER_EA_REAL | \
-				ASI_PERF_MMU_CNT_FILTER_EA_VIRTUAL | \
-				ASI_PERF_MMU_CNT_FILTER_INSTR_ACCESS | \
-				ASI_PERF_MMU_CNT_FILTER_UTLB_HITS)
-
-#define	MMU_ITLB_MISS_UTLB_MISS	(ASI_PERF_MMU_CNT_FILTER_EA_REAL | \
-				ASI_PERF_MMU_CNT_FILTER_EA_VIRTUAL | \
-				ASI_PERF_MMU_CNT_FILTER_INSTR_ACCESS | \
-				ASI_PERF_MMU_CNT_FILTER_UTLB_MISS)
-
-#define	MMU_DTLB_MISS_UTLB_HIT	(ASI_PERF_MMU_CNT_FILTER_EA_REAL | \
-				ASI_PERF_MMU_CNT_FILTER_EA_VIRTUAL | \
-				ASI_PERF_MMU_CNT_FILTER_DATA_ACCESS | \
-				ASI_PERF_MMU_CNT_FILTER_UTLB_HITS)
-
-#define	MMU_DTLB_MISS_UTLB_MISS	(ASI_PERF_MMU_CNT_FILTER_EA_REAL | \
-				ASI_PERF_MMU_CNT_FILTER_EA_VIRTUAL | \
-				ASI_PERF_MMU_CNT_FILTER_DATA_ACCESS | \
-				ASI_PERF_MMU_CNT_FILTER_UTLB_MISS)
-
-/*
- * These values will be loaded to nametable.bits which is a 32 bit number.
- * Please see the description of bits in nametable struct. If the counters
- * are a part of different pic, then we can re-use GROUP and TYPE.
- */
-#define	SYN_BIT			((uint32_t)1 << 31)	/* Set bit 32 */
-#define	GROUP_MASK		0xFFF000		/* Bits 12-23 */
-#define	ID_TO_GROUP(GROUP_ID)	((GROUP_ID)<<12)
-#define	GROUP(SYN_COUNTER)	((SYN_COUNTER) & GROUP_MASK)
-#define	TYPE(SYN_COUNTER)   ((SYN_COUNTER) & 0x000FFF)	/* Bits 0-12 */
-
-/* Synthetic counter types */
-#define	L2_GROUP_DS		ID_TO_GROUP(0)
-#define	DS_DRAM		0x0	/* From PRM */
-#define	DS_L3		0x1	/*   ditto  */
-#define	DS_OTHER_L2	0x2	/*   ditto  */
-#define	DS_LOCAL_L2	0x3	/*   ditto  */
-
-#define	L2_DS_DRAM		(SYN_BIT | L2_GROUP_DS | DS_DRAM)
-#define	L2_DS_L3		(SYN_BIT | L2_GROUP_DS | DS_L3)
-#define	L2_DS_OTHER_L2		(SYN_BIT | L2_GROUP_DS | DS_OTHER_L2)
-#define	L2_DS_LOCAL_L2		(SYN_BIT | L2_GROUP_DS | DS_LOCAL_L2)
-
-#define	L2_GROUP_TXN_MISS	ID_TO_GROUP(1)
-#define	TXN_LD			0x3	/* From PRM */
-#define	TXN_ST			0x18	/*   ditto  */
-#define	L2_TXN_LD_MISS		(SYN_BIT | L2_GROUP_TXN_MISS | TXN_LD)
-#define	L2_TXN_ST_MISS		(SYN_BIT | L2_GROUP_TXN_MISS | TXN_ST)
-
-#define	L2_GROUP_TXN_HIT	ID_TO_GROUP(2)
-#define	L2_TXN_LD_HIT		(SYN_BIT | L2_GROUP_TXN_HIT | TXN_LD)
-#define	L2_TXN_ST_HIT		(SYN_BIT | L2_GROUP_TXN_HIT | TXN_ST)
-
-#define	L2_GROUP_EVT		ID_TO_GROUP(3)
-#define	EVT_L2_MISS		0x8	/* From PRM */
-#define	EVT_L2_PEND_ST		0x2	/*   ditto  */
-#define	EVT_L2_PRIOR_MISS	0x1	/*   ditto  */
-#define	EVT_L2_NOEVENTS		0x0	/*   ditto  */
-#define	L2_HIT			0
-#define	L2_MISS			1
-
-#define	L2_EVT_HIT		(SYN_BIT | L2_GROUP_EVT | L2_HIT)
-#define	L2_EVT_MISS		(SYN_BIT | L2_GROUP_EVT | L2_MISS)
-
-/* Instruction types. Corresponds to ASI_PERF_IS_INFO.TYP */
-#define	I_GROUP_TYPE		ID_TO_GROUP(0)
-#define	TYPE_HELPER		(1<<0)
-#define	TYPE_LD			(1<<1)
-#define	TYPE_ST			(1<<2)
-#define	TYPE_CTI		(1<<3)
-#define	TYPE_FP			(1<<4)
-#define	TYPE_INT_ALU		(1<<5)
-#define	TYPE_CMPLX_ALU		(1<<6)
-
-#define	INSTR_TYPE_LD		(SYN_BIT | I_GROUP_TYPE | TYPE_LD)
-#define	INSTR_TYPE_ST		(SYN_BIT | I_GROUP_TYPE | TYPE_ST)
-#define	INSTR_TYPE_CTI		(SYN_BIT | I_GROUP_TYPE | TYPE_CTI)
-#define	INSTR_TYPE_FP		(SYN_BIT | I_GROUP_TYPE | TYPE_FP)
-
-/* Execution modes. Corresponds to ASI_PERF_IS_INFO.MODE */
-#define	I_GROUP_MODE		ID_TO_GROUP(1)
-#define	MODE_NOR		0x0	/* From PRM */
-#define	MODE_OOO		0x1	/*   ditto  */
-#define	MODE_EXE		0x2	/*   ditto  */
-#define	MODE_DLY		0x3	/*   ditto  */
-#define	MODE_DEF		0x4	/*   ditto  */
-#define	MODE_HWS		0x5	/*   ditto  */
-
-#define	INSTR_MODE_NOR		(SYN_BIT | I_GROUP_MODE | MODE_NOR)
-#define	INSTR_MODE_OOO		(SYN_BIT | I_GROUP_MODE | MODE_OOO)
-#define	INSTR_MODE_EXE		(SYN_BIT | I_GROUP_MODE | MODE_EXE)
-#define	INSTR_MODE_DLY		(SYN_BIT | I_GROUP_MODE | MODE_DLY)
-#define	INSTR_MODE_DEF		(SYN_BIT | I_GROUP_MODE | MODE_DEF)
-#define	INSTR_MODE_HWS		(SYN_BIT | I_GROUP_MODE | MODE_HWS)
-
-/* Instruction events. Corresponds to ASI_PERF_IS_INFO.EVT */
-#define	I_GROUP_EVT		ID_TO_GROUP(2)
-
-/* Bit numbers from PRM  */
-#define	EVT_DC_MISS		(1<<0)
-#define	EVT_PRIOR_MISS		(1<<1)
-#define	EVT_DTLB_MISS		(1<<2)
-#define	EVT_LDB_FULL		(1<<3)
-#define	EVT_STB_FULL		(1<<4)
-#define	EVT_FE_STALL		(1<<5)
-#define	EVT_FROM_DQ		(1<<6)
-#define	EVT_CORRECT_BP		(1<<7)
-#define	EVT_BYPASS_RAW		(1<<8)
-#define	EVT_NONBYPASS_RAW	(1<<9)
-#define	EVT_CTI_TAKEN		(1<<10)
-#define	EVT_FAILED_SPEC		(1<<11)
-
-#define	INSTR_EVT_DC_MISS	(SYN_BIT | I_GROUP_EVT | EVT_DC_MISS)
-#define	INSTR_EVT_PRIOR_MISS	(SYN_BIT | I_GROUP_EVT | EVT_PRIOR_MISS)
-#define	INSTR_EVT_DTLB_MISS	(SYN_BIT | I_GROUP_EVT | EVT_DTLB_MISS)
-#define	INSTR_EVT_LDB_FULL	(SYN_BIT | I_GROUP_EVT | EVT_LDB_FULL)
-#define	INSTR_EVT_STB_FULL	(SYN_BIT | I_GROUP_EVT | EVT_STB_FULL)
-#define	INSTR_EVT_FE_STALL	(SYN_BIT | I_GROUP_EVT | EVT_FE_STALL)
-#define	INSTR_EVT_FROM_DQ	(SYN_BIT | I_GROUP_EVT | EVT_FROM_DQ)
-#define	INSTR_EVT_CORRECT_BP	(SYN_BIT | I_GROUP_EVT | EVT_CORRECT_BP)
-#define	INSTR_EVT_BYPASS_RAW	(SYN_BIT | I_GROUP_EVT | EVT_BYPASS_RAW)
-#define	INSTR_EVT_NONBYPASS_RAW	(SYN_BIT | I_GROUP_EVT | EVT_NONBYPASS_RAW)
-#define	INSTR_EVT_CTI_TAKEN	(SYN_BIT | I_GROUP_EVT | EVT_CTI_TAKEN)
-#define	INSTR_EVT_FAILED_SPEC	(SYN_BIT | I_GROUP_EVT | EVT_FAILED_SPEC)
-
-/*
- * Synthetic counters to count MCCDESR error events
- * All the events are mutually exclusive therefore can be counted
- * simultaneously. Hence each one is a different pic. Therefore
- * there is no need to have GROUP or TYPE for these counters.
- */
-#define	MCCDESR_YANK		(SYN_BIT)
-#define	MCCDESR_SIBLK		(SYN_BIT)
-#define	MCCDESR_LVLK		(SYN_BIT)
-
-/* Number of samples to be taken before Performance Event Trap is generated */
-/* Maximum frequencies that can be configured */
-#define	INSTR_SAM_MAX_FREQ	0x3FF	/* 10 bits */
-#define	L2_SAM_MAX_FREQ		0xFFFF	/* 16 bits */
-#define	MMU_SAM_MAX_FREQ	0xFFFF	/* 16 bits */
-
-/* Minimum frequencies that should be configured to prevent DOS */
-#define	INSTR_SAM_MIN_FREQ	100
-#define	L2_SAM_MIN_FREQ		250
-#define	MMU_SAM_MIN_FREQ	250
-
-/* Default frequencies that are configured */
-#define	INSTR_SAM_DEF_FREQ	250
-#define	L2_SAM_DEF_FREQ		1000
-
-/* Number of bits in the hardware for the counter */
-#define	CYC_COUNTER_BITS	18
-#define	INSTR_COUNTER_BITS	18
-#define	L2_COUNTER_BITS		48
-#define	MMU_COUNTER_BITS	48
-#define	YANK_COUNTER_BITS	64
-#define	SIBLK_COUNTER_BITS	64
-#define	LVLK_COUNTER_BITS	64
-
-#define	RK_PERF_COUNT_TOE_SHIFT	(63)
-
-#define	STATE_CONFIGURED	0x1
-#define	STATE_PROGRAMMED	0x2
-#define	STATE_STOPPED		0x4
-#define	STATE_RELEASED		0x8
-#define	UNINITIALIZED		2 /* should be other than 0/1 */
-#define	TLZ			1 /* Do not make it zero */
-#define	TLNZ			2
-
-#define	CPU_REF_URL " Documentation for Sun processors can be found at: " \
-			"http://www.sun.com/processors/manuals"
-
-#define	MIN_RINGBUF_ENTRIES	100
-
-#define	RINGBUF_GET_HEAD(RB)		\
-	(uint64_t *)((uint64_t)(&RB->va_values) + RB->head);
-
-#define	RINGBUF_GET_TAIL(RB)		\
-	(uint64_t *)((uint64_t)(&RB->va_values) + RB->tail);
-
-#define	RINGBUF_SET_HEAD(RB, PTR)					\
-	RB->head = (uint64_t)PTR - (uint64_t)(&RB->va_values);		\
-	RB->hwm = RB->head + (RB->size >> 1);				\
-	if (RB->hwm >= RB->size)					\
-		RB->hwm -= RB->size;
-
-#define	RINGBUF_MOVE_HEAD(RB, PTR, SAMPLE_SZ)				\
-	PTR = (uint64_t *)((uint64_t)PTR + SAMPLE_SZ);			\
-	if (PTR >= (uint64_t *)((uint64_t)(&RB->va_values) + RB->size))	\
-		PTR = (uint64_t *)&RB->va_values;
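
A user-level model of the head/high-water-mark bookkeeping behind the
RINGBUF_* macros above: head, tail and hwm are byte offsets into the value
area, and the hwm trails the head by half the buffer so the hypervisor can
interrupt the guest when the ring is half full. The struct and sizes here are
illustrative only.

#include <stdio.h>
#include <stdint.h>

typedef struct {
	uint32_t head, tail, size, hwm;	/* byte offsets, as in the pcbe */
} rb_t;

static void
rb_set_head(rb_t *rb, uint32_t new_head)
{
	rb->head = new_head;
	rb->hwm = rb->head + (rb->size >> 1);
	if (rb->hwm >= rb->size)
		rb->hwm -= rb->size;		/* wrap the high-water mark */
}

static void
rb_move_head(rb_t *rb, uint32_t sample_sz)
{
	uint32_t h = rb->head + sample_sz;

	if (h >= rb->size)
		h = 0;				/* wrap to start of values */
	rb_set_head(rb, h);
}

int
main(void)
{
	rb_t rb = { 0, 0, 4096, 0 };

	rb_move_head(&rb, 64);
	printf("head=%u hwm=%u\n", rb.head, rb.hwm);	/* head=64 hwm=2112 */
	return (0);
}
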
-
-#define	MAKE_MASK(NBITS, SHIFT)	(((unsigned long)(1<<(NBITS))-1)<<SHIFT)
-
-#define	COUNTER_MAX(_p)	((int64_t)((1ULL << (_p->counter_bits - 1)) - 1))
-#define	COUNTER_MIN(_p)	((int64_t)-(COUNTER_MAX(_p)))
-#define	COUNTER_MASK(_p)	(bitmask(_p->counter_bits))
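
COUNTER_MAX/MIN/MASK treat each PIC as a signed field of counter_bits bits. A
quick stand-alone check for the 18-bit instruction counter
(INSTR_COUNTER_BITS), with bitmask() re-stated locally for the sketch since
its body lives elsewhere in the file:

#include <stdio.h>
#include <stdint.h>

static uint64_t
bitmask(uint8_t bits)
{
	return (bits >= 64 ? ~0ULL : (1ULL << bits) - 1);
}

int
main(void)
{
	uint8_t counter_bits = 18;
	int64_t max = (int64_t)((1ULL << (counter_bits - 1)) - 1);
	int64_t min = -max;

	/* prints: range [-131071, 131071], mask 0x3ffff */
	printf("range [%lld, %lld], mask 0x%llx\n", (long long)min,
	    (long long)max, (unsigned long long)bitmask(counter_bits));
	return (0);
}
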
-
-/* Global Structures and typedefs */
-struct	_rk_pcbe_ringbuf {	/*	  INIT-ER	WRITER	  READER */
-	uint32_t	head;	/* offset  guest	guest	  guest	 */
-	uint32_t	tail;	/* offset  guest	hv	   both	 */
-	uint32_t	size;	/* bytes   guest	n/a	   both	 */
-	uint32_t	hwm;	/* bytes   guest	hv	  guest  */
-	uint64_t	va_values; /*	   guest	hv	  guest  */
-};
-
-typedef	struct _rk_pcbe_ringbuf rk_pcbe_ringbuf_t;
-
-typedef	struct _sampler {
-	rk_pcbe_ringbuf_t *ring_buffer;	/* Ring buffer start address */
-	uint64_t	synthetic_pic;
-	uint32_t	frequency;	/* Sampling Frequency */
-	uint32_t	syn_counter;	/* Synthetic Counter Type */
-	uint32_t	sample_size;	/* Size of each sample in bytes */
-	uint32_t	flags;		/* instr sampler: priv */
-	uint8_t		tl;		/* Trap Level Filtering */
-	uint8_t		nohws;		/* Filter out HW Scouting samples */
-} sampler_t;
-
-typedef struct _rk_pcbe_config {
-	uint8_t		pcbe_picno;	/* 0-6:instr,l2,mmu,yank,siblk,lvlk */
-	uint8_t		counter_bits;	/* Number of counter bits */
-	uint8_t		counter_type;	/* Normal or Synthetic */
-	uint8_t		toe;		/* Trap on Enable */
-	uint32_t	counter;	/* Counter name */
-	uint32_t	src_type;	/* Strand, Strands, SIU, MMU */
-	uint32_t	flags;		/* instr counter:priv. l2,mmu:Xn */
-	uint64_t	pcbe_pic;	/* PIC counter value */
-	uint8_t		inuse;		/* pic in use or not */
-	uint8_t		state;		/* Current state of the pic */
-	processorid_t	cpu;		/* CPU associated with this pic */
-	sampler_t	sampler;
-#ifdef	RKPCBE_DBG
-	char		name[64];	/* Human readable counter name */
-#endif
-} rk_pcbe_config_t;
-
-/* Function Prototypes for those that are invoked using rk_pcbe_ops */
-static int rk_pcbe_init(void);
-static int rk_pcbe_fini(void);
-static uint_t rk_pcbe_ncounters(void);
-static const char *rk_pcbe_impl_name(void);
-static const char *rk_pcbe_cpuref(void);
-static char *rk_pcbe_list_events(uint_t picnum);
-static char *rk_pcbe_list_attrs(void);
-static uint64_t rk_pcbe_event_coverage(char *event);
-static uint64_t rk_pcbe_overflow_bitmap(void);
-static int rk_pcbe_configure(uint_t picnum, char *event, uint64_t preset,
-    uint32_t flags, uint_t nattrs, kcpc_attr_t *attrs, void **data,
-    void *token);
-static void rk_pcbe_program(void *token);
-static void rk_pcbe_allstop(void);
-static void rk_pcbe_sample(void *token);
-static void rk_pcbe_free(void *config);
-
-pcbe_ops_t rk_pcbe_ops = {
-	PCBE_VER_1,
-	CPC_CAP_OVERFLOW_INTERRUPT,
-	rk_pcbe_ncounters,
-	rk_pcbe_impl_name,
-	rk_pcbe_cpuref,
-	rk_pcbe_list_events,
-	rk_pcbe_list_attrs,
-	rk_pcbe_event_coverage,
-	rk_pcbe_overflow_bitmap,
-	rk_pcbe_configure,
-	rk_pcbe_program,
-	rk_pcbe_allstop,
-	rk_pcbe_sample,
-	rk_pcbe_free
-};
-
-/*
- * bits:
- *
- * |     31     |30        24|23      12|11      0
- * | Syn/Normal |    Rsvd    |  Group   |  Type  |
- */
-struct nametable {
-	const uint32_t	bits;
-	const char	*name;
-};
-
-/* Instruction Counter. picno: 0 */
-static const struct nametable Rock_names0[] = {
-	{0x1, "Instr_All"},
-	/* Synthetic counters */
-	{INSTR_MODE_NOR, "Instr_Normal"},
-	{INSTR_MODE_OOO, "Instr_Out_Of_Order"},
-	{INSTR_MODE_EXE, "Instr_Execute_Ahead"},
-	{INSTR_MODE_DLY, "Instr_Delay"},
-	{INSTR_MODE_DEF, "Instr_Deferred"},
-	{INSTR_MODE_HWS, "Instr_Scout"},
-
-	{INSTR_TYPE_LD,  "Instr_Load"},
-	{INSTR_TYPE_ST,  "Instr_Store"},
-	{INSTR_TYPE_CTI, "Instr_Branch"},
-	{INSTR_TYPE_FP,  "Instr_Float"},
-
-	{INSTR_EVT_DC_MISS,	"Instr_Dcache_Miss"},
-	{INSTR_EVT_PRIOR_MISS,	"Instr_Prior_Miss"},
-	{INSTR_EVT_DTLB_MISS,	"Instr_Dtlb_Miss"},
-	{INSTR_EVT_LDB_FULL,	"Instr_Loadbuf_Full"},
-	{INSTR_EVT_STB_FULL,	"Instr_Storebuf_Full"},
-	{INSTR_EVT_FE_STALL,	"Instr_Stall"},
-	{INSTR_EVT_FROM_DQ,	"Instr_DQ"},
-	{INSTR_EVT_CORRECT_BP,	"Instr_Correct_Branch_Predict"},
-	{INSTR_EVT_BYPASS_RAW,	"Instr_Bypass_Raw"},
-	{INSTR_EVT_NONBYPASS_RAW, "Instr_Nonbypass_Raw"},
-	{INSTR_EVT_CTI_TAKEN, 	"Instr_Branch_Taken"},
-	{INSTR_EVT_FAILED_SPEC,	"Instr_Failed_Spec"},
-
-	{NT_END, ""}
-};
-
-/* L2 Counters. picno: 1 */
-static const struct nametable Rock_names1[] = {
-	{0x1,			"L2_Icache_Load"},
-	{0x2,			"L2_Dcache_Load"},
-	{0x4,			"L2_Instr_Prefetch"},
-	{0x8,			"L2_Store_Prefetch"},
-	{0x10,			"L2_Store"},
-	{0x20,			"L2_Atomic_Ops"},
-	{0x40,			"L2_Flush"},
-	/* Synthetic counters */
-	{L2_DS_L3,		"L2_Load_From_L3"},
-	{L2_DS_DRAM,		"L2_Load_From_Dram"},
-	{L2_DS_OTHER_L2,	"L2_Load_From_Other_L2"},
-
-	{L2_TXN_LD_MISS,	"L2_Load_Miss"},
-	{L2_TXN_ST_MISS,	"L2_Store_Miss"},
-	{L2_TXN_LD_HIT,		"L2_Load_Hit"},
-	{L2_TXN_ST_HIT,		"L2_Store_Hit"},
-
-	{L2_EVT_HIT,		"L2_Hit"},
-	{L2_EVT_MISS,		"L2_Miss"},
-	{NT_END, ""}
-};
-
-/* MMU Counters. picno: 2 */
-static const struct nametable Rock_names2[] = {
-	{MMU_ALL_TXNS,			"MMU_All"},
-	{MMU_ITLB_MISS,			"MMU_Itlb_Miss"},
-	{MMU_DTLB_MISS,			"MMU_Dtlb_Miss"},
-	{MMU_UTLB_MISS,			"MMU_Utlb_Miss"},
-	{MMU_UTLB_HIT,			"MMU_Utlb_Hit"},
-	{MMU_ITLB_MISS_UTLB_MISS,	"MMU_I_Utlb_Miss"},
-	{MMU_ITLB_MISS_UTLB_HIT,	"MMU_I_Utlb_Hit"},
-	{MMU_DTLB_MISS_UTLB_MISS,	"MMU_D_Utlb_Miss"},
-	{MMU_DTLB_MISS_UTLB_HIT,	"MMU_D_Utlb_Hit"},
-	{NT_END, ""}
-};
-
-/* YANK Counter. picno: 3 */
-static const struct nametable Rock_names3[] = {
-	{MCCDESR_YANK,			"Yank"},
-	{NT_END, ""}
-};
-
-/* SIBLK Counter. picno: 4 */
-static const struct nametable Rock_names4[] = {
-	{MCCDESR_SIBLK,			"Siblk"},
-	{NT_END, ""}
-};
-
-/* LVLK Counter. picno: 5 */
-static const struct nametable Rock_names5[] = {
-	{MCCDESR_LVLK,			"Lvlk"},
-	{NT_END, ""}
-};
-
-static const struct nametable *Rock_names[NUM_PCBE_COUNTERS] = {
-	Rock_names0,
-	Rock_names1,
-	Rock_names2,
-	Rock_names3,
-	Rock_names4,
-	Rock_names5
-};
-
-extern	char	cpu_module_name[];
-uint32_t num_ringbuf_entries = 500; /* Should be an EVEN # */
-static const struct nametable **events;
-static char *pic_events[NUM_PCBE_COUNTERS];
-static rk_pcbe_config_t *active_pics[NUM_PCBE_COUNTERS][NCPU];
-static	boolean_t	rock_pcbe_hsvc_available = B_TRUE;
-
-static	char	*rock_name;
-static	char	rock_cpuref[256];
-static	char	pcbe_module_name[64] = "pcbe.";
-
-static hsvc_info_t rock_pcbe_hsvc = {
-	HSVC_REV_1,		/* HSVC rev num */
-	NULL,			/* Private */
-	HSVC_GROUP_RKPERF,	/* Requested API Group */
-	ROCK_HSVC_MAJOR,	/* Requested Major */
-	ROCK_HSVC_MINOR,	/* Requested Minor */
-	pcbe_module_name	/* Module name */
-};
-
-/* Function Definitions */
-static struct modlpcbe modlpcbe = {
-	&mod_pcbeops,
-	"Perf Counters v1.1",
-	&rk_pcbe_ops
-};
-
-static struct modlinkage modl = {
-	MODREV_1,
-	&modlpcbe,
-};
-
-/*
- * Below two structures are used to pass data from program_*_sampler() to
- * program_a_sampler()
- */
-struct	asi {
-	uint64_t	va;
-	uint64_t	value;
-};
-
-typedef struct	_s {
-	char		name[32];	/* User friendly name */
-	int		asi_config_num;	/* Num of ASIs to be configured */
-	struct	asi	asi_config[10];	/* ASIs that gets configured */
-	int		asi_sample_num;	/* Num of data return ASIs */
-	uint64_t	asi_sample[10];	/* Data return ASIs when sampled */
-} program_sampler_data_t;
-
-/* Local Function prototypes */
-static void rk_pcbe_stop_synthetic(rk_pcbe_config_t *pic);
-static void rk_pcbe_release(rk_pcbe_config_t *pic);
-static void rk_pcbe_free_synthetic(rk_pcbe_config_t *pic);
-
-static int rk_pcbe_program_normal(rk_pcbe_config_t *pic);
-static int rk_pcbe_program_synthetic(rk_pcbe_config_t *pic);
-static int program_l2_sampler(rk_pcbe_config_t *pic);
-static int program_instr_sampler(rk_pcbe_config_t *pic);
-static int program_a_sampler(rk_pcbe_config_t *pic,
-			program_sampler_data_t *sdata);
-
-static int rk_pcbe_sample_internal(rk_pcbe_config_t *pic, uint64_t *diffp);
-static int rk_pcbe_sample_synthetic(rk_pcbe_config_t *pic, int64_t *diffp);
-static int sample_l2_sampler(rk_pcbe_config_t *pic, int64_t *diffp);
-static int sample_instr_sampler(rk_pcbe_config_t *pic, int64_t *diffp);
-static int sample_mccdesr(rk_pcbe_config_t *pic, int64_t *diffp);
-static int synthesize_sample_count(rk_pcbe_config_t *pic, uint64_t sample_count,
-	uint64_t sample_hit_count, char *name, int64_t *diffp);
-
-static int alloc_ringbuffer(rk_pcbe_config_t *pic, uint32_t size,
-							uint32_t num_samples);
-static void free_ringbuffer(rk_pcbe_config_t *pic);
-static void print_hv_error(uint64_t rc, int *cntp, char *funcname,
-					rk_pcbe_config_t *pic);
-static	void set_string_constants(void);
-static	uint64_t bitmask(uint8_t);
-
-#ifdef	RKPCBE_DBG
-static void print_pic(rk_pcbe_config_t *pic, char *heading);
-static void set_pic_name(rk_pcbe_config_t *pic);
-/* lock for print clarity */
-static kmutex_t print_pic_lock;
-#define	PRINT_PIC(pic, heading)	\
-	print_pic(pic, heading)
-#define	DBG_PRINT(_z) printf _z
-#else
-#define	PRINT_PIC(pic, heading) (void)0
-#define	DBG_PRINT(ignore) (void)0
-#endif
-
-int
-_init(void)
-{
-	if (rk_pcbe_init() != 0)
-		return (ENOTSUP);
-	return (mod_install(&modl));
-}
-
-int
-_fini(void)
-{
-	if (rk_pcbe_fini() != 0)
-		return (EBUSY);
-	return (mod_remove(&modl));
-}
-
-int
-_info(struct modinfo *mi)
-{
-	return (mod_info(&modl, mi));
-}
-
-static int
-rk_pcbe_init(void)
-{
-	const struct 	nametable	*n;
-	int		i, status, j;
-	size_t		size;
-	uint64_t	rock_pcbe_hsvc_sup_minor;
-
-	set_string_constants();
-	/*
-	 * Validate API version for Rock pcbe hypervisor services
-	 */
-	status = hsvc_register(&rock_pcbe_hsvc, &rock_pcbe_hsvc_sup_minor);
-	if ((status != 0) || (rock_pcbe_hsvc_sup_minor <
-	    (uint64_t)ROCK_HSVC_MINOR)) {
-		cmn_err(CE_WARN, "%s cannot negotiate hypervisor services: "
-		    "major: 0x%lx minor: 0x%lx group: 0x%x errno: %d",
-		    pcbe_module_name, rock_pcbe_hsvc.hsvc_major,
-		    rock_pcbe_hsvc.hsvc_minor, HSVC_GROUP_RKPERF, status);
-		rock_pcbe_hsvc_available = B_FALSE;
-		return (-1);
-	}
-
-	events = Rock_names;
-	/*
-	 * Initialize the list of events for each PIC.
-	 * Do two passes: one to compute the size necessary and another
-	 * to copy the strings. Need room for event, comma, and NULL terminator.
-	 */
-	for (i = 0; i < NUM_PCBE_COUNTERS; i++) {
-		size = 0;
-		for (n = events[i]; n->bits != NT_END; n++)
-			size += strlen(n->name) + 1;
-		pic_events[i] = kmem_alloc(size + 1, KM_SLEEP);
-		*pic_events[i] = '\0';
-		for (n = events[i]; n->bits != NT_END; n++) {
-			(void) strcat(pic_events[i], n->name);
-			(void) strcat(pic_events[i], ",");
-		}
-		/*
-		 * Remove trailing comma.
-		 */
-		pic_events[i][size - 1] = '\0';
-
-		/* Initialize all active pics as NULL */
-		for (j = 0; j < NCPU; j++)
-			active_pics[i][j] = NULL;
-	}
-#ifdef	RKPCBE_DBG
-	mutex_init(&print_pic_lock, NULL, MUTEX_DRIVER,
-	    (void *)ipltospl(PIL_15));
-#endif
-	return (0);
-}
-
-static	int
-rk_pcbe_fini(void)
-{
-	return (0);
-}
-
-static uint_t
-rk_pcbe_ncounters(void)
-{
-	return (NUM_PCBE_COUNTERS);
-}
-
-static const char *
-rk_pcbe_impl_name(void)
-{
-	return (rock_name);
-}
-
-static const char *
-rk_pcbe_cpuref(void)
-{
-	return (rock_cpuref);
-}
-
-static char *
-rk_pcbe_list_events(uint_t picnum)
-{
-	ASSERT(picnum >= (uint_t)0 && picnum < cpc_ncounters);
-
-	return (pic_events[picnum]);
-}
-
-static char *
-rk_pcbe_list_attrs(void)
-{
-	/*
-	 * If no value is specified in the command line for the
-	 * attributes, then a default value of 1 is passed into
-	 * pcbe from cpc. Specifying a value as zero is as good as
-	 * not specifying it.
-	 * The 'source' attribute is the equivalent of 'single, shared,
-	 * siu, mmu' all put together. 'source' will take precedence
-	 * over the others.
-	 * Valid 'source' values are defined in rock_hypervisor_api.h.
-	 * If multiple flags need to be specified then the user has to
-	 * specify the bitwise OR of the flags he/she is interested in.
-	 * populate_pic_config validates the correctness of the flags
-	 * specified.
-	 * tl is a little odd. To consider instructions at
-	 * tl == 0, specify tl = TLZ on the command line;
-	 * tl > 0, specify tl = TLNZ on the command line.
-	 * The reason for this oddness: attr = 0 means neglect
-	 * that attr.
-	 */
-	return ("freq,source,single,shared,siu,mmu,nohws,tl,hpriv");
-}
-
-static const struct nametable *
-find_event(int picno, char *name)
-{
-	const struct nametable *n;
-
-	for (n = events[picno]; n->bits != NT_END; n++)
-		if (strcmp(name, n->name) == 0)
-			return (n);
-
-	return (NULL);
-}
-
-static uint64_t
-rk_pcbe_event_coverage(char *event)
-{
-	uint64_t	bitmap = 0;
-	int 		i;
-
-	/* There is no intersection of events between different PICs */
-	for (i = 0; i <  NUM_PCBE_COUNTERS; i++) {
-		if (find_event(i, event) != NULL) {
-			bitmap = 1 << i;
-			break;
-		}
-	}
-	return (bitmap);
-}
-
-static uint64_t
-rk_pcbe_overflow_bitmap(void)
-{
-	int 			i;
-	rk_pcbe_config_t	*pic;
-	uint64_t		ovf_bitmask = 0, ovf_cnt;
-
-	for (i = 0; i <  NUM_PCBE_COUNTERS; i++) {
-		pic = active_pics[i][CPU->cpu_id];
-
-		if (pic == NULL || pic->inuse != B_TRUE)
-			continue;
-
-		DBG_PRINT(("CPU-%d: Pic %s (#%d, cntr %X) overflowed\n",
-		    CPU->cpu_id, pic->name, pic->pcbe_picno, pic->counter));
-
-		/* Check if any of the active pics overflowed */
-		if (pic->counter_type == NORMAL_COUNTER) {
-			hv_rk_perf_count_overflow((uint64_t)(pic->counter |
-			    pic->src_type), &ovf_cnt);
-			if (ovf_cnt > 0)
-				pic->pcbe_pic += (0x1ULL << pic->counter_bits);
-		} else {
-		/*
-		 * Synthetic counters don't overflow, so we must have gotten
-		 * here because the ringbuffer is getting half-full or
-		 * one of the normal counters that is part of the synthetic
-		 * counter did overflow. Force cpc to call
-		 * rk_pcbe_sample_synthetic by setting ovf_cnt to 1. If
-		 * 0 were returned, cpc would print a WARNING message:
-		 * "WARNING: interrupt 0x80c at level 15 not serviced"
-		 */
-			ovf_cnt = B_TRUE;
-		}
-
-		if (ovf_cnt > 0)
-			ovf_bitmask |= (1 << pic->pcbe_picno);
-	}
-	return (ovf_bitmask);
-}
-
-/*
- * populate_pic_config
- *
- * Checks the validity of all the attributes and then updates flags
- * to reflect priv bits for Cycle and Instruction counters and
- * transaction bits for L2 and makes sure that flags is 0 for MMU.
- *
- * Along with validating the inputs, pic is populated with appropriate
- * values.
- *
- * Returns 0 on success and CPC_INVALID_ATTRIBUTE on failure.
- */
-static int
-populate_pic_config(uint_t picnum, uint_t nattrs, kcpc_attr_t *attrs,
-				uint32_t bits, rk_pcbe_config_t *pic)
-{
-	int 		i;
-	uint32_t	freq = 0;
-	uint32_t	*flagsp = &(pic->flags);
-	uint32_t	source = 0;
-
-	pic->pcbe_picno = (uint8_t)picnum;
-	pic->toe = B_TRUE;
-	pic->sampler.synthetic_pic = 0;
-	pic->sampler.ring_buffer = NULL;
-	pic->inuse = UNINITIALIZED;
-	pic->counter_type = ((bits & SYN_BIT) == 0) ? NORMAL_COUNTER :
-	    SYNTHETIC_COUNTER;
-
-	/*
-	 * Initialized to 0. If a valid source attribute is specified, then
-	 * the src_type field gets populated later, else it defaults to
-	 * HV_RK_PERF_SRC_STRAND.
-	 */
-	pic->src_type = 0;
-	/*
-	 * Initialized to zero. In all the fallthrough cases, this
-	 * is checked to determine if certain fields need to be
-	 * populated or not.
-	 */
-	pic->counter = 0;
-
-	/*
-	 * When synthetic counter's ring buffer reaches HWM, HV generates
-	 * PIC overflow trap to get guest's attention. This is not the same as
-	 * a hardware counter overflow. Size of the ring buffer is configurable
-	 * and since there is no definite size, CPC_OVF_NOTIFY_EMT flag has no
-	 * meaning wrt synthetic counters.
-	 */
-	if ((bits & SYN_BIT) && (*flagsp & CPC_OVF_NOTIFY_EMT))
-		return (CPC_PIC_NOT_CAPABLE);
-
-	/*
-	 * This flag is used by CPC to inform the application of a counter
-	 * overflow. It is of no use to PCBE.
-	 */
-	*flagsp &= ~(CPC_OVF_NOTIFY_EMT);
-
-	switch (picnum) {
-#define	PRIV_BITS_MASK	0x7
-#define	PRIV_BIT0_MASK	0x1
-#define	PRIV_BIT1_MASK	0x2
-#define	PRIV_BIT2_MASK	0x4
-
-		case 0:	/* Instruction Counter */
-			pic->counter = RK_PERF_INSTR;
-			pic->counter_bits = INSTR_COUNTER_BITS;
-
-			freq = INSTR_SAM_DEF_FREQ; /* Default Frequency */
-
-			for (i = 0; i < nattrs; i++) {
-				if ((strcmp(attrs[i].ka_name, "freq") == 0)) {
-					if ((bits & SYN_BIT) == 0 &&
-					    attrs[i].ka_val) {
-						return (CPC_INVALID_ATTRIBUTE);
-					}
-					freq = attrs[i].ka_val;
-				} else if ((strcmp(attrs[i].ka_name,
-				    "single") == 0) && attrs[i].ka_val)
-					pic->src_type |=
-					    HV_RK_PERF_SRC_STRAND;
-				else if ((strcmp(attrs[i].ka_name,
-				    "shared") == 0) && attrs[i].ka_val)
-					pic->src_type |=
-					    HV_RK_PERF_SRC_STRAND_M;
-				else if ((strcmp(attrs[i].ka_name,
-				    "hpriv") == 0) && attrs[i].ka_val)
-					*flagsp |= CPC_COUNT_HV;
-				else if ((strcmp(attrs[i].ka_name,
-				    "source") == 0) && attrs[i].ka_val)
-					source = attrs[i].ka_val &
-					    HV_RK_PERF_SRC_MASK;
-				else if ((strcmp(attrs[i].ka_name,
-				    "nohws") == 0) && attrs[i].ka_val) {
-					if (bits & SYN_BIT)
-						pic->sampler.nohws = B_TRUE;
-					else if (attrs[i].ka_val)
-						return (CPC_INVALID_ATTRIBUTE);
-				} else if ((strcmp(attrs[i].ka_name,
-				    "tl") == 0) && attrs[i].ka_val) {
-					if (bits & SYN_BIT) {
-						pic->sampler.tl =
-						    (uint8_t)attrs[i].ka_val;
-					} else if (attrs[i].ka_val)
-						return (CPC_INVALID_ATTRIBUTE);
-				} else {
-					if (attrs[i].ka_val)
-						return (CPC_INVALID_ATTRIBUTE);
-				}
-			}
-
-			if (source) {
-				if (source & (HV_RK_PERF_SRC_SIU |
-				    HV_RK_PERF_SRC_MMU))
-					return (CPC_INVALID_ATTRIBUTE);
-				pic->src_type = source;
-			}
-
-			if (pic->src_type == 0)
-				pic->src_type = HV_RK_PERF_SRC_STRAND;
-
-			/*
-			 * hpriv, sys, user are sent as bits 3, 2, 1 from kcpc.
-			 * They are maintained by PCBE as bits 2, 1, & 0.
-			 */
-			*flagsp >>= 1;
-			*flagsp &= PRIV_BITS_MASK;
-			if (bits & SYN_BIT) {
-				pic->sampler.flags = *flagsp;
-				pic->sampler.syn_counter = bits;
-				if (freq > INSTR_SAM_MAX_FREQ) {
-					cmn_err(CE_NOTE, "CPU-%d: freq set "
-					    "> MAX. Resetting to %d",
-					    CPU->cpu_id, INSTR_SAM_MAX_FREQ);
-					freq = INSTR_SAM_MAX_FREQ;
-				}
-				if (freq < INSTR_SAM_MIN_FREQ) {
-					cmn_err(CE_NOTE, "CPU-%d: freq set "
-					    "< MIN. Resetting to %d",
-					    CPU->cpu_id, INSTR_SAM_MIN_FREQ);
-					freq = INSTR_SAM_MIN_FREQ;
-				}
-				pic->sampler.frequency = freq;
-			}
-			/*
-			 * When programming counter priv bits should be
-			 * 0, 1, & 2, i.e., in reverse order. Therefore swap
-			 * bits 2 & 0.
-			 */
-			*flagsp = ((*flagsp & PRIV_BIT0_MASK) << 2) |
-			    ((*flagsp & PRIV_BIT2_MASK) >> 2) |
-			    (*flagsp & PRIV_BIT1_MASK);
-			break;
-		case 1:	/* L2 counter */
-			/*
-			 * nouser and sys are also invalid attributes for L2
-			 * and MMU counters. If user has not specified any
-			 * attributes then *flagsp contains CPC_COUNT_USER.
-			 * Any priv attrs are not applicable for L2 counters.
-			 */
-			if (*flagsp != CPC_COUNT_USER)
-				return (CPC_INVALID_ATTRIBUTE);
-
-			pic->counter_bits = L2_COUNTER_BITS;
-			if ((bits & SYN_BIT) == 0) {
-				/*
-				 * Normal counter:
-				 * Find the attributes for L2 Counter.
-				 */
-				for (i = 0; i < nattrs; i++) {
-					if ((strcmp(attrs[i].ka_name,
-					    "single") == 0) && attrs[i].ka_val)
-						pic->src_type |=
-						    HV_RK_PERF_SRC_STRAND;
-					else if ((strcmp(attrs[i].ka_name,
-					    "shared") == 0) && attrs[i].ka_val)
-						pic->src_type |=
-						    HV_RK_PERF_SRC_STRAND_M;
-					else if ((strcmp(attrs[i].ka_name,
-					    "siu") == 0) && attrs[i].ka_val)
-						pic->src_type |=
-						    HV_RK_PERF_SRC_SIU;
-					else if ((strcmp(attrs[i].ka_name,
-					    "mmu") == 0) && attrs[i].ka_val)
-						pic->src_type |=
-						    HV_RK_PERF_SRC_MMU;
-					else if ((strcmp(attrs[i].ka_name,
-					    "source") == 0) && attrs[i].ka_val)
-						source = attrs[i].ka_val &
-						    HV_RK_PERF_SRC_MASK;
-					else if (attrs[i].ka_val)
-						return (CPC_INVALID_ATTRIBUTE);
-				}
-				if (source)
-					pic->src_type = source;
-
-				if (pic->src_type == 0)
-					pic->src_type = HV_RK_PERF_SRC_STRAND;
-
-				/* At least one hot Xn flag for L2 counters */
-				*flagsp = bits;
-			} else {
-				/*
-				 * Synthetic Counter
-				 */
-				pic->sampler.syn_counter = bits;
-				freq = L2_SAM_DEF_FREQ;	/* Default Frequency */
-				/*
-				 * Find the attributes for L2 Sampler.
-				 */
-				for (i = 0; i < nattrs; i++) {
-					if ((strcmp(attrs[i].ka_name,
-					    "freq") == 0) && attrs[i].ka_val)
-						freq = attrs[i].ka_val;
-					else if ((strcmp(attrs[i].ka_name,
-					    "single") == 0) && attrs[i].ka_val)
-						pic->src_type |=
-						    HV_RK_PERF_SRC_STRAND;
-					else if ((strcmp(attrs[i].ka_name,
-					    "shared") == 0) && attrs[i].ka_val)
-						pic->src_type |=
-						    HV_RK_PERF_SRC_STRAND_M;
-					else if ((strcmp(attrs[i].ka_name,
-					    "siu") == 0) && attrs[i].ka_val)
-						pic->src_type |=
-						    HV_RK_PERF_SRC_SIU;
-					else if ((strcmp(attrs[i].ka_name,
-					    "mmu") == 0) && attrs[i].ka_val)
-						pic->src_type |=
-						    HV_RK_PERF_SRC_MMU;
-					else if ((strcmp(attrs[i].ka_name,
-					    "source") == 0) && attrs[i].ka_val)
-						source = attrs[i].ka_val &
-						    HV_RK_PERF_SRC_MASK;
-					else if (attrs[i].ka_val)
-						return (CPC_INVALID_ATTRIBUTE);
-				}
-				if (source)
-					pic->src_type = source;
-
-				if (pic->src_type == 0)
-					pic->src_type = HV_RK_PERF_SRC_STRAND;
-
-				/* Range check to avoid DOS */
-				if (freq > L2_SAM_MAX_FREQ) {
-					cmn_err(CE_NOTE, "CPU-%d: freq set "
-					    "> MAX. Resetting to %d",
-					    CPU->cpu_id, L2_SAM_MAX_FREQ);
-					freq = L2_SAM_MAX_FREQ;
-				}
-				if (freq < L2_SAM_MIN_FREQ) {
-					cmn_err(CE_NOTE, "CPU-%d: freq set "
-					    "< MIN. Resetting to %d",
-					    CPU->cpu_id, L2_SAM_MIN_FREQ);
-					freq = L2_SAM_MIN_FREQ;
-				}
-				pic->sampler.frequency = freq;
-				*flagsp = 0;
-			}
-			pic->counter = RK_PERF_L2;
-			break;
-		case 2:	/* MMU Counter */
-			if (*flagsp != CPC_COUNT_USER)
-				return (CPC_INVALID_ATTRIBUTE);
-
-			*flagsp = bits;
-			pic->counter_bits = MMU_COUNTER_BITS;
-
-			for (i = 0; i < nattrs; i++) {
-				if ((strcmp(attrs[i].ka_name, "single") == 0) &&
-				    attrs[i].ka_val)
-					pic->src_type |= HV_RK_PERF_SRC_STRAND;
-				else if ((strcmp(attrs[i].ka_name,
-				    "shared") == 0) && attrs[i].ka_val)
-					pic->src_type |=
-					    HV_RK_PERF_SRC_STRAND_M;
-				else if ((strcmp(attrs[i].ka_name,
-				    "source") == 0) && attrs[i].ka_val)
-					source = attrs[i].ka_val &
-					    HV_RK_PERF_SRC_MASK;
-				else if (attrs[i].ka_val)
-					return (CPC_INVALID_ATTRIBUTE);
-			}
-			if (source) {
-				if (source & (HV_RK_PERF_SRC_SIU |
-				    HV_RK_PERF_SRC_MMU))
-					return (CPC_INVALID_ATTRIBUTE);
-				pic->src_type = source;
-			}
-
-
-			if (pic->src_type == 0)
-				pic->src_type = HV_RK_PERF_SRC_STRAND;
-
-			pic->counter = RK_PERF_MMU;
-			break;
-		case 3: /* YANK Counter */
-			pic->counter = RK_PERF_YANK;
-			pic->counter_bits = YANK_COUNTER_BITS;
-			/* FALLTHROUGH */
-		case 4: /* SIBLK Counter */
-			if (pic->counter == 0) {
-				pic->counter = RK_PERF_SIBLK;
-				pic->counter_bits = SIBLK_COUNTER_BITS;
-			}
-			/* FALLTHROUGH */
-		case 5: /* LVLK Counter */
-			if (pic->counter == 0) {
-				pic->counter = RK_PERF_LVLK;
-				pic->counter_bits = LVLK_COUNTER_BITS;
-			}
-
-			if (*flagsp != CPC_COUNT_USER)
-				return (CPC_INVALID_ATTRIBUTE);
-
-			for (i = 0; i < nattrs; i++) {
-				if ((strcmp(attrs[i].ka_name, "single") ==
-				    0) && attrs[i].ka_val)
-					pic->src_type |= HV_RK_PERF_SRC_STRAND;
-				else if ((strcmp(attrs[i].ka_name,
-				    "shared") == 0) && attrs[i].ka_val)
-					pic->src_type |=
-					    HV_RK_PERF_SRC_STRAND_M;
-				else if ((strcmp(attrs[i].ka_name,
-				    "source") == 0) && attrs[i].ka_val)
-					source = attrs[i].ka_val &
-					    HV_RK_PERF_SRC_MASK;
-				else if (attrs[i].ka_val)
-					return (CPC_INVALID_ATTRIBUTE);
-			}
-			if (source) {
-				if (source & (HV_RK_PERF_SRC_SIU |
-				    HV_RK_PERF_SRC_MMU))
-					return (CPC_INVALID_ATTRIBUTE);
-				pic->src_type = source;
-			}
-
-
-			if (pic->src_type == 0)
-				pic->src_type = HV_RK_PERF_SRC_STRAND;
-
-			*flagsp = 0;
-			pic->sampler.frequency = 0;
-			pic->sampler.syn_counter = bits;
-			break;
-		}
-
-	if ((int64_t)pic->pcbe_pic > COUNTER_MAX(pic) ||
-	    (int64_t)pic->pcbe_pic < COUNTER_MIN(pic))
-		return (CPC_ATTRIBUTE_OUT_OF_RANGE);
-
-	pic->pcbe_pic &= COUNTER_MASK(pic);
-
-#ifdef	RKPCBE_DBG
-	set_pic_name(pic);
-#endif
-	return (0);
-}
-
-/*ARGSUSED7*/
-static int
-rk_pcbe_configure(uint_t picnum, char *event, uint64_t preset, uint32_t flags,
-		    uint_t nattrs, kcpc_attr_t *attrs, void **data, void *token)
-{
-	rk_pcbe_config_t *pic;
-	const struct nametable *n;
-	int		rc;
-
-	/* Has the Rock pcbe hypervisor services API version been negotiated? */
-	if (rock_pcbe_hsvc_available == B_FALSE)
-		return (CPC_RESOURCE_UNAVAIL);
-
-	/*
-	 * If we've been handed an existing configuration, we need only preset
-	 * the counter value.
-	 */
-	if (*data != NULL) {
-		pic = *data;
-		if ((int64_t)preset > COUNTER_MAX(pic) ||
-		    (int64_t)preset < COUNTER_MIN(pic))
-			return (CPC_ATTRIBUTE_OUT_OF_RANGE);
-		pic->pcbe_pic = preset & COUNTER_MASK(pic);
-		return (0);
-	}
-
-	if (picnum >= NUM_PCBE_COUNTERS)
-		return (CPC_INVALID_PICNUM);
-
-	/*
-	 * Find other requests that will be programmed with this one, and
-	 * ensure they don't conflict: is any other counter in this pic
-	 * group already active?
-	 */
-	if (active_pics[picnum][CPU->cpu_id] != NULL)
-		return (CPC_CONFLICTING_REQS);
-
-	if ((n = find_event(picnum, event)) == NULL)
-		return (CPC_INVALID_EVENT);
-
-	/* Check for supported attributes and populate pic */
-	pic = kmem_zalloc(sizeof (rk_pcbe_config_t), KM_SLEEP);
-	pic->flags = flags;
-	pic->pcbe_pic = preset;
-
-	if (rc = populate_pic_config(picnum, nattrs, attrs, n->bits, pic)) {
-		kmem_free(pic, sizeof (rk_pcbe_config_t));
-		return (rc);
-	}
-
-	/*
-	 * num_ringbuf_entries should always be even. Since this is an
-	 * /etc/system tunable, it must be checked and corrected here.
-	 */
-	if (num_ringbuf_entries & 1) {
-		num_ringbuf_entries++;
-		cmn_err(CE_WARN, "num_ringbuf_entries should be even."
-		    " Changing %u to %u\n", num_ringbuf_entries - 1,
-		    num_ringbuf_entries);
-	}
-	if (num_ringbuf_entries < MIN_RINGBUF_ENTRIES) {
-		cmn_err(CE_WARN, "num_ringbuf_entries should be at least "
-		    "%u. Changing %u to %u\n", MIN_RINGBUF_ENTRIES,
-		    num_ringbuf_entries, MIN_RINGBUF_ENTRIES);
-		num_ringbuf_entries = MIN_RINGBUF_ENTRIES;
-	}
-
-	pic->state = STATE_CONFIGURED;
-	pic->cpu = CPU->cpu_id;
-	active_pics[picnum][pic->cpu] = pic;
-	*data = pic;
-
-	if (pic->counter_type == NORMAL_COUNTER)
-		PRINT_PIC(pic, "After Configuration (N)");
-	return (0);
-}
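
The even/minimum normalization of num_ringbuf_entries above is worth seeing in
isolation. A minimal user-space sketch of the same rule, with
MIN_RINGBUF_ENTRIES given a placeholder value of 64 (the real value is defined
elsewhere in this file):

    /* Hedged sketch of the tunable normalization; 64 is a placeholder. */
    #include <stdio.h>

    #define	MIN_RINGBUF_ENTRIES	64	/* placeholder, not the real value */

    static unsigned int
    normalize_ringbuf_entries(unsigned int n)
    {
    	if (n & 1)			/* round odd values up to even */
    		n++;
    	if (n < MIN_RINGBUF_ENTRIES)	/* enforce the floor */
    		n = MIN_RINGBUF_ENTRIES;
    	return (n);
    }

    int
    main(void)
    {
    	printf("%u %u %u\n",
    	    normalize_ringbuf_entries(63),	/* 64 */
    	    normalize_ringbuf_entries(10),	/* 64, below the minimum */
    	    normalize_ringbuf_entries(100));	/* 100, already valid */
    	return (0);
    }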
-
-static void
-rk_pcbe_program(void *token)
-{
-	rk_pcbe_config_t	*pic = NULL;
-	int			rc;
-	uint64_t		counter;
-
-	while ((pic = (rk_pcbe_config_t *)kcpc_next_config(token, pic, NULL))
-	    != NULL) {
-
-		if (pic->inuse == B_FALSE)
-			continue;
-
-		counter = (uint64_t)(pic->counter | pic->src_type);
-		rc = (int)hv_rk_perf_count_init(counter);
-
-		if (curthread->t_cpc_ctx) {
-			/*
-			 * If in thread context, the pic must get an exclusive
-			 * lock. If it cannot, invalidate the pic.
-			 */
-			if (rc != H_EOK) {
-				kcpc_invalidate_config(token);
-				continue;
-			}
-		} else {
-			/* Must be cpu context */
-			ASSERT(CPU->cpu_cpc_ctx);
-			if (rc == H_EWOULDBLOCK &&
-			    (pic->src_type & HV_RK_PERF_SRC_STRAND_M)) {
-				/* pic in use by a cpu of current guest */
-				pic->inuse = B_FALSE;
-				continue;
-			} else if (rc != H_EOK) {
-				/*
-				 * Either the counter is in use by a different
-				 * guest or another cpu in the current guest is
-				 * already using it in single source mode. In
-				 * either case, invalidate the pic.
-				 */
-				kcpc_invalidate_config(token);
-				continue;
-			}
-		}
-
-		/*
-		 * rc == H_EOK, hence the current cpu was successful in
-		 * obtaining exclusive access to the counter. Set this
-		 * pic as active.
-		 */
-		if (CPU->cpu_id != pic->cpu) {
-			active_pics[pic->pcbe_picno][pic->cpu] = NULL;
-			pic->cpu = CPU->cpu_id;
-			active_pics[pic->pcbe_picno][pic->cpu] = pic;
-		}
-		pic->inuse = B_TRUE;
-
-		if (pic->counter_type == NORMAL_COUNTER)
-			rc = rk_pcbe_program_normal(pic);
-		else
-			rc = rk_pcbe_program_synthetic(pic);
-
-		pic->state = STATE_PROGRAMMED;
-
-		if (rc != H_EOK) {
-			kcpc_invalidate_config(token);
-			continue;
-		}
-	}
-}
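
The return-code handling above encodes a small ownership protocol: thread-bound
contexts demand exclusive access, while cpu-bound contexts tolerate a
multi-strand counter already claimed within the same guest. A hedged sketch of
just that decision, with the H_* codes reduced to illustrative constants:

    /* Decision sketch only; constants are illustrative stand-ins. */
    #define	H_EOK		0
    #define	H_EWOULDBLOCK	2	/* placeholder numeric value */

    enum pic_action { PIC_KEEP, PIC_SKIP, PIC_INVALIDATE };

    static enum pic_action
    acquire_action(int rc, int thread_ctx, int multi_strand_src)
    {
    	if (rc == H_EOK)
    		return (PIC_KEEP);	/* exclusive access obtained */
    	if (thread_ctx)
    		return (PIC_INVALIDATE); /* thread ctx needs exclusivity */
    	if (rc == H_EWOULDBLOCK && multi_strand_src)
    		return (PIC_SKIP);	/* held by a cpu of this guest */
    	return (PIC_INVALIDATE);	/* other guest, or single-source clash */
    }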
-
-static void
-rk_pcbe_allstop(void)
-{
-	int 			i;
-	rk_pcbe_config_t	*pic;
-	uint64_t		diff;
-
-	for (i = 0; i <  NUM_PCBE_COUNTERS; i++) {
-		pic = active_pics[i][CPU->cpu_id];
-
-		if (pic == NULL || pic->state != STATE_PROGRAMMED)
-			continue;
-
-		ASSERT(pic->inuse == B_TRUE && CPU->cpu_id == pic->cpu);
-
-		/* Stop all active pics */
-		if (pic->counter_type == NORMAL_COUNTER) {
-			hv_rk_perf_count_stop((uint64_t)(pic->counter |
-			    pic->src_type));
-			DBG_PRINT(("CPU-%d: Counter %s(%X) stopped.\n",
-			    CPU->cpu_id, pic->name, pic->counter));
-		} else {
-			DBG_PRINT(("CPU-%d: Stopping counter %s(%lX)\n",
-			    CPU->cpu_id, pic->name,
-			    pic->sampler.synthetic_pic));
-			rk_pcbe_stop_synthetic(pic);
-		}
-
-		/* Mark pic as stopped */
-		pic->state = STATE_STOPPED;
-
-		/*
-		 * If running in lwp context, kcpc ensures the cpu that
-		 * executed pcbe_program will be the one that executes
-		 * pcbe_allstop. However, pcbe_free may be executed on
-		 * a different strand. HV puts a restriction that the
-		 * strand that programmed the counter must be the one
-		 * that releases it. Therefore, when counters are bound
-		 * to thread context, counters are released every time
-		 * they are stopped.
-		 */
-		if (CPU->cpu_cpc_ctx == NULL) {
-			/*
-			 * If counter is being released, cache the current
-			 * sample since we cannot sample a counter that has
-			 * been released.
-			 */
-			if (rk_pcbe_sample_internal(pic, &diff) == H_EOK)
-				pic->pcbe_pic = diff;
-			else
-				pic->pcbe_pic = 0;
-			rk_pcbe_release(pic);
-		}
-	}
-}
-
-static void
-rk_pcbe_sample(void *token)
-{
-	rk_pcbe_config_t	*pic = NULL;
-	uint64_t		*pic_data;
-	int			rc;
-	uint64_t		diff;
-
-	while ((pic = (rk_pcbe_config_t *)
-	    kcpc_next_config(token, pic, &pic_data)) != NULL) {
-
-		if (pic->inuse != B_TRUE) {
-			continue;
-		}
-
-		/*
-		 * If the counter has already been released, return the
-		 * cached value.
-		 */
-		if (pic->state == STATE_RELEASED) {
-			*pic_data += pic->pcbe_pic;
-			pic->pcbe_pic = 0;
-			continue;
-		}
-
-		ASSERT(CPU->cpu_id == pic->cpu);
-
-		rc = rk_pcbe_sample_internal(pic, &diff);
-
-		if (pic->state == STATE_STOPPED) {
-			pic->pcbe_pic = 0;
-			rk_pcbe_release(pic);
-		}
-
-		if (rc == H_EOK) {
-			*pic_data += diff;
-		} else  {
-			kcpc_invalidate_config(token);
-		}
-	}
-}
-
-static void
-rk_pcbe_free(void *config)
-{
-	rk_pcbe_config_t 	*pic = (rk_pcbe_config_t *)config;
-
-	/* Release counter */
-	if (pic->inuse == B_TRUE) {
-		if (pic->state != STATE_RELEASED) {
-			rk_pcbe_release(pic);
-		}
-		if (pic->counter_type == SYNTHETIC_COUNTER)
-			rk_pcbe_free_synthetic(pic);
-	}
-
-	/* Mark pic as inactive */
-	active_pics[pic->pcbe_picno][pic->cpu] = NULL;
-	kmem_free(pic, sizeof (rk_pcbe_config_t));
-}
-
-static void
-rk_pcbe_release(rk_pcbe_config_t *pic)
-{
-	int			rc = 0;
-
-	ASSERT(pic->inuse == B_TRUE && pic->state != STATE_RELEASED);
-
-	DBG_PRINT(("CPU-%d: Releasing Pic %s (#%d, cntr %X) %p",
-	    CPU->cpu_id, pic->name, pic->pcbe_picno, pic->counter,
-	    (void *)pic));
-
-	rc = (int)hv_rk_perf_count_release((uint64_t)
-	    (pic->counter | pic->src_type));
-	if (rc != 0) {
-		cmn_err(CE_WARN, "CPU-%d: Releasing Pic-%d, counter: %X failed "
-		    "%p. rc=%d", CPU->cpu_id, pic->pcbe_picno, pic->counter,
-		    (void *)pic, rc);
-	}
-	if (pic->counter_type == SYNTHETIC_COUNTER &&
-	    !(pic->counter == RK_PERF_YANK || pic->counter == RK_PERF_SIBLK ||
-	    pic->counter == RK_PERF_LVLK)) {
-		rc = (int)hv_rk_perf_sample_release((uint64_t)
-		    (pic->counter | pic->src_type));
-		if (rc != 0) {
-			cmn_err(CE_WARN, "CPU-%d: Releasing Pic-%d, sampler: %X"
-			    " failed %p. rc=%d", CPU->cpu_id, pic->pcbe_picno,
-			    pic->counter, (void *)pic, rc);
-			return;
-		}
-	}
-	pic->state = STATE_RELEASED;
-}
-
-static int
-rk_pcbe_program_normal(rk_pcbe_config_t *pic)
-{
-	uint64_t		counter;
-	uint64_t		config_value;
-	uint64_t		rc = H_EOK;
-
-	ASSERT(pic->inuse == B_TRUE);
-
-	counter = (uint64_t)(pic->counter | pic->src_type);
-
-	/* Preset the counter value if non-zero */
-	if (pic->pcbe_pic > 0)  {
-		DBG_PRINT(("CPU-%d: Counter getting preset to %lu (%lX)\n",
-		    CPU->cpu_id, pic->pcbe_pic, pic->pcbe_pic));
-		rc = (int)hv_rk_perf_count_set(counter, pic->pcbe_pic);
-	}
-
-	if (rc != H_EOK) {
-		cmn_err(CE_WARN, "{%d} Pic %d cntr %X not set",
-		    CPU->cpu_id, pic->pcbe_picno, pic->counter);
-		PRINT_PIC(pic, "Set counter failed");
-		return ((int)rc);
-	}
-
-	/* Configure and start counter */
-	config_value = ((uint64_t)pic->toe << RK_PERF_COUNT_TOE_SHIFT)
-	    | pic->flags;
-	rc = (int)hv_rk_perf_count_start(counter, config_value);
-
-	if (rc != H_EOK) {
-		cmn_err(CE_WARN, "{%d} Pic %d cntr %X not configured",
-		    CPU->cpu_id, pic->pcbe_picno, pic->counter);
-		PRINT_PIC(pic, "Configure counter failed");
-	}
-	return ((int)rc);
-}
-
-static int
-rk_pcbe_program_synthetic(rk_pcbe_config_t *pic)
-{
-	int	rc;
-	ASSERT(pic->inuse == B_TRUE);
-	switch (pic->counter) {
-		case RK_PERF_INSTR:
-			rc = program_instr_sampler(pic);
-			break;
-		case RK_PERF_L2:
-			rc = program_l2_sampler(pic);
-			break;
-		case RK_PERF_YANK:
-			/* FALLTHROUGH */
-		case RK_PERF_SIBLK:
-			/* FALLTHROUGH */
-		case RK_PERF_LVLK:
-			rc = rk_pcbe_program_normal(pic);
-			break;
-		default:
-			PRINT_PIC(pic, "rk_pcbe_program_synthetic");
-			ASSERT(0);
-			rc = H_EINVAL;
-			break;
-	}
-	return (rc);
-}
-
-static void
-rk_pcbe_free_synthetic(rk_pcbe_config_t *pic)
-{
-	ASSERT(pic->inuse == B_TRUE);
-	switch (pic->counter) {
-		case RK_PERF_INSTR:
-			/* FALLTHROUGH */
-		case RK_PERF_L2:
-			free_ringbuffer(pic);
-			break;
-		case RK_PERF_YANK:
-			/* FALLTHROUGH */
-		case RK_PERF_SIBLK:
-			/* FALLTHROUGH */
-		case RK_PERF_LVLK:
-			/* Do nothing */
-			break;
-		default:
-			PRINT_PIC(pic, "rk_pcbe_free_synthetic");
-			ASSERT(0);
-			break;
-	}
-}
-
-static int
-rk_pcbe_sample_internal(rk_pcbe_config_t *pic, uint64_t *data)
-{
-	uint64_t		counter_value;
-	int			rc;
-	int64_t			diff;
-
-	if (pic->counter_type == NORMAL_COUNTER) {
-		rc = (int)hv_rk_perf_count_get((uint64_t)(pic->counter |
-		    pic->src_type), &counter_value);
-		if (rc == H_EOK) {
-			counter_value &= COUNTER_MASK(pic);
-			diff = counter_value - pic->pcbe_pic;
-			pic->pcbe_pic = counter_value;
-			/*
-			 * When the counter overflows, the overflow handler
-			 * (rk_pcbe_overflow_bitmap) will have added the MAX
-			 * count value to pic->pcbe_pic. Therefore a negative
-			 * diff implies that the counter has overflowed.
-			 * The actual count amounts to,
-			 * (counter_value - (pic->pcbe_pic - MAX)) + MAX
-			 * => counter_value - pic->pcbe_pic + (2 * MAX)
-			 * => diff + (2 * MAX)
-			 */
-			if (diff < 0) {
-				diff += (0x1ULL << (pic->counter_bits + 1));
-			}
-		}
-	} else {
-		/*
-		 * The difference returned by synthetic counters will
-		 * always be positive.
-		 */
-		rc = rk_pcbe_sample_synthetic(pic, &diff);
-	}
-
-	if (rc == H_EOK)
-		*data = (uint64_t)diff;
-
-	return ((int)rc);
-}
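
The diff + (2 * MAX) correction is easiest to trust with concrete numbers. A
toy run with a hypothetical 8-bit counter (MAX = 256), chosen purely for
readability:

    /* Worked example of the overflow fix-up above, toy 8-bit counter. */
    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
    	int counter_bits = 8;
    	int64_t max = 1LL << counter_bits;	/* 256 */
    	int64_t prev = 250 + max;	/* overflow handler added MAX */
    	int64_t now = 10;		/* counter wrapped past MAX */
    	int64_t diff = now - prev;	/* -496, negative => overflowed */

    	if (diff < 0)
    		diff += 1LL << (counter_bits + 1);	/* add 2 * MAX */
    	printf("%lld\n", (long long)diff);	/* 16 = 6 to wrap + 10 after */
    	return (0);
    }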
-
-/* All sample_synthetic code may be executed at TL=1 */
-static int
-rk_pcbe_sample_synthetic(rk_pcbe_config_t *pic, int64_t *diffp)
-{
-	int	rc;
-	ASSERT(pic->inuse == B_TRUE);
-	switch (pic->counter) {
-		case RK_PERF_INSTR:
-			rc = sample_instr_sampler(pic, diffp);
-			break;
-		case RK_PERF_L2:
-			rc = sample_l2_sampler(pic, diffp);
-			break;
-		case RK_PERF_YANK:
-			/* FALLTHROUGH */
-		case RK_PERF_SIBLK:
-			/* FALLTHROUGH */
-		case RK_PERF_LVLK:
-			rc = sample_mccdesr(pic, diffp);
-			break;
-		default:
-			PRINT_PIC(pic, "rk_pcbe_sample_synthetic");
-			ASSERT(0);
-			break;
-	}
-	return (rc);
-}
-
-static void
-rk_pcbe_stop_synthetic(rk_pcbe_config_t *pic)
-{
-	uint64_t	counter = (uint64_t)(pic->counter | pic->src_type);
-
-	ASSERT(pic->inuse == B_TRUE);
-	switch (pic->counter) {
-		case RK_PERF_INSTR:
-			/* FALLTHROUGH */
-		case RK_PERF_L2:
-			hv_rk_perf_count_stop(counter);
-			hv_rk_perf_sample_stop(counter);
-			break;
-		case RK_PERF_YANK:
-			/* FALLTHROUGH */
-		case RK_PERF_SIBLK:
-			/* FALLTHROUGH */
-		case RK_PERF_LVLK:
-			hv_rk_perf_count_stop(counter);
-			break;
-		default:
-			PRINT_PIC(pic, "rk_pcbe_stop_synthetic");
-			ASSERT(0);
-			break;
-	}
-}
-
-static int
-program_l2_sampler(rk_pcbe_config_t *pic)
-{
-#define	ASI_PERF_L2_TXN_INFO		0xF10010
-#define	ASI_PERF_L2_EA_MASK		0xF10018
-#define	ASI_PERF_L2_EA_MATCH		0xF10020
-#define	ASI_PERF_L2_TXN_INFO_FILTER	0xF10030
-#define	ASI_PERF_L2_CC			0xF10038
-#define	TXN_ICACHE_LOAD			0x1
-#define	TXN_DCACHE_LOAD			0x2
-#define	TXN_INSTR_PREFETCH		0x4
-#define	TXN_STORE_PREFETCH		0x8
-#define	TXN_DCACHE_STORE		0x10
-#define	TXN_ATOMIC_LOAD_STORE		0x20
-#define	TXN_FLUSH			0x40
-#define	L2_ALL_TXNS	(TXN_ICACHE_LOAD | TXN_DCACHE_LOAD | \
-			TXN_INSTR_PREFETCH | TXN_STORE_PREFETCH | \
-			TXN_DCACHE_STORE | TXN_ATOMIC_LOAD_STORE | TXN_FLUSH)
-#define	L2_TXN_SHIFT			3
-#define	L2_ALL_EVT			0x3
-#define	L2_ALL_EVT_SHIFT		10
-#define	L2_TXN_INFO_FILTER_MASK		((L2_ALL_EVT << L2_ALL_EVT_SHIFT) | \
-					(L2_ALL_TXNS << L2_TXN_SHIFT))
-
-	program_sampler_data_t	sdata;
-	int			i = 0;
-
-	(void) strcpy(sdata.name, "program_l2_sampler");
-	pic->flags = L2_ALL_TXNS; /* For L2 counter */
-
-	/*
-	 * If (((Reported EA ^ MATCH) & MASK) == 0) then sample is taken
-	 */
-	sdata.asi_config[i].va = ASI_PERF_L2_EA_MASK;
-	sdata.asi_config[i].value = 0;
-	i++;
-
-	sdata.asi_config[i].va = ASI_PERF_L2_EA_MATCH;
-	sdata.asi_config[i].value = 0;
-	i++;
-
-	sdata.asi_config[i].va = ASI_PERF_L2_CC;
-	sdata.asi_config[i].value = pic->sampler.frequency;
-	i++;
-
-	sdata.asi_config[i].va = ASI_PERF_L2_TXN_INFO_FILTER;
-	sdata.asi_config[i].value = L2_TXN_INFO_FILTER_MASK;
-
-	sdata.asi_config_num = i + 1;
-
-	sdata.asi_sample[0] = ASI_PERF_L2_TXN_INFO;
-	sdata.asi_sample_num = 1;
-
-	return (program_a_sampler(pic, &sdata));
-}
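
Both samplers filter with the predicate noted above: a sample is taken iff
((reported ^ MATCH) & MASK) == 0, so MASK selects which bits must agree with
MATCH, and a zero MASK accepts every sample, which is exactly what the zero
mask/match programming here requests. A tiny self-contained check:

    /* Mask/match predicate: masked bits of the report must equal MATCH. */
    #include <stdint.h>

    static int
    sample_taken(uint64_t reported, uint64_t match, uint64_t mask)
    {
    	return (((reported ^ match) & mask) == 0);
    }
    /*
     * sample_taken(x, y, 0)          -> 1 for any x, y (no filtering)
     * sample_taken(0xAB, 0xA0, 0xF0) -> 1, high nibble agrees
     * sample_taken(0xAB, 0xB0, 0xF0) -> 0, high nibble differs
     */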
-
-static int
-sample_l2_sampler(rk_pcbe_config_t *pic, int64_t *diffp)
-{
-#define	DS_SHIFT	34
-#define	EVT_SHIFT	22
-#define	TXN_SHIFT	7
-#define	DS_MASK		MAKE_MASK(2, 0)
-#define	EVT_MASK	MAKE_MASK(4, 0)
-#define	TXN_MASK	MAKE_MASK(7, 0)
-
-	rk_pcbe_ringbuf_t	*ringbuf = pic->sampler.ring_buffer;
-	uint32_t	value, target;
-	uint64_t	*head, *tail;
-	uint32_t	sample_count = 0, sample_hit_count = 0;
-	uint32_t	size = pic->sampler.sample_size;
-	uint8_t		ds, evt;
-	int		ret;
-
-	head =  RINGBUF_GET_HEAD(ringbuf);
-	tail =  RINGBUF_GET_TAIL(ringbuf);
-
-	if (head == tail) {
-		DBG_PRINT(("CPU-%d: HEAD eq TAIL to start with\n",
-		    CPU->cpu_id));
-	}
-
-	/* Consume samples */
-	while (head != tail) {
-		uint64_t rawvalue = *head;
-		DBG_PRINT(("CPU-%d: rawvalue=0x%lX\n", CPU->cpu_id, rawvalue));
-		target = TYPE(pic->sampler.syn_counter);
-
-		switch (GROUP(pic->sampler.syn_counter)) {
-		case L2_GROUP_DS:
-			value = (rawvalue >> DS_SHIFT) & DS_MASK;
-			DBG_PRINT(("CPU-%d: value=0x%X, target=0x%X\n",
-			    CPU->cpu_id, value, target));
-			switch (target) {
-			case DS_DRAM: /* FALLTHROUGH */
-			case DS_L3: /* FALLTHROUGH */
-			case DS_OTHER_L2: /* FALLTHROUGH */
-				if (value == target)
-					sample_hit_count++;
-				break;
-			}
-			break;
-		case L2_GROUP_TXN_MISS:
-			value = (rawvalue >> TXN_SHIFT) & TXN_MASK;
-			ds = (uint8_t)((rawvalue >> DS_SHIFT) & DS_MASK);
-			evt = (uint8_t)((rawvalue >> EVT_SHIFT) & EVT_MASK);
-			DBG_PRINT(("CPU-%d: value=0x%X, target=0x%X, "
-			    " ds: 0x%X, evt: 0x%X\n", CPU->cpu_id, value,
-			    target, ds, evt));
-			if (((value & target) != 0) && (evt == EVT_L2_MISS ||
-			    evt == EVT_L2_PRIOR_MISS) && (ds != DS_LOCAL_L2))
-				sample_hit_count++;
-			break;
-		case L2_GROUP_TXN_HIT:
-			value = (rawvalue >> TXN_SHIFT) & TXN_MASK;
-			ds = (uint8_t)((rawvalue >> DS_SHIFT) & DS_MASK);
-			evt = (uint8_t)((rawvalue >> EVT_SHIFT) & EVT_MASK);
-			DBG_PRINT(("CPU-%d: value=0x%X, target=0x%X, "
-			    " ds: 0x%X, evt: 0x%X\n", CPU->cpu_id, value,
-			    target, ds, evt));
-			if (((value & target) != 0) && (evt == EVT_L2_PEND_ST ||
-			    evt == EVT_L2_NOEVENTS) && (ds == DS_LOCAL_L2))
-				sample_hit_count++;
-			break;
-		case L2_GROUP_EVT:
-			evt = (rawvalue >> EVT_SHIFT) & EVT_MASK;
-			ds = (uint8_t)((rawvalue >> DS_SHIFT) & DS_MASK);
-			DBG_PRINT(("CPU-%d: evt=0x%X, target=0x%X, "
-			    "ds: 0x%X\n", CPU->cpu_id, evt, target, ds));
-
-			switch (target) {
-			case L2_HIT:
-				if ((evt == EVT_L2_NOEVENTS || evt ==
-				    EVT_L2_PEND_ST) && ds == DS_LOCAL_L2)
-					sample_hit_count++;
-				break;
-			case L2_MISS:
-				if ((evt == EVT_L2_MISS || evt ==
-				    EVT_L2_PRIOR_MISS) && ds == DS_LOCAL_L2)
-					sample_hit_count++;
-				break;
-			}
-		}
-		sample_count++;
-		RINGBUF_MOVE_HEAD(ringbuf, head, size);
-	}
-	RINGBUF_SET_HEAD(ringbuf, head);
-
-	ret = synthesize_sample_count(pic, sample_count, sample_hit_count,
-	    "sample_l2_sampler", diffp);
-
-	return (ret);
-}
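
Each entry drained from the ring buffer is one packed 64-bit TXN_INFO word,
classified by slicing fixed-position fields. A hedged unpacking sketch using
the shift values defined above; the field widths assume MAKE_MASK(n, s) builds
an n-bit mask, which is an inference, not taken from this file:

    /* Unpack a raw L2 TXN_INFO sample word into its DS/EVT/TXN fields. */
    #include <stdint.h>

    #define	FIELD(raw, shift, mask)	(((raw) >> (shift)) & (mask))

    struct l2_sample {
    	uint8_t	ds;	/* data source, from bit 34 (assumed width) */
    	uint8_t	evt;	/* event bits, from bit 22 (assumed width) */
    	uint8_t	txn;	/* transaction bits, from bit 7 (assumed width) */
    };

    static struct l2_sample
    unpack_l2(uint64_t raw)
    {
    	struct l2_sample s;

    	s.ds = (uint8_t)FIELD(raw, 34, 0x3);
    	s.evt = (uint8_t)FIELD(raw, 22, 0xF);
    	s.txn = (uint8_t)FIELD(raw, 7, 0x7F);
    	return (s);
    }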
-
-static int
-program_instr_sampler(rk_pcbe_config_t *pic)
-{
-#define	ASI_PERF_IS_PC_MASK		0x10
-#define	ASI_PERF_IS_PC_MATCH		0x18
-#define	ASI_PERF_IS_CC_LATENCY_MASK	0x160
-#define	ASI_PERF_IS_CONTEXT_FILTER	0x168
-#define	ASI_PERF_IS_INFO_MASK		0x170
-#define	ASI_PERF_IS_INFO_MATCH		0x178
-
-#define	ASI_PERF_IS_CONTEXT		0x108
-#define	ASI_PERF_IS_INFO		0x148
-
-#define	IS_BHR_LATENCY_CLAT_MASK	0xFFF
-#define	IS_CC_FILTER_TGTF_MASK		0x10
-#define	IS_CC_FILTER_TOF_MASK		0x8
-#define	IS_CC_LATENCY_FREQ_SHIFT	22
-
-	program_sampler_data_t	sdata;
-	int			i = 0;
-
-	(void) strcpy(sdata.name, "program_instr_sampler");
-	/*
-	 * If (((Reported Value ^ MATCH) & MASK) == 0) then sample is taken;
-	 */
-	sdata.asi_config[i].va = ASI_PERF_IS_PC_MASK;
-	sdata.asi_config[i].value = 0;
-	i++;
-
-	sdata.asi_config[i].va = ASI_PERF_IS_PC_MATCH;
-	sdata.asi_config[i].value = 0;
-	i++;
-
-	/*
-	 * Set CLAT_MASK to 0xFFF, meaning drop instruction samples
-	 * whose latency is zero; that is, sample all of them, because
-	 * every instruction has a latency of at least 1 cycle.
-	 */
-	sdata.asi_config[i].va = ASI_PERF_IS_CONTEXT_FILTER;
-	sdata.asi_config[i].value = (uint64_t)(IS_CC_FILTER_TGTF_MASK |
-	    IS_CC_FILTER_TOF_MASK | pic->sampler.flags);
-	i++;
-
-	/*
-	 * Even though the frequency is set when sampling is started, it
-	 * has to be specified here as well, because if it is left zero,
-	 * a PET is immediately generated since the candidate counter is
-	 * zero.
-	 */
-	sdata.asi_config[i].va = ASI_PERF_IS_CC_LATENCY_MASK;
-	sdata.asi_config[i].value = ((((uint64_t)pic->sampler.frequency) <<
-	    IS_CC_LATENCY_FREQ_SHIFT) | IS_BHR_LATENCY_CLAT_MASK);
-	i++;
-
-	sdata.asi_config[i].va = ASI_PERF_IS_INFO_MASK;
-	sdata.asi_config[i].value = 0;
-	i++;
-
-	sdata.asi_config[i].va = ASI_PERF_IS_INFO_MATCH;
-	sdata.asi_config[i].value = 0;
-
-	sdata.asi_config_num = i + 1;
-
-	sdata.asi_sample[0] = ASI_PERF_IS_INFO;
-	sdata.asi_sample[1] = ASI_PERF_IS_CONTEXT;
-	sdata.asi_sample_num = 2;
-
-	return (program_a_sampler(pic, &sdata));
-}
-
-static int
-sample_instr_sampler(rk_pcbe_config_t *pic, int64_t *diffp)
-{
-#define	I_MODE_SHIFT	34
-#define	I_TYPE_SHIFT	0
-#define	I_EVT_SHIFT	7
-#define	I_MODE_MASK	MAKE_MASK(3, 0)
-#define	I_TYPE_MASK	MAKE_MASK(7, 0)
-#define	I_EVT_MASK	MAKE_MASK(12, 0)
-
-	rk_pcbe_ringbuf_t	*ringbuf = pic->sampler.ring_buffer;
-	uint32_t	size = pic->sampler.sample_size;
-	uint32_t	value, target, shift, mask;
-	uint32_t	sample_count = 0, sample_hit_count = 0;
-	uint64_t	*head, *tail;
-	int		ret;
-
-	switch (GROUP(pic->sampler.syn_counter)) {
-	case I_GROUP_MODE:
-		mask = I_MODE_MASK;
-		shift = I_MODE_SHIFT;
-		break;
-	case I_GROUP_TYPE:
-		mask = I_TYPE_MASK;
-		shift = I_TYPE_SHIFT;
-		break;
-	case I_GROUP_EVT:
-		mask = I_EVT_MASK;
-		shift = I_EVT_SHIFT;
-		break;
-	default:
-		PRINT_PIC(pic, "No I_GROUP found");
-		ASSERT(0);
-		break;
-	}
-
-	head =  RINGBUF_GET_HEAD(ringbuf);
-	tail =  RINGBUF_GET_TAIL(ringbuf);
-
-	if (head == tail) {
-		DBG_PRINT(("CPU-%d: HEAD eq TAIL to start with\n",
-		    CPU->cpu_id));
-	}
-
-	/* Consume samples */
-	while (head != tail) {
-		/*
-		 * Data returned will be in the same order as the asi_list
-		 * passed to hypervisor during hv_rk_perf_sample_start call.
-		 */
-		uint64_t	rawvalue = *head;
-		uint64_t	context = *(head + 1);
-		uint8_t		tl = (uint8_t)((context >> 2) & 7);
-		int		drop_sample = B_FALSE;
-
-		if (rawvalue != 0) {
-			value = (rawvalue >> shift) & mask;
-			target = TYPE(pic->sampler.syn_counter);
-			DBG_PRINT(("CPU-%d: rawvalue=0x%lX, value=0x%X,"
-			    "target=0x%X\n", CPU->cpu_id, rawvalue, value,
-			    target));
-
-			/*
-			 * Several EVT fields are only valid for certain
-			 * instruction types.  Need to check TYP field
-			 * before trusting what's in EVT.
-			 */
-			if (GROUP(pic->sampler.syn_counter) == I_GROUP_EVT) {
-				uint64_t type = rawvalue >> I_TYPE_SHIFT;
-
-				switch (target) {
-				case EVT_DC_MISS:
-				case EVT_PRIOR_MISS:
-				case EVT_LDB_FULL:
-				case EVT_BYPASS_RAW:
-				case EVT_NONBYPASS_RAW:
-					if ((type & TYPE_LD) == 0)
-						drop_sample = B_TRUE;
-					break;
-				case EVT_STB_FULL:
-					if ((type & TYPE_ST) == 0)
-						drop_sample = B_TRUE;
-					break;
-				case EVT_DTLB_MISS:
-					if ((type & (TYPE_LD|TYPE_ST)) == 0)
-						drop_sample = B_TRUE;
-					break;
-				case EVT_CORRECT_BP:
-				case EVT_CTI_TAKEN:
-					if ((type & TYPE_CTI) == 0)
-						drop_sample = B_TRUE;
-					break;
-				}
-				DBG_PRINT(("CPU-%d: rawvalue=%lX, cleaned value"
-				    "=%X, target=%X\n", CPU->cpu_id, rawvalue,
-				    value, target));
-			}
-
-			/*
-			 * If the user does not want to count instructions in
-			 * scout mode and the sampled instruction was in scout
-			 * mode, drop the sample.
-			 */
-			if (pic->sampler.nohws == B_TRUE) {
-				uint64_t mode = (rawvalue >> I_MODE_SHIFT) &
-				    I_MODE_MASK;
-				if (mode == MODE_HWS)
-					drop_sample = B_TRUE;
-			}
-
-			/*
-			 * If the user wants to count instructions at a
-			 * particular trap level (0 or >0) and the sample is
-			 * at a different trap level, drop the sample.
-			 */
-			switch (pic->sampler.tl) {
-			case TLZ: /* Sample ONLY instr at TL == 0 */
-				if (tl != 0)
-					drop_sample = B_TRUE;
-				break;
-			case TLNZ: /* Sample ONLY instr at TL > 0 */
-				if (tl == 0)
-					drop_sample = B_TRUE;
-				break;
-			}
-
-			switch (GROUP(pic->sampler.syn_counter)) {
-			case I_GROUP_MODE:
-				/* Fields that are integers */
-				if (value == target && drop_sample == B_FALSE)
-					sample_hit_count++;
-				break;
-			case I_GROUP_EVT:
-			case I_GROUP_TYPE:
-				/* Fields that are bit vectors */
-				if (value & target && drop_sample == B_FALSE)
-					sample_hit_count++;
-				break;
-			default:
-				ASSERT(0); /* missing case statement */
-			}
-		}
-		sample_count++;
-		RINGBUF_MOVE_HEAD(ringbuf, head, size);
-	}
-	RINGBUF_SET_HEAD(ringbuf, head);
-
-	ret = synthesize_sample_count(pic, sample_count, sample_hit_count,
-	    "sample_instr_sampler", diffp);
-
-	return (ret);
-}
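
The TYP gating in the loop above exists because EVT is a shared bit vector
whose fields mean different things for loads, stores and branches; without the
check, a stale load-event bit in a store sample would be counted as a hit. A
compressed sketch of the validity rule, with placeholder encodings for the
TYPE_* and EVT_* values:

    /* EVT target bits are trusted only for matching instruction TYPes. */
    #define	TYPE_LD		0x1	/* placeholder encodings */
    #define	TYPE_ST		0x2
    #define	TYPE_CTI	0x4
    #define	EVT_DC_MISS	0x10
    #define	EVT_STB_FULL	0x20
    #define	EVT_CORRECT_BP	0x40

    static int
    evt_valid_for_type(unsigned int target_evt, unsigned int type)
    {
    	switch (target_evt) {
    	case EVT_DC_MISS:	/* load-side events require a load */
    		return ((type & TYPE_LD) != 0);
    	case EVT_STB_FULL:	/* store-buffer events require a store */
    		return ((type & TYPE_ST) != 0);
    	case EVT_CORRECT_BP:	/* branch events require a CTI */
    		return ((type & TYPE_CTI) != 0);
    	default:
    		return (1);	/* other events carry no TYP restriction */
    	}
    }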
-
-/*
- * mccdesr counters are synthetic counters. The hypervisor maintains
- * a 64-bit memory-based counter, so we can assume that this counter
- * never overflows.
- */
-static	int
-sample_mccdesr(rk_pcbe_config_t *pic, int64_t *diffp)
-{
-	uint64_t	rc = 0;
-	uint64_t	counter_value;
-	rc = hv_rk_perf_count_get((uint64_t)(pic->counter |
-	    pic->src_type), &counter_value);
-	if (rc == H_EOK) {
-		counter_value &= COUNTER_MASK(pic);
-		*diffp = counter_value - pic->pcbe_pic;
-		pic->pcbe_pic = counter_value;
-		if (*diffp < 0) {
-			cmn_err(CE_WARN, "CPU-%d: Pic-%d, counter: %X overflow",
-			    CPU->cpu_id, pic->pcbe_picno, pic->counter);
-		}
-	} else {
-		cmn_err(CE_WARN, "CPU-%d: Failed to sample pic-%d, counter-%X",
-		    CPU->cpu_id, pic->pcbe_picno, pic->counter);
-	}
-	return ((int)rc);
-}
-
-static int
-program_a_sampler(rk_pcbe_config_t *pic, program_sampler_data_t *sdata)
-{
-	uint64_t	ringbuf_pa, asi_list_pa, counter, rc;
-	int		hv_call_cnt = 1, ret = 0, need_init = 0, i;
-	uint64_t	temp_pcbe_pic = 0;
-
-	counter = (uint64_t)(pic->counter | pic->src_type);
-
-	if (pic->sampler.ring_buffer == NULL) {
-		pic->sampler.sample_size = sdata->asi_sample_num *
-		    sizeof (uint64_t);
-		rc = alloc_ringbuffer(pic, pic->sampler.sample_size,
-		    num_ringbuf_entries);
-		if (rc != 0)
-			return ((int)rc);
-		need_init = 1;
-		PRINT_PIC(pic, "After Configuration (S)");
-	}
-
-	if (need_init || pic->state == STATE_RELEASED) {
-		ringbuf_pa = va_to_pa(pic->sampler.ring_buffer);
-		rc = hv_rk_perf_sample_init(counter, ringbuf_pa);
-		print_hv_error(rc, &hv_call_cnt, sdata->name, pic);
-		if (rc != H_EOK)
-			return ((int)rc);
-	}
-
-	/*
-	 * If (((Reported Value ^ MATCH) & MASK) == 0) then sample is taken;
-	 */
-	for (i = 0; i < sdata->asi_config_num; i++) {
-		rc = hv_rk_perf_sample_config(counter, sdata->asi_config[i].va,
-		    sdata->asi_config[i].value);
-		ret |= (int)rc;
-		print_hv_error(rc, &hv_call_cnt, sdata->name, pic);
-	}
-
-	/*
-	 * pic->pcbe_pic is used to hold preset value in case of synthetic
-	 * counters
-	 */
-	if (pic->pcbe_pic > 0) {
-		temp_pcbe_pic = pic->pcbe_pic;
-		pic->pcbe_pic = 0;
-	}
-	ret |= rk_pcbe_program_normal(pic); /* Reset to zero & start counting */
-	pic->pcbe_pic = temp_pcbe_pic;
-
-	/*
-	 * Start sampling
-	 *
-	 * Data returned in the ringbuffer by the hypervisor will be in the
-	 * same order as it is programmed
-	 */
-	asi_list_pa = va_to_pa(sdata->asi_sample);
-	rc = hv_rk_perf_sample_start(counter, pic->sampler.frequency,
-	    sdata->asi_sample_num * sizeof (uint64_t), asi_list_pa);
-	ret |= (int)rc;
-	print_hv_error(rc, &hv_call_cnt, sdata->name, pic);
-	return (ret);
-}
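
The hypercall ordering in program_a_sampler is rigid: sample_init hands the
ring buffer's physical address to the hypervisor once, sample_config then
programs each ASI filter register, the counter is reset and started, and
sample_start finally passes the physical address of the ASI read list. A
sketch of just that ordering, with no-op stand-ins for the hypercalls:

    /* Ordering sketch; the hv_* stubs stand in for the real hypercalls. */
    #include <stdint.h>

    static int hv_sample_init(uint64_t c, uint64_t pa)
    { (void)c; (void)pa; return (0); }
    static int hv_sample_config(uint64_t c, uint64_t va, uint64_t v)
    { (void)c; (void)va; (void)v; return (0); }
    static int hv_count_start(uint64_t c, uint64_t v)
    { (void)c; (void)v; return (0); }
    static int hv_sample_start(uint64_t c, uint64_t f, uint64_t len, uint64_t pa)
    { (void)c; (void)f; (void)len; (void)pa; return (0); }

    static int
    program_sampler_order(uint64_t counter, uint64_t ringbuf_pa,
        const uint64_t (*cfg)[2], int ncfg, uint64_t freq,
        uint64_t asi_list_pa, uint64_t asi_list_len)
    {
    	int i, rc;

    	if ((rc = hv_sample_init(counter, ringbuf_pa)) != 0)
    		return (rc);		/* init must succeed before config */
    	for (i = 0; i < ncfg; i++)	/* mask/match/frequency registers */
    		rc |= hv_sample_config(counter, cfg[i][0], cfg[i][1]);
    	rc |= hv_count_start(counter, 0);	/* reset and start counter */
    	rc |= hv_sample_start(counter, freq, asi_list_len, asi_list_pa);
    	return (rc);
    }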
-
-static int
-synthesize_sample_count(rk_pcbe_config_t *pic, uint64_t sample_count,
-	uint64_t sample_hit_count, char *name, int64_t *diffp)
-{
-	uint64_t	total_count, rc, ovf_count, hit_count = 0;
-	int		hv_call_cnt = 1, ret = 0;
-
-	/*
-	 * Since the ring buffer has been consumed, clear the pending
-	 * sample count. The pending count itself is discarded, so
-	 * total_count is reused as scratch here.
-	 */
-	rc = hv_rk_perf_sample_pending((uint64_t)(pic->counter |
-	    pic->src_type), &total_count);
-	ret |= (int)rc;
-	print_hv_error(rc, &hv_call_cnt, name, pic);
-
-	/* Check if the counter overflowed */
-	rc = hv_rk_perf_count_overflow((uint64_t)(pic->counter |
-	    pic->src_type), &ovf_count);
-	ret |= (int)rc;
-	print_hv_error(rc, &hv_call_cnt, name, pic);
-
-	if (rc != H_EOK)
-		ovf_count = 0;
-
-	rc = hv_rk_perf_count_get((uint64_t)(pic->counter |
-	    pic->src_type), &total_count);
-	ret |= (int)rc;
-	print_hv_error(rc, &hv_call_cnt, name, pic);
-
-	if (rc != H_EOK)
-		total_count = 0;
-
-	total_count &= COUNTER_MASK(pic);
-
-	/*
-	 * Reset it to zero so that we need not maintain the old value.
-	 */
-	rc = hv_rk_perf_count_set((uint64_t)(pic->counter | pic->src_type), 0);
-	ret |= (int)rc;
-	print_hv_error(rc, &hv_call_cnt, name, pic);
-
-	/*
-	 * ovf_count > 0 means the counter has hit its max ovf_count
-	 * times before counting total_count instructions. Therefore
-	 * add ovf_count times the max count value to total_count.
-	 */
-	if (ovf_count)
-		total_count += (ovf_count * (0x1ULL << pic->counter_bits));
-
-	if (sample_count > 0)
-		hit_count = (sample_hit_count * total_count) / sample_count;
-
-	*diffp = (int64_t)hit_count;
-	DBG_PRINT(("CPU-%d: sample_instr_load. hit_count: %lu, *diffp: %ld\n",
-	    CPU->cpu_id, hit_count, *diffp));
-	if (*diffp < 0) {
-		cmn_err(CE_WARN, "CPU-%d Negative instr count. hit_count: %lu, "
-		    "*diffp: %ld\n", CPU->cpu_id, hit_count, *diffp);
-	}
-
-	if (pic->pcbe_pic) {
-		/*
-		 * pic->pcbe_pic is used to hold the preset value in the
-		 * case of synthetic counters.
-		 */
-		*diffp += pic->pcbe_pic;	/* Add the preset value */
-		pic->pcbe_pic = 0;
-	}
-	return (ret);
-}
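
The synthesis is plain proportional scaling: the hardware saw total_count
events (after folding ovf_count wrap-arounds back in), the sampler inspected
sample_count of them, and sample_hit_count matched, so the event estimate is
sample_hit_count * total_count / sample_count. A worked run with made-up
numbers and a toy counter width:

    /* Worked example of the scaling above; all numbers are illustrative. */
    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
    	uint64_t counter_bits = 8;	/* toy width for readability */
    	uint64_t total_count = 100;	/* raw counter read */
    	uint64_t ovf_count = 2;		/* counter wrapped twice */
    	uint64_t sample_count = 50;	/* entries drained from ring buffer */
    	uint64_t sample_hit_count = 10;	/* entries matching the event */
    	uint64_t hit_count = 0;

    	total_count += ovf_count * (1ULL << counter_bits);	/* 612 */
    	if (sample_count > 0)
    		hit_count = (sample_hit_count * total_count) / sample_count;
    	printf("%llu\n", (unsigned long long)hit_count);	/* 122 */
    	return (0);
    }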
-
-static int
-alloc_ringbuffer(rk_pcbe_config_t *pic, uint32_t size,
-						uint32_t num_samples)
-{
-	uint32_t	ringbuf_size;
-	uint32_t	asize = 2;
-	rk_pcbe_ringbuf_t	*ringbuf;
-	ASSERT(!(num_samples & 1)); /* Assert number of samples is even */
-
-	ringbuf_size = sizeof (rk_pcbe_ringbuf_t) + (size * num_samples);
-
-	/* Round the allocation size up to a power of two */
-	while ((ringbuf_size & (asize - 1)) != ringbuf_size)
-		asize <<= 1;
-
-	ringbuf = contig_mem_alloc_align_sleep(asize, 0);
-	if (ringbuf == NULL) {
-		cmn_err(CE_WARN, "CPU-%d: Ringbuffer memory allocation failed!",
-		    CPU->cpu_id);
-		return (-1);
-	}
-	pic->sampler.ring_buffer = ringbuf;
-	ringbuf->head = NULL;
-	ringbuf->tail = NULL;
-	ringbuf->size = size * num_samples;
-	ringbuf->hwm = ringbuf->size >> 1;
-	return (0);
-}
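
The sizing loop deserves a second look: it grows asize until asize - 1 covers
every set bit of ringbuf_size, so it settles on the smallest power of two
strictly greater than the size; a request of exactly 64 bytes therefore
allocates 128. A standalone check of the same loop:

    /* The alloc_ringbuffer rounding loop, isolated for inspection. */
    #include <stdio.h>
    #include <stdint.h>

    static uint32_t
    round_up_pow2_exclusive(uint32_t size)
    {
    	uint32_t asize = 2;

    	while ((size & (asize - 1)) != size)
    		asize <<= 1;
    	return (asize);
    }

    int
    main(void)
    {
    	printf("%u %u %u\n",
    	    round_up_pow2_exclusive(63),	/* 64 */
    	    round_up_pow2_exclusive(64),	/* 128: exact powers double */
    	    round_up_pow2_exclusive(100));	/* 128 */
    	return (0);
    }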
-
-static void
-free_ringbuffer(rk_pcbe_config_t *pic)
-{
-	rk_pcbe_ringbuf_t	*ringbuf = pic->sampler.ring_buffer;
-	/*
-	 * When multiple pics are used and one of the pics was not configurable
-	 * (e.g. a bad attribute), cpc calls rk_pcbe_free for the pics that
-	 * were already configured. This results in calling this routine with
-	 * a NULL ringbuf, since ringbuf is allocated when the first sample is
-	 * taken. To protect against this condition, we need to do the
-	 * following check before calling contig_mem_free, which uses
-	 * ringbuf->size.
-	 */
-	if (ringbuf) {
-		uint32_t	ringbuf_size;
-		uint32_t	asize = 2;
-		DBG_PRINT(("CPU-%d: free_ringbuffer freeing %d bytes\n",
-		    CPU->cpu_id,
-		    (int)(sizeof (rk_pcbe_ringbuf_t) + ringbuf->size)));
-		ringbuf_size = sizeof (rk_pcbe_ringbuf_t) + ringbuf->size;
-		while ((ringbuf_size & (asize - 1)) != ringbuf_size)
-			asize <<= 1;
-		contig_mem_free(ringbuf, asize);
-	}
-}
-
-static void
-print_hv_error(uint64_t rc, int *cntp, char *funcname, rk_pcbe_config_t *pic)
-{
-	ASSERT(cntp && pic);
-	if (rc != H_EOK) {
-		cmn_err(CE_WARN, "{%d} pgm-hw call-%d in %s returned 0x%lX for "
-		    "pic %d cntr %X", CPU->cpu_id, *cntp, funcname, rc,
-		    pic->pcbe_picno, pic->counter);
-	}
-	(*cntp)++;
-}
-
-static	void
-set_string_constants(void)
-{
-	if (strncmp(cpu_module_name, "SUNW,", 5) == 0)
-		rock_name = &cpu_module_name[5];
-	else
-		rock_name = cpu_module_name;
-	(void) strcpy(rock_cpuref, "See the \"");
-	(void) strcat(rock_cpuref, rock_name);
-	(void) strcat(rock_cpuref, " User's Manual\" for descriptions of "
-	    "these events. "CPU_REF_URL);
-	(void) strcat(pcbe_module_name, cpu_module_name);
-}
-
-static	uint64_t
-bitmask(uint8_t bits)
-{
-	if (bits < 64)
-		return ((1ULL << bits) - 1);
-	return (-1);
-}
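
The bits < 64 guard in bitmask is not cosmetic: shifting a 64-bit value by 64
or more is undefined behavior in C, so the all-ones case must be returned
explicitly. For example, bitmask(12) yields 0xFFF, while any argument of 64 or
more yields (uint64_t)-1, the all-ones pattern.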
-
-#ifdef RKPCBE_DBG
-static	void
-set_pic_name(rk_pcbe_config_t *pic)
-{
-	uint32_t	bits;
-	const struct nametable	*n;
-
-	/*
-	 * For normal instruction counter, the 'bits' value is not saved.
-	 */
-	if (pic->counter_type == NORMAL_COUNTER) {
-		if (pic->counter == RK_PERF_INSTR) {
-			(void) strcpy(pic->name, "Instr_All");
-			return;
-		}
-		bits = pic->flags;
-	} else {
-		bits = pic->sampler.syn_counter;
-	}
-
-	for (n = events[pic->pcbe_picno]; n->bits != NT_END; n++) {
-		if (n->bits == bits) {
-			(void) strcpy(pic->name, n->name);
-			break;
-		}
-	}
-}
-
-static void
-print_pic(rk_pcbe_config_t *pic, char *heading)
-{
-	ASSERT(pic);
-	/*
-	 * On a multi-strand system, the output gets clobbered. Therefore
-	 * grab a lock so that the output is legible.
-	 */
-	mutex_enter(&print_pic_lock);
-	printf("{CPU-%d} %s:\n", CPU->cpu_id, heading);
-	printf("pic addr     : %p\n", (void *)pic);
-	printf("name         : %s\n", pic->name);
-	printf("pcbe_picno   : %d\n", pic->pcbe_picno);
-	printf("counter_bits : 0x%X\n", pic->counter_bits);
-	printf("counter_type : 0x%X\n", pic->counter_type);
-	printf("toe          : %d\n", pic->toe);
-	printf("counter      : 0x%X\n", pic->counter);
-	printf("src_type     : 0x%X\n", pic->src_type);
-	printf("flags        : 0x%X\n", pic->flags);
-	printf("pcbe_pic     : %ld\n", pic->pcbe_pic);
-	printf("inuse        : %d\n", pic->inuse);
-	printf("state        : 0x%X\n", pic->state);
-	printf("cpu          : %d\n", pic->cpu);
-	if (pic->counter_type == SYNTHETIC_COUNTER) {
-		printf("Synthetic counter:\n");
-		printf("\tsyn_pic: 0x%X\n", (int)pic->sampler.synthetic_pic);
-		printf("\tfreq   : %d\n", pic->sampler.frequency);
-		printf("\tsyn_cnt: 0x%X\n", pic->sampler.syn_counter);
-		printf("\tsize   : %d bytes\n", pic->sampler.sample_size);
-		printf("\tflags  : 0x%X\n", pic->sampler.flags);
-		printf("\ttl     : 0x%X\n", pic->sampler.tl);
-		printf("\tnohws  : 0x%X\n", pic->sampler.nohws);
-		printf("\trbuf   : 0x%p\n", (void *)pic->sampler.ring_buffer);
-		if (pic->sampler.ring_buffer) {
-			rk_pcbe_ringbuf_t *rb = pic->sampler.ring_buffer;
-			printf("\tRingbuffer:\n");
-			printf("\t\tHead: 0x%X\n", rb->head);
-			printf("\t\tTail: 0x%X\n", rb->tail);
-			printf("\t\tSize: 0x%X\n", rb->size);
-			printf("\t\tHwm : 0x%X\n", rb->hwm);
-		}
-	}
-	printf("-----------------\n");
-	mutex_exit(&print_pic_lock);
-}
-#endif
--- a/usr/src/uts/sun4v/rock/Makefile	Thu Aug 06 17:19:00 2009 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,113 +0,0 @@
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-
-#
-# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
-# Use is subject to license terms.
-#
-#	This makefile drives the production of the UltraSPARC-AT10 cpu module.
-#
-#	sun4v implementation architecture dependent
-#
-
-#
-#	Path to the base of the uts directory tree (usually /usr/src/uts).
-#
-UTSBASE	= ../..
-
-#
-#	Define the module and object file sets.
-#
-MODULE		= SUNW,UltraSPARC-AT10
-OBJECTS		= $(ROCKCPU_OBJS:%=$(OBJS_DIR)/%)
-LINTS		= $(ROCKCPU_OBJS:%.o=$(LINTS_DIR)/%.ln)
-ROOTMODULE	= $(ROOT_PSM_CPU_DIR)/$(MODULE)
-
-CPU_DIR		= .
-HERE		= ../rock
-
-#
-#	Include common rules.
-#
-include $(UTSBASE)/sun4v/Makefile.sun4v
-
-#
-#	Override defaults
-#
-CLEANFILES	+= $(CPULIB) $(SYM_MOD)
-
-#
-#	Define targets
-#
-ALL_TARGET	= $(SYM_MOD)
-LINT_TARGET	= $(MODULE).lint
-INSTALL_TARGET	= def $(BINARY) $(ROOTMODULE)
-
-#
-# ATOMIC_BO_ENABLE_SHIFT enables backoff in the atomic routines.
-# ATOMIC_SIMPLE_BO_ENABLE enables the simple backoff required for Rock.
-#
-ATOMIC_BO_FLAG = -DATOMIC_BO_ENABLE_SHIFT=14 -DATOMIC_SIMPLE_BO_ENABLE
-
-#
-# lint pass one enforcement
-#
-CFLAGS += $(CCVERBOSE) $(ATOMIC_BO_FLAG)
-
-#
-# cpu-module-specific flags
-#
-CPPFLAGS +=    -DCPU_MODULE $(ATOMIC_BO_FLAG)
-AS_CPPFLAGS += -DCPU_MODULE -DCUSTOM_FPZERO $(ATOMIC_BO_FLAG)
-LINTFLAGS   += -DCUSTOM_FPZERO
-
-#
-#	Default build targets.
-#
-.KEEP_STATE:
-
-def:		$(DEF_DEPS)
-
-all:		$(ALL_DEPS)
-
-clean:		$(CLEAN_DEPS)
-
-clobber:	$(CLOBBER_DEPS)
-
-lint:		$(LINT_DEPS)
-
-modlintlib:	$(MODLINTLIB_DEPS)
-
-clean.lint:	$(CLEAN_LINT_DEPS)
-
-install:	$(INSTALL_DEPS)
-
-$(CPULIB):	$(BINARY)
-	$(LD) -o $(CPULIB) -G $(BINARY)
-
-$(SYM_MOD):	$(UNIX_O) $(CPULIB)
-	@echo "resolving symbols against unix.o"
-	@(cd $(UNIX_DIR); pwd; \
-	    CPU_DIR=$(HERE) SYM_MOD=$(HERE)/$(SYM_MOD) $(MAKE) symcheck)
-
-#
-#	Include common targets.
-#
-include $(UTSBASE)/$(PLATFORM)/Makefile.targ
--- a/usr/src/uts/sun4v/rock_pcbe/Makefile	Thu Aug 06 17:19:00 2009 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,77 +0,0 @@
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-
-#
-# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
-# Use is subject to license terms.
-#
-# This Makefile builds the Rock Performance Counter BackEnd (PCBE).
-#
-
-UTSBASE = ../..
-
-#
-#	Define module and object file sets.
-#
-MODULE		= pcbe.SUNW,UltraSPARC-AT10
-OBJECTS		= $(RK_PCBE_OBJS:%=$(OBJS_DIR)/%)
-LINTS		= $(RK_PCBE_OBJS:%.o=$(LINTS_DIR)/%.ln)
-ROOTMODULE	= $(ROOT_PSM_PCBE_DIR)/$(MODULE)
-
-#
-#	Include common rules.
-#
-include $(UTSBASE)/sun4v/Makefile.sun4v
-
-#
-#	Define targets.
-#
-ALL_TARGET	= $(BINARY)
-LINT_MODULE	= rock_pcbe
-LINT_TARGET	= $(LINT_MODULE).lint
-INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)
-EXTRA_OPTIONS	+= -URKPCBE_DBG
-
-#
-#	Default build targets.
-#
-.KEEP_STATE:
-
-def:		$(DEF_DEPS)
-
-all:		$(ALL_DEPS)
-
-clean:		$(CLEAN_DEPS)
-
-clobber:	$(CLOBBER_DEPS)
-
-lint:		$(LINT_DEPS)
-
-modlintlib:	$(MODLINTLIB_DEPS)
-
-clean.lint:	$(CLEAN_LINT_DEPS)
-
-install:	$(INSTALL_DEPS)
-
-#
-#	Include common targets.
-#
-include $(UTSBASE)/sun4v/Makefile.targ
--- a/usr/src/uts/sun4v/sys/error.h	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/sys/error.h	Thu Aug 06 17:39:39 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -118,7 +118,7 @@
 	uint64_t	stick;		/* Value of the %STICK register */
 	uint32_t	desc;		/* Error Descriptor */
 	uint32_t	attr;		/* error attributes bit field */
-	uint64_t	addr;		/* va for ERRH_ATTR_ASI, otherwise ra */
+	uint64_t	ra;		/* Real address */
 	uint32_t	sz;		/* Size of affected mem region */
 	uint16_t	cpuid;		/* Virtual ID of the affected CPU */
 	uint16_t	secs;		/* Seconds */
--- a/usr/src/uts/sun4v/sys/hsvc.h	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/sys/hsvc.h	Thu Aug 06 17:39:39 2009 -0700
@@ -42,7 +42,6 @@
 #define	HSVC_GROUP_CORE			0x0001
 #define	HSVC_GROUP_INTR			0x0002
 #define	HSVC_GROUP_SOFT_STATE		0x0003
-#define	HSVC_GROUP_MEM_IFLUSH		0x0010
 #define	HSVC_GROUP_TM			0x0080
 #define	HSVC_GROUP_VPCI			0x0100
 #define	HSVC_GROUP_LDC			0x0101
@@ -54,9 +53,6 @@
 #define	HSVC_GROUP_NIAGARA2_CPU		0x0202
 #define	HSVC_GROUP_NIU			0x0204
 #define	HSVC_GROUP_VFALLS_CPU		0x0205
-#define	HSVC_GROUP_RKPERF		0x0206
-#define	HSVC_GROUP_RKMMU_EXT		0x0207
-#define	HSVC_GROUP_RKCPU		0x0208
 #define	HSVC_GROUP_DIAG			0x0300
 
 #ifndef _ASM
@@ -82,8 +78,6 @@
  */
 #define	HSVC_REV_1		1
 
-extern	int	hsvc_kdi_mem_iflush_negotiated;
-
 /*
  * External interface
  */
--- a/usr/src/uts/sun4v/sys/hypervisor_api.h	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/sys/hypervisor_api.h	Thu Aug 06 17:39:39 2009 -0700
@@ -110,8 +110,6 @@
 
 #define	HV_MEM_SCRUB		0x31
 #define	HV_MEM_SYNC		0x32
-#define	HV_MEM_IFLUSH		0x33
-#define	HV_MEM_IFLUSH_ALL	0x34
 
 #define	HV_INTR_SEND		0x42
 
@@ -199,12 +197,6 @@
 #define	MAP_DTLB		0x1
 #define	MAP_ITLB		0x2
 
-/*
- * Definitions for TLB Search Order functions
- */
-#define	TLB_SO_DATA		0x1
-#define	TLB_SO_INS		0x2
-#define	TLB_SO_ID		(TLB_SO_DATA | TLB_SO_INS)
 
 /*
  * Interrupt state manipulation definitions.
@@ -325,7 +317,6 @@
  */
 #define	HVIO_DMA_SYNC_DIR_TO_DEV		0x01
 #define	HVIO_DMA_SYNC_DIR_FROM_DEV		0x02
-#define	HVIO_DMA_SYNC_DIR_NO_ICACHE_FLUSH	0x04
 
 /*
  * LDC Channel States
@@ -360,9 +351,6 @@
     uint64_t *scrubbed_len);
 extern uint64_t hv_mem_sync(uint64_t real_addr, uint64_t length,
     uint64_t *flushed_len);
-extern uint64_t hv_mem_iflush(uint64_t real_addr, uint64_t length,
-    uint64_t *flushed_len);
-extern uint64_t hv_mem_iflush_all(void);
 extern uint64_t hv_tm_enable(uint64_t enable);
 
 extern uint64_t hv_service_recv(uint64_t s_id, uint64_t buf_pa,
--- a/usr/src/uts/sun4v/sys/machcpuvar.h	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/sys/machcpuvar.h	Thu Aug 06 17:39:39 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -181,7 +181,6 @@
 	id_t		cpu_core;		/* cpu core id */
 	id_t		cpu_chip;		/* cpu chip id */
 	kthread_t	*startup_thread;
-	uint64_t	cpu_nre_error;		/* nonresumable error */
 };
 
 typedef	struct machcpu	machcpu_t;
--- a/usr/src/uts/sun4v/sys/machsystm.h	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/sys/machsystm.h	Thu Aug 06 17:39:39 2009 -0700
@@ -249,7 +249,6 @@
 extern void *contig_mem_alloc(size_t);
 extern void *contig_mem_alloc_align(size_t, size_t);
 extern void contig_mem_free(void *, size_t);
-extern void *contig_mem_alloc_align_sleep(size_t, size_t);
 
 /*
  * Caches
--- a/usr/src/uts/sun4v/sys/mmu.h	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/sys/mmu.h	Thu Aug 06 17:39:39 2009 -0700
@@ -156,18 +156,6 @@
 #define	MIN_NSHCONTEXTS			1
 #define	MIN_NTSBS			4
 
-/*
- * The number of shared contexts supported in search list entries for the
- * pagesize register.
- */
-#define	NSEARCH_SHCONTEXTS		1
-
-/*
- * The maximum number of entries allowed in a search list for the pagesize
- * register.
- */
-#define	MAX_PGSZ_SEARCH_ORDER		8
-
 #ifdef	__cplusplus
 }
 #endif
--- a/usr/src/uts/sun4v/sys/pte.h	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/sys/pte.h	Thu Aug 06 17:39:39 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -61,7 +61,7 @@
 		unsigned int	w:1;		/* <6> write perm */
 		unsigned int	ref:1;		/* <5> sw - ref */
 		unsigned int	wr_perm:1;	/* <4> sw - write perm */
-		unsigned int	xsoft:1;	/* <3> sw - soft execute */
+		unsigned int	rsvd:1;		/* <3> reserved */
 		unsigned int	sz:3;		/* <2:0> pagesize */
 	} tte_bit;
 	struct {
@@ -83,7 +83,6 @@
 #define	tte_no_sync	tte_bit.no_sync
 #define	tte_suspend	tte_bit.susp
 #define	tte_exec_perm	tte_bit.x
-#define	tte_soft_exec	tte_bit.xsoft
 #define	tte_lock	tte_bit.lock
 #define	tte_cp		tte_bit.cp
 #define	tte_cv		tte_bit.cv
@@ -163,7 +162,6 @@
 #define	TTE_HWWR_INT			0x00000040
 #define	TTE_REF_INT			0x00000020
 #define	TTE_WRPRM_INT			0x00000010
-#define	TTE_SOFTEXEC_INT		0x00000008
 
 #define	TTE_PROT_INT			(TTE_WRPRM_INT | TTE_PRIV_INT)
 
@@ -245,7 +243,6 @@
 #define	TTE_IS_8K(ttep)		(TTE_CSZ(ttep) == TTE8K)
 #define	TTE_IS_WRITABLE(ttep)	((ttep)->tte_wr_perm)
 #define	TTE_IS_EXECUTABLE(ttep)	((ttep)->tte_exec_perm)
-#define	TTE_IS_SOFTEXEC(ttep)	((ttep)->tte_soft_exec)
 #define	TTE_IS_PRIVILEGED(ttep)	((ttep)->tte_priv)
 #define	TTE_IS_NOSYNC(ttep)	((ttep)->tte_no_sync)
 #define	TTE_IS_LOCKED(ttep)	((ttep)->tte_lock)
@@ -275,8 +272,6 @@
 #define	TTE_CLR_WRT(ttep)	((ttep)->tte_wr_perm = 0)
 #define	TTE_SET_EXEC(ttep)	((ttep)->tte_exec_perm = 1)
 #define	TTE_CLR_EXEC(ttep)	((ttep)->tte_exec_perm = 0)
-#define	TTE_SET_SOFTEXEC(ttep)	((ttep)->tte_soft_exec = 1)
-#define	TTE_CLR_SOFTEXEC(ttep)	((ttep)->tte_soft_exec = 0)
 #define	TTE_SET_PRIV(ttep)	((ttep)->tte_priv = 1)
 #define	TTE_CLR_PRIV(ttep)	((ttep)->tte_priv = 0)
 
--- a/usr/src/uts/sun4v/sys/rock_hypervisor_api.h	Thu Aug 06 17:19:00 2009 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,100 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ROCK_HYPERVISOR_API_H
-#define	_SYS_ROCK_HYPERVISOR_API_H
-
-/*
- * sun4v rock Hypervisor API
- *
- */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Function numbers for managing the Rock TLB page size register.
- */
-#define	MMU_GET_NONPRIV_SEARCH		0x13b
-#define	MMU_SET_NONPRIV_SEARCH		0x13c
-#define	MMU_GET_PRIV_SEARCH		0x13d
-#define	MMU_SET_PRIV_SEARCH		0x13e
-
-/*
- * Function numbers for performance counters
- */
-#define	HV_RK_PERF_COUNT_INIT		0x108
-#define	HV_RK_PERF_COUNT_RELEASE	0x109
-#define	HV_RK_PERF_COUNT_SET		0x10A
-#define	HV_RK_PERF_COUNT_GET		0x10B
-#define	HV_RK_PERF_COUNT_START		0x10C
-#define	HV_RK_PERF_COUNT_OVERFLOW	0x10D
-#define	HV_RK_PERF_COUNT_STOP		0x10E
-
-#define	HV_RK_PERF_SAMPLE_INIT		0x135
-#define	HV_RK_PERF_SAMPLE_RELEASE	0x136
-#define	HV_RK_PERF_SAMPLE_CONFIG	0x137
-#define	HV_RK_PERF_SAMPLE_START		0x138
-#define	HV_RK_PERF_SAMPLE_PENDING	0x139
-#define	HV_RK_PERF_SAMPLE_STOP		0x13A
-
-#define	HV_RK_PERF_SRC_STRAND		0x1	/* Local Strand */
-#define	HV_RK_PERF_SRC_STRAND_M		0x2	/* Multiple Strands */
-#define	HV_RK_PERF_SRC_SIU		0x4	/* L2 txn source */
-#define	HV_RK_PERF_SRC_MMU		0x8	/* L2 txn source */
-#define	HV_RK_PERF_SRC_MASK		0xF
-
-#define	ROCK_HSVC_MAJOR		1
-#define	ROCK_HSVC_MINOR		0
-
-#ifndef	_ASM
-
-/* Performance Counter API */
-extern uint64_t hv_rk_perf_count_init(uint64_t counter);
-extern uint64_t hv_rk_perf_count_release(uint64_t counter);
-extern uint64_t hv_rk_perf_count_set(uint64_t counter, uint64_t value);
-extern uint64_t hv_rk_perf_count_get(uint64_t counter, uint64_t *value);
-extern uint64_t hv_rk_perf_count_start(uint64_t counter, uint64_t value);
-extern uint64_t hv_rk_perf_count_overflow(uint64_t counter, uint64_t *ovf_cnt);
-extern uint64_t hv_rk_perf_count_stop(uint64_t counter);
-
-/* Performance Sampler API */
-extern uint64_t hv_rk_perf_sample_init(uint64_t sampler, uint64_t ringbuf_pa);
-extern uint64_t hv_rk_perf_sample_release(uint64_t sampler);
-extern uint64_t hv_rk_perf_sample_config(uint64_t sampler, uint64_t reg_va,
-							uint64_t reg_value);
-extern uint64_t hv_rk_perf_sample_start(uint64_t sampler, uint64_t freq,
-					uint64_t list_size, uint64_t valist_pa);
-extern uint64_t hv_rk_perf_sample_pending(uint64_t sampler, uint64_t *pend_cnt);
-extern uint64_t hv_rk_perf_sample_stop(uint64_t counter);
-#endif	/* _ASM */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ROCK_HYPERVISOR_API_H */
--- a/usr/src/uts/sun4v/sys/rockasi.h	Thu Aug 06 17:19:00 2009 -0700
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,68 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_ROCKASI_H
-#define	_SYS_ROCKASI_H
-
-/*
- * alternate address space identifiers
- *
- * 0x00 - 0x2F are privileged
- * 0x30 - 0x7f are hyperprivileged
- * 0x80 - 0xFF can be used by non-privileged, privileged & hyperprivileged
- */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * ROCK specific ASIs
- */
-#define	ASI_CACHE_SPARING_P	0xF4	/* Cache sparing */
-
-#ifndef	_ASM
-struct	cpsregs {
-	uint64_t	fails;
-	uint64_t	exog;
-	uint64_t	coh;
-	uint64_t	tcc;
-	uint64_t	instr;
-	uint64_t	precise;
-	uint64_t	async;
-	uint64_t	size;
-	uint64_t	ld;
-	uint64_t	st;
-	uint64_t	cti;
-	uint64_t	fp;
-	uint64_t	zeros;
-};
-#endif	/* _ASM */
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_ROCKASI_H */
--- a/usr/src/uts/sun4v/vm/mach_sfmmu.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/vm/mach_sfmmu.c	Thu Aug 06 17:39:39 2009 -0700
@@ -45,7 +45,6 @@
 #include <sys/vmsystm.h>
 #include <sys/bitmap.h>
 #include <vm/rm.h>
-#include <vm/vm_dep.h>
 #include <sys/t_lock.h>
 #include <sys/vm_machparam.h>
 #include <sys/promif.h>
@@ -60,7 +59,6 @@
 #include <sys/reboot.h>
 #include <sys/kdi.h>
 #include <sys/hypervisor_api.h>
-#include <sys/hsvc.h>
 
 /*
  * External routines and data structures
@@ -169,7 +167,7 @@
 		prom_panic("can't find kernel text pfn");
 	pfn &= TTE_PFNMASK(TTE4M);
 
-	attr = PROC_TEXT | HAT_NOSYNC | HAT_ATTR_NOSOFTEXEC;
+	attr = PROC_TEXT | HAT_NOSYNC;
 	flags = HAT_LOAD_LOCK | SFMMU_NO_TSBLOAD;
 	sfmmu_memtte(&ktext_tte, pfn, attr, TTE4M);
 	/*
@@ -185,7 +183,7 @@
 		prom_panic("can't find kernel data pfn");
 	pfn &= TTE_PFNMASK(TTE4M);
 
-	attr = PROC_DATA | HAT_NOSYNC | HAT_ATTR_NOSOFTEXEC;
+	attr = PROC_DATA | HAT_NOSYNC;
 	sfmmu_memtte(&kdata_tte, pfn, attr, TTE4M);
 	/*
 	 * We set the lock bit in the tte to lock the translation in
@@ -210,7 +208,7 @@
 		ASSERT(tsbsz >= MMU_PAGESIZE4M);
 		ASSERT(IS_P2ALIGNED(tsbsz, tsbsz));
 		ASSERT(IS_P2ALIGNED(va, tsbsz));
-		attr = PROC_DATA | HAT_NOSYNC | HAT_ATTR_NOSOFTEXEC;
+		attr = PROC_DATA | HAT_NOSYNC;
 		while (tsbsz != 0) {
 			ASSERT(i < MAX_BIGKTSB_TTES);
 			pfn = va_to_pfn(va);
@@ -294,8 +292,7 @@
 	pfn_t pfn = va_to_pfn(va);
 	uint64_t ret;
 
-	sfmmu_memtte(&tte, pfn, PROC_TEXT | HAT_NOSYNC | HAT_ATTR_NOSOFTEXEC,
-	    TTE8K);
+	sfmmu_memtte(&tte, pfn, (PROC_TEXT | HAT_NOSYNC), TTE8K);
 	ret = hv_mmu_map_perm_addr(va, KCONTEXT, *(uint64_t *)&tte,
 	    MAP_ITLB | (do_dtlb ? MAP_DTLB : 0));
 
@@ -481,22 +478,3 @@
 sfmmu_cache_flushall()
 {
 }
-
-/*
- * Initialise the real address field in sfmmu_pgsz_order.
- */
-void
-sfmmu_init_pgsz_hv(sfmmu_t *sfmmup)
-{
-	int i;
-
-	/*
-	 * Initialize mmu counts for pagesize register programming.
-	 */
-	for (i = 0; i < max_mmu_page_sizes; i++) {
-		sfmmup->sfmmu_mmuttecnt[i] = 0;
-	}
-
-	sfmmup->sfmmu_pgsz_order.hv_pgsz_order_pa =
-	    va_to_pa(&sfmmup->sfmmu_pgsz_order.hv_pgsz_order);
-}
--- a/usr/src/uts/sun4v/vm/mach_sfmmu.h	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/vm/mach_sfmmu.h	Thu Aug 06 17:39:39 2009 -0700
@@ -36,7 +36,6 @@
 
 #include <sys/x_call.h>
 #include <sys/hypervisor_api.h>
-#include <sys/mmu.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -61,29 +60,8 @@
 	hv_tsb_info_t	hv_tsb_info[NHV_TSB_INFO]; /* hypervisor TSB info */
 };
 
-/*
- * Defines for hypervisor pagesize search API.
- */
-
-#define	TLB_PGSZ_ENABLE_SHIFT	15
-#define	TLB_PGSZ_CTX_SHIFT	7
-#define	TLB_PGSZ_ENABLE		(1<<TLB_PGSZ_ENABLE_SHIFT)
-#define	TLB_PGSZ_CONTEXT1	(1<<TLB_PGSZ_CTX_SHIFT)
-#define	TLB_PGSZ_CONTEXT1_ENABLE (TLB_PGSZ_ENABLE|TLB_PGSZ_CONTEXT1)
-
-struct hv_pgsz_order {
-	uint64_t hv_pgsz_order_pa;	/* hypervisor pagesize order PA */
-					/* hypervisor pagesize order */
-	uint16_t hv_pgsz_order[MAX_PGSZ_SEARCH_ORDER];
-};
-
-#define	sfmmu_pgsz_order_hv sfmmu_pgsz_order.hv_pgsz_order
-
 #endif /* _ASM */
 
-/* value for sfmmu_pgsz_map if all shared pagesizes are allowed */
-#define	TLB_ALL_SHARED_PGSZ	0xff
-
 #ifdef _ASM
 
 /*
@@ -333,47 +311,6 @@
 label/**/1:
 
 /*
- * Support for non-coherent I$.
- *
- * In sun4v we use tte bit 3 as a software flag indicating whether
- * execute permission is given. IMMU miss traps cause the real execute
- * permission to be set. sfmmu_ttesync() will see if execute permission
- * has been set, and then set P_EXEC in page_t. This causes I-cache
- * flush when the page is freed.
- *
- * However, the hypervisor reserves bit 3 as part of a 4-bit page size.
- * We allow this flag to be set in hme TTE, but never in TSB or TLB.
- */
-#define	TTE_CLR_SOFTEXEC_ML(tte)	bclr TTE_SOFTEXEC_INT, tte
-#define	TTE_CHK_SOFTEXEC_ML(tte)	andcc tte, TTE_SOFTEXEC_INT, %g0
-
-/*
- * TTE_SET_EXEC_ML is a macro that updates the exec bit if it is
- * not already set. Will also set reference bit at the same time.
- *
- * Caller must check EXECPRM. Do not call if it is already set in the tte.
- *
- * Parameters:
- * tte      = reg containing tte
- * ttepa    = physical pointer to tte
- * tmp1     = tmp reg
- * label    = temporary label
- */
-
-#define	TTE_SET_EXEC_ML(tte, ttepa, tmp1, label)			\
-	/* BEGIN CSTYLED */						\
-	/* update execprm bit */					\
-label/**/1:								\
-	or	tte, (TTE_EXECPRM_INT | TTE_REF_INT), tmp1;		\
-	casxa	[ttepa]ASI_MEM, tte, tmp1; 	/* update bits */	\
-	cmp	tte, tmp1;						\
-	bne,a,pn %xcc, label/**/1;					\
-	  mov	tmp1, tte;						\
-	or	tte, (TTE_EXECPRM_INT | TTE_REF_INT), tte;		\
-	/* END CSTYLED */
-
-
-/*
  * TTE_SET_REF_ML is a macro that updates the reference bit if it is
  * not already set.
  *
@@ -597,27 +534,6 @@
 label:
 	/* END CSTYLED */
 
-/*
- * For shared context mappings, check against the page size bitmap in the
- * tsbmiss area to decide if we should use private mappings instead to reduce
- * the number of shared page size searches on Rock based platforms.
- * In:
- *   tsbarea (not clobbered)
- *   tte (not clobbered)
- *   tmp (clobbered)
- * Out:
- *   use_shctx - changed to 0 if page size bit is not set in mask.
- */
-#define	CHECK_SHARED_PGSZ(tsbarea, tte, tmp, use_shctx, label)  \
-	/* BEGIN CSTYLED */					     \
-	brz     use_shctx, label/**/1				    ;\
-	 and    tte, TTE_SZ_BITS, tmp			    	    ;\
-	ldub    [tsbarea + TSBMISS_PGSZ_BITMAP], use_shctx	    ;\
-	srlx    use_shctx, tmp, use_shctx			    ;\
-	and     use_shctx, 0x1, use_shctx			    ;\
-label/**/1:
-	/* END CSTYLED */
-
 #endif /* _ASM */
 
 #ifdef	__cplusplus
--- a/usr/src/uts/sun4v/vm/mach_sfmmu_asm.s	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/vm/mach_sfmmu_asm.s	Thu Aug 06 17:39:39 2009 -0700
@@ -41,7 +41,6 @@
 #include <sys/pte.h>
 #include <sys/mmu.h>
 #include <vm/hat_sfmmu.h>
-#include <vm/mach_sfmmu.h>
 #include <vm/seg_spt.h>
 #include <sys/machparam.h>
 #include <sys/privregs.h>
@@ -50,7 +49,6 @@
 #include <sys/machthread.h>
 #include <sys/clock.h>
 #include <sys/trapstat.h>
-#include <sys/rock_hypervisor_api.h>
 
 /*
  * sfmmu related subroutines
@@ -79,7 +77,8 @@
 /* ARGSUSED */
 void
 sfmmu_load_mmustate(sfmmu_t *sfmmup)
-{}
+{
+}
 
 #else	/* lint */
 
@@ -282,7 +281,7 @@
 	sethi	%hi(ksfmmup), %o3
 	ldx	[%o3 + %lo(ksfmmup)], %o3
 	cmp	%o3, %o0
-	be,pn	%xcc, 8f			! if kernel as, do nothing
+	be,pn	%xcc, 7f			! if kernel as, do nothing
 	  nop
 	
 	set     MMU_SCONTEXT, %o3
@@ -340,7 +339,7 @@
 
 	ldx	[%g2 + SCD_SFMMUP], %g3		! %g3 = scdp->scd_sfmmup
 	ldx	[%g3 + SFMMU_TSB], %o1		! %o1 = first scd tsbinfo
-	brz,pn %o1, 1f
+	brz,pn %o1, 9f
 	  nop					! panic if no third TSB
 
 	/* make 3rd UTSBREG */
@@ -383,26 +382,9 @@
 	mov	MMU_TSB_CTXNON0, %o5
 	ta	FAST_TRAP			! set TSB info for user process
 	brnz,a,pn %o0, panic_bad_hcall
-	  mov	MMU_TSB_CTXNON0, %o1
-	mov	%o3, %o0			! restore saved sfmmup to %o0
+	mov	MMU_TSB_CTXNON0, %o1
+	mov	%o3, %o0			! restore %o0
 6:
-	/*
-	 * If the TLB pagesize register is supported and pgsz_search_on is set
-	 * then we patch out the following branch instruction.
-	 */
-	.global sfmmu_pgsz_load_mmustate_patch
-sfmmu_pgsz_load_mmustate_patch:
-	ba,a	7f				! branch around pgsz search hcall
-	mov	%o0, %o3			! preserve sfmmup in %o3
-	ldx	[%o3 + SFMMU_PGSZ_ORDER + HV_PGSZ_ORDER_PA], %o0
-	mov	TLB_SO_ID, %o1			! flags apply to I and D
-	mov	MMU_SET_NONPRIV_SEARCH, %o5
-	ta	FAST_TRAP			! set page size search order
-	brnz,a,pn %o0, panic_bad_hcall
-	  mov	MMU_SET_NONPRIV_SEARCH, %o1
-	mov	%o3, %o0			! restore saved sfmmup to %o0
-7:	
-	mov	%o1, %o5			! preserve pgsz_search_on
 	ldx	[%o0 + SFMMU_ISMBLKPA], %o1	! copy members of sfmmu
 	CPU_TSBMISS_AREA(%o2, %o3)		! %o2 = tsbmiss area
 	stx	%o1, [%o2 + TSBMISS_ISMBLKPA]	! sfmmu_tsb_miss into the
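The removed comment above describes boot-time code patching: when the
hypervisor supports the TLB page-size register and pgsz_search_on is set,
the "ba,a 7f" at sfmmu_pgsz_load_mmustate_patch is overwritten so that the
MMU_SET_NONPRIV_SEARCH hcall sequence executes. A hedged sketch of how such
a patch is typically applied with the Solaris hot_patch_kernel_text()
primitive (this exact call site is an illustration, not the removed code):

#include <sys/types.h>

extern void hot_patch_kernel_text(caddr_t, uint32_t, uint_t);
extern uint32_t sfmmu_pgsz_load_mmustate_patch[];

#define	NOP_INSTR	0x01000000U	/* SPARC nop: sethi 0, %g0 */

static void
enable_pgsz_search_hcall(void)
{
	/* replace the branch-around with a nop so the hcall path runs */
	hot_patch_kernel_text((caddr_t)sfmmu_pgsz_load_mmustate_patch,
	    NOP_INSTR, 4);
}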
@@ -413,7 +395,7 @@
 	stub	%o3, [%o2 + TSBMISS_UTTEFLAGS]
 	stub	%o4,  [%o2 + TSBMISS_URTTEFLAGS]
 	stx	%o1, [%o2 +  TSBMISS_SHARED_UHATID]
-	brz,pn	%o1, 8f				! check for sfmmu_srdp
+	brz,pn	%o1, 7f				! check for sfmmu_srdp
 	  add	%o0, SFMMU_HMERMAP, %o1
 	add	%o2, TSBMISS_SHMERMAP, %o2
 	mov	SFMMU_HMERGNMAP_WORDS, %o3
@@ -423,38 +405,31 @@
 	ldx	[%o0 + SFMMU_SCDP], %o4		! %o4 = sfmmu_scd
 	CPU_TSBMISS_AREA(%o2, %o3)		! %o2 = tsbmiss area
 	mov	SFMMU_HMERGNMAP_WORDS, %o3
-	brnz,pt	%o4, 9f				! check for sfmmu_scdp else
-	  nop
-	add	%o2, TSBMISS_SCDSHMERMAP, %o2	! zero tsbmiss scd_shmermap
+	brnz,pt	%o4, 8f				! check for sfmmu_scdp else
+	  add	%o2, TSBMISS_SCDSHMERMAP, %o2	! zero tsbmiss scd_shmermap
 	ZERO_REGION_MAP(%o2, %o3, zero_scd_mmustate)
-8:
+7:
 	retl
 	nop
-9:						
-	brz,a	%o5, 0f				! test pgsz_search_on
-	  or	%g0, TLB_ALL_SHARED_PGSZ, %o1	! enable all page sizes
-	ldub	[%o0 + SFMMU_PGSZ_MAP], %o1
-0:
-	stub	%o1, [%o2 + TSBMISS_PGSZ_BITMAP] ! set tsbmiss pgsz bitmap
-	add	%o2, TSBMISS_SCDSHMERMAP, %o2	! set tsbmiss scd_shmermap
-	add	%o4, SCD_HMERMAP, %o1	
+8:						! set tsbmiss scd_shmermap
+	add	%o4, SCD_HMERMAP, %o1
 	SET_REGION_MAP(%o1, %o2, %o3, %o4, load_scd_mmustate)
-	
 	retl
 	  nop
-1:
+9:
 	sethi   %hi(panicstr), %g1		! panic if no 3rd TSB  
         ldx     [%g1 + %lo(panicstr)], %g1                             
         tst     %g1
 	                                                   
-        bnz,pn  %xcc, 8b                                            
+        bnz,pn  %xcc, 7b                                            
           nop                                                            
                                                                         
         sethi   %hi(sfmmu_panic10), %o0                                 
         call    panic                                                 
           or      %o0, %lo(sfmmu_panic10), %o0                         
+
 	SET_SIZE(sfmmu_load_mmustate)
-
+
 #endif /* lint */
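One idiom worth noting in the panic path above: the code loads panicstr and,
if a panic is already in progress, branches back to the plain return rather
than calling panic() again, which avoids a recursive panic while the system
is already going down. The same logic in C (a sketch only):

#include <sys/cmn_err.h>	/* panic() */

extern char *panicstr;		/* non-NULL once a panic is in progress */

static void
no_third_tsb(void)
{
	if (panicstr != NULL)
		return;		/* already panicking; just return */
	panic("sfmmu_load_mmustate: no third TSB");	/* sfmmu_panic10 */
}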
 
 #if defined(lint)
--- a/usr/src/uts/sun4v/vm/mach_vm_dep.c	Thu Aug 06 17:19:00 2009 -0700
+++ b/usr/src/uts/sun4v/vm/mach_vm_dep.c	Thu Aug 06 17:39:39 2009 -0700
@@ -52,7 +52,6 @@
 #include <sys/stack.h>
 #include <sys/atomic.h>
 #include <sys/promif.h>
-#include <sys/hsvc.h>
 
 uint_t page_colors = 0;
 uint_t page_colors_mask = 0;
@@ -150,7 +149,6 @@
 static	vmem_t		*contig_mem_arena;
 static	vmem_t		*contig_mem_reloc_arena;
 static	kmutex_t	contig_mem_lock;
-static	kmutex_t	contig_mem_sleep_lock;
 #define	CONTIG_MEM_ARENA_QUANTUM	64
 #define	CONTIG_MEM_SLAB_ARENA_QUANTUM	MMU_PAGESIZE64K
 
@@ -617,15 +615,14 @@
 }
 
 /*
- * contig_mem_alloc_align_flag allocates real contiguous memory with the
+ * contig_mem_alloc_align allocates real contiguous memory with the
  * specified alignment up to contig_mem_import_size_max. The alignment must
  * be a power of 2 and no greater than contig_mem_import_size_max. We assert
  * the alignment is a power of 2. For non-debug kernels, vmem_xalloc will
  * panic for non-power-of-2 alignments.
  */
-static	void *
-contig_mem_alloc_align_flag(size_t size, size_t align, int flag,
-    kmutex_t *lockp)
+void *
+contig_mem_alloc_align(size_t size, size_t align)
 {
 	void *buf;
 
@@ -644,48 +641,27 @@
 	 * allocations also prevents us from trying to allocate
 	 * more spans than necessary.
 	 */
-	mutex_enter(lockp);
+	mutex_enter(&contig_mem_lock);
 
 	buf = vmem_xalloc(contig_mem_arena, size, align, 0, 0,
-	    NULL, NULL, flag | VM_NORELOC);
+	    NULL, NULL, VM_NOSLEEP | VM_NORELOC);
 
 	if ((buf == NULL) && (size <= MMU_PAGESIZE)) {
-		mutex_exit(lockp);
+		mutex_exit(&contig_mem_lock);
 		return (vmem_xalloc(static_alloc_arena, size, align, 0, 0,
-		    NULL, NULL, flag));
+		    NULL, NULL, VM_NOSLEEP));
 	}
 
 	if (buf == NULL) {
 		buf = vmem_xalloc(contig_mem_reloc_arena, size, align, 0, 0,
-		    NULL, NULL, flag);
+		    NULL, NULL, VM_NOSLEEP);
 	}
 
-	mutex_exit(lockp);
+	mutex_exit(&contig_mem_lock);
 
 	return (buf);
 }
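The power-of-2 assertion mentioned in the comment above relies on the usual
single-bit test: a power of 2 has exactly one bit set, so x & (x - 1) is
zero. A standalone, user-level illustration:

#include <stdio.h>

#define	ISP2(x)	((((x) - 1) & (x)) == 0)	/* as in <sys/sysmacros.h> */

int
main(void)
{
	/* prints "1 1 0": 4K and 8K are powers of 2, 48 is not */
	printf("%d %d %d\n", ISP2(4096), ISP2(8192), ISP2(48));
	return (0);
}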
 
-void *
-contig_mem_alloc_align(size_t size, size_t align)
-{
-	return (contig_mem_alloc_align_flag
-	    (size, align, VM_NOSLEEP, &contig_mem_lock));
-}
-
-/*
- * This function is provided for callers that need physically contiguous
- * allocations but can sleep. We use the contig_mem_sleep_lock so that we
- * don't interfere with contig_mem_alloc_align calls that should never sleep.
- * As with contig_mem_alloc_align, we use a lock to prevent allocating
- * unnecessary spans when called in parallel.
- */
-void *
-contig_mem_alloc_align_sleep(size_t size, size_t align)
-{
-	return (contig_mem_alloc_align_flag
-	    (size, align, VM_SLEEP, &contig_mem_sleep_lock));
-}
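Why the deleted sleeping variant took a second mutex: a VM_SLEEP import can
block for a long time inside vmem_xalloc(), and if it slept while holding
contig_mem_lock, every VM_NOSLEEP caller would stall in mutex_enter()
anyway, defeating the no-sleep guarantee. A sketch of the lock-selection
idea (names hypothetical):

#include <sys/mutex.h>	/* kmutex_t */
#include <sys/vmem.h>	/* VM_SLEEP */

static kmutex_t	nosleep_import_lock;	/* VM_NOSLEEP callers only */
static kmutex_t	sleep_import_lock;	/* VM_SLEEP callers only */

static kmutex_t *
import_lock_for(int vmflag)
{
	/* sleepers and non-sleepers never contend on the same mutex */
	return ((vmflag & VM_SLEEP) ? &sleep_import_lock :
	    &nosleep_import_lock);
}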
-
 void
 contig_mem_free(void *vaddr, size_t size)
 {
@@ -709,7 +685,6 @@
 contig_mem_init(void)
 {
 	mutex_init(&contig_mem_lock, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&contig_mem_sleep_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	contig_mem_slab_arena = vmem_xcreate("contig_mem_slab_arena", NULL, 0,
 	    CONTIG_MEM_SLAB_ARENA_QUANTUM, contig_vmem_xalloc_aligned_wrapper,
@@ -811,96 +786,3 @@
 	uint_t spcolor = atomic_inc_32_nv(&sp_current_color);
 	return ((size_t)((spcolor & sp_color_mask) * SA(sp_color_stride)));
 }
-
-/*
- * This flag may be set via /etc/system to force the synchronization
- * of I-cache with memory after every bcopy.  The default is 0, meaning
- * that there is no need for an I-cache flush after each bcopy.  This
- * flag is relevant only on platforms that have non-coherent I-caches.
- */
-uint_t	force_sync_icache_after_bcopy = 0;
-
-/*
- * This flag may be set via /etc/system to force the synchronization
- * of I-cache with memory after every DMA. The default is 0, meaning
- * that there is no need for an I-cache flush after each DMA write to
- * memory. This flag is relevant only on platforms that have
- * non-coherent I-caches.
- */
-uint_t	force_sync_icache_after_dma = 0;
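Both flags are ordinary /etc/system tunables; enabling them would have been
done the standard way, e.g.:

	set force_sync_icache_after_bcopy = 1
	set force_sync_icache_after_dma = 1

followed by a reboot, since /etc/system is only read at boot.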
-
-/*
- * This internal flag enables mach_sync_icache_pa, which is always
- * called from common code if it is defined. However, not all
- * platforms support the hv_mem_iflush firmware call.
- */
-static uint_t	do_mach_sync_icache_pa = 0;
-
-int	hsvc_kdi_mem_iflush_negotiated = B_FALSE;
-
-#define	MEM_IFLUSH_MAJOR	1
-#define	MEM_IFLUSH_MINOR	0
-static hsvc_info_t kdi_mem_iflush_hsvc = {
-	HSVC_REV_1,		/* HSVC rev num */
-	NULL,			/* Private */
-	HSVC_GROUP_MEM_IFLUSH,	/* Requested API Group */
-	MEM_IFLUSH_MAJOR,	/* Requested Major */
-	MEM_IFLUSH_MINOR,	/* Requested Minor */
-	"kdi"			/* Module name */
-};
-
-/*
- * Set up soft execute mode.
- * Since /etc/system is read later during init, it
- * may be used to override these flags.
- */
-void
-mach_setup_icache(uint_t coherency)
-{
-	int		status;
-	uint64_t	sup_minor;
-
-	if (coherency == 0 && icache_is_coherent) {
-		extern void kdi_flush_caches(void);
-		status = hsvc_register(&kdi_mem_iflush_hsvc, &sup_minor);
-		if (status != 0)
-			cmn_err(CE_PANIC, "I$ flush not implemented on "
-			    "I$ incoherent system");
-		hsvc_kdi_mem_iflush_negotiated = B_TRUE;
-		kdi_flush_caches();
-		icache_is_coherent = 0;
-		do_mach_sync_icache_pa = 1;
-	}
-}
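The negotiation above is the standard sun4v versioned-API handshake via
hsvc_register(). A module that could also tear the service down would pair
it with hsvc_unregister(), roughly as follows (a sketch mirroring the
declarations above, not code from the source):

#include <sys/hsvc.h>

static void
mem_iflush_hsvc_fini(void)
{
	if (hsvc_kdi_mem_iflush_negotiated) {
		(void) hsvc_unregister(&kdi_mem_iflush_hsvc);
		hsvc_kdi_mem_iflush_negotiated = B_FALSE;
	}
}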
-
-/*
- * Flush specified physical address range from I$ via hv_mem_iflush interface
- */
-/*ARGSUSED*/
-void
-mach_sync_icache_pa(caddr_t paddr, size_t size)
-{
-	if (do_mach_sync_icache_pa) {
-		uint64_t pa = (uint64_t)paddr;
-		uint64_t sz = (uint64_t)size;
-		uint64_t i, flushed;
-
-		for (i = 0; i < sz; i += flushed) {
-			if (hv_mem_iflush(pa + i, sz - i, &flushed) != H_EOK) {
-				cmn_err(CE_PANIC, "Flushing the Icache failed");
-				break;
-			}
-		}
-	}
-}
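Note the loop's contract: hv_mem_iflush() may flush fewer bytes than
requested and reports the count in *flushed, so the loop resumes at pa + i.
Firmware must guarantee forward progress (flushed > 0 on H_EOK) or the loop
would spin. A hypothetical caller syncing one freshly written text page:

#include <sys/types.h>	/* caddr_t; PAGESIZE is from <sys/param.h> */

extern void mach_sync_icache_pa(caddr_t, size_t);

void
text_page_written(uint64_t pa)
{
	/* PAGESIZE is the 8K sun4v base page */
	mach_sync_icache_pa((caddr_t)pa, PAGESIZE);
}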
-
-/*
- * Flush the page if it has been marked as executed
- */
-/*ARGSUSED*/
-void
-mach_sync_icache_pp(page_t *pp)
-{
-	if (PP_ISEXEC(pp))
-		mach_sync_icache_pa((caddr_t)ptob(pp->p_pagenum), PAGESIZE);
-}
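ptob() above is "pages to bytes": the page frame number shifted left by the
base page shift, which is 13 on sun4v (8K pages). A standalone illustration
of the arithmetic:

#include <stdio.h>
#include <stdint.h>

#define	MMU_PAGESHIFT	13	/* sun4v base page = 8K */
#define	ptob(x)		((uint64_t)(x) << MMU_PAGESHIFT)

int
main(void)
{
	/* pfn 0x1234 maps to physical byte address 0x2468000 */
	printf("0x%llx\n", (unsigned long long)ptob(0x1234));
	return (0);
}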