Mercurial > illumos > illumos-gate
changeset 10271:7c80b70bb8de
6858457 Remove Solaris support for UltraSPARC-AT10 processor
line wrap: on
line diff
--- a/usr/src/cmd/mdb/common/kmdb/kctl/kctl_main.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/cmd/mdb/common/kmdb/kctl/kctl_main.c Thu Aug 06 17:39:39 2009 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -41,7 +41,6 @@ #include <sys/kdi_impl.h> #include <sys/ctf_api.h> #include <vm/seg_kmem.h> -#include <vm/hat.h> kctl_t kctl; @@ -153,9 +152,8 @@ if (hat_getpfnum(kas.a_hat, addr) != PFN_INVALID) return (EAGAIN); - /* Set HAT_ATTR_TEXT to override soft execute mode */ - if (segkmem_xalloc(NULL, addr, sz, VM_NOSLEEP, HAT_ATTR_TEXT, - segkmem_page_create, NULL) == NULL) + if (segkmem_xalloc(NULL, addr, sz, VM_NOSLEEP, 0, segkmem_page_create, + NULL) == NULL) return (ENOMEM); return (0);
--- a/usr/src/cmd/mdb/common/kmdb/kmdb_kvm.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/cmd/mdb/common/kmdb/kmdb_kvm.c Thu Aug 06 17:39:39 2009 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -43,7 +43,6 @@ #include <strings.h> #include <dlfcn.h> -#include <sys/kdi_impl.h> #include <sys/isa_defs.h> #include <sys/kobj.h> #include <sys/kobj_impl.h> @@ -218,7 +217,6 @@ kmt_writer(void *buf, size_t nbytes, uint64_t addr) { kmt_bcopy(buf, (void *)(uintptr_t)addr, nbytes); - mdb.m_kdi->kdi_flush_caches(); return (nbytes); }
--- a/usr/src/cmd/picl/plugins/inc/picldefs.h Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/cmd/picl/plugins/inc/picldefs.h Thu Aug 06 17:39:39 2009 -0700 @@ -129,8 +129,6 @@ #define PICL_CLASS_SENSOR "sensor" #define PICL_CLASS_STACK "stack" #define PICL_CLASS_UNKNOWN "unknown" -#define PICL_CLASS_HUMIDITY_SENSOR "humidity-sensor" -#define PICL_CLASS_HUMIDITY_INDICATOR "humidity-indicator" /* * Solaris driver property names @@ -243,7 +241,6 @@ #define PICL_PROP_BASE_UNITS "BaseUnits" #define PICL_PROP_EXPONENT "Exponent" #define PICL_PROP_RATE_UNITS "RateUnits" -#define PICL_PROP_HUMIDITY "Humidity" /* * Various threshold property names
--- a/usr/src/cmd/picl/plugins/sun4v/snmp/snmpplugin.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/cmd/picl/plugins/sun4v/snmp/snmpplugin.c Thu Aug 06 17:39:39 2009 -0700 @@ -865,10 +865,6 @@ ADD_NODE(PICL_CLASS_RPM_SENSOR) add_prop(nodeh, &proph, node_name, row, PP_SPEED, snmp_syserr_p); - } else if (sensor_type == SSST_HUMIDITY) { - ADD_NODE(PICL_CLASS_HUMIDITY_SENSOR) - add_prop(nodeh, &proph, node_name, row, - PP_HUMIDITY, snmp_syserr_p); } else { ADD_NODE(PICL_CLASS_SENSOR) add_prop(nodeh, &proph, node_name, row, @@ -906,8 +902,6 @@ ADD_NODE(PICL_CLASS_RPM_INDICATOR) } else if (sensor_type == SSST_PRESENCE) { ADD_NODE(PICL_CLASS_PRESENCE_INDICATOR) - } else if (sensor_type == SSST_HUMIDITY) { - ADD_NODE(PICL_CLASS_HUMIDITY_INDICATOR) } else { ADD_NODE(PICL_CLASS_INDICATOR) }
--- a/usr/src/cmd/picl/plugins/sun4v/snmp/snmpplugin.h Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/cmd/picl/plugins/sun4v/snmp/snmpplugin.h Thu Aug 06 17:39:39 2009 -0700 @@ -111,8 +111,7 @@ PP_MFG_NAME, PP_MODEL_NAME, PP_DESCRIPTION, - PP_LABEL, - PP_HUMIDITY + PP_LABEL } sp_propid_t; /*
--- a/usr/src/common/atomic/sparcv9/atomic.s Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/common/atomic/sparcv9/atomic.s Thu Aug 06 17:39:39 2009 -0700 @@ -82,13 +82,6 @@ mov tmp2, %o7 /* restore callee's return address */ ; \ label/**/1: -#ifdef ATOMIC_SIMPLE_BO_ENABLE -/* - * For some processors, simple limit has proved benefical - */ -#define ATOMIC_BACKOFF_CPU(val, limit, ncpu, cas_cnt, label) \ - set 1 << ATOMIC_BO_ENABLE_SHIFT, limit -#else /* * For the kernel, we take into consideration of cas failures * and also scale the backoff limit w.r.t. the number of cpus. @@ -111,7 +104,6 @@ mov %g0, cas_cnt ; \ mov 1, val ; \ label/**/1: -#endif /* ATOMIC_SIMPLE_BO_ENABLE */ #endif /* ATOMIC_BO_ENABLE_SHIFT */ #else /* _KERNEL */ @@ -137,18 +129,11 @@ * The cas_cnt counts the cas instruction failure and is * initialized to 0. */ -#ifdef ATOMIC_SIMPLE_BO_ENABLE -#define ATOMIC_BACKOFF_INIT(val, ncpu, cas_cnt) \ - mov 1, val - -#else /* If not defined ATOMIC_SIMPLE_BO_ENABLE */ #define ATOMIC_BACKOFF_INIT(val, ncpu, cas_cnt) \ mov 1, val ; \ mov %g0, ncpu ; \ mov %g0, cas_cnt -#endif /* ATOMIC_SIMPLE_BO_ENABLE */ - #define ATOMIC_BACKOFF_BRANCH(cr, backoff, loop) \ bne,a,pn cr, backoff
--- a/usr/src/common/elfcap/elfcap.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/common/elfcap/elfcap.c Thu Aug 06 17:39:39 2009 -0700 @@ -150,10 +150,7 @@ AV_SPARC_FMAF, STRDESC("AV_SPARC_FMAF"), STRDESC("FMAF"), STRDESC("fmaf"), }, - { /* 0x00000200 */ - AV_SPARC_FMAU, STRDESC("AV_SPARC_FMAU"), - STRDESC("FMAU"), STRDESC("fmau"), - }, + RESERVED_ELFCAP_DESC, /* 0x00000200 */ { /* 0x00000400 */ AV_SPARC_VIS3, STRDESC("AV_SPARC_VIS3"), STRDESC("VIS3"), STRDESC("vis3"),
--- a/usr/src/lib/libc/sparc_hwcap1/common/gen/memcpy.s Thu Aug 06 17:19:00 2009 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1704 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - - .file "memcpy.s" - -/* - * memcpy(s1, s2, len) - * - * Copy s2 to s1, always copy n bytes. - * Note: this C code does not work for overlapped copies. - * Memmove() and bcopy() do. - * - * Added entry __align_cpy_1 is generally for use of the compilers. - * - * Fast assembler language version of the following C-program for memcpy - * which represents the `standard' for the C-library. 
- * - * void * - * memcpy(void *s, const void *s0, size_t n) - * { - * if (n != 0) { - * char *s1 = s; - * const char *s2 = s0; - * do { - * *s1++ = *s2++; - * } while (--n != 0); - * } - * return (s); - * } - */ - -#include <sys/asm_linkage.h> -#include <sys/sun4asi.h> -#include <sys/trap.h> - -#ifdef __sparcv9 -#define SAVESIZE (8 * 1) -#define STACK_OFFSET (STACK_BIAS + MINFRAME) -#else -#define SAVESIZE (8 * 3) -#define STACK_OFFSET (STACK_BIAS + MINFRAME + 4) -#endif - -#define scratch_offset 0 -#define g4_offset 8 -#define g5_offset 16 - -#define ICACHE_LINE_SIZE 64 -#define BLOCK_SIZE 64 -#define FPRS_FEF 0x4 -#define PF_FAR 2048 -#define PF_NEAR 1024 - -#define SHORTCOPY 3 -#define SMALL_MAX 39 -#define MEDIUM_MAX 255 -#define MED_WMAX 256 /* max copy for medium word-aligned case */ -#define MED_MAX 256 /* max copy for medium longword-aligned case */ - -#ifndef BSTORE_SIZE -#define BSTORE_SIZE 256 /* min copy size for block store */ -#endif - -/* - * The LDDs will use the below ASI for performance - * This ASI minimizes cache pollution. - */ -#define ASI_CACHE_SPARING 0xf4 -#define ASI_CACHE_SPARING_PRIMARY 0xf4 - - ANSI_PRAGMA_WEAK(memmove,function) - ANSI_PRAGMA_WEAK(memcpy,function) - - ENTRY(memmove) - cmp %o1, %o0 ! if from address is >= to use forward copy - bgeu %ncc, .forcpy ! else use backward if ... - sub %o0, %o1, %o4 ! get difference of two addresses - cmp %o2, %o4 ! compare size and difference of addresses - bleu %ncc, .forcpy ! if size is bigger, do overlapped copy - nop - - ! - ! an overlapped copy that must be done "backwards" - ! -.ovbc: - mov %o0, %g1 ! save dest address for return val - add %o1, %o2, %o1 ! get to end of source space - add %o0, %o2, %o0 ! get to end of destination space - - cmp %o2, 24 - bgeu,pn %ncc, .dbalign - nop - cmp %o2, 4 - blt,pn %ncc, .byte - sub %o2, 3, %o2 -.byte4loop: - ldub [%o1-1], %o3 ! load last byte - stb %o3, [%o0-1] ! store last byte - sub %o1, 4, %o1 - ldub [%o1+2], %o3 ! 
load 2nd from last byte - stb %o3, [%o0-2] ! store 2nd from last byte - sub %o0, 4, %o0 - ldub [%o1+1], %o3 ! load 3rd from last byte - stb %o3, [%o0+1] ! store 3rd from last byte - subcc %o2, 4, %o2 - ldub [%o1], %o3 ! load 4th from last byte - bgu,pt %ncc, .byte4loop - stb %o3, [%o0] ! store 4th from last byte -.byte: - addcc %o2, 3, %o2 - bz,pt %ncc, .exit -.byteloop: - dec %o1 ! decrement src address - ldub [%o1], %o3 ! read a byte - dec %o0 ! decrement dst address - deccc %o2 ! decrement count - bgu,pt %ncc, .byteloop ! loop until done - stb %o3, [%o0] ! write byte -.exit: - retl - mov %g1, %o0 - - .align 16 -.dbalign: - andcc %o0, 7, %o5 ! bytes till DST 8 byte aligned - bz,pt %ncc, .dbmed - sub %o2, %o5, %o2 ! update count -.dbalign1: - dec %o1 ! decrement src address - ldub [%o1], %o3 ! read a byte - dec %o0 ! decrement dst address - deccc %o5 ! decrement count - bgu,pt %ncc, .dbalign1 ! loop until done - stb %o3, [%o0] ! store a byte - -! check for src long word alignment -.dbmed: - mov %asi, %g5 ! save curr %asi - wr %g0, ASI_CACHE_SPARING, %asi - andcc %o1, 7, %g0 ! chk src long word alignment - bnz,pn %ncc, .dbbck - nop -! -! Following code is for overlapping copies where src and dest -! are long word aligned -! - cmp %o2, 4095 - blt,pn %ncc, .dbmedl32enter ! go to no prefetch code - nop - prefetch [%o1 - (1 * BLOCK_SIZE)], #n_reads - sub %o2, 63, %o2 ! adjust length to allow cc test - ! for end of loop - prefetch [%o1 - (2 * BLOCK_SIZE)], #n_reads - prefetch [%o1 - (3 * BLOCK_SIZE)], #n_reads - prefetch [%o1 - (4 * BLOCK_SIZE)], #n_reads -.dbmedl64: - prefetch [%o1 - (5 * BLOCK_SIZE)], #n_reads - ldxa [%o1-8]%asi, %o3 ! load - subcc %o2, 64, %o2 ! decrement length count - stx %o3, [%o0-8] ! and store - ldxa [%o1-16]%asi, %o3 ! a block of 64 bytes - sub %o1, 64, %o1 ! decrease src ptr by 64 - stx %o3, [%o0-16] - sub %o0, 64, %o0 ! 
decrease dst ptr by 64 - ldxa [%o1+40]%asi, %o3 - ldxa [%o1+32]%asi, %o4 - ldxa [%o1+24]%asi, %o5 - stx %o3, [%o0+40] - stx %o4, [%o0+32] - stx %o5, [%o0+24] - ldxa [%o1+16]%asi, %o3 - ldxa [%o1+8]%asi, %o4 - stx %o3, [%o0+16] - stx %o4, [%o0+8] - ldxa [%o1]%asi, %o5 - bgu,pt %ncc, .dbmedl64 ! repeat if at least 64 bytes left - stx %o5, [%o0] - add %o2, 63, %o2 ! restore offset adjustment -.dbmedl32enter: - subcc %o2, 31, %o2 ! adjust length to allow cc test - ! for end of loop - ble,pt %ncc, .dbmedl31 ! skip big loop if less than 32 - nop -.dbmedl32: - ldx [%o1-8], %o4 ! load - subcc %o2, 32, %o2 ! decrement length count - stx %o4, [%o0-8] ! and store - ldx [%o1-16], %o3 ! a block of 32 bytes - sub %o1, 32, %o1 ! decrease src ptr by 32 - stx %o3, [%o0-16] - ldx [%o1+8], %o4 - sub %o0, 32, %o0 ! decrease dst ptr by 32 - stx %o4, [%o0+8] - ldx [%o1], %o3 - bgu,pt %ncc, .dbmedl32 ! repeat if at least 32 bytes left - stx %o3, [%o0] -.dbmedl31: - addcc %o2, 16, %o2 ! adjust remaining count - ble,pt %ncc, .dbmedl15 ! skip if 15 or fewer bytes left - nop ! - ldx [%o1-8], %o4 ! load and store 16 bytes - sub %o1, 16, %o1 ! decrease src ptr by 16 - stx %o4, [%o0-8] ! - sub %o2, 16, %o2 ! decrease count by 16 - ldx [%o1], %o3 ! - sub %o0, 16, %o0 ! decrease dst ptr by 16 - stx %o3, [%o0] -.dbmedl15: - addcc %o2, 15, %o2 ! restore count - bz,pt %ncc, .dbexit ! exit if finished - nop - cmp %o2, 8 - blt,pt %ncc, .dbremain ! skip if 7 or fewer bytes left - nop - ldx [%o1-8], %o4 ! load 8 bytes - sub %o1, 8, %o1 ! decrease src ptr by 8 - stx %o4, [%o0-8] ! and store 8 bytes - subcc %o2, 8, %o2 ! decrease count by 8 - bnz %ncc, .dbremain ! exit if finished - sub %o0, 8, %o0 ! decrease dst ptr by 8 - mov %g5, %asi ! restore %asi - retl - mov %g1, %o0 - -! -! Following code is for overlapping copies where src and dest -! are not long word aligned -! - .align 16 -.dbbck: - rd %fprs, %o3 ! o3 = fprs - - ! if fprs.fef == 0, set it. Checking it, requires 2 instructions. - ! 
So set it anyway, without checking. - wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1 - - alignaddr %o1, %g0, %o5 ! align src - ldda [%o5]%asi, %d0 ! get first 8 byte block - andn %o2, 7, %o4 ! prepare src ptr for finishup code - cmp %o2, 32 - blt,pn %ncc, .dbmv8 - sub %o1, %o4, %o1 ! - cmp %o2, 4095 ! check for short memmoves - blt,pn %ncc, .dbmv32enter ! go to no prefetch code -.dbmv64: - ldda [%o5-8]%asi, %d2 ! load 8 bytes - ldda [%o5-16]%asi, %d4 ! load 8 bytes - sub %o5, 64, %o5 ! - ldda [%o5+40]%asi, %d6 ! load 8 bytes - sub %o0, 64, %o0 ! - ldda [%o5+32]%asi, %d8 ! load 8 bytes - sub %o2, 64, %o2 ! 64 less bytes to copy - ldda [%o5+24]%asi, %d18 ! load 8 bytes - cmp %o2, 64 ! do we have < 64 bytes remaining - ldda [%o5+16]%asi, %d28 ! load 8 bytes - ldda [%o5+8]%asi, %d30 ! load 8 bytes - prefetch [%o5 - (5 * BLOCK_SIZE)], #n_reads - faligndata %d2, %d0, %d10 ! extract 8 bytes out - ldda [%o5]%asi, %d0 ! load 8 bytes - std %d10, [%o0+56] ! store the current 8 bytes - faligndata %d4, %d2, %d12 ! extract 8 bytes out - std %d12, [%o0+48] ! store the current 8 bytes - faligndata %d6, %d4, %d14 ! extract 8 bytes out - std %d14, [%o0+40] ! store the current 8 bytes - faligndata %d8, %d6, %d16 ! extract 8 bytes out - std %d16, [%o0+32] ! store the current 8 bytes - faligndata %d18, %d8, %d20 ! extract 8 bytes out - std %d20, [%o0+24] ! store the current 8 bytes - faligndata %d28, %d18, %d22 ! extract 8 bytes out - std %d22, [%o0+16] ! store the current 8 bytes - faligndata %d30, %d28, %d24 ! extract 8 bytes out - std %d24, [%o0+8] ! store the current 8 bytes - faligndata %d0, %d30, %d26 ! extract 8 bytes out - bgeu,pt %ncc, .dbmv64 - std %d26, [%o0] ! store the current 8 bytes - - cmp %o2, 32 - blt,pn %ncc, .dbmvx - nop -.dbmv32: - ldda [%o5-8]%asi, %d2 ! load 8 bytes -.dbmv32enter: - ldda [%o5-16]%asi, %d4 ! load 8 bytes - sub %o5, 32, %o5 ! - ldda [%o5+8]%asi, %d6 ! load 8 bytes - sub %o0, 32, %o0 ! - faligndata %d2, %d0, %d10 ! 
extract 8 bytes out - ldda [%o5]%asi, %d0 ! load 8 bytes - sub %o2,32, %o2 ! 32 less bytes to copy - std %d10, [%o0+24] ! store the current 8 bytes - cmp %o2, 32 ! do we have < 32 bytes remaining - faligndata %d4, %d2, %d12 ! extract 8 bytes out - std %d12, [%o0+16] ! store the current 8 bytes - faligndata %d6, %d4, %d14 ! extract 8 bytes out - std %d14, [%o0+8] ! store the current 8 bytes - faligndata %d0, %d6, %d16 ! extract 8 bytes out - bgeu,pt %ncc, .dbmv32 - std %d16, [%o0] ! store the current 8 bytes -.dbmvx: - cmp %o2, 8 ! do we have < 8 bytes remaining - blt,pt %ncc, .dbmvfinish ! if yes, skip to finish up code - nop -.dbmv8: - ldda [%o5-8]%asi, %d2 - sub %o0, 8, %o0 ! since we are at the end - ! when we first enter the loop - sub %o2, 8, %o2 ! 8 less bytes to copy - sub %o5, 8, %o5 - cmp %o2, 8 ! do we have < 8 bytes remaining - faligndata %d2, %d0, %d8 ! extract 8 bytes out - std %d8, [%o0] ! store the current 8 bytes - bgeu,pt %ncc, .dbmv8 - fmovd %d2, %d0 -.dbmvfinish: - and %o3, 0x4, %o3 ! fprs.du = fprs.dl = 0 - tst %o2 - bz,pt %ncc, .dbexit - wr %o3, %g0, %fprs ! fprs = o3 restore fprs - -.dbremain: - cmp %o2, 4 - blt,pn %ncc, .dbbyte - nop - ldub [%o1-1], %o3 ! load last byte - stb %o3, [%o0-1] ! store last byte - sub %o1, 4, %o1 - ldub [%o1+2], %o3 ! load 2nd from last byte - stb %o3, [%o0-2] ! store 2nd from last byte - sub %o0, 4, %o0 - ldub [%o1+1], %o3 ! load 3rd from last byte - stb %o3, [%o0+1] ! store 3rd from last byte - subcc %o2, 4, %o2 - ldub [%o1], %o3 ! load 4th from last byte - stb %o3, [%o0] ! store 4th from last byte - bz,pt %ncc, .dbexit -.dbbyte: - dec %o1 ! decrement src address - ldub [%o1], %o3 ! read a byte - dec %o0 ! decrement dst address - deccc %o2 ! decrement count - bgu,pt %ncc, .dbbyte ! loop until done - stb %o3, [%o0] ! write byte -.dbexit: - mov %g5, %asi ! restore %asi - retl - mov %g1, %o0 - SET_SIZE(memmove) - - .align ICACHE_LINE_SIZE - ENTRY(memcpy) - ENTRY(__align_cpy_1) - ! 
adjust instruction alignment - nop ! Do not remove, these nops affect - nop ! icache alignment and performance -.forcpy: - cmp %o2, SMALL_MAX ! check for not small case - bgu,pn %ncc, .medium ! go to larger cases - mov %o0, %g1 ! save %o0 - cmp %o2, SHORTCOPY ! check for really short case - ble,pt %ncc, .smallleft ! - or %o0, %o1, %o3 ! prepare alignment check - andcc %o3, 0x3, %g0 ! test for alignment - bz,pt %ncc, .smallword ! branch to word aligned case - sub %o2, 3, %o2 ! adjust count to allow cc zero test -.smallnotalign4: - ldub [%o1], %o3 ! read byte - subcc %o2, 4, %o2 ! reduce count by 4 - stb %o3, [%o0] ! write byte - ldub [%o1+1], %o3 ! repeat for a total of 4 bytes - add %o1, 4, %o1 ! advance SRC by 4 - stb %o3, [%o0+1] - ldub [%o1-2], %o3 - add %o0, 4, %o0 ! advance DST by 4 - stb %o3, [%o0-2] - ldub [%o1-1], %o3 - bgu,pt %ncc, .smallnotalign4 ! loop til 3 or fewer bytes remain - stb %o3, [%o0-1] - add %o2, 3, %o2 ! restore count -.smallleft: - tst %o2 - bz,pt %ncc, .smallexit - nop -.smallleft3: ! 1, 2, or 3 bytes remain - ldub [%o1], %o3 ! load one byte - deccc %o2 ! reduce count for cc test - bz,pt %ncc, .smallexit - stb %o3, [%o0] ! store one byte - ldub [%o1+1], %o3 ! load second byte - deccc %o2 - bz,pt %ncc, .smallexit - stb %o3, [%o0+1] ! store second byte - ldub [%o1+2], %o3 ! load third byte - stb %o3, [%o0+2] ! store third byte - retl - mov %g1, %o0 ! restore %o0 - - .align 16 - nop ! affects loop icache alignment -.smallwords: - lduw [%o1], %o3 ! read word -.smallwordx: - subcc %o2, 8, %o2 ! update count - stw %o3, [%o0] ! write word - add %o1, 8, %o1 ! update SRC - lduw [%o1-4], %o3 ! read word - add %o0, 8, %o0 ! update DST - bgu,pt %ncc, .smallwords ! loop until done - stw %o3, [%o0-4] ! write word - addcc %o2, 7, %o2 ! restore count - bz,pt %ncc, .smallexit ! check for completion - nop - cmp %o2, 4 ! check for 4 or more bytes left - blt .smallleft3 ! 
if not, go to finish up - nop - lduw [%o1], %o3 - add %o1, 4, %o1 - subcc %o2, 4, %o2 - stw %o3, [%o0] - add %o0, 4, %o0 - bnz,pt %ncc, .smallleft3 - nop - retl - mov %g1, %o0 ! restore %o0 - -.smallword: - subcc %o2, 4, %o2 ! update count - bgu,pt %ncc, .smallwordx - lduw [%o1], %o3 ! read word - addcc %o2, 3, %o2 ! restore count - bz,pt %ncc, .smallexit - stw %o3, [%o0] ! write word - deccc %o2 ! reduce count for cc test - ldub [%o1+4], %o3 ! load one byte - bz,pt %ncc, .smallexit - stb %o3, [%o0+4] ! store one byte - ldub [%o1+5], %o3 ! load second byte - deccc %o2 - bz,pt %ncc, .smallexit - stb %o3, [%o0+5] ! store second byte - ldub [%o1+6], %o3 ! load third byte - stb %o3, [%o0+6] ! store third byte -.smallexit: - retl - mov %g1, %o0 ! restore %o0 - .align 16 -.medium: - neg %o0, %o5 - neg %o1, %o3 - andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned - and %o3, 7, %o3 ! bytes till SRC 8 byte aligned - cmp %o5, %o3 - bne %ncc, continue - sub %o5, %o3, %o3 ! -(bytes till SRC aligned after DST aligned) - ! o3={-7, -6, ... 7} o3>0 => SRC overaligned - ! src and dst are aligned. - mov %o3, %g5 ! save %o3 - andcc %o1, 7, %o3 ! is src buf aligned on a 8 byte bound - brz,pt %o3, src_dst_aligned_on_8 - mov %o3, %o5 - mov 8, %o4 - sub %o4, %o3, %o3 - cmp %o3, %o2 - bg,a,pn %ncc, 1f - mov %o2, %o3 -1: - ! %o3 has the bytes to be written in partial store. - sub %o2, %o3, %o2 - prefetch [%o1],2 - -7: - deccc %o3 ! byte clearing loop - ldub [%o1], %o4 ! load one byte - stb %o4, [%o0] - inc %o1 ! increment src - bgu,pt %ncc, 7b - inc %o0 ! increment dst - - mov %g5, %o3 ! restore %o3 -src_dst_aligned_on_8: - ! check if we are copying 1k or more bytes - cmp %o2, 511 - bgu,pt %ncc, copying_ge_512 - nop - ba .medlword - nop - -continue: - andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned - bz %ncc, 2f - nop - - sub %o2, %o5, %o2 ! update count - -1: - ldub [%o1], %o4 - deccc %o5 - inc %o1 - stb %o4, [%o0] - bgu,pt %ncc, 1b - inc %o0 - - ! Now DST is 8-byte aligned. 
o0, o1, o2 are current. - -2: - andcc %o1, 0x3, %g0 ! test alignment - bnz,pt %ncc, .mediumsetup ! branch to skip aligned cases - ! if src, dst not aligned - prefetch [%o1 + (1 * BLOCK_SIZE)], #n_reads - -/* - * Handle all cases where src and dest are aligned on word - * or long word boundaries. Use unrolled loops for better - * performance. This option wins over standard large data - * move when source and destination is in cache for medium - * to short data moves. - */ - andcc %o1, 0x7, %g0 ! test word alignment - bz,pt %ncc, src_dst_lword_aligned ! branch to long word aligned case - prefetch [%o1 + (2 * BLOCK_SIZE)], #n_reads - cmp %o2, MED_WMAX ! limit to store buffer size - bgu,pt %ncc, .mediumrejoin ! otherwise rejoin main loop - nop - subcc %o2, 15, %o2 ! adjust length to allow cc test - ! for end of loop - ble,pt %ncc, .medw15 ! skip big loop if less than 16 - prefetch [%o1 + (3 * BLOCK_SIZE)], #n_reads -/* - * no need to put prefetch in loop as prefetches have - * already been issued for maximum loop size - */ -.medw16: - ld [%o1], %o4 ! load - subcc %o2, 16, %o2 ! decrement length count - stw %o4, [%o0] ! and store - ld [%o1+4], %o3 ! a block of 16 bytes - add %o1, 16, %o1 ! increase src ptr by 16 - stw %o3, [%o0+4] - ld [%o1-8], %o4 - add %o0, 16, %o0 ! increase dst ptr by 16 - stw %o4, [%o0-8] - ld [%o1-4], %o3 - bgu,pt %ncc, .medw16 ! repeat if at least 16 bytes left - stw %o3, [%o0-4] -.medw15: - addcc %o2, 15, %o2 ! restore count - bz,pt %ncc, .medwexit ! exit if finished - nop - cmp %o2, 8 - blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left - nop ! - ld [%o1], %o4 ! load 4 bytes - subcc %o2, 8, %o2 ! decrease count by 8 - stw %o4, [%o0] ! and store 4 bytes - add %o1, 8, %o1 ! increase src ptr by 8 - ld [%o1-4], %o3 ! load 4 bytes - add %o0, 8, %o0 ! increase dst ptr by 8 - stw %o3, [%o0-4] ! and store 4 bytes - bz %ncc, .medwexit ! exit if finished - nop -.medw7: ! count is ge 1, less than 8 - cmp %o2, 3 ! 
check for 4 bytes left - ble,pt %ncc, .medw3 ! skip if 3 or fewer bytes left - nop ! - ld [%o1], %o4 ! load 4 bytes - sub %o2, 4, %o2 ! decrease count by 4 - add %o1, 4, %o1 ! increase src ptr by 4 - stw %o4, [%o0] ! and store 4 bytes - add %o0, 4, %o0 ! increase dst ptr by 4 - tst %o2 ! check for zero bytes left - bz %ncc, .medwexit ! exit if finished - nop -.medw3: ! count is known to be 1, 2, or 3 - deccc %o2 ! reduce count by one - ldub [%o1], %o3 ! load one byte - bz,pt %ncc, .medwexit ! exit if last byte - stb %o3, [%o0] ! store one byte - ldub [%o1+1], %o3 ! load second byte - deccc %o2 ! reduce count by one - bz,pt %ncc, .medwexit ! exit if last byte - stb %o3, [%o0+1] ! store second byte - ldub [%o1+2], %o3 ! load third byte - stb %o3, [%o0+2] ! store third byte -.medwexit: - retl - mov %g1, %o0 ! restore %o0 - -/* - * Special case for handling when src and dest are both long word aligned - * and total data to move is between SMALL_MAX and MED_MAX bytes - */ - - .align 16 - nop -src_dst_lword_aligned: -.medlword: ! long word aligned - cmp %o2, MED_MAX ! limit to store buffer size - bgu,pt %ncc, .mediumrejoin ! otherwise rejoin main loop - nop - subcc %o2, 31, %o2 ! adjust length to allow cc test - ! for end of loop - ble,pt %ncc, .medl31 ! skip big loop if less than 32 - prefetch [%o1 + (3 * BLOCK_SIZE)], #n_reads ! into the l2 cache -/* - * no need to put prefetch in loop as prefetches have - * already been issued for maximum loop size - */ -.medl32: - ldx [%o1], %o4 ! load - subcc %o2, 32, %o2 ! decrement length count - stx %o4, [%o0] ! and store - ldx [%o1+8], %o3 ! a block of 32 bytes - add %o1, 32, %o1 ! increase src ptr by 32 - stx %o3, [%o0+8] - ldx [%o1-16], %o4 - add %o0, 32, %o0 ! increase dst ptr by 32 - stx %o4, [%o0-16] - ldx [%o1-8], %o3 - bgu,pt %ncc, .medl32 ! repeat if at least 32 bytes left - stx %o3, [%o0-8] -.medl31: - addcc %o2, 16, %o2 ! adjust remaining count - ble,pt %ncc, .medl15 ! skip if 15 or fewer bytes left - nop ! 
- ldx [%o1], %o4 ! load and store 16 bytes - add %o1, 16, %o1 ! increase src ptr by 16 - stx %o4, [%o0] ! - sub %o2, 16, %o2 ! decrease count by 16 - ldx [%o1-8], %o3 ! - add %o0, 16, %o0 ! increase dst ptr by 16 - stx %o3, [%o0-8] -.medl15: - addcc %o2, 15, %o2 ! restore count - bz,pt %ncc, .medwexit ! exit if finished - nop - cmp %o2, 8 - blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left - nop - ldx [%o1], %o4 ! load 8 bytes - add %o1, 8, %o1 ! increase src ptr by 8 - stx %o4, [%o0] ! and store 8 bytes - subcc %o2, 8, %o2 ! decrease count by 8 - bz %ncc, .medwexit ! exit if finished - add %o0, 8, %o0 ! increase dst ptr by 8 - ba .medw7 - nop - - .align 16 - nop - nop - nop -unaligned_src_dst: - -.mediumsetup: - prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read -.mediumrejoin: - rd %fprs, %o4 ! check for unused fp - - add %o1, 8, %o1 ! prepare to round SRC upward - - sethi %hi(0x1234567f), %o5 ! For GSR.MASK - or %o5, 0x67f, %o5 - andcc %o4, FPRS_FEF, %o4 ! test FEF, fprs.du = fprs.dl = 0 - bz,a %ncc, 3f - wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1 -3: - cmp %o2, MEDIUM_MAX - bmask %o5, %g0, %g0 - - ! Compute o5 (number of bytes that need copying using the main loop). - ! First, compute for the medium case. - ! Then, if large case, o5 is replaced by count for block alignment. - ! Be careful not to read past end of SRC - ! Currently, o2 is the actual count remaining - ! o3 is how much sooner we'll cross the alignment boundary - ! in SRC compared to in DST - ! - ! Examples: Let # denote bytes that should not be accessed - ! Let x denote a byte already copied to align DST - ! Let . and - denote bytes not yet copied - ! Let | denote double alignment boundaries - ! - ! DST: ######xx|........|--------|..###### o2 = 18 - ! o0 - ! - ! o3 = -3: SRC: ###xx...|.....---|-----..#|######## o5 = 8 - ! o1 - ! - ! o3 = 0: SRC: ######xx|........|--------|..###### o5 = 16-8 = 8 - ! o1 - ! - ! o3 = +1: SRC: #######x|x.......|.-------|-..##### o5 = 16-8 = 8 - ! o1 - - mov %asi, %g5 ! 
save curr %asi - wr %g0, ASI_CACHE_SPARING, %asi - - or %g0, -8, %o5 - alignaddr %o1, %g0, %o1 ! set GSR.ALIGN and align o1 - - movrlz %o3, %g0, %o5 ! subtract 8 from o2+o3 only if o3>=0 - add %o5, %o2, %o5 - add %o5, %o3, %o5 - - bleu %ncc, 4f - andn %o5, 7, %o5 ! 8 byte aligned count - neg %o0, %o5 ! 'large' case - and %o5, BLOCK_SIZE-1, %o5 ! bytes till DST block aligned -4: - brgez,a %o3, .beginmedloop - ldda [%o1-8]%asi, %d0 - - add %o1, %o3, %o1 ! back up o1 -5: - ldda [%o1]ASI_FL8_P, %d2 - inc %o1 - andcc %o1, 7, %g0 - bnz %ncc, 5b - bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2 - -.beginmedloop: - tst %o5 - bz %ncc, .endmedloop - sub %o2, %o5, %o2 ! update count for later - - ! Main loop to write out doubles. Note: o5 & 7 == 0 - - ldd [%o1], %d2 - subcc %o5, 8, %o5 ! update local count - bz,pn %ncc, 1f - add %o1, 8, %o1 ! update SRC - -.medloop: - faligndata %d0, %d2, %d4 - ldda [%o1]%asi, %d0 - subcc %o5, 8, %o5 ! update local count - add %o1, 16, %o1 ! update SRC - std %d4, [%o0] - bz,pn %ncc, 2f - faligndata %d2, %d0, %d6 - ldda [%o1 - 8]%asi, %d2 - subcc %o5, 8, %o5 ! update local count - std %d6, [%o0 + 8] - bnz,pt %ncc, .medloop - add %o0, 16, %o0 ! update DST - -1: - faligndata %d0, %d2, %d4 - fmovd %d2, %d0 - std %d4, [%o0] - ba .endmedloop - add %o0, 8, %o0 - -2: - std %d6, [%o0 + 8] - sub %o1, 8, %o1 - add %o0, 16, %o0 - - -.endmedloop: - ! Currently, o1 is pointing to the next double-aligned byte in SRC - ! The 8 bytes starting at [o1-8] are available in d0 - ! At least one, and possibly all, of these need to be written. - - cmp %o2, BLOCK_SIZE - bgu %ncc, .large ! otherwise, less than 16 bytes left - -#if 1 - - /* This code will use partial stores. */ - - mov %g0, %o5 - and %o3, 7, %o3 ! Number of bytes needed to completely - ! fill %d0 with good (unwritten) data. - - subcc %o2, 8, %o2 ! update count (maybe too much) - movl %ncc, %o2, %o5 - addcc %o3, %o5, %o5 ! extra bytes we can stuff into %d0 - sub %o3, %o5, %o3 ! 
update o3 (# bad bytes in %d0) - - bz %ncc, 2f - alignaddr %o3, %g0, %g0 ! set GSR.ALIGN - -1: - deccc %o5 - ldda [%o1]ASI_FL8_P, %d2 - inc %o1 - bgu %ncc, 1b - bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2 - -2: - not %o3 - faligndata %d0, %d0, %d0 ! shift bytes to the left - and %o3, 7, %o3 ! last byte to be stored in [%o0+%o3] - edge8n %g0, %o3, %o5 - stda %d0, [%o0]%o5, ASI_PST8_P - brlez %o2, .exit_memcpy - add %o0, %o3, %o0 ! update DST to last stored byte -3: - inc %o0 - deccc %o2 - ldub [%o1], %o3 - stb %o3, [%o0] - bgu %ncc, 3b - inc %o1 - -#else - - andcc %o3, 7, %o5 ! Number of bytes needed to completely - ! fill %d0 with good (unwritten) data. - bz %ncc, 2f - sub %o5, 8, %o3 ! -(number of good bytes in %d0) - cmp %o2, 8 - bl,a %ncc, 3f ! Not enough bytes to fill %d0 - add %o1, %o3, %o1 ! Back up %o1 - -1: - deccc %o5 - ldda [%o1]ASI_FL8_P, %d2 - inc %o1 - bgu %ncc, 1b - bshuffle %d0, %d2, %d0 ! shifts d0 left 1 byte and or's in d2 - -2: - subcc %o2, 8, %o2 - std %d0, [%o0] - bz %ncc, .exit_memcpy - add %o0, 8, %o0 -3: - ldub [%o1], %o3 - deccc %o2 - inc %o1 - stb %o3, [%o0] - bgu %ncc, 3b - inc %o0 -#endif - -.exit_memcpy: - wr %o4, %g0, %fprs ! fprs = o4 restore fprs - mov %g5, %asi ! restore %asi - retl - mov %g1, %o0 - - .align ICACHE_LINE_SIZE -.large: - ! The following test for BSTORE_SIZE is used to decide whether - ! to store data with a block store or with individual stores. - ! The block store wins when the amount of data is so large - ! that it is causes other application data to be moved out - ! of the L1 or L2 cache. - ! On a Panther, block store can lose more often because block - ! store forces the stored data to be removed from the L3 cache. - ! - sethi %hi(BSTORE_SIZE),%o5 - or %o5,%lo(BSTORE_SIZE),%o5 - cmp %o2, %o5 - bgu %ncc, .xlarge - - ! %o0 I/O DST is 64-byte aligned - ! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN) - ! %d0 I/O already loaded with SRC data from [%o1-8] - ! 
%o2 I/O count (number of bytes that need to be written) - ! %o3 I Not written. If zero, then SRC is double aligned. - ! %o4 I Not written. Holds fprs. - ! %o5 O The number of doubles that remain to be written. - - ! Load the rest of the current block - ! Recall that %o1 is further into SRC than %o0 is into DST - - prefetch [%o0 + (0 * BLOCK_SIZE)], #n_writes - prefetch [%o0 + (1 * BLOCK_SIZE)], #n_writes - prefetch [%o0 + (2 * BLOCK_SIZE)], #n_writes - ldda [%o1]%asi, %d2 - prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read - ldda [%o1 + 0x8]%asi, %d4 - faligndata %d0, %d2, %d16 - ldda [%o1 + 0x10]%asi, %d6 - faligndata %d2, %d4, %d18 - ldda [%o1 + 0x18]%asi, %d8 - faligndata %d4, %d6, %d20 - ldda [%o1 + 0x20]%asi, %d10 - or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8 - prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read - faligndata %d6, %d8, %d22 - ldda [%o1 + 0x28]%asi, %d12 - movrlz %o3, %g0, %o5 ! if %o3 < 0, %o5 = 0 (needed lter) - faligndata %d8, %d10, %d24 - ldda [%o1 + 0x30]%asi, %d14 - faligndata %d10, %d12, %d26 - ldda [%o1 + 0x38]%asi, %d0 - sub %o2, BLOCK_SIZE, %o2 ! update count - prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read - add %o1, BLOCK_SIZE, %o1 ! update SRC - - ! Main loop. Write previous block. Load rest of current block. - ! Some bytes will be loaded that won't yet be written. -1: - ldda [%o1]%asi, %d2 - faligndata %d12, %d14, %d28 - ldda [%o1 + 0x8]%asi, %d4 - faligndata %d14, %d0, %d30 - std %d16, [%o0] - std %d18, [%o0+8] - std %d20, [%o0+16] - std %d22, [%o0+24] - std %d24, [%o0+32] - std %d26, [%o0+40] - std %d28, [%o0+48] - std %d30, [%o0+56] - sub %o2, BLOCK_SIZE, %o2 ! update count - prefetch [%o0 + (6 * BLOCK_SIZE)], #n_writes - prefetch [%o0 + (3 * BLOCK_SIZE)], #n_writes - add %o0, BLOCK_SIZE, %o0 ! 
update DST - ldda [%o1 + 0x10]%asi, %d6 - faligndata %d0, %d2, %d16 - ldda [%o1 + 0x18]%asi, %d8 - faligndata %d2, %d4, %d18 - ldda [%o1 + 0x20]%asi, %d10 - faligndata %d4, %d6, %d20 - ldda [%o1 + 0x28]%asi, %d12 - faligndata %d6, %d8, %d22 - ldda [%o1 + 0x30]%asi, %d14 - faligndata %d8, %d10, %d24 - ldda [%o1 + 0x38]%asi, %d0 - faligndata %d10, %d12, %d26 - cmp %o2, BLOCK_SIZE + 8 - prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read - bgu,pt %ncc, 1b - add %o1, BLOCK_SIZE, %o1 ! update SRC - faligndata %d12, %d14, %d28 - faligndata %d14, %d0, %d30 - stda %d16, [%o0]ASI_BLK_P ! store 64 bytes, bypass cache - cmp %o2, BLOCK_SIZE - bne %ncc, 2f ! exactly 1 block remaining? - add %o0, BLOCK_SIZE, %o0 ! update DST - brz,a %o3, 3f ! is SRC double aligned? - ldd [%o1], %d2 - -2: - add %o5, %o2, %o5 ! %o5 was already set to 0 or -8 - add %o5, %o3, %o5 - - ba .beginmedloop - andn %o5, 7, %o5 ! 8 byte aligned count - - ! This is when there is exactly 1 block remaining and SRC is aligned -3: - ! %d0 was loaded in the last iteration of the loop above, and - ! %d2 was loaded in the branch delay slot that got us here. - ldd [%o1 + 0x08], %d4 - ldd [%o1 + 0x10], %d6 - ldd [%o1 + 0x18], %d8 - ldd [%o1 + 0x20], %d10 - ldd [%o1 + 0x28], %d12 - ldd [%o1 + 0x30], %d14 - stda %d0, [%o0]ASI_BLK_P - - ba .exit_memcpy - nop - - - .align 16 - ! two nops here causes loop starting at 1f below to be - ! on a cache line boundary, improving performance - nop - nop -xlarge: -.xlarge: - /* - set 4096, %l2 - subcc %o2, %l2, %g0 - bge %ncc, size_ge_4k - nop - */ - ! %o0 I/O DST is 64-byte aligned - ! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN) - ! %d0 I/O already loaded with SRC data from [%o1-8] - ! %o2 I/O count (number of bytes that need to be written) - ! %o3 I Not written. If zero, then SRC is double aligned. - ! %o4 I Not written. Holds fprs. - ! %o5 O The number of doubles that remain to be written. - - ! Load the rest of the current block - ! 
Recall that %o1 is further into SRC than %o0 is into DST - - ! prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read - ! executed in delay slot for branch to .xlarge - prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read - prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read - ldda [%o1]%asi, %d2 - prefetch [%o1 + (6 * BLOCK_SIZE)], #one_read - ldda [%o1 + 0x8]%asi, %d4 - faligndata %d0, %d2, %d16 - ldda [%o1 + 0x10]%asi, %d6 - faligndata %d2, %d4, %d18 - ldda [%o1 + 0x18]%asi, %d8 - faligndata %d4, %d6, %d20 - ldda [%o1 + 0x20]%asi, %d10 - or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8 - faligndata %d6, %d8, %d22 - ldda [%o1 + 0x28]%asi, %d12 - movrlz %o3, %g0, %o5 ! if %o3 < 0, %o5 = 0 (needed later) - faligndata %d8, %d10, %d24 - ldda [%o1 + 0x30]%asi, %d14 - faligndata %d10, %d12, %d26 - ldda [%o1 + 0x38]%asi, %d0 - sub %o2, BLOCK_SIZE, %o2 ! update count - prefetch [%o1 + (7 * BLOCK_SIZE)], #one_read - add %o1, BLOCK_SIZE, %o1 ! update SRC - - ! This point is 32-byte aligned since 24 instructions appear since - ! the previous alignment directive. - - - ! Main loop. Write previous block. Load rest of current block. - ! Some bytes will be loaded that won't yet be written. -1: - ldda [%o1]%asi, %d2 - faligndata %d12, %d14, %d28 - ldda [%o1 + 0x8]%asi, %d4 - faligndata %d14, %d0, %d30 - stda %d16, [%o0]ASI_BLK_P - sub %o2, BLOCK_SIZE, %o2 ! update count - ldda [%o1 + 0x10]%asi, %d6 - faligndata %d0, %d2, %d16 - ldda [%o1 + 0x18]%asi, %d8 - faligndata %d2, %d4, %d18 - ldda [%o1 + 0x20]%asi, %d10 - faligndata %d4, %d6, %d20 - ldda [%o1 + 0x28]%asi, %d12 - faligndata %d6, %d8, %d22 - ldda [%o1 + 0x30]%asi, %d14 - faligndata %d8, %d10, %d24 - ldda [%o1 + 0x38]%asi, %d0 - faligndata %d10, %d12, %d26 - ! offset of 8*BLK+8 bytes works best over range of (src-dst) mod 1K - prefetch [%o1 + (8 * BLOCK_SIZE) + 8], #one_read - add %o0, BLOCK_SIZE, %o0 ! update DST - cmp %o2, BLOCK_SIZE + 8 - ! second prefetch important to correct for occasional dropped - ! 
initial prefetches, 5*BLK works best over range of (src-dst) mod 1K - ! strong prefetch prevents drops on Panther, but Jaguar and earlier - ! US-III models treat strong prefetches as weak prefetchs - ! to avoid regressions on customer hardware, we retain the prefetch - prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read - bgu,pt %ncc, 1b - add %o1, BLOCK_SIZE, %o1 ! update SRC - - faligndata %d12, %d14, %d28 - faligndata %d14, %d0, %d30 - stda %d16, [%o0]ASI_BLK_P ! store 64 bytes, bypass cache - cmp %o2, BLOCK_SIZE - bne %ncc, 2f ! exactly 1 block remaining? - add %o0, BLOCK_SIZE, %o0 ! update DST - brz,a %o3, 3f ! is SRC double aligned? - ldd [%o1], %d2 - -2: - add %o5, %o2, %o5 ! %o5 was already set to 0 or -8 - add %o5, %o3, %o5 - - - ba .beginmedloop - andn %o5, 7, %o5 ! 8 byte aligned count - - - ! This is when there is exactly 1 block remaining and SRC is aligned -3: - ! %d0 was loaded in the last iteration of the loop above, and - ! %d2 was loaded in the branch delay slot that got us here. - ldd [%o1 + 0x08], %d4 - ldd [%o1 + 0x10], %d6 - ldd [%o1 + 0x18], %d8 - ldd [%o1 + 0x20], %d10 - ldd [%o1 + 0x28], %d12 - ldd [%o1 + 0x30], %d14 - stda %d0, [%o0]ASI_BLK_P - - ba .exit_memcpy - nop - -copying_ge_512: - mov %o0, %o5 ! save dst address for return value. - ! both src and dst are aligned to 8 byte boundary. - save %sp, -SA(STACK_OFFSET + SAVESIZE), %sp - mov %i0, %o0 - mov %i1, %o1 - mov %i2, %o2 - mov %i3, %o3 - mov %i5, %o5 -#ifndef __sparcv9 - stx %g4, [%sp + STACK_OFFSET + g4_offset] - stx %g5, [%sp + STACK_OFFSET + g5_offset] -#endif - rd %fprs, %g5 ! check for unused fp - andcc %g5, FPRS_FEF, %g5 ! test FEF, fprs.du = fprs.dl = 0 - bz,a %ncc, 1f - wr %g0, FPRS_FEF, %fprs ! 
fprs.fef = 1 -1: - !predfetch src buf - sub %o1,1,%o3 - andn %o3,0x7f,%l1 - add %l1,128,%l1 - prefetch [%l1],2 !prefetch next 128b - prefetch [%l1+64],2 - prefetch [%l1+(2*64)],2 !cont from above - prefetch [%l1+(3*64)],2 - !predfetch dst buf - sub %o5,1,%o3 - andn %o3,0x7f,%l1 - add %l1,128,%l1 - prefetch [%l1],2 !prefetch next 128b - prefetch [%l1+64],2 - prefetch [%l1+(2*64)],2 !cont from above - prefetch [%l1+(3*64)],2 - - andcc %o5,0x7f,%o3 !o3=0 , means it is already 128 align - brz,pn %o3,aligned_on_128 - sub %o3,128,%o3 - - add %o2,%o3,%o2 -align_to_128: - ldxa [%o1]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o1,8,%o1 ! increment src pointer - stxa %o4,[%o5]ASI_CACHE_SPARING_PRIMARY - addcc %o3,8,%o3 - bl,pt %ncc,align_to_128 - add %o5,8,%o5 ! increment dst pointer - -aligned_on_128: - andcc %o5,0x1ff,%o3 !%o3=0 when it is 512 b aligned. - brnz,pn %o3, 4f - mov %o2,%l4 !l4=count from 512 align - set 4096, %l2 - subcc %o2, %l2, %g0 - bge,pn %ncc, stingray_optimized_copy - nop -4: - - sub %o5,8,%l6 !should be in current 512 chunk - andn %l6,0x1ff,%o3 !%o3=aligned 512b addr - add %o3,0x200,%o3 !%o3=next aligned 512b addr to start - ! stingray_optimized_copy - sub %o3,%o5,%o3 !o3=how many byte in the current remaining chunk - sub %o2,%o3,%l4 !l4=count from 512 align - /* - * if l4 is < 4096 do interleave_128_copy only. - */ - set 4096, %l2 - subcc %l4, %l2, %g0 - bge,pn %ncc,6f - nop - mov %g0, %l4 - add %o5, %o2, %l1 - ba interleave_128_copy - nop -6: - mov %o3, %o2 - subcc %o3,256,%g0 ! if it is > 256 bytes , could use the - ! interleave_128_copy - bl,pn %ncc,copy_word ! o.w use copy_word to finish the 512 byte - ! alignment. - !%o1=64 bytes data - !%o5=next 8 byte addr to write - !%o2=new count i.e how many bytes to write - add %o5,%o2,%l1 !cal the last byte to write %l1 - ba interleave_128_copy - nop - - .align 64 -interleave_128_copy: - ! %l1 has the addr of the dest. buffer at or beyond which no write - ! is to be done. - ! 
%l4 has the number of bytes to zero using stingray_optimized_bzero - !prefetch src - !prefetch src - - add %o1, 256, %o3 - prefetch [%o3], 2 !1st 64 byte line of next 256 byte block - add %o1, 384, %o3 - prefetch [%o3], 2 !3rd 64 byte line of next 256 byte block - add %o1, 320, %o3 - prefetch [%o3], 2 !2nd 64 byte line of next 256 byte block - add %o1, 448, %o3 - prefetch [%o3], 2 !4th 64 byte line of next 256 byte block - - !prefetch dst - - add %o5, 256, %o3 - prefetch [%o3], 2 !1st 64 byte line of next 256 byte block - add %o5, 384, %o3 - prefetch [%o3], 2 !3rd 64 byte line of next 256 byte block - add %o5, 320, %o3 - prefetch [%o3], 2 !2nd 64 byte line of next 256 byte block - add %o5, 448, %o3 - prefetch [%o3], 2 !4th 64 byte line of next 256 byte block - - ldxa [%o1]ASI_CACHE_SPARING_PRIMARY, %o4 - stxa %o4,[%o5]ASI_CACHE_SPARING_PRIMARY !1st 64 byte line - add %o1, 128, %o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, 128, %o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY !3rd 64 byte line - add %o1, (1 * 8), %o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (1 * 8), %o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (1 * 8 + 128), %o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (1 * 8 + 128), %o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (2 * 8),%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (2 * 8),%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (2 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (2 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (3 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (3 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (3 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (3 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (4 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (4 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (4 * 8 
+ 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (4 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (5 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (5 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (5 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (5 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (6 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (6 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (6 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (6 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (7 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (7 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (7 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (7 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (8 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (8 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (8 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (8 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (9 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (9 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (9 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (9 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (10 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (10 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (10 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (10 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (11 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (11 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (11 * 8 + 128) ,%o3 - ldxa 
[%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (11 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (12 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (12 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (12 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (12 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (13 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (13 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (13 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (13 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (14 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (14 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (14 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (14 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (15 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (15 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, (15 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o5, (15 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o1, 256, %o1 - ! check if the next 256 byte copy will not exceed the number of - ! bytes remaining to be copied. - ! %l2 points to the dest buffer after copying 256 bytes more. - ! %l1 points to dest. buffer at or beyond which no writes should be done. - add %o5,512,%l2 - - subcc %l1,%l2,%g0 - bge,pt %ncc,interleave_128_copy - add %o5,256,%o5 - -copy_word: - and %o2,255,%o3 - and %o3,7,%o2 - - ! Set the remaining doubles - subcc %o3, 8, %o3 ! Can we store any doubles? - bl,pn %ncc, 6f - and %o2, 7, %o2 ! 
calc bytes left after doubles - - !prefetch src - - mov %o1, %o4 - prefetch [%o4], 2 !1st 64 byte line of next 256 byte block - add %o1, 128, %o4 - prefetch [%o4], 2 !3rd 64 byte line of next 256 byte block - add %o1, 64, %o4 - prefetch [%o4], 2 !2nd 64 byte line of next 256 byte block - add %o1, 192, %o4 - prefetch [%o4], 2 !4th 64 byte line of next 256 byte block - - !prefetch dst - - mov %o5, %o4 - prefetch [%o4], 2 !1st 64 byte line of next 256 byte block - add %o5, 128, %o4 - prefetch [%o4], 2 !3rd 64 byte line of next 256 byte block - add %o5, 64, %o4 - prefetch [%o4], 2 !2nd 64 byte line of next 256 byte block - add %o5, 192, %o4 - prefetch [%o4], 2 !4th 64 byte line of next 256 byte block - -5: - ldxa [%o1]ASI_CACHE_SPARING_PRIMARY, %o4 - add %o1, 8, %o1 - stxa %o4, [%o5]ASI_CACHE_SPARING_PRIMARY - subcc %o3, 8, %o3 - bge,pt %ncc, 5b - add %o5, 8, %o5 -6: - ! Set the remaining bytes - brz %o2, can_we_do_stingray_optimized_copy - nop - - ! Terminate the copy with a partial store. - ! The data should be at d0 - ldxa [%o1]ASI_CACHE_SPARING_PRIMARY, %o4 - stx %o4, [%sp + STACK_OFFSET + scratch_offset] - ldd [%sp + STACK_OFFSET + scratch_offset], %d0 - - dec %o2 ! needed to get the mask right - edge8n %g0, %o2, %o4 - stda %d0, [%o5]%o4, ASI_PST8_P -can_we_do_stingray_optimized_copy: - mov %l4, %o2 - brnz,pn %o2, stingray_optimized_copy - nop - -exit: - brnz %g5, 1f - nop - wr %g5, %g0, %fprs -1: -#ifndef __sparcv9 - ldx [%sp + STACK_OFFSET + g4_offset], %g4 - ldx [%sp + STACK_OFFSET + g5_offset], %g5 -#endif - ret ! %o0 was preserved - restore - - -stingray_optimized_copy: -!%o5 = next memory addr which is 512 b align -!%l4 = remaining byte from 512 align. 
- - add %o5, %l4, %o2 - - prefetch [%o1+0],2 - prefetch [%o1+(64*1)],2 - prefetch [%o1+(64*2)],2 - prefetch [%o1+(64*3)],2 - prefetch [%o1+(64*4)],2 - prefetch [%o1+(64*5)],2 - prefetch [%o1+(64*6)],2 - prefetch [%o1+(64*7)],2 - prefetch [%o1+(64*8)],2 - prefetch [%o1+(64*9)],2 - prefetch [%o1+(64*10)],2 - prefetch [%o1+(64*11)],2 - prefetch [%o1+(64*12)],2 - prefetch [%o1+(64*13)],2 - prefetch [%o1+(64*14)],2 - prefetch [%o1+(64*15)],2 - - prefetch [%o5+0],2 - prefetch [%o5+(64*1)],2 - prefetch [%o5+(64*2)],2 - prefetch [%o5+(64*3)],2 - prefetch [%o5+(64*4)],2 - prefetch [%o5+(64*5)],2 - prefetch [%o5+(64*6)],2 - prefetch [%o5+(64*7)],2 - prefetch [%o5+(64*8)],2 - prefetch [%o5+(64*9)],2 - prefetch [%o5+(64*10)],2 - prefetch [%o5+(64*11)],2 - prefetch [%o5+(64*12)],2 - prefetch [%o5+(64*13)],2 - prefetch [%o5+(64*14)],2 - prefetch [%o5+(64*15)],2 - - ba myloop2 - srl %l4, 12, %l4 - - ! Local register usage: - ! - ! %l1 address at short distance ahead of current %o1 for prefetching - ! into L1 cache. - ! %l2 address at far ahead of current %o1 for prefetching into L2 cache. - ! %l3 save %o5 at start of inner loop. - ! %l4 Number of 4k blocks to copy - ! %g1 save %o1 at start of inner loop. - ! %l5 iteration counter to make buddy loop execute 2 times. - ! %l6 iteration counter to make inner loop execute 32 times. - ! %l7 address at far ahead of current %o5 for prefetching destination - ! into L2 cache. - -.align 64 -myloop2: - set 2,%l5 ! %l5 is the loop count for the buddy loop, for 2 buddy lines. - add %o5, 0, %l3 - add %o1, 0, %g1 -buddyloop: - set PF_FAR, %g4 ! Prefetch far ahead. CHANGE FAR PREFETCH HERE. - add %o1, %g4, %l2 ! For prefetching far ahead, set %l2 far ahead - ! of %o1 - add %o1, PF_NEAR, %l1 ! For prefetching into L1 D$, set %l1 a - ! little ahead of %o1 - add %o5, %g4, %l7 ! For prefetching far ahead, set %l7 far ahead - ! of %o5 - - add %l2, %g4, %g4 ! %g4 is now double far ahead of the source - ! address in %o1. - prefetch [%g4+%g0],2 ! 
Prefetch ahead by several pages to get TLB - ! entry in advance. - set 2*PF_FAR, %g4 ! Prefetch double far ahead. SET DOUBLE FAR - ! PREFETCH HERE. - add %o5, %g4, %g4 ! %g4 is now double far ahead of the dest - ! address in %o5. - prefetch [%g4+%g0],2 ! Prefetch ahead by 2 pages to get TLB entry - ! in advance. - - set 4,%l6 ! %l6 = loop count for the inner loop, - ! for 4 x 8 = 32 lines. - set 0, %g4 - - ! Each iteration of the inner loop below copies 8 sequential lines. - ! This loop is iterated 4 times, to move a total of 32 lines, - ! all of which have the same value of PA[9], so we increment the base - ! address by 1024 bytes in each iteration, which varies PA[10]. */ -innerloop: - /* ---- copy line 1 of 8. ---- */ - prefetch [%l2+%g4],2 - prefetch [%l7+%g4],2 - prefetch [%l1+%g4],1 - - ldd [%o1],%d0 - ldd [%o1+8],%d2 - ldd [%o1+16],%d4 - ldd [%o1+24],%d6 - ldd [%o1+32],%d8 - ldd [%o1+40],%d10 - ldd [%o1+48],%d12 - ldd [%o1+56],%d14 - stda %d0,[%o5]ASI_BLK_P - add %g4, 64, %g4 - add %o5, 64, %o5 - add %o1, 64, %o1 /* increment %o1 for the next source line. */ - - /* ---- copy line 2 of 8. ---- */ - prefetch [%l2+%g4],2 - prefetch [%l7+%g4],2 - prefetch [%l1+%g4],1 - - ldd [%o1],%d0 - ldd [%o1+8],%d2 - ldd [%o1+16],%d4 - ldd [%o1+24],%d6 - ldd [%o1+32],%d8 - ldd [%o1+40],%d10 - ldd [%o1+48],%d12 - ldd [%o1+56],%d14 - stda %d0,[%o5]ASI_BLK_P - add %g4, 64, %g4 - add %o5, 64, %o5 - add %o1, 64, %o1 /* increment %o1 for the next source line. */ - - /* ---- copy line 3 of 8. ---- */ - prefetch [%l2+%g4],2 - prefetch [%l7+%g4],2 - prefetch [%l1+%g4],1 - - ldd [%o1],%d0 - ldd [%o1+8],%d2 - ldd [%o1+16],%d4 - ldd [%o1+24],%d6 - ldd [%o1+32],%d8 - ldd [%o1+40],%d10 - ldd [%o1+48],%d12 - ldd [%o1+56],%d14 - stda %d0,[%o5]ASI_BLK_P - add %g4, 64, %g4 - add %o5, 64, %o5 - add %o1, 64, %o1 /* increment %o1 for the next source line. */ - - /* ---- copy line 4 of 8. 
---- */ - prefetch [%l2+%g4],2 - prefetch [%l7+%g4],2 - prefetch [%l1+%g4],1 - - ldd [%o1],%d0 - ldd [%o1+8],%d2 - ldd [%o1+16],%d4 - ldd [%o1+24],%d6 - ldd [%o1+32],%d8 - ldd [%o1+40],%d10 - ldd [%o1+48],%d12 - ldd [%o1+56],%d14 - stda %d0,[%o5]ASI_BLK_P - add %g4, 64, %g4 - add %o5, 64, %o5 - add %o1, 64, %o1 /* increment %o1 for the next source line. */ - - /* ---- copy line 5 of 8. ---- */ - prefetch [%l2+%g4],2 - prefetch [%l7+%g4],2 - prefetch [%l1+%g4],1 - - ldd [%o1],%d0 - ldd [%o1+8],%d2 - ldd [%o1+16],%d4 - ldd [%o1+24],%d6 - ldd [%o1+32],%d8 - ldd [%o1+40],%d10 - ldd [%o1+48],%d12 - ldd [%o1+56],%d14 - stda %d0,[%o5]ASI_BLK_P - add %g4, 64, %g4 - add %o5, 64, %o5 - add %o1, 64, %o1 /* increment %o1 for the next source line. */ - - /* ---- copy line 6 of 8. ---- */ - prefetch [%l2+%g4],2 - prefetch [%l7+%g4],2 - prefetch [%l1+%g4],1 - - ldd [%o1],%d0 - ldd [%o1+8],%d2 - ldd [%o1+16],%d4 - ldd [%o1+24],%d6 - ldd [%o1+32],%d8 - ldd [%o1+40],%d10 - ldd [%o1+48],%d12 - ldd [%o1+56],%d14 - stda %d0,[%o5]ASI_BLK_P - add %g4, 64, %g4 - add %o5, 64, %o5 - add %o1, 64, %o1 /* increment %o1 for the next source line. */ - - /* ---- copy line 7 of 8. ---- */ - prefetch [%l2+%g4],2 - prefetch [%l7+%g4],2 - prefetch [%l1+%g4],1 - - ldd [%o1],%d0 - ldd [%o1+8],%d2 - ldd [%o1+16],%d4 - ldd [%o1+24],%d6 - ldd [%o1+32],%d8 - ldd [%o1+40],%d10 - ldd [%o1+48],%d12 - ldd [%o1+56],%d14 - stda %d0,[%o5]ASI_BLK_P - add %g4, 64, %g4 - add %o5, 64, %o5 - add %o1, 64, %o1 /* increment %o1 for the next source line. */ - - /* ---- copy line 8 of 8. ---- */ - prefetch [%l2+%g4],2 - prefetch [%l7+%g4],2 - prefetch [%l1+%g4],1 - - ldd [%o1],%d0 - ldd [%o1+8],%d2 - ldd [%o1+16],%d4 - ldd [%o1+24],%d6 - ldd [%o1+32],%d8 - ldd [%o1+40],%d10 - ldd [%o1+48],%d12 - ldd [%o1+56],%d14 - stda %d0,[%o5]ASI_BLK_P - - subcc %l6,1,%l6 /* Decrement the inner loop counter. */ - - ! 
Now increment by 64 + 512 so we don't toggle PA[9] - add %g4, 576, %g4 - add %o5, 576, %o5 - - bg,pt %icc,innerloop - add %o1, 576, %o1 ! increment %o1 for the next source line. - ! END OF INNER LOOP - - - subcc %l5,1,%l5 - add %l3, 512, %o5 ! increment %o5 to first buddy line of dest. - bg,pt %icc,buddyloop - add %g1, 512 ,%o1 ! Set %o1 to the first of the odd buddy lines. - - subcc %l4, 1, %l4 - add %o5, 3584, %o5 ! Advance both base addresses to 4k above where - ! they started. - add %o1, 3584, %o1 ! They were already incremented by 512, - ! so just add 3584. - - bg,pt %icc,myloop2 - nop - - /****larryalg_end_here*************/ - - sub %o2,%o5,%o2 !how many byte left - brz,pn %o2,complete_write - mov %g0,%l4 - add %o5,%o2,%l1 !cal the last byte to write %l1 - subcc %o2,256,%g0 - bge,pt %ncc,interleave_128_copy - mov %g0,%l4 - - ba copy_word - nop - - -complete_write: - ba exit - nop - - - - SET_SIZE(memcpy) - SET_SIZE(__align_cpy_1)
--- a/usr/src/lib/libc/sparc_hwcap1/common/gen/memset.s Thu Aug 06 17:19:00 2009 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,767 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - - - .file "memset.s" -/* - * char *memset(sp, c, n) - * - * Set an array of n chars starting at sp to the character c. - * Return sp. - * - * Fast assembler language version of the following C-program for memset - * which represents the `standard' for the C-library. 
- * - * void * - * memset(void *sp1, int c, size_t n) - * { - * if (n != 0) { - * char *sp = sp1; - * do { - * *sp++ = (char)c; - * } while (--n != 0); - * } - * return (sp1); - * } - */ - -#include <sys/asm_linkage.h> -#include <sys/sun4asi.h> - - ANSI_PRAGMA_WEAK(memset,function) - -#define SAVESIZE (8 * 1) -#ifdef __sparcv9 -#define STACK_OFFSET (STACK_BIAS + 0) -#else -#define STACK_OFFSET (STACK_BIAS + 0 + 0) -#endif -#define scratch_offset 0 - -#define ASI_CACHE_SPARING_PRIMARY 0xf4 -#define ALIGN8(X) (((X) + 7) & ~7) -#define ICACHE_LINE_SIZE 64 -#define FPRS_FEF 0x4 -#define PF_FAR 2048 - - .section ".text" - .align ICACHE_LINE_SIZE - - /* - * Optimizations done: - * - * No stores in delay slot of branch instructions. - * conditional stores where possible - * prefetch before doing stxa - * Bank interleaved writing. - */ - - ENTRY(memset) - add %sp, -SA(STACK_OFFSET + SAVESIZE), %sp - mov %o0, %o5 ! copy sp1 before using it - /* - * If 0 bytes to xfer return - */ - brnz %o2, continue - nop - retl - add %sp, SA(STACK_OFFSET + SAVESIZE), %sp -continue: - /* - * If the count is multiple of 8 and buffer is aligned to 8 - * we don't have to look at fprs - */ - or %o5, %o2, %o3 - and %o3, 7, %o3 - brnz %o3, check_fprs - mov 4, %g1 - prefetch [%o5],2 - ba skip_rd_fprs - nop - -check_fprs: - rd %fprs, %g1 ! g1 = fprs -skip_rd_fprs: - prefetch [%o5],2 - andcc %g1, 0x4, %g1 ! fprs.du = fprs.dl = 0 - bnz %ncc, 1f ! Is fprs.fef == 1 - nop - wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1 -1: - and %o1, 0xff, %o1 ! o1 is (char)c - sll %o1, 8, %o3 - or %o1, %o3, %o1 ! now o1 has 2 bytes of c - sll %o1, 16, %o3 - or %o1, %o3, %o1 ! now o1 has 4 bytes of c - sllx %o1, 32, %o3 - or %o1, %o3, %o1 ! now o1 has 8 bytes of c - stx %o1, [%sp + STACK_OFFSET + scratch_offset] - ldd [%sp + STACK_OFFSET + scratch_offset], %d0 - cmp %o2, 8 - bge,pt %ncc, xfer_8_or_more - mov %o0, %o5 - /* - * Do a partial store of %o2 bytes - */ - andcc %o5, 7, %o3 ! 
is sp1 aligned on a 8 byte bound - brz,pt %o3, aligned_on_8 - sub %o5, %o3, %o5 ! align the destination buffer. - mov %o3, %o1 - mov 8, %o4 - sub %o4, %o3, %o3 - cmp %o3, %o2 - bg,a,pn %ncc, 1f - mov %o2, %o3 -1: - ! %o3 has the bytes to be written in partial store. - sub %o2, %o3, %o2 - dec %o3 - prefetch [%o5],2 - edge8n %g0, %o3, %o4 - srl %o4, %o1, %o4 - stda %d0, [%o5]%o4, ASI_PST8_P - brz %o2, simple_ret - add %o5, 8, %o5 -aligned_on_8: - prefetch [%o5],2 - dec %o2 ! needed to get the mask right - edge8n %g0, %o2, %o4 - stda %d0, [%o5]%o4, ASI_PST8_P - brnz %g1, 1f ! was fprs.fef == 1 - nop - wr %g1, %g0, %fprs ! fprs = g1 restore fprs -1: - retl - add %sp, SA(STACK_OFFSET + SAVESIZE), %sp - -xfer_8_or_more: - andcc %o5, 7, %o3 ! is sp1 aligned on a 8 byte bound - brz,pt %o3, blkchk - sub %o5, %o3, %o5 ! align the destination buffer. - sub %o3, 8, %o3 ! -(bytes till double aligned) - add %o2, %o3, %o2 ! update o2 with new count - xor %o3, 0xff, %o3 - and %o3, 7, %o3 - prefetch [%o5],2 - edge8ln %g0, %o3, %o4 - stda %d0, [%o5]%o4, ASI_PST8_P - add %o5, 8, %o5 - - - ! Now sp1 is double aligned (sp1 is found in %o5) -blkchk: - cmp %o2, 767 ! if large count use Block ld/st - bg,pt %ncc,blkwr - nop - - - and %o2, 24, %o3 ! o3 is {0, 8, 16, 24} - - brz %o3, skip_dw_loop - nop - -1: subcc %o3, 8, %o3 ! double-word loop - stx %o1, [%o5] - bgu,pt %ncc, 1b - add %o5, 8, %o5 -skip_dw_loop: - andncc %o2, 31, %o4 ! o4 has 32 byte aligned count - brz,pn %o4, 3f - nop - ba loop_32byte - nop - - .align ICACHE_LINE_SIZE - -loop_32byte: - subcc %o4, 32, %o4 ! main loop, 32 bytes per iteration - stx %o1, [%o5] - stx %o1, [%o5 + 8] - stx %o1, [%o5 + 16] - stx %o1, [%o5 + 24] - bne,pt %ncc, loop_32byte - add %o5, 32, %o5 -3: - and %o2, 7, %o2 ! o2 has the remaining bytes (<8) - brz %o2, skip_partial_copy - nop - - ! Terminate the copy with a partial store. - ! The data should be at d0 - prefetch [%o5],2 - dec %o2 ! 
needed to get the mask right - edge8n %g0, %o2, %o4 - stda %d0, [%o5]%o4, ASI_PST8_P - -skip_partial_copy: -simple_ret: - brz,a %g1, 1f ! was fprs.fef == 0 - wr %g1, %g0, %fprs ! fprs = g1 restore fprs -1: - retl - add %sp, SA(STACK_OFFSET + SAVESIZE), %sp - -blkwr: - sub %o5,1,%o3 - andn %o3,0x7f,%o4 - add %o4,128,%o4 - prefetch [%o4],2 !prefetch next 128b - prefetch [%o4+64],2 - prefetch [%o4+(2*64)],2 !cont from above - prefetch [%o4+(3*64)],2 - - andcc %o5,0x7f,%o3 !o3=0 , means it is already 128 align - brz,pn %o3,alreadyalign128 - sub %o3,128,%o3 - - add %o2,%o3,%o2 -align128: - stxa %o1,[%o5]ASI_CACHE_SPARING_PRIMARY - addcc %o3,8,%o3 - bl,pt %ncc,align128 - add %o5,8,%o5 - - - -alreadyalign128: - andcc %o5,0x1ff,%o3 !%o3=0 when it is 512 b aligned. - brnz,pn %o3, 4f - mov %o2,%g5 !g5=count from 512 align - set 4096, %o4 - subcc %o2, %o4, %g0 - bge,pn %ncc, larry_alg - nop -4: - - sub %o5,8,%o4 !should be in current 512 chunk - andn %o4,0x1ff,%o3 !%o3=aligned 512b addr - add %o3,0x200,%o3 !%o3=next aligned 512b addr which start larry process - sub %o3,%o5,%o3 !o3=how many byte in the current remaining chunk - sub %o2,%o3,%g5 !g5=count from 512 align - /* - * if g5 is < 4096 do start_128 only. - */ - set 4096, %o4 - subcc %g5, %o4, %g0 - bge,pn %ncc,6f - nop - mov %g0, %g5 - add %o5, %o2, %o4 - ba start_128 - nop -6: - mov %o3, %o2 - subcc %o3,256,%g0 !if it is > 256 bytes , could use the st-interleave alg to wr - bl,pn %ncc,storeword !o.w use storeword to finish the 512 byte alignment. 
- !%o1=64 bytes data - !%o5=next 8 byte addr to write - !%o2=new count i.e how many bytes to write - add %o5,%o2,%o4 !cal the last byte to write %o4 - ba start_128 - nop - - .align 64 -start_128: - add %o5, 256, %o3 - prefetch [%o3], 2 !1st 64 byte line of next 256 byte block - add %o5, 384, %o3 - prefetch [%o3], 2 !3rd 64 byte line of next 256 byte block - add %o5, 320, %o3 - prefetch [%o3], 2 !2nd 64 byte line of next 256 byte block - add %o5, 448, %o3 - prefetch [%o3], 2 !4th 64 byte line of next 256 byte block - mov %o5, %o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !1st 64 byte line - add %o5,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !3rd 64 byte line - add %o5,8,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(2 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128 ,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(3 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(4 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(5 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(6 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(7 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(8 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(9 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(10 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(11 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa 
%o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(12 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(13 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(14 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(15 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,512,%o3 !%o3=final byte of next 256 byte, to check if more 256 byte block ahead - subcc %o4,%o3,%g0 !%o4=final byte location;%o3=final byte of next 256 byte block - bge,pt %ncc,start_128 !branch taken means next 256 byte block is still within the limit. - add %o5,256,%o5 - -!need to connect the rest of the program -storeword: - and %o2,255,%o3 - and %o3,7,%o2 - - ! Set the remaining doubles - subcc %o3, 8, %o3 ! Can we store any doubles? - bl,pn %ncc, 6f - and %o2, 7, %o2 ! calc bytes left after doubles - -5: - stxa %o1, [%o5]ASI_CACHE_SPARING_PRIMARY - subcc %o3, 8, %o3 - bge,pt %ncc, 5b - add %o5, 8, %o5 -6: - ! Set the remaining bytes - brz %o2, check_larry_alg ! safe to check all 64-bits - - ! Terminate the copy with a partial store. - ! The data should be at d0 - dec %o2 ! needed to get the mask right - edge8n %g0, %o2, %o4 - stda %d0, [%o5]%o4, ASI_PST8_P -check_larry_alg: - mov %g5, %o2 - brnz,pn %o2, larry_alg - nop - -.exit: - brz,a %g1, 1f ! was fprs.fef == 0 - wr %g1, %g0, %fprs ! fprs = g1 restore fprs -1: - retl ! %o0 was preserved - add %sp, SA(STACK_OFFSET + SAVESIZE), %sp - -larry_alg: - add %sp, SA(STACK_OFFSET + SAVESIZE), %sp - save %sp, -SA(MINFRAME), %sp - mov %i0, %o0 - mov %i1, %o1 - mov %i2, %o2 - mov %i3, %o3 - mov %i5, %o5 -!%o5 = next memory addr which is 512 b align -!%g5 = remaining byte from 512 align. 
-init: - set 4096,%g6 - - prefetch [%o5+0],2 - prefetch [%o5+(64*1)],2 - prefetch [%o5+(64*2)],2 - prefetch [%o5+(64*3)],2 - prefetch [%o5+(64*4)],2 - prefetch [%o5+(64*5)],2 - prefetch [%o5+(64*6)],2 - prefetch [%o5+(64*7)],2 - prefetch [%o5+(64*8)],2 - prefetch [%o5+(64*9)],2 - prefetch [%o5+(64*10)],2 - prefetch [%o5+(64*11)],2 - prefetch [%o5+(64*12)],2 - prefetch [%o5+(64*13)],2 - prefetch [%o5+(64*14)],2 - prefetch [%o5+(64*15)],2 - ba myloop2 - add %o5,%g5,%g5 - /* Local register usage: - %l3 save %o5 at start of inner loop. - %l5 iteration counter to make buddy loop execute 2 times. - %l6 iteration counter to make inner loop execute 32 times. - %l7 address at far ahead of current %o5 for prefetching destination into L2 cache. - */ - - .align 64 -myloop2: - /* Section 1 */ - set 2,%l5 /* %l5 is the loop count for the buddy loop, for 2 buddy lines. */ - add %o5, 0, %l3 -buddyloop: - set PF_FAR, %l4 /* Prefetch far ahead. CHANGE FAR PREFETCH HERE. <<==== */ - add %o5, %l4, %l7 /* For prefetching far ahead, set %l7 far ahead of %o5 */ - - set 2*PF_FAR, %l4 /* Prefetch double far ahead. SET DOUBLE FAR PREFETCH HERE. <<==== */ - add %o5, %l4, %l4 /* %l4 is now double far ahead of the dest address in %o5. */ - prefetch [%l4+%g0],2 /* Prefetch ahead by 2 pages to get TLB entry in advance. */ - - set 4,%l6 /* %l6 = loop count for the inner loop, for 4 x 8 = 32 lines. */ - set 0, %l4 - - -/* Each iteration of the inner loop below writes 8 sequential lines. This loop is iterated 4 times, - to move a total of 32 lines, all of which have the same value of PA[9], so we increment the base - address by 1024 bytes in each iteration, which varies PA[10]. 
*/ -innerloop: - add %o5, PF_FAR, %o3 - prefetch [%o3],2 - add %o3, 64, %o3 - prefetch [%o3],2 - add %o3, 64, %o3 - prefetch [%o3],2 - add %o3, 64, %o3 - prefetch [%o3],2 - add %o3, 64, %o3 - prefetch [%o3],2 - add %o3, 64, %o3 - prefetch [%o3],2 - add %o3, 64, %o3 - prefetch [%o3],2 - add %o3, 64, %o3 - prefetch [%o3],2 - - mov %o5, %o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !1st 64 byte line - add %o5,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !3rd 64 byte line - add %o5,8,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(2 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128 ,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(3 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(4 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(5 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(6 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(7 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(8 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(9 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(10 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(11 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(12 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(13 * 8),%o3 - stxa 
%o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(14 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(15 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - - add %o5,256,%o5 - - mov %o5, %o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !1st 64 byte line - add %o5,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !3rd 64 byte line - add %o5,8,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(2 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128 ,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(3 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(4 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(5 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(6 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(7 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(8 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(9 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(10 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(11 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(12 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa 
%o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(13 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(14 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(15 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - - subcc %l6,1,%l6 /* Decrement the inner loop counter. */ - - /* -------- Now increment by 256 + 512 so we don't toggle PA[9] -------- */ - add %o5, 768, %o5 - - bg,pt %ncc,innerloop - nop -/* ------------------------ END OF INNER LOOP -------------------------- */ - - subcc %l5,1,%l5 - add %l3, 512, %o5 /* increment %o5 to first buddy line of dest. */ - bg,pt %ncc,buddyloop - nop - add %o5, 3584, %o5 /* Advance both base addresses to 4k above where they started. */ - !%o5=next 4096 block. - add %o5,%g6,%i5 - subcc %g5,%i5,%g0 - bge,pt %ncc,myloop2 - nop - - - /****larryalg_end_here*************/ - - sub %g5,%o5,%o2 !how many byte left - brz,pn %o2,complete_write - mov %g0,%g5 - add %o5,%o2,%o4 !cal the last byte to write %o4 - subcc %o2,256,%g0 - bge,pt %ncc,memset_128 - mov %g0,%g5 - - ba memset_storeword - nop - - -complete_write: - brz,a %g1, 1f ! was fprs.fef == 0 - wr %g1, %g0, %fprs ! fprs = g1 restore fprs -1: - ret ! 
%o0 was preserved - restore - - .align 64 -memset_128: - add %o5, 256, %o3 - prefetch [%o3], 2 !1st 64 byte line of next 256 byte block - add %o5, 384, %o3 - prefetch [%o3], 2 !3rd 64 byte line of next 256 byte block - add %o5, 320, %o3 - prefetch [%o3], 2 !2nd 64 byte line of next 256 byte block - add %o5, 448, %o3 - prefetch [%o3], 2 !4th 64 byte line of next 256 byte block - mov %o5, %o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !1st 64 byte line - add %o5,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY !3rd 64 byte line - add %o5,8,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(2 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128 ,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(3 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(4 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(5 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(6 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(7 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(8 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(9 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(10 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(11 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(12 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa 
%o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(13 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(14 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,(15 * 8),%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY - add %o5,512,%l4 !%l4=final byte of next 256 byte, to check if more 256 byte block ahead - add %o3,128,%o3 - stxa %o1,[%o3]ASI_CACHE_SPARING_PRIMARY -!this branch condition is not needed if we are handling bytes before 4096b -!because we will only issue once, so %l6 is an invalid data -!the branch is really for handling bytes after 4096b, there could be -!multiple of 256 byte block to work on. - - subcc %o4,%l4,%g0 !%o4=final byte location;%l4=final byte of next 256 byte block - bge,pt %ncc,memset_128 !branch taken means next 256 byte block is still within the limit. - add %o5,256,%o5 - -!need to connect the rest of the program -memset_storeword: - and %o2,255,%o3 - and %o3,7,%o2 - - ! Set the remaining doubles - subcc %o3, 8, %o3 ! Can we store any doubles? - bl,pn %ncc, 6f - and %o2, 7, %o2 ! calc bytes left after doubles - -5: - stxa %o1, [%o5]ASI_CACHE_SPARING_PRIMARY - subcc %o3, 8, %o3 - bge,pt %ncc, 5b - add %o5, 8, %o5 -6: - ! Set the remaining bytes - brz %o2, complete_write ! safe to check all 64-bits - - ! Terminate the copy with a partial store. - ! The data should be at d0 - dec %o2 ! needed to get the mask right - edge8n %g0, %o2, %o4 - stda %d0, [%o5]%o4, ASI_PST8_P - - brz,a %g1, 1f ! was fprs.fef == 0 - wr %g1, %g0, %fprs ! fprs = g1 restore fprs -1: - ret ! %o0 was preserved - restore - - - SET_SIZE(memset)
--- a/usr/src/lib/libc/sparc_hwcap1/common/gen/misc.s Thu Aug 06 17:19:00 2009 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include <sys/asm_linkage.h> - - ENTRY(_rock_pause) - membar #Halt - retl - nop - SET_SIZE(_rock_pause)
--- a/usr/src/lib/libc/sparc_hwcap1/common/gen/strcpy.s Thu Aug 06 17:19:00 2009 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,340 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - - .file "strcpy.s" - -/* - * strcpy(s1, s2) - * - * Copy string s2 to s1. s1 must be large enough. Return s1. - * - * Fast assembler language version of the following C-program strcpy - * which represents the `standard' for the C-library. - * - * char * - * strcpy(s1, s2) - * register char *s1; - * register const char *s2; - * { - * char *os1 = s1; - * - * while(*s1++ = *s2++) - * ; - * return(os1); - * } - * - */ - -#include <sys/asm_linkage.h> - - ! This implementation of strcpy works by first checking the - ! source alignment and copying byte, half byte, or word - ! quantities until the source ptr is aligned at an extended - ! word boundary. Once this has occurred, the string is copied, - ! checking for zero bytes, depending upon its dst ptr alignment. - ! 
(methods for xword, word, half-word, and byte copies are present) - -#ifdef __sparcv9 -#define SAVESIZE (8 * 3) -#define STACK_OFFSET (STACK_BIAS + MINFRAME) -#else -#define SAVESIZE (8 * 5) -#define STACK_OFFSET (STACK_BIAS + MINFRAME + 4) -#endif - -#define LABEL_ADDRESS(label, reg) \ - .pushlocals ;\ -0: rd %pc, reg ;\ - add reg, (label) - 0b, reg ;\ - .poplocals - -offset_table: - .word .storexword - offset_table ! Offset 0 => xword aligned - .word .storebyte1241 - offset_table ! Offset 1 or 5 - .word .storehalfword - offset_table ! Offset 2 or 6 - .word .storebyte1421 - offset_table ! Offset 3 or 7 - .word .storeword - offset_table ! Offset 4 - - .align 64 -#ifdef __sparcv9 - .skip 20 -#else - .skip 12 -#endif - - ENTRY(strcpy) - add %sp, -SA(STACK_OFFSET + SAVESIZE), %sp -#ifndef __sparcv9 - stx %g4, [%sp + STACK_OFFSET + 24] - stx %g5, [%sp + STACK_OFFSET + 32] -#endif - sethi %hi(0x01010101), %o4 ! 0x01010000 - sub %o1, %o0, %o3 ! src - dst - or %o4, %lo(0x01010101), %o4 ! 0x01010101 - andcc %o1, 7, %g5 ! dword aligned ? - sllx %o4, 32, %o5 ! 0x01010101 << 32 - mov %o0, %o2 ! save dst - or %o4, %o5, %o4 ! 0x0101010101010101 - - bz,pt %ncc, .srcaligned ! yup - sllx %o4, 7, %o5 ! 0x8080808080808080 - - sub %g0, %g5, %g4 ! count = -off - ldx [%o1 + %g4], %o1 ! val = *(addr + -off) - mov -1, %g1 ! mask = -1 - sllx %g5, 3, %g4 ! shift = off * 8 - srlx %g1, %g4, %g1 ! -1 >> ((addr & 7) * 8) - orn %o1, %g1, %o1 ! val |= ~mask - - andn %o5, %o1, %g4 ! ~val & 0x80 - sub %o1, %o4, %g1 ! val - 0x01 - andcc %g4, %g1, %g4 ! ~val & 0x80 & (val - 0x01) - - sllx %g5, 3, %g4 - add %o2, 8, %o2 ! .zerobyte expects address = address + 8 - bnz,a,pn %xcc, .zerobyte ! Zero byte in the first xword - sllx %o1, %g4, %o1 ! and data to be left justified - - sub %o2, 8, %o2 - mov 8, %g4 - sub %g4, %g5, %g1 ! Bytes to be written - sub %g1, 1, %g4 - -1: stub %o1, [%o2 + %g4] - dec %g4 - brgez,pt %g4, 1b - srlx %o1, 8, %o1 - - add %o2, %g1, %o2 ! 
Move ptr by #bytes written - -.srcaligned: - !! Check if the first dword contains zero after src is aligned - ldx [%o2 + %o3], %o1 ! x = src[] - andn %o5, %o1, %g1 ! ~x & 0x8080808080808080 - sub %o1, %o4, %g4 ! x - 0x0101010101010101 - andcc %g4, %g1, %g0 ! ((x - 0x0101010101010101) & ~x & 0x8080808080808080) - bnz,a,pn %xcc, .zerobyte ! x has zero byte, handle end cases - add %o2, 8, %o2 ! src += 8, dst += 8 - - !! Determine the destination offset and branch - !! to appropriate location - and %o2, 3, %g4 - and %o2, 4, %g1 - or %g1, %g4, %g1 - movrnz %g4, 0, %g1 - movrnz %g1, 4, %g4 - - !! %g4 contains the index of the jump address - !! Load the address from the table. - LABEL_ADDRESS(offset_table, %g1) - sllx %g4, 2, %g4 - lduw [%g1 + %g4], %g4 - jmp %g1 + %g4 - add %o2, 8, %o2 ! src += 8, dst += 8 - -.storexword: - stx %o1, [%o2 - 8] ! store word to dst (address pre-incremented) - -1: - ldx [%o2 + %o3], %o1 ! src dword - add %o2, 8, %o2 ! src += 8, dst += 8 - andn %o5, %o1, %g1 ! ~dword & 0x8080808080808080 - sub %o1, %o4, %g4 ! dword - 0x0101010101010101 - andcc %g4, %g1, %g0 ! ((dword - 0x0101010101010101) & ~dword & 0x8080808080808080) - bz,a,pt %xcc, 1b ! no zero byte if magic expression == 0 - stx %o1, [%o2 - 8] ! store word to dst (address pre-incremented) - - ba,a .zerobyte - -.storebyte1421: - !! Offset 3 or 7 - srlx %o1, 56, %g1 ! %g1<7:0> = first byte; word aligned now - stb %g1, [%o2 - 8] ! store first byte - srlx %o1, 24, %g1 ! %g1<31:0> = bytes 2, 3, 4, 5 - stw %g1, [%o2 - 7] ! store bytes 2, 3, 4, 5 - srlx %o1, 8, %g1 ! %g1<15:0> = bytes 6, 7 - sth %g1, [%o2 - 3] ! store bytes 6, 7 - - stx %l0, [%sp + STACK_OFFSET + 0] - and %o2, 7, %g1 - stx %l1, [%sp + STACK_OFFSET + 8] - cmp %g1, 3 - stx %l2, [%sp + STACK_OFFSET + 16] - - move %ncc, 40, %l0 - move %ncc, 24, %l1 - move %ncc, -11, %l2 - - movne %ncc, 8, %l0 - movne %ncc, 56, %l1 - movne %ncc, -15, %l2 - - ba .dstaligned - mov %o1, %g5 - -.storebyte1241: - !! Offset 1 or 5 - srlx %o1, 56, %g1 ! 
%g1<7:0> = first byte; word aligned now - stb %g1, [%o2 - 8] ! store first byte - srlx %o1, 40, %g1 ! %g1<15:0> = bytes 2, 3 - sth %g1, [%o2 - 7] ! store bytes 2, 3 - srlx %o1, 8, %g1 ! %g1<31:0> = bytes 4, 5, 6, 7 - stw %g1, [%o2 - 5] ! store bytes 4, 5, 6, 7 - - stx %l0, [%sp + STACK_OFFSET + 0] - and %o2, 7, %g1 - stx %l1, [%sp + STACK_OFFSET + 8] - cmp %g1, 1 - stx %l2, [%sp + STACK_OFFSET + 16] - - move %ncc, 56, %l0 - move %ncc, 8, %l1 - move %ncc, -9, %l2 - - movne %ncc, 24, %l0 - movne %ncc, 40, %l1 - movne %ncc, -13, %l2 - - ba .dstaligned - mov %o1, %g5 - -.storehalfword: - srlx %o1, 48, %g1 ! get first and second byte - sth %g1, [%o2 - 8] ! store first and second byte; word aligned now - srlx %o1, 16, %g1 ! %g1<31:0> = bytes 3, 4, 5, 6 - stw %g1, [%o2 - 6] ! store bytes 3, 4, 5, 6 - - stx %l0, [%sp + STACK_OFFSET + 0] - and %o2, 7, %g1 - stx %l1, [%sp + STACK_OFFSET + 8] - cmp %g1, 2 - stx %l2, [%sp + STACK_OFFSET + 16] - - move %ncc, 48, %l0 - move %ncc, 16, %l1 - move %ncc, -10, %l2 - - movne %ncc, 16, %l0 - movne %ncc, 48, %l1 - movne %ncc, -14, %l2 - - ba .dstaligned - mov %o1, %g5 - -.storeword: - srlx %o1, 32, %g1 ! get bytes 1,2,3,4 - stw %g1, [%o2 - 8] ! store bytes 1,2,3,4 (address is pre-incremented) - - stx %l0, [%sp + STACK_OFFSET + 0] - mov 32, %l0 ! Num of bits to be shifted left - stx %l1, [%sp + STACK_OFFSET + 8] - mov 32, %l1 ! Num of bits to be shifted right - stx %l2, [%sp + STACK_OFFSET + 16] - mov -12, %l2 ! -offset - mov %o1, %g5 - - nop ! Do not delete. Used for alignment. -.dstaligned: - ldx [%o2 + %o3], %o1 ! x = src[] - add %o2, 8, %o2 ! src += 8, dst += 8 - andn %o5, %o1, %g1 ! ~x & 0x8080808080808080 - sub %o1, %o4, %g4 ! x - 0x0101010101010101 - andcc %g4, %g1, %g0 ! ((x - 0x0101010101010101) & ~x & 0x8080808080808080) - bnz,a,pn %xcc, .finishup ! 
x has zero byte, handle end cases - stb %g5, [%o2 - 9] - - sllx %g5, %l0, %g5 - srlx %o1, %l1, %g4 - or %g5, %g4, %g5 - - stx %g5, [%o2 + %l2] - ba .dstaligned - mov %o1, %g5 - -.finishup: - cmp %l0, 56 - be,pn %ncc, .zerobyte_restore - andcc %o2, 1, %g0 - bnz,a %ncc, 1f - srlx %g5, 8, %g5 - -1: srlx %l1, 4, %g4 ! g4 contains 1, 2 or 3 - sub %g4, 1, %g4 ! multiple of 16 - sllx %g4, 4, %g4 ! How many bits to shift - srlx %g5, %g4, %l0 - add %o2, %l2, %g1 - -2: sth %l0, [%g1] - sub %g4, 16, %g4 - add %g1, 2, %g1 - brgez,a,pt %g4, 2b - srlx %g5, %g4, %l0 - -.zerobyte_restore: - ldx [%sp + STACK_OFFSET + 0], %l0 - andn %o5, %o1, %o3 ! ~val & 0x80 - ldx [%sp + STACK_OFFSET + 8], %l1 - sub %o1, %o4, %g1 ! val - 0x01 - ldx [%sp + STACK_OFFSET + 16], %l2 - - ba 1f - andcc %o3, %g1, %o3 ! ~val & 0x80 & (val - 0x01) - -.zerobyte: - !! %o5: 0x8080808080808080 - !! %o4: 0x0101010101010101 - !! %o1: Left justified dowrd that contains 0 byte - !! %o2: Address to be written + 8 - - andn %o5, %o1, %o3 ! ~val & 0x80 - sub %o1, %o4, %g1 ! val - 0x01 - andcc %o3, %g1, %o3 ! ~val & 0x80 & (val - 0x01) - -1: srlx %o3, 7, %o3 ! shift 0x80 -> 0x01 - andn %o3, %o1, %o3 ! mask off leading 0x01 bytes - lzd %o3, %o4 ! 7, 15, ... 63 - - mov 64, %o5 ! Calc # of bytes to be discarded - inc %o4 ! Include the zero byte too - sub %o5, %o4, %o5 ! after the null byte - sub %o2, 8, %o2 ! Adjust address which is +8 here. - srlx %o1, %o5, %o1 ! Discard them - - srlx %o4, 3, %o4 ! Bits to bytes to be written - dec %o4 ! dec 1 to use it as offset - -2: stub %o1, [%o2 + %o4] - dec %o4 - brgez,pt %o4, 2b - srlx %o1, 8, %o1 - -#ifndef __sparcv9 - ldx [%sp + STACK_OFFSET + 24], %g4 - ldx [%sp + STACK_OFFSET + 32], %g5 -#endif - retl ! done with leaf function - add %sp, SA(STACK_OFFSET + SAVESIZE), %sp - SET_SIZE(strcpy)
--- a/usr/src/lib/libc/sparc_hwcap1/common/gen/strlen.s Thu Aug 06 17:19:00 2009 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,127 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - - .file "strlen.s" - -/* - * strlen(s) - * - * Given string s, return length (not including the terminating null). - * - * Fast assembler language version of the following C-program strlen - * which represents the `standard' for the C-library. - * - * size_t - * strlen(s) - * register const char *s; - * { - * register const char *s0 = s + 1; - * - * while (*s++ != '\0') - * ; - * return (s - s0); - * } - */ - -#include <sys/asm_linkage.h> - - /* - * There are two key optimizations in the routine below. - * First, all memory accesses are 8 bytes wide. The time - * for long strings is dominated by the latency of load - * instructions in the inner loop, and going 8 bytes at - * a time means 1/8th as much latency. 
- * - * Scanning an 8 byte word for a '\0' is made fast by - * this formula (due to Alan Mycroft): - * ~x & 0x808080808080 & (x - 0x0101010101010101) - * The result of this formula is non-zero iff there's - * a '\0' somewhere in x. - * - * Second, the cost of short strings is dominated by the - * cost of figuring out which byte out of the last 8 - * contained the '\0' that terminated the string. We use - * properties of the formula above to convert scanning the - * word for '\0' into a single LZD instruction. - */ - .align 64 - .skip 4*4 ! force .findnull to align to 64 bytes - ENTRY_NP(strlen) - and %o0, 7, %o3 ! off = addr & 7 - sethi %hi(0x01010101), %o4 ! 0x01010000 - - sub %g0, %o3, %o2 ! count = -off - or %o4, %lo(0x01010101), %o4 ! 0x01010101 - - ldx [%o0 + %o2], %o1 ! val = *(addr + count) - sllx %o4, 32, %o5 ! 0x01010101 << 32 - - mov -1, %g1 ! mask = -1 - sllx %o3, 3, %o3 ! shift = off * 8 - - or %o4, %o5, %o4 ! 0x0101010101010101 - srlx %g1, %o3, %g1 ! -1 >> ((addr & 7) * 8) - - sllx %o4, 7, %o5 ! 0x8080808080808080 - orn %o1, %g1, %o1 ! val |= ~mask -.strlen_findnull: - !! %o0 - base address - !! %o1 - xword from memory - !! %o2 - index - !! %o3 - result of test for '\0' - !! %o4 - constant 0x0101.0101.0101.0101 - !! %o5 - constant 0x8080.8080.8080.8080 - !! %g1 - scratch - andn %o5, %o1, %o3 ! ~val & 0x80 - sub %o1, %o4, %g1 ! val - 0x01 - andcc %o3, %g1, %o3 ! ~val & 0x80 & (val - 0x01) - inc 8, %o2 - bz,a,pt %xcc, .strlen_findnull - ldx [%o0 + %o2], %o1 - - /* - * The result of Mycroft's formula is a pattern of 0x80 and - * 0x00 bytes. There's a 0x80 at every byte position where - * there was a '\0' character, but a string of 0x01 bytes - * immediately preceding a '\0' becomes a corresponding - * string of 0x80 bytes. (e.g. 0x0101010101010100 becomes - * 0x8080808080808080). We need one final step to discount - * any leading 0x01 bytes, and then LZD can tell us how many - * characters there were before the terminating '\0'. - */ - !! 
%o1 - last data word - !! %o2 - length+8, plus 1-8 extra - !! %o3 - xword with 0x80 for each 0x00 byte and leading 0x01 - sub %o2, 8, %o2 ! subtract off '\0' and last 8 - srlx %o3, 7, %o3 ! shift 0x80 -> 0x01 - andn %o3, %o1, %o3 ! mask off leading 0x01 bytes - lzd %o3, %o3 ! 7, 15, ... 63 - srlx %o3, 3, %o3 ! 0 ... 7 - - retl - add %o2, %o3, %o0 ! add back bytes before '\0' - - SET_SIZE(strlen)
--- a/usr/src/lib/libc/sparc_hwcap1/sparc/Makefile Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/lib/libc/sparc_hwcap1/sparc/Makefile Thu Aug 06 17:39:39 2009 -0700 @@ -28,8 +28,7 @@ LIBRARY= libc_hwcap1.a -EXTN_CPPFLAGS= -DSMT_PAUSE_FUNCTION=_rock_pause \ - -I$(SRC)/uts/sun4v -I$(ROOT)/usr/platform/sun4v/include +EXTN_CPPFLAGS= -I$(SRC)/uts/sun4v -I$(ROOT)/usr/platform/sun4v/include EXTN_ASFLAGS= -xarch=v8plusd EXTN_DYNFLAGS= -M mapfile @@ -40,10 +39,10 @@ PRFOBJS= \ memcpy.o \ + memmove.o \ memset.o \ strlen.o \ strcpy.o \ - misc.o MAPFILE_AUX = mapfile-vers-aux
--- a/usr/src/lib/libc/sparc_hwcap1/sparcv9/Makefile Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/lib/libc/sparc_hwcap1/sparcv9/Makefile Thu Aug 06 17:39:39 2009 -0700 @@ -27,8 +27,7 @@ LIBRARY= libc_hwcap1.a -EXTN_CPPFLAGS= -DSMT_PAUSE_FUNCTION=_rock_pause \ - -I$(SRC)/uts/sun4v -I$(ROOT)/usr/platform/sun4v/include +EXTN_CPPFLAGS= -I$(SRC)/uts/sun4v -I$(ROOT)/usr/platform/sun4v/include EXTN_ASFLAGS= -xarch=v9d EXTN_DYNFLAGS= -M mapfile @@ -39,10 +38,10 @@ PRFOBJS= \ memcpy.o \ + memmove.o \ memset.o \ strlen.o \ strcpy.o \ - misc.o MAPFILE_AUX = mapfile-vers-aux
--- a/usr/src/lib/libdisasm/sparc/dis_sparc_fmt.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/lib/libdisasm/sparc/dis_sparc_fmt.c Thu Aug 06 17:39:39 2009 -0700 @@ -20,12 +20,12 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* - * Copyright 2008 Jason King. All rights reserved. + * Copyright 2009 Jason King. All rights reserved. * Use is subject to license terms. */ @@ -389,15 +389,15 @@ uint32_t op3:6; uint32_t rs1:5; uint32_t i:1; - uint32_t undef:5; - uint32_t cmask:4; + uint32_t undef:6; + uint32_t cmask:3; uint32_t mmask:4; } formatmbr_t; #elif defined(_BIT_FIELDS_LTOH) typedef struct formatmbr { uint32_t mmask:4; - uint32_t cmask:4; - uint32_t undef:5; + uint32_t cmask:3; + uint32_t undef:6; uint32_t i:1; uint32_t rs1:5; uint32_t op3:6; @@ -566,8 +566,8 @@ "#LoadLoad", "#StoreLoad", "#LoadStore", "#StoreStore" }; -static const char *membar_cmask[4] = { - "#Lookaside", "#MemIssue", "#Sync", "#Halt" +static const char *membar_cmask[3] = { + "#Lookaside", "#MemIssue", "#Sync" }; /* v8 ancillary state register names */ @@ -592,15 +592,15 @@ "%pcr", "%pic", "%dcr", "%gsr", "%softint_set", "%softint_clr", "%softint", "%tick_cmpr", "%stick", "%stick_cmpr", NULL, NULL, - "%cps", NULL, NULL, NULL + NULL, NULL, NULL, NULL }; /* * on v9, only certain registers are valid for read or writing * these are bitmasks corresponding to which registers are valid in which - * case + * case. Any access to %dcr is illegal. 
*/ -static const uint32_t v9_asr_rdmask = 0x13cb007d; -static const uint32_t v9_asr_wrmask = 0x13fb004d; +static const uint32_t v9_asr_rdmask = 0x03cb007d; +static const uint32_t v9_asr_wrmask = 0x03fb004d; /* privledged register names on v9 */ /* TODO: compat - NULL to %priv_nn */ @@ -617,7 +617,7 @@ /* hyper privileged register names on v9 */ static const char *v9_hprivreg_names[32] = { - "%hpstate", "%htstate", "%hrstba", "%hintp", + "%hpstate", "%htstate", NULL, "%hintp", NULL, "%htba", "%hver", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -629,8 +629,8 @@ static const uint32_t v9_pr_rdmask = 0x80017fff; static const uint32_t v9_pr_wrmask = 0x00017fff; -static const uint32_t v9_hpr_rdmask = 0x8000006f; -static const uint32_t v9_hpr_wrmask = 0x8000006f; +static const uint32_t v9_hpr_rdmask = 0x8000006b; +static const uint32_t v9_hpr_wrmask = 0x8000006b; static const char *prefetch_str[32] = { "#n_reads", "#one_read", @@ -784,7 +784,6 @@ int32_t disp; uint32_t flags = inp->in_data.in_def.in_flags; int octal = ((dhp->dh_flags & DIS_OCTAL) != 0); - int chkpt = 0; if ((dhp->dh_debug & DIS_DEBUG_PRTFMT) != 0) { prt_field("op", f->f2.op, 2); @@ -822,13 +821,6 @@ flags = FLG_RS1(REG_NONE)|FLG_DISP(DISP19); } - if (f->f2b.op2 == 0x01 && f->f2b.a == 1 && - f->f2b.p == 0 && f->f2b.cond == 0x8 && f->f2b.cc == 0x01) { - name = "chkpt"; - flags = FLG_RS1(REG_NONE)|FLG_DISP(DISP19); - chkpt = 1; - } - switch (FLG_DISP_VAL(flags)) { case DISP22: @@ -867,11 +859,7 @@ } } - if (!chkpt) { - (void) snprintf(buf, sizeof (buf), "%s%s%s", name, annul, pred); - } else { - (void) snprintf(buf, sizeof (buf), "%s", name); - } + (void) snprintf(buf, sizeof (buf), "%s%s%s", name, annul, pred); prt_name(dhp, buf, 1); @@ -884,19 +872,11 @@ break; case DISP19: - if (!chkpt) { - bprintf(dhp, - (octal != 0) ? "%s, %s0%-5lo <" : - "%s, %s0x%-04lx <", - r, - (disp < 0) ? "-" : "+", - (disp < 0) ? (-disp) : disp); - } else { - bprintf(dhp, - (octal != 0) ? 
"%s0%-5lo <" : "%s0x%-04lx <", - (disp < 0) ? "-" : "+", - (disp < 0) ? (-disp) : disp); - } + bprintf(dhp, + (octal != 0) ? "%s, %s0%-5lo <" : + "%s, %s0x%-04lx <", r, + (disp < 0) ? "-" : "+", + (disp < 0) ? (-disp) : disp); break; case DISP16: @@ -1328,7 +1308,7 @@ first = 0; - for (i = 0; i < 5; ++i) { + for (i = 0; i < 4; ++i) { if ((f->fmb.cmask & (1L << i)) != 0) { bprintf(dhp, "%s%s", (first != 0) ? "|" : "", @@ -1503,7 +1483,6 @@ int v9 = ((dhp->dh_flags & (DIS_SPARC_V9|DIS_SPARC_V9_SGI)) != 0); int p_rs1, p_t; - char failstr[8] = "fail"; if (f->ftcc.undef != 0) return (-1); @@ -1530,26 +1509,13 @@ (p_rs1 != 0) ? " + " : "", (p_t != 0) ? reg_names[f->f3.rs2] : ""); } else { - if ((p_rs1 == 0) && (f->ftcc.immtrap == 0xF)) { - (void) strlcat(failstr, - (const char *)&(inp->in_data.in_def.in_name[1]), - sizeof (failstr)); - - prt_name(dhp, failstr, 1); - bprintf(dhp, "%s%s%s", - (v9 != 0) ? icc_names[f->ftcc2.cc] : "", - (p_rs1 != 0) ? reg_names[f->ftcc2.rs1] : "", - (p_rs1 != 0) ? " + " : ""); - } else { bprintf(dhp, "%-9s %s%s%s%s0x%x", inp->in_data.in_def.in_name, (v9 != 0) ? icc_names[f->ftcc2.cc] : "", (v9 != 0) ? ", " : "", (p_rs1 != 0) ? reg_names[f->ftcc2.rs1] : "", (p_rs1 != 0) ? " + " : "", f->ftcc.immtrap); - } } - return (0); } @@ -1894,17 +1860,9 @@ return (0); case 0x3b: - if (f->f3.rd == 1) { - /* flusha */ - prt_name(dhp, "flusha", 1); - prt_address(dhp, instr, 0); - (void) strlcat(dhp->dh_buf, " ", dhp->dh_buflen); - prt_asi(dhp, instr); - } else { - /* flush */ - prt_name(dhp, name, 1); - prt_address(dhp, instr, 0); - } + /* flush */ + prt_name(dhp, name, 1); + prt_address(dhp, instr, 0); return (0); case 0x3c:
--- a/usr/src/lib/libdisasm/sparc/instr.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/lib/libdisasm/sparc/instr.c Thu Aug 06 17:39:39 2009 -0700 @@ -20,12 +20,12 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* - * Copyright 2007 Jason King. All rights reserved. + * Copyright 2009 Jason King. All rights reserved. * Use is subject to license terms. */ @@ -155,12 +155,12 @@ }; static const inst_t BPr_table_def[16] = { - INST("brnr", V9, FLG_PRED|FLG_DISP(DISP16)|FLG_RS1(REG_INT)), + INVALID, INST("brz", V9|V9S, FLG_PRED|FLG_DISP(DISP16)|FLG_RS1(REG_INT)), INST("brlez", V9|V9S, FLG_PRED|FLG_DISP(DISP16)|FLG_RS1(REG_INT)), INST("brlz", V9|V9S, FLG_PRED|FLG_DISP(DISP16)|FLG_RS1(REG_INT)), - INST("brr", V9, FLG_PRED|FLG_DISP(DISP16)|FLG_RS1(REG_INT)), + INVALID, INST("brnz", V9|V9S, FLG_PRED|FLG_DISP(DISP16)|FLG_RS1(REG_INT)), INST("brgz", V9|V9S, FLG_PRED|FLG_DISP(DISP16)|FLG_RS1(REG_INT)), INST("brgez", V9|V9S, FLG_PRED|FLG_DISP(DISP16)|FLG_RS1(REG_INT)), @@ -483,10 +483,7 @@ /* 0x10 */ INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, - INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, - INST("commit", V9, 0), - INVALID - + INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID }; static const table_t tr_table = { @@ -637,12 +634,7 @@ INST("fsqrtq", VALL, FLG_P1(REG_NONE)|FLG_P2(REG_FPQ)|FLG_NOIMM|FLG_P3(REG_FPQ)), - INVALID, - INST("frsqrt1xs", V9, - FLG_P1(REG_NONE)|FLG_P2(REG_FPQ)|FLG_NOIMM|FLG_P3(REG_FPQ)), - INST("frsqrt1xd", VALL, - FLG_P1(REG_NONE)|FLG_P2(REG_FPD)|FLG_NOIMM|FLG_P3(REG_FPD)), - INVALID, + INVALID, INVALID, INVALID, INVALID, /* 0x30 */ INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, @@ -683,31 +675,11 @@ FLG_P1(REG_FPQ)|FLG_P2(REG_FPQ)|FLG_NOIMM|FLG_P3(REG_FPQ)), /* 0x050 */ - INVALID, - INST("fnadds", V9S, - 
FLG_P1(REG_FP)|FLG_P2(REG_FP)|FLG_NOIMM|FLG_P3(REG_FP)), - INST("fnaddd", V9S, - FLG_P1(REG_FPD)|FLG_P2(REG_FPD)|FLG_NOIMM|FLG_P3(REG_FPD)), - INVALID, INVALID, INVALID, INVALID, INVALID, - INVALID, - INST("fnmuls", V9S, - FLG_P1(REG_FP)|FLG_P2(REG_FP)|FLG_NOIMM|FLG_P3(REG_FP)), - INST("fnmuld", V9S, - FLG_P1(REG_FPD)|FLG_P2(REG_FPD)|FLG_NOIMM|FLG_P3(REG_FPD)), - INVALID, INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, /* 0x060 */ - INVALID, - INST("fhadds", V9, - FLG_P1(REG_FP)|FLG_P2(REG_FP)|FLG_NOIMM|FLG_P3(REG_FP)), - INST("fhaddd", V9, - FLG_P1(REG_FPD)|FLG_P2(REG_FPD)|FLG_NOIMM|FLG_P3(REG_FPD)), - INVALID, INVALID, - INST("fhsubs", V9S, - FLG_P1(REG_FP)|FLG_P2(REG_FP)|FLG_NOIMM|FLG_P3(REG_FP)), - INST("fhsubd", V9S, - FLG_P1(REG_FPD)|FLG_P2(REG_FPD)|FLG_NOIMM|FLG_P3(REG_FPD)), - INVALID, + INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, /* 0x068 */ INVALID, @@ -723,16 +695,8 @@ INVALID, /* 0x070 */ - INVALID, - INST("fnhadds", V9S, - FLG_P1(REG_FP)|FLG_P2(REG_FP)|FLG_NOIMM|FLG_P3(REG_FP)), - INST("fnhaddd", V9S, - FLG_P1(REG_FPD)|FLG_P2(REG_FPD)|FLG_NOIMM|FLG_P3(REG_FPD)), - INVALID, INVALID, INVALID, INVALID, INVALID, - INVALID, - INST("fnsmuld", V9S, - FLG_P1(REG_FP)|FLG_P2(REG_FP)|FLG_NOIMM|FLG_P3(REG_FPD)), - INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, + INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, INVALID, /* 0x080 */ INVALID, @@ -1600,40 +1564,6 @@ .tbl_inp = fused_table_def }; -static const inst_t unfused_table_def[16] = { - /* 0x0 */ - INVALID, - INST("fumadds", V9, FLG_P1(REG_FP)), - INST("fumaddd", V9, FLG_P1(REG_FPD)), - INVALID, - - /* 0x4 */ - INVALID, - INST("fumsubs", V9, FLG_P1(REG_FP)), - INST("fumsubd", V9, FLG_P1(REG_FPD)), - INVALID, - - /* 0x8 */ - INVALID, - INST("fnumsubs", V9, 
FLG_P1(REG_FP)), - INST("fnumsubd", V9, FLG_P1(REG_FPD)), - INVALID, - - /* 0xc */ - INVALID, - INST("fnumadds", V9, FLG_P1(REG_FP)), - INST("fnumaddd", V9, FLG_P1(REG_FPD)), - INVALID -}; - -static const table_t unfused_table = { - .tbl_field = 8, - .tbl_len = 4, - .tbl_ovp = NULL, - .tbl_fmt = fmt_fused, - .tbl_inp = unfused_table_def -}; - static const inst_t alu_table_def[64] = { /* 0x00 */ INST("add", VALL, 0), @@ -1722,7 +1652,7 @@ INST("save", VALL, 0), INST("restore", VALL, 0), TABLE(tr_table, V9|V9S), - TABLE(unfused_table, V9|V9S) + INVALID };
--- a/usr/src/lib/libprtdiag/common/display_sun4v.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/lib/libprtdiag/common/display_sun4v.c Thu Aug 06 17:39:39 2009 -0700 @@ -106,8 +106,6 @@ static void sun4v_env_print_current_indicators(); static void sun4v_env_print_voltage_sensors(); static void sun4v_env_print_voltage_indicators(); -static void sun4v_env_print_humidity_sensors(); -static void sun4v_env_print_humidity_indicators(); static void sun4v_env_print_LEDs(); static void sun4v_print_fru_status(); static int is_fru_absent(picl_nodehdl_t); @@ -1101,16 +1099,6 @@ class_node_found = 0; all_status_ok = 1; - sun4v_env_print_humidity_sensors(); - exit_code |= (!all_status_ok); - - class_node_found = 0; - all_status_ok = 1; - sun4v_env_print_humidity_indicators(); - exit_code |= (!all_status_ok); - - class_node_found = 0; - all_status_ok = 1; sun4v_env_print_LEDs(); exit_code |= (!all_status_ok); @@ -1737,68 +1725,6 @@ } static void -sun4v_env_print_humidity_sensors() -{ - char *fmt = "%-34s %-14s %-10s\n"; - (void) picl_walk_tree_by_class(phyplatformh, - PICL_CLASS_HUMIDITY_SENSOR, - (void *)PICL_PROP_HUMIDITY, - sun4v_env_print_sensor_callback); - if (!class_node_found) - return; - log_printf("\nHumidity sensors:\n"); - if (syserrlog == 0) { - (void) picl_walk_tree_by_class(phyplatformh, - PICL_CLASS_HUMIDITY_SENSOR, - PICL_PROP_HUMIDITY, sun4v_env_print_sensor_callback); - if (all_status_ok) { - log_printf("All humidity sensors are OK.\n"); - return; - } - } - log_printf("-------------------------------------------------" - "-----------\n"); - log_printf(fmt, "Location", "Sensor", "Status", 0); - log_printf("-------------------------------------------------" - "-----------\n"); - (void) picl_walk_tree_by_class(phyplatformh, - PICL_CLASS_HUMIDITY_SENSOR, - (void *)PICL_PROP_HUMIDITY, - sun4v_env_print_sensor_callback); -} - -static void -sun4v_env_print_humidity_indicators() -{ - char *fmt = "%-34s %-14s %-8s\n"; - (void) picl_walk_tree_by_class(phyplatformh, - 
PICL_CLASS_HUMIDITY_INDICATOR, - (void *)PICL_PROP_CONDITION, - sun4v_env_print_indicator_callback); - if (!class_node_found) - return; - log_printf("\nHumidity indicators:\n"); - if (syserrlog == 0) { - (void) picl_walk_tree_by_class(phyplatformh, - PICL_CLASS_HUMIDITY_INDICATOR, (void *)PICL_PROP_CONDITION, - sun4v_env_print_indicator_callback); - if (all_status_ok) { - log_printf("All humidity indicators are OK.\n"); - return; - } - } - log_printf("-------------------------------------------------" - "-----------\n"); - log_printf(fmt, "Location", "Indicator", "Condition", 0); - log_printf("-------------------------------------------------" - "-----------\n"); - (void) picl_walk_tree_by_class(phyplatformh, - PICL_CLASS_HUMIDITY_INDICATOR, - (void *)PICL_PROP_CONDITION, - sun4v_env_print_indicator_callback); -} - -static void sun4v_env_print_LEDs() { char *fmt = "%-34s %-14s %-8s\n";
--- a/usr/src/pkgdefs/Makefile Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/pkgdefs/Makefile Thu Aug 06 17:39:39 2009 -0700 @@ -96,7 +96,6 @@ SUNWssad \ SUNWstc.u \ SUNWus.u \ - SUNWusat10.v \ SUNWust1.v \ SUNWust2.v \ SUNWwbsd
--- a/usr/src/pkgdefs/SUNWusat10.v/Makefile Thu Aug 06 17:19:00 2009 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ -# -# CDDL HEADER START -# -# The contents of this file are subject to the terms of the -# Common Development and Distribution License (the "License"). -# You may not use this file except in compliance with the License. -# -# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -# or http://www.opensolaris.org/os/licensing. -# See the License for the specific language governing permissions -# and limitations under the License. -# -# When distributing Covered Code, include this CDDL HEADER in each -# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -# If applicable, add the following below this CDDL HEADER, with the -# fields enclosed by brackets "[]" replaced with your own identifying -# information: Portions Copyright [yyyy] [name of copyright owner] -# -# CDDL HEADER END -# - -# -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# - -include ../Makefile.com - -.KEEP_STATE: - -all: $(FILES) - -install: all pkg - -include ../Makefile.targ
--- a/usr/src/pkgdefs/SUNWusat10.v/pkginfo.tmpl Thu Aug 06 17:19:00 2009 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,55 +0,0 @@ -# -# CDDL HEADER START -# -# The contents of this file are subject to the terms of the -# Common Development and Distribution License (the "License"). -# You may not use this file except in compliance with the License. -# -# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -# or http://www.opensolaris.org/os/licensing. -# See the License for the specific language governing permissions -# and limitations under the License. -# -# When distributing Covered Code, include this CDDL HEADER in each -# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -# If applicable, add the following below this CDDL HEADER, with the -# fields enclosed by brackets "[]" replaced with your own identifying -# information: Portions Copyright [yyyy] [name of copyright owner] -# -# CDDL HEADER END -# - -# -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# -# This required package information file describes characteristics of the -# package, such as package abbreviation, full package name, package version, -# and package architecture. -# -PKG="SUNWusat10" -NAME="UltraSPARC-AT10 (Root)" -ARCH="sparc.sun4v" -VERSION="ONVERS,REV=0.0.0" -SUNW_PRODNAME="SunOS" -SUNW_PRODVERS="RELEASE/VERSION" -SUNW_PKGTYPE="root" -MAXINST="1000" -CATEGORY="system" -DESC="UltraSPARC-AT10 core kernel software" -VENDOR="Sun Microsystems, Inc." -HOTLINE="Please contact your local service provider" -EMAIL="" -CLASSES="none" -BASEDIR=/ -SUNW_PKGVERS="1.0" -SUNW_PKG_ALLZONES="true" -SUNW_PKG_HOLLOW="true" -SUNW_PKG_THISZONE="false" -#VSTOCK="<reserved by Release Engineering for package part #>" -#ISTATES="<developer defined>" -#RSTATES='<developer defined>' -#ULIMIT="<developer defined>" -#ORDER="<developer defined>" -#PSTAMP="<developer defined>" -#INTONLY="<developer defined>"
--- a/usr/src/pkgdefs/SUNWusat10.v/prototype_com Thu Aug 06 17:19:00 2009 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,52 +0,0 @@ -# -# CDDL HEADER START -# -# The contents of this file are subject to the terms of the -# Common Development and Distribution License (the "License"). -# You may not use this file except in compliance with the License. -# -# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -# or http://www.opensolaris.org/os/licensing. -# See the License for the specific language governing permissions -# and limitations under the License. -# -# When distributing Covered Code, include this CDDL HEADER in each -# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -# If applicable, add the following below this CDDL HEADER, with the -# fields enclosed by brackets "[]" replaced with your own identifying -# information: Portions Copyright [yyyy] [name of copyright owner] -# -# CDDL HEADER END -# - -# -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# -# This required package information file contains a list of package contents. -# The 'pkgmk' command uses this file to identify the contents of a package -# and their location on the development machine when building the package. -# Can be created via a text editor or through use of the 'pkgproto' command. 
- -#!search <pathname pathname ...> # where to find pkg objects -#!include <filename> # include another 'prototype' file -#!default <mode> <owner> <group> # default used if not specified on entry -#!<param>=<value> # puts parameter in pkg environment - -# packaging files -i pkginfo -i copyright -# -# source locations relative to the prototype file -# -# SUNWusat10.v -# -d none platform 755 root sys -d none platform/sun4v 755 root sys -d none platform/sun4v/kernel 755 root sys -d none platform/sun4v/kernel/cpu 755 root sys -d none platform/sun4v/kernel/cpu/sparcv9 755 root sys -f none platform/sun4v/kernel/cpu/sparcv9/SUNW,UltraSPARC-AT10 755 root sys -d none platform/sun4v/kernel/pcbe 755 root sys -d none platform/sun4v/kernel/pcbe/sparcv9 755 root sys -f none platform/sun4v/kernel/pcbe/sparcv9/pcbe.SUNW,UltraSPARC-AT10 755 root sys
--- a/usr/src/pkgdefs/SUNWusat10.v/prototype_sparc Thu Aug 06 17:19:00 2009 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ -# -# CDDL HEADER START -# -# The contents of this file are subject to the terms of the -# Common Development and Distribution License (the "License"). -# You may not use this file except in compliance with the License. -# -# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -# or http://www.opensolaris.org/os/licensing. -# See the License for the specific language governing permissions -# and limitations under the License. -# -# When distributing Covered Code, include this CDDL HEADER in each -# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -# If applicable, add the following below this CDDL HEADER, with the -# fields enclosed by brackets "[]" replaced with your own identifying -# information: Portions Copyright [yyyy] [name of copyright owner] -# -# CDDL HEADER END -# - -# -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# - -# Include ISA independent files (prototype_com) -!include prototype_com - -# List files which are SPARC specific here
--- a/usr/src/uts/common/io/mem.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/common/io/mem.c Thu Aug 06 17:39:39 2009 -0700 @@ -230,9 +230,6 @@ flags, name, valuep, lengthp, 0)); } -extern void mach_sync_icache_pa(caddr_t, size_t); -#pragma weak mach_sync_icache_pa - static int mmio(struct uio *uio, enum uio_rw rw, pfn_t pfn, off_t pageoff, int allowio, page_t *pp) @@ -271,18 +268,9 @@ error = EFAULT; } else error = EIO; - } else { + } else error = uiomove(va + pageoff, nbytes, rw, uio); - /* - * In case this has changed executable code, - * non-coherent I-caches must be flushed. - */ - if (rw != UIO_READ && &mach_sync_icache_pa != NULL) { - mach_sync_icache_pa((caddr_t)ptob(pfn), PAGESIZE); - } - } - if (devload) hat_unload(kas.a_hat, mm_map, PAGESIZE, HAT_UNLOAD_UNLOCK); else if (pp)
--- a/usr/src/uts/common/sys/auxv_SPARC.h Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/common/sys/auxv_SPARC.h Thu Aug 06 17:39:39 2009 -0700 @@ -45,7 +45,6 @@ #define AV_SPARC_VIS2 0x0040 /* VIS2 instruction set supported */ #define AV_SPARC_ASI_BLK_INIT 0x0080 /* ASI_BLK_INIT_xxx ASI */ #define AV_SPARC_FMAF 0x0100 /* Fused Multiply-Add */ -#define AV_SPARC_FMAU 0x0200 /* Unfused Multiply-Add */ #define AV_SPARC_VIS3 0x0400 /* VIS3 instruction set extensions */ #define AV_SPARC_HPC 0x0800 /* High Performance Computing insns */ #define AV_SPARC_RANDOM 0x1000 /* random instruction */ @@ -57,7 +56,7 @@ #define FMT_AV_SPARC \ "\20" \ "\21cspare" \ - "\20ima\17fjfmau\16trans\15random\14hpc\13vis3\12fmau\11fmaf" \ + "\20ima\17fjfmau\16trans\15random\14hpc\13vis3\12-\11fmaf" \ "\10ASIBlkInit\7vis2\6vis\5popc\4v8plus\3fsmuld\2div32\1mul32" /*
--- a/usr/src/uts/common/vm/hat.h Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/common/vm/hat.h Thu Aug 06 17:39:39 2009 -0700 @@ -425,25 +425,6 @@ #define HAT_STRUCTURE_LE 0x2000 #define HAT_ENDIAN_MASK 0x3000 -/* - * Attributes for non-coherent I-cache support. - * - * We detect if an I-cache has been filled by first resetting - * execute permission in a tte entry. This forces a trap when - * an instruction fetch first occurs in that page. In "soft - * execute mode", the hardware execute permission is cleared - * and a different software execution bit is set in the tte. - * - * HAT_ATTR_TEXT: set this flag to avoid the extra trap associated - * with soft execute mode. Same meaning as HAT_LOAD_TEXT. - * - * HAT_ATTR_NOSOFTEXEC: set this flag when installing a permanent - * mapping, or installing a mapping that will never be - * freed. Overrides soft execute mode. - */ -#define HAT_ATTR_TEXT 0x4000 -#define HAT_ATTR_NOSOFTEXEC 0x8000 - /* flags for hat_softlock */ #define HAT_COW 0x0001
--- a/usr/src/uts/common/vm/page.h Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/common/vm/page.h Thu Aug 06 17:39:39 2009 -0700 @@ -780,7 +780,7 @@ int page_reclaim_mem(pgcnt_t, pgcnt_t, int); void page_set_props(page_t *, uint_t); -void page_clr_all_props(page_t *, int); +void page_clr_all_props(page_t *); int page_clear_lck_cow(page_t *, int); kmutex_t *page_vnode_mutex(struct vnode *);
--- a/usr/src/uts/common/vm/page_retire.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/common/vm/page_retire.c Thu Aug 06 17:39:39 2009 -0700 @@ -535,7 +535,7 @@ ASSERT(!hat_page_is_mapped(pp)); ASSERT(!pp->p_vnode); - page_clr_all_props(pp, 0); + page_clr_all_props(pp); pagescrub(pp, 0, MMU_PAGESIZE); pp->p_next = NULL;
--- a/usr/src/uts/common/vm/seg_kmem.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/common/vm/seg_kmem.c Thu Aug 06 17:39:39 2009 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -883,15 +883,6 @@ else allocflag = 0; - /* - * Support for non-coherent I-cache. - * Set HAT_LOAD_TEXT to override soft execute. - */ - if (attr & HAT_ATTR_TEXT) { - attr &= ~HAT_ATTR_TEXT; - allocflag |= HAT_LOAD_TEXT; - } - while (ppl != NULL) { page_t *pp = ppl; page_sub(&ppl, pp);
--- a/usr/src/uts/common/vm/vm_page.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/common/vm/vm_page.c Thu Aug 06 17:39:39 2009 -0700 @@ -620,7 +620,7 @@ * initialize other fields in the page_t */ PP_SETFREE(pp); - page_clr_all_props(pp, 0); + page_clr_all_props(pp); PP_SETAGED(pp); pp->p_offset = (u_offset_t)-1; pp->p_next = pp; @@ -2662,7 +2662,7 @@ PP_SETFREE(pp); ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) || !hat_ismod(pp)); - page_clr_all_props(pp, 0); + page_clr_all_props(pp); ASSERT(!hat_page_getshare(pp)); /* @@ -2803,7 +2803,7 @@ ASSERT(tpp->p_szc == szc); PP_SETFREE(tpp); - page_clr_all_props(tpp, 0); + page_clr_all_props(tpp); PP_SETAGED(tpp); tpp->p_offset = (u_offset_t)-1; ASSERT(tpp->p_next == tpp); @@ -3149,7 +3149,7 @@ ASSERT(tpp->p_szc == szc); PP_SETFREE(tpp); - page_clr_all_props(tpp, 0); + page_clr_all_props(tpp); PP_SETAGED(tpp); ASSERT(tpp->p_next == tpp); ASSERT(tpp->p_prev == tpp); @@ -3525,7 +3525,7 @@ page_vpsub(&vp->v_pages, pp); pp->p_hash = NULL; - page_clr_all_props(pp, 1); + page_clr_all_props(pp); PP_CLRSWAP(pp); pp->p_vnode = NULL; pp->p_offset = (u_offset_t)-1; @@ -4542,7 +4542,7 @@ old->p_vnode = NULL; PP_CLRSWAP(old); old->p_offset = (u_offset_t)-1; - page_clr_all_props(old, 1); + page_clr_all_props(old); /* * Wake up processes waiting for this page. The page's @@ -4888,7 +4888,7 @@ for (i = 0; i < npgs; i++) { ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO)); - page_clr_all_props(repl, 0); + page_clr_all_props(repl); page_set_props(repl, ppattr); page_relocate_hash(repl, targ); @@ -4899,7 +4899,7 @@ * page_relocate_hash(), they no longer * have any meaning. 
*/ - page_clr_all_props(targ, 0); + page_clr_all_props(targ); ASSERT(targ->p_next == targ); ASSERT(targ->p_prev == targ); page_list_concat(&pl, &targ); @@ -4983,7 +4983,7 @@ pp = pplist; if (pp->p_szc == 0) { page_sub(&pplist, pp); - page_clr_all_props(pp, 0); + page_clr_all_props(pp); PP_SETFREE(pp); PP_SETAGED(pp); page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); @@ -4997,7 +4997,7 @@ do { ASSERT(PAGE_EXCL(tpp)); ASSERT(!hat_page_is_mapped(tpp)); - page_clr_all_props(tpp, 0); + page_clr_all_props(tpp); PP_SETFREE(tpp); PP_SETAGED(tpp); } while ((tpp = tpp->p_next) != pp); @@ -6110,25 +6110,9 @@ pp->p_nrm |= (uchar_t)flags; } -extern void mach_sync_icache_pp(page_t *); -#pragma weak mach_sync_icache_pp - -/* - * Flush I-cache if the page is being reassigned. The hashout flag is - * set when a page has been removed from a hash chain (i.e. vnode - * pages). If the page stays on the hash chain there is a chance it - * will be re-used, therefore there is no need to flush the - * I-cache. However, if the page is being removed from a hash chain - * then the page can be used for any new purpose, and the I-cache must - * be flushed. - */ -/* ARGSUSED */ void -page_clr_all_props(page_t *pp, int hashout) +page_clr_all_props(page_t *pp) { - if (&mach_sync_icache_pp != NULL && hashout) { - mach_sync_icache_pp(pp); - } pp->p_nrm = 0; }
--- a/usr/src/uts/sfmmu/ml/sfmmu_asm.s Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sfmmu/ml/sfmmu_asm.s Thu Aug 06 17:39:39 2009 -0700 @@ -248,7 +248,6 @@ */ ;\ sllx tagtarget, TTARGET_VA_SHIFT, tagtarget ;\ ldxa [ttepa]ASI_MEM, tte ;\ - TTE_CLR_SOFTEXEC_ML(tte) ;\ srlx tagtarget, TTARGET_VA_SHIFT, tagtarget ;\ sethi %hi(TSBTAG_INVALID), tmp2 ;\ add tsbep, TSBE_TAG, tmp1 ;\ @@ -371,7 +370,6 @@ #define TSB_UPDATE(tsbep, tteva, tagtarget, tmp1, tmp2, label) \ /* can't rd tteva after locking tsb because it can tlb miss */ ;\ ldx [tteva], tteva /* load tte */ ;\ - TTE_CLR_SOFTEXEC_ML(tteva) ;\ TSB_LOCK_ENTRY(tsbep, tmp1, tmp2, label) ;\ sethi %hi(TSBTAG_INVALID), tmp2 ;\ add tsbep, TSBE_TAG, tmp1 ;\ @@ -946,11 +944,6 @@ { } -void -sfmmu_patch_pgsz_reg(void) -{ -} - /* ARGSUSED */ void sfmmu_load_tsbe(struct tsbe *tsbep, uint64_t vaddr, tte_t *ttep, int phys) @@ -1441,19 +1434,6 @@ #endif /* sun4u */ SET_SIZE(sfmmu_patch_shctx) - ENTRY_NP(sfmmu_patch_pgsz_reg) -#ifdef sun4u - retl - nop -#else /* sun4u */ - set sfmmu_pgsz_load_mmustate_patch, %o0 - MAKE_NOP_INSTR(%o1) - st %o1, [%o0] - retl - flush %o0 -#endif /* sun4u */ - SET_SIZE(sfmmu_patch_pgsz_reg) - /* * Routine that loads an entry into a tsb using virtual addresses. * Locking is required since all cpus can use the same TSB. @@ -2408,13 +2388,6 @@ ba,a,pt %xcc, label/**/8 ;\ label/**/6: ;\ GET_SCDSHMERMAP(tsbarea, hmeblkpa, hatid, hmemisc) ;\ - /* ;\ - * hmemisc is set to 1 if this is a shared mapping. It will ;\ - * be cleared by CHECK_SHARED_PGSZ if this pagesize is not ;\ - * allowed, in order to limit the number of entries in the ;\ - * pagesize register. 
;\ - */ ;\ - CHECK_SHARED_PGSZ(tsbarea, tte, hatid, hmemisc, label/**/9) ;\ ldn [tsbarea + (TSBMISS_SCRATCH + TSBMISS_HMEBP)], hatid ;\ label/**/7: ;\ set TTE_SUSPEND, hatid ;\ @@ -3295,37 +3268,8 @@ stub %g1, [%g6 + TSBMISS_URTTEFLAGS] SAVE_CTX1(%g7, %g2, %g1, tsb_shmel) - ba tsb_validtte #endif /* sun4u && !UTSB_PHYS */ -tsb_ism_validtte: -#ifdef sun4v - /* - * Check pagesize against bitmap for Rock page size register, - * for ism mappings. - * - * %g1, %g2 = scratch - * %g3 = tte - * g4 = tte pa - * g5 = tte va - * g6 = tsbmiss area - * %g7 = tt - */ - ldub [%g6 + TSBMISS_URTTEFLAGS], %g1 - and %g1, HAT_CHKCTX1_FLAG, %g2 - /* - * Clear the HAT_CHKCTX1_FLAG in %g2 if this shared pagesize is not allowed - * to limit the number of entries in the pagesize search register. - */ - CHECK_SHARED_PGSZ(%g6, %g3, %g7, %g2, ism_chk_pgsz) - andn %g1, HAT_CHKCTX1_FLAG, %g1 - or %g1, %g2, %g1 - stub %g1, [%g6 + TSBMISS_URTTEFLAGS] - brz %g2, tsb_validtte - rdpr %tt, %g7 - SAVE_CTX1(%g7, %g1, %g2, tsb_shctxl) -#endif /* sun4v */ - tsb_validtte: /* * g3 = tte @@ -3355,11 +3299,9 @@ ba,pt %xcc, tsb_update_tl1 nop 4: - /* - * ITLB translation was found but execute permission is - * disabled. If we have software execute permission (soft exec - * bit is set), then enable hardware execute permission. - * Otherwise continue with a protection violation. + /* + * If ITLB miss check exec bit. + * If not set treat as invalid TTE. 
*/ cmp %g7, T_INSTR_MMU_MISS be,pn %icc, 5f @@ -3368,11 +3310,9 @@ bne,pt %icc, 3f andcc %g3, TTE_EXECPRM_INT, %g0 /* check execute bit is set */ 5: - bnz,pn %icc, 3f - TTE_CHK_SOFTEXEC_ML(%g3) /* check soft execute */ bz,pn %icc, tsb_protfault nop - TTE_SET_EXEC_ML(%g3, %g4, %g7, tsb_lset_exec) + 3: /* * Set reference bit if not already set @@ -3415,7 +3355,6 @@ #endif /* sun4v */ tsb_update_tl1: - TTE_CLR_SOFTEXEC_ML(%g3) srlx %g2, TTARGET_CTX_SHIFT, %g7 brz,pn %g7, tsb_kernel #ifdef sun4v @@ -3658,7 +3597,10 @@ ldub [%g6 + TSBMISS_URTTEFLAGS], %g5 or %g5, HAT_CHKCTX1_FLAG, %g5 stub %g5, [%g6 + TSBMISS_URTTEFLAGS] + rdpr %tt, %g5 + SAVE_CTX1(%g5, %g3, %g1, tsb_shctxl) #endif /* defined(sun4v) || defined(UTSB_PHYS) */ + /* * ISM pages are always locked down. * If we can't find the tte then pagefault @@ -3690,7 +3632,7 @@ /* NOT REACHED */ tsb_ism_32M_found: - brlz,a,pt %g3, tsb_ism_validtte + brlz,a,pt %g3, tsb_validtte rdpr %tt, %g7 ba,pt %xcc, tsb_ism_4M nop @@ -3708,7 +3650,7 @@ tsb_ism_4M) tsb_ism_256M_found: - brlz,a,pt %g3, tsb_ism_validtte + brlz,a,pt %g3, tsb_validtte rdpr %tt, %g7 tsb_ism_4M: @@ -3721,7 +3663,7 @@ /* NOT REACHED */ tsb_ism_4M_found: - brlz,a,pt %g3, tsb_ism_validtte + brlz,a,pt %g3, tsb_validtte rdpr %tt, %g7 tsb_ism_8K: @@ -3735,7 +3677,7 @@ /* NOT REACHED */ tsb_ism_8K_found: - brlz,a,pt %g3, tsb_ism_validtte + brlz,a,pt %g3, tsb_validtte rdpr %tt, %g7 tsb_pagefault:
--- a/usr/src/uts/sfmmu/ml/sfmmu_kdi.s Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sfmmu/ml/sfmmu_kdi.s Thu Aug 06 17:39:39 2009 -0700 @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -328,21 +328,7 @@ ldxa [%g2]ASI_MEM, %g1 brgez,a %g1, 4f clr %g1 -4: - /* - * If soft execute bit is set, make sure HW execute permission - * is also set. But, clear soft execute bit before giving tte to - * the caller. - */ - TTE_CHK_SOFTEXEC_ML(%g1) - bz,pt %icc, 6f - andcc %g1, TTE_EXECPRM_INT, %g0 - bnz,pt %icc, 7f - nop - TTE_SET_EXEC_ML(%g1, %g2, %g4, kdi_trap_vatotte) -7: - TTE_CLR_SOFTEXEC_ML(%g1) - ba,a 6f +4: ba,a 6f 5: add %g3, 1, %g3 set mmu_hashcnt, %g4
--- a/usr/src/uts/sfmmu/vm/hat_sfmmu.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.c Thu Aug 06 17:39:39 2009 -0700 @@ -184,14 +184,6 @@ #define HAT_TMPNC 0x4 /* - * This flag is set to 0 via the MD in platforms that do not support - * I-cache coherency in hardware. Used to enable "soft exec" mode. - * The MD "coherency" property is optional, and defaults to 1 (because - * coherent I-cache is the norm.) - */ -uint_t icache_is_coherent = 1; - -/* * Flag to allow the creation of non-cacheable translations * to system memory. It is off by default. At the moment this * flag is used by the ecache error injector. The error injector @@ -227,7 +219,6 @@ uint_t disable_ism_large_pages = (1 << TTE512K); uint_t disable_auto_data_large_pages = 0; uint_t disable_auto_text_large_pages = 0; -uint_t disable_shctx_large_pages = 0; /* * Private sfmmu data structures for hat management @@ -294,14 +285,6 @@ /* Internal variable, set by MD if the HW supports shctx feature */ int shctx_on = 0; -/* Internal variable, set by MD if the HW supports the search order register */ -int pgsz_search_on = 0; -/* - * External /etc/system tunable, for controlling search order register - * support. 
- */ -int disable_pgsz_search = 0; - #ifdef DEBUG static void check_scd_sfmmu_list(sfmmu_t **, sfmmu_t *, int); #endif @@ -481,6 +464,7 @@ pfn_t, int); static void sfmmu_tlb_demap(caddr_t, sfmmu_t *, struct hme_blk *, int, int); static void sfmmu_tlb_range_demap(demap_range_t *); +static void sfmmu_invalidate_ctx(sfmmu_t *); static void sfmmu_sync_mmustate(sfmmu_t *); static void sfmmu_tsbinfo_setup_phys(struct tsb_info *, pfn_t); @@ -589,7 +573,7 @@ uint64_t mmu_saved_gnum = 0; /* to init incoming MMUs' gnums */ #define DEFAULT_NUM_CTXS_PER_MMU 8192 -uint_t nctxs = DEFAULT_NUM_CTXS_PER_MMU; +static uint_t nctxs = DEFAULT_NUM_CTXS_PER_MMU; int cache; /* describes system cache */ @@ -743,7 +727,11 @@ static void sfmmu_mlist_reloc_enter(page_t *, page_t *, kmutex_t **, kmutex_t **); static void sfmmu_mlist_reloc_exit(kmutex_t *, kmutex_t *); -static hatlock_t *sfmmu_hat_tryenter(sfmmu_t *); +static hatlock_t * + sfmmu_hat_enter(sfmmu_t *); +static hatlock_t * + sfmmu_hat_tryenter(sfmmu_t *); +static void sfmmu_hat_exit(hatlock_t *); static void sfmmu_hat_lock_all(void); static void sfmmu_hat_unlock_all(void); static void sfmmu_ismhat_enter(sfmmu_t *, int); @@ -1067,14 +1055,12 @@ disable_ism_large_pages |= disable_large_pages; disable_auto_data_large_pages = disable_large_pages; disable_auto_text_large_pages = disable_large_pages; - disable_shctx_large_pages |= disable_large_pages; /* * Initialize mmu-specific large page sizes. */ if (&mmu_large_pages_disabled) { disable_large_pages |= mmu_large_pages_disabled(HAT_LOAD); - disable_shctx_large_pages |= disable_large_pages; disable_ism_large_pages |= mmu_large_pages_disabled(HAT_LOAD_SHARE); disable_auto_data_large_pages |= @@ -1413,14 +1399,6 @@ shctx_on = 0; } - /* - * If support for page size search is disabled via /etc/system - * set pgsz_search_on to 0 here. 
- */ - if (pgsz_search_on && disable_pgsz_search) { - pgsz_search_on = 0; - } - if (shctx_on) { srd_buckets = kmem_zalloc(SFMMU_MAX_SRD_BUCKETS * sizeof (srd_buckets[0]), KM_SLEEP); @@ -1595,11 +1573,6 @@ sfmmup->sfmmu_scdp = NULL; sfmmup->sfmmu_scd_link.next = NULL; sfmmup->sfmmu_scd_link.prev = NULL; - - if (&mmu_set_pgsz_order && sfmmup != ksfmmup) { - mmu_set_pgsz_order(sfmmup, 0); - sfmmu_init_pgsz_hv(sfmmup); - } return (sfmmup); } @@ -2082,8 +2055,6 @@ newhat->sfmmu_scdismttecnt[i] = hat->sfmmu_scdismttecnt[i]; } - } else if (&mmu_set_pgsz_order) { - mmu_set_pgsz_order(newhat, 0); } sfmmu_check_page_sizes(newhat, 1); @@ -2579,7 +2550,7 @@ void sfmmu_memtte(tte_t *ttep, pfn_t pfn, uint_t attr, int tte_sz) { - ASSERT((attr & ~(SFMMU_LOAD_ALLATTR | HAT_ATTR_NOSOFTEXEC)) == 0); + ASSERT(!(attr & ~SFMMU_LOAD_ALLATTR)); ttep->tte_inthi = MAKE_TTE_INTHI(pfn, attr, tte_sz, 0 /* hmenum */); ttep->tte_intlo = MAKE_TTE_INTLO(pfn, attr, tte_sz, 0 /* hmenum */); @@ -2593,18 +2564,6 @@ if (TTE_IS_NFO(ttep) && TTE_IS_EXECUTABLE(ttep)) { panic("sfmmu_memtte: can't set both NFO and EXEC bits"); } - - /* - * Disable hardware execute permission to force a fault if - * this page is executed, so we can detect the execution. Set - * the soft exec bit to remember that this TTE has execute - * permission. - */ - if (TTE_IS_EXECUTABLE(ttep) && (attr & HAT_ATTR_NOSOFTEXEC) == 0 && - icache_is_coherent == 0) { - TTE_CLR_EXEC(ttep); - TTE_SET_SOFTEXEC(ttep); - } } /* @@ -3095,26 +3054,9 @@ (void *)hmeblkp); } ASSERT(TTE_CSZ(&tteold) == TTE_CSZ(ttep)); - - if (TTE_IS_EXECUTABLE(&tteold) && TTE_IS_SOFTEXEC(ttep)) { - TTE_SET_EXEC(ttep); - } } if (pp) { - /* - * If we know that this page will be executed, because - * it was in the past (PP_ISEXEC is already true), or - * if the caller says it will likely be executed - * (HAT_LOAD_TEXT is true), then there is no need to - * dynamically detect execution with a soft exec - * fault. Enable hardware execute permission now. 
- */ - if ((PP_ISEXEC(pp) || (flags & HAT_LOAD_TEXT)) && - TTE_IS_SOFTEXEC(ttep)) { - TTE_SET_EXEC(ttep); - } - if (size == TTE8K) { #ifdef VAC /* @@ -3138,12 +3080,6 @@ sfmmu_page_exit(pmtx); } - if (TTE_EXECUTED(ttep)) { - pmtx = sfmmu_page_enter(pp); - PP_SETEXEC(pp); - sfmmu_page_exit(pmtx); - } - } else if (sfmmu_pagearray_setup(vaddr, pps, ttep, remap)) { /* * sfmmu_pagearray_setup failed so return @@ -3151,9 +3087,6 @@ sfmmu_mlist_exit(pml); return (1); } - - } else if (TTE_IS_SOFTEXEC(ttep)) { - TTE_SET_EXEC(ttep); } /* @@ -3227,17 +3160,11 @@ if (!(sfmmup->sfmmu_tteflags & tteflag)) { hatlockp = sfmmu_hat_enter(sfmmup); sfmmup->sfmmu_tteflags |= tteflag; - if (&mmu_set_pgsz_order) { - mmu_set_pgsz_order(sfmmup, 1); - } sfmmu_hat_exit(hatlockp); } } else if (!(sfmmup->sfmmu_rtteflags & tteflag)) { hatlockp = sfmmu_hat_enter(sfmmup); sfmmup->sfmmu_rtteflags |= tteflag; - if (&mmu_set_pgsz_order && sfmmup != ksfmmup) { - mmu_set_pgsz_order(sfmmup, 1); - } sfmmu_hat_exit(hatlockp); } /* @@ -3284,8 +3211,7 @@ * ref bit in tteload. */ ASSERT(TTE_IS_REF(ttep)); - if (TTE_IS_MOD(&tteold) || (TTE_EXECUTED(&tteold) && - !TTE_IS_EXECUTABLE(ttep))) { + if (TTE_IS_MOD(&tteold)) { sfmmu_ttesync(sfmmup, vaddr, &tteold, pp); } /* @@ -3416,12 +3342,6 @@ sfmmu_page_exit(pmtx); } - if (TTE_EXECUTED(ttep)) { - pmtx = sfmmu_page_enter(pp); - PP_SETEXEC(pp); - sfmmu_page_exit(pmtx); - } - /* * If this is a remap we skip vac & contiguity checks. */ @@ -5052,11 +4972,9 @@ continue; } - if ((tteflags.tte_intlo & TTE_HWWR_INT) || - (TTE_EXECUTED(&tte) && - !TTE_IS_EXECUTABLE(&ttemod))) { + if (tteflags.tte_intlo & TTE_HWWR_INT) { /* - * need to sync if clearing modify/exec bit. + * need to sync if we are clearing modify bit. 
*/ sfmmu_ttesync(sfmmup, addr, &tte, pp); } @@ -5109,14 +5027,6 @@ ttevalue.tte_intlo = MAKE_TTEATTR_INTLO(attr); ttemaskp->tte_inthi = TTEINTHI_ATTR; ttemaskp->tte_intlo = TTEINTLO_ATTR; - if (!icache_is_coherent) { - if (!(attr & PROT_EXEC)) { - TTE_SET_SOFTEXEC(ttemaskp); - } else { - TTE_CLR_EXEC(ttemaskp); - TTE_SET_SOFTEXEC(&ttevalue); - } - } break; case SFMMU_SETATTR: ASSERT(!(attr & ~HAT_PROT_MASK)); @@ -5171,9 +5081,6 @@ if (TTE_IS_EXECUTABLE(ttep)) { attr |= PROT_EXEC; } - if (TTE_IS_SOFTEXEC(ttep)) { - attr |= PROT_EXEC; - } if (!TTE_IS_PRIVILEGED(ttep)) { attr |= PROT_USER; } @@ -5390,11 +5297,6 @@ ttemod = tte; TTE_SET_LOFLAGS(&ttemod, tteflags, pprot); - ASSERT(TTE_IS_SOFTEXEC(&tte) == - TTE_IS_SOFTEXEC(&ttemod)); - ASSERT(TTE_IS_EXECUTABLE(&tte) == - TTE_IS_EXECUTABLE(&ttemod)); - #if defined(SF_ERRATA_57) if (check_exec && addr < errata57_limit) ttemod.tte_exec_perm = 0; @@ -6094,8 +5996,7 @@ continue; } - if (!(flags & HAT_UNLOAD_NOSYNC) || - (pp != NULL && TTE_EXECUTED(&tte))) { + if (!(flags & HAT_UNLOAD_NOSYNC)) { sfmmu_ttesync(sfmmup, addr, &tte, pp); } @@ -6435,49 +6336,37 @@ sfmmu_ttesync(struct hat *sfmmup, caddr_t addr, tte_t *ttep, page_t *pp) { uint_t rm = 0; - int sz = TTE_CSZ(ttep); + int sz; pgcnt_t npgs; ASSERT(TTE_IS_VALID(ttep)); - if (!TTE_IS_NOSYNC(ttep)) { - - if (TTE_IS_REF(ttep)) - rm |= P_REF; - - if (TTE_IS_MOD(ttep)) - rm |= P_MOD; - - if (rm != 0) { - if (sfmmup != NULL && sfmmup->sfmmu_rmstat) { - int i; - caddr_t vaddr = addr; - - for (i = 0; i < TTEPAGES(sz); i++) { - hat_setstat(sfmmup->sfmmu_as, vaddr, - MMU_PAGESIZE, rm); - vaddr += MMU_PAGESIZE; - } - } - } - } - - if (!pp) + if (TTE_IS_NOSYNC(ttep)) { return; - - /* - * If software says this page is executable, and the page was - * in fact executed (indicated by hardware exec permission - * being enabled), then set P_EXEC on the page to remember - * that it was executed. The I$ will be flushed when the page - * is reassigned. 
- */ - if (TTE_EXECUTED(ttep)) { - rm |= P_EXEC; - } else if (rm == 0) { + } + + if (TTE_IS_REF(ttep)) { + rm = P_REF; + } + if (TTE_IS_MOD(ttep)) { + rm |= P_MOD; + } + + if (rm == 0) { return; } + sz = TTE_CSZ(ttep); + if (sfmmup != NULL && sfmmup->sfmmu_rmstat) { + int i; + caddr_t vaddr = addr; + + for (i = 0; i < TTEPAGES(sz); i++, vaddr += MMU_PAGESIZE) { + hat_setstat(sfmmup->sfmmu_as, vaddr, MMU_PAGESIZE, rm); + } + + } + /* * XXX I want to use cas to update nrm bits but they * currently belong in common/vm and not in hat where @@ -6485,6 +6374,8 @@ * The nrm bits are protected by the same mutex as * the one that protects the page's mapping list. */ + if (!pp) + return; ASSERT(sfmmu_mlist_held(pp)); /* * If the tte is for a large page, we need to sync all the @@ -6503,8 +6394,7 @@ ASSERT(pp); ASSERT(sfmmu_mlist_held(pp)); if (((rm & P_REF) != 0 && !PP_ISREF(pp)) || - ((rm & P_MOD) != 0 && !PP_ISMOD(pp)) || - ((rm & P_EXEC) != 0 && !PP_ISEXEC(pp))) + ((rm & P_MOD) != 0 && !PP_ISMOD(pp))) hat_page_setattr(pp, rm); /* @@ -6826,7 +6716,6 @@ kmutex_t *low, *high; spgcnt_t npages, i; page_t *pl = NULL; - uint_t ppattr; int old_pil; cpuset_t cpuset; int cap_cpus; @@ -6977,9 +6866,8 @@ * Copy attributes. VAC consistency was handled above, * if required. 
*/ - ppattr = hat_page_getattr(tpp, (P_MOD | P_REF | P_RO)); - page_clr_all_props(rpp, 0); - page_set_props(rpp, ppattr); + rpp->p_nrm = tpp->p_nrm; + tpp->p_nrm = 0; rpp->p_index = tpp->p_index; tpp->p_index = 0; #ifdef VAC @@ -7791,7 +7679,7 @@ noshuffle = flag & P_NSH; flag &= ~P_NSH; - ASSERT(!(flag & ~(P_MOD | P_REF | P_RO | P_EXEC))); + ASSERT(!(flag & ~(P_MOD | P_REF | P_RO))); /* * nothing to do if attribute already set @@ -8480,8 +8368,6 @@ int j; sf_scd_t *scdp; uchar_t rid; - hatlock_t *hatlockp; - int ismnotinscd = 0; ASSERT(SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)); scdp = sfmmup->sfmmu_scdp; @@ -8502,21 +8388,9 @@ /* ISMs is not in SCD */ npgs += ism_map[j].imap_ismhat->sfmmu_ttecnt[szc]; - ismnotinscd = 1; - } - } - } - - if (&mmu_set_pgsz_order) { - hatlockp = sfmmu_hat_enter(sfmmup); - if (ismnotinscd) { - SFMMU_FLAGS_SET(sfmmup, HAT_ISMNOTINSCD); - } else { - SFMMU_FLAGS_CLEAR(sfmmup, HAT_ISMNOTINSCD); - } - sfmmu_hat_exit(hatlockp); - } - + } + } + } sfmmup->sfmmu_ismttecnt[szc] = npgs; sfmmup->sfmmu_scdismttecnt[szc] = npgs_scd; return (npgs); @@ -8850,11 +8724,6 @@ sfmmu_hat_exit(hatlockp); } - if (&mmu_set_pgsz_order) { - hatlockp = sfmmu_hat_enter(sfmmup); - mmu_set_pgsz_order(sfmmup, 1); - sfmmu_hat_exit(hatlockp); - } sfmmu_ismhat_exit(sfmmup, 0); /* @@ -9050,11 +8919,6 @@ (void) ism_tsb_entries(sfmmup, i); } - if (&mmu_set_pgsz_order) { - hatlockp = sfmmu_hat_enter(sfmmup); - mmu_set_pgsz_order(sfmmup, 1); - sfmmu_hat_exit(hatlockp); - } sfmmu_ismhat_exit(sfmmup, 0); /* @@ -11027,7 +10891,7 @@ mutex_exit(low); } -hatlock_t * +static hatlock_t * sfmmu_hat_enter(sfmmu_t *sfmmup) { hatlock_t *hatlockp; @@ -11054,7 +10918,7 @@ return (NULL); } -void +static void sfmmu_hat_exit(hatlock_t *hatlockp) { if (hatlockp != NULL) @@ -12197,13 +12061,8 @@ * then we flush the shared TSBs, if we find a private hat, * which is part of an SCD, but where the region * is not part of the SCD then we flush the private TSBs. 
- * - * If the Rock page size register is present, then SCDs - * may contain both shared and private pages, so we cannot - * use this optimization to avoid flushing private TSBs. - */ - if (pgsz_search_on == 0 && - !sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL && + */ + if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL && !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD)) { scdp = sfmmup->sfmmu_scdp; if (SF_RGNMAP_TEST(scdp->scd_hmeregion_map, rid)) { @@ -12332,13 +12191,8 @@ * which is part of an SCD, but where the region * corresponding to this va is not part of the SCD then we * flush the private TSBs. - * - * If the Rock page size register is present, then SCDs - * may contain both shared and private pages, so we cannot - * use this optimization to avoid flushing private TSBs. - */ - if (pgsz_search_on == 0 && - !sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL && + */ + if (!sfmmup->sfmmu_scdhat && sfmmup->sfmmu_scdp != NULL && !SFMMU_FLAGS_ISSET(sfmmup, HAT_JOIN_SCD) && !SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMBUSY)) { if (!find_ism_rid(sfmmup, ism_sfmmup, va, @@ -12648,7 +12502,7 @@ * A per-process (PP) lock is used to synchronize ctx allocations in * resume() and ctx invalidations here. 
*/ -void +static void sfmmu_invalidate_ctx(sfmmu_t *sfmmup) { cpuset_t cpuset; @@ -14174,9 +14028,6 @@ if (tteflag && !(sfmmup->sfmmu_rtteflags & tteflag)) { hatlockp = sfmmu_hat_enter(sfmmup); sfmmup->sfmmu_rtteflags |= tteflag; - if (&mmu_set_pgsz_order) { - mmu_set_pgsz_order(sfmmup, 1); - } sfmmu_hat_exit(hatlockp); } hatlockp = sfmmu_hat_enter(sfmmup); @@ -15232,9 +15083,6 @@ ASSERT(sfmmup->sfmmu_ttecnt[i] >= scdp->scd_rttecnt[i]); atomic_add_long(&sfmmup->sfmmu_ttecnt[i], -sfmmup->sfmmu_scdrttecnt[i]); - if (!sfmmup->sfmmu_ttecnt[i]) { - sfmmup->sfmmu_tteflags &= ~(1 << i); - } } /* update tsb0 inflation count */ if (old_scdp != NULL) { @@ -15245,9 +15093,6 @@ scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt); sfmmup->sfmmu_tsb0_4minflcnt -= scdp->scd_sfmmup->sfmmu_tsb0_4minflcnt; - if (&mmu_set_pgsz_order) { - mmu_set_pgsz_order(sfmmup, 0); - } sfmmu_hat_exit(hatlockp); if (old_scdp != NULL) { @@ -15307,7 +15152,7 @@ for (scdp = srdp->srd_scdp; scdp != NULL; scdp = scdp->scd_next) { SF_RGNMAP_EQUAL(&scdp->scd_region_map, - &sfmmup->sfmmu_region_map, SFMMU_RGNMAP_WORDS, ret); + &sfmmup->sfmmu_region_map, ret); if (ret == 1) { SF_SCD_INCR_REF(scdp); mutex_exit(&srdp->srd_scd_mutex); @@ -15455,10 +15300,6 @@ scdp->scd_rttecnt[i]); atomic_add_long(&sfmmup->sfmmu_ttecnt[i], sfmmup->sfmmu_scdrttecnt[i]); - if (sfmmup->sfmmu_ttecnt[i] && - (sfmmup->sfmmu_tteflags & (1 << i)) == 0) { - sfmmup->sfmmu_tteflags |= (1 << i); - } sfmmup->sfmmu_scdrttecnt[i] = 0; /* update ismttecnt to include SCD ism before hat leaves SCD */ sfmmup->sfmmu_ismttecnt[i] += sfmmup->sfmmu_scdismttecnt[i]; @@ -15472,9 +15313,6 @@ } sfmmup->sfmmu_scdp = NULL; - if (&mmu_set_pgsz_order) { - mmu_set_pgsz_order(sfmmup, 0); - } sfmmu_hat_exit(hatlockp); /* @@ -15520,8 +15358,7 @@ * It is possible that the scd has been freed and reallocated with a * different region map while we've been waiting for the srd_scd_mutex. 
*/ - SF_RGNMAP_EQUAL(scd_rmap, &sp->scd_region_map, - SFMMU_RGNMAP_WORDS, ret); + SF_RGNMAP_EQUAL(scd_rmap, &sp->scd_region_map, ret); if (ret != 1) { mutex_exit(&srdp->srd_scd_mutex); return;
--- a/usr/src/uts/sfmmu/vm/hat_sfmmu.h Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.h Thu Aug 06 17:39:39 2009 -0700 @@ -112,7 +112,6 @@ #define P_TNC 0x10 /* non-caching is temporary bit */ #define P_KPMS 0x20 /* kpm mapped small (vac alias prevention) */ #define P_KPMC 0x40 /* kpm conflict page (vac alias prevention) */ -#define P_EXEC 0x80 /* execution reference (I-cache filled) */ #define PP_GENERIC_ATTR(pp) ((pp)->p_nrm & (P_MOD | P_REF | P_RO)) #define PP_ISMOD(pp) ((pp)->p_nrm & P_MOD) @@ -125,7 +124,6 @@ #endif #define PP_ISKPMS(pp) ((pp)->p_nrm & P_KPMS) #define PP_ISKPMC(pp) ((pp)->p_nrm & P_KPMC) -#define PP_ISEXEC(pp) ((pp)->p_nrm & P_EXEC) #define PP_SETMOD(pp) ((pp)->p_nrm |= P_MOD) #define PP_SETREF(pp) ((pp)->p_nrm |= P_REF) @@ -138,7 +136,6 @@ #endif #define PP_SETKPMS(pp) ((pp)->p_nrm |= P_KPMS) #define PP_SETKPMC(pp) ((pp)->p_nrm |= P_KPMC) -#define PP_SETEXEC(pp) ((pp)->p_nrm |= P_EXEC) #define PP_CLRMOD(pp) ((pp)->p_nrm &= ~P_MOD) #define PP_CLRREF(pp) ((pp)->p_nrm &= ~P_REF) @@ -150,17 +147,6 @@ #endif #define PP_CLRKPMS(pp) ((pp)->p_nrm &= ~P_KPMS) #define PP_CLRKPMC(pp) ((pp)->p_nrm &= ~P_KPMC) -#define PP_CLREXEC(pp) ((pp)->p_nrm &= ~P_EXEC) - -/* - * Support for non-coherent I-cache. If the MD property "coherency" - * is set to 0, it means that the I-cache must be flushed in - * software. Use the "soft exec" bit in the TTE to detect when a page - * has been executed, so that it can be flushed before it is re-used - * for another program. - */ -#define TTE_EXECUTED(ttep) \ - (TTE_IS_EXECUTABLE(ttep) && TTE_IS_SOFTEXEC(ttep)) /* * All shared memory segments attached with the SHM_SHARE_MMU flag (ISM) @@ -337,15 +323,15 @@ } /* - * Returns 1 if region map1 and map2 are equal. + * Returns 1 if map1 and map2 are equal. 
*/ -#define SF_RGNMAP_EQUAL(map1, map2, words, rval) { \ +#define SF_RGNMAP_EQUAL(map1, map2, rval) { \ int _i; \ - for (_i = 0; _i < words; _i++) { \ + for (_i = 0; _i < SFMMU_RGNMAP_WORDS; _i++) { \ if ((map1)->bitmap[_i] != (map2)->bitmap[_i]) \ break; \ } \ - if (_i < words) \ + if (_i < SFMMU_RGNMAP_WORDS) \ rval = 0; \ else \ rval = 1; \ @@ -609,13 +595,9 @@ extern uint_t max_mmu_ctxdoms; extern mmu_ctx_t **mmu_ctxs_tbl; -extern uint_t nctxs; extern void sfmmu_cpu_init(cpu_t *); extern void sfmmu_cpu_cleanup(cpu_t *); -extern void sfmmu_invalidate_ctx(sfmmu_t *); -extern hatlock_t *sfmmu_hat_enter(sfmmu_t *); -extern void sfmmu_hat_exit(hatlock_t *); /* * The following structure is used to get MMU context domain information for @@ -652,6 +634,7 @@ uint64_t cnum:16; } sfmmu_ctx_t; + /* * The platform dependent hat structure. * tte counts should be protected by cas. @@ -713,11 +696,7 @@ sf_rgn_link_t *sfmmu_hmeregion_links[SFMMU_L1_HMERLINKS]; sf_rgn_link_t sfmmu_scd_link; /* link to scd or pending queue */ #ifdef sun4v - /* ttecnt for Rock pagesize register management */ - ulong_t sfmmu_mmuttecnt[MMU_PAGE_SIZES]; struct hv_tsb_block sfmmu_hvblock; - struct hv_pgsz_order sfmmu_pgsz_order; /* pagesize search order */ - uint8_t sfmmu_pgsz_map; /* bit map to control shared pgsz use */ #endif /* * sfmmu_ctxs is a variable length array of max_mmu_ctxdoms # of @@ -763,8 +742,6 @@ extern int disable_shctx; extern int shctx_on; -extern int pgsz_search_on; -extern int disable_pgsz_search; /* * bit mask for managing vac conflicts on large pages. 
@@ -878,7 +855,6 @@ #define HAT_CTX1_FLAG 0x100 /* ISM imap hatflag for ctx1 */ #define HAT_JOIN_SCD 0x200 /* region is joining scd */ #define HAT_ALLCTX_INVALID 0x400 /* all per-MMU ctxs are invalidated */ -#define HAT_ISMNOTINSCD 0x800 /* Not all ISM segs are in the SCD */ #define SFMMU_LGPGS_INUSE(sfmmup) \ (((sfmmup)->sfmmu_tteflags | (sfmmup)->sfmmu_rtteflags) || \ @@ -1822,8 +1798,7 @@ uintptr_t scratch[3]; ulong_t shmermap[SFMMU_HMERGNMAP_WORDS]; /* 8 bytes */ ulong_t scd_shmermap[SFMMU_HMERGNMAP_WORDS]; /* 8 bytes */ - uint8_t pgsz_bitmap; /* limits ctx1 page sizes */ - uint8_t pad[47]; /* pad to 64 bytes */ + uint8_t pad[48]; /* pad to 64 bytes */ }; /* @@ -2354,17 +2329,11 @@ #pragma weak mmu_large_pages_disabled #pragma weak mmu_set_ctx_page_sizes #pragma weak mmu_check_page_sizes -#pragma weak mmu_set_pgsz_order -#pragma weak sfmmu_init_pgsz_hv -#pragma weak mmu_enable_pgsz_search extern void mmu_init_scd(sf_scd_t *); extern uint_t mmu_large_pages_disabled(uint_t); extern void mmu_set_ctx_page_sizes(sfmmu_t *); extern void mmu_check_page_sizes(sfmmu_t *, uint64_t *); -extern void mmu_set_pgsz_order(sfmmu_t *, int); -extern void sfmmu_init_pgsz_hv(sfmmu_t *); -extern void mmu_enable_pgsz_search(); extern sfmmu_t *ksfmmup; extern caddr_t ktsb_base; @@ -2406,15 +2375,12 @@ extern uint_t disable_ism_large_pages; extern uint_t disable_auto_data_large_pages; extern uint_t disable_auto_text_large_pages; -extern uint_t disable_shctx_large_pages; - -extern void sfmmu_patch_shctx(void); -extern void sfmmu_patch_pgsz_reg(void); /* kpm externals */ extern pfn_t sfmmu_kpm_vatopfn(caddr_t); extern void sfmmu_kpm_patch_tlbm(void); extern void sfmmu_kpm_patch_tsbm(void); +extern void sfmmu_patch_shctx(void); extern void sfmmu_kpm_load_tsb(caddr_t, tte_t *, int); extern void sfmmu_kpm_unload_tsb(caddr_t, int); extern void sfmmu_kpm_tsbmtl(short *, uint_t *, int);
--- a/usr/src/uts/sparc/fpu/fpu_simulator.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sparc/fpu/fpu_simulator.c Thu Aug 06 17:39:39 2009 -0700 @@ -112,14 +112,6 @@ { "fpu_sim_fnmaddd", KSTAT_DATA_UINT64}, { "fpu_sim_fnmsubs", KSTAT_DATA_UINT64}, { "fpu_sim_fnmsubd", KSTAT_DATA_UINT64}, - { "fpu_sim_fumadds", KSTAT_DATA_UINT64}, - { "fpu_sim_fumaddd", KSTAT_DATA_UINT64}, - { "fpu_sim_fumsubs", KSTAT_DATA_UINT64}, - { "fpu_sim_fumsubd", KSTAT_DATA_UINT64}, - { "fpu_sim_fnumadds", KSTAT_DATA_UINT64}, - { "fpu_sim_fnumaddd", KSTAT_DATA_UINT64}, - { "fpu_sim_fnumsubs", KSTAT_DATA_UINT64}, - { "fpu_sim_fnumsubd", KSTAT_DATA_UINT64}, { "fpu_sim_invalid", KSTAT_DATA_UINT64}, }; @@ -185,14 +177,12 @@ enum fcc_type cc; uint32_t nfcc; /* fcc number field. */ uint64_t lusr; - uint_t fmau_mul_exceptions; nrs1 = inst.rs1; nrs2 = inst.rs2; nrd = inst.rd; fsr = *pfsr; pfpsd->fp_current_exceptions = 0; /* Init current exceptions. */ - fmau_mul_exceptions = 0; pfpsd->fp_fsrtem = fsr.tem; /* Obtain fsr's tem */ /* * Obtain rounding direction and precision @@ -200,7 +190,7 @@ pfpsd->fp_direction = GSR_IM(gsr) ? GSR_IRND(gsr) : fsr.rnd; pfpsd->fp_precision = fsr.rnp; - if (inst.op3 == 0x37) { /* FMA-fused opcode */ + if (inst.op3 == 0x37) { /* IMPDEP2B FMA-fused opcode */ fp_fma_inst_type *fma_inst; uint32_t nrs3; unpacked us3; @@ -263,121 +253,6 @@ FPUINFO_KSTAT_PREC(fma_inst->sz, fpu_sim_fnmsubs, fpu_sim_fnmsubd, fpu_sim_invalid); } - } else if (inst.op3 == fmau) { /* FMA-unfused opcode */ - fp_fma_inst_type *fmau_inst; - uint32_t nrs3; - unpacked us3; - unpacked ust; - /* - * For FMA-unfused, if either the multiply part or the add - * part raises an exception whose trap is enabled, we trap - * with cexc indicating only that exception and aexc un- - * changed. 
If neither part raises an exception whose trap - * is enabled, the instruction completes with cexc indicating - * just those exceptions that occurred in the add part and - * aexc accumulating all exceptions that occurred in either - * part. We use fmau_mul_exceptions to keep track of the - * exceptions that occurred in the multiply part while we - * simulate the add part. - */ - fmau_inst = (fp_fma_inst_type *) &inst; - nrs2 = fmau_inst->rs2; - nrs3 = fmau_inst->rs3; - switch (fmau_inst->var) { - case fmadd: - _fp_unpack(pfpsd, &us1, nrs1, fmau_inst->sz); - _fp_unpack(pfpsd, &us2, nrs2, fmau_inst->sz); - _fp_mul(pfpsd, &us1, &us2, &ust); - _fp_pack(pfpsd, &ust, nrd, fmau_inst->sz); - if ((pfpsd->fp_current_exceptions & fsr.tem) == 0) { - fmau_mul_exceptions = - pfpsd->fp_current_exceptions; - pfpsd->fp_current_exceptions = 0; - _fp_unpack(pfpsd, &us3, nrs3, fmau_inst->sz); - _fp_unpack(pfpsd, &ust, nrd, fmau_inst->sz); - _fp_add(pfpsd, &ust, &us3, &ud); - /* ensure QSNaN1 has precedence over QNaN3 */ - if ((us3.fpclass == fp_quiet) && - ((us1.fpclass == fp_signaling) || - (us2.fpclass == fp_signaling))) - ud = ust; - _fp_pack(pfpsd, &ud, nrd, fmau_inst->sz); - } - FPUINFO_KSTAT_PREC(fmau_inst->sz, fpu_sim_fumadds, - fpu_sim_fumaddd, fpu_sim_invalid); - break; - case fmsub: - _fp_unpack(pfpsd, &us1, nrs1, fmau_inst->sz); - _fp_unpack(pfpsd, &us2, nrs2, fmau_inst->sz); - _fp_mul(pfpsd, &us1, &us2, &ust); - _fp_pack(pfpsd, &ust, nrd, fmau_inst->sz); - if ((pfpsd->fp_current_exceptions & fsr.tem) == 0) { - fmau_mul_exceptions = - pfpsd->fp_current_exceptions; - pfpsd->fp_current_exceptions = 0; - _fp_unpack(pfpsd, &us3, nrs3, fmau_inst->sz); - _fp_unpack(pfpsd, &ust, nrd, fmau_inst->sz); - _fp_sub(pfpsd, &ust, &us3, &ud); - /* ensure QSNaN1 has precedence over QNaN3 */ - if ((us3.fpclass == fp_quiet) && - ((us1.fpclass == fp_signaling) || - (us2.fpclass == fp_signaling))) - ud = ust; - _fp_pack(pfpsd, &ud, nrd, fmau_inst->sz); - } - FPUINFO_KSTAT_PREC(fmau_inst->sz, 
fpu_sim_fumsubs, - fpu_sim_fumsubd, fpu_sim_invalid); - break; - case fnmadd: - _fp_unpack(pfpsd, &us1, nrs1, fmau_inst->sz); - _fp_unpack(pfpsd, &us2, nrs2, fmau_inst->sz); - _fp_mul(pfpsd, &us1, &us2, &ust); - _fp_pack(pfpsd, &ust, nrd, fmau_inst->sz); - if ((pfpsd->fp_current_exceptions & fsr.tem) == 0) { - fmau_mul_exceptions = - pfpsd->fp_current_exceptions; - pfpsd->fp_current_exceptions = 0; - _fp_unpack(pfpsd, &us3, nrs3, fmau_inst->sz); - _fp_unpack(pfpsd, &ust, nrd, fmau_inst->sz); - if (ust.fpclass != fp_quiet && - ust.fpclass != fp_signaling) - ust.sign ^= 1; - _fp_sub(pfpsd, &ust, &us3, &ud); - /* ensure QSNaN1 has precedence over QNaN3 */ - if ((us3.fpclass == fp_quiet) && - ((us1.fpclass == fp_signaling) || - (us2.fpclass == fp_signaling))) - ud = ust; - _fp_pack(pfpsd, &ud, nrd, fmau_inst->sz); - } - FPUINFO_KSTAT_PREC(fmau_inst->sz, fpu_sim_fnumadds, - fpu_sim_fnumaddd, fpu_sim_invalid); - break; - case fnmsub: - _fp_unpack(pfpsd, &us1, nrs1, fmau_inst->sz); - _fp_unpack(pfpsd, &us2, nrs2, fmau_inst->sz); - _fp_mul(pfpsd, &us1, &us2, &ust); - _fp_pack(pfpsd, &ust, nrd, fmau_inst->sz); - if ((pfpsd->fp_current_exceptions & fsr.tem) == 0) { - fmau_mul_exceptions = - pfpsd->fp_current_exceptions; - pfpsd->fp_current_exceptions = 0; - _fp_unpack(pfpsd, &us3, nrs3, fmau_inst->sz); - _fp_unpack(pfpsd, &ust, nrd, fmau_inst->sz); - if (ust.fpclass != fp_quiet && - ust.fpclass != fp_signaling) - ust.sign ^= 1; - _fp_add(pfpsd, &ust, &us3, &ud); - /* ensure QSNaN1 has precedence over QNaN3 */ - if ((us3.fpclass == fp_quiet) && - ((us1.fpclass == fp_signaling) || - (us2.fpclass == fp_signaling))) - ud = ust; - _fp_pack(pfpsd, &ud, nrd, fmau_inst->sz); - } - FPUINFO_KSTAT_PREC(fmau_inst->sz, fpu_sim_fnumsubs, - fpu_sim_fnumsubd, fpu_sim_invalid); - } } else { nfcc = nrd & 0x3; if (inst.op3 == 0x35) { /* fpop2 */ @@ -645,7 +520,7 @@ *pfsr = fsr; return (ftt_ieee); } else { /* Just set accrued exception field. 
*/ - fsr.aexc |= pfpsd->fp_current_exceptions | fmau_mul_exceptions; + fsr.aexc |= pfpsd->fp_current_exceptions; } *pfsr = fsr; return (ftt_none); @@ -697,7 +572,7 @@ return (ftt); } else if ((fp.inst.hibits == 2) && ((fp.inst.op3 == 0x34) || (fp.inst.op3 == 0x35) || - (fp.inst.op3 == 0x37) || (fp.inst.op3 == 0x3f))) { + (fp.inst.op3 == 0x37))) { ftt = _fp_fpu_simulator(pfpsd, fp.inst, pfsr, gsr); if (ftt == ftt_none || ftt == ftt_ieee) { pregs->r_pc = pregs->r_npc; @@ -776,7 +651,7 @@ if ((fp.inst.hibits == 2) && ((fp.inst.op3 == 0x34) || (fp.inst.op3 == 0x35) || - (fp.inst.op3 == 0x37) || (fp.inst.op3 == 0x3f))) { + (fp.inst.op3 == 0x37))) { ftt = _fp_fpu_simulator(pfpsd, fp.inst, (fsr_type *)&tfsr, gsr); /* Do not retry emulated instruction. */ pregs->r_pc = pregs->r_npc; @@ -816,7 +691,7 @@ return (ftt); if ((fp.inst.hibits == 2) && /* fpops */ ((fp.inst.op3 == 0x34) || (fp.inst.op3 == 0x35) || - (fp.inst.op3 == 0x37) || (fp.inst.op3 == 0x3f))) { + (fp.inst.op3 == 0x37))) { ftt = _fp_fpu_simulator(pfpsd, fp.inst, (fsr_type *)&tfsr, gsr); /* Do not retry emulated instruction. */ pfpu->fpu_fsr = tfsr;
--- a/usr/src/uts/sparc/sys/fpu/fpu_simulator.h Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sparc/sys/fpu/fpu_simulator.h Thu Aug 06 17:39:39 2009 -0700 @@ -204,7 +204,7 @@ ft_op_38 = 0x38, fp_op_39 = 0x39, fp_op_3a = 0x3a, fp_op_3b = 0x3b, fp_op_3c = 0x3c, - fp_op_3d = 0x3d, fp_op_3e = 0x3e, fmau = 0x3f + fp_op_3d = 0x3d, fp_op_3e = 0x3e, fp_op_3f = 0x3f }; typedef /* FPU instruction. */ @@ -219,14 +219,14 @@ uint32_t rs2 : 5; /* Second operand. */ } fp_inst_type; -enum fp_op_fma_var { /* FMA-fused/unfused instr. variations */ +enum fp_op_fma_var { /* IMPDEP2B FMA-fused instr. variations */ fmadd = 0, fmsub = 1, fnmsub = 2, fnmadd = 3 }; -typedef /* FPU FMA-fused/unfused instructions. */ +typedef /* IMPDEP2B FPU FMA-fused instruction. */ struct { uint32_t hibits : 2; /* Top two bits. */ uint32_t rd : 5; /* Destination. */ @@ -330,14 +330,6 @@ struct kstat_named fpu_sim_fnmaddd; struct kstat_named fpu_sim_fnmsubs; struct kstat_named fpu_sim_fnmsubd; - struct kstat_named fpu_sim_fumadds; - struct kstat_named fpu_sim_fumaddd; - struct kstat_named fpu_sim_fumsubs; - struct kstat_named fpu_sim_fumsubd; - struct kstat_named fpu_sim_fnumadds; - struct kstat_named fpu_sim_fnumaddd; - struct kstat_named fpu_sim_fnumsubs; - struct kstat_named fpu_sim_fnumsubd; struct kstat_named fpu_sim_invalid; };
--- a/usr/src/uts/sun4/os/startup.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4/os/startup.c Thu Aug 06 17:39:39 2009 -0700 @@ -896,7 +896,7 @@ PRM_DEBUG(kmem64_pabase); PRM_DEBUG(kmem64_szc); sfmmu_memtte(&tte, kmem64_pabase >> MMU_PAGESHIFT, - PROC_DATA | HAT_NOSYNC | HAT_ATTR_NOSOFTEXEC, kmem64_szc); + PROC_DATA | HAT_NOSYNC, kmem64_szc); PRM_DEBUG(tte.ll); (void) sprintf(b, kmem64_obp_str, kmem64_base, kmem64_end, TTE_PAGEMASK(kmem64_szc), tte.ll); @@ -2850,8 +2850,6 @@ "h# %p constant KCONTEXT " "h# %p constant KHATID " "h# %x constant ASI_MEM " - "h# %x constant SOFTEXEC " - "h# %x constant EXECPRM " ": PHYS-X@ ( phys -- data ) " " ASI_MEM spacex@ " @@ -2954,11 +2952,7 @@ " ?dup if ( addr sfmmup hmeblkp ) " " nip swap HBLK_TO_TTEP ( ttep ) " " dup TTE_IS_VALID if ( valid-ttep ) " - " PHYS-X@ ( tte-data ) " - " dup SOFTEXEC and 0> if ( tte-data ) " - " SOFTEXEC - EXECPRM or ( tte-data ) " - " then ( tte-data ) " - " true ( tte-data true ) " + " PHYS-X@ true ( tte-data true ) " " else ( invalid-tte ) " " drop false ( false ) " " then ( false | tte-data true ) " @@ -3009,9 +3003,7 @@ KHMEHASH_SZ, KCONTEXT, KHATID, - ASI_MEM, - icache_is_coherent ? 0 : TTE_SOFTEXEC_INT, - TTE_EXECPRM_INT); + ASI_MEM); prom_interpret(bp, 0, 0, 0, 0, 0); kobj_free(bp, MMU_PAGESIZE);
--- a/usr/src/uts/sun4/vm/sfmmu.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4/vm/sfmmu.c Thu Aug 06 17:39:39 2009 -0700 @@ -199,10 +199,6 @@ sfmmu_patch_shctx(); } - if (&mmu_enable_pgsz_search) { - mmu_enable_pgsz_search(); - } - /* * The 8K-indexed kernel TSB space is used to hold * translations below...
--- a/usr/src/uts/sun4/vm/vm_dep.h Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4/vm/vm_dep.h Thu Aug 06 17:39:39 2009 -0700 @@ -859,16 +859,6 @@ extern size_t ndata_maxsize(struct memlist *); extern size_t ndata_spare(struct memlist *, size_t, size_t); -/* - * Platform specific support for non-coherent I-cache and soft exec - */ -extern uint_t icache_is_coherent; -extern uint_t force_sync_icache_after_bcopy; -extern uint_t force_sync_icache_after_dma; - -extern void mach_setup_icache(uint_t); -#pragma weak mach_setup_icache - #ifdef __cplusplus } #endif
--- a/usr/src/uts/sun4u/sys/pte.h Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4u/sys/pte.h Thu Aug 06 17:39:39 2009 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -354,23 +354,6 @@ #endif /* !_ASM */ -/* - * There is no support for non-coherent I-cache in sun4u - */ -#define TTE_SOFTEXEC_INT 0x00000000 -#ifndef _ASM -#ifdef lint -/* fix lint warnings about constant conditionals and empty if */ -#define TTE_IS_SOFTEXEC(ttep) TTE_IS_EXECUTABLE(ttep) -#define TTE_SET_SOFTEXEC(ttep) TTE_SET_EXEC(ttep) -#define TTE_CLR_SOFTEXEC(ttep) TTE_CLR_EXEC(ttep) -#else -#define TTE_IS_SOFTEXEC(ttep) (0) -#define TTE_SET_SOFTEXEC(ttep) -#define TTE_CLR_SOFTEXEC(ttep) -#endif /* lint */ -#endif /* !_ASM */ - #ifdef __cplusplus } #endif
--- a/usr/src/uts/sun4u/vm/mach_sfmmu.h Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4u/vm/mach_sfmmu.h Thu Aug 06 17:39:39 2009 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -112,12 +112,6 @@ /* END CSTYLED */ /* - * This macro is to control the pagesizes used for shared context on - * Rock systems. - */ -#define CHECK_SHARED_PGSZ(tsbarea, tte, tmp, use_shctx, label) - -/* * This macro is used in the MMU code to check if TL should be lowered from * 2 to 1 to pop trapstat's state. See the block comment in trapstat.c * for details. @@ -267,12 +261,6 @@ /* CSTYLED */ \ label/**/1: -/* - * No support for non-coherent I-cache in sun4u - */ -#define TTE_SET_EXEC_ML(tte, ttepa, tmp1, label) -#define TTE_CLR_SOFTEXEC_ML(tte) -#define TTE_CHK_SOFTEXEC_ML(tte) andcc tte, 0, %g0 /* * TTE_SET_REF_ML is a macro that updates the reference bit if it is
--- a/usr/src/uts/sun4v/Makefile.files Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/Makefile.files Thu Aug 06 17:39:39 2009 -0700 @@ -179,7 +179,6 @@ # NI_PCBE_OBJS = niagara_pcbe.o N2_PCBE_OBJS = niagara2_pcbe.o -RK_PCBE_OBJS = rock_pcbe.o # # cpu modules @@ -190,7 +189,6 @@ NIAGARACPU_OBJS += niagara_asm.o atomic.o NIAGARA2CPU_OBJS = niagara2.o niagara_copy.o common_asm.o niagara_perfctr.o NIAGARA2CPU_OBJS += niagara2_asm.o atomic.o -ROCKCPU_OBJS = rock.o rock_copy.o common_asm.o rock_asm.o atomic.o # # platform module
--- a/usr/src/uts/sun4v/Makefile.sun4v.shared Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/Makefile.sun4v.shared Thu Aug 06 17:39:39 2009 -0700 @@ -433,9 +433,9 @@ # # cpu modules # -CPU_KMODS += generic niagara niagara2 vfalls rock +CPU_KMODS += generic niagara niagara2 vfalls -LINT_CPU_KMODS += generic rock +LINT_CPU_KMODS += generic # # Performance Counter BackEnd Modules (/usr/kernel/pcbe): @@ -443,4 +443,3 @@ PCBE_KMODS += niagara_pcbe PCBE_KMODS += niagara2_pcbe PCBE_KMODS += vfalls_pcbe -PCBE_KMODS += rock_pcbe
--- a/usr/src/uts/sun4v/cpu/rock.c Thu Aug 06 17:19:00 2009 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1014 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#include <sys/types.h> -#include <sys/systm.h> -#include <sys/archsystm.h> -#include <sys/machparam.h> -#include <sys/machsystm.h> -#include <sys/cpu.h> -#include <sys/elf_SPARC.h> -#include <vm/page.h> -#include <vm/vm_dep.h> -#include <sys/cpuvar.h> -#include <sys/async.h> -#include <sys/cmn_err.h> -#include <sys/debug.h> -#include <sys/dditypes.h> -#include <sys/sunddi.h> -#include <sys/cpu_module.h> -#include <sys/prom_debug.h> -#include <sys/vmsystm.h> -#include <sys/prom_plat.h> -#include <sys/sysmacros.h> -#include <sys/intreg.h> -#include <sys/machtrap.h> -#include <sys/ontrap.h> -#include <sys/ivintr.h> -#include <sys/atomic.h> -#include <sys/panic.h> -#include <sys/dtrace.h> -#include <vm/seg_spt.h> -#include <sys/hypervisor_api.h> -#include <sys/rock_hypervisor_api.h> -#include <sys/hsvc.h> -#include <vm/hat_sfmmu.h> -#include <sys/mutex_impl.h> - -uint_t root_phys_addr_lo_mask = 0xffffffffU; -uint8_t enable_tm = 1; - -char cpu_module_name[] = "SUNW,UltraSPARC-AT10"; -boolean_t hsvc_tm_available = B_TRUE; - -static hsvc_info_t rock_tm_hsvc = { - HSVC_REV_1, /* HSVC rev num */ - NULL, /* Private */ - HSVC_GROUP_TM, /* Requested API Group */ - ROCK_HSVC_MAJOR, /* Requested Major */ - ROCK_HSVC_MINOR, /* Requested Minor */ - cpu_module_name /* Module name */ -}; - -boolean_t hsvc_mmu_ext_available = B_TRUE; - -static hsvc_info_t rock_mmu_ext_hsvc = { - HSVC_REV_1, /* HSVC rev num */ - NULL, /* Private */ - HSVC_GROUP_RKMMU_EXT, /* Requested API Group */ - ROCK_HSVC_MAJOR, /* Requested Major */ - ROCK_HSVC_MINOR, /* Requested Minor */ - cpu_module_name /* Module name */ -}; - -static void encode_pgsz_order(uint64_t, int, int, uint16_t *, uchar_t *); -static void set_pgsz_order(uchar_t, uchar_t, uint64_t *, int *, int *, - sfmmu_t *); - -extern void rock_mutex_delay(void); - -/* - * External /etc/system tunable, for controlling whether shared or private pages - * come first in the pagesize order register. 
- */ -int pgsz_order_shared_first = 1; - -#define MCOREID_MASK 0x1E -#define MCOREID_SHIFT 1 - -static uint_t mmu_disable_large_pages = ((1 << TTE512K) | (1 << TTE32M) | - (1 << TTE2G) | (1 << TTE16G)); -static uint_t mmu_disable_ism_large_pages = ((1 << TTE512K) | (1 << TTE32M) | - (1 << TTE2G) | (1 << TTE16G)); -static uint_t mmu_disable_auto_data_large_pages = ((1 << TTE512K) | - (1 << TTE32M) | (1 << TTE2G) | (1 << TTE16G)); -static uint_t mmu_disable_auto_text_large_pages = ((1 << TTE512K) | - (1 << TTE32M) | (1 << TTE2G) | (1 << TTE16G)); - -void -cpu_setup(void) -{ - extern int cpc_has_overflow_intr; - uint64_t sup_minor; - int status; - - /* - * The setup common to all CPU modules is done in cpu_setup_common - * routine. - */ - cpu_setup_common(NULL); - - /* - * Rock's max nctxs is 64K. Set it accordingly. - */ - nctxs = MAX_NCTXS; - - /* - * Rock I$ is non-coherent. - */ - mach_setup_icache(0); - -#ifdef DEBUG - /* - * These should always be present on Rock - */ - if (cpu_hwcap_flags == 0) - cmn_err(CE_WARN, "hwcap-list missing from MD"); -#endif - cpu_hwcap_flags |= AV_SPARC_ASI_CACHE_SPARING; - - cache |= (CACHE_PTAG | CACHE_IOCOHERENT); - - if (use_page_coloring) { - do_pg_coloring = 1; - } - - /* - * Rock generates hpriv performance event trap instead of pic overflow - * trap. To get the attention of the guest hv in-turn generates pic - * overflow trap. Therefore enable support for that. - */ - cpc_has_overflow_intr = 1; - - /* - * Enable 4M pages for OOB. - */ - max_uheap_lpsize = MMU_PAGESIZE4M; - max_ustack_lpsize = MMU_PAGESIZE4M; - max_privmap_lpsize = MMU_PAGESIZE4M; - - /* - * hv_tm_enable is a part of TM group. We need to - * negotiate that API group before we can use it. 
- */ - status = hsvc_register(&rock_tm_hsvc, &sup_minor); - if ((status != 0) || (sup_minor < (uint64_t)ROCK_HSVC_MINOR)) { - cmn_err(CE_WARN, "%s cannot negotiate hypervisor services: " - "major: 0x%lx minor: 0x%lx group: 0x%x errno: %d", - cpu_module_name, rock_tm_hsvc.hsvc_major, - rock_tm_hsvc.hsvc_minor, HSVC_GROUP_TM, status); - hsvc_tm_available = B_FALSE; - } - - /* - * Negotiate API group for rock mmu extensions. - */ - status = hsvc_register(&rock_mmu_ext_hsvc, &sup_minor); - if ((status != 0) || (sup_minor < - (uint64_t)ROCK_HSVC_MINOR)) { - cmn_err(CE_WARN, "%s cannot negotiate hypervisor services: " - "major: 0x%lx minor: 0x%lx group: 0x%x errno: %d", - cpu_module_name, rock_mmu_ext_hsvc.hsvc_major, - rock_mmu_ext_hsvc.hsvc_minor, HSVC_GROUP_RKMMU_EXT, - status); - hsvc_mmu_ext_available = B_FALSE; - } -} - -/* - * Set the magic constants of the implementation. - */ -void -cpu_fiximp(struct cpu_node *cpunode) -{ - /* - * The Cache node is optional in MD. Therefore in case it - * does not exist, use hardcoded values. - */ -#ifdef DEBUG - /* - * ...that said, we do want this info to come from the MD. 
- */ - if (cpunode->ecache_size == 0 || cpunode->ecache_linesize == 0 || - cpunode->ecache_associativity == 0) { - cmn_err(CE_WARN, "ecache info missing from MD"); - } -#endif - if (cpunode->ecache_size == 0) - cpunode->ecache_size = 2 * 1024 * 1024; - if (cpunode->ecache_linesize == 0) - cpunode->ecache_linesize = 64; - if (cpunode->ecache_associativity == 0) - cpunode->ecache_associativity = 8; -} - -void -dtrace_flush_sec(uintptr_t addr) -{ - pfn_t pfn; - proc_t *procp = ttoproc(curthread); - page_t *pp; - caddr_t va; - - pfn = hat_getpfnum(procp->p_as->a_hat, (void *)addr); - if (pfn != -1) { - ASSERT(pf_is_memory(pfn)); - pp = page_numtopp_noreclaim(pfn, SE_SHARED); - if (pp != NULL) { - va = ppmapin(pp, PROT_READ | PROT_WRITE, (void *)addr); - /* sparc needs 8-byte align */ - doflush((caddr_t)((uintptr_t)va & -8l)); - ppmapout(va); - page_unlock(pp); - } - } -} - -void -cpu_map_exec_units(struct cpu *cp) -{ - ASSERT(MUTEX_HELD(&cpu_lock)); - - /* - * The cpu_ipipe and cpu_fpu fields are initialized based on - * the execution unit sharing information from the MD. They - * default to the CPU id in the absence of such information. - */ - cp->cpu_m.cpu_ipipe = cpunodes[cp->cpu_id].exec_unit_mapping; - if (cp->cpu_m.cpu_ipipe == NO_EU_MAPPING_FOUND) - cp->cpu_m.cpu_ipipe = (id_t)(cp->cpu_id); - - cp->cpu_m.cpu_fpu = cpunodes[cp->cpu_id].fpu_mapping; - if (cp->cpu_m.cpu_fpu == NO_EU_MAPPING_FOUND) - cp->cpu_m.cpu_fpu = (id_t)(cp->cpu_id); - - cp->cpu_m.cpu_core = (cp->cpu_id & MCOREID_MASK) >> MCOREID_SHIFT; - - /* - * The cpu_chip field is initialized based on the information - * in the MD and assume that all cpus within a chip - * share the same L2 cache. If no such info is available, we - * set the cpu to CPU_CHIPID_INVALID. 
- */ - cp->cpu_m.cpu_mpipe = cpunodes[cp->cpu_id].l2_cache_mapping; - if (cp->cpu_m.cpu_mpipe == NO_L2_CACHE_MAPPING_FOUND) - cp->cpu_m.cpu_mpipe = CPU_L2_CACHEID_INVALID; - - cp->cpu_m.cpu_chip = cpunodes[cp->cpu_id].l2_cache_mapping; - if (cp->cpu_m.cpu_chip == NO_L2_CACHE_MAPPING_FOUND) - cp->cpu_m.cpu_chip = CPU_CHIPID_INVALID; -} - -void -cpu_init_private(struct cpu *cp) -{ - cpu_map_exec_units(cp); - mutex_delay = rock_mutex_delay; -} - -/*ARGSUSED*/ -void -cpu_uninit_private(struct cpu *cp) -{ -} - -/* - * cpu_feature_init - * - * This function is called once per strand. - */ -void -cpu_feature_init(void) -{ - static int set_mutex_backoff_tunables = 0; - /* - * Set constants for mutex_backoff only once. - * On Rock, setting this to 8 gives the best performance, - * even for multi-chip systems. - */ - if (! set_mutex_backoff_tunables) { - mutex_backoff_base = 1; - mutex_cap_factor = 8; - set_mutex_backoff_tunables = 1; - } - - /* - * Enable or disable for each cpu if hypervisor API is negotiated. - */ - if (hsvc_tm_available == B_TRUE) - (void) hv_tm_enable((uint64_t)enable_tm); -} - -/* - * Flush specified address range from I$ via hv_mem_iflush interface - * Note that the hypervisor interface expects physical address range - * and can flush less than the requested size. - */ - -void -rock_sync_icache(caddr_t addr, size_t size) -{ - uint64_t pa, i, flushlen, flushed; - - if (!force_sync_icache_after_bcopy) - /* - * Do not clear the I-cache after bcopy. - * The default value is 0. This flag made be - * set via /etc/system. - */ - return; - - if (!tba_taken_over) - /* - * Very early in boot, va_to_pa() will try to call back - * into OBP. Very *very* early in boot, this will fail - * because we haven't set up the OBP callback handler. - * (Without this check, kmdb boot will fail.) - */ - return; - - for (i = 0; i < size; i += flushed) { - pa = va_to_pa(addr + i); - ASSERT(pa != -1); - - /* - * Only flush the required length up to a PAGESIZE. 
- */ - - flushlen = MIN((size - i), (PAGESIZE - (pa & MMU_PAGEOFFSET))); - - /* - * Flush I$ up to the page bounday. This call should never - * fail. If it does, we panic the system as I$ may contain - * stale instructions, which can result in silent data - * corruption. - */ - - if (hv_mem_iflush(pa, flushlen, &flushed) != H_EOK) { - cmn_err(CE_PANIC, "Flushing the Icache failed"); - } - - } -} - -/* - * There are no Hypervisor trapstat(1m) interfaces for Rock - * If trapstat(1m) wants to do its thing, it will have to - * take over all TLB miss handling. - */ -int -cpu_trapstat_conf(int cmd) -{ - int status; - - switch (cmd) { - case CPU_TSTATCONF_INIT: - case CPU_TSTATCONF_FINI: - case CPU_TSTATCONF_ENABLE: - case CPU_TSTATCONF_DISABLE: - status = ENOTSUP; - break; - default: - status = EINVAL; - break; - } - return (status); -} - -/*ARGSUSED*/ -void -cpu_trapstat_data(void *buf, uint_t tstat_pgszs) -{ -} - -#define MAX_PAGE_COLORS (1 << MAX_PAGE_COLORS_SHIFT) -#define MAX_PAGE_COLORS_SHIFT (5) - -/*ARGSUSED*/ -uint_t -page_pfn_2_color_cpu(pfn_t pfn, uchar_t szc, void *cookie) -{ - uint_t color; - - pfn = PFN_BASE(pfn, szc); - color = pfn ^ (pfn >> 20); - color = color ^ (color >> 10); - return ((color ^ (color >> 5)) & 0x1f); -} - -/* - * this macro rotates value "x" n steps to the right - * mask consists of "n + m" bits - * ASSERT(x < (1 << (n + m)); - */ -#define ROTATE_BITS(x, n, m) (((x) >> (n)) | (((x) & ((1 << (n)) - 1)) << m)) - - -uchar_t clr2sqnclr_table[MMU_PAGE_SIZES][MAX_PAGE_COLORS]; - -/* - * on Rock, the hash cache index is calculated as follows: - * pa[47:43]^pa[42:38]^pa[37:33]^pa[32:28]^ - * pa[27:23]^pa[22:18]^pa[17:13].pa[12:6] - * That is, every 5 bits is folded and XORd together. Page sizes - * differ by 3 bits, which is a factor of 8. This function computes - * the next sequential color by rotating by 3 steps within a field of 5 bits - * for every page size. 
- */ -void -clr2sqnclr_table_init() -{ - uchar_t szc; - uint_t color; - uint_t rot = 0; - - for (szc = 0; szc < MMU_PAGE_SIZES; szc++) { - rot = (szc * 3) % MAX_PAGE_COLORS_SHIFT; - for (color = 0; color < MAX_PAGE_COLORS; color++) { - clr2sqnclr_table[szc][color] = - ROTATE_BITS(color, rot, - (MAX_PAGE_COLORS_SHIFT - rot)); - } - } -} - -uint_t -clr2sqnclr(uchar_t szc, uint_t color) -{ - ASSERT(szc < MMU_PAGE_SIZES); - ASSERT(color < MAX_PAGE_COLORS); - - return (clr2sqnclr_table[szc][color]); -} - -#if MMU_PAGE_SIZES > 8 -#error MMU_PAGE_SIZES can be at most 8 -#endif - -uint_t -page_get_nsz_color_mask_cpu(uchar_t szc, uint_t mask) -{ - static uint_t rock_color_masks[7] = {0x18, 6, 0x11, 0xc, 3, 0x18, 6}; - - ASSERT(szc < MMU_PAGE_SIZES - 1); - return (mask & rock_color_masks[szc]); -} - -/*ARGSUSED*/ -uint_t -page_get_nsz_color_cpu(uchar_t szc, uint_t color) -{ - return (color); -} - -uint_t -page_get_color_shift_cpu(uchar_t szc, uchar_t nszc) -{ - ASSERT(nszc >= szc); - return (0); -} - -/*ARGSUSED*/ -pfn_t -page_next_pfn_for_color_cpu(pfn_t pfn, uchar_t szc, uint_t color, - uint_t ceq_mask, uint_t color_mask, void *cookie) -{ - uint_t sqn_ceq_mask = clr2sqnclr(szc, ceq_mask); - uint_t sqn_color = clr2sqnclr(szc, color); - uint_t pfn_shift = PNUM_SHIFT(szc); - pfn_t cpfn, npfn, base_pfn = pfn & (~(pfn_t)color_mask << pfn_shift); - uint_t base_sqn_color, nsqn_color, wrap = 0; - - ASSERT((color & ~ceq_mask) == 0); - - base_sqn_color = clr2sqnclr(szc, - page_pfn_2_color_cpu(base_pfn, szc, NULL)) ^ sqn_color; - nsqn_color = base_sqn_color; - - cpfn = (pfn_t)-1L; - do { - npfn = base_pfn | (nsqn_color << pfn_shift); - - ASSERT(((page_pfn_2_color_cpu(npfn, szc, NULL) ^ color) & - ceq_mask) == 0); - - if (npfn > pfn && npfn < cpfn) - cpfn = npfn; - - nsqn_color = INC_MASKED(nsqn_color, sqn_ceq_mask, color_mask); - if (nsqn_color != base_sqn_color) - continue; - - if (cpfn != (pfn_t)-1L) - break; - - base_pfn += ((pfn_t)color_mask + 1) << pfn_shift; - - base_sqn_color 
= clr2sqnclr(szc, - page_pfn_2_color_cpu(base_pfn, szc, NULL)) ^ sqn_color; - nsqn_color = base_sqn_color; - wrap++; - - } while (nsqn_color != base_sqn_color || wrap < 2); - - ASSERT(cpfn != (pfn_t)-1L); - - return (cpfn); -} - -void -page_coloring_init_cpu() -{ - int i; - uint_t colors = 1 << MAX_PAGE_COLORS_SHIFT; - - for (i = 0; i < mmu_page_sizes; i++) { - hw_page_array[i].hp_colors = colors; - } - - /* - * initialise conversion table between page colors and - * sequential colors - */ - clr2sqnclr_table_init(); - -} - -/* - * group colorequiv colors on Rock by low order bits of the color first - */ -void -page_set_colorequiv_arr_cpu(void) -{ - static uint_t nequiv_shades_log2[MMU_PAGE_SIZES] = {0, 3, 0, 0, 0, 0}; - - if (colorequiv > 1) { - int i; - uint_t sv_a = lowbit(colorequiv) - 1; - - if (sv_a > 15) - sv_a = 15; - - for (i = 0; i < MMU_PAGE_SIZES; i++) { - uint_t colors; - uint_t a = sv_a; - - if ((colors = hw_page_array[i].hp_colors) <= 1) - continue; - while ((colors >> a) == 0) - a--; - if (a > (colorequivszc[i] & 0xf) + - (colorequivszc[i] >> 4)) { - if (a <= nequiv_shades_log2[i]) { - colorequivszc[i] = (uchar_t)a; - } else { - colorequivszc[i] = - ((a - nequiv_shades_log2[i]) << 4) | - nequiv_shades_log2[i]; - } - } - } - } -} - -/* - * Calculate the page sizes needed to program Rock TLB page size register. - * The invctx parameter is a flag which indicates that it will be necessary to - * synchronize by invalidating contexts if the sfmmu pagesize register is - * updated. 
- */ -void -mmu_set_pgsz_order(sfmmu_t *sfmmup, int invctx) -{ - uchar_t private_pgsz_mask; - uchar_t shared_pgsz_mask; - uint16_t pgsz_order_hv[MAX_PGSZ_SEARCH_ORDER]; - uint64_t pgsz_order = 0; - uchar_t pgsz_map = 0; - int private_pgsz_num = 0; - int shared_pgsz_num = 0; - int tot_pgsz_num; - sf_scd_t *scdp; - int ret; - int i; - - /* - * The hatlock must be held in all cases except when the sfmmu is - * being initialized by hat_alloc() or we are calling hat_dup(), in - * these cases no other thread will be using the sfmmu yet. - */ - - ASSERT(!invctx || sfmmu_hat_lock_held(sfmmup)); - - if (pgsz_search_on == 0) - return; - - /* Always enable 8K private mappings */ - private_pgsz_mask = 1 << TTE8K; - - /* Enable 64K private mappings unless specifically disabled */ - if (!(disable_large_pages & (1 << TTE64K))) { - private_pgsz_mask |= 1 << TTE64K; - } - - /* - * First check for ISM segments not in an SCD. The algorithm for - * creating an SCD is to create one when an (D)ISM segment is attached - * unless the process's shared segments are a subset of an SCD which - * already exists. - * - * This situation also arises when we attach to more than the maximum - * number of (D)ISM segments defined in the region bit map - * (currently 64). - * - * We have set mmu_disable_ism_large_pages to force ISM segments to use - * only 4M and 256M pages. - */ - if (SFMMU_FLAGS_ISSET(sfmmup, HAT_ISMNOTINSCD)) { - private_pgsz_mask |= 1 << TTE4M; - if (SFMMU_FLAGS_ISSET(sfmmup, HAT_256M_ISM)) { - private_pgsz_mask |= 1 << TTE256M; - } - } - - /* Now check for regions not included in the SCD. 
*/ - if ((scdp = sfmmup->sfmmu_scdp) != NULL) { - SF_RGNMAP_EQUAL(&scdp->scd_hmeregion_map, - &sfmmup->sfmmu_hmeregion_map, - SFMMU_HMERGNMAP_WORDS, ret); - if (!ret) { - private_pgsz_mask |= sfmmup->sfmmu_rtteflags; - } - } else { - private_pgsz_mask |= sfmmup->sfmmu_rtteflags; - } - - private_pgsz_mask |= sfmmup->sfmmu_tteflags; - - /* - * If the process is part of an SCD then enable 4M and 256M shared - * page sizes - unless these are specifically disabled. If the 4M - * shared page size is specifically disabled and the process has (D)ISM - * segments attached or 4M regions then enable the private 4M page size. - * If the 256M shared page size is disabled and the process has a 256M - * page size region then enable the 256M private page size. The trap - * handler looks at the shared page sizes enabled and if a shared - * mapping does not correspond to one these sizes then it is treated - * as a private mapping. - * - * The SCD includes the process's main text segment and (D)ISM segments - * but we only enable the 4M shared page size so an 8K main text - * segment will be treated as private due to the trap handler support. - * - * Note that for simplicity the ordering of the shared page sizes is - * hard coded. 
- */ - shared_pgsz_mask = 0; - if (sfmmup->sfmmu_scdp != NULL) { - if (!(disable_shctx_large_pages & (1 << TTE4M))) { - shared_pgsz_mask |= 1 << TTE4M; - } else if (sfmmup->sfmmu_iblk != NULL || - (sfmmup->sfmmu_rtteflags & - (1 << TTE4M))) { - private_pgsz_mask |= 1 << TTE4M; - } - - if (SFMMU_FLAGS_ISSET(sfmmup, HAT_256M_ISM) || - (sfmmup->sfmmu_rtteflags & (1 << TTE256M))) { - if (!(disable_shctx_large_pages & (1 << TTE256M))) { - shared_pgsz_mask |= 1 << TTE256M; - } else { - private_pgsz_mask |= 1 << TTE256M; - } - } - } - - set_pgsz_order(private_pgsz_mask, shared_pgsz_mask, &pgsz_order, - &private_pgsz_num, &shared_pgsz_num, sfmmup); - - encode_pgsz_order(pgsz_order, private_pgsz_num, shared_pgsz_num, - pgsz_order_hv, &pgsz_map); - - tot_pgsz_num = private_pgsz_num + shared_pgsz_num; - ASSERT(tot_pgsz_num <= MAX_PGSZ_SEARCH_ORDER); - - for (i = 0; i < tot_pgsz_num; i++) { - if (pgsz_order_hv[i] != sfmmup->sfmmu_pgsz_order_hv[i]) - break; - } - - /* - * If either we've reached the maximum number of page sizes or the - * next element is 0, indicating the end of the list, then both the - * entries and their number in both arrays is the same and we return. - */ - if ((i == tot_pgsz_num) && (i == MAX_PGSZ_SEARCH_ORDER || - sfmmup->sfmmu_pgsz_order_hv[i] == 0)) { - ASSERT(pgsz_map == sfmmup->sfmmu_pgsz_map); - return; - } - - /* Otherwise update the sw page size register setting */ - if (invctx) { - sfmmu_invalidate_ctx(sfmmup); - } - - for (i = 0; i < tot_pgsz_num; i++) { - sfmmup->sfmmu_pgsz_order_hv[i] = pgsz_order_hv[i]; - } - - /* Disable next entry in search list to mark the end */ - if (i < MAX_PGSZ_SEARCH_ORDER) { - sfmmup->sfmmu_pgsz_order_hv[i] = 0; - } - sfmmup->sfmmu_pgsz_map = pgsz_map; -} - -/* - * Encode the Rock TLB page size register. - * - * Input: - * pgsz_order, ordered list of page sizes, private and shared, the order - * between these depends on the pgsz_order_shared_first config variable. - * private_pgsz_num, number of private page sizes. 
- * shared_pgsz_num, number of shared page sizes. - * Output: - * pgsz_order_hv contains the encoded pagesize search order for the hv - * pgsz_map field contains the page size bit map used by the trap - * handler to prevent unauthorized shared page sizes being used. - */ - -static void -encode_pgsz_order(uint64_t pgsz_order, int private_pgsz_num, - int shared_pgsz_num, uint16_t *pgsz_order_hv, uchar_t *pgsz_map) -{ - int i; - int tot_pgsz_num; - uint16_t pgsz_entry; - uint16_t first_entry_mask, second_entry_mask; - int first_pgsz_num; - - ASSERT(private_pgsz_num < MMU_PAGE_SIZES); - ASSERT(shared_pgsz_num < MMU_PAGE_SIZES); - ASSERT(private_pgsz_num > 0); - - if (pgsz_order_shared_first) { - first_entry_mask = TLB_PGSZ_CONTEXT1_ENABLE; - second_entry_mask = TLB_PGSZ_ENABLE; - first_pgsz_num = shared_pgsz_num; - } else { - first_entry_mask = TLB_PGSZ_ENABLE; - second_entry_mask = TLB_PGSZ_CONTEXT1_ENABLE; - first_pgsz_num = private_pgsz_num; - } - - tot_pgsz_num = private_pgsz_num + shared_pgsz_num; - for (i = 0; i < tot_pgsz_num; i++) { - pgsz_entry = pgsz_order & TTE_SZ_BITS; - if (i < first_pgsz_num) { - if (pgsz_order_shared_first) { - *pgsz_map |= (1 << pgsz_entry); - } - pgsz_entry |= first_entry_mask; - } else { - if (!pgsz_order_shared_first) { - *pgsz_map |= (1 << pgsz_entry); - } - pgsz_entry |= second_entry_mask; - } - pgsz_order >>= 4; - pgsz_order_hv[i] = pgsz_entry; - } -} - -/* - * The function returns the mmu-specific values for the - * hat's disable_large_pages, disable_ism_large_pages, and - * disable_auto_data_large_pages and - * disable_text_data_large_pages variables. 
- */ -uint_t -mmu_large_pages_disabled(uint_t flag) -{ - uint_t pages_disable = 0; - - if (flag == HAT_LOAD) { - pages_disable = mmu_disable_large_pages; - } else if (flag == HAT_LOAD_SHARE) { - pages_disable = mmu_disable_ism_large_pages; - } else if (flag == HAT_AUTO_DATA) { - pages_disable = mmu_disable_auto_data_large_pages; - } else if (flag == HAT_AUTO_TEXT) { - pages_disable = mmu_disable_auto_text_large_pages; - } - return (pages_disable); -} - -/* - * Uses private and shared page size bitmaps to produce an ordered list - * of page sizes and counts to be passed to encode_pgsz_order(). - * - * Input: - * private_pgsz_mask, bit map of private page sizes. - * shared_pgsz_mask, bit map of private page sizes. - * sfmmup, pointer to hat structure. - * - * Output: - * pgsz_order, ordered list of page sizes. - * private_pgsz_num, number of private page sizes in pgsz_order. - * shared_pgsz_num, number of shared page sizes in pgsz_order. - */ -static void -set_pgsz_order(uchar_t private_pgsz_mask, uchar_t shared_pgsz_mask, - uint64_t *pgsz_order, int *private_pgsz_num, int *shared_pgsz_num, - sfmmu_t *sfmmup) -{ - int64_t sortcnt[MMU_PAGE_SIZES]; - int8_t tmp_pgsz[MMU_PAGE_SIZES]; - ulong_t tmp; - uint8_t i, j, max; - - *private_pgsz_num = 0; - *shared_pgsz_num = 0; - *pgsz_order = 0; - - /* Sort pages by area mapped */ - for (i = 0; i < mmu_page_sizes; i++) { - tmp = sfmmup->sfmmu_ttecnt[i] + sfmmup->sfmmu_ismttecnt[i]; - sortcnt[i] = tmp << TTE_PAGE_SHIFT(i); - } - - for (j = 0; j < mmu_page_sizes; j++) { - for (i = mmu_page_sizes - 1, max = 0; i > 0; i--) { - if (sortcnt[i] > sortcnt[max]) - max = i; - } - tmp_pgsz[j] = max; - sortcnt[max] = -1; - } - - /* Add shared page sizes to page order if these come first */ - if (pgsz_order_shared_first) { - if (shared_pgsz_mask & (1 << TTE256M)) { - *pgsz_order = TTE256M; - (*shared_pgsz_num)++; - } - if (shared_pgsz_mask & (1 << TTE4M)) { - *pgsz_order |= (TTE4M << (*shared_pgsz_num * 4)); - (*shared_pgsz_num)++; - } - } 
- - - /* Add private page sizes to page order */ - for (i = 0; i < mmu_page_sizes; i++) { - if (private_pgsz_mask & (1 << tmp_pgsz[i])) { - *pgsz_order |= (tmp_pgsz[i] << - ((*private_pgsz_num + *shared_pgsz_num) * 4)); - (*private_pgsz_num)++; - } - } - - /* Add shared page sizes to page order if these come last */ - if (!pgsz_order_shared_first) { - if (shared_pgsz_mask & (1 << TTE256M)) { - *pgsz_order |= (TTE256M << - ((*private_pgsz_num + *shared_pgsz_num) * 4)); - (*shared_pgsz_num)++; - } - if (shared_pgsz_mask & (1 << TTE4M)) { - *pgsz_order |= (TTE4M << - ((*private_pgsz_num + *shared_pgsz_num) * 4)); - (*shared_pgsz_num)++; - } - } - - ASSERT(*pgsz_order); - ASSERT(*private_pgsz_num); - ASSERT((*private_pgsz_num + *shared_pgsz_num) - <= MAX_PGSZ_SEARCH_ORDER); -} - -/* - * This routine is called without holding the hat lock to determine - * whether the process's optimal page size order has changed significantly - * since the page size register was last set. If it has changed we get the - * hat lock and call mmu_set_pgsz_order() to update the effective pagesize - * order. - */ -void -mmu_check_page_sizes(sfmmu_t *sfmmup, uint64_t *ttecnt) -{ - int64_t sortcnt[MMU_PAGE_SIZES]; - int8_t tmp_pgsz[MMU_PAGE_SIZES]; - ulong_t tmp; - int8_t i, j, max; - uint_t pgsz; - uint16_t *pgsz_order_hv; - int page_order_changed; - hatlock_t *hatlockp; - int pgsz_count = 0; - - ASSERT(!sfmmu_hat_lock_held(sfmmup)); - - if (pgsz_search_on == 0) - return; - - /* - * Check if ttecnt has changed significantly, since the last time we - * were called. If the shared page sizes have changed then this is - * handled by mmu_set_pgsz_order() being called directly when we join - * the SCD. 
- */ - for (i = 0; i < mmu_page_sizes; i++) { - if (ttecnt[i] > (sfmmup->sfmmu_mmuttecnt[i] << 1) || - ttecnt[i] < (sfmmup->sfmmu_mmuttecnt[i] >> 1)) - break; - } - - if (i == mmu_page_sizes) { - return; - } - - /* Sort pages by area mapped */ - for (i = 0; i < mmu_page_sizes; i++) { - tmp = ttecnt[i]; - sortcnt[i] = tmp << TTE_PAGE_SHIFT(i); - } - - for (j = 0; j < mmu_page_sizes; j++) { - for (i = mmu_page_sizes - 1, max = 0; i > 0; i--) { - if (sortcnt[i] > sortcnt[max]) - max = i; - } - tmp_pgsz[j] = max; - sortcnt[max] = -1; - } - - /* - * Check if the order of the private page sizes has changed. We call - * mmu_set_pgsz_order() directly if additional page sizes are used, - * so we can assume that the number of entries is unchanged. - */ - pgsz_order_hv = sfmmup->sfmmu_pgsz_order_hv; - if (pgsz_order_shared_first) { - /* skip over shared pgsz entries */ - while ((pgsz_order_hv[pgsz_count] & TLB_PGSZ_CONTEXT1_ENABLE) == - TLB_PGSZ_CONTEXT1_ENABLE) { - pgsz_count++; - } - } - - i = 0; - page_order_changed = 0; - while ((pgsz_order_hv[pgsz_count] & TLB_PGSZ_ENABLE) && - !(pgsz_order_hv[pgsz_count] & TLB_PGSZ_CONTEXT1) && - (pgsz_count < MAX_PGSZ_SEARCH_ORDER)) { - pgsz = (pgsz_order_hv[pgsz_count] & TTE_SZ_BITS); - ASSERT(pgsz < MMU_PAGE_SIZES); - - if (pgsz != tmp_pgsz[i]) { - page_order_changed = 1; - break; - } - pgsz_count++; - i++; - } - - if (page_order_changed) { - hatlockp = sfmmu_hat_enter(sfmmup); - /* Save old values of ttecnt */ - for (i = 0; i < mmu_page_sizes; i++) { - sfmmup->sfmmu_mmuttecnt[i] = ttecnt[i]; - } - mmu_set_pgsz_order(sfmmup, 1); - sfmmu_hat_exit(hatlockp); - } -} - -/* - * If the mmu extension API is supported and pgsz_search_on is set, - * patch out the instruction to branch over the hypervisor call in - * sfmmu_load_mmustate(). - */ -void -mmu_enable_pgsz_search() -{ - if ((hsvc_mmu_ext_available == B_TRUE) && pgsz_search_on) { - /* patch in hcall to set pgsz order */ - sfmmu_patch_pgsz_reg(); - } -}
--- a/usr/src/uts/sun4v/cpu/rock_asm.s Thu Aug 06 17:19:00 2009 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,486 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#include <sys/asm_linkage.h> -#include <sys/hypervisor_api.h> /* For FAST_TRAP */ -#include <sys/rock_hypervisor_api.h> -#include <sys/sun4asi.h> /* ASI_BLK_P */ -#include <sys/machthread.h> /* THREAD_REG */ -#include <sys/fsr.h> /* FPRS_FEF, FPRS_DU */ -#include <vm/hat_sfmmu.h> /* TSBTAG_INVALID */ - -#if defined(lint) -#include <sys/types.h> - -void -cpu_smt_pause(void) -{} - -void -fp_zero(void) -{} - -/*ARGSUSED*/ -uint64_t -hv_rk_perf_count_init(uint64_t counter) -{ return (0); } - -/*ARGSUSED*/ -uint64_t -hv_rk_perf_count_release(uint64_t counter) -{ return (0); } - -/*ARGSUSED*/ -uint64_t -hv_rk_perf_count_set(uint64_t counter, uint64_t value) -{ return (0); } - -/*ARGSUSED*/ -uint64_t -hv_rk_perf_count_get(uint64_t counter, uint64_t *value) -{ return (0); } - -/*ARGSUSED*/ -uint64_t -hv_rk_perf_count_start(uint64_t counter, uint64_t value) -{ return (0); } - -/*ARGSUSED*/ -uint64_t -hv_rk_perf_count_overflow(uint64_t counter, uint64_t *ovf_cnt) -{ return (0); } - -/*ARGSUSED*/ -uint64_t -hv_rk_perf_count_stop(uint64_t counter) -{ return (0); } - -/*ARGSUSED*/ -uint64_t -hv_rk_perf_sample_init(uint64_t sampler, uint64_t ringbuf_pa) -{ return (0); } - -/*ARGSUSED*/ -uint64_t -hv_rk_perf_sample_release(uint64_t sampler) -{ return (0); } - -/*ARGSUSED*/ -uint64_t -hv_rk_perf_sample_start(uint64_t sampler, uint64_t freq, - uint64_t list_size, uint64_t valist_pa) -{ return (0); } - -/*ARGSUSED*/ -uint64_t -hv_rk_perf_sample_config(uint64_t sampler, uint64_t reg_va, uint64_t reg_value) -{ return (0); } - -/*ARGSUSED*/ -uint64_t -hv_rk_perf_sample_pending(uint64_t sampler, uint64_t *pend_cnt) -{ return (0); } - -/*ARGSUSED*/ -uint64_t -hv_rk_perf_sample_stop(uint64_t sampler) -{ return (0); } - -/*ARGSUSED*/ -void -cpu_inv_tsb(caddr_t tsb_base, uint_t tsb_bytes) -{} - -void -cpu_atomic_delay(void) -{} - -void -rock_mutex_delay(void) -{} -#else /* lint */ - -/* - * Called from various spin loops to prevent this strand from - * stealing too many cycles from 
its sibling, who is presumably - * doing useful work. - * - * With a 2.1 GHz clock, 100 membar #Halt instructions plus - * the call/return overhead will take approximately 500 nanoseconds. - * That is a suitable time for a PAUSE, as it is roughly equal to - * two memory accesses. - */ - ENTRY_NP(cpu_smt_pause) - mov 10, %o0 -1: membar #Halt - membar #Halt - membar #Halt - membar #Halt - membar #Halt - membar #Halt - membar #Halt - membar #Halt - membar #Halt - subcc %o0, 1, %o0 - bg,pt %xcc, 1b - membar #Halt - retl - membar #Halt - SET_SIZE(cpu_smt_pause) - -/* - * fp_zero() - clear all fp data registers and the fsr - */ - -.global fp_zero_zero -.align 8 -fp_zero_zero: - .xword 0 - - ENTRY_NP(fp_zero) - sethi %hi(fp_zero_zero), %o0 - ldx [%o0 + %lo(fp_zero_zero)], %fsr - movxtod %g0, %d0 - fzero %d2 - movxtod %g0, %d4 - fzero %d6 - movxtod %g0, %d8 - fzero %d10 - movxtod %g0, %d12 - fzero %d14 - movxtod %g0, %d16 - fzero %d18 - movxtod %g0, %d20 - fzero %d22 - movxtod %g0, %d24 - fzero %d26 - movxtod %g0, %d28 - fzero %d30 - movxtod %g0, %d32 - fzero %d34 - movxtod %g0, %d36 - fzero %d38 - movxtod %g0, %d40 - fzero %d42 - movxtod %g0, %d44 - fzero %d46 - movxtod %g0, %d48 - fzero %d50 - movxtod %g0, %d52 - fzero %d54 - movxtod %g0, %d56 - fzero %d58 - movxtod %g0, %d60 - retl - fzero %d62 - SET_SIZE(fp_zero) - - /* hcalls for performance counters */ - - /* - * uint64_t hv_rk_perf_count_init(uint64_t counter); - */ - ENTRY(hv_rk_perf_count_init) - mov HV_RK_PERF_COUNT_INIT, %o5 - ta FAST_TRAP - retl - nop - SET_SIZE(hv_rk_perf_count_init) - - /* - * uint64_t hv_rk_perf_count_release(uint64_t counter); - */ - ENTRY(hv_rk_perf_count_release) - mov HV_RK_PERF_COUNT_RELEASE, %o5 - ta FAST_TRAP - retl - nop - SET_SIZE(hv_rk_perf_count_release) - - /* - * uint64_t hv_rk_perf_count_set(uint64_t counter, uint64_t value) - */ - ENTRY(hv_rk_perf_count_set) - mov HV_RK_PERF_COUNT_SET, %o5 - ta FAST_TRAP - retl - nop - SET_SIZE(hv_rk_perf_count_set) - - /* - * uint64_t 
hv_rk_perf_count_get(uint64_t counter, uint64_t *value) - */ - ENTRY(hv_rk_perf_count_get) - mov HV_RK_PERF_COUNT_GET, %o5 - mov %o1, %o2 ! Save the address - ta FAST_TRAP - retl - stx %o1, [%o2] ! Value is returned in %o1 by the HV - SET_SIZE(hv_rk_perf_count_get) - - /* - * uint64_t hv_rk_perf_count_start(uint64_t counter, uint64_t value) - */ - ENTRY(hv_rk_perf_count_start) - mov HV_RK_PERF_COUNT_START, %o5 - ta FAST_TRAP - retl - nop - SET_SIZE(hv_rk_perf_count_start) - - /* - * uint64_t hv_rk_perf_count_overflow(uint64_t counter, - * uint64_t *ovf_cnt) - */ - ENTRY(hv_rk_perf_count_overflow) - mov %o1, %o2 - mov HV_RK_PERF_COUNT_OVERFLOW, %o5 - ta FAST_TRAP - retl - stx %o1, [%o2] - SET_SIZE(hv_rk_perf_count_overflow) - - /* - * uint64_t hv_rk_perf_count_stop(uint64_t counter) - */ - ENTRY(hv_rk_perf_count_stop) - mov HV_RK_PERF_COUNT_STOP, %o5 - ta FAST_TRAP - retl - nop - SET_SIZE(hv_rk_perf_count_stop) - - /* - * uint64_t hv_rk_perf_sample_init(uint64_t counter, - uint64_t ringbuf_pa) - */ - ENTRY(hv_rk_perf_sample_init) - mov HV_RK_PERF_SAMPLE_INIT, %o5 - ta FAST_TRAP - retl - nop - SET_SIZE(hv_rk_perf_sample_init) - - /* - * uint64_t hv_rk_perf_sample_release(uint64_t counter) - */ - ENTRY(hv_rk_perf_sample_release) - mov HV_RK_PERF_SAMPLE_RELEASE, %o5 - ta FAST_TRAP - retl - nop - SET_SIZE(hv_rk_perf_sample_release) - - /* - * uint64_t hv_rk_perf_sample_config(uint64_t sampler, uint64_t reg_va, - * uint64_t reg_value) - */ - ENTRY(hv_rk_perf_sample_config) - mov HV_RK_PERF_SAMPLE_CONFIG, %o5 - ta FAST_TRAP - retl - nop - SET_SIZE(hv_rk_perf_sample_config) - - /* - * uint64_t hv_rk_perf_sample_start(uint64_t sampler, uint64_t freq, - * uint64_t list_size, uint64_t valist_pa) - */ - ENTRY(hv_rk_perf_sample_start) - mov HV_RK_PERF_SAMPLE_START, %o5 - ta FAST_TRAP - retl - nop - SET_SIZE(hv_rk_perf_sample_start) - - /* - * uint64_t hv_rk_perf_sample_pending(uint64_t sampler, - * uint64_t *pend_cnt) - */ - ENTRY(hv_rk_perf_sample_pending) - mov %o1, %o2 - mov 
HV_RK_PERF_SAMPLE_PENDING, %o5 - ta FAST_TRAP - retl - stx %o1, [%o2] - SET_SIZE(hv_rk_perf_sample_pending) - - /* - * uint64_t hv_rk_perf_sample_stop(uint64_t sampler) - */ - ENTRY(hv_rk_perf_sample_stop) - mov HV_RK_PERF_SAMPLE_STOP, %o5 - ta FAST_TRAP - retl - nop - SET_SIZE(hv_rk_perf_sample_stop) - -/* - * Invalidate all of the entries within the TSB, by setting the inv bit - * in the tte_tag field of each tsbe. - * - * We take advantage of the fact that the TSBs are page aligned and a - * multiple of PAGESIZE to use ASI_BLK_INIT_xxx ASI. - * - * See TSB_LOCK_ENTRY and the miss handlers for how this works in practice - * (in short, we set all bits in the upper word of the tag, and we give the - * invalid bit precedence over other tag bits in both places). - */ - -#define VIS_BLOCKSIZE 64 -#include "assym.h" /* T_PREEMPT */ - - ENTRY(cpu_inv_tsb) - - ! Get space for aligned block of saved fp regs. - save %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp - - ! kpreempt_disable(); - ldsb [THREAD_REG + T_PREEMPT], %l3 - inc %l3 - stb %l3, [THREAD_REG + T_PREEMPT] - - ! See if fpu was in use. If it was, we need to save off the - ! floating point registers to the stack. - rd %fprs, %l0 ! %l0 = cached copy of fprs - mov %g0, %l2 - - btst FPRS_FEF, %l0 - bz,pt %icc, 4f - nop - - ! If upper half fp registers are in use, save them as they will be - ! used below. - btst FPRS_DU, %l0 - bz,pt %icc, 4f - nop - - ! save in-use fpregs on stack - - add %fp, STACK_BIAS - 65, %l1 ! get stack frame for fp regs - and %l1, -VIS_BLOCKSIZE, %l1 ! block align frame - stda %d32, [%l1]ASI_BLK_P ! %l1 = addr of saved fp regs - - ! Set a flag saying fp regs are saved. - mov 1, %l2 - - ! enable fp - -4: membar #StoreStore|#StoreLoad|#LoadStore - wr %g0, FPRS_FEF|FPRS_DU, %fprs - wr %g0, ASI_BLK_P, %asi - - ! load up FP registers with invalid TSB tag. - set TSBTAG_INVALID, %l3 - movxtod %l3, %d32 - movxtod %l3, %d36 - movxtod %l3, %d40 ! 
Invalidate context - movxtod %l3, %d44 - movxtod %g0, %d34 - movxtod %g0, %d38 - movxtod %g0, %d42 ! Zero in TTE - movxtod %g0, %d46 - - ba,pt %xcc, .cpu_inv_doblock - mov (4*VIS_BLOCKSIZE), %i4 ! we do 4 stda's each loop below - -.cpu_inv_blkstart: - stda %d32, [%i0+128]%asi - stda %d32, [%i0+64]%asi - stda %d32, [%i0]%asi - - add %i0, %i4, %i0 - sub %i1, %i4, %i1 - -.cpu_inv_doblock: - cmp %i1, (4*VIS_BLOCKSIZE) ! check for completion - bgeu,a %icc, .cpu_inv_blkstart - stda %d32, [%i0+192]%asi - -.cpu_inv_finish: - membar #Sync - brz,a %l2, .cpu_inv_finished - wr %l0, 0, %fprs ! restore fprs - - ! restore fpregs from stack - ldda [%l1]ASI_BLK_P, %d32 - - membar #Sync - wr %l0, 0, %fprs ! restore fprs - -.cpu_inv_finished: - ! kpreempt_enable(); - ldsb [THREAD_REG + T_PREEMPT], %l3 - dec %l3 - stb %l3, [THREAD_REG + T_PREEMPT] - ret - restore - SET_SIZE(cpu_inv_tsb) - -/* - * This is CPU specific delay routine for atomic backoff. - * It is used in case of Rock CPU. The rd instruction uses - * less resources than casx on these CPUs. - */ - .align 32 - ENTRY(cpu_atomic_delay) - rd %ccr, %g0 - rd %ccr, %g0 - retl - rd %ccr, %g0 - SET_SIZE(cpu_atomic_delay) - -/* - * Delay to last ~100 nano seconds on a 2.1 GHz. Membars - * should be linear and not in a loop to avoid impact - * on the sibling strand (BR pipeline is shared by - * two sibling strands). - */ - .align 64 - ENTRY(rock_mutex_delay) - membar #Halt - membar #Halt - membar #Halt - membar #Halt - membar #Halt - membar #Halt - membar #Halt - membar #Halt - membar #Halt - membar #Halt - membar #Halt - membar #Halt - membar #Halt - membar #Halt - membar #Halt - membar #Halt - membar #Halt - membar #Halt - membar #Halt - retl - membar #Halt - SET_SIZE(rock_mutex_delay) -#endif /* lint */
--- a/usr/src/uts/sun4v/cpu/rock_copy.s Thu Aug 06 17:19:00 2009 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4941 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include <sys/param.h> -#include <sys/errno.h> -#include <sys/asm_linkage.h> -#include <sys/vtrace.h> -#include <sys/machthread.h> -#include <sys/clock.h> -#include <sys/asi.h> -#include <sys/fsr.h> -#include <sys/privregs.h> -#include <sys/rockasi.h> - -#if !defined(lint) -#include "assym.h" -#endif /* lint */ - -/* - * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed - * to "break even" using FP/VIS-accelerated memory operations. - * The FPBLK code assumes a minimum number of bytes are available - * to be moved on entry. Check that code carefully before - * reducing VIS_COPY_THRESHOLD below 256. - */ -/* - * This shadows sys/machsystm.h which can't be included due to - * the lack of _ASM guards in include files it references. - * Change it here, change it there. 
- */ -#define VIS_COPY_THRESHOLD 256 - -/* - * TEST for very short copies - * Be aware that the maximum unroll for the short unaligned case - * is SHORTCOPY+1 - */ -#define SHORTCOPY 3 -#define CHKSIZE 39 - -/* - * Indicates that we're to trampoline to the error handler. - * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag. - * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag. - */ -#define FPUSED_FLAG 1 -#define TRAMP_FLAG 2 -#define KCOPY_FLAG 4 -#define FPSAVED_FLAG 8 -#define MASK_FLAGS 0xf - -/* - * LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault - * handler was set - */ -#define LOFAULT_SET 2 - -/* - * Number of outstanding prefetches. - * Testing with 1200 MHz Cheetah+ and Jaguar gives best results with - * two prefetches, one with a reach of 8*BLOCK_SIZE+8 and one with a - * reach of 5*BLOCK_SIZE. The double prefetch gives a typical improvement - * of 5% for large copies as compared to a single prefetch. The reason - * for the improvement is that with Cheetah and Jaguar, some prefetches - * are dropped due to the prefetch queue being full. The second prefetch - * reduces the number of cache lines that are dropped. - * Do not remove the double prefetch or change either FIRST_PREFETCH - * or SECOND_PREFETCH without extensive performance tests to prove - * there is no loss of performance. - * XXX: For ROCK, the prefetch depth can be up to 16, but sticking - * with 8 as of now pending more clarity on this. - */ -#define FIRST_PREFETCH 8 -#define SECOND_PREFETCH 5 - -#define VIS_BLOCKSIZE 64 - -/* - * Size of stack frame in order to accommodate a 64-byte aligned - * floating-point register save area and 2 64-bit temp locations. - * All copy functions use two quadrants of fp registers; to assure a - * block-aligned two block buffer in which to save we must reserve - * three blocks on stack. Not all functions preserve %fprs on stack - * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all. 
- * - * _______________________________________ <-- %fp + STACK_BIAS - * | We may need to preserve 2 quadrants | - * | of fp regs, but since we do so with | - * | BST/BLD we need room in which to | - * | align to VIS_BLOCKSIZE bytes. So | - * | this area is 3 * VIS_BLOCKSIZE. | <-- - SAVED_FPREGS_OFFSET - * |-------------------------------------| - * | 8 bytes to save %fprs | <-- - SAVED_FPRS_OFFSET - * |-------------------------------------| - * | 8 bytes to save %gsr | <-- - SAVED_GSR_OFFSET - * --------------------------------------- - */ -#define HWCOPYFRAMESIZE ((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8)) -#define SAVED_FPREGS_OFFSET (VIS_BLOCKSIZE * 3) -#define SAVED_FPREGS_ADJUST ((VIS_BLOCKSIZE * 2) - 1) -#define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 8) -#define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 8) - -#define ICACHE_LINE_SIZE 64 - -#define MEDIUM_MAX 255 -#define MED_WMAX 256 /* max copy for medium word-aligned case */ -#define MED_MAX 256 /* max copy for medium longword-aligned case */ - -#define PAGE_MASK 8191 -#define ST_CACHE_ALIGN 127 - -#ifndef BSTORE_SIZE -#define BSTORE_SIZE 256 /* min copy size for block store */ -#endif - -/* - * Common macros used by the various versions of the block copy - * routines in this file. - */ - -/* - * In FP copies if we do not have preserved data to restore over - * the fp regs we used then we must zero those regs to avoid - * exposing portions of the data to later threads (data security). - * - * Copy functions use either quadrants 1 and 3 or 2 and 4. 
- * - * FZEROQ3Q4: Zero quadrants 3 and 4, ie %d32 - %d46 and %d48 - %d62 - * - * %d32 and %d34 are cleared directly from %g0; every remaining register - * is then copied from the already-zeroed %d32. The source of the copies - * must be a register this macro has itself cleared, otherwise stale FP - * contents would be replicated instead of zero. - * - */ -#define FZEROQ3Q4 \ - movxtod %g0, %d32 ;\ - movxtod %g0, %d34 ;\ - fsrc1 %d32, %d36 ;\ - fsrc1 %d32, %d38 ;\ - fsrc1 %d32, %d40 ;\ - fsrc1 %d32, %d42 ;\ - fsrc1 %d32, %d44 ;\ - fsrc1 %d32, %d46 ;\ - fsrc1 %d32, %d48 ;\ - fsrc1 %d32, %d50 ;\ - fsrc1 %d32, %d52 ;\ - fsrc1 %d32, %d54 ;\ - fsrc1 %d32, %d56 ;\ - fsrc1 %d32, %d58 ;\ - fsrc1 %d32, %d60 ;\ - fsrc1 %d32, %d62 - - -/* - * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack. - * Used to save and restore in-use fp registers when we want to use FP - * and find fp already in use and copy size still large enough to justify - * the additional overhead of this save and restore. - * - * A membar #Sync is needed before save to sync fp ops initiated before - * the call to the copy function (by whoever has fp in use); for example - * an earlier block load to the quadrant we are about to save may still be - * "in flight". A membar #Sync is required at the end of the save to - * sync our block store (the copy code is about to begin ldd's to the - * first quadrant). Note, however, that since Cheetah pipeline block load - * is blocking we can omit the initial membar before saving fp state (they're - * commented below in case of future porting to a chip that does not block - * on block load). - * - * Similarly: a membar #Sync before restore allows the block stores of - * the copy operation to complete before we fill the quadrants with their - * original data, and a membar #Sync after restore lets the block loads - * of the restore complete before we return to whoever has the fp regs - * in use. To avoid repeated membar #Sync we make it the responsibility - * of the copy code to membar #Sync immediately after copy is complete - * and before using the BLD_*_FROMSTACK macro. 
- */ -#if !defined(lint) -#define BST_FPQ3Q4_TOSTACK(tmp1) \ - /* membar #Sync */ ;\ - add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ - and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ - stda %d32, [tmp1]ASI_BLK_P ;\ - add tmp1, VIS_BLOCKSIZE, tmp1 ;\ - stda %d48, [tmp1]ASI_BLK_P ;\ - membar #Sync - -#define BLD_FPQ3Q4_FROMSTACK(tmp1) \ - /* membar #Sync - provided at copy completion */ ;\ - add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ - and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ - ldda [tmp1]ASI_BLK_P, %d32 ;\ - add tmp1, VIS_BLOCKSIZE, tmp1 ;\ - ldda [tmp1]ASI_BLK_P, %d48 ;\ - membar #Sync -#endif - -/* - * FP_NOMIGRATE and FP_ALLOWMIGRATE. Prevent migration (or, stronger, - * prevent preemption if there is no t_lwp to save FP state to on context - * switch) before commencing a FP copy, and reallow it on completion or - * in error trampoline paths when we were using FP copy. - * - * Both macros may call other functions, so be aware that all outputs are - * forfeit after using these macros. For this reason we do not pass registers - * to use - we just use any outputs we want. - * - * For fpRAS we need to perform the fpRAS mechanism test on the same - * CPU as we use for the copy operation, both so that we validate the - * CPU we perform the copy on and so that we know which CPU failed - * if a failure is detected. Hence we need to be bound to "our" CPU. - * This could be achieved through disabling preemption (and we have do it that - * way for threads with no t_lwp) but for larger copies this may hold - * higher priority threads off of cpu for too long (eg, realtime). So we - * make use of the lightweight t_nomigrate mechanism where we can (ie, when - * we have a t_lwp). 
- * - * Pseudo code: - * - * FP_NOMIGRATE: - * - * if (curthread->t_lwp) { - * thread_nomigrate(); - * } else { - * kpreempt_disable(); - * } - * - * FP_ALLOWMIGRATE: - * - * if (curthread->t_lwp) { - * thread_allowmigrate(); - * } else { - * kpreempt_enable(); - * } - */ - -#define FP_NOMIGRATE(label1, label2) \ - ldn [THREAD_REG + T_LWP], %o0 ;\ - brz,a,pn %o0, label1/**/f ;\ - ldsb [THREAD_REG + T_PREEMPT], %o1 ;\ - call thread_nomigrate ;\ - nop ;\ - ba label2/**/f ;\ - nop ;\ -label1: ;\ - inc %o1 ;\ - stb %o1, [THREAD_REG + T_PREEMPT] ;\ -label2: - -#define FP_ALLOWMIGRATE(label1, label2) \ - ldn [THREAD_REG + T_LWP], %o0 ;\ - brz,a,pn %o0, label1/**/f ;\ - ldsb [THREAD_REG + T_PREEMPT], %o1 ;\ - call thread_allowmigrate ;\ - nop ;\ - ba label2/**/f ;\ - nop ;\ -label1: ;\ - dec %o1 ;\ - brnz,pn %o1, label2/**/f ;\ - stb %o1, [THREAD_REG + T_PREEMPT] ;\ - ldn [THREAD_REG + T_CPU], %o0 ;\ - ldub [%o0 + CPU_KPRUNRUN], %o0 ;\ - brz,pt %o0, label2/**/f ;\ - nop ;\ - call kpreempt ;\ - rdpr %pil, %o0 ;\ -label2: - -/* - * Copy a block of storage, returning an error code if `from' or - * `to' takes a kernel pagefault which cannot be resolved. - * Returns errno value on pagefault error, 0 if all ok - * - * On entry: %o0 = from, %o1 = to, %o2 = count. - * kcopy installs .copyerr_no_fp_used as the t_lofault handler and - * passes the previous handler, tagged with KCOPY_FLAG, to the common - * copy code in %o5. - */ - -#if defined(lint) - -/* ARGSUSED */ -int -kcopy(const void *from, void *to, size_t count) -{ return(0); } - -#else /* lint */ - - .seg ".text" - .align 4 - - ENTRY(kcopy) - - ! Both halves of the handler address must name the same label. - sethi %hi(.copyerr_no_fp_used), %o4 - or %o4, %lo(.copyerr_no_fp_used), %o4 - ! Read the previous t_lofault before it is overwritten below. - ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler - or %o5, KCOPY_FLAG, %o5 - membar #Sync ! sync error barrier - stn %o4, [THREAD_REG + T_LOFAULT] ! set t_lofault - ba,pt %ncc, .forcpy ! common code - nop - - -/* - * We got here because of a fault in .copyerr_fp_used. We can't safely - * restore fp state, so we panic. 
- */ -fp_panic_msg: - .asciz "Unable to restore fp state after copy operation" - - .align 4 -.copyerr2: - set fp_panic_msg, %o0 - call panic - nop - -/* - * We got here because of a fault during a small kcopy or bcopy. - * No floating point registers were used in this copy. - * Errno value is in %g1. - */ -.copyerr_no_fp_used: - btst TRAMP_FLAG, %o5 - membar #Sync - andn %o5, TRAMP_FLAG, %o5 - bnz,pn %ncc, 3f - stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - retl - mov %g1, %o0 -3: - jmp %o5 ! goto real handler - mov %g0, %o0 ! - -/* - * We got here because of a fault during a small kcopy or bcopy. - * floating point registers were used in this copy. - * Errno value is in %g1. - */ -.copyerr_fp_used: - set .copyerr2, %l0 - membar #Sync ! sync error barrier - stn %l0, [THREAD_REG + T_LOFAULT] ! set t_lofault - btst FPUSED_FLAG, %l6 - bz %ncc, 1f - and %l6, TRAMP_FLAG, %l0 ! copy trampoline flag to %l0 - - ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr - wr %o2, 0, %gsr - - ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 - ! No need to restore regs if they were not saved - btst FPSAVED_FLAG, %l6 - bz %ncc, 4f - nop - - BLD_FPQ3Q4_FROMSTACK(%o2) - - ba,pt %ncc, 1f - wr %o3, 0, %fprs ! restore fprs - -4: - FZEROQ3Q4 - wr %o3, 0, %fprs ! restore fprs - - ! - ! Need to cater for the different expectations of kcopy - ! and bcopy. kcopy will *always* set a t_lofault handler - ! If it fires, we're expected to just return the error code - ! and *not* to invoke any existing error handler. As far as - ! bcopy is concerned, we only set t_lofault if there was an - ! existing lofault handler. In that case we're expected to - ! invoke the previously existing handler after resetting the - ! t_lofault value. - ! -1: - andn %l6, MASK_FLAGS, %l6 ! turn trampoline flag off - membar #Sync ! sync error barrier - stn %l6, [THREAD_REG + T_LOFAULT] ! 
restore old t_lofault - FP_ALLOWMIGRATE(5, 6) - - btst TRAMP_FLAG, %l0 - bnz,pn %ncc, 3f - nop - ret - restore %g1, 0, %o0 - -3: - ! - ! We're here via bcopy. There *must* have been an error handler - ! in place otherwise we would have died a nasty death already. - ! - jmp %l6 ! goto real handler - restore %g0, 0, %o0 ! dispose of copy window - - SET_SIZE(kcopy) -#endif /* lint */ - -#define ALIGN8(X) (((X) + 7) & ~7) -#define ICACHE_LINE_SIZE 64 -#define PF_FAR 2048 -#define PF_NEAR 1024 -#define SMALL_MAX 39 -/* - * Copy a block of storage - must not overlap (from + len <= to). - * Registers: l6 - saved t_lofault - * (for short copies, o5 - saved t_lofault) - * - * Copy a page of memory. - * Assumes double word alignment and a count >= 256. - */ -#if defined(lint) - -/* ARGSUSED */ -void -bcopy(const void *from, void *to, size_t count) -{} -#else /* lint */ - - .align ICACHE_LINE_SIZE - ENTRY(bcopy) - ENTRY(__align_cpy_1) - ldn [THREAD_REG + T_LOFAULT], %o5 ! save t_lofault - tst %o5 - bz,pt %icc, .forcpy - nop - sethi %hi(.copyerr_no_fp_used), %o4 - or %o4, %lo(.copyerr_no_fp_used), %o4 - membar #Sync ! sync error barrier - stn %o4, [THREAD_REG + T_LOFAULT] ! install new vector - or %o5, TRAMP_FLAG, %o5 ! error should trampoline -.forcpy: - cmp %o2, SMALL_MAX ! check for not small case - bgu,pn %ncc, .medium_bcopy ! go to larger cases - cmp %o2, SHORTCOPY ! check for really short case - ble,pt %ncc, .smallleft_bcopy ! - or %o1, %o0, %o3 ! prepare alignment check - andcc %o3, 0x3, %g0 ! test for alignment - bz,pt %ncc, .smallword_bcopy ! branch to word aligned case - sub %o2, 3, %o2 ! adjust count to allow cc zero test -.smallnotalign4_bcopy: - ldub [%o0], %o3 ! read byte - subcc %o2, 4, %o2 ! reduce count by 4 - stb %o3, [%o1] ! write byte - ldub [%o0+1], %o3 ! repeat for a total of 4 bytes - add %o0, 4, %o0 ! advance SRC by 4 - stb %o3, [%o1+1] - ldub [%o0-2], %o3 - add %o1, 4, %o1 ! 
advance DST by 4 - stb %o3, [%o1-2] - ldub [%o0-1], %o3 - bgu,pt %ncc, .smallnotalign4_bcopy ! loop til 3 or fewer bytes remain - stb %o3, [%o1-1] - add %o2, 3, %o2 ! restore count -.smallleft_bcopy: - tst %o2 - bz,pt %ncc, .smallexit_bcopy - nop -.smallleft3_bcopy: ! 1, 2, or 3 bytes remain - ldub [%o0], %o3 ! load one byte - deccc %o2 ! reduce count for cc test - bz,pt %ncc, .smallexit_bcopy - stb %o3, [%o1] ! store one byte - ldub [%o0+1], %o3 ! load second byte - deccc %o2 - bz,pt %ncc, .smallexit_bcopy - stb %o3, [%o1+1] ! store second byte - ldub [%o0+2], %o3 ! load third byte - stb %o3, [%o1+2] ! store third byte - membar #Sync ! sync error barrier - andn %o5, TRAMP_FLAG, %o5 - stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - retl - clr %o0 - - .align 16 - nop ! affects loop icache alignment -.smallwords_bcopy: - lduw [%o0], %o3 ! read word -.smallwordx_bcopy: - subcc %o2, 8, %o2 ! update count - stw %o3, [%o1] ! write word - add %o0, 8, %o0 ! update SRC - lduw [%o0-4], %o3 ! read word - add %o1, 8, %o1 ! update DST - bgu,pt %ncc, .smallwords_bcopy ! loop until done - stw %o3, [%o1-4] ! write word - addcc %o2, 7, %o2 ! restore count - bz,pt %ncc, .smallexit_bcopy ! check for completion - nop - cmp %o2, 4 ! check for 4 or more bytes left - blt .smallleft3_bcopy ! if not, go to finish up - nop - lduw [%o0], %o3 - add %o0, 4, %o0 - subcc %o2, 4, %o2 - stw %o3, [%o1] - add %o1, 4, %o1 - bnz,pt %ncc, .smallleft3_bcopy - nop - membar #Sync ! sync error barrier - andn %o5, TRAMP_FLAG, %o5 - stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - retl - clr %o0 - -.smallword_bcopy: - subcc %o2, 4, %o2 ! update count - bgu,pt %ncc, .smallwordx_bcopy - lduw [%o0], %o3 ! read word - addcc %o2, 3, %o2 ! restore count - bz,pt %ncc, .smallexit_bcopy - stw %o3, [%o1] ! write word - deccc %o2 ! reduce count for cc test - ldub [%o0+4], %o3 ! load one byte - bz,pt %ncc, .smallexit_bcopy - stb %o3, [%o1+4] ! store one byte - ldub [%o0+5], %o3 ! 
load second byte - deccc %o2 - bz,pt %ncc, .smallexit_bcopy - stb %o3, [%o1+5] ! store second byte - ldub [%o0+6], %o3 ! load third byte - stb %o3, [%o1+6] ! store third byte -.smallexit_bcopy: - membar #Sync ! sync error barrier - andn %o5, TRAMP_FLAG, %o5 - stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - retl - clr %o0 - .align 16 -.medium_bcopy: - neg %o1, %g5 - neg %o0, %o3 - andcc %g5, 7, %g5 ! bytes till DST 8 byte aligned - and %o3, 7, %o3 ! bytes till SRC 8 byte aligned - cmp %g5, %o3 - bne %ncc, continue - sub %g5, %o3, %o3 ! -(bytes till SRC aligned after DST aligned) - ! o3={-7, -6, ... 7} o3>0 => SRC overaligned - ! src and dst are aligned. - mov %o3, %g1 ! save %o3 - andcc %o0, 7, %o3 ! is src buf aligned on a 8 byte bound - brz,pt %o3, src_dst_aligned_on_8 - nop - mov %o3, %g5 - mov 8, %o4 - sub %o4, %o3, %o3 - cmp %o3, %o2 - bg,a,pn %ncc, 1f - mov %o2, %o3 -1: - ! %o3 has the bytes to be written in partial store. - sub %o2, %o3, %o2 - prefetch [%o0],2 -7: - deccc %o3 ! byte clearing loop - ldub [%o0], %o4 ! load one byte - stb %o4, [%o1] - inc %o1 ! increment dst - bgu,pt %ncc, 7b - inc %o0 ! increment src - mov %g1, %o3 ! restore %o3 -src_dst_aligned_on_8: - ! check if we are copying 1k or more bytes - cmp %o2, 511 - bgu,pt %ncc, copying_ge_512 - nop - ba .medlword_bcopy - nop - -continue: - andcc %g5, 7, %g5 ! bytes till DST 8 byte aligned - bz %ncc, 2f - nop - - sub %o2, %g5, %o2 ! update count - -1: - ldub [%o0], %o4 - deccc %g5 - inc %o0 - stb %o4, [%o1] - bgu,pt %ncc, 1b - inc %o1 - - ! Now DST is 8-byte aligned. dst, from, o2 are current. - -2: - andcc %o0, 0x3, %g0 ! test alignment - bnz,pt %ncc, .mediumsetup_bcopy ! branch to skip aligned cases - ! if src, dst not aligned - prefetch [%o0 + (1 * VIS_BLOCKSIZE)], #n_reads - -/* - * Handle all cases where src and dest are aligned on word - * or long word boundaries. Use unrolled loops for better - * performance. 
This option wins over standard large data - * move when source and destination is in cache for medium - * to short data moves. - */ - andcc %o0, 0x7, %g0 ! test word alignment - bz,pt %ncc, src_dst_lword_aligned ! branch to long word aligned case - prefetch [%o0 + (2 * VIS_BLOCKSIZE)], #n_reads - cmp %o2, MED_WMAX ! limit to store buffer size - bgu,pt %ncc, .mediumrejoin_bcopy ! otherwise rejoin main loop - nop - subcc %o2, 15, %o2 ! adjust length to allow cc test - ! for end of loop - ble,pt %ncc, .medw15_bcopy ! skip big loop if less than 16 - prefetch [%o0 + (3 * VIS_BLOCKSIZE)], #n_reads -/* - * no need to put prefetch in loop as prefetches have - * already been issued for maximum loop size - */ -.medw16_bcopy: - ld [%o0], %o4 ! load - subcc %o2, 16, %o2 ! decrement length count - stw %o4, [%o1] ! and store - ld [%o0+4], %o3 ! a block of 16 bytes - add %o0, 16, %o0 ! increase src ptr by 16 - stw %o3, [%o1+4] - ld [%o0-8], %o4 - add %o1, 16, %o1 ! increase dst ptr by 16 - stw %o4, [%o1-8] - ld [%o0-4], %o3 - bgu,pt %ncc, .medw16_bcopy ! repeat if at least 16 bytes left - stw %o3, [%o1-4] -.medw15_bcopy: - addcc %o2, 15, %o2 ! restore count - bz,pt %ncc, .medwexit_bcopy ! exit if finished - nop - cmp %o2, 8 - blt,pt %ncc, .medw7_bcopy ! skip if 7 or fewer bytes left - nop ! - ld [%o0], %o4 ! load 4 bytes - subcc %o2, 8, %o2 ! decrease count by 8 - stw %o4, [%o1] ! and store 4 bytes - add %o0, 8, %o0 ! increase src ptr by 8 - ld [%o0-4], %o3 ! load 4 bytes - add %o1, 8, %o1 ! increase dst ptr by 8 - stw %o3, [%o1-4] ! and store 4 bytes - bz %ncc, .medwexit_bcopy ! exit if finished - nop -.medw7_bcopy: ! count is ge 1, less than 8 - cmp %o2, 3 ! check for 4 bytes left - ble,pt %ncc, .medw3_bcopy ! skip if 3 or fewer bytes left - nop ! - ld [%o0], %o4 ! load 4 bytes - sub %o2, 4, %o2 ! decrease count by 4 - add %o0, 4, %o0 ! increase src ptr by 4 - stw %o4, [%o1] ! and store 4 bytes - add %o1, 4, %o1 ! increase dst ptr by 4 - tst %o2 ! 
check for zero bytes left - bz %ncc, .medwexit_bcopy ! exit if finished - nop -.medw3_bcopy: ! count is known to be 1, 2, or 3 - deccc %o2 ! reduce count by one - ldub [%o0], %o3 ! load one byte - bz,pt %ncc, .medwexit_bcopy ! exit if last byte - stb %o3, [%o1] ! store one byte - ldub [%o0+1], %o3 ! load second byte - deccc %o2 ! reduce count by one - bz,pt %ncc, .medwexit_bcopy ! exit if last byte - stb %o3, [%o1+1] ! store second byte - ldub [%o0+2], %o3 ! load third byte - stb %o3, [%o1+2] ! store third byte -.medwexit_bcopy: - membar #Sync ! sync error barrier - andn %o5, TRAMP_FLAG, %o5 - stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - retl - clr %o0 - -/* - * Special case for handling when src and dest are both long word aligned - * and total data to move is between SMALL_MAX and MED_MAX bytes - */ - - .align 16 - nop -src_dst_lword_aligned: -.medlword_bcopy: ! long word aligned - cmp %o2, MED_MAX ! limit to store buffer size - bgu,pt %ncc, .mediumrejoin_bcopy ! otherwise rejoin main loop - nop - subcc %o2, 31, %o2 ! adjust length to allow cc test - ! for end of loop - ble,pt %ncc, .medl31_bcopy ! skip big loop if less than 32 - prefetch [%o0 + (3 * VIS_BLOCKSIZE)], #n_reads ! into the l2 cache -/* - * no need to put prefetch in loop as prefetches have - * already been issued for maximum loop size - */ -.medl32_bcopy: - ldx [%o0], %o4 ! load - subcc %o2, 32, %o2 ! decrement length count - stx %o4, [%o1] ! and store - ldx [%o0+8], %o3 ! a block of 32 bytes - add %o0, 32, %o0 ! increase src ptr by 32 - stx %o3, [%o1+8] - ldx [%o0-16], %o4 - add %o1, 32, %o1 ! increase dst ptr by 32 - stx %o4, [%o1-16] - ldx [%o0-8], %o3 - bgu,pt %ncc, .medl32_bcopy ! repeat if at least 32 bytes left - stx %o3, [%o1-8] -.medl31_bcopy: - addcc %o2, 16, %o2 ! adjust remaining count - ble,pt %ncc, .medl15_bcopy ! skip if 15 or fewer bytes left - nop ! - ldx [%o0], %o4 ! load and store 16 bytes - add %o0, 16, %o0 ! increase src ptr by 16 - stx %o4, [%o1] ! 
- sub %o2, 16, %o2 ! decrease count by 16 - ldx [%o0-8], %o3 ! - add %o1, 16, %o1 ! increase dst ptr by 16 - stx %o3, [%o1-8] -.medl15_bcopy: - addcc %o2, 15, %o2 ! restore count - bz,pt %ncc, .medwexit_bcopy ! exit if finished - nop - cmp %o2, 8 - blt,pt %ncc, .medw7_bcopy ! skip if 7 or fewer bytes left - nop - ldx [%o0], %o4 ! load 8 bytes - add %o0, 8, %o0 ! increase src ptr by 8 - stx %o4, [%o1] ! and store 8 bytes - subcc %o2, 8, %o2 ! decrease count by 8 - bz %ncc, .medwexit_bcopy ! exit if finished - add %o1, 8, %o1 ! increase dst ptr by 8 - ba .medw7_bcopy - nop - - .align 16 - nop - nop - nop -unaligned_src_dst: - -.mediumsetup_bcopy: - prefetch [%o0 + (2 * VIS_BLOCKSIZE)], #one_read -.mediumrejoin_bcopy: - ! %o5 has the saved T_LOFAULT when we come here. - ! We set a new error handler if the T_LOFAULT was set earlier OR - ! KCOPY_FLAG is set. - save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp - mov %i5, %l6 - andn %l6, TRAMP_FLAG, %o2 - brz,pt %o2, 1f - nop - ! We enter here if KCOPY_FLAG was set OR - ! T_LOFAULT was set earlier. - ! We only change the error handler pointer here. - ! The flags TRAMP_FLAG or KCOPY_FLAG is left as it is in %l6. - sethi %hi(.copyerr_fp_used), %o2 - or %o2, %lo(.copyerr_fp_used), %o2 - membar #Sync ! sync error barrier - stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector -1: - FP_NOMIGRATE(6, 7) - mov %i0, %o0 - mov %i1, %o1 - mov %i2, %o2 - mov %i3, %o3 - mov %i5, %o5 - rd %fprs, %o4 ! check for unused fp - st %o4, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs - - ! FPU enabled ? If not, enable it. - btst FPRS_FEF, %o4 - bz,a,pt %icc, continue_bcopy - wr %g0, FPRS_FEF, %fprs - - ! save the FP registers even if DU is not set. - - BST_FPQ3Q4_TOSTACK(%o4) - or %l6, FPSAVED_FLAG, %l6 - -continue_bcopy: - rd %gsr, %o4 - stx %o4, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr - or %l6, FPUSED_FLAG, %l6 - - add %o0, 8, %o0 ! prepare to round SRC upward - - sethi %hi(0x1234567f), %o5 ! 
For GSR.MASK - or %o5, 0x67f, %o5 - - cmp %o2, MEDIUM_MAX - bmask %o5, %g0, %g0 - - ! Compute o5 (number of bytes that need copying using the main loop). - ! First, compute for the medium case. - ! Then, if large case, o5 is replaced by count for block alignment. - ! Be careful not to read past end of SRC - ! Currently, o2 is the actual count remaining - ! o3 is how much sooner we'll cross the alignment boundary - ! in SRC compared to in DST - ! - ! Examples: Let # denote bytes that should not be accessed - ! Let x denote a byte already copied to align DST - ! Let . and - denote bytes not yet copied - ! Let | denote double alignment boundaries - ! - ! DST: ######xx|........|--------|..###### o2 = 18 - ! dst - ! - ! o3 = -3: SRC: ###xx...|.....---|-----..#|######## o5 = 8 - ! from - ! - ! o3 = 0: SRC: ######xx|........|--------|..###### o5 = 16-8 = 8 - ! from - ! - ! o3 = +1: SRC: #######x|x.......|.-------|-..##### o5 = 16-8 = 8 - ! from - - mov %asi, %g1 ! save curr %asi - wr %g0, ASI_CACHE_SPARING_P, %asi - - or %g0, -8, %o5 - alignaddr %o0, %g0, %o0 ! set GSR.ALIGN and align from - - movrlz %o3, %g0, %o5 ! subtract 8 from o2+o3 only if o3>=0 - add %o5, %o2, %o5 - add %o5, %o3, %o5 - - bleu %ncc, 4f - andn %o5, 7, %o5 ! 8 byte aligned count - neg %o1, %o5 ! 'large' case - and %o5, VIS_BLOCKSIZE-1, %o5 ! bytes till DST block aligned -4: - brgez,a %o3, .beginmedloop_bcopy - ldda [%o0-8]%asi, %d32 - - add %o0, %o3, %o0 ! back up from -5: - ldda [%o0]ASI_FL8_P, %d34 - inc %o0 - andcc %o0, 7, %g0 - bnz %ncc, 5b - bshuffle %d32, %d34, %d32 ! shifts d32 left 1 byte and or's in d34 - -.beginmedloop_bcopy: - tst %o5 - bz %ncc, .endmedloop_bcopy - sub %o2, %o5, %o2 ! update count for later - - ! Main loop to write out doubles. Note: o5 & 7 == 0 - - ldd [%o0], %d34 - subcc %o5, 8, %o5 ! update local count - bz,pn %ncc, 1f - add %o0, 8, %o0 ! update SRC - -.medloop_bcopy: - faligndata %d32, %d34, %d36 - ldda [%o0]%asi, %d32 - subcc %o5, 8, %o5 ! 
update local count - add %o0, 16, %o0 ! update SRC - std %d36, [%o1] - bz,pn %ncc, 2f - faligndata %d34, %d32, %d38 - ldda [%o0 - 8]%asi, %d34 - subcc %o5, 8, %o5 ! update local count - std %d38, [%o1 + 8] - bnz,pt %ncc, .medloop_bcopy - add %o1, 16, %o1 ! update DST - -1: - faligndata %d32, %d34, %d36 - fmovd %d34, %d32 - std %d36, [%o1] - ba .endmedloop_bcopy - add %o1, 8, %o1 - -2: - std %d38, [%o1 + 8] - sub %o0, 8, %o0 - add %o1, 16, %o1 - - -.endmedloop_bcopy: - ! Currently, from is pointing to the next double-aligned byte in SRC - ! The 8 bytes starting at [from-8] are available in d32 - ! At least one, and possibly all, of these need to be written. - - cmp %o2, VIS_BLOCKSIZE - bgu %ncc, .large_bcopy ! otherwise, less than 16 bytes left - -#if 1 - - /* This code will use partial stores. */ - - mov %g0, %o5 - and %o3, 7, %o3 ! Number of bytes needed to completely - ! fill %d32 with good (unwritten) data. - - subcc %o2, 8, %o2 ! update count (maybe too much) - movl %ncc, %o2, %o5 - addcc %o3, %o5, %o5 ! extra bytes we can stuff into %d32 - sub %o3, %o5, %o3 ! update o3 (# bad bytes in %d32) - - bz %ncc, 2f - alignaddr %o3, %g0, %g0 ! set GSR.ALIGN - -1: - deccc %o5 - ldda [%o0]ASI_FL8_P, %d34 - inc %o0 - bgu %ncc, 1b - bshuffle %d32, %d34, %d32 ! shifts d32 left 1 byte and or's in d34 - -2: - not %o3 - faligndata %d32, %d32, %d32 ! shift bytes to the left - and %o3, 7, %o3 ! last byte to be stored in [%o1+%o3] - edge8n %g0, %o3, %o5 - stda %d32, [%o1]%o5, ASI_PST8_P - brlez %o2, exit_bcopy - add %o1, %o3, %o1 ! update DST to last stored byte -3: - inc %o1 - deccc %o2 - ldub [%o0], %o3 - stb %o3, [%o1] - bgu %ncc, 3b - inc %o0 - -#else - - andcc %o3, 7, %o5 ! Number of bytes needed to completely - ! fill %d32 with good (unwritten) data. - bz %ncc, 2f - sub %o5, 8, %o3 ! -(number of good bytes in %d32) - cmp %o2, 8 - bl,a %ncc, 3f ! Not enough bytes to fill %d32 - add %o0, %o3, %o0 ! 
Back up %o0 - -1: - deccc %o5 - ldda [%o0]ASI_FL8_P, %d34 - inc %o0 - bgu %ncc, 1b - bshuffle %d32, %d34, %d32 ! shifts d32 left 1 byte and or's in d34 - -2: - subcc %o2, 8, %o2 - std %d32, [%o1] - bz %ncc, exit_bcopy - add %o1, 8, %o1 -3: - ldub [%o0], %o3 - deccc %o2 - inc %o0 - stb %o3, [%o1] - bgu %ncc, 3b - inc %o1 -#endif - -exit_bcopy: - membar #Sync - - ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr - wr %o2, 0, %gsr - - ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 - ! No need to restore regs if they were not saved - btst FPSAVED_FLAG, %l6 - bz %ncc, 4f - nop - - BLD_FPQ3Q4_FROMSTACK(%o2) - - ba,pt %ncc, 5f - wr %o3, 0, %fprs ! restore fprs -4: - FZEROQ3Q4 - wr %o3, 0, %fprs ! restore fprs -5: - membar #Sync ! sync error barrier - andn %l6, MASK_FLAGS, %l6 - stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - - mov %g1, %asi ! restore %asi - FP_ALLOWMIGRATE(6, 7) - ret - restore %g0, 0, %o0 - - - .align ICACHE_LINE_SIZE -.large_bcopy: - ! The following test for BSTORE_SIZE is used to decide whether - ! to store data with a block store or with individual stores. - ! The block store wins when the amount of data is so large - ! that it is causes other application data to be moved out - ! of the L1 or L2 cache. - ! On a Panther, block store can lose more often because block - ! store forces the stored data to be removed from the L3 cache. - ! - sethi %hi(BSTORE_SIZE),%o5 - or %o5,%lo(BSTORE_SIZE),%o5 - cmp %o2, %o5 - bgu %ncc, .xlarge_bcopy - - ! %o1 I/O DST is 64-byte aligned - ! %o0 I/O 8-byte aligned (and we've set GSR.ALIGN) - ! %d32 I/O already loaded with SRC data from [%o0-8] - ! %o2 I/O count (number of bytes that need to be written) - ! %o3 I Not written. If zero, then SRC is double aligned. - ! %o4 I Not written. Holds fprs. - ! %o5 O The number of doubles that remain to be written. - - ! Load the rest of the current block - ! 
Recall that %o0 is further into SRC than %o1 is into DST - - prefetch [%o1 + (0 * VIS_BLOCKSIZE)], #n_writes - prefetch [%o1 + (1 * VIS_BLOCKSIZE)], #n_writes - prefetch [%o1 + (2 * VIS_BLOCKSIZE)], #n_writes - ldda [%o0]%asi, %d34 - prefetch [%o0 + (3 * VIS_BLOCKSIZE)], #one_read - ldda [%o0 + 0x8]%asi, %d36 - faligndata %d32, %d34, %d48 - ldda [%o0 + 0x10]%asi, %d38 - faligndata %d34, %d36, %d50 - ldda [%o0 + 0x18]%asi, %d40 - faligndata %d36, %d38, %d52 - ldda [%o0 + 0x20]%asi, %d42 - or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8 - prefetch [%o0 + (4 * VIS_BLOCKSIZE)], #one_read - faligndata %d38, %d40, %d54 - ldda [%o0 + 0x28]%asi, %d44 - movrlz %o3, %g0, %o5 ! if %o3 < 0, %o5 = 0 (needed lter) - faligndata %d40, %d42, %d56 - ldda [%o0 + 0x30]%asi, %d46 - faligndata %d42, %d44, %d58 - ldda [%o0 + 0x38]%asi, %d32 - sub %o2, VIS_BLOCKSIZE, %o2 ! update count - prefetch [%o0 + (5 * VIS_BLOCKSIZE)], #one_read - add %o0, VIS_BLOCKSIZE, %o0 ! update SRC - - ! Main loop. Write previous block. Load rest of current block. - ! Some bytes will be loaded that won't yet be written. -1: - ldda [%o0]%asi, %d34 - faligndata %d44, %d46, %d60 - ldda [%o0 + 0x8]%asi, %d36 - faligndata %d46, %d32, %d62 - std %d48, [%o1] - std %d50, [%o1+8] - std %d52, [%o1+16] - std %d54, [%o1+24] - std %d56, [%o1+32] - std %d58, [%o1+40] - std %d60, [%o1+48] - std %d62, [%o1+56] - sub %o2, VIS_BLOCKSIZE, %o2 ! update count - prefetch [%o1 + (6 * VIS_BLOCKSIZE)], #n_writes - prefetch [%o1 + (3 * VIS_BLOCKSIZE)], #n_writes - add %o1, VIS_BLOCKSIZE, %o1 ! 
update DST - ldda [%o0 + 0x10]%asi, %d38 - faligndata %d32, %d34, %d48 - ldda [%o0 + 0x18]%asi, %d40 - faligndata %d34, %d36, %d50 - ldda [%o0 + 0x20]%asi, %d42 - faligndata %d36, %d38, %d52 - ldda [%o0 + 0x28]%asi, %d44 - faligndata %d38, %d40, %d54 - ldda [%o0 + 0x30]%asi, %d46 - faligndata %d40, %d42, %d56 - ldda [%o0 + 0x38]%asi, %d32 - faligndata %d42, %d44, %d58 - cmp %o2, VIS_BLOCKSIZE + 8 - prefetch [%o0 + (5 * VIS_BLOCKSIZE)], #one_read - bgu,pt %ncc, 1b - add %o0, VIS_BLOCKSIZE, %o0 ! update SRC - faligndata %d44, %d46, %d60 - faligndata %d46, %d32, %d62 - stda %d48, [%o1]ASI_BLK_P ! store 64 bytes, bypass cache - cmp %o2, VIS_BLOCKSIZE - bne %ncc, 2f ! exactly 1 block remaining? - add %o1, VIS_BLOCKSIZE, %o1 ! update DST - brz,a %o3, 3f ! is SRC double aligned? - ldd [%o0], %d34 - -2: - add %o5, %o2, %o5 ! %o5 was already set to 0 or -8 - add %o5, %o3, %o5 - - ba .beginmedloop_bcopy - andn %o5, 7, %o5 ! 8 byte aligned count - - ! This is when there is exactly 1 block remaining and SRC is aligned -3: - ! %d32 was loaded in the last iteration of the loop above, and - ! %d34 was loaded in the branch delay slot that got us here. - ldd [%o0 + 0x08], %d36 - ldd [%o0 + 0x10], %d38 - ldd [%o0 + 0x18], %d40 - ldd [%o0 + 0x20], %d42 - ldd [%o0 + 0x28], %d44 - ldd [%o0 + 0x30], %d46 - stda %d32, [%o1]ASI_BLK_P - - ba exit_bcopy - nop - - .align 16 - ! two nops here causes loop starting at 1f below to be - ! on a cache line boundary, improving performance - nop - nop -xlarge: -.xlarge_bcopy: - /* - set 4096, %l2 - subcc %o2, %l2, %g0 - bge %ncc, size_ge_4k - nop - */ - ! %o1 I/O DST is 64-byte aligned - ! %o0 I/O 8-byte aligned (and we've set GSR.ALIGN) - ! %d32 I/O already loaded with SRC data from [%o0-8] - ! %o2 I/O count (number of bytes that need to be written) - ! %o3 I Not written. If zero, then SRC is double aligned. - ! %o4 I Not written. Holds fprs. - ! %o5 O The number of doubles that remain to be written. - - ! Load the rest of the current block - ! 
Recall that %o0 is further into SRC than %o1 is into DST - - ! prefetch [%o0 + (3 * VIS_BLOCKSIZE)], #one_read - ! executed in delay slot for branch to .xlarge - prefetch [%o0 + (4 * VIS_BLOCKSIZE)], #one_read - prefetch [%o0 + (5 * VIS_BLOCKSIZE)], #one_read - ldda [%o0]%asi, %d34 - prefetch [%o0 + (6 * VIS_BLOCKSIZE)], #one_read - ldda [%o0 + 0x8]%asi, %d36 - faligndata %d32, %d34, %d48 - ldda [%o0 + 0x10]%asi, %d38 - faligndata %d34, %d36, %d50 - ldda [%o0 + 0x18]%asi, %d40 - faligndata %d36, %d38, %d52 - ldda [%o0 + 0x20]%asi, %d42 - or %g0, -8, %o5 ! if %o3 >= 0, %o5 = -8 - faligndata %d38, %d40, %d54 - ldda [%o0 + 0x28]%asi, %d44 - movrlz %o3, %g0, %o5 ! if %o3 < 0, %o5 = 0 (needed later) - faligndata %d40, %d42, %d56 - ldda [%o0 + 0x30]%asi, %d46 - faligndata %d42, %d44, %d58 - ldda [%o0 + 0x38]%asi, %d32 - sub %o2, VIS_BLOCKSIZE, %o2 ! update count - prefetch [%o0 + (7 * VIS_BLOCKSIZE)], #one_read - add %o0, VIS_BLOCKSIZE, %o0 ! update SRC - - ! This point is 32-byte aligned since 24 instructions appear since - ! the previous alignment directive. - - - ! Main loop. Write previous block. Load rest of current block. - ! Some bytes will be loaded that won't yet be written. -1: - ldda [%o0]%asi, %d34 - faligndata %d44, %d46, %d60 - ldda [%o0 + 0x8]%asi, %d36 - faligndata %d46, %d32, %d62 - stda %d48, [%o1]ASI_BLK_P - sub %o2, VIS_BLOCKSIZE, %o2 ! update count - ldda [%o0 + 0x10]%asi, %d38 - faligndata %d32, %d34, %d48 - ldda [%o0 + 0x18]%asi, %d40 - faligndata %d34, %d36, %d50 - ldda [%o0 + 0x20]%asi, %d42 - faligndata %d36, %d38, %d52 - ldda [%o0 + 0x28]%asi, %d44 - faligndata %d38, %d40, %d54 - ldda [%o0 + 0x30]%asi, %d46 - faligndata %d40, %d42, %d56 - ldda [%o0 + 0x38]%asi, %d32 - faligndata %d42, %d44, %d58 - ! offset of 8*BLK+8 bytes works best over range of (src-dst) mod 1K - prefetch [%o0 + (8 * VIS_BLOCKSIZE) + 8], #one_read - add %o1, VIS_BLOCKSIZE, %o1 ! update DST - cmp %o2, VIS_BLOCKSIZE + 8 - ! 
second prefetch important to correct for occasional dropped - ! initial prefetches, 5*BLK works best over range of (src-dst) mod 1K - ! strong prefetch prevents drops on Panther, but Jaguar and earlier - ! US-III models treat strong prefetches as weak prefetchs - ! to avoid regressions on customer hardware, we retain the prefetch - prefetch [%o0 + (5 * VIS_BLOCKSIZE)], #one_read - bgu,pt %ncc, 1b - add %o0, VIS_BLOCKSIZE, %o0 ! update SRC - - faligndata %d44, %d46, %d60 - faligndata %d46, %d32, %d62 - stda %d48, [%o1]ASI_BLK_P ! store 64 bytes, bypass cache - cmp %o2, VIS_BLOCKSIZE - bne %ncc, 2f ! exactly 1 block remaining? - add %o1, VIS_BLOCKSIZE, %o1 ! update DST - brz,a %o3, 3f ! is SRC double aligned? - ldd [%o0], %d34 - -2: - add %o5, %o2, %o5 ! %o5 was already set to 0 or -8 - add %o5, %o3, %o5 - - - ba .beginmedloop_bcopy - andn %o5, 7, %o5 ! 8 byte aligned count - - - ! This is when there is exactly 1 block remaining and SRC is aligned -3: - ! %d32 was loaded in the last iteration of the loop above, and - ! %d34 was loaded in the branch delay slot that got us here. - ldd [%o0 + 0x08], %d36 - ldd [%o0 + 0x10], %d38 - ldd [%o0 + 0x18], %d40 - ldd [%o0 + 0x20], %d42 - ldd [%o0 + 0x28], %d44 - ldd [%o0 + 0x30], %d46 - stda %d32, [%o1]ASI_BLK_P - - ba exit_bcopy - nop - -copying_ge_512: - ! both src and dst are aligned to 8 byte boundary - ! and the number of bytes to copy is 512 or more. - ! %o5 has the saved T_LOFAULT when we come here. - ! We set a new error handler if the T_LOFAULT was set earlier OR - ! KCOPY_FLAG is set. - save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp - mov %i5, %l6 - andn %l6, TRAMP_FLAG, %o2 - brz,pt %o2, 1f - nop - ! We enter here if KCOPY_FLAG was set OR - ! T_LOFAULT was set earlier. - ! We only change the error handler pointer here. - ! The flags TRAMP_FLAG or KCOPY_FLAG is left as it is in %l6. - sethi %hi(.copyerr_fp_used), %o2 - or %o2, %lo(.copyerr_fp_used), %o2 - membar #Sync ! 
sync error barrier - stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector -1: - FP_NOMIGRATE(6, 7) - mov %i0, %o0 - mov %i1, %o1 - mov %i2, %o2 - mov %i3, %o3 - mov %i5, %o5 - rd %fprs, %o5 ! check for unused fp - st %o5, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs - - ! FPU enabled ? If not, enable it. - btst FPRS_FEF, %o5 - bz,a,pt %icc, 1f - wr %g0, FPRS_FEF, %fprs - - - ! save the FP registers even if DU is not set. - - BST_FPQ3Q4_TOSTACK(%o5) - or %l6, FPSAVED_FLAG, %l6 -1: - rd %gsr, %o5 - stx %o5, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr - or %l6, FPUSED_FLAG, %l6 - !prefetch 256 bytes from nearest 128 byte aligned src buf - sub %o0,1,%o3 - andn %o3,0x7f,%l1 - add %l1,128,%l1 - prefetch [%l1],2 - prefetch [%l1+64],2 - prefetch [%l1+(2*64)],2 - prefetch [%l1+(3*64)],2 - !prefetch 256 bytes from nearest 128 byte aligned dst buf - sub %o1,1,%o3 - andn %o3,0x7f,%l1 - add %l1,128,%l1 - prefetch [%l1],2 - prefetch [%l1+64],2 - prefetch [%l1+(2*64)],2 - prefetch [%l1+(3*64)],2 - - andcc %o1,0x7f,%o3 !Check if buffers are 128 byte aligned - brz,pn %o3,aligned_on_128 - sub %o3,128,%o3 - - add %o2,%o3,%o2 -align_to_128: - ldxa [%o0]ASI_CACHE_SPARING_P, %o4 - add %o0,8,%o0 ! increment src pointer - stxa %o4,[%o1]ASI_CACHE_SPARING_P - addcc %o3,8,%o3 - bl,pt %ncc,align_to_128 - add %o1,8,%o1 ! increment dst pointer - -aligned_on_128: - andcc %o1,0x1ff,%o3 !Check if buffers are 512 byte aligned. - brnz,pn %o3, 4f - mov %o2,%l4 !l4=number of bytes to copy - ! buffers are now 512 byte aligned. - ! if we have 4096 or more bytes to copy we will use the - ! stingray_optimized_copy - set 4096, %l2 - subcc %o2, %l2, %g0 - bge,pn %ncc, stingray_optimized_copy - nop -4: - ! determine how many bytes are left to be copied after the buffers - ! are aligned to 512 byte boundary. - ! if we have 4096 or more then we can perform stingray_optimized_copy - ! register l4 will contain the number of bytes to copy after buffers\ - ! are aligned to 512 byte boundary. 
l4 is set to 0 if we have less than - ! 4096 bytes to copy after aligning buffers to 512 byte. - sub %o1,8,%o5 ! should be in current 512 chunk - andn %o5,0x1ff,%o3 ! %o3=aligned 512b addr - add %o3,0x200,%o3 ! %o3=next aligned 512b addr - sub %o3,%o1,%o3 ! %o3=how many bytes to copy for 512 byte - ! alignment - sub %o2,%o3,%l4 ! l4=bytes to copy after aligning buffers to 512 - ! if l4 is < 4096 do interleave128_copy only. - set 4096, %l2 - subcc %l4, %l2, %g0 - bge,pn %ncc,6f - nop - mov %g0, %l4 - add %o1, %o2, %l1 - ba interleave128_copy - nop -6: - mov %o3, %o2 - subcc %o3,256,%g0 !use interleave128_copy if 256 or more - bl,pn %ncc,copy_word !o.w use copy_word to finish the 512 byte alignment. - !%o2=new count i.e how many bytes to write - add %o1,%o2,%l1 !cal the last byte to write %l1 - ba interleave128_copy - nop - - .align 64 -interleave128_copy: - ! %l1 has the addr of the dest. buffer at or beyond which no write - ! is to be done. - ! %l4 has the number of bytes to zero using stingray_optimized_bzero - !prefetch src - - add %o0, 256, %o3 - prefetch [%o3], 2 !1st 64 byte line of next 256 byte block - add %o0, 384, %o3 - prefetch [%o3], 2 !3rd 64 byte line of next 256 byte block - add %o0, 320, %o3 - prefetch [%o3], 2 !2nd 64 byte line of next 256 byte block - add %o0, 448, %o3 - prefetch [%o3], 2 !4th 64 byte line of next 256 byte block - - !prefetch dst - - add %o1, 256, %o3 - prefetch [%o3], 2 !1st 64 byte line of next 256 byte block - add %o1, 384, %o3 - prefetch [%o3], 2 !3rd 64 byte line of next 256 byte block - add %o1, 320, %o3 - prefetch [%o3], 2 !2nd 64 byte line of next 256 byte block - add %o1, 448, %o3 - prefetch [%o3], 2 !4th 64 byte line of next 256 byte block - - ldxa [%o0]ASI_CACHE_SPARING_P, %o4 - stxa %o4,[%o1]ASI_CACHE_SPARING_P - add %o0, 128, %o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, 128, %o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (1 * 8), %o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (1 * 8), %o3 - stxa 
%o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (1 * 8 + 128), %o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (1 * 8 + 128), %o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (2 * 8),%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (2 * 8),%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (2 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (2 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (3 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (3 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (3 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (3 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (4 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (4 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (4 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (4 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (5 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (5 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (5 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (5 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (6 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (6 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (6 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (6 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (7 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (7 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (7 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (7 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (8 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (8 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (8 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (8 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (9 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 
- add %o1, (9 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (9 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (9 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (10 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (10 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (10 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (10 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (11 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (11 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (11 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (11 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (12 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (12 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (12 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (12 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (13 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (13 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (13 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (13 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (14 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (14 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (14 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (14 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (15 * 8) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (15 * 8) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, (15 * 8 + 128) ,%o3 - ldxa [%o3]ASI_CACHE_SPARING_P, %o4 - add %o1, (15 * 8 + 128) ,%o3 - stxa %o4,[%o3]ASI_CACHE_SPARING_P - add %o0, 256, %o0 - - ! check if the next 256 byte copy will not exceed the number of - ! bytes remaining to be copied. - ! %l2 points to the dest buffer after copying 256 bytes more. - ! %l1 points to dest. 
buffer at or beyond which no writes should be done. - add %o1,512,%l2 - subcc %l1,%l2,%g0 - bge,pt %ncc,interleave128_copy - add %o1,256,%o1 - -copy_word: - and %o2,255,%o3 - and %o3,7,%o2 - - ! Set the remaining doubles - subcc %o3, 8, %o3 ! Can we store any doubles? - bl,pn %ncc, 6f - and %o2, 7, %o2 ! calc bytes left after doubles - - !prefetch src - - mov %o0, %o4 - prefetch [%o4], 2 !1st 64 byte line of next 256 byte block - add %o0, 128, %o4 - prefetch [%o4], 2 !3rd 64 byte line of next 256 byte block - add %o0, 64, %o4 - prefetch [%o4], 2 !2nd 64 byte line of next 256 byte block - add %o0, 192, %o4 - prefetch [%o4], 2 !4th 64 byte line of next 256 byte block - - !prefetch dst - - mov %o1, %o4 - prefetch [%o4], 2 !1st 64 byte line of next 256 byte block - add %o1, 128, %o4 - prefetch [%o4], 2 !3rd 64 byte line of next 256 byte block - add %o1, 64, %o4 - prefetch [%o4], 2 !2nd 64 byte line of next 256 byte block - add %o1, 192, %o4 - prefetch [%o4], 2 !4th 64 byte line of next 256 byte block - -5: - ldxa [%o0]ASI_CACHE_SPARING_P, %o4 - add %o0, 8, %o0 - stxa %o4, [%o1]ASI_CACHE_SPARING_P - subcc %o3, 8, %o3 - bge,pt %ncc, 5b - add %o1, 8, %o1 -6: - ! Set the remaining bytes - brz %o2, can_we_do_stingray_optimized_copy - nop - -7: - deccc %o2 ! byte clearing loop - ldub [%o0], %o4 ! load one byte - stb %o4, [%o1] - inc %o1 ! increment dst - bgu,pt %ncc, 7b - inc %o0 ! increment src - -can_we_do_stingray_optimized_copy: - ! %l4 contains the number of bytes to be copied - mov %l4, %o2 - brnz,pn %o2, stingray_optimized_copy - nop - -exit: - membar #Sync - - ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o5 ! restore gsr - wr %o5, 0, %gsr - - ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 - ! No need to restore regs if they were not saved - btst FPSAVED_FLAG, %l6 - bz %ncc, 4f - nop - - BLD_FPQ3Q4_FROMSTACK(%o5) - - ba,pt %ncc, 5f - wr %o3, 0, %fprs ! restore fprs -4: - FZEROQ3Q4 - wr %o3, 0, %fprs ! restore fprs -5: - membar #Sync ! 
sync error barrier - andn %l6, MASK_FLAGS, %l6 - stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - FP_ALLOWMIGRATE(6, 7) - ret - restore %g0, 0, %o0 - - -stingray_optimized_copy: - ! This code tries to maximize bandwidth by being clever about - ! accessing the two cache lines that are BUDDY PAIRS in the L3 cache. - ! THIS VERSION IS OPTIMIZED FOR THE CASE OF SWAPPING PA BITS 6 and 9. - ! To keep this code simple, we assume the addresses given are aligned - ! at least on a 128-byte boundary, and the length is assumed to be - ! a multiple of 4k bytes. - ! THIS VERSION USES BLKSTORES, AND PREFETCHES BOTH SOURCE AND - ! DESTINATION DATA. - - add %o1, %l4, %o2 - - !save original value of %o0 so we can restore it. - or %g0,%o0,%l2 - - wr %g0,ASI_BLK_P,%asi - - prefetch [%o0+0],2 - prefetch [%o0+(64*1)],2 - prefetch [%o0+(64*2)],2 - prefetch [%o0+(64*3)],2 - prefetch [%o0+(64*4)],2 - prefetch [%o0+(64*5)],2 - prefetch [%o0+(64*6)],2 - prefetch [%o0+(64*7)],2 - prefetch [%o0+(64*8)],2 - prefetch [%o0+(64*9)],2 - prefetch [%o0+(64*10)],2 - prefetch [%o0+(64*11)],2 - prefetch [%o0+(64*12)],2 - prefetch [%o0+(64*13)],2 - prefetch [%o0+(64*14)],2 - prefetch [%o0+(64*15)],2 - - prefetch [%o1+0],2 - prefetch [%o1+(64*1)],2 - prefetch [%o1+(64*2)],2 - prefetch [%o1+(64*3)],2 - prefetch [%o1+(64*4)],2 - prefetch [%o1+(64*5)],2 - prefetch [%o1+(64*6)],2 - prefetch [%o1+(64*7)],2 - prefetch [%o1+(64*8)],2 - prefetch [%o1+(64*9)],2 - prefetch [%o1+(64*10)],2 - prefetch [%o1+(64*11)],2 - prefetch [%o1+(64*12)],2 - prefetch [%o1+(64*13)],2 - prefetch [%o1+(64*14)],2 - prefetch [%o1+(64*15)],2 - - ba stingray_optimized_4k_copy_loop - srl %l4, 12, %l4 - - ! Local register usage: - ! %l1 address at short distance ahead of current src buf for prefetching - ! into L1 cache. - ! %l2 address at far ahead of current src buf for prefetching - ! into L2 cache. - ! %l3 save %o1 at start of inner loop. - ! %l4 Number of 4k blocks to copy - ! 
%g1 save src buf pointer at start of inner loop. - ! %l5 iteration counter to make buddy loop execute 2 times. - ! %o5 iteration counter to make inner loop execute 4 times. - ! %l7 address at far ahead of current dst buf for prefetching dest - ! into L2 cache. - - .align 64 -stingray_optimized_4k_copy_loop: - set 2, %l5 ! %l5 is the loop count for the buddy loop - add %o1, 0, %l3 - add %o0, 0, %g1 -buddyloop_bcopy: - set PF_FAR, %g5 - add %o0, %g5, %l2 ! Set %l2 to far ahead of src buffer to prefetch - ! For prefetching into L1 D$, set %l1 a little ahead of src buffer - add %o0, PF_NEAR, %l1 - add %o1, %g5, %l7 ! Set %l7 to far ahead of dst buffer to prefetch - - add %l2, %g5, %g5 ! %g5 is now double far ahead of the src buffer - prefetch [%g5+%g0],2 ! Prefetch ahead to get TLB entry in advance. - set 2*PF_FAR, %g5 - add %o1, %g5, %g5 ! %g5 is now double far ahead of the dst buffer - prefetch [%g5+%g0],2 ! Prefetch ahead to get TLB entry in advance. - - set 4,%o5 ! %o5 = loop count for the inner loop - set 0, %g5 - - ! Each iteration of the inner loop below copies 8 sequential lines. - ! This loop is iterated 4 times, to move a total of 32 lines, all of - ! which have the same value of PA[9], so we increment the base - ! address by 1024 bytes in each iteration, which varies PA[10]. -innerloop_bcopy: - ! copy line 1 of 8 - prefetch [%l2+%g5],2 - prefetch [%l7+%g5],2 - prefetch [%l1+%g5],1 - - ldd [%o0],%d32 - ldd [%o0+8],%d34 - ldd [%o0+16],%d36 - ldd [%o0+24],%d38 - ldd [%o0+32],%d40 - ldd [%o0+40],%d42 - ldd [%o0+48],%d44 - ldd [%o0+56],%d46 - stda %d32,[%o1+0] %asi - add %g5, 64, %g5 - add %o1, 64, %o1 - add %o0, 64, %o0 - - ! copy line 2 of 8 - prefetch [%l2+%g5],2 - prefetch [%l7+%g5],2 - prefetch [%l1+%g5],1 - - ldd [%o0],%d32 - ldd [%o0+8],%d34 - ldd [%o0+16],%d36 - ldd [%o0+24],%d38 - ldd [%o0+32],%d40 - ldd [%o0+40],%d42 - ldd [%o0+48],%d44 - ldd [%o0+56],%d46 - stda %d32,[%o1+0] %asi - add %g5, 64, %g5 - add %o1, 64, %o1 - add %o0, 64, %o0 - - ! 
copy line 3 of 8 - prefetch [%l2+%g5],2 - prefetch [%l7+%g5],2 - prefetch [%l1+%g5],1 - - ldd [%o0],%d32 - ldd [%o0+8],%d34 - ldd [%o0+16],%d36 - ldd [%o0+24],%d38 - ldd [%o0+32],%d40 - ldd [%o0+40],%d42 - ldd [%o0+48],%d44 - ldd [%o0+56],%d46 - stda %d32,[%o1+0] %asi - add %g5, 64, %g5 - add %o1, 64, %o1 - add %o0, 64, %o0 - - ! copy line 4 of 8 - prefetch [%l2+%g5],2 - prefetch [%l7+%g5],2 - prefetch [%l1+%g5],1 - - ldd [%o0],%d32 - ldd [%o0+8],%d34 - ldd [%o0+16],%d36 - ldd [%o0+24],%d38 - ldd [%o0+32],%d40 - ldd [%o0+40],%d42 - ldd [%o0+48],%d44 - ldd [%o0+56],%d46 - stda %d32,[%o1+0] %asi - add %g5, 64, %g5 - add %o1, 64, %o1 - add %o0, 64, %o0 - - ! copy line 5 of 8 - prefetch [%l2+%g5],2 - prefetch [%l7+%g5],2 - prefetch [%l1+%g5],1 - - ldd [%o0],%d32 - ldd [%o0+8],%d34 - ldd [%o0+16],%d36 - ldd [%o0+24],%d38 - ldd [%o0+32],%d40 - ldd [%o0+40],%d42 - ldd [%o0+48],%d44 - ldd [%o0+56],%d46 - stda %d32,[%o1+0] %asi - add %g5, 64, %g5 - add %o1, 64, %o1 - add %o0, 64, %o0 - - ! copy line 6 of 8 - prefetch [%l2+%g5],2 - prefetch [%l7+%g5],2 - prefetch [%l1+%g5],1 - - ldd [%o0],%d32 - ldd [%o0+8],%d34 - ldd [%o0+16],%d36 - ldd [%o0+24],%d38 - ldd [%o0+32],%d40 - ldd [%o0+40],%d42 - ldd [%o0+48],%d44 - ldd [%o0+56],%d46 - stda %d32,[%o1+0] %asi - add %g5, 64, %g5 - add %o1, 64, %o1 - add %o0, 64, %o0 - - ! copy line 7 of 8 - prefetch [%l2+%g5],2 - prefetch [%l7+%g5],2 - prefetch [%l1+%g5],1 - - ldd [%o0],%d32 - ldd [%o0+8],%d34 - ldd [%o0+16],%d36 - ldd [%o0+24],%d38 - ldd [%o0+32],%d40 - ldd [%o0+40],%d42 - ldd [%o0+48],%d44 - ldd [%o0+56],%d46 - stda %d32,[%o1+0] %asi - add %g5, 64, %g5 - add %o1, 64, %o1 - add %o0, 64, %o0 - - ! copy line 8 of 8 - prefetch [%l2+%g5],2 - prefetch [%l7+%g5],2 - prefetch [%l1+%g5],1 - - ldd [%o0],%d32 - ldd [%o0+8],%d34 - ldd [%o0+16],%d36 - ldd [%o0+24],%d38 - ldd [%o0+32],%d40 - ldd [%o0+40],%d42 - ldd [%o0+48],%d44 - ldd [%o0+56],%d46 - stda %d32,[%o1+0] %asi - - subcc %o5,1,%o5 ! Decrement the inner loop counter. - - ! 
Now increment by 64 + 512 so we don't toggle PA[9] - - add %g5, 576, %g5 - add %o1, 576, %o1 ! increment dst buffer - - bg,pt %icc,innerloop_bcopy - add %o0, 576, %o0 ! increment src buffer - ! END OF INNER LOOP - - - subcc %l5,1,%l5 - add %l3, 512, %o1 ! increment dst buf to the first buddy line - bg,pt %icc,buddyloop_bcopy - add %g1, 512 ,%o0 ! increment src buf to the first buddy lines. */ - - subcc %l4, 1, %l4 - add %o1, 3584, %o1 ! Advance src and dst buffers by 4k - add %o0, 3584, %o0 ! They were already incremented by 512, - ! so just add 3584. - - bg,pt %icc,stingray_optimized_4k_copy_loop - nop - - ! End of stingray_optimized_copy - ! if we have 256 or more bytes to copy we use interleave128_copy - ! else we use copy_word - - sub %o2,%o1,%o2 ! bytes remaining to be copied - brz,pn %o2,exit - mov %g0,%l4 - add %o1,%o2,%l1 !cal the last byte to write %l1 - subcc %o2,256,%g0 - bge,pt %ncc,interleave128_copy - mov %g0, %l4 - - ba copy_word - nop - - SET_SIZE(bcopy) - SET_SIZE(__align_cpy_1) -#endif /* lint */ - -#define REALSRC %i0 -#define DST %i1 -#define CNT %i2 -#define SRC %i3 -#define TMP %i5 - -/* - * Block copy with possibly overlapped operands. - */ - -#if defined(lint) - -/*ARGSUSED*/ -void -ovbcopy(const void *from, void *to, size_t count) -{} - -#else /* lint */ - - ENTRY(ovbcopy) - tst %o2 ! check count - bgu,a %ncc, 1f ! nothing to do or bad arguments - subcc %o0, %o1, %o3 ! difference of from and to address - - retl ! return - nop -1: - bneg,a %ncc, 2f - neg %o3 ! if < 0, make it positive -2: cmp %o2, %o3 ! cmp size and abs(from - to) - bleu %ncc, bcopy ! if size <= abs(diff): use bcopy, - .empty ! no overlap - cmp %o0, %o1 ! compare from and to addresses - blu %ncc, .ov_bkwd ! if from < to, copy backwards - nop - ! - ! Copy forwards. - ! -.ov_fwd: - ldub [%o0], %o3 ! read from address - inc %o0 ! inc from address - stb %o3, [%o1] ! write to address - deccc %o2 ! dec count - bgu %ncc, .ov_fwd ! loop till done - inc %o1 ! 
inc to address - - retl ! return - nop - ! - ! Copy backwards. - ! -.ov_bkwd: - deccc %o2 ! dec count - ldub [%o0 + %o2], %o3 ! get byte at end of src - bgu %ncc, .ov_bkwd ! loop till done - stb %o3, [%o1 + %o2] ! delay slot, store at end of dst - - retl ! return - nop - - SET_SIZE(ovbcopy) - -#endif /* lint */ - - -/* - * hwblkpagecopy() - * - * Copies exactly one page. This routine assumes the caller (ppcopy) - * has already disabled kernel preemption and has checked - * use_hw_bcopy. Preventing preemption also prevents cpu migration. - */ -#ifdef lint -/*ARGSUSED*/ -void -hwblkpagecopy(const void *src, void *dst) -{ } -#else /* lint */ - ENTRY(hwblkpagecopy) - ! get another window w/space for three aligned blocks of saved fpregs - save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp - - ! %i0 - source address (arg) - ! %i1 - destination address (arg) - ! %i2 - length of region (not arg) - ! %l0 - saved fprs - ! %l1 - pointer to saved fpregs - - rd %fprs, %l0 ! check for unused fp - - ! FPU enabled ? If not, enable it. - btst FPRS_FEF, %l0 - bz,a,pt %icc, 1f - wr %g0, FPRS_FEF, %fprs - - ! save the FP registers even if DU is not set. - - BST_FPQ3Q4_TOSTACK(%l1) - -1: set PAGESIZE, CNT - mov %i1, %o0 ! 
store destination address for flushing - mov REALSRC, SRC - - prefetch [SRC], #one_read - prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read - prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read - prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read - ldd [SRC], %d32 -#if FIRST_PREFETCH > 4 - prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read -#endif - ldd [SRC + 0x08], %d34 -#if FIRST_PREFETCH > 5 - prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read -#endif - ldd [SRC + 0x10], %d36 -#if FIRST_PREFETCH > 6 - prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read -#endif - faligndata %d32, %d34, %d48 - ldd [SRC + 0x18], %d38 -#if FIRST_PREFETCH > 7 - prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read -#endif - faligndata %d34, %d36, %d50 - ldd [SRC + 0x20], %d40 - faligndata %d36, %d38, %d52 - ldd [SRC + 0x28], %d42 - faligndata %d38, %d40, %d54 - ldd [SRC + 0x30], %d44 - faligndata %d40, %d42, %d56 - ldd [SRC + 0x38], %d46 - faligndata %d42, %d44, %d58 - ldd [SRC + VIS_BLOCKSIZE], %d32 - sub CNT, VIS_BLOCKSIZE, CNT - add SRC, VIS_BLOCKSIZE, SRC - ba,a,pt %ncc, 2f - nop - .align ICACHE_LINE_SIZE -2: - ldd [SRC + 0x08], %d34 - faligndata %d44, %d46, %d60 - ldd [SRC + 0x10], %d36 - faligndata %d46, %d32, %d62 - stda %d48, [DST]ASI_BLK_P - ldd [SRC + 0x18], %d38 - faligndata %d32, %d34, %d48 - ldd [SRC + 0x20], %d40 - faligndata %d34, %d36, %d50 - ldd [SRC + 0x28], %d42 - faligndata %d36, %d38, %d52 - ldd [SRC + 0x30], %d44 - faligndata %d38, %d40, %d54 - ldd [SRC + 0x38], %d46 - faligndata %d40, %d42, %d56 - ldd [SRC + VIS_BLOCKSIZE], %d32 - faligndata %d42, %d44, %d58 - prefetch [SRC + ((FIRST_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read - sub CNT, VIS_BLOCKSIZE, CNT - add DST, VIS_BLOCKSIZE, DST - cmp CNT, VIS_BLOCKSIZE + 8 - prefetch [SRC + ((SECOND_PREFETCH) * VIS_BLOCKSIZE)], #one_read - bgu,pt %ncc, 2b - add SRC, VIS_BLOCKSIZE, SRC - - ! 
trailing block - ldd [SRC + 0x08], %d34 - faligndata %d44, %d46, %d60 - ldd [SRC + 0x10], %d36 - faligndata %d46, %d32, %d62 - stda %d48, [DST]ASI_BLK_P - ldd [SRC + 0x18], %d38 - ldd [SRC + 0x20], %d40 - ldd [SRC + 0x28], %d42 - ldd [SRC + 0x30], %d44 - ldd [SRC + 0x38], %d46 - sub CNT, VIS_BLOCKSIZE, CNT - add DST, VIS_BLOCKSIZE, DST - add SRC, VIS_BLOCKSIZE, SRC - stda %d32, [DST]ASI_BLK_P - - set PAGESIZE, %o1 - call rock_sync_icache - nop - - membar #Sync - - btst FPRS_FEF, %l0 - bz,pt %icc, 2f - nop - - BLD_FPQ3Q4_FROMSTACK(%l3) - ba 3f - nop - -2: FZEROQ3Q4 - -3: wr %l0, 0, %fprs ! restore fprs - ret - restore %g0, 0, %o0 - - SET_SIZE(hwblkpagecopy) -#endif /* lint */ - - -/* - * Transfer data to and from user space - - * Note that these routines can cause faults - * It is assumed that the kernel has nothing at - * less than KERNELBASE in the virtual address space. - * - * Note that copyin(9F) and copyout(9F) are part of the - * DDI/DKI which specifies that they return '-1' on "errors." - * - * Sigh. - * - * So there's two extremely similar routines - xcopyin() and xcopyout() - * which return the errno that we've faithfully computed. This - * allows other callers (e.g. uiomove(9F)) to work correctly. - * Given that these are used pretty heavily, we expand the calling - * sequences inline for all flavours (rather than making wrappers). - * - * There are also stub routines for xcopyout_little and xcopyin_little, - * which currently are intended to handle requests of <= 16 bytes from - * do_unaligned. Future enhancement to make them handle 8k pages efficiently - * is left as an exercise... - */ - -/* - * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr) - * - * General theory of operation: - * - * The only difference between copy{in,out} and - * xcopy{in,out} is in the error handling routine they invoke - * when a memory access error occurs. xcopyOP returns the errno - * while copyOP returns -1 (see above). 
copy{in,out}_noerr set - * a special flag (by oring the TRAMP_FLAG into the fault handler address) - * if they are called with a fault handler already in place. That flag - * causes the default handlers to trampoline to the previous handler - * upon an error. - * - * None of the copyops routines grab a window until it's decided that - * we need to do a HW block copy operation. This saves a window - * spill/fill when we're called during socket ops. The typical IO - * path won't cause spill/fill traps. - * - * This code uses a set of 4 limits for the maximum size that will - * be copied given a particular input/output address alignment. - * If the value for a particular limit is zero, the copy will be performed - * by the plain copy loops rather than FPBLK. - * - * See the description of bcopy above for more details of the - * data copying algorithm and the default limits. - * - */ - -/* - * Copy kernel data to user space (copyout/xcopyout/xcopyout_little). - */ - -#if defined(lint) - - -#else /* lint */ -/* - * We save the arguments in the following registers in case of a fault: - * kaddr - %l1 - * uaddr - %l2 - * count - %l3 - */ -#define SAVE_SRC %l1 -#define SAVE_DST %l2 -#define SAVE_COUNT %l3 - -#define SM_SAVE_SRC %g4 -#define SM_SAVE_DST %g5 -#define SM_SAVE_COUNT %o5 -#define ERRNO %l5 - - -#define REAL_LOFAULT %l4 -/* - * Generic copyio fault handler. This is the first line of defense when a - * fault occurs in (x)copyin/(x)copyout. In order for this to function - * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT. - * This allows us to share common code for all the flavors of the copy - * operations, including the _noerr versions. - * - * Note that this function will restore the original input parameters before - * calling REAL_LOFAULT. So the real handler can vector to the appropriate - * member of the t_copyop structure, if needed. - */ - ENTRY(copyio_fault) - membar #Sync - mov %g1,ERRNO ! 
save errno in ERRNO - btst FPUSED_FLAG, %l6 - bz %ncc, 1f - nop - - ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 - wr %o2, 0, %gsr ! restore gsr - - ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 - btst FPRS_FEF, %o3 - bz,pt %icc, 4f - nop - - BLD_FPQ3Q4_FROMSTACK(%o2) - - ba,pt %ncc, 1f - wr %o3, 0, %fprs ! restore fprs - -4: - FZEROQ3Q4 - wr %o3, 0, %fprs ! restore fprs - -1: - andn %l6, FPUSED_FLAG, %l6 - membar #Sync - stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - FP_ALLOWMIGRATE(5, 6) - - mov SAVE_SRC, %i0 - mov SAVE_DST, %i1 - jmp REAL_LOFAULT - mov SAVE_COUNT, %i2 - - SET_SIZE(copyio_fault) - - -#endif - -#if defined(lint) - -/*ARGSUSED*/ -int -copyout(const void *kaddr, void *uaddr, size_t count) -{ return (0); } - -#else /* lint */ - - ENTRY(copyout) - - cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case - bleu,pt %ncc, .copyout_small ! go to larger cases - xor %o0, %o1, %o3 ! are src, dst alignable? - btst 7, %o3 ! - bz,pt %ncc, .copyout_8 ! check for longword alignment - nop - btst 1, %o3 ! - bz,pt %ncc, .copyout_2 ! check for half-word - nop - sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_1)], %o3 - tst %o3 - bz,pn %icc, .copyout_small ! if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .copyout_small ! go to small copy - nop - ba,pt %ncc, .copyout_more ! otherwise go to large copy - nop -.copyout_2: - btst 3, %o3 ! - bz,pt %ncc, .copyout_4 ! check for word alignment - nop - sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_2)], %o3 - tst %o3 - bz,pn %icc, .copyout_small ! if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .copyout_small ! go to small copy - nop - ba,pt %ncc, .copyout_more ! otherwise go to large copy - nop -.copyout_4: - ! already checked longword, must be word aligned - sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_4)], %o3 - tst %o3 - bz,pn %icc, .copyout_small ! 
if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .copyout_small ! go to small copy - nop - ba,pt %ncc, .copyout_more ! otherwise go to large copy - nop -.copyout_8: - sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_8)], %o3 - tst %o3 - bz,pn %icc, .copyout_small ! if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .copyout_small ! go to small copy - nop - ba,pt %ncc, .copyout_more ! otherwise go to large copy - nop - - .align 16 - nop ! instruction alignment - ! see discussion at start of file -.copyout_small: - sethi %hi(.sm_copyout_err), %o5 ! .sm_copyout_err is lofault - or %o5, %lo(.sm_copyout_err), %o5 - ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler - membar #Sync ! sync error barrier - stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault -.sm_do_copyout: - mov %o0, SM_SAVE_SRC - mov %o1, SM_SAVE_DST - cmp %o2, SHORTCOPY ! check for really short case - bleu,pt %ncc, .co_sm_left ! - mov %o2, SM_SAVE_COUNT - cmp %o2, CHKSIZE ! check for medium length cases - bgu,pn %ncc, .co_med ! - or %o0, %o1, %o3 ! prepare alignment check - andcc %o3, 0x3, %g0 ! test for alignment - bz,pt %ncc, .co_sm_word ! branch to word aligned case -.co_sm_movebytes: - sub %o2, 3, %o2 ! adjust count to allow cc zero test -.co_sm_notalign4: - ldub [%o0], %o3 ! read byte - subcc %o2, 4, %o2 ! reduce count by 4 - stba %o3, [%o1]ASI_USER ! write byte - inc %o1 ! advance DST by 1 - ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes - add %o0, 4, %o0 ! advance SRC by 4 - stba %o3, [%o1]ASI_USER - inc %o1 ! advance DST by 1 - ldub [%o0 - 2], %o3 - stba %o3, [%o1]ASI_USER - inc %o1 ! advance DST by 1 - ldub [%o0 - 1], %o3 - stba %o3, [%o1]ASI_USER - bgt,pt %ncc, .co_sm_notalign4 ! loop til 3 or fewer bytes remain - inc %o1 ! advance DST by 1 - add %o2, 3, %o2 ! restore count -.co_sm_left: - tst %o2 - bz,pt %ncc, .co_sm_exit ! check for zero length - nop - ldub [%o0], %o3 ! load one byte - deccc %o2 ! 
reduce count for cc test - bz,pt %ncc, .co_sm_exit - stba %o3,[%o1]ASI_USER ! store one byte - ldub [%o0 + 1], %o3 ! load second byte - deccc %o2 - inc %o1 - bz,pt %ncc, .co_sm_exit - stba %o3,[%o1]ASI_USER ! store second byte - ldub [%o0 + 2], %o3 ! load third byte - inc %o1 - stba %o3,[%o1]ASI_USER ! store third byte - membar #Sync ! sync error barrier - stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - retl - mov %g0, %o0 ! return 0 - .align 16 -.co_sm_words: - lduw [%o0], %o3 ! read word -.co_sm_wordx: - subcc %o2, 8, %o2 ! update count - stwa %o3, [%o1]ASI_USER ! write word - add %o0, 8, %o0 ! update SRC - lduw [%o0 - 4], %o3 ! read word - add %o1, 4, %o1 ! update DST - stwa %o3, [%o1]ASI_USER ! write word - bgt,pt %ncc, .co_sm_words ! loop til done - add %o1, 4, %o1 ! update DST - addcc %o2, 7, %o2 ! restore count - bz,pt %ncc, .co_sm_exit - nop - deccc %o2 - bz,pt %ncc, .co_sm_byte -.co_sm_half: - subcc %o2, 2, %o2 ! reduce count by 2 - lduh [%o0], %o3 ! read half word - add %o0, 2, %o0 ! advance SRC by 2 - stha %o3, [%o1]ASI_USER ! write half word - bgt,pt %ncc, .co_sm_half ! loop til done - add %o1, 2, %o1 ! advance DST by 2 - addcc %o2, 1, %o2 ! restore count - bz,pt %ncc, .co_sm_exit - nop -.co_sm_byte: - ldub [%o0], %o3 - stba %o3, [%o1]ASI_USER - membar #Sync ! sync error barrier - stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - retl - mov %g0, %o0 ! return 0 - .align 16 -.co_sm_word: - subcc %o2, 4, %o2 ! update count - bgt,pt %ncc, .co_sm_wordx - lduw [%o0], %o3 ! read word - addcc %o2, 3, %o2 ! restore count - bz,pt %ncc, .co_sm_exit - stwa %o3, [%o1]ASI_USER ! write word - deccc %o2 ! reduce count for cc test - ldub [%o0 + 4], %o3 ! load one byte - add %o1, 4, %o1 - bz,pt %ncc, .co_sm_exit - stba %o3, [%o1]ASI_USER ! store one byte - ldub [%o0 + 5], %o3 ! load second byte - deccc %o2 - inc %o1 - bz,pt %ncc, .co_sm_exit - stba %o3, [%o1]ASI_USER ! store second byte - ldub [%o0 + 6], %o3 ! 
load third byte - inc %o1 - stba %o3, [%o1]ASI_USER ! store third byte -.co_sm_exit: - membar #Sync ! sync error barrier - stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - retl - mov %g0, %o0 ! return 0 - - .align 16 -.co_med: - xor %o0, %o1, %o3 ! setup alignment check - btst 1, %o3 - bnz,pt %ncc, .co_sm_movebytes ! unaligned - nop - btst 3, %o3 - bnz,pt %ncc, .co_med_half ! halfword aligned - nop - btst 7, %o3 - bnz,pt %ncc, .co_med_word ! word aligned - nop -.co_med_long: - btst 3, %o0 ! check for - bz,pt %ncc, .co_med_long1 ! word alignment - nop -.co_med_long0: - ldub [%o0], %o3 ! load one byte - inc %o0 - stba %o3,[%o1]ASI_USER ! store byte - inc %o1 - btst 3, %o0 - bnz,pt %ncc, .co_med_long0 - dec %o2 -.co_med_long1: ! word aligned - btst 7, %o0 ! check for long word - bz,pt %ncc, .co_med_long2 - nop - lduw [%o0], %o3 ! load word - add %o0, 4, %o0 ! advance SRC by 4 - stwa %o3, [%o1]ASI_USER ! store word - add %o1, 4, %o1 ! advance DST by 4 - sub %o2, 4, %o2 ! reduce count by 4 -! -! Now long word aligned and have at least 32 bytes to move -! -.co_med_long2: - sub %o2, 31, %o2 ! adjust count to allow cc zero test - sub %o1, 8, %o1 ! adjust pointer to allow store in - ! branch delay slot instead of add -.co_med_lmove: - add %o1, 8, %o1 ! advance DST by 8 - ldx [%o0], %o3 ! read long word - subcc %o2, 32, %o2 ! reduce count by 32 - stxa %o3, [%o1]ASI_USER ! write long word - add %o1, 8, %o1 ! advance DST by 8 - ldx [%o0 + 8], %o3 ! repeat for a total for 4 long words - add %o0, 32, %o0 ! advance SRC by 32 - stxa %o3, [%o1]ASI_USER - ldx [%o0 - 16], %o3 - add %o1, 8, %o1 ! advance DST by 8 - stxa %o3, [%o1]ASI_USER - ldx [%o0 - 8], %o3 - add %o1, 8, %o1 ! advance DST by 8 - bgt,pt %ncc, .co_med_lmove ! loop til 31 or fewer bytes left - stxa %o3, [%o1]ASI_USER - add %o1, 8, %o1 ! advance DST by 8 - addcc %o2, 24, %o2 ! restore count to long word offset - ble,pt %ncc, .co_med_lextra ! 
check for more long words to move - nop -.co_med_lword: - ldx [%o0], %o3 ! read long word - subcc %o2, 8, %o2 ! reduce count by 8 - stxa %o3, [%o1]ASI_USER ! write long word - add %o0, 8, %o0 ! advance SRC by 8 - bgt,pt %ncc, .co_med_lword ! loop til 7 or fewer bytes left - add %o1, 8, %o1 ! advance DST by 8 -.co_med_lextra: - addcc %o2, 7, %o2 ! restore rest of count - bz,pt %ncc, .co_sm_exit ! if zero, then done - deccc %o2 - bz,pt %ncc, .co_sm_byte - nop - ba,pt %ncc, .co_sm_half - nop - - .align 16 - nop ! instruction alignment - ! see discussion at start of file -.co_med_word: - btst 3, %o0 ! check for - bz,pt %ncc, .co_med_word1 ! word alignment - nop -.co_med_word0: - ldub [%o0], %o3 ! load one byte - inc %o0 - stba %o3,[%o1]ASI_USER ! store byte - inc %o1 - btst 3, %o0 - bnz,pt %ncc, .co_med_word0 - dec %o2 -! -! Now word aligned and have at least 36 bytes to move -! -.co_med_word1: - sub %o2, 15, %o2 ! adjust count to allow cc zero test -.co_med_wmove: - lduw [%o0], %o3 ! read word - subcc %o2, 16, %o2 ! reduce count by 16 - stwa %o3, [%o1]ASI_USER ! write word - add %o1, 4, %o1 ! advance DST by 4 - lduw [%o0 + 4], %o3 ! repeat for a total for 4 words - add %o0, 16, %o0 ! advance SRC by 16 - stwa %o3, [%o1]ASI_USER - add %o1, 4, %o1 ! advance DST by 4 - lduw [%o0 - 8], %o3 - stwa %o3, [%o1]ASI_USER - add %o1, 4, %o1 ! advance DST by 4 - lduw [%o0 - 4], %o3 - stwa %o3, [%o1]ASI_USER - bgt,pt %ncc, .co_med_wmove ! loop til 15 or fewer bytes left - add %o1, 4, %o1 ! advance DST by 4 - addcc %o2, 12, %o2 ! restore count to word offset - ble,pt %ncc, .co_med_wextra ! check for more words to move - nop -.co_med_word2: - lduw [%o0], %o3 ! read word - subcc %o2, 4, %o2 ! reduce count by 4 - stwa %o3, [%o1]ASI_USER ! write word - add %o0, 4, %o0 ! advance SRC by 4 - bgt,pt %ncc, .co_med_word2 ! loop til 3 or fewer bytes left - add %o1, 4, %o1 ! advance DST by 4 -.co_med_wextra: - addcc %o2, 3, %o2 ! restore rest of count - bz,pt %ncc, .co_sm_exit ! 
if zero, then done - deccc %o2 - bz,pt %ncc, .co_sm_byte - nop - ba,pt %ncc, .co_sm_half - nop - - .align 16 - nop ! instruction alignment - nop ! see discussion at start of file - nop -.co_med_half: - btst 1, %o0 ! check for - bz,pt %ncc, .co_med_half1 ! half word alignment - nop - ldub [%o0], %o3 ! load one byte - inc %o0 - stba %o3,[%o1]ASI_USER ! store byte - inc %o1 - dec %o2 -! -! Now half word aligned and have at least 38 bytes to move -! -.co_med_half1: - sub %o2, 7, %o2 ! adjust count to allow cc zero test -.co_med_hmove: - lduh [%o0], %o3 ! read half word - subcc %o2, 8, %o2 ! reduce count by 8 - stha %o3, [%o1]ASI_USER ! write half word - add %o1, 2, %o1 ! advance DST by 2 - lduh [%o0 + 2], %o3 ! repeat for a total for 4 halfwords - add %o0, 8, %o0 ! advance SRC by 8 - stha %o3, [%o1]ASI_USER - add %o1, 2, %o1 ! advance DST by 2 - lduh [%o0 - 4], %o3 - stha %o3, [%o1]ASI_USER - add %o1, 2, %o1 ! advance DST by 2 - lduh [%o0 - 2], %o3 - stha %o3, [%o1]ASI_USER - bgt,pt %ncc, .co_med_hmove ! loop til 7 or fewer bytes left - add %o1, 2, %o1 ! advance DST by 2 - addcc %o2, 7, %o2 ! restore count - bz,pt %ncc, .co_sm_exit - deccc %o2 - bz,pt %ncc, .co_sm_byte - nop - ba,pt %ncc, .co_sm_half - nop - -/* - * We got here because of a fault during short copyout. - * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). - */ -.sm_copyout_err: - membar #Sync - stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - mov SM_SAVE_SRC, %o0 - mov SM_SAVE_DST, %o1 - mov SM_SAVE_COUNT, %o2 - ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler - tst %o3 - bz,pt %ncc, 3f ! if not, return error - nop - ldn [%o3 + CP_COPYOUT], %o5 ! if handler, invoke it with - jmp %o5 ! original arguments - nop -3: - retl - or %g0, -1, %o0 ! return error value - - SET_SIZE(copyout) - -/* - * The _more entry points are not intended to be used directly by - * any caller from outside this file. 
They are provided to allow - * profiling and dtrace of the portions of the copy code that uses - * the floating point registers. - * This entry is particularly important as DTRACE (at least as of - * 4/2004) does not support leaf functions. - */ - - ENTRY(copyout_more) -.copyout_more: - save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp - set .copyout_err, REAL_LOFAULT - -/* - * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes - */ -.do_copyout: - set copyio_fault, %l7 ! .copyio_fault is lofault val - - ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler - membar #Sync ! sync error barrier - stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault - - mov %i0, SAVE_SRC - mov %i1, SAVE_DST - mov %i2, SAVE_COUNT - - FP_NOMIGRATE(6, 7) - - rd %fprs, %o2 ! check for unused fp - st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs - - ! FPU enabled ? If not, enable it. - btst FPRS_FEF, %o2 - bz,a,pt %icc, .do_blockcopyout - wr %g0, FPRS_FEF, %fprs - - ! save the FP registers even if DU is not set. - - BST_FPQ3Q4_TOSTACK(%o2) - -.do_blockcopyout: - rd %gsr, %o2 - stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr - or %l6, FPUSED_FLAG, %l6 - - andcc DST, VIS_BLOCKSIZE - 1, TMP - mov ASI_USER, %asi - bz,pt %ncc, 2f - neg TMP - add TMP, VIS_BLOCKSIZE, TMP - - ! TMP = bytes required to align DST on FP_BLOCK boundary - ! Using SRC as a tmp here - cmp TMP, 3 - bleu,pt %ncc, 1f - sub CNT,TMP,CNT ! adjust main count - sub TMP, 3, TMP ! adjust for end of loop test -.co_blkalign: - ldub [REALSRC], SRC ! move 4 bytes per loop iteration - stba SRC, [DST]%asi - subcc TMP, 4, TMP - ldub [REALSRC + 1], SRC - add REALSRC, 4, REALSRC - stba SRC, [DST + 1]%asi - ldub [REALSRC - 2], SRC - add DST, 4, DST - stba SRC, [DST - 2]%asi - ldub [REALSRC - 1], SRC - bgu,pt %ncc, .co_blkalign - stba SRC, [DST - 1]%asi - - addcc TMP, 3, TMP ! restore count adjustment - bz,pt %ncc, 2f ! no bytes left? 
- nop -1: ldub [REALSRC], SRC - inc REALSRC - inc DST - deccc TMP - bgu %ncc, 1b - stba SRC, [DST - 1]%asi - -2: - andn REALSRC, 0x7, SRC - alignaddr REALSRC, %g0, %g0 - - ! SRC - 8-byte aligned - ! DST - 64-byte aligned - prefetch [SRC], #one_read - prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read - prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read - prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read - ldd [SRC], %d32 -#if FIRST_PREFETCH > 4 - prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read -#endif - ldd [SRC + 0x08], %d34 -#if FIRST_PREFETCH > 5 - prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read -#endif - ldd [SRC + 0x10], %d36 -#if FIRST_PREFETCH > 6 - prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read -#endif - faligndata %d32, %d34, %d48 - ldd [SRC + 0x18], %d38 -#if FIRST_PREFETCH > 7 - prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read -#endif - faligndata %d34, %d36, %d50 - ldd [SRC + 0x20], %d40 - faligndata %d36, %d38, %d52 - ldd [SRC + 0x28], %d42 - faligndata %d38, %d40, %d54 - ldd [SRC + 0x30], %d44 - faligndata %d40, %d42, %d56 - ldd [SRC + 0x38], %d46 - faligndata %d42, %d44, %d58 - ldd [SRC + VIS_BLOCKSIZE], %d32 - sub CNT, VIS_BLOCKSIZE, CNT - add SRC, VIS_BLOCKSIZE, SRC - add REALSRC, VIS_BLOCKSIZE, REALSRC - ba,a,pt %ncc, 1f - nop - .align ICACHE_LINE_SIZE -1: - ldd [SRC + 0x08], %d34 - faligndata %d44, %d46, %d60 - ldd [SRC + 0x10], %d36 - faligndata %d46, %d32, %d62 - stda %d48, [DST]ASI_BLK_AIUS - ldd [SRC + 0x18], %d38 - faligndata %d32, %d34, %d48 - ldd [SRC + 0x20], %d40 - faligndata %d34, %d36, %d50 - ldd [SRC + 0x28], %d42 - faligndata %d36, %d38, %d52 - ldd [SRC + 0x30], %d44 - faligndata %d38, %d40, %d54 - ldd [SRC + 0x38], %d46 - faligndata %d40, %d42, %d56 - sub CNT, VIS_BLOCKSIZE, CNT - ldd [SRC + VIS_BLOCKSIZE], %d32 - faligndata %d42, %d44, %d58 - prefetch [SRC + ((FIRST_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read - add DST, VIS_BLOCKSIZE, DST - prefetch [SRC + ((SECOND_PREFETCH) * VIS_BLOCKSIZE)], #one_read - add REALSRC, VIS_BLOCKSIZE, REALSRC 
- cmp CNT, VIS_BLOCKSIZE + 8 - bgu,pt %ncc, 1b - add SRC, VIS_BLOCKSIZE, SRC - - ! only if REALSRC & 0x7 is 0 - cmp CNT, VIS_BLOCKSIZE - bne %ncc, 3f - andcc REALSRC, 0x7, %g0 - bz,pt %ncc, 2f - nop -3: - faligndata %d44, %d46, %d60 - faligndata %d46, %d32, %d62 - stda %d48, [DST]ASI_BLK_AIUS - add DST, VIS_BLOCKSIZE, DST - ba,pt %ncc, 3f - nop -2: - ldd [SRC + 0x08], %d34 - faligndata %d44, %d46, %d60 - ldd [SRC + 0x10], %d36 - faligndata %d46, %d32, %d62 - stda %d48, [DST]ASI_BLK_AIUS - ldd [SRC + 0x18], %d38 - ldd [SRC + 0x20], %d40 - ldd [SRC + 0x28], %d42 - ldd [SRC + 0x30], %d44 - ldd [SRC + 0x38], %d46 - sub CNT, VIS_BLOCKSIZE, CNT - add DST, VIS_BLOCKSIZE, DST - add SRC, VIS_BLOCKSIZE, SRC - add REALSRC, VIS_BLOCKSIZE, REALSRC - stda %d32, [DST]ASI_BLK_AIUS - add DST, VIS_BLOCKSIZE, DST - ba,a,pt %ncc, 4f - nop - -3: tst CNT - bz,a %ncc, 4f - nop - -5: ldub [REALSRC], TMP - inc REALSRC - inc DST - deccc CNT - bgu %ncc, 5b - stba TMP, [DST - 1]%asi -4: - -.copyout_exit: - membar #Sync - - ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 - wr %o2, 0, %gsr ! restore gsr - - ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 - btst FPRS_FEF, %o3 - bz,pt %icc, 4f - nop - - BLD_FPQ3Q4_FROMSTACK(%o2) - - ba,pt %ncc, 1f - wr %o3, 0, %fprs ! restore fprs - -4: - FZEROQ3Q4 - wr %o3, 0, %fprs ! restore fprs - -1: - membar #Sync - andn %l6, FPUSED_FLAG, %l6 - stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - FP_ALLOWMIGRATE(5, 6) - ret - restore %g0, 0, %o0 - -/* - * We got here because of a fault during copyout. - * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). - */ -.copyout_err: - ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler - tst %o4 - bz,pt %ncc, 2f ! if not, return error - nop - ldn [%o4 + CP_COPYOUT], %g2 ! if handler, invoke it with - jmp %g2 ! original arguments - restore %g0, 0, %g0 ! dispose of copy window -2: - ret - restore %g0, -1, %o0 ! 
return error value - - - SET_SIZE(copyout_more) - -#endif /* lint */ - - -#ifdef lint - -/*ARGSUSED*/ -int -xcopyout(const void *kaddr, void *uaddr, size_t count) -{ return (0); } - -#else /* lint */ - - ENTRY(xcopyout) - cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case - bleu,pt %ncc, .xcopyout_small ! go to larger cases - xor %o0, %o1, %o3 ! are src, dst alignable? - btst 7, %o3 ! - bz,pt %ncc, .xcopyout_8 ! - nop - btst 1, %o3 ! - bz,pt %ncc, .xcopyout_2 ! check for half-word - nop - sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_1)], %o3 - tst %o3 - bz,pn %icc, .xcopyout_small ! if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .xcopyout_small ! go to small copy - nop - ba,pt %ncc, .xcopyout_more ! otherwise go to large copy - nop -.xcopyout_2: - btst 3, %o3 ! - bz,pt %ncc, .xcopyout_4 ! check for word alignment - nop - sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_2)], %o3 - tst %o3 - bz,pn %icc, .xcopyout_small ! if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .xcopyout_small ! go to small copy - nop - ba,pt %ncc, .xcopyout_more ! otherwise go to large copy - nop -.xcopyout_4: - ! already checked longword, must be word aligned - sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_4)], %o3 - tst %o3 - bz,pn %icc, .xcopyout_small ! if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .xcopyout_small ! go to small copy - nop - ba,pt %ncc, .xcopyout_more ! otherwise go to large copy - nop -.xcopyout_8: - sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_8)], %o3 - tst %o3 - bz,pn %icc, .xcopyout_small ! if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .xcopyout_small ! go to small copy - nop - ba,pt %ncc, .xcopyout_more ! otherwise go to large copy - nop - -.xcopyout_small: - sethi %hi(.sm_xcopyout_err), %o5 ! 
.sm_xcopyout_err is lofault - or %o5, %lo(.sm_xcopyout_err), %o5 - ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler - membar #Sync ! sync error barrier - ba,pt %ncc, .sm_do_copyout ! common code - stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault - -.xcopyout_more: - save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp - sethi %hi(.xcopyout_err), REAL_LOFAULT - ba,pt %ncc, .do_copyout ! common code - or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT - -/* - * We got here because of fault during xcopyout - * Errno value is in ERRNO - */ -.xcopyout_err: - ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler - tst %o4 - bz,pt %ncc, 2f ! if not, return error - nop - ldn [%o4 + CP_XCOPYOUT], %g2 ! if handler, invoke it with - jmp %g2 ! original arguments - restore %g0, 0, %g0 ! dispose of copy window -2: - ret - restore ERRNO, 0, %o0 ! return errno value - -.sm_xcopyout_err: - - membar #Sync - stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - mov SM_SAVE_SRC, %o0 - mov SM_SAVE_DST, %o1 - mov SM_SAVE_COUNT, %o2 - ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler - tst %o3 - bz,pt %ncc, 3f ! if not, return error - nop - ldn [%o3 + CP_XCOPYOUT], %o5 ! if handler, invoke it with - jmp %o5 ! original arguments - nop -3: - retl - or %g1, 0, %o0 ! return errno value - - SET_SIZE(xcopyout) - -#endif /* lint */ - -#ifdef lint - -/*ARGSUSED*/ -int -xcopyout_little(const void *kaddr, void *uaddr, size_t count) -{ return (0); } - -#else /* lint */ - - ENTRY(xcopyout_little) - sethi %hi(.xcopyio_err), %o5 - or %o5, %lo(.xcopyio_err), %o5 - ldn [THREAD_REG + T_LOFAULT], %o4 - membar #Sync ! sync error barrier - stn %o5, [THREAD_REG + T_LOFAULT] - mov %o4, %o5 - - subcc %g0, %o2, %o3 - add %o0, %o2, %o0 - bz,pn %ncc, 2f ! check for zero bytes - sub %o2, 1, %o4 - add %o0, %o4, %o0 ! start w/last byte - add %o1, %o2, %o1 - ldub [%o0 + %o3], %o4 - -1: stba %o4, [%o1 + %o3]ASI_AIUSL - inccc %o3 - sub %o0, 2, %o0 ! 
get next byte - bcc,a,pt %ncc, 1b - ldub [%o0 + %o3], %o4 - -2: - membar #Sync ! sync error barrier - stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - retl - mov %g0, %o0 ! return (0) - - SET_SIZE(xcopyout_little) - -#endif /* lint */ - -/* - * Copy user data to kernel space (copyin/xcopyin/xcopyin_little) - */ - -#if defined(lint) - -/*ARGSUSED*/ -int -copyin(const void *uaddr, void *kaddr, size_t count) -{ return (0); } - -#else /* lint */ - - ENTRY(copyin) - cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case - bleu,pt %ncc, .copyin_small ! go to larger cases - xor %o0, %o1, %o3 ! are src, dst alignable? - btst 7, %o3 ! - bz,pt %ncc, .copyin_8 ! check for longword alignment - nop - btst 1, %o3 ! - bz,pt %ncc, .copyin_2 ! check for half-word - nop - sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_1)], %o3 - tst %o3 - bz,pn %icc, .copyin_small ! if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .copyin_small ! go to small copy - nop - ba,pt %ncc, .copyin_more ! otherwise go to large copy - nop -.copyin_2: - btst 3, %o3 ! - bz,pt %ncc, .copyin_4 ! check for word alignment - nop - sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_2)], %o3 - tst %o3 - bz,pn %icc, .copyin_small ! if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .copyin_small ! go to small copy - nop - ba,pt %ncc, .copyin_more ! otherwise go to large copy - nop -.copyin_4: - ! already checked longword, must be word aligned - sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_4)], %o3 - tst %o3 - bz,pn %icc, .copyin_small ! if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .copyin_small ! go to small copy - nop - ba,pt %ncc, .copyin_more ! otherwise go to large copy - nop -.copyin_8: - sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_8)], %o3 - tst %o3 - bz,pn %icc, .copyin_small ! 
if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .copyin_small ! go to small copy - nop - ba,pt %ncc, .copyin_more ! otherwise go to large copy - nop - - .align 16 - nop ! instruction alignment - ! see discussion at start of file -.copyin_small: - sethi %hi(.sm_copyin_err), %o5 ! .sm_copyin_err is lofault - or %o5, %lo(.sm_copyin_err), %o5 - ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofault, no tramp - membar #Sync ! sync error barrier - stn %o5, [THREAD_REG + T_LOFAULT] -.sm_do_copyin: - mov %o0, SM_SAVE_SRC - mov %o1, SM_SAVE_DST - cmp %o2, SHORTCOPY ! check for really short case - bleu,pt %ncc, .ci_sm_left ! - mov %o2, SM_SAVE_COUNT - cmp %o2, CHKSIZE ! check for medium length cases - bgu,pn %ncc, .ci_med ! - or %o0, %o1, %o3 ! prepare alignment check - andcc %o3, 0x3, %g0 ! test for alignment - bz,pt %ncc, .ci_sm_word ! branch to word aligned case -.ci_sm_movebytes: - sub %o2, 3, %o2 ! adjust count to allow cc zero test -.ci_sm_notalign4: - lduba [%o0]ASI_USER, %o3 ! read byte - subcc %o2, 4, %o2 ! reduce count by 4 - stb %o3, [%o1] ! write byte - add %o0, 1, %o0 ! advance SRC by 1 - lduba [%o0]ASI_USER, %o3 ! repeat for a total of 4 bytes - add %o0, 1, %o0 ! advance SRC by 1 - stb %o3, [%o1 + 1] - add %o1, 4, %o1 ! advance DST by 4 - lduba [%o0]ASI_USER, %o3 - add %o0, 1, %o0 ! advance SRC by 1 - stb %o3, [%o1 - 2] - lduba [%o0]ASI_USER, %o3 - add %o0, 1, %o0 ! advance SRC by 1 - bgt,pt %ncc, .ci_sm_notalign4 ! loop til 3 or fewer bytes remain - stb %o3, [%o1 - 1] - add %o2, 3, %o2 ! restore count -.ci_sm_left: - tst %o2 - bz,pt %ncc, .ci_sm_exit - nop - lduba [%o0]ASI_USER, %o3 ! load one byte - deccc %o2 ! reduce count for cc test - bz,pt %ncc, .ci_sm_exit - stb %o3,[%o1] ! store one byte - inc %o0 - lduba [%o0]ASI_USER, %o3 ! load second byte - deccc %o2 - bz,pt %ncc, .ci_sm_exit - stb %o3,[%o1 + 1] ! store second byte - inc %o0 - lduba [%o0]ASI_USER, %o3 ! load third byte - stb %o3,[%o1 + 2] ! 
store third byte - membar #Sync ! sync error barrier - stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - retl - mov %g0, %o0 ! return 0 - .align 16 -.ci_sm_words: - lduwa [%o0]ASI_USER, %o3 ! read word -.ci_sm_wordx: - subcc %o2, 8, %o2 ! update count - stw %o3, [%o1] ! write word - add %o0, 4, %o0 ! update SRC - add %o1, 8, %o1 ! update DST - lduwa [%o0]ASI_USER, %o3 ! read word - add %o0, 4, %o0 ! update SRC - bgt,pt %ncc, .ci_sm_words ! loop til done - stw %o3, [%o1 - 4] ! write word - addcc %o2, 7, %o2 ! restore count - bz,pt %ncc, .ci_sm_exit - nop - deccc %o2 - bz,pt %ncc, .ci_sm_byte -.ci_sm_half: - subcc %o2, 2, %o2 ! reduce count by 2 - lduha [%o0]ASI_USER, %o3 ! read half word - add %o0, 2, %o0 ! advance SRC by 2 - add %o1, 2, %o1 ! advance DST by 2 - bgt,pt %ncc, .ci_sm_half ! loop til done - sth %o3, [%o1 - 2] ! write half word - addcc %o2, 1, %o2 ! restore count - bz,pt %ncc, .ci_sm_exit - nop -.ci_sm_byte: - lduba [%o0]ASI_USER, %o3 - stb %o3, [%o1] - membar #Sync ! sync error barrier - stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - retl - mov %g0, %o0 ! return 0 - .align 16 -.ci_sm_word: - subcc %o2, 4, %o2 ! update count - bgt,pt %ncc, .ci_sm_wordx - lduwa [%o0]ASI_USER, %o3 ! read word - addcc %o2, 3, %o2 ! restore count - bz,pt %ncc, .ci_sm_exit - stw %o3, [%o1] ! write word - deccc %o2 ! reduce count for cc test - add %o0, 4, %o0 - lduba [%o0]ASI_USER, %o3 ! load one byte - bz,pt %ncc, .ci_sm_exit - stb %o3, [%o1 + 4] ! store one byte - inc %o0 - lduba [%o0]ASI_USER, %o3 ! load second byte - deccc %o2 - bz,pt %ncc, .ci_sm_exit - stb %o3, [%o1 + 5] ! store second byte - inc %o0 - lduba [%o0]ASI_USER, %o3 ! load third byte - stb %o3, [%o1 + 6] ! store third byte -.ci_sm_exit: - membar #Sync ! sync error barrier - stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - retl - mov %g0, %o0 ! return 0 - - .align 16 -.ci_med: - xor %o0, %o1, %o3 ! setup alignment check - btst 1, %o3 - bnz,pt %ncc, .ci_sm_movebytes ! 
unaligned - nop - btst 3, %o3 - bnz,pt %ncc, .ci_med_half ! halfword aligned - nop - btst 7, %o3 - bnz,pt %ncc, .ci_med_word ! word aligned - nop -.ci_med_long: - btst 3, %o0 ! check for - bz,pt %ncc, .ci_med_long1 ! word alignment - nop -.ci_med_long0: - lduba [%o0]ASI_USER, %o3 ! load one byte - inc %o0 - stb %o3,[%o1] ! store byte - inc %o1 - btst 3, %o0 - bnz,pt %ncc, .ci_med_long0 - dec %o2 -.ci_med_long1: ! word aligned - btst 7, %o0 ! check for long word - bz,pt %ncc, .ci_med_long2 - nop - lduwa [%o0]ASI_USER, %o3 ! load word - add %o0, 4, %o0 ! advance SRC by 4 - stw %o3, [%o1] ! store word - add %o1, 4, %o1 ! advance DST by 4 - sub %o2, 4, %o2 ! reduce count by 4 -! -! Now long word aligned and have at least 32 bytes to move -! -.ci_med_long2: - sub %o2, 31, %o2 ! adjust count to allow cc zero test -.ci_med_lmove: - ldxa [%o0]ASI_USER, %o3 ! read long word - subcc %o2, 32, %o2 ! reduce count by 32 - stx %o3, [%o1] ! write long word - add %o0, 8, %o0 ! advance SRC by 8 - ldxa [%o0]ASI_USER, %o3 ! repeat for a total for 4 long words - add %o0, 8, %o0 ! advance SRC by 8 - stx %o3, [%o1 + 8] - add %o1, 32, %o1 ! advance DST by 32 - ldxa [%o0]ASI_USER, %o3 - add %o0, 8, %o0 ! advance SRC by 8 - stx %o3, [%o1 - 16] - ldxa [%o0]ASI_USER, %o3 - add %o0, 8, %o0 ! advance SRC by 8 - bgt,pt %ncc, .ci_med_lmove ! loop til 31 or fewer bytes left - stx %o3, [%o1 - 8] - addcc %o2, 24, %o2 ! restore count to long word offset - ble,pt %ncc, .ci_med_lextra ! check for more long words to move - nop -.ci_med_lword: - ldxa [%o0]ASI_USER, %o3 ! read long word - subcc %o2, 8, %o2 ! reduce count by 8 - stx %o3, [%o1] ! write long word - add %o0, 8, %o0 ! advance SRC by 8 - bgt,pt %ncc, .ci_med_lword ! loop til 7 or fewer bytes left - add %o1, 8, %o1 ! advance DST by 8 -.ci_med_lextra: - addcc %o2, 7, %o2 ! restore rest of count - bz,pt %ncc, .ci_sm_exit ! if zero, then done - deccc %o2 - bz,pt %ncc, .ci_sm_byte - nop - ba,pt %ncc, .ci_sm_half - nop - - .align 16 - nop ! 
instruction alignment - ! see discussion at start of file -.ci_med_word: - btst 3, %o0 ! check for - bz,pt %ncc, .ci_med_word1 ! word alignment - nop -.ci_med_word0: - lduba [%o0]ASI_USER, %o3 ! load one byte - inc %o0 - stb %o3,[%o1] ! store byte - inc %o1 - btst 3, %o0 - bnz,pt %ncc, .ci_med_word0 - dec %o2 -! -! Now word aligned and have at least 36 bytes to move -! -.ci_med_word1: - sub %o2, 15, %o2 ! adjust count to allow cc zero test -.ci_med_wmove: - lduwa [%o0]ASI_USER, %o3 ! read word - subcc %o2, 16, %o2 ! reduce count by 16 - stw %o3, [%o1] ! write word - add %o0, 4, %o0 ! advance SRC by 4 - lduwa [%o0]ASI_USER, %o3 ! repeat for a total for 4 words - add %o0, 4, %o0 ! advance SRC by 4 - stw %o3, [%o1 + 4] - add %o1, 16, %o1 ! advance DST by 16 - lduwa [%o0]ASI_USER, %o3 - add %o0, 4, %o0 ! advance SRC by 4 - stw %o3, [%o1 - 8] - lduwa [%o0]ASI_USER, %o3 - add %o0, 4, %o0 ! advance SRC by 4 - bgt,pt %ncc, .ci_med_wmove ! loop til 15 or fewer bytes left - stw %o3, [%o1 - 4] - addcc %o2, 12, %o2 ! restore count to word offset - ble,pt %ncc, .ci_med_wextra ! check for more words to move - nop -.ci_med_word2: - lduwa [%o0]ASI_USER, %o3 ! read word - subcc %o2, 4, %o2 ! reduce count by 4 - stw %o3, [%o1] ! write word - add %o0, 4, %o0 ! advance SRC by 4 - bgt,pt %ncc, .ci_med_word2 ! loop til 3 or fewer bytes left - add %o1, 4, %o1 ! advance DST by 4 -.ci_med_wextra: - addcc %o2, 3, %o2 ! restore rest of count - bz,pt %ncc, .ci_sm_exit ! if zero, then done - deccc %o2 - bz,pt %ncc, .ci_sm_byte - nop - ba,pt %ncc, .ci_sm_half - nop - - .align 16 - nop ! instruction alignment - ! see discussion at start of file -.ci_med_half: - btst 1, %o0 ! check for - bz,pt %ncc, .ci_med_half1 ! half word alignment - nop - lduba [%o0]ASI_USER, %o3 ! load one byte - inc %o0 - stb %o3,[%o1] ! store byte - inc %o1 - dec %o2 -! -! Now half word aligned and have at least 38 bytes to move -! -.ci_med_half1: - sub %o2, 7, %o2 ! 
adjust count to allow cc zero test -.ci_med_hmove: - lduha [%o0]ASI_USER, %o3 ! read half word - subcc %o2, 8, %o2 ! reduce count by 8 - sth %o3, [%o1] ! write half word - add %o0, 2, %o0 ! advance SRC by 2 - lduha [%o0]ASI_USER, %o3 ! repeat for a total for 4 halfwords - add %o0, 2, %o0 ! advance SRC by 2 - sth %o3, [%o1 + 2] - add %o1, 8, %o1 ! advance DST by 8 - lduha [%o0]ASI_USER, %o3 - add %o0, 2, %o0 ! advance SRC by 2 - sth %o3, [%o1 - 4] - lduha [%o0]ASI_USER, %o3 - add %o0, 2, %o0 ! advance SRC by 2 - bgt,pt %ncc, .ci_med_hmove ! loop til 7 or fewer bytes left - sth %o3, [%o1 - 2] - addcc %o2, 7, %o2 ! restore count - bz,pt %ncc, .ci_sm_exit - deccc %o2 - bz,pt %ncc, .ci_sm_byte - nop - ba,pt %ncc, .ci_sm_half - nop - -.sm_copyin_err: - membar #Sync - stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - mov SM_SAVE_SRC, %o0 - mov SM_SAVE_DST, %o1 - mov SM_SAVE_COUNT, %o2 - ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler - tst %o3 - bz,pt %ncc, 3f ! if not, return error - nop - ldn [%o3 + CP_COPYIN], %o5 ! if handler, invoke it with - jmp %o5 ! original arguments - nop -3: - retl - or %g0, -1, %o0 ! return error value - - SET_SIZE(copyin) - - -/* - * The _more entry points are not intended to be used directly by - * any caller from outside this file. They are provided to allow - * profiling and dtrace of the portions of the copy code that uses - * the floating point registers. - * This entry is particularly important as DTRACE (at least as of - * 4/2004) does not support leaf functions. - */ - - ENTRY(copyin_more) -.copyin_more: - save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp - set .copyin_err, REAL_LOFAULT - -/* - * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes - */ -.do_copyin: - set copyio_fault, %l7 ! .copyio_fault is lofault val - - ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler - membar #Sync ! sync error barrier - stn %l7, [THREAD_REG + T_LOFAULT] ! 
set t_lofault - - mov %i0, SAVE_SRC - mov %i1, SAVE_DST - mov %i2, SAVE_COUNT - - FP_NOMIGRATE(6, 7) - - rd %fprs, %o2 ! check for unused fp - st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs - - ! FPU enabled ? If not, enable it. - btst FPRS_FEF, %o2 - bz,a,pt %icc, .do_blockcopyin - wr %g0, FPRS_FEF, %fprs - - ! save the FP registers even if DU is not set. - - BST_FPQ3Q4_TOSTACK(%o2) - -.do_blockcopyin: - rd %gsr, %o2 - stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr - or %l6, FPUSED_FLAG, %l6 - - andcc DST, VIS_BLOCKSIZE - 1, TMP - mov ASI_USER, %asi - bz,pt %ncc, 2f - neg TMP - add TMP, VIS_BLOCKSIZE, TMP - - ! TMP = bytes required to align DST on FP_BLOCK boundary - ! Using SRC as a tmp here - cmp TMP, 3 - bleu,pt %ncc, 1f - sub CNT,TMP,CNT ! adjust main count - sub TMP, 3, TMP ! adjust for end of loop test -.ci_blkalign: - lduba [REALSRC]%asi, SRC ! move 4 bytes per loop iteration - stb SRC, [DST] - subcc TMP, 4, TMP - lduba [REALSRC + 1]%asi, SRC - add REALSRC, 4, REALSRC - stb SRC, [DST + 1] - lduba [REALSRC - 2]%asi, SRC - add DST, 4, DST - stb SRC, [DST - 2] - lduba [REALSRC - 1]%asi, SRC - bgu,pt %ncc, .ci_blkalign - stb SRC, [DST - 1] - - addcc TMP, 3, TMP ! restore count adjustment - bz,pt %ncc, 2f ! no bytes left? - nop -1: lduba [REALSRC]%asi, SRC - inc REALSRC - inc DST - deccc TMP - bgu %ncc, 1b - stb SRC, [DST - 1] - -2: - andn REALSRC, 0x7, SRC - alignaddr REALSRC, %g0, %g0 - - ! SRC - 8-byte aligned - ! 
DST - 64-byte aligned - prefetcha [SRC]%asi, #one_read - prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #one_read - prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #one_read - prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #one_read - ldda [SRC]%asi, %d32 -#if FIRST_PREFETCH > 4 - prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read -#endif - ldda [SRC + 0x08]%asi, %d34 -#if FIRST_PREFETCH > 5 - prefetcha [SRC + (5 * VIS_BLOCKSIZE)]%asi, #one_read -#endif - ldda [SRC + 0x10]%asi, %d36 -#if FIRST_PREFETCH > 6 - prefetcha [SRC + (6 * VIS_BLOCKSIZE)]%asi, #one_read -#endif - faligndata %d32, %d34, %d48 - ldda [SRC + 0x18]%asi, %d38 -#if FIRST_PREFETCH > 7 - prefetcha [SRC + (7 * VIS_BLOCKSIZE)]%asi, #one_read -#endif - faligndata %d34, %d36, %d50 - ldda [SRC + 0x20]%asi, %d40 - faligndata %d36, %d38, %d52 - ldda [SRC + 0x28]%asi, %d42 - faligndata %d38, %d40, %d54 - ldda [SRC + 0x30]%asi, %d44 - faligndata %d40, %d42, %d56 - ldda [SRC + 0x38]%asi, %d46 - faligndata %d42, %d44, %d58 - ldda [SRC + VIS_BLOCKSIZE]%asi, %d32 - sub CNT, VIS_BLOCKSIZE, CNT - add SRC, VIS_BLOCKSIZE, SRC - add REALSRC, VIS_BLOCKSIZE, REALSRC - ba,a,pt %ncc, 1f - nop - .align ICACHE_LINE_SIZE -1: - ldda [SRC + 0x08]%asi, %d34 - faligndata %d44, %d46, %d60 - ldda [SRC + 0x10]%asi, %d36 - faligndata %d46, %d32, %d62 - stda %d48, [DST]ASI_BLK_P - ldda [SRC + 0x18]%asi, %d38 - faligndata %d32, %d34, %d48 - ldda [SRC + 0x20]%asi, %d40 - faligndata %d34, %d36, %d50 - ldda [SRC + 0x28]%asi, %d42 - faligndata %d36, %d38, %d52 - ldda [SRC + 0x30]%asi, %d44 - faligndata %d38, %d40, %d54 - ldda [SRC + 0x38]%asi, %d46 - faligndata %d40, %d42, %d56 - sub CNT, VIS_BLOCKSIZE, CNT - ldda [SRC + VIS_BLOCKSIZE]%asi, %d32 - faligndata %d42, %d44, %d58 - prefetcha [SRC + ((FIRST_PREFETCH) * VIS_BLOCKSIZE) + 8]%asi, #one_read - add DST, VIS_BLOCKSIZE, DST - prefetcha [SRC + ((SECOND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read - add REALSRC, VIS_BLOCKSIZE, REALSRC - cmp CNT, VIS_BLOCKSIZE + 8 - bgu,pt %ncc, 1b - add SRC, 
VIS_BLOCKSIZE, SRC - - ! only if REALSRC & 0x7 is 0 - cmp CNT, VIS_BLOCKSIZE - bne %ncc, 3f - andcc REALSRC, 0x7, %g0 - bz,pt %ncc, 2f - nop -3: - faligndata %d44, %d46, %d60 - faligndata %d46, %d32, %d62 - stda %d48, [DST]ASI_BLK_P - add DST, VIS_BLOCKSIZE, DST - ba,pt %ncc, 3f - nop -2: - ldda [SRC + 0x08]%asi, %d34 - faligndata %d44, %d46, %d60 - ldda [SRC + 0x10]%asi, %d36 - faligndata %d46, %d32, %d62 - stda %d48, [DST]ASI_BLK_P - ldda [SRC + 0x18]%asi, %d38 - ldda [SRC + 0x20]%asi, %d40 - ldda [SRC + 0x28]%asi, %d42 - ldda [SRC + 0x30]%asi, %d44 - ldda [SRC + 0x38]%asi, %d46 - sub CNT, VIS_BLOCKSIZE, CNT - add DST, VIS_BLOCKSIZE, DST - add SRC, VIS_BLOCKSIZE, SRC - add REALSRC, VIS_BLOCKSIZE, REALSRC - stda %d32, [DST]ASI_BLK_P - add DST, VIS_BLOCKSIZE, DST - ba,a,pt %ncc, 4f - nop - -3: tst CNT - bz,a %ncc, 4f - nop - -5: lduba [REALSRC]ASI_USER, TMP - inc REALSRC - inc DST - deccc CNT - bgu %ncc, 5b - stb TMP, [DST - 1] -4: - -.copyin_exit: - membar #Sync - - ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr - wr %o2, 0, %gsr - - ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 - btst FPRS_FEF, %o3 - bz,pt %icc, 4f - nop - - BLD_FPQ3Q4_FROMSTACK(%o2) - - ba,pt %ncc, 1f - wr %o3, 0, %fprs ! restore fprs - -4: - FZEROQ3Q4 - wr %o3, 0, %fprs ! restore fprs - -1: - membar #Sync ! sync error barrier - andn %l6, FPUSED_FLAG, %l6 - stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - FP_ALLOWMIGRATE(5, 6) - ret - restore %g0, 0, %o0 -/* - * We got here because of a fault during copyin - * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). - */ -.copyin_err: - ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler - tst %o4 - bz,pt %ncc, 2f ! if not, return error - nop - ldn [%o4 + CP_COPYIN], %g2 ! if handler, invoke it with - jmp %g2 ! original arguments - restore %g0, 0, %g0 ! dispose of copy window -2: - ret - restore %g0, -1, %o0 ! 
return error value - - - SET_SIZE(copyin_more) - -#endif /* lint */ - -#ifdef lint - -/*ARGSUSED*/ -int -xcopyin(const void *uaddr, void *kaddr, size_t count) -{ return (0); } - -#else /* lint */ - - ENTRY(xcopyin) - - cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case - bleu,pt %ncc, .xcopyin_small ! go to larger cases - xor %o0, %o1, %o3 ! are src, dst alignable? - btst 7, %o3 ! - bz,pt %ncc, .xcopyin_8 ! check for longword alignment - nop - btst 1, %o3 ! - bz,pt %ncc, .xcopyin_2 ! check for half-word - nop - sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_1)], %o3 - tst %o3 - bz,pn %icc, .xcopyin_small ! if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .xcopyin_small ! go to small copy - nop - ba,pt %ncc, .xcopyin_more ! otherwise go to large copy - nop -.xcopyin_2: - btst 3, %o3 ! - bz,pt %ncc, .xcopyin_4 ! check for word alignment - nop - sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_2)], %o3 - tst %o3 - bz,pn %icc, .xcopyin_small ! if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .xcopyin_small ! go to small copy - nop - ba,pt %ncc, .xcopyin_more ! otherwise go to large copy - nop -.xcopyin_4: - ! already checked longword, must be word aligned - sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_4)], %o3 - tst %o3 - bz,pn %icc, .xcopyin_small ! if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .xcopyin_small ! go to small copy - nop - ba,pt %ncc, .xcopyin_more ! otherwise go to large copy - nop -.xcopyin_8: - sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_8)], %o3 - tst %o3 - bz,pn %icc, .xcopyin_small ! if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .xcopyin_small ! go to small copy - nop - ba,pt %ncc, .xcopyin_more ! otherwise go to large copy - nop - -.xcopyin_small: - sethi %hi(.sm_xcopyin_err), %o5 ! 
.sm_xcopyin_err is lofault value - or %o5, %lo(.sm_xcopyin_err), %o5 - ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofaul - membar #Sync ! sync error barrier - ba,pt %ncc, .sm_do_copyin ! common code - stn %o5, [THREAD_REG + T_LOFAULT] - -.xcopyin_more: - save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp - sethi %hi(.xcopyin_err), REAL_LOFAULT ! .xcopyin_err is lofault value - ba,pt %ncc, .do_copyin - or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT - -/* - * We got here because of fault during xcopyin - * Errno value is in ERRNO - */ -.xcopyin_err: - ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler - tst %o4 - bz,pt %ncc, 2f ! if not, return error - nop - ldn [%o4 + CP_XCOPYIN], %g2 ! if handler, invoke it with - jmp %g2 ! original arguments - restore %g0, 0, %g0 ! dispose of copy window -2: - ret - restore ERRNO, 0, %o0 ! return errno value - -.sm_xcopyin_err: - - membar #Sync - stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - mov SM_SAVE_SRC, %o0 - mov SM_SAVE_DST, %o1 - mov SM_SAVE_COUNT, %o2 - ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler - tst %o3 - bz,pt %ncc, 3f ! if not, return error - nop - ldn [%o3 + CP_XCOPYIN], %o5 ! if handler, invoke it with - jmp %o5 ! original arguments - nop -3: - retl - or %g1, 0, %o0 ! return errno value - - SET_SIZE(xcopyin) - -#endif /* lint */ - -#ifdef lint - -/*ARGSUSED*/ -int -xcopyin_little(const void *uaddr, void *kaddr, size_t count) -{ return (0); } - -#else /* lint */ - - ENTRY(xcopyin_little) - sethi %hi(.xcopyio_err), %o5 - or %o5, %lo(.xcopyio_err), %o5 - ldn [THREAD_REG + T_LOFAULT], %o4 - membar #Sync ! sync error barrier - stn %o5, [THREAD_REG + T_LOFAULT] - mov %o4, %o5 - - subcc %g0, %o2, %o3 - add %o0, %o2, %o0 - bz,pn %ncc, 2f ! check for zero bytes - sub %o2, 1, %o4 - add %o0, %o4, %o0 ! start w/last byte - add %o1, %o2, %o1 - lduba [%o0 + %o3]ASI_AIUSL, %o4 - -1: stb %o4, [%o1 + %o3] - inccc %o3 - sub %o0, 2, %o0 ! 
get next byte - bcc,a,pt %ncc, 1b - lduba [%o0 + %o3]ASI_AIUSL, %o4 - -2: - membar #Sync ! sync error barrier - stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - retl - mov %g0, %o0 ! return (0) - -.xcopyio_err: - membar #Sync ! sync error barrier - stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault - retl - mov %g1, %o0 - - SET_SIZE(xcopyin_little) - -#endif /* lint */ - - -/* - * Copy a block of storage - must not overlap (from + len <= to). - * No fault handler installed (to be called under on_fault()) - */ -#if defined(lint) - -/* ARGSUSED */ -void -copyin_noerr(const void *ufrom, void *kto, size_t count) -{} - -#else /* lint */ - ENTRY(copyin_noerr) - - cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case - bleu,pt %ncc, .copyin_ne_small ! go to larger cases - xor %o0, %o1, %o3 ! are src, dst alignable? - btst 7, %o3 ! - bz,pt %ncc, .copyin_ne_8 ! check for longword alignment - nop - btst 1, %o3 ! - bz,pt %ncc, .copyin_ne_2 ! check for half-word - nop - sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_1)], %o3 - tst %o3 - bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .copyin_ne_small ! go to small copy - nop - ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy - nop -.copyin_ne_2: - btst 3, %o3 ! - bz,pt %ncc, .copyin_ne_4 ! check for word alignment - nop - sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_2)], %o3 - tst %o3 - bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .copyin_ne_small ! go to small copy - nop - ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy - nop -.copyin_ne_4: - ! already checked longword, must be word aligned - sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_4)], %o3 - tst %o3 - bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy - cmp %o2, %o3 ! 
if length <= limit - bleu,pt %ncc, .copyin_ne_small ! go to small copy - nop - ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy - nop -.copyin_ne_8: - sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_8)], %o3 - tst %o3 - bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .copyin_ne_small ! go to small copy - nop - ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy - nop - -.copyin_ne_small: - ldn [THREAD_REG + T_LOFAULT], %o4 - tst %o4 - bz,pn %ncc, .sm_do_copyin - nop - sethi %hi(.sm_copyio_noerr), %o5 - or %o5, %lo(.sm_copyio_noerr), %o5 - membar #Sync ! sync error barrier - ba,pt %ncc, .sm_do_copyin - stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault - -.copyin_noerr_more: - save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp - sethi %hi(.copyio_noerr), REAL_LOFAULT - ba,pt %ncc, .do_copyin - or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT - -.copyio_noerr: - jmp %l6 - restore %g0,0,%g0 - -.sm_copyio_noerr: - membar #Sync - stn %o4, [THREAD_REG + T_LOFAULT] ! restore t_lofault - jmp %o4 - nop - - SET_SIZE(copyin_noerr) -#endif /* lint */ - -/* - * Copy a block of storage - must not overlap (from + len <= to). - * No fault handler installed (to be called under on_fault()) - */ - -#if defined(lint) - -/* ARGSUSED */ -void -copyout_noerr(const void *kfrom, void *uto, size_t count) -{} - -#else /* lint */ - ENTRY(copyout_noerr) - - cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case - bleu,pt %ncc, .copyout_ne_small ! go to larger cases - xor %o0, %o1, %o3 ! are src, dst alignable? - btst 7, %o3 ! - bz,pt %ncc, .copyout_ne_8 ! check for longword alignment - nop - btst 1, %o3 ! - bz,pt %ncc, .copyout_ne_2 ! check for half-word - nop - sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_1)], %o3 - tst %o3 - bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy - cmp %o2, %o3 ! 
if length <= limit - bleu,pt %ncc, .copyout_ne_small ! go to small copy - nop - ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy - nop -.copyout_ne_2: - btst 3, %o3 ! - bz,pt %ncc, .copyout_ne_4 ! check for word alignment - nop - sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_2)], %o3 - tst %o3 - bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .copyout_ne_small ! go to small copy - nop - ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy - nop -.copyout_ne_4: - ! already checked longword, must be word aligned - sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_4)], %o3 - tst %o3 - bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .copyout_ne_small ! go to small copy - nop - ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy - nop -.copyout_ne_8: - sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit - ld [%o3 + %lo(hw_copy_limit_8)], %o3 - tst %o3 - bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy - cmp %o2, %o3 ! if length <= limit - bleu,pt %ncc, .copyout_ne_small ! go to small copy - nop - ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy - nop - -.copyout_ne_small: - ldn [THREAD_REG + T_LOFAULT], %o4 - tst %o4 - bz,pn %ncc, .sm_do_copyout - nop - sethi %hi(.sm_copyio_noerr), %o5 - or %o5, %lo(.sm_copyio_noerr), %o5 - membar #Sync ! sync error barrier - ba,pt %ncc, .sm_do_copyout - stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault - -.copyout_noerr_more: - save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp - sethi %hi(.copyio_noerr), REAL_LOFAULT - ba,pt %ncc, .do_copyout - or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT - - SET_SIZE(copyout_noerr) -#endif /* lint */ - - -/* - * hwblkclr - clears block-aligned, block-multiple-sized regions that are - * longer than 256 bytes in length using spitfire's block stores. 
If - * the criteria for using this routine are not met then it calls bzero - * and returns 1. Otherwise 0 is returned indicating success. - * Caller is responsible for ensuring use_hw_bzero is true and that - * kpreempt_disable() has been called. - */ -#ifdef lint -/*ARGSUSED*/ -int -hwblkclr(void *addr, size_t len) -{ - return(0); -} -#else /* lint */ - ! %i0 - start address - ! %i1 - length of region (multiple of 64) - ! %l0 - saved fprs - ! %l1 - pointer to saved %d32 block - ! %l2 - saved curthread->t_lwp - - - ENTRY(hwblkclr) - ! get another window w/space for one aligned block of saved fpregs - save %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp - -#ifdef ROCK_CR_6654578 - ! Address aligned to 128 byte - andcc %i0, ST_CACHE_ALIGN, %g0 - bnz,pn %ncc, .normal_hwblkclr - nop - ! multiple of 8k len, call page_hwblkclr - set PAGE_MASK, %i3 - andcc %i1, %i3, %g0 - bnz,pn %ncc, .normal_hwblkclr - nop - mov %i0, %o0 - call page_hwblkclr - mov %i1, %o1 - ret - restore %g0, 0, %o0 ! I$ sync not required - -.normal_hwblkclr: -#endif - ! Must be block-aligned - andcc %i0, (VIS_BLOCKSIZE-1), %g0 - bnz,pn %ncc, 1f - nop - - ! ... and must be 256 bytes or more - cmp %i1, 256 - blu,pn %ncc, 1f - nop - - ! ... and length must be a multiple of VIS_BLOCKSIZE - andcc %i1, (VIS_BLOCKSIZE-1), %g0 - bz,pn %ncc, 2f - nop - -1: ! punt, call bzero but notify the caller that bzero was used - mov %i0, %o0 - call bzero - mov %i1, %o1 - ! call rock_sync_icache - mov %i0, %o0 - call rock_sync_icache - mov %i0, %o0 - ret - restore %g0, 0, %o0 ! did not use block operations - -2: mov %g0, %l3 ! clear flag to say fp regs not saved - rd %fprs, %l0 ! check for unused fp - - ! FPU enabled ? If not, enable it. - btst FPRS_FEF, %l0 - bz,a,pt %icc, 1f - wr %g0, FPRS_FEF, %fprs - - ! save the FP registers even if DU is not set. - - membar #Sync - add %fp, STACK_BIAS - 65, %l1 - and %l1, -VIS_BLOCKSIZE, %l1 - stda %d32, [%l1]ASI_BLK_P - ! Set a flag saying fp regs are saved. - mov 1, %l3 - - ! 
Need to wait only here for the above save to be completed - membar #StoreStore|#StoreLoad|#LoadStore - -1: wr %g0, ASI_BLK_P, %asi - - ! Clear block - movxtod %g0, %d32 - movxtod %g0, %d34 - fsrc1 %d32, %d36 - fsrc1 %d32, %d38 - fsrc1 %d32, %d40 - fsrc1 %d32, %d42 - fsrc1 %d32, %d44 - fsrc1 %d32, %d46 - - mov 256, %i3 - ba,pt %ncc, .pz_doblock - nop - -.pz_blkstart: - ! stda %d32, [%i0 + 192]%asi ! in dly slot of branch that got us here -#ifdef ROCK_CR_6654578 - prefetcha [%i0 + VIS_COPY_THRESHOLD + 128]%asi, #n_writes -#endif - stda %d32, [%i0 + 128]%asi -#ifdef ROCK_CR_6654578 - prefetcha [%i0 + VIS_COPY_THRESHOLD + 64]%asi, #n_writes -#endif - stda %d32, [%i0 + 64]%asi -#ifdef ROCK_CR_6654578 - prefetcha [%i0 + VIS_COPY_THRESHOLD + 0]%asi, #n_writes -#endif - stda %d32, [%i0]%asi -.pz_zinst: - add %i0, %i3, %i0 - sub %i1, %i3, %i1 -.pz_doblock: -#ifdef ROCK_CR_6654578 - prefetcha [%i0 + VIS_COPY_THRESHOLD + 192]%asi, #n_writes -#endif - cmp %i1, 256 - bgeu,a %ncc, .pz_blkstart - stda %d32, [%i0 + 192]%asi - - cmp %i1, 64 - blu %ncc, .pz_finish - - andn %i1, (64-1), %i3 - srl %i3, 4, %i2 ! using blocks, 1 instr / 16 words - set .pz_zinst, %i4 - sub %i4, %i2, %i4 - jmp %i4 - nop - -.pz_finish: - brz,a %l3, .pz_finished - wr %l0, 0, %fprs ! restore fprs - - ! restore fpregs from stack - ldda [%l1]ASI_BLK_P, %d32 - wr %l0, 0, %fprs ! restore fprs - -.pz_finished: - membar #Sync - ret - restore %g0, 0, %o0 ! return (bzero or not) - - SET_SIZE(hwblkclr) -#endif /* lint */ - -#ifdef lint -/*ARGSUSED*/ -void -hw_pa_bcopy32(uint64_t src, uint64_t dst) -{} -#else /*!lint */ - /* - * Copy 32 bytes of data from src (%o0) to dst (%o1) - * using physical addresses. 
- */ - ENTRY_NP(hw_pa_bcopy32) - rdpr %pstate, %g1 - andn %g1, PSTATE_IE, %g2 - wrpr %g0, %g2, %pstate - - rdpr %pstate, %g0 - ldxa [%o0]ASI_MEM, %o2 - add %o0, 8, %o0 - ldxa [%o0]ASI_MEM, %o3 - add %o0, 8, %o0 - ldxa [%o0]ASI_MEM, %o4 - add %o0, 8, %o0 - ldxa [%o0]ASI_MEM, %o5 - stxa %o2, [%o1]ASI_MEM - add %o1, 8, %o1 - stxa %o3, [%o1]ASI_MEM - add %o1, 8, %o1 - stxa %o4, [%o1]ASI_MEM - add %o1, 8, %o1 - stxa %o5, [%o1]ASI_MEM - - retl - wrpr %g0, %g1, %pstate - - SET_SIZE(hw_pa_bcopy32) - -#endif /* lint */ - - -/* - * Zero a block of storage. - * - * uzero is used by the kernel to zero a block in user address space. - */ - - -#if defined(lint) - -/* ARGSUSED */ -int -kzero(void *addr, size_t count) -{ return(0); } - -/* ARGSUSED */ -void -uzero(void *addr, size_t count) -{} - -#else /* lint */ - - ENTRY(uzero) - ! - ! Set a new lo_fault handler only if we came in with one - ! already specified. - ! - wr %g0, ASI_USER, %asi - ldn [THREAD_REG + T_LOFAULT], %o5 - tst %o5 - bz,pt %ncc, .do_zero - sethi %hi(.zeroerr), %o2 - or %o2, %lo(.zeroerr), %o2 - membar #Sync - ba,pt %ncc, .do_zero - stn %o2, [THREAD_REG + T_LOFAULT] - - ENTRY(kzero) - ! - ! Always set a lo_fault handler - ! - wr %g0, ASI_P, %asi - ldn [THREAD_REG + T_LOFAULT], %o5 - sethi %hi(.zeroerr), %o2 - or %o5, LOFAULT_SET, %o5 - or %o2, %lo(.zeroerr), %o2 - membar #Sync - ba,pt %ncc, .do_zero - stn %o2, [THREAD_REG + T_LOFAULT] - -/* - * We got here because of a fault during kzero or if - * uzero or bzero was called with t_lofault non-zero. - * Otherwise we've already run screaming from the room. - * Errno value is in %g1. Note that we're here iff - * we did set t_lofault. - */ -.zeroerr: - ! - ! Undo asi register setting. Just set it to be the - ! kernel default without checking. - ! - wr %g0, ASI_P, %asi - - ! - ! We did set t_lofault. It may well have been zero coming in. - ! -1: - tst %o5 - membar #Sync - bne,pn %ncc, 3f - andncc %o5, LOFAULT_SET, %o5 -2: - ! - ! Old handler was zero. 
Just return the error. - ! - retl ! return - mov %g1, %o0 ! error code from %g1 -3: - ! - ! We're here because %o5 was non-zero. It was non-zero - ! because either LOFAULT_SET was present, a previous fault - ! handler was present or both. In all cases we need to reset - ! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET - ! before we either simply return the error or we invoke the - ! previously specified handler. - ! - be %ncc, 2b - stn %o5, [THREAD_REG + T_LOFAULT] - jmp %o5 ! goto real handler - nop - SET_SIZE(kzero) - SET_SIZE(uzero) - -#endif /* lint */ - -/* - * Zero a block of storage. - */ - -#if defined(lint) - -/* ARGSUSED */ -void -bzero(void *addr, size_t count) -{} - -#else /* lint */ - - ENTRY(bzero) - - wr %g0, ASI_P, %asi - ldn [THREAD_REG + T_LOFAULT], %o5 ! save old vector - tst %o5 - bz,pt %ncc, .do_zero - sethi %hi(.zeroerr), %o2 - or %o2, %lo(.zeroerr), %o2 - membar #Sync ! sync error barrier - stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector - -.do_zero: - /* - * If 0 bytes to xfer return - */ - brnz %o1, continue_bzero - nop - ba .bzero_exit - nop -continue_bzero: - prefetch [%o0],2 - cmp %o1, 8 - bge,pt %ncc, xfer_8_or_more - nop - -.byteclr: - deccc %o1 ! byte clearing loop - stba %g0, [%o0]%asi - bgu,pt %ncc, .byteclr - inc %o0 - ba .bzero_exit - nop - -xfer_8_or_more: - andcc %o0, 7, %o3 ! is add aligned on a 8 byte bound - brz,pt %o3, blkchk - sub %o3, 8, %o3 ! -(bytes till double aligned) - add %o1, %o3, %o1 ! update o1 with new count -1: - stba %g0, [%o0]%asi - inccc %o3 - bl,pt %ncc, 1b - inc %o0 - - ! Now addr is double word aligned -blkchk: - cmp %o1, 767 ! if large count use Block ld/st - bg,pt %ncc,blkwr - nop - and %o1, 24, %o3 ! o3 is {0, 8, 16, 24} - brz %o3, skip_dw_loop - nop -1: subcc %o3, 8, %o3 ! double-word loop - stxa %g0, [%o0]%asi - bgu,pt %ncc, 1b - add %o0, 8, %o0 -skip_dw_loop: - andncc %o1, 31, %o4 ! 
o4 has 32 byte aligned count - brz,pn %o4, 3f - nop - ba loop_32byte - nop - - .align ICACHE_LINE_SIZE - -loop_32byte: - subcc %o4, 32, %o4 ! main loop, 32 bytes per iteration - stxa %g0, [%o0]%asi - stxa %g0, [%o0 + 8]%asi - stxa %g0, [%o0 + 16]%asi - stxa %g0, [%o0 + 24]%asi - bne,pt %ncc, loop_32byte - add %o0, 32, %o0 -3: - and %o1, 7, %o1 ! o1 has the remaining bytes (<8) - brnz %o1, .byteclr - nop - ba .bzero_exit - nop -blkwr: - sub %o0,1,%o3 - andn %o3,0x7f,%o4 - add %o4,128,%o4 - prefetch [%o4],2 !prefetch next 128b - prefetch [%o4+64],2 - prefetch [%o4+(2*64)],2 - prefetch [%o4+(3*64)],2 - - andcc %o0,0x7f,%o3 !o3=0 , means it is already 128 align - brz,pn %o3,aligned_on_128_bzero - sub %o3,128,%o3 - - add %o1,%o3,%o1 -align_to_128_bzero: - stxa %g0,[%o0]%asi - addcc %o3,8,%o3 - bl,pt %ncc,align_to_128_bzero - add %o0,8,%o0 - - - -aligned_on_128_bzero: - ! if the addr is 512 byte aligned and bytes to zero - ! are greater than or equal to 4096 do a stingray_optimized_bzero - andcc %o0,0x1ff,%o3 ! Is addr 512 byte aligned ? - brnz,pn %o3, 4f - mov %o1,%g5 - set 4096, %g4 - subcc %o1, %g4, %g0 - bge,pn %ncc, stingray_optimized_bzero - nop -4: - ! addr(dest. buffer) is not aligned to 512 byte - ! if the number of bytes to zero are less than 4096 after - ! aligning the addr to 512 byte then do interleave128_bzero. - - sub %o0,8,%o4 - andn %o4,0x1ff,%o3 - add %o3,0x200,%o3 !o3 = addr aligned to 512 byte. - sub %o3,%o0,%o3 !o3 = number of bytes to zero to align addr to 512 - sub %o1,%o3,%g5 !g5 = bytes to zero from 512 byte aligned addr - set 4096, %g4 - subcc %g5, %g4, %g0 - bge,pn %ncc,6f - nop - ! clear %g5 to indicate that there is no need to do - ! stingray_optimized_bzero - mov %g0, %g5 - add %o0, %o1, %o4 - ba interleave128_bzero - nop -6: - ! %g5 contains the number of bytes to zero after 512 byte alignment - ! We zero the bytes in dest. buffer until it is 512 byte aligned - ! and call stingray_optimized_bzero - ! 
if the nuber of bytes to zero(until 512 alignment) is less than 256 - ! we call bzero_word, else we call interleave128_bzero - mov %o3, %o1 - subcc %o3,256,%g0 - bl,pn %ncc,bzero_word - add %o0,%o1,%o4 !cal the last byte to write %o4 - ba interleave128_bzero - nop - - .align 64 -interleave128_bzero: - ! %o0 has the dest. buffer addr - ! %o1 has the number of bytes to zero - ! %o4 has the addr of the dest. buffer at or beyond which no write - ! is to be done. - ! %g5 has the number of bytes to zero using stingray_optimized_bzero - - add %o0, 256, %o3 - prefetch [%o3], 2 !1st 64 byte line of next 256 byte block - add %o0, 384, %o3 - prefetch [%o3], 2 !3rd 64 byte line of next 256 byte block - add %o0, 320, %o3 - prefetch [%o3], 2 !2nd 64 byte line of next 256 byte block - add %o0, 448, %o3 - prefetch [%o3], 2 !4th 64 byte line of next 256 byte block - mov %o0, %o3 - stxa %g0,[%o3]%asi !1st 64 byte line - add %o0,128,%o3 - stxa %g0,[%o3]%asi !3rd 64 byte line - add %o0,8,%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(2 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128 ,%o3 - stxa %g0,[%o3]%asi - add %o0,(3 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(4 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(5 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(6 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(7 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(8 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(9 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(10 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(11 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(12 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(13 * 8),%o3 - stxa %g0,[%o3]%asi - 
add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(14 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(15 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - ! check if the next 256 byte copy will not exceed the number of - ! bytes remaining to be copied. - ! %g4 points to the dest buffer after copying 256 bytes more. - ! %o4 points to dest. buffer at or beyond which no writes should be done. - add %o0,512,%g4 - subcc %o4,%g4,%g0 - bge,pt %ncc,interleave128_bzero - add %o0,256,%o0 - -bzero_word: - and %o1,255,%o3 - and %o3,7,%o1 - - ! Set the remaining doubles - subcc %o3, 8, %o3 ! Can we store any doubles? - bl,pn %ncc, 6f - and %o1, 7, %o1 ! calc bytes left after doubles - -5: - stxa %g0, [%o0]%asi - subcc %o3, 8, %o3 - bge,pt %ncc, 5b - add %o0, 8, %o0 -6: - ! Set the remaining bytes - brz %o1, can_we_do_stingray_optimized_bzero - -7: - deccc %o1 ! byte clearing loop - stba %g0, [%o0]%asi - bgu,pt %ncc, 7b - inc %o0 -can_we_do_stingray_optimized_bzero: - mov %g5, %o1 - brnz,pn %o1, stingray_optimized_bzero - nop - - ba .bzero_exit - nop - -stingray_optimized_bzero: - save %sp, -SA(MINFRAME), %sp - mov %i0, %o0 - mov %i1, %o1 - mov %i2, %o2 - mov %i3, %o3 - mov %i5, %o5 -init: - set 4096,%o2 - - prefetch [%o0+0],2 - prefetch [%o0+(64*1)],2 - prefetch [%o0+(64*2)],2 - prefetch [%o0+(64*3)],2 - prefetch [%o0+(64*4)],2 - prefetch [%o0+(64*5)],2 - prefetch [%o0+(64*6)],2 - prefetch [%o0+(64*7)],2 - prefetch [%o0+(64*8)],2 - prefetch [%o0+(64*9)],2 - prefetch [%o0+(64*10)],2 - prefetch [%o0+(64*11)],2 - prefetch [%o0+(64*12)],2 - prefetch [%o0+(64*13)],2 - prefetch [%o0+(64*14)],2 - prefetch [%o0+(64*15)],2 - ba stingray_optimized_4k_zero_loop - add %o0,%g5,%g5 - ! Local register usage: - ! prefetching into L1 cache. - ! %l3 dest. buffer at start of inner loop. - ! %l5 iteration counter to make buddy loop execute 2 times. - ! %l6 iteration counter to make inner loop execute 4 times. - ! 
%l7 address at far ahead of current dest. buffer for prefetching - ! into L2 cache. - - .align 64 -stingray_optimized_4k_zero_loop: - set 2,%l5 - add %o0, 0, %l3 -bzero_buddyloop: - set PF_FAR, %g4 - add %o0, %g4, %l7 - - ! Prefetch ahead by 2 pages to get TLB entry in advance. - set 2*PF_FAR, %g4 - add %o0, %g4, %g4 - prefetch [%g4+%g0],2 - - set 4,%l6 - set 0, %g4 - - ! Each iteration of the inner loop below writes 8 sequential lines. - ! This loop is iterated 4 times, to move a total of 32 lines, all of - ! which have the same value of PA[9], so we increment the base - ! address by 1024 bytes in each iteration, which varies PA[10]. -bzero_innerloop: - add %o0, PF_FAR, %o3 - prefetch [%o3],2 - add %o3, 64, %o3 - prefetch [%o3],2 - add %o3, 64, %o3 - prefetch [%o3],2 - add %o3, 64, %o3 - prefetch [%o3],2 - add %o3, 64, %o3 - prefetch [%o3],2 - add %o3, 64, %o3 - prefetch [%o3],2 - add %o3, 64, %o3 - prefetch [%o3],2 - add %o3, 64, %o3 - prefetch [%o3],2 - - mov %o0, %o3 - stxa %g0,[%o3]%asi !1st 64 byte line - add %o0,128,%o3 - stxa %g0,[%o3]%asi !3rd 64 byte line - add %o0,8,%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(2 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128 ,%o3 - stxa %g0,[%o3]%asi - add %o0,(3 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(4 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(5 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(6 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(7 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(8 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(9 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(10 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(11 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(12 * 
8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(13 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(14 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(15 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - - add %o0,256,%o0 - - mov %o0, %o3 - stxa %g0,[%o3]%asi !1st 64 byte line - add %o0,128,%o3 - stxa %g0,[%o3]%asi !3rd 64 byte line - add %o0,8,%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(2 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128 ,%o3 - stxa %g0,[%o3]%asi - add %o0,(3 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(4 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(5 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(6 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(7 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(8 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(9 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(10 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(11 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(12 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(13 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(14 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - add %o0,(15 * 8),%o3 - stxa %g0,[%o3]%asi - add %o3,128,%o3 - stxa %g0,[%o3]%asi - - subcc %l6,1,%l6 ! Decrement the inner loop counter. - - ! Now increment by 256 + 512 so we don't toggle PA[9] - add %o0, 768, %o0 - - bg,pt %ncc,bzero_innerloop - nop - ! END OF INNER LOOP - - subcc %l5,1,%l5 - add %l3, 512, %o0 ! increment %o0 to first buddy line of dest. 
- bg,pt %ncc, bzero_buddyloop - nop - add %o0, 3584, %o0 ! Advance both base addresses by 4k - add %o0,%o2,%i5 - subcc %g5,%i5,%g0 - bge,pt %ncc,stingray_optimized_4k_zero_loop - nop - - ! stingray_optimized_bzero_ends_here - - mov %o0, %i0 - mov %o1, %i1 - mov %o2, %i2 - mov %o3, %i3 - mov %o5, %i5 - restore - sub %g5,%o0,%o1 !how many byte left - brz,pn %o1,.bzero_exit - mov %g0,%g5 - add %o0,%o1,%o4 !cal the last byte to write %o4 - subcc %o1,256,%g0 - bge,pt %ncc,interleave128_bzero - mov %g0,%g5 - - ba bzero_word - nop - -.bzero_exit: - ! - ! We're just concerned with whether t_lofault was set - ! when we came in. We end up here from either kzero() - ! or bzero(). kzero() *always* sets a lofault handler. - ! It ors LOFAULT_SET into %o5 to indicate it has done - ! this even if the value of %o5 is otherwise zero. - ! bzero() sets a lofault handler *only* if one was - ! previously set. Accordingly we need to examine - ! %o5 and if it is non-zero be sure to clear LOFAULT_SET - ! before resetting the error handler. - ! - tst %o5 - bz %ncc, 1f - andn %o5, LOFAULT_SET, %o5 - membar #Sync ! sync error barrier - stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault -1: - retl - clr %o0 ! return (0) - - SET_SIZE(bzero) -#endif /* lint */ - -#ifdef ROCK_CR_6654578 -/* This code tries to maximize bandwidth by being clever about accessing - * the two cache lines that are BUDDY PAIRS in the L3 cache. When line 0 - * of a pair is accessed, it will take hundreds of cycles to get the line - * from memory, which brings in a 128-byte line to L3. Until the line is - * installed in L3, any other access to that line (such as buddy line 1) - * is blocked. For best throughput, we access many lines that are the first - * of their buddy pairs, and only after many such accesses have been made, - * we access the sequence of second buddy pair lines. Hopefully the second - * set of accesses comes after the L3 lines are installed, so the accesses - * hitin L3 without being delayed. 
This should yield better throughput. - * To keep this code simple, we assume the addresses given are aligned at - * least on a 128 byte boundary, and the length is assumed to be a multiple - * of 8k bytes. - */ - -#ifdef lint -/*ARGSUSED*/ -int -page_hwblkclr(void *addr, size_t len) -{ - return(0); -} -#else /* lint */ - ENTRY(page_hwblkclr) - save %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp - - ! %i0 address - ! %i1 len - - rd %fprs, %l0 - mov %g0, %l2 ! clear flag to say fp regs not saved - - ! FPU enabled ? If not, enable it. - btst FPRS_FEF, %l0 - bz,a,pt %icc, 1f - wr %g0, FPRS_FEF, %fprs - - ! save in-use fpregs on stack - - add %fp, STACK_BIAS - 65, %l1 ! get stack frame for fp regs - and %l1, -VIS_BLOCKSIZE, %l1 ! block align frame - stda %d32, [%l1]ASI_BLK_P ! %l1 = addr of saved fp regs - - ! Set a flag saying fp regs are saved. - mov 1, %l2 - - ! enable fp - -1: membar #StoreStore|#StoreLoad|#LoadStore - - movxtod %g0, %d32 - movxtod %g0, %d34 - movxtod %g0, %d36 - movxtod %g0, %d38 - movxtod %g0, %d40 - movxtod %g0, %d42 - movxtod %g0, %d44 - movxtod %g0, %d46 - - ba myloop2 - srl %i1,12,%i1 -.align 64 -myloop2: - mov 2,%l5 - mov %i0, %l3 -buddyloop: - set 4096, %l4 - add %i0, %l4, %l4 - prefetcha [%l4]ASI_BLK_P, #n_writes - mov 32,%l6 -innerloop: - - subcc %l6,1,%l6 - stda %d32,[%i0]ASI_BLK_P - bg,pt %icc,innerloop - add %i0, 128, %i0 - - subcc %l5,1,%l5 - add %l3, 64, %i0 - bg,pt %icc,buddyloop - nop - subcc %i1,1,%i1 - add %i0, 4032, %i0 - bg,pt %icc,myloop2 - nop - - brz,a %l2, 2f - wr %l0, 0, %fprs ! restore fprs - - ! restore fpregs from stack - ldda [%l1]ASI_BLK_P, %d32 - - wr %l0, 0, %fprs ! 
restore fprs -2: - membar #Sync - - ret - restore %g0, 0, %o0 - - SET_SIZE(page_hwblkclr) -#endif /* lint */ -#endif /* ROCK_CR_6654578 */ - -#if defined(lint) - -int use_hw_bcopy = 1; -int use_hw_bzero = 1; -uint_t hw_copy_limit_1 = 0x100; -uint_t hw_copy_limit_2 = 0x200; -uint_t hw_copy_limit_4 = 0x400; -uint_t hw_copy_limit_8 = 0x400; - -#else /* !lint */ - - DGDEF(use_hw_bcopy) - .word 1 - DGDEF(use_hw_bzero) - .word 1 - DGDEF(hw_copy_limit_1) - .word 0x100 - DGDEF(hw_copy_limit_2) - .word 0x200 - DGDEF(hw_copy_limit_4) - .word 0x400 - DGDEF(hw_copy_limit_8) - .word 0x400 - - - .align 64 - .section ".text" -#endif /* !lint */
--- a/usr/src/uts/sun4v/io/px/px_lib4v.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/io/px/px_lib4v.c Thu Aug 06 17:39:39 2009 -0700 @@ -41,8 +41,6 @@ #include <sys/hotplug/pci/pcihp.h> #include "px_lib4v.h" #include "px_err.h" -#include <vm/vm_dep.h> -#include <vm/hat_sfmmu.h> /* mask for the ranges property in calculating the real PFN range */ uint_t px_ranges_phi_mask = ((1 << 28) -1); @@ -547,9 +545,6 @@ else sync_dir = HVIO_DMA_SYNC_DIR_TO_DEV; - if (force_sync_icache_after_dma == 0 && !icache_is_coherent) - sync_dir |= HVIO_DMA_SYNC_DIR_NO_ICACHE_FLUSH; - off += mp->dmai_offset; pg_off = off & MMU_PAGEOFFSET; @@ -560,27 +555,12 @@ end = MMU_BTOPR(off + len - 1); for (idx = MMU_BTOP(off); idx < end; idx++, len -= bytes_synced, pg_off = 0) { - size_t bytes_to_sync = MIN(len, MMU_PAGESIZE - pg_off); - - while (hvio_dma_sync(hdl, - MMU_PTOB(PX_GET_MP_PFN(mp, idx)) + pg_off, - bytes_to_sync, sync_dir, &bytes_synced) != H_EOK) { - - if (!(sync_dir & HVIO_DMA_SYNC_DIR_NO_ICACHE_FLUSH)) { - bytes_synced = 0; - break; - } + size_t bytes_to_sync = bytes_to_sync = + MIN(len, MMU_PAGESIZE - pg_off); - /* - * Some versions of firmware do not support - * this sync_dir flag. If the call fails clear - * the flag and retry the call. Also, set the - * global so that we dont set the sync_dir - * flag again. - */ - sync_dir &= ~HVIO_DMA_SYNC_DIR_NO_ICACHE_FLUSH; - force_sync_icache_after_dma = 1; - } + if (hvio_dma_sync(hdl, MMU_PTOB(PX_GET_MP_PFN(mp, idx)) + + pg_off, bytes_to_sync, sync_dir, &bytes_synced) != H_EOK) + break; DBG(DBG_LIB_DMA, dip, "px_lib_dma_sync: Called hvio_dma_sync " "ra = %p bytes to sync = %x bytes synced %x\n",
--- a/usr/src/uts/sun4v/io/px/px_lib4v.h Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/io/px/px_lib4v.h Thu Aug 06 17:39:39 2009 -0700 @@ -97,8 +97,7 @@ #define PX_VPCI_MINOR_VER_0 0x0ull #define PX_VPCI_MINOR_VER_1 0x1ull -#define PX_VPCI_MINOR_VER_2 0x2ull -#define PX_VPCI_MINOR_VER PX_VPCI_MINOR_VER_2 +#define PX_VPCI_MINOR_VER PX_VPCI_MINOR_VER_1 extern uint64_t hvio_config_get(devhandle_t dev_hdl, pci_device_t bdf, pci_config_offset_t off, pci_config_size_t size, pci_cfg_data_t *data_p);
--- a/usr/src/uts/sun4v/ml/hcall.s Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/ml/hcall.s Thu Aug 06 17:39:39 2009 -0700 @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -321,20 +321,10 @@ /*ARGSUSED*/ uint64_t -hv_mem_iflush(uint64_t real_addr, uint64_t length, uint64_t *flushed_len) -{ return (0); } - -/*ARGSUSED*/ -uint64_t -hv_mem_iflush_all() -{ return (0); } - -/*ARGSUSED*/ -uint64_t hv_tm_enable(uint64_t enable) { return (0); } -/*ARGSUSED*/ +/*ARGSUSED*/ uint64_t hv_mach_set_watchdog(uint64_t timeout, uint64_t *time_remaining) { return (0); } @@ -742,34 +732,7 @@ SET_SIZE(hv_mem_sync) /* - * HV_MEM_IFLUSH - * arg0 memory real address - * arg1 flush length - * ret0 status - * ret1 flushed length - * - */ - ENTRY(hv_mem_iflush) - mov %o2, %o4 - mov HV_MEM_IFLUSH, %o5 - ta FAST_TRAP - retl - stx %o1, [%o4] - SET_SIZE(hv_mem_iflush) - - /* - * HV_MEM_IFLUSH_ALL - * ret0 status - */ - ENTRY(hv_mem_iflush_all) - mov HV_MEM_IFLUSH_ALL, %o5 - ta FAST_TRAP - retl - nop - SET_SIZE(hv_mem_iflush_all) - - /* - * uint64_t hv_rk_tm_enable(uint64_t enable) + * uint64_t hv_tm_enable(uint64_t enable) */ ENTRY(hv_tm_enable) mov HV_TM_ENABLE, %o5
--- a/usr/src/uts/sun4v/ml/mach_interrupt.s Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/ml/mach_interrupt.s Thu Aug 06 17:39:39 2009 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -41,20 +41,7 @@ #include <sys/error.h> #include <sys/mmu.h> #include <vm/hat_sfmmu.h> - #define INTR_REPORT_SIZE 64 -#define ERRH_ASI_SHIFT 56 /* bits[63:56]; see errh_er_t */ -#define NRE_ASI 0x00000001 /* ASI observed in attr field */ -#define NRE_CTX 0x00000002 /* ASI equals ASI_MMU_CTX */ -#define CRP_OBSERVED (NRE_ASI | NRE_CTX) - -#define OR_MCPU_NRE_ERROR(reg1,reg2,val) \ - add reg1, CPU_MCPU, reg2; \ - add reg2, MCPU_NRE_ERROR, reg2; \ - ldxa [reg2]ASI_MEM, reg1; \ - or reg1, val, reg1; \ - stxa reg1, [reg2]ASI_MEM - #ifdef TRAPTRACE #include <sys/traptrace.h> @@ -533,10 +520,6 @@ CPU_PADDR(%g1, %g4) ! %g1 = cpu struct paddr - add %g1, CPU_MCPU, %g4 - add %g4, MCPU_NRE_ERROR, %g4 ! &CPU->cpu_m.cpu_nre_error - stxa %g0, [%g4]ASI_MEM ! clear cpu_nre_error - 2: set CPU_NRQ_BASE_OFF, %g4 ldxa [%g1 + %g4]ASI_MEM, %g4 ! %g4 = queue base PA add %g6, %g4, %g4 ! %g4 = PA of ER in Q @@ -548,7 +531,7 @@ bne,pn %xcc, 1f ! first 8 byte is not 0 nop - /* BEGIN: move 64 bytes from queue to buf */ + /* Now we can move 64 bytes from queue to buf */ set 0, %g5 ldxa [%g4 + %g5]ASI_MEM, %g1 stxa %g1, [%g7 + %g5]ASI_MEM ! byte 0 - 7 @@ -558,14 +541,7 @@ add %g5, 8, %g5 ldxa [%g4 + %g5]ASI_MEM, %g1 stxa %g1, [%g7 + %g5]ASI_MEM ! byte 16 - 23 - /* Check for sun4v ASI */ - and %g1, ERRH_ATTR_ASI, %g1 ! isolate ASI bit - cmp %g1, ERRH_ATTR_ASI - bne,pt %xcc, 3f - nop - CPU_PADDR(%g1, %g5) - OR_MCPU_NRE_ERROR(%g1, %g5, NRE_ASI) ! cpu_nre_error |= NRE_ASI -3: set 24, %g5 + add %g5, 8, %g5 ldxa [%g4 + %g5]ASI_MEM, %g1 stxa %g1, [%g7 + %g5]ASI_MEM ! 
byte 24 - 31 add %g5, 8, %g5 @@ -574,20 +550,12 @@ add %g5, 8, %g5 ldxa [%g4 + %g5]ASI_MEM, %g1 stxa %g1, [%g7 + %g5]ASI_MEM ! byte 40 - 47 - /* Check for ASI==ASI_MMU_CTX */ - srlx %g1, ERRH_ASI_SHIFT, %g1 ! isolate the ASI field - cmp %g1, ASI_MMU_CTX ! ASI=0x21 for CRP - bne,pt %xcc, 4f - nop - CPU_PADDR(%g1, %g5) - OR_MCPU_NRE_ERROR(%g1, %g5, NRE_CTX) ! cpu_nre_error |= NRE_CTX -4: set 48, %g5 + add %g5, 8, %g5 ldxa [%g4 + %g5]ASI_MEM, %g1 stxa %g1, [%g7 + %g5]ASI_MEM ! byte 48 - 55 add %g5, 8, %g5 ldxa [%g4 + %g5]ASI_MEM, %g1 stxa %g1, [%g7 + %g5]ASI_MEM ! byte 56 - 63 - /* END: move 64 bytes from queue to buf */ set CPU_NRQ_SIZE, %g5 ! %g5 = queue size sub %g5, 1, %g5 ! %g5 = queu size mask @@ -608,36 +576,6 @@ membar #Sync /* - * For CRP, force a hat reload as if the context were stolen - * by storing INVALID_CONTEXT in the secondary and nulling TSB. - * Primary will be reset by usr_rtt for user-mode traps, or - * has been reset in iae_crp or dae_crp for kernel-mode. - */ - CPU_PADDR(%g1, %g5) - add %g1, CPU_MCPU, %g5 - add %g5, MCPU_NRE_ERROR, %g5 ! &CPU->cpu_m.cpu_nre_error - ldxa [%g5]ASI_MEM, %g4 - cmp %g4, CRP_OBSERVED ! confirm CRP - bne,pt %xcc, 5f - nop - mov INVALID_CONTEXT, %g5 ! force hat reload of context - mov MMU_SCONTEXT, %g7 - sethi %hi(FLUSH_ADDR), %g4 - stxa %g5, [%g7]ASI_MMU_CTX ! set secondary context reg - flush %g4 - mov %o0, %g4 - mov %o1, %g5 - mov %o5, %g7 - mov %g0, %o0 - mov %g0, %o1 - mov MMU_TSB_CTXNON0, %o5 - ta FAST_TRAP ! null TSB - nop - mov %g4, %o0 - mov %g5, %o1 - mov %g7, %o5 - - /* * Call sys_trap. %g2 is TL(arg2), %g3 is head and tail * offset(arg3). * %g3 looks like following: @@ -648,7 +586,7 @@ * * Run at PIL 14 unless we're already at PIL 15. */ -5: sllx %g3, 32, %g3 ! %g3.h = tail offset + sllx %g3, 32, %g3 ! %g3.h = tail offset or %g3, %g2, %g3 ! %g3.l = head offset rdpr %tl, %g2 ! %g2 = current tl
--- a/usr/src/uts/sun4v/ml/mach_offsets.in Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/ml/mach_offsets.in Thu Aug 06 17:39:39 2009 -0700 @@ -98,7 +98,6 @@ cpu_nrq_base_pa MCPU_NRQ_BASE cpu_nrq_size MCPU_NRQ_SIZE cpu_tstat_flags MCPU_TSTAT_FLAGS - cpu_nre_error MCPU_NRE_ERROR \#define CPU_MPCB_PA (CPU_MCPU + MCPU_MPCB_PA) \#define CPU_KWBUF_FULL (CPU_MCPU + MCPU_KWBUF_FULL) @@ -145,8 +144,6 @@ sfmmu_cext sfmmu_ctx_lock sfmmu_ctxs - sfmmu_pgsz_order - sfmmu_pgsz_map sf_scd SCD_SIZE scd_sfmmup @@ -184,7 +181,6 @@ scratch TSBMISS_SCRATCH shmermap TSBMISS_SHMERMAP scd_shmermap TSBMISS_SCDSHMERMAP - pgsz_bitmap TSBMISS_PGSZ_BITMAP \#define TSB_TAGACC (0 * TSBMISS_SCRATCH_INCR) \#define TSBMISS_HMEBP (1 * TSBMISS_SCRATCH_INCR) @@ -252,9 +248,6 @@ hv_tsb_info_pa hv_tsb_info_cnt -hv_pgsz_order - hv_pgsz_order_pa - cpu_node CPU_NODE_SIZE nodeid clock_freq
--- a/usr/src/uts/sun4v/ml/trap_table.s Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/ml/trap_table.s Thu Aug 06 17:39:39 2009 -0700 @@ -1396,10 +1396,6 @@ * (0=kernel, 1=invalid, or 2=user) rather than context ID) */ ALTENTRY(exec_fault) - set icache_is_coherent, %g6 /* check soft exec mode */ - ld [%g6], %g6 - brz,pn %g6, sfmmu_slow_immu_miss - nop TRACE_TSBHIT(TT_MMU_EXEC) MMU_FAULT_STATUS_AREA(%g4) ldx [%g4 + MMFSA_I_ADDR], %g2 /* g2 = address */
--- a/usr/src/uts/sun4v/os/error.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/os/error.c Thu Aug 06 17:39:39 2009 -0700 @@ -38,8 +38,6 @@ #include <sys/error.h> #include <sys/fm/util.h> #include <sys/ivintr.h> -#include <sys/machasi.h> -#include <sys/mmu.h> #include <sys/archsystm.h> #define MAX_CE_FLTS 10 @@ -212,7 +210,6 @@ int expected = DDI_FM_ERR_UNEXPECTED; uint64_t exec_mode; uint8_t u_spill_fill; - int u_kill = 1; mcpup = &(CPU->cpu_m); @@ -278,33 +275,8 @@ break; } /* - * Context Register Parity - for reload of secondary - * context register, see nonresumable_error. - */ - if ((errh_flt.errh_er.attr & ERRH_ATTR_ASI) && - (errh_flt.errh_er.asi == ASI_MMU_CTX)) { - - if (aflt->flt_tl) /* TL>0, so panic */ - break; - - /* Panic on unknown context registers */ - if (errh_flt.errh_er.addr < MMU_PCONTEXT0 || - errh_flt.errh_er.addr + errh_flt.errh_er.sz - > MMU_SCONTEXT1 + sizeof (uint64_t)) { - cmn_err(CE_WARN, "Parity error on " - "unknown context register\n"); - aflt->flt_panic = 1; - break; - } - - u_kill = 0; /* do not terminate */ - break; - } - /* - * All other PR_NRE fall through in order to - * check for protection. The list can include - * ERRH_ATTR_FRF, ERRH_ATTR_IRF, ERRH_ATTR_MEM, - * and ERRH_ATTR_PIO. + * Fall through, precise fault also need to check + * to see if it was protected. */ /*FALLTHRU*/ @@ -344,7 +316,7 @@ * for fatal errors. */ if (aflt->flt_class == BUS_FAULT) { - aflt->flt_addr = errh_flt.errh_er.addr; + aflt->flt_addr = errh_flt.errh_er.ra; errh_cpu_run_bus_error_handlers(aflt, expected); } @@ -393,13 +365,13 @@ errh_page_retire(&errh_flt, PR_UE); /* - * If we queued an error for a thread that should terminate - * and it was in user mode or protected by t_lofault, set AST - * flag so the queue will be drained before returning to user - * mode. Note that user threads can be killed via pcb_flags. 
+ * If we queued an error and the it was in user mode, or + * protected by t_lofault, or user_spill_fill is set, we + * set AST flag so the queue will be drained before + * returning to user mode. */ - if (u_kill && (!aflt->flt_priv || - aflt->flt_prot == AFLT_PROT_COPY || u_spill_fill)) { + if (!aflt->flt_priv || aflt->flt_prot == AFLT_PROT_COPY || + u_spill_fill) { int pcb_flag = 0; if (aflt->flt_class == CPU_FAULT) @@ -550,7 +522,7 @@ * If we are going to panic, scrub the page first */ if (errh_fltp->cmn_asyncflt.flt_panic) - mem_scrub(errh_fltp->errh_er.addr, + mem_scrub(errh_fltp->errh_er.ra, errh_fltp->errh_er.sz); } break; @@ -606,7 +578,7 @@ static void errh_page_retire(errh_async_flt_t *errh_fltp, uchar_t flag) { - uint64_t flt_real_addr_start = errh_fltp->errh_er.addr; + uint64_t flt_real_addr_start = errh_fltp->errh_er.ra; uint64_t flt_real_addr_end = flt_real_addr_start + errh_fltp->errh_er.sz - 1; int64_t current_addr;
--- a/usr/src/uts/sun4v/os/fillsysinfo.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/os/fillsysinfo.c Thu Aug 06 17:39:39 2009 -0700 @@ -41,7 +41,6 @@ #include <sys/cmp.h> #include <sys/async.h> #include <vm/page.h> -#include <vm/vm_dep.h> #include <vm/hat_sfmmu.h> #include <sys/sysmacros.h> #include <sys/mach_descrip.h> @@ -66,7 +65,6 @@ static uint64_t get_mmu_tsbs(md_t *, mde_cookie_t); static uint64_t get_mmu_shcontexts(md_t *, mde_cookie_t); static uint64_t get_cpu_pagesizes(md_t *, mde_cookie_t); -static int check_mmu_pgsz_search(md_t *, mde_cookie_t); static char *construct_isalist(md_t *, mde_cookie_t, char **); static void init_md_broken(md_t *, mde_cookie_t *); static int get_l2_cache_info(md_t *, mde_cookie_t, uint64_t *, uint64_t *, @@ -356,68 +354,13 @@ } md_free_scan_dag(mdp, &node); } + + md_free_scan_dag(mdp, &eunit); } } /* - * Setup instruction cache coherency. The "memory-coherent" property - * is optional. Default for Icache_coherency is 1 (I$ is coherent). - * If we find an Icache with coherency == 0, then enable non-coherent - * Icache support. - */ -void -setup_icache_coherency(md_t *mdp) -{ - int ncache; - mde_cookie_t *cachelist; - int i; - - ncache = md_alloc_scan_dag(mdp, md_root_node(mdp), "cache", - "fwd", &cachelist); - - /* - * The "cache" node is optional in MD, therefore ncaches can be 0. 
- */ - if (ncache < 1) { - return; - } - - for (i = 0; i < ncache; i++) { - uint64_t cache_level; - uint64_t memory_coherent; - uint8_t *type; - int typelen; - - if (md_get_prop_val(mdp, cachelist[i], "level", - &cache_level)) - continue; - - if (cache_level != 1) - continue; - - if (md_get_prop_data(mdp, cachelist[i], "type", - &type, &typelen)) - continue; - - if (strcmp((char *)type, "instn") != 0) - continue; - - if (md_get_prop_val(mdp, cachelist[i], "memory-coherent", - &memory_coherent)) - continue; - - if (memory_coherent != 0) - continue; - - mach_setup_icache(memory_coherent); - break; - } - - md_free_scan_dag(mdp, &cachelist); -} - -/* * All the common setup of sun4v CPU modules is done by this routine. */ void @@ -461,11 +404,6 @@ shctx_on = 1; } - /* - * Get and check page search register properties. - */ - pgsz_search_on = check_mmu_pgsz_search(mdp, cpulist[0]); - for (i = 0; i < nocpus; i++) fill_cpu(mdp, cpulist[i]); @@ -474,7 +412,6 @@ setup_chip_mappings(mdp); setup_exec_unit_mappings(mdp); - setup_icache_coherency(mdp); /* * If MD is broken then append the passed ISA set, @@ -1116,50 +1053,3 @@ md_free_scan_dag(mdp, &platlist); } - -/* - * This routine gets the MD properties associated with the TLB search order API - * and compares these against the expected values for a processor which supports - * this API. The return value is used to determine whether use the API. 
- */ -static int -check_mmu_pgsz_search(md_t *mdp, mde_cookie_t cpu_node_cookie) -{ - - uint64_t mmu_search_nshared_contexts; - uint64_t mmu_max_search_order; - uint64_t mmu_non_priv_search_unified; - uint64_t mmu_search_page_size_list; - - if (md_get_prop_val(mdp, cpu_node_cookie, - "mmu-search-#shared-contexts", &mmu_search_nshared_contexts)) - mmu_search_nshared_contexts = 0; - - if (mmu_search_nshared_contexts == 0 || - mmu_search_nshared_contexts != NSEARCH_SHCONTEXTS) - return (0); - - if (md_get_prop_val(mdp, cpu_node_cookie, "mmu-max-search-order", - &mmu_max_search_order)) - mmu_max_search_order = 0; - - if (mmu_max_search_order == 0 || mmu_max_search_order != - MAX_PGSZ_SEARCH_ORDER) - return (0); - - if (md_get_prop_val(mdp, cpu_node_cookie, - "mmu-non-priv-search-unified", &mmu_non_priv_search_unified)) - mmu_non_priv_search_unified = -1; - - if (mmu_non_priv_search_unified != 1) { - return (0); - } - - if (md_get_prop_val(mdp, cpu_node_cookie, - "mmu-search-page-size-list", &mmu_search_page_size_list)) { - mmu_search_page_size_list = 0; - return (0); - } - - return (1); -}
--- a/usr/src/uts/sun4v/os/mach_cpu_states.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/os/mach_cpu_states.c Thu Aug 06 17:39:39 2009 -0700 @@ -1075,20 +1075,7 @@ void kdi_flush_caches(void) { - /* - * May not be implemented by all sun4v architectures. - * - * Cannot use hsvc_version to see if the group is already - * negotiated or not because, this function is called by - * KMDB when it is at the console prompt which is running - * at highest PIL. hsvc_version grabs an adaptive mutex and - * this is a no-no at this PIL level. - */ - if (hsvc_kdi_mem_iflush_negotiated) { - uint64_t status = hv_mem_iflush_all(); - if (status != H_EOK) - cmn_err(CE_PANIC, "Flushing all I$ entries failed"); - } + /* Not required on sun4v architecture. */ } /*ARGSUSED*/ @@ -1101,16 +1088,6 @@ void cpu_kdi_init(kdi_t *kdi) { - /* - * Any API negotiation this early in the boot will be unsuccessful. - * Therefore firmware for Sun4v platforms that have incoherent I$ are - * assumed to support pre-negotiated MEM_IFLUSH APIs. Successful - * invokation the MEM_IFLUSH_ALL is a test for is availability. - * Set a flag if successful indicating its availabitlity. - */ - if (hv_mem_iflush_all() == 0) - hsvc_kdi_mem_iflush_negotiated = B_TRUE; - kdi->kdi_flush_caches = kdi_flush_caches; kdi->mkdi_cpu_init = kdi_cpu_init; kdi->mkdi_cpu_ready_iter = kdi_cpu_ready_iter;
--- a/usr/src/uts/sun4v/pcbe/rock_pcbe.c Thu Aug 06 17:19:00 2009 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2316 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ -/* - * Rock Performance Counter Back End - */ - -#include <sys/cpuvar.h> -#include <sys/systm.h> -#include <sys/cmn_err.h> -#include <sys/cpc_impl.h> -#include <sys/cpc_pcbe.h> -#include <sys/modctl.h> -#include <sys/machsystm.h> -#include <sys/sdt.h> -#include <sys/hypervisor_api.h> -#include <sys/rock_hypervisor_api.h> -#include <sys/hsvc.h> - -#define NT_END 0xFF - -/* Counter Types */ -#define NUM_PCBE_COUNTERS 6 -#define RK_PERF_CYC 0x0100 -#define RK_PERF_INSTR 0x0200 -#define RK_PERF_L2 0x0400 -#define RK_PERF_MMU 0x0800 -#define RK_PERF_YANK 0x2000 -#define RK_PERF_SIBLK 0x4000 -#define RK_PERF_LVLK 0x8000 -#define RK_PERF_SPEC 0x1000 /* Reserved */ - -#define NORMAL_COUNTER 0x1 -#define SYNTHETIC_COUNTER 0x2 - -/* ASI_PERF_MMU_CNT_FILTER TXN bits */ -#define ASI_PERF_MMU_CNT_FILTER_UTLB_HITS 0x1 -#define ASI_PERF_MMU_CNT_FILTER_UTLB_MISS 0x2 -#define ASI_PERF_MMU_CNT_FILTER_DATA_ACCESS 0x8 -#define ASI_PERF_MMU_CNT_FILTER_INSTR_ACCESS 0x10 -#define ASI_PERF_MMU_CNT_FILTER_EA_VIRTUAL 0x20 -#define ASI_PERF_MMU_CNT_FILTER_EA_REAL 0x40 - -#define MMU_ALL_TXNS (ASI_PERF_MMU_CNT_FILTER_UTLB_HITS | \ - ASI_PERF_MMU_CNT_FILTER_UTLB_MISS | \ - ASI_PERF_MMU_CNT_FILTER_DATA_ACCESS | \ - ASI_PERF_MMU_CNT_FILTER_INSTR_ACCESS | \ - ASI_PERF_MMU_CNT_FILTER_EA_VIRTUAL | \ - ASI_PERF_MMU_CNT_FILTER_EA_REAL) - -#define MMU_ITLB_MISS (ASI_PERF_MMU_CNT_FILTER_EA_REAL | \ - ASI_PERF_MMU_CNT_FILTER_EA_VIRTUAL | \ - ASI_PERF_MMU_CNT_FILTER_INSTR_ACCESS | \ - ASI_PERF_MMU_CNT_FILTER_UTLB_MISS | \ - ASI_PERF_MMU_CNT_FILTER_UTLB_HITS) - -#define MMU_DTLB_MISS (ASI_PERF_MMU_CNT_FILTER_EA_REAL | \ - ASI_PERF_MMU_CNT_FILTER_EA_VIRTUAL | \ - ASI_PERF_MMU_CNT_FILTER_DATA_ACCESS | \ - ASI_PERF_MMU_CNT_FILTER_UTLB_MISS | \ - ASI_PERF_MMU_CNT_FILTER_UTLB_HITS) - -#define MMU_UTLB_MISS (ASI_PERF_MMU_CNT_FILTER_EA_REAL | \ - ASI_PERF_MMU_CNT_FILTER_EA_VIRTUAL | \ - ASI_PERF_MMU_CNT_FILTER_INSTR_ACCESS | \ - ASI_PERF_MMU_CNT_FILTER_DATA_ACCESS | \ - ASI_PERF_MMU_CNT_FILTER_UTLB_MISS) - 
-#define MMU_UTLB_HIT (ASI_PERF_MMU_CNT_FILTER_EA_REAL | \ - ASI_PERF_MMU_CNT_FILTER_EA_VIRTUAL | \ - ASI_PERF_MMU_CNT_FILTER_INSTR_ACCESS | \ - ASI_PERF_MMU_CNT_FILTER_DATA_ACCESS | \ - ASI_PERF_MMU_CNT_FILTER_UTLB_HITS) - -#define MMU_ITLB_MISS_UTLB_HIT (ASI_PERF_MMU_CNT_FILTER_EA_REAL | \ - ASI_PERF_MMU_CNT_FILTER_EA_VIRTUAL | \ - ASI_PERF_MMU_CNT_FILTER_INSTR_ACCESS | \ - ASI_PERF_MMU_CNT_FILTER_UTLB_HITS) - -#define MMU_ITLB_MISS_UTLB_MISS (ASI_PERF_MMU_CNT_FILTER_EA_REAL | \ - ASI_PERF_MMU_CNT_FILTER_EA_VIRTUAL | \ - ASI_PERF_MMU_CNT_FILTER_INSTR_ACCESS | \ - ASI_PERF_MMU_CNT_FILTER_UTLB_MISS) - -#define MMU_DTLB_MISS_UTLB_HIT (ASI_PERF_MMU_CNT_FILTER_EA_REAL | \ - ASI_PERF_MMU_CNT_FILTER_EA_VIRTUAL | \ - ASI_PERF_MMU_CNT_FILTER_DATA_ACCESS | \ - ASI_PERF_MMU_CNT_FILTER_UTLB_HITS) - -#define MMU_DTLB_MISS_UTLB_MISS (ASI_PERF_MMU_CNT_FILTER_EA_REAL | \ - ASI_PERF_MMU_CNT_FILTER_EA_VIRTUAL | \ - ASI_PERF_MMU_CNT_FILTER_DATA_ACCESS | \ - ASI_PERF_MMU_CNT_FILTER_UTLB_MISS) - -/* - * These values will be loaded to nametable.bits which is a 32 bit number. - * Please see the description of bits in nametable struct. If the counters - * are a part of different pic, then we can re-use GROUP and TYPE. 
- */ -#define SYN_BIT ((uint32_t)1 << 31) /* Set bit 32 */ -#define GROUP_MASK 0xFFF000 /* Bits 12-23 */ -#define ID_TO_GROUP(GROUP_ID) ((GROUP_ID)<<12) -#define GROUP(SYN_COUNTER) ((SYN_COUNTER) & GROUP_MASK) -#define TYPE(SYN_COUNTER) ((SYN_COUNTER) & 0x000FFF) /* Bits 0-12 */ - -/* Synthetic counter types */ -#define L2_GROUP_DS ID_TO_GROUP(0) -#define DS_DRAM 0x0 /* From PRM */ -#define DS_L3 0x1 /* ditto */ -#define DS_OTHER_L2 0x2 /* ditto */ -#define DS_LOCAL_L2 0x3 /* ditto */ - -#define L2_DS_DRAM (SYN_BIT | L2_GROUP_DS | DS_DRAM) -#define L2_DS_L3 (SYN_BIT | L2_GROUP_DS | DS_L3) -#define L2_DS_OTHER_L2 (SYN_BIT | L2_GROUP_DS | DS_OTHER_L2) -#define L2_DS_LOCAL_L2 (SYN_BIT | L2_GROUP_DS | DS_LOCAL_L2) - -#define L2_GROUP_TXN_MISS ID_TO_GROUP(1) -#define TXN_LD 0x3 /* From PRM */ -#define TXN_ST 0x18 /* ditto */ -#define L2_TXN_LD_MISS (SYN_BIT | L2_GROUP_TXN_MISS | TXN_LD) -#define L2_TXN_ST_MISS (SYN_BIT | L2_GROUP_TXN_MISS | TXN_ST) - -#define L2_GROUP_TXN_HIT ID_TO_GROUP(2) -#define L2_TXN_LD_HIT (SYN_BIT | L2_GROUP_TXN_HIT | TXN_LD) -#define L2_TXN_ST_HIT (SYN_BIT | L2_GROUP_TXN_HIT | TXN_ST) - -#define L2_GROUP_EVT ID_TO_GROUP(3) -#define EVT_L2_MISS 0x8 /* From PRM */ -#define EVT_L2_PEND_ST 0x2 /* ditto */ -#define EVT_L2_PRIOR_MISS 0x1 /* ditto */ -#define EVT_L2_NOEVENTS 0x0 /* ditto */ -#define L2_HIT 0 -#define L2_MISS 1 - -#define L2_EVT_HIT (SYN_BIT | L2_GROUP_EVT | L2_HIT) -#define L2_EVT_MISS (SYN_BIT | L2_GROUP_EVT | L2_MISS) - -/* Instruction types. 
Corresponds to ASI_PERF_IS_INFO.TYP */ -#define I_GROUP_TYPE ID_TO_GROUP(0) -#define TYPE_HELPER (1<<0) -#define TYPE_LD (1<<1) -#define TYPE_ST (1<<2) -#define TYPE_CTI (1<<3) -#define TYPE_FP (1<<4) -#define TYPE_INT_ALU (1<<5) -#define TYPE_CMPLX_ALU (1<<6) - -#define INSTR_TYPE_LD (SYN_BIT | I_GROUP_TYPE | TYPE_LD) -#define INSTR_TYPE_ST (SYN_BIT | I_GROUP_TYPE | TYPE_ST) -#define INSTR_TYPE_CTI (SYN_BIT | I_GROUP_TYPE | TYPE_CTI) -#define INSTR_TYPE_FP (SYN_BIT | I_GROUP_TYPE | TYPE_FP) - -/* Execution modes. Corresponds to ASI_PERF_IS_INFO.MODE */ -#define I_GROUP_MODE ID_TO_GROUP(1) -#define MODE_NOR 0x0 /* From PRM */ -#define MODE_OOO 0x1 /* ditto */ -#define MODE_EXE 0x2 /* ditto */ -#define MODE_DLY 0x3 /* ditto */ -#define MODE_DEF 0x4 /* ditto */ -#define MODE_HWS 0x5 /* ditto */ - -#define INSTR_MODE_NOR (SYN_BIT | I_GROUP_MODE | MODE_NOR) -#define INSTR_MODE_OOO (SYN_BIT | I_GROUP_MODE | MODE_OOO) -#define INSTR_MODE_EXE (SYN_BIT | I_GROUP_MODE | MODE_EXE) -#define INSTR_MODE_DLY (SYN_BIT | I_GROUP_MODE | MODE_DLY) -#define INSTR_MODE_DEF (SYN_BIT | I_GROUP_MODE | MODE_DEF) -#define INSTR_MODE_HWS (SYN_BIT | I_GROUP_MODE | MODE_HWS) - -/* Instruction events. 
Corresponds to ASI_PERF_IS_INFO.EVT */ -#define I_GROUP_EVT ID_TO_GROUP(2) - -/* Bit numbers from PRM */ -#define EVT_DC_MISS (1<<0) -#define EVT_PRIOR_MISS (1<<1) -#define EVT_DTLB_MISS (1<<2) -#define EVT_LDB_FULL (1<<3) -#define EVT_STB_FULL (1<<4) -#define EVT_FE_STALL (1<<5) -#define EVT_FROM_DQ (1<<6) -#define EVT_CORRECT_BP (1<<7) -#define EVT_BYPASS_RAW (1<<8) -#define EVT_NONBYPASS_RAW (1<<9) -#define EVT_CTI_TAKEN (1<<10) -#define EVT_FAILED_SPEC (1<<11) - -#define INSTR_EVT_DC_MISS (SYN_BIT | I_GROUP_EVT | EVT_DC_MISS) -#define INSTR_EVT_PRIOR_MISS (SYN_BIT | I_GROUP_EVT | EVT_PRIOR_MISS) -#define INSTR_EVT_DTLB_MISS (SYN_BIT | I_GROUP_EVT | EVT_DTLB_MISS) -#define INSTR_EVT_LDB_FULL (SYN_BIT | I_GROUP_EVT | EVT_LDB_FULL) -#define INSTR_EVT_STB_FULL (SYN_BIT | I_GROUP_EVT | EVT_STB_FULL) -#define INSTR_EVT_FE_STALL (SYN_BIT | I_GROUP_EVT | EVT_FE_STALL) -#define INSTR_EVT_FROM_DQ (SYN_BIT | I_GROUP_EVT | EVT_FROM_DQ) -#define INSTR_EVT_CORRECT_BP (SYN_BIT | I_GROUP_EVT | EVT_CORRECT_BP) -#define INSTR_EVT_BYPASS_RAW (SYN_BIT | I_GROUP_EVT | EVT_BYPASS_RAW) -#define INSTR_EVT_NONBYPASS_RAW (SYN_BIT | I_GROUP_EVT | EVT_NONBYPASS_RAW) -#define INSTR_EVT_CTI_TAKEN (SYN_BIT | I_GROUP_EVT | EVT_CTI_TAKEN) -#define INSTR_EVT_FAILED_SPEC (SYN_BIT | I_GROUP_EVT | EVT_FAILED_SPEC) - -/* - * Synthetic counters to count MCCDESR error events - * All the events are mutually exclusive therefore can be counted - * simultaneously. Hence each one is a different pic. Therefore - * there is no need to have GROUP or TYPE for these counters. 
- */ -#define MCCDESR_YANK (SYN_BIT) -#define MCCDESR_SIBLK (SYN_BIT) -#define MCCDESR_LVLK (SYN_BIT) - -/* Number of samples to be taken before Performance Event Trap is generated */ -/* Maximum frequencies that can be configured */ -#define INSTR_SAM_MAX_FREQ 0x3FF /* 10 bits */ -#define L2_SAM_MAX_FREQ 0xFFFF /* 16 bits */ -#define MMU_SAM_MAX_FREQ 0xFFFF /* 16 bits */ - -/* Minimum frequencies that should be configured to prevent DOS */ -#define INSTR_SAM_MIN_FREQ 100 -#define L2_SAM_MIN_FREQ 250 -#define MMU_SAM_MIN_FREQ 250 - -/* Default frequencies that are configured */ -#define INSTR_SAM_DEF_FREQ 250 -#define L2_SAM_DEF_FREQ 1000 - -/* Number of bits in the hardware for the counter */ -#define CYC_COUNTER_BITS 18 -#define INSTR_COUNTER_BITS 18 -#define L2_COUNTER_BITS 48 -#define MMU_COUNTER_BITS 48 -#define YANK_COUNTER_BITS 64 -#define SIBLK_COUNTER_BITS 64 -#define LVLK_COUNTER_BITS 64 - -#define RK_PERF_COUNT_TOE_SHIFT (63) - -#define STATE_CONFIGURED 0x1 -#define STATE_PROGRAMMED 0x2 -#define STATE_STOPPED 0x4 -#define STATE_RELEASED 0x8 -#define UNINITIALIZED 2 /* should be other than 0/1 */ -#define TLZ 1 /* Do not make it zero */ -#define TLNZ 2 - -#define CPU_REF_URL " Documentation for Sun processors can be found at: " \ - "http://www.sun.com/processors/manuals" - -#define MIN_RINGBUF_ENTRIES 100 - -#define RINGBUF_GET_HEAD(RB) \ - (uint64_t *)((uint64_t)(&RB->va_values) + RB->head); - -#define RINGBUF_GET_TAIL(RB) \ - (uint64_t *)((uint64_t)(&RB->va_values) + RB->tail); - -#define RINGBUF_SET_HEAD(RB, PTR) \ - RB->head = (uint64_t)PTR - (uint64_t)(&RB->va_values); \ - RB->hwm = RB->head + (RB->size >> 1); \ - if (RB->hwm >= RB->size) \ - RB->hwm -= RB->size; - -#define RINGBUF_MOVE_HEAD(RB, PTR, SAMPLE_SZ) \ - PTR = (uint64_t *)((uint64_t)PTR + SAMPLE_SZ); \ - if (PTR >= (uint64_t *)((uint64_t)(&RB->va_values) + RB->size)) \ - PTR = (uint64_t *)&RB->va_values; - -#define MAKE_MASK(NBITS, SHIFT) (((unsigned long)(1<<(NBITS))-1)<<SHIFT) - -#define 
COUNTER_MAX(_p) ((int64_t)((1ULL << (_p->counter_bits - 1)) - 1)) -#define COUNTER_MIN(_p) ((int64_t)-(COUNTER_MAX(_p))) -#define COUNTER_MASK(_p) (bitmask(_p->counter_bits)) - -/* Global Structures and typedefs */ -struct _rk_pcbe_ringbuf { /* INIT-ER WRITTER READER */ - uint32_t head; /* offset guest guest guest */ - uint32_t tail; /* offset guest hv both */ - uint32_t size; /* bytes guest n/a both */ - uint32_t hwm; /* bytes guest hv guest */ - uint64_t va_values; /* guest hv guest */ -}; - -typedef struct _rk_pcbe_ringbuf rk_pcbe_ringbuf_t; - -typedef struct _sampler { - rk_pcbe_ringbuf_t *ring_buffer; /* Ring buffer start address */ - uint64_t synthetic_pic; - uint32_t frequency; /* Sampling Frequency */ - uint32_t syn_counter; /* Synthetic Counter Type */ - uint32_t sample_size; /* Size of each sample in bytes */ - uint32_t flags; /* instr sampler: priv */ - uint8_t tl; /* Trap Level Filtering */ - uint8_t nohws; /* Filter out HW Scouting samples */ -} sampler_t; - -typedef struct _rk_pcbe_config { - uint8_t pcbe_picno; /* 0-6:instr,l2,mmu,yank,siblk,lvlk */ - uint8_t counter_bits; /* Number of counter bits */ - uint8_t counter_type; /* Normal or Synthetic */ - uint8_t toe; /* Trap on Enable */ - uint32_t counter; /* Counter name */ - uint32_t src_type; /* Strand, Strands, SIU, MMU */ - uint32_t flags; /* instr counter:priv. 
l2,mmu:Xn */ - uint64_t pcbe_pic; /* PIC counter value */ - uint8_t inuse; /* pic in use or not */ - uint8_t state; /* Current state of the pic */ - processorid_t cpu; /* CPU associated to this pic */ - sampler_t sampler; -#ifdef RKPCBE_DBG - char name[64]; /* Human readable counter name */ -#endif -} rk_pcbe_config_t; - -/* Function Prototypes for those that are invoked using rk_pcbe_ops */ -static int rk_pcbe_init(void); -static int rk_pcbe_fini(void); -static uint_t rk_pcbe_ncounters(void); -static const char *rk_pcbe_impl_name(void); -static const char *rk_pcbe_cpuref(void); -static char *rk_pcbe_list_events(uint_t picnum); -static char *rk_pcbe_list_attrs(void); -static uint64_t rk_pcbe_event_coverage(char *event); -static uint64_t rk_pcbe_overflow_bitmap(void); -static int rk_pcbe_configure(uint_t picnum, char *event, uint64_t preset, - uint32_t flags, uint_t nattrs, kcpc_attr_t *attrs, void **data, - void *token); -static void rk_pcbe_program(void *token); -static void rk_pcbe_allstop(void); -static void rk_pcbe_sample(void *token); -static void rk_pcbe_free(void *config); - -pcbe_ops_t rk_pcbe_ops = { - PCBE_VER_1, - CPC_CAP_OVERFLOW_INTERRUPT, - rk_pcbe_ncounters, - rk_pcbe_impl_name, - rk_pcbe_cpuref, - rk_pcbe_list_events, - rk_pcbe_list_attrs, - rk_pcbe_event_coverage, - rk_pcbe_overflow_bitmap, - rk_pcbe_configure, - rk_pcbe_program, - rk_pcbe_allstop, - rk_pcbe_sample, - rk_pcbe_free -}; - -/* - * bits: - * - * | 31 |30 24|23 12|11 0 - * | Syn/Normal | Rsvd | Group | Type | - */ -struct nametable { - const uint32_t bits; - const char *name; -}; - -/* Instruction Counter. 
picno: 0 */ -static const struct nametable Rock_names0[] = { - {0x1, "Instr_All"}, - /* Synthetic counters */ - {INSTR_MODE_NOR, "Instr_Normal"}, - {INSTR_MODE_OOO, "Instr_Out_Of_Order"}, - {INSTR_MODE_EXE, "Instr_Execute_Ahead"}, - {INSTR_MODE_DLY, "Instr_Delay"}, - {INSTR_MODE_DEF, "Instr_Deferred"}, - {INSTR_MODE_HWS, "Instr_Scout"}, - - {INSTR_TYPE_LD, "Instr_Load"}, - {INSTR_TYPE_ST, "Instr_Store"}, - {INSTR_TYPE_CTI, "Instr_Branch"}, - {INSTR_TYPE_FP, "Instr_Float"}, - - {INSTR_EVT_DC_MISS, "Instr_Dcache_Miss"}, - {INSTR_EVT_PRIOR_MISS, "Instr_Prior_Miss"}, - {INSTR_EVT_DTLB_MISS, "Instr_Dtlb_Miss"}, - {INSTR_EVT_LDB_FULL, "Instr_Loadbuf_Full"}, - {INSTR_EVT_STB_FULL, "Instr_Storebuf_Full"}, - {INSTR_EVT_FE_STALL, "Instr_Stall"}, - {INSTR_EVT_FROM_DQ, "Instr_DQ"}, - {INSTR_EVT_CORRECT_BP, "Instr_Correct_Branch_Predict"}, - {INSTR_EVT_BYPASS_RAW, "Instr_Bypass_Raw"}, - {INSTR_EVT_NONBYPASS_RAW, "Instr_Nonbypass_Raw"}, - {INSTR_EVT_CTI_TAKEN, "Instr_Branch_Taken"}, - {INSTR_EVT_FAILED_SPEC, "Instr_Failed_Spec"}, - - {NT_END, ""} -}; - -/* L2 Counters. picno: 1 */ -static const struct nametable Rock_names1[] = { - {0x1, "L2_Icache_Load"}, - {0x2, "L2_Dcache_Load"}, - {0x4, "L2_Instr_Prefetch"}, - {0x8, "L2_Store_Prefetch"}, - {0x10, "L2_Store"}, - {0x20, "L2_Atomic_Ops"}, - {0x40, "L2_Flush"}, - /* Synthetic counters */ - {L2_DS_L3, "L2_Load_From_L3"}, - {L2_DS_DRAM, "L2_Load_From_Dram"}, - {L2_DS_OTHER_L2, "L2_Load_From_Other_L2"}, - - {L2_TXN_LD_MISS, "L2_Load_Miss"}, - {L2_TXN_ST_MISS, "L2_Store_Miss"}, - {L2_TXN_LD_HIT, "L2_Load_Hit"}, - {L2_TXN_ST_HIT, "L2_Store_Hit"}, - - {L2_EVT_HIT, "L2_Hit"}, - {L2_EVT_MISS, "L2_Miss"}, - {NT_END, ""} -}; - -/* MMU Counters. 
picno: 2 */ -static const struct nametable Rock_names2[] = { - {MMU_ALL_TXNS, "MMU_All"}, - {MMU_ITLB_MISS, "MMU_Itlb_Miss"}, - {MMU_DTLB_MISS, "MMU_Dtlb_Miss"}, - {MMU_UTLB_MISS, "MMU_Utlb_Miss"}, - {MMU_UTLB_HIT, "MMU_Utlb_Hit"}, - {MMU_ITLB_MISS_UTLB_MISS, "MMU_I_Utlb_Miss"}, - {MMU_ITLB_MISS_UTLB_HIT, "MMU_I_Utlb_Hit"}, - {MMU_DTLB_MISS_UTLB_MISS, "MMU_D_Utlb_Miss"}, - {MMU_DTLB_MISS_UTLB_HIT, "MMU_D_Utlb_Hit"}, - {NT_END, ""} -}; - -/* YANK Counter. picno: 3 */ -static const struct nametable Rock_names3[] = { - {MCCDESR_YANK, "Yank"}, - {NT_END, ""} -}; - -/* SIBLK Counter. picno: 4 */ -static const struct nametable Rock_names4[] = { - {MCCDESR_SIBLK, "Siblk"}, - {NT_END, ""} -}; - -/* LVLK Counter. picno: 5 */ -static const struct nametable Rock_names5[] = { - {MCCDESR_LVLK, "Lvlk"}, - {NT_END, ""} -}; - -static const struct nametable *Rock_names[NUM_PCBE_COUNTERS] = { - Rock_names0, - Rock_names1, - Rock_names2, - Rock_names3, - Rock_names4, - Rock_names5 -}; - -extern char cpu_module_name[]; -uint32_t num_ringbuf_entries = 500; /* Should be a EVEN # */ -static const struct nametable **events; -static char *pic_events[NUM_PCBE_COUNTERS]; -static rk_pcbe_config_t *active_pics[NUM_PCBE_COUNTERS][NCPU]; -static boolean_t rock_pcbe_hsvc_available = B_TRUE; - -static char *rock_name; -static char rock_cpuref[256]; -static char pcbe_module_name[64] = "pcbe."; - -static hsvc_info_t rock_pcbe_hsvc = { - HSVC_REV_1, /* HSVC rev num */ - NULL, /* Private */ - HSVC_GROUP_RKPERF, /* Requested API Group */ - ROCK_HSVC_MAJOR, /* Requested Major */ - ROCK_HSVC_MINOR, /* Requested Minor */ - pcbe_module_name /* Module name */ -}; - -/* Function Definitions */ -static struct modlpcbe modlpcbe = { - &mod_pcbeops, - "Perf Counters v1.1", - &rk_pcbe_ops -}; - -static struct modlinkage modl = { - MODREV_1, - &modlpcbe, -}; - -/* - * Below two structures are used to pass data from program_*_sampler() to - * program_a_sampler() - */ -struct asi { - uint64_t va; - uint64_t value; 
-}; - -typedef struct _s { - char name[32]; /* User friendly name */ - int asi_config_num; /* Num of ASIs to be configured */ - struct asi asi_config[10]; /* ASIs that gets configured */ - int asi_sample_num; /* Num of data return ASIs */ - uint64_t asi_sample[10]; /* Data return ASIs when sampled */ -} program_sampler_data_t; - -/* Local Function prototypes */ -static void rk_pcbe_stop_synthetic(rk_pcbe_config_t *pic); -static void rk_pcbe_release(rk_pcbe_config_t *pic); -static void rk_pcbe_free_synthetic(rk_pcbe_config_t *pic); - -static int rk_pcbe_program_normal(rk_pcbe_config_t *pic); -static int rk_pcbe_program_synthetic(rk_pcbe_config_t *pic); -static int program_l2_sampler(rk_pcbe_config_t *pic); -static int program_instr_sampler(rk_pcbe_config_t *pic); -static int program_a_sampler(rk_pcbe_config_t *pic, - program_sampler_data_t *sdata); - -static int rk_pcbe_sample_internal(rk_pcbe_config_t *pic, uint64_t *diffp); -static int rk_pcbe_sample_synthetic(rk_pcbe_config_t *pic, int64_t *diffp); -static int sample_l2_sampler(rk_pcbe_config_t *pic, int64_t *diffp); -static int sample_instr_sampler(rk_pcbe_config_t *pic, int64_t *diffp); -static int sample_mccdesr(rk_pcbe_config_t *pic, int64_t *diffp); -static int synthesize_sample_count(rk_pcbe_config_t *pic, uint64_t sample_count, - uint64_t sample_hit_count, char *name, int64_t *diffp); - -static int alloc_ringbuffer(rk_pcbe_config_t *pic, uint32_t size, - uint32_t num_samples); -static void free_ringbuffer(rk_pcbe_config_t *pic); -static void print_hv_error(uint64_t rc, int *cntp, char *funcname, - rk_pcbe_config_t *pic); -static void set_string_constants(void); -static uint64_t bitmask(uint8_t); - -#ifdef RKPCBE_DBG -static void print_pic(rk_pcbe_config_t *pic, char *heading); -static void set_pic_name(rk_pcbe_config_t *pic); -/* lock for print clarity */ -static kmutex_t print_pic_lock; -#define PRINT_PIC(pic, heading) \ - print_pic(pic, heading) -#define DBG_PRINT(_z) printf _z -#else -#define 
PRINT_PIC(pic, heading) (void)0 -#define DBG_PRINT(ignore) (void)0 -#endif - -int -_init(void) -{ - if (rk_pcbe_init() != 0) - return (ENOTSUP); - return (mod_install(&modl)); -} - -int -_fini(void) -{ - if (rk_pcbe_fini() != 0) - return (EBUSY); - return (mod_remove(&modl)); -} - -int -_info(struct modinfo *mi) -{ - return (mod_info(&modl, mi)); -} - -static int -rk_pcbe_init(void) -{ - const struct nametable *n; - int i, status, j; - size_t size; - uint64_t rock_pcbe_hsvc_sup_minor; - - set_string_constants(); - /* - * Validate API version for Rock pcbe hypervisor services - */ - status = hsvc_register(&rock_pcbe_hsvc, &rock_pcbe_hsvc_sup_minor); - if ((status != 0) || (rock_pcbe_hsvc_sup_minor < - (uint64_t)ROCK_HSVC_MINOR)) { - cmn_err(CE_WARN, "%s cannot negotiate hypervisor services: " - "major: 0x%lx minor: 0x%lx group: 0x%x errno: %d", - pcbe_module_name, rock_pcbe_hsvc.hsvc_major, - rock_pcbe_hsvc.hsvc_minor, HSVC_GROUP_RKPERF, status); - rock_pcbe_hsvc_available = B_FALSE; - return (-1); - } - - events = Rock_names; - /* - * Initialize the list of events for each PIC. - * Do two passes: one to compute the size necessary and another - * to copy the strings. Need room for event, comma, and NULL terminator. - */ - for (i = 0; i < NUM_PCBE_COUNTERS; i++) { - size = 0; - for (n = events[i]; n->bits != NT_END; n++) - size += strlen(n->name) + 1; - pic_events[i] = kmem_alloc(size + 1, KM_SLEEP); - *pic_events[i] = '\0'; - for (n = events[i]; n->bits != NT_END; n++) { - (void) strcat(pic_events[i], n->name); - (void) strcat(pic_events[i], ","); - } - /* - * Remove trailing comma. 
- */ - pic_events[i][size - 1] = '\0'; - - /* Initialize all active pics as NULL */ - for (j = 0; j < NCPU; j++) - active_pics[i][j] = NULL; - } -#ifdef RKPCBE_DBG - mutex_init(&print_pic_lock, NULL, MUTEX_DRIVER, - (void *)ipltospl(PIL_15)); -#endif - return (0); -} - -static int -rk_pcbe_fini(void) -{ - return (0); -} - -static uint_t -rk_pcbe_ncounters(void) -{ - return (NUM_PCBE_COUNTERS); -} - -static const char * -rk_pcbe_impl_name(void) -{ - return (rock_name); -} - -static const char * -rk_pcbe_cpuref(void) -{ - return (rock_cpuref); -} - -static char * -rk_pcbe_list_events(uint_t picnum) -{ - ASSERT(picnum >= (uint_t)0 && picnum < cpc_ncounters); - - return (pic_events[picnum]); -} - -static char * -rk_pcbe_list_attrs(void) -{ - /* - * If no value is spcified in the command line for the - * attributes then, a default value of 1 is passed into - * pcbe from cpc. Specifying a value as zero is as good as - * not specifying it. - * 'source' attribute is equivallent of 'single, shared, - * siu, mmu' all put together. 'source' will take precedence - * over others. - * Valid 'source' values are defined in rock_hypervisor_api.h. - * If multiple flags need to be specified then user has to - * specify the bitwise OR of the flags he/she is interested in. - * populate_pic_config validates the correctness of the flags - * specified. - * tl is little odd. To consider instructions at - * tl == 0, specify tl = TLZ in command line - * tl > 0, specify tl = TLNZ in command line - * The reason for this oddness: attr = 0 means, neglect - * that attr. 
- */ - return ("freq,source,single,shared,siu,mmu,nohws,tl,hpriv"); -} - -static const struct nametable * -find_event(int picno, char *name) -{ - const struct nametable *n; - - for (n = events[picno]; n->bits != NT_END; n++) - if (strcmp(name, n->name) == 0) - return (n); - - return (NULL); -} - -static uint64_t -rk_pcbe_event_coverage(char *event) -{ - uint64_t bitmap = 0; - int i; - - /* There is no intersection of events between different PICs */ - for (i = 0; i < NUM_PCBE_COUNTERS; i++) { - if (find_event(i, event) != NULL) { - bitmap = 1 << i; - break; - } - } - return (bitmap); -} - -static uint64_t -rk_pcbe_overflow_bitmap(void) -{ - int i; - rk_pcbe_config_t *pic; - uint64_t ovf_bitmask = 0, ovf_cnt; - - for (i = 0; i < NUM_PCBE_COUNTERS; i++) { - pic = active_pics[i][CPU->cpu_id]; - - if (pic == NULL || pic->inuse != B_TRUE) - continue; - - DBG_PRINT(("CPU-%d: Pic %s (#%d, cntr %X) overflowed\n", - CPU->cpu_id, pic->name, pic->pcbe_picno, pic->counter)); - - /* Check if any of the active pics overflowed */ - if (pic->counter_type == NORMAL_COUNTER) { - hv_rk_perf_count_overflow((uint64_t)(pic->counter | - pic->src_type), &ovf_cnt); - if (ovf_cnt > 0) - pic->pcbe_pic += (0x1ULL << pic->counter_bits); - } else { - /* - * Synthetic counters don't overflow, so we must have gotten - * here because the ringbuffer is getting half-full or - * one of the normal counter which is a part of synthetic - * counter did overflow. Force cpc to call - * rk_pcbe_sample_synthetic by setting ovf_cnt to 1. If - * returned 0, then cpc prints a WARNING message: - * "WARNING: interrupt 0x80c at level 15 not serviced" - */ - ovf_cnt = B_TRUE; - } - - if (ovf_cnt > 0) - ovf_bitmask |= (1 << pic->pcbe_picno); - } - return (ovf_bitmask); -} - -/* - * populate_pic_config - * - * Checks the validity of all the attributes and then updates flags - * to reflect priv bits for Cycle and Instruction counters and - * transaction bits for L2 and makes sure that flags is 0 for MMU. 
- * - * Along with validating the inputs, pic is populated with appropriate - * values. - * - * Returns 0 on success and CPC_INVALID_ATTRIBUTE on failure. - */ -static int -populate_pic_config(uint_t picnum, uint_t nattrs, kcpc_attr_t *attrs, - uint32_t bits, rk_pcbe_config_t *pic) -{ - int i; - uint32_t freq = 0; - uint32_t *flagsp = &(pic->flags); - uint32_t source = 0; - - pic->pcbe_picno = (uint8_t)picnum; - pic->toe = B_TRUE; - pic->sampler.synthetic_pic = 0; - pic->sampler.ring_buffer = NULL; - pic->inuse = UNINITIALIZED; - pic->counter_type = ((bits & SYN_BIT) == 0) ? NORMAL_COUNTER : - SYNTHETIC_COUNTER; - - /* - * Initialized to 0. If a valid source attribute is specified, then - * src_type field gets populated later, else will be defaulted to - * HV_RK_PERF_SRC_STRAND - */ - pic->src_type = 0; - /* - * Initialized to zero. In all the fallthrough case, this - * is checked to determine if certain fields needs to be - * populated or not - */ - pic->counter = 0; - - /* - * When synthetic counter's ring buffer reaches HWM, HV generates - * PIC overflow trap to get guest's attention. This is not same as - * a hardware counter overflow. Size of the ring buffer is configurable - * and since there is no definite size, CPC_OVF_NOTIFY_EMT flag has no - * meaning wrt synthetic counters. - */ - if ((bits & SYN_BIT) && (*flagsp & CPC_OVF_NOTIFY_EMT)) - return (CPC_PIC_NOT_CAPABLE); - - /* - * This flag is used by CPC to inform the application of a counter - * overflow. It is of no use to PCBE. 
- */ - *flagsp &= ~(CPC_OVF_NOTIFY_EMT); - - switch (picnum) { -#define PRIV_BITS_MASK 0x7 -#define PRIV_BIT0_MASK 0x1 -#define PRIV_BIT1_MASK 0x2 -#define PRIV_BIT2_MASK 0x4 - - case 0: /* Instruction Counter */ - pic->counter = RK_PERF_INSTR; - pic->counter_bits = INSTR_COUNTER_BITS; - - freq = INSTR_SAM_DEF_FREQ; /* Default Frequency */ - - for (i = 0; i < nattrs; i++) { - if ((strcmp(attrs[i].ka_name, "freq") == 0)) { - if ((bits & SYN_BIT) == 0 && - attrs[i].ka_val) { - return (CPC_INVALID_ATTRIBUTE); - } - freq = attrs[i].ka_val; - } else if ((strcmp(attrs[i].ka_name, - "single") == 0) && attrs[i].ka_val) - pic->src_type |= - HV_RK_PERF_SRC_STRAND; - else if ((strcmp(attrs[i].ka_name, - "shared") == 0) && attrs[i].ka_val) - pic->src_type |= - HV_RK_PERF_SRC_STRAND_M; - else if ((strcmp(attrs[i].ka_name, - "hpriv") == 0) && attrs[i].ka_val) - *flagsp |= CPC_COUNT_HV; - else if ((strcmp(attrs[i].ka_name, - "source") == 0) && attrs[i].ka_val) - source = attrs[i].ka_val & - HV_RK_PERF_SRC_MASK; - else if ((strcmp(attrs[i].ka_name, - "nohws") == 0) && attrs[i].ka_val) { - if (bits & SYN_BIT) - pic->sampler.nohws = B_TRUE; - else if (attrs[i].ka_val) - return (CPC_INVALID_ATTRIBUTE); - } else if ((strcmp(attrs[i].ka_name, - "tl") == 0) && attrs[i].ka_val) { - if (bits & SYN_BIT) { - pic->sampler.tl = - (uint8_t)attrs[i].ka_val; - } else if (attrs[i].ka_val) - return (CPC_INVALID_ATTRIBUTE); - } else { - if (attrs[i].ka_val) - return (CPC_INVALID_ATTRIBUTE); - } - } - - if (source) { - if (source & (HV_RK_PERF_SRC_SIU | - HV_RK_PERF_SRC_MMU)) - return (CPC_INVALID_ATTRIBUTE); - pic->src_type = source; - } - - if (pic->src_type == 0) - pic->src_type = HV_RK_PERF_SRC_STRAND; - - /* - * hpriv, sys, user are sent as bits 3, 2, 1 from kcpc. - * They are maintained by PCBE as bits 2, 1, & 0. 
- */ - *flagsp >>= 1; - *flagsp &= PRIV_BITS_MASK; - if (bits & SYN_BIT) { - pic->sampler.flags = *flagsp; - pic->sampler.syn_counter = bits; - if (freq > INSTR_SAM_MAX_FREQ) { - cmn_err(CE_NOTE, "CPU-%d: freq set " - "> MAX. Resetting to %d", - CPU->cpu_id, INSTR_SAM_MAX_FREQ); - freq = INSTR_SAM_MAX_FREQ; - } - if (freq < INSTR_SAM_MIN_FREQ) { - cmn_err(CE_NOTE, "CPU-%d: freq set " - "< MIN. Resetting to %d", - CPU->cpu_id, INSTR_SAM_MIN_FREQ); - freq = INSTR_SAM_MIN_FREQ; - } - pic->sampler.frequency = freq; - } - /* - * When programming counter priv bits should be - * 0, 1, & 2, i.e., in reverse order. Therefore swap - * bits 2 & 0. - */ - *flagsp = ((*flagsp & PRIV_BIT0_MASK) << 2) | - ((*flagsp & PRIV_BIT2_MASK) >> 2) | - (*flagsp & PRIV_BIT1_MASK); - break; - case 1: /* L2 counter */ - /* - * nouser and sys are also invalid attributes for L2 - * and MMU counters. If user has not specified any - * attributes then *flagsp contains CPC_COUNT_USER. - * Any priv attrs are not applicable for L2 counters. - */ - if (*flagsp != CPC_COUNT_USER) - return (CPC_INVALID_ATTRIBUTE); - - pic->counter_bits = L2_COUNTER_BITS; - if ((bits & SYN_BIT) == 0) { - /* - * Normal counter: - * Find the attibutes for L2 Counter. 
- */ - for (i = 0; i < nattrs; i++) { - if ((strcmp(attrs[i].ka_name, - "single") == 0) && attrs[i].ka_val) - pic->src_type |= - HV_RK_PERF_SRC_STRAND; - else if ((strcmp(attrs[i].ka_name, - "shared") == 0) && attrs[i].ka_val) - pic->src_type |= - HV_RK_PERF_SRC_STRAND_M; - else if ((strcmp(attrs[i].ka_name, - "siu") == 0) && attrs[i].ka_val) - pic->src_type |= - HV_RK_PERF_SRC_SIU; - else if ((strcmp(attrs[i].ka_name, - "mmu") == 0) && attrs[i].ka_val) - pic->src_type |= - HV_RK_PERF_SRC_MMU; - else if ((strcmp(attrs[i].ka_name, - "source") == 0) && attrs[i].ka_val) - source = attrs[i].ka_val & - HV_RK_PERF_SRC_MASK; - else if (attrs[i].ka_val) - return (CPC_INVALID_ATTRIBUTE); - } - if (source) - pic->src_type = source; - - if (pic->src_type == 0) - pic->src_type = HV_RK_PERF_SRC_STRAND; - - /* At least one hot Xn flag for L2 counters */ - *flagsp = bits; - } else { - /* - * Synthetic Counter - */ - pic->sampler.syn_counter = bits; - freq = L2_SAM_DEF_FREQ; /* Default Frequency */ - /* - * Find the attibutes for L2 Sampler. 
- */ - for (i = 0; i < nattrs; i++) { - if ((strcmp(attrs[i].ka_name, - "freq") == 0) && attrs[i].ka_val) - freq = attrs[i].ka_val; - else if ((strcmp(attrs[i].ka_name, - "single") == 0) && attrs[i].ka_val) - pic->src_type |= - HV_RK_PERF_SRC_STRAND; - else if ((strcmp(attrs[i].ka_name, - "shared") == 0) && attrs[i].ka_val) - pic->src_type |= - HV_RK_PERF_SRC_STRAND_M; - else if ((strcmp(attrs[i].ka_name, - "siu") == 0) && attrs[i].ka_val) - pic->src_type |= - HV_RK_PERF_SRC_SIU; - else if ((strcmp(attrs[i].ka_name, - "mmu") == 0) && attrs[i].ka_val) - pic->src_type |= - HV_RK_PERF_SRC_MMU; - else if ((strcmp(attrs[i].ka_name, - "source") == 0) && attrs[i].ka_val) - source = attrs[i].ka_val & - HV_RK_PERF_SRC_MASK; - else if (attrs[i].ka_val) - return (CPC_INVALID_ATTRIBUTE); - } - if (source) - pic->src_type = source; - - if (pic->src_type == 0) - pic->src_type = HV_RK_PERF_SRC_STRAND; - - /* Range check to avoid DOS */ - if (freq > L2_SAM_MAX_FREQ) { - cmn_err(CE_NOTE, "CPU-%d: freq set " - "> MAX. Resetting to %d", - CPU->cpu_id, L2_SAM_MAX_FREQ); - freq = L2_SAM_MAX_FREQ; - } - if (freq < L2_SAM_MIN_FREQ) { - cmn_err(CE_NOTE, "CPU-%d: freq set " - "< MIN. 
Resetting to %d", - CPU->cpu_id, L2_SAM_MIN_FREQ); - freq = L2_SAM_MIN_FREQ; - } - pic->sampler.frequency = freq; - *flagsp = 0; - } - pic->counter = RK_PERF_L2; - break; - case 2: /* MMU Counter */ - if (*flagsp != CPC_COUNT_USER) - return (CPC_INVALID_ATTRIBUTE); - - *flagsp = bits; - pic->counter_bits = MMU_COUNTER_BITS; - - for (i = 0; i < nattrs; i++) { - if ((strcmp(attrs[i].ka_name, "single") == 0) && - attrs[i].ka_val) - pic->src_type |= HV_RK_PERF_SRC_STRAND; - else if - ((strcmp(attrs[i].ka_name, "shared") == - 0) && attrs[i].ka_val) - pic->src_type |= - HV_RK_PERF_SRC_STRAND_M; - else if ((strcmp(attrs[i].ka_name, - "source") == 0) && attrs[i].ka_val) - source = attrs[i].ka_val & - HV_RK_PERF_SRC_MASK; - else if (attrs[i].ka_val) - return (CPC_INVALID_ATTRIBUTE); - } - if (source) { - if (source & (HV_RK_PERF_SRC_SIU | - HV_RK_PERF_SRC_MMU)) - return (CPC_INVALID_ATTRIBUTE); - pic->src_type = source; - } - - - if (pic->src_type == 0) - pic->src_type = HV_RK_PERF_SRC_STRAND; - - pic->counter = RK_PERF_MMU; - break; - case 3: /* YANK Counter */ - pic->counter = RK_PERF_YANK; - pic->counter_bits = YANK_COUNTER_BITS; - /* FALLTHROUGH */ - case 4: /* SIBLK Counter */ - if (pic->counter == 0) { - pic->counter = RK_PERF_SIBLK; - pic->counter_bits = SIBLK_COUNTER_BITS; - } - /* FALLTHROUGH */ - case 5: /* LVLK Counter */ - if (pic->counter == 0) { - pic->counter = RK_PERF_LVLK; - pic->counter_bits = LVLK_COUNTER_BITS; - } - - if (*flagsp != CPC_COUNT_USER) - return (CPC_INVALID_ATTRIBUTE); - - for (i = 0; i < nattrs; i++) { - if ((strcmp(attrs[i].ka_name, "single") == - 0) && attrs[i].ka_val) - pic->src_type |= HV_RK_PERF_SRC_STRAND; - else if - ((strcmp(attrs[i].ka_name, "shared") == - 0) && attrs[i].ka_val) - pic->src_type |= - HV_RK_PERF_SRC_STRAND_M; - else if ((strcmp(attrs[i].ka_name, - "source") == 0) && attrs[i].ka_val) - source = attrs[i].ka_val & - HV_RK_PERF_SRC_MASK; - else if (attrs[i].ka_val) - return (CPC_INVALID_ATTRIBUTE); - } - if (source) { - 
if (source & (HV_RK_PERF_SRC_SIU | - HV_RK_PERF_SRC_MMU)) - return (CPC_INVALID_ATTRIBUTE); - pic->src_type = source; - } - - - if (pic->src_type == 0) - pic->src_type = HV_RK_PERF_SRC_STRAND; - - *flagsp = 0; - pic->sampler.frequency = 0; - pic->sampler.syn_counter = bits; - break; - } - - if ((int64_t)pic->pcbe_pic > COUNTER_MAX(pic) || - (int64_t)pic->pcbe_pic < COUNTER_MIN(pic)) - return (CPC_ATTRIBUTE_OUT_OF_RANGE); - - pic->pcbe_pic &= COUNTER_MASK(pic); - -#ifdef RKPCBE_DBG - set_pic_name(pic); -#endif - return (0); -} - -/*ARGSUSED7*/ -static int -rk_pcbe_configure(uint_t picnum, char *event, uint64_t preset, uint32_t flags, - uint_t nattrs, kcpc_attr_t *attrs, void **data, void *token) -{ - rk_pcbe_config_t *pic; - const struct nametable *n; - int rc; - - /* Is API version for Rock pcbe hypervisor services negotiated? */ - if (rock_pcbe_hsvc_available == B_FALSE) - return (CPC_RESOURCE_UNAVAIL); - - /* - * If we've been handed an existing configuration, we need only preset - * the counter value. - */ - if (*data != NULL) { - pic = *data; - if ((int64_t)preset > COUNTER_MAX(pic) || - (int64_t)preset < COUNTER_MIN(pic)) - return (CPC_ATTRIBUTE_OUT_OF_RANGE); - pic->pcbe_pic = preset & COUNTER_MASK(pic); - return (0); - } - - if (picnum < (uint_t)0 || picnum > NUM_PCBE_COUNTERS) - return (CPC_INVALID_PICNUM); - - /* - * Find other requests that will be programmed with this one, and ensure - * they don't conflict. - * Any other counter in this pic group is active? 
- */ - if (active_pics[picnum][CPU->cpu_id] != NULL) - return (CPC_CONFLICTING_REQS); - - if ((n = find_event(picnum, event)) == NULL) - return (CPC_INVALID_EVENT); - - /* Check for supported attributes and populate pic */ - pic = kmem_zalloc(sizeof (rk_pcbe_config_t), KM_SLEEP); - pic->flags = flags; - pic->pcbe_pic = preset; - - if (rc = populate_pic_config(picnum, nattrs, attrs, n->bits, pic)) { - kmem_free(pic, sizeof (rk_pcbe_config_t)); - return (rc); - } - - /* - * num_ringbuf_entries should be always even. Since this - * /etc/system tunable, need to check for this. - */ - if (num_ringbuf_entries & 1) { - num_ringbuf_entries++; - cmn_err(CE_WARN, "num_ringbuf_entries should be even." - " Changing %u to %u\n", num_ringbuf_entries - 1, - num_ringbuf_entries); - } - if (num_ringbuf_entries < MIN_RINGBUF_ENTRIES) { - cmn_err(CE_WARN, "num_ringbuf_entries should be at least " - "%u. Changing %u to %u\n", MIN_RINGBUF_ENTRIES, - num_ringbuf_entries, MIN_RINGBUF_ENTRIES); - num_ringbuf_entries = MIN_RINGBUF_ENTRIES; - } - - pic->state = STATE_CONFIGURED; - pic->cpu = CPU->cpu_id; - active_pics[picnum][pic->cpu] = pic; - *data = pic; - - if (pic->counter_type == NORMAL_COUNTER) - PRINT_PIC(pic, "After Configuration (N)"); - return (0); -} - -static void -rk_pcbe_program(void *token) -{ - rk_pcbe_config_t *pic = NULL; - int rc; - uint64_t counter; - - while ((pic = (rk_pcbe_config_t *)kcpc_next_config(token, pic, NULL)) - != NULL) { - - if (pic->inuse == B_FALSE) - continue; - - counter = (uint64_t)(pic->counter | pic->src_type); - rc = (int)hv_rk_perf_count_init(counter); - - if (curthread->t_cpc_ctx) { - /* - * If in thread context, pic should get an exclusive - * lock. If it cannot then invalidate the pic. 
- */ - if (rc != H_EOK) { - kcpc_invalidate_config(token); - continue; - } - } else { - /* Must be cpu context */ - ASSERT(CPU->cpu_cpc_ctx); - if (rc == H_EWOULDBLOCK && - (pic->src_type & HV_RK_PERF_SRC_STRAND_M)) { - /* pic in use by a cpu of current guest */ - pic->inuse = B_FALSE; - continue; - } else if (rc != H_EOK) { - /* - * Either the counter is in use by a different - * guest or another cpu in the current guest is - * already using it in single source mode. In - * either case, invalidate the pic. - */ - kcpc_invalidate_config(token); - continue; - } - } - - /* - * rc = H_EOK, hence current cpu was successful in - * obtaining exclusive access to the counter, Set this - * pic as active. - */ - if (CPU->cpu_id != pic->cpu) { - active_pics[pic->pcbe_picno][pic->cpu] = NULL; - pic->cpu = CPU->cpu_id; - active_pics[pic->pcbe_picno][pic->cpu] = pic; - } - pic->inuse = B_TRUE; - - if (pic->counter_type == NORMAL_COUNTER) - rc = rk_pcbe_program_normal(pic); - else - rc = rk_pcbe_program_synthetic(pic); - - pic->state = STATE_PROGRAMMED; - - if (rc != H_EOK) { - kcpc_invalidate_config(token); - continue; - } - } -} - -static void -rk_pcbe_allstop(void) -{ - int i; - rk_pcbe_config_t *pic; - uint64_t diff; - - for (i = 0; i < NUM_PCBE_COUNTERS; i++) { - pic = active_pics[i][CPU->cpu_id]; - - if (pic == NULL || pic->state != STATE_PROGRAMMED) - continue; - - ASSERT(pic->inuse == B_TRUE && CPU->cpu_id == pic->cpu); - - /* Stop all active pics */ - if (pic->counter_type == NORMAL_COUNTER) { - hv_rk_perf_count_stop((uint64_t)(pic->counter | - pic->src_type)); - DBG_PRINT(("CPU-%d: Counter %s(%X) stopped.\n", - CPU->cpu_id, pic->name, pic->counter)); - } else { - DBG_PRINT(("CPU-%d: Stopping counter %s(%lX)\n", - CPU->cpu_id, pic->name, - pic->sampler.synthetic_pic)); - rk_pcbe_stop_synthetic(pic); - } - - /* Mark pic as stopped */ - pic->state = STATE_STOPPED; - - /* - * If running in lwp context, kcpc ensures a cpu that - * executed pcbe_program will be the one that 
executes - * pcbe_allstop. However, pcbe_free may be executed on - * a different strand. HV puts a restriction that the - * strand that programmed the counter should be the one - * that releases it. Therefore, when counters are bound - * to thread context, counters are released everytime - * they are stopped. - */ - if (CPU->cpu_cpc_ctx == NULL) { - /* - * If counter is being released, cache the current - * sample since we cannot sample a counter that has - * been released. - */ - if (rk_pcbe_sample_internal(pic, &diff) == H_EOK) - pic->pcbe_pic = diff; - else - pic->pcbe_pic = 0; - rk_pcbe_release(pic); - } - } -} - -static void -rk_pcbe_sample(void *token) -{ - rk_pcbe_config_t *pic = NULL; - uint64_t *pic_data; - int rc; - uint64_t diff; - - while ((pic = (rk_pcbe_config_t *) - kcpc_next_config(token, pic, &pic_data)) != NULL) { - - if (pic->inuse != B_TRUE) { - continue; - } - - /* - * If counter is already released, then return the - * cached value - */ - if (pic->state == STATE_RELEASED) { - *pic_data += pic->pcbe_pic; - pic->pcbe_pic = 0; - continue; - } - - ASSERT(CPU->cpu_id == pic->cpu); - - rc = rk_pcbe_sample_internal(pic, &diff); - - if (pic->state == STATE_STOPPED) { - pic->pcbe_pic = 0; - rk_pcbe_release(pic); - } - - if (rc == H_EOK) { - *pic_data += diff; - } else { - kcpc_invalidate_config(token); - } - } -} - -static void -rk_pcbe_free(void *config) -{ - rk_pcbe_config_t *pic = (rk_pcbe_config_t *)config; - - /* Release counter */ - if (pic->inuse == B_TRUE) { - if (pic->state != STATE_RELEASED) { - rk_pcbe_release(pic); - } - if (pic->counter_type == SYNTHETIC_COUNTER) - rk_pcbe_free_synthetic(pic); - } - - /* Mark pic as inactive */ - active_pics[pic->pcbe_picno][pic->cpu] = NULL; - kmem_free(pic, sizeof (rk_pcbe_config_t)); -} - -static void -rk_pcbe_release(rk_pcbe_config_t *pic) -{ - int rc = 0; - - ASSERT(pic->inuse == B_TRUE && pic->state != STATE_RELEASED); - - DBG_PRINT(("CPU-%d: Releasing Pic %s (#%d, cntr %X) %p", - CPU->cpu_id, 
pic->name, pic->pcbe_picno, pic->counter, - (void *)pic)); - - rc = (int)hv_rk_perf_count_release((uint64_t) - (pic->counter | pic->src_type)); - if (rc != 0) { - cmn_err(CE_WARN, "CPU-%d: Releasing Pic-%d, counter: %X failed " - "%p. rc=%d", CPU->cpu_id, pic->pcbe_picno, pic->counter, - (void *)pic, rc); - } - if (pic->counter_type == SYNTHETIC_COUNTER && - !(pic->counter == RK_PERF_YANK || pic->counter == RK_PERF_SIBLK || - pic->counter == RK_PERF_LVLK)) { - rc = (int)hv_rk_perf_sample_release((uint64_t) - (pic->counter | pic->src_type)); - if (rc != 0) { - cmn_err(CE_WARN, "CPU-%d: Releasing Pic-%d, sampler: %X" - " failed %p. rc=%d", CPU->cpu_id, pic->pcbe_picno, - pic->counter, (void *)pic, rc); - return; - } - } - pic->state = STATE_RELEASED; -} - -static int -rk_pcbe_program_normal(rk_pcbe_config_t *pic) -{ - uint64_t counter; - uint64_t config_value; - uint64_t rc = H_EOK; - - ASSERT(pic->inuse == B_TRUE); - - counter = (uint64_t)(pic->counter | pic->src_type); - - /* Preset the counter value if non zero */ - if (pic->pcbe_pic > 0) { - DBG_PRINT(("CPU-%d: Counter getting preset to %lu (%lX)\n", - CPU->cpu_id, pic->pcbe_pic, pic->pcbe_pic)); - rc = (int)hv_rk_perf_count_set(counter, pic->pcbe_pic); - } - - if (rc != H_EOK) { - cmn_err(CE_WARN, "{%d} Pic %d cntr %X not set", - CPU->cpu_id, pic->pcbe_picno, pic->counter); - PRINT_PIC(pic, "Set counter failed"); - return ((int)rc); - } - - /* Configure and start counter */ - config_value = ((uint64_t)pic->toe << RK_PERF_COUNT_TOE_SHIFT) - | pic->flags; - rc = (int)hv_rk_perf_count_start(counter, config_value); - - if (rc != H_EOK) { - cmn_err(CE_WARN, "{%d} Pic %d cntr %X not configured", - CPU->cpu_id, pic->pcbe_picno, pic->counter); - PRINT_PIC(pic, "Configure counter failed"); - } - return ((int)rc); -} - -static int -rk_pcbe_program_synthetic(rk_pcbe_config_t *pic) -{ - int rc; - ASSERT(pic->inuse == B_TRUE); - switch (pic->counter) { - case RK_PERF_INSTR: - rc = program_instr_sampler(pic); - break; - case 
RK_PERF_L2: - rc = program_l2_sampler(pic); - break; - case RK_PERF_YANK: - /* FALLTHROUGH */ - case RK_PERF_SIBLK: - /* FALLTHROUGH */ - case RK_PERF_LVLK: - rc = rk_pcbe_program_normal(pic); - break; - default: - PRINT_PIC(pic, "rk_pcbe_program_synthetic"); - ASSERT(0); - rc = H_EINVAL; - break; - } - return (rc); -} - -static void -rk_pcbe_free_synthetic(rk_pcbe_config_t *pic) -{ - ASSERT(pic->inuse == B_TRUE); - switch (pic->counter) { - case RK_PERF_INSTR: - /* FALLTHROUGH */ - case RK_PERF_L2: - free_ringbuffer(pic); - break; - case RK_PERF_YANK: - /* FALLTHROUGH */ - case RK_PERF_SIBLK: - /* FALLTHROUGH */ - case RK_PERF_LVLK: - /* Do nothing */ - break; - default: - PRINT_PIC(pic, "rk_pcbe_free_synthetic"); - ASSERT(0); - break; - } -} - -static int -rk_pcbe_sample_internal(rk_pcbe_config_t *pic, uint64_t *data) -{ - uint64_t counter_value; - int rc; - int64_t diff; - - if (pic->counter_type == NORMAL_COUNTER) { - rc = (int)hv_rk_perf_count_get((uint64_t)(pic->counter | - pic->src_type), &counter_value); - if (rc == H_EOK) { - counter_value &= COUNTER_MASK(pic); - diff = counter_value - pic->pcbe_pic; - pic->pcbe_pic = counter_value; - /* - * When counter overflows the overflow handler - * (rk_pcbe_overflow_bitmap) would have added - * MAX count value to pic->pcbe_pic. Therefore - * -ve implies that the counter has overflowed. 
- * The actual count amounts to, - * (counter_value - (pic->pcbe_pic - MAX)) + MAX - * => counter_value - pic->pcbe_pic + (2 * MAX) - * => diff + (2 * MAX) - */ - if (diff < 0) { - diff += - (0x1ULL << (pic->counter_bits + 1)); - } - } - } else { - /* - * Difference returned by synthetic counters will - * be always +ve - */ - rc = rk_pcbe_sample_synthetic(pic, &diff); - } - - if (rc == H_EOK) - *data = (uint64_t)diff; - - return ((int)rc); -} - -/* All sample_synthetic code may be executed at TL=1 */ -static int -rk_pcbe_sample_synthetic(rk_pcbe_config_t *pic, int64_t *diffp) -{ - int rc; - ASSERT(pic->inuse == B_TRUE); - switch (pic->counter) { - case RK_PERF_INSTR: - rc = sample_instr_sampler(pic, diffp); - break; - case RK_PERF_L2: - rc = sample_l2_sampler(pic, diffp); - break; - case RK_PERF_YANK: - /* FALLTHROUGH */ - case RK_PERF_SIBLK: - /* FALLTHROUGH */ - case RK_PERF_LVLK: - rc = sample_mccdesr(pic, diffp); - break; - default: - PRINT_PIC(pic, "rk_pcbe_sample_synthetic"); - ASSERT(0); - break; - } - return (rc); -} - -static void -rk_pcbe_stop_synthetic(rk_pcbe_config_t *pic) -{ - uint64_t counter = (uint64_t)(pic->counter | pic->src_type); - - ASSERT(pic->inuse == B_TRUE); - switch (pic->counter) { - case RK_PERF_INSTR: - /* FALLTHROUGH */ - case RK_PERF_L2: - hv_rk_perf_count_stop(counter); - hv_rk_perf_sample_stop(counter); - break; - case RK_PERF_YANK: - /* FALLTHROUGH */ - case RK_PERF_SIBLK: - /* FALLTHROUGH */ - case RK_PERF_LVLK: - hv_rk_perf_count_stop(counter); - break; - default: - PRINT_PIC(pic, "rk_pcbe_stop_synthetic"); - ASSERT(0); - break; - } -} - -static int -program_l2_sampler(rk_pcbe_config_t *pic) -{ -#define ASI_PERF_L2_TXN_INFO 0xF10010 -#define ASI_PERF_L2_EA_MASK 0xF10018 -#define ASI_PERF_L2_EA_MATCH 0xF10020 -#define ASI_PERF_L2_TXN_INFO_FILTER 0xF10030 -#define ASI_PERF_L2_CC 0xF10038 -#define TXN_ICACHE_LOAD 0x1 -#define TXN_DCACHE_LOAD 0x2 -#define TXN_INSTR_PREFETCH 0x4 -#define TXN_STORE_PREFETCH 0x8 -#define 
TXN_DCACHE_STORE 0x10 -#define TXN_ATOMIC_LOAD_STORE 0x20 -#define TXN_FLUSH 0x40 -#define L2_ALL_TXNS (TXN_ICACHE_LOAD | TXN_DCACHE_LOAD | \ - TXN_INSTR_PREFETCH | TXN_STORE_PREFETCH | \ - TXN_DCACHE_STORE | TXN_ATOMIC_LOAD_STORE | TXN_FLUSH) -#define L2_TXN_SHIFT 3 -#define L2_ALL_EVT 0x3 -#define L2_ALL_EVT_SHIFT 10 -#define L2_TXN_INFO_FILTER_MASK (L2_ALL_EVT << L2_ALL_EVT_SHIFT) | \ - (L2_ALL_TXNS << L2_TXN_SHIFT) - - program_sampler_data_t sdata; - int i = 0; - - (void) strcpy(sdata.name, "program_l2_sampler"); - pic->flags = L2_ALL_TXNS; /* For L2 counter */ - - /* - * If (((Reported EA ^ MATCH) & MASK) == 0) then sample is taken - */ - sdata.asi_config[i].va = ASI_PERF_L2_EA_MASK; - sdata.asi_config[i].value = 0; - i++; - - sdata.asi_config[i].va = ASI_PERF_L2_EA_MATCH; - sdata.asi_config[i].value = 0; - i++; - - sdata.asi_config[i].va = ASI_PERF_L2_CC; - sdata.asi_config[i].value = pic->sampler.frequency; - i++; - - sdata.asi_config[i].va = ASI_PERF_L2_TXN_INFO_FILTER; - sdata.asi_config[i].value = L2_TXN_INFO_FILTER_MASK; - - sdata.asi_config_num = i + 1; - - sdata.asi_sample[0] = ASI_PERF_L2_TXN_INFO; - sdata.asi_sample_num = 1; - - return (program_a_sampler(pic, &sdata)); -} - -static int -sample_l2_sampler(rk_pcbe_config_t *pic, int64_t *diffp) -{ -#define DS_SHIFT 34 -#define EVT_SHIFT 22 -#define TXN_SHIFT 7 -#define DS_MASK MAKE_MASK(2, 0) -#define EVT_MASK MAKE_MASK(4, 0) -#define TXN_MASK MAKE_MASK(7, 0) - - rk_pcbe_ringbuf_t *ringbuf = pic->sampler.ring_buffer; - uint32_t value, target; - uint64_t *head, *tail; - uint32_t sample_count = 0, sample_hit_count = 0; - uint32_t size = pic->sampler.sample_size; - uint8_t ds, evt; - int ret; - - head = RINGBUF_GET_HEAD(ringbuf); - tail = RINGBUF_GET_TAIL(ringbuf); - - if (head == tail) { - DBG_PRINT(("CPU-%d: HEAD eq TAIL to start with\n", - CPU->cpu_id)); - } - - /* Consume samples */ - while (head != tail) { - uint64_t rawvalue = *head; - DBG_PRINT(("CPU-%d: rawvalue=0x%lX\n", CPU->cpu_id, rawvalue)); 
- target = TYPE(pic->sampler.syn_counter); - - switch (GROUP(pic->sampler.syn_counter)) { - case L2_GROUP_DS: - value = (rawvalue >> DS_SHIFT) & DS_MASK; - DBG_PRINT(("CPU-%d: value=0x%X, target=0x%X\n", - CPU->cpu_id, value, target)); - switch (target) { - case DS_DRAM: /* FALLTHROUGH */ - case DS_L3: /* FALLTHROUGH */ - case DS_OTHER_L2: /* FALLTHROUGH */ - if (value == target) - sample_hit_count++; - break; - } - break; - case L2_GROUP_TXN_MISS: - value = (rawvalue >> TXN_SHIFT) & TXN_MASK; - ds = (uint8_t)((rawvalue >> DS_SHIFT) & DS_MASK); - evt = (uint8_t)((rawvalue >> EVT_SHIFT) & EVT_MASK); - DBG_PRINT(("CPU-%d: value=0x%X, target=0x%X, " - " ds: 0x%X, evt: 0x%X\n", CPU->cpu_id, value, - target, ds, evt)); - if (((value & target) != 0) && (evt == EVT_L2_MISS || - evt == EVT_L2_PRIOR_MISS) && (ds != DS_LOCAL_L2)) - sample_hit_count++; - break; - case L2_GROUP_TXN_HIT: - value = (rawvalue >> TXN_SHIFT) & TXN_MASK; - ds = (uint8_t)((rawvalue >> DS_SHIFT) & DS_MASK); - evt = (uint8_t)((rawvalue >> EVT_SHIFT) & EVT_MASK); - DBG_PRINT(("CPU-%d: value=0x%X, target=0x%X, " - " ds: 0x%X, evt: 0x%X\n", CPU->cpu_id, value, - target, ds, evt)); - if (((value & target) != 0) && (evt == EVT_L2_PEND_ST || - evt == EVT_L2_NOEVENTS) && (ds == DS_LOCAL_L2)) - sample_hit_count++; - break; - case L2_GROUP_EVT: - evt = (rawvalue >> EVT_SHIFT) & EVT_MASK; - ds = (uint8_t)((rawvalue >> DS_SHIFT) & DS_MASK); - DBG_PRINT(("CPU-%d: evt=0x%X, target=0x%X, " - "ds: 0x%X\n", CPU->cpu_id, evt, target, ds)); - - switch (target) { - case L2_HIT: - if ((evt == EVT_L2_NOEVENTS || evt == - EVT_L2_PEND_ST) && ds == DS_LOCAL_L2) - sample_hit_count++; - break; - case L2_MISS: - if ((evt == EVT_L2_MISS || evt == - EVT_L2_PRIOR_MISS) && ds == DS_LOCAL_L2) - sample_hit_count++; - break; - } - } - sample_count++; - RINGBUF_MOVE_HEAD(ringbuf, head, size); - } - RINGBUF_SET_HEAD(ringbuf, head); - - ret = synthesize_sample_count(pic, sample_count, sample_hit_count, - "sample_l2_sampler", diffp); - - 
return (ret); -} - -static int -program_instr_sampler(rk_pcbe_config_t *pic) -{ -#define ASI_PERF_IS_PC_MASK 0x10 -#define ASI_PERF_IS_PC_MATCH 0x18 -#define ASI_PERF_IS_CC_LATENCY_MASK 0x160 -#define ASI_PERF_IS_CONTEXT_FILTER 0x168 -#define ASI_PERF_IS_INFO_MASK 0x170 -#define ASI_PERF_IS_INFO_MATCH 0x178 - -#define ASI_PERF_IS_CONTEXT 0x108 -#define ASI_PERF_IS_INFO 0x148 - -#define IS_BHR_LATENCY_CLAT_MASK 0xFFF -#define IS_CC_FILTER_TGTF_MASK 0x10 -#define IS_CC_FILTER_TOF_MASK 0x8 -#define IS_CC_LATENCY_FREQ_SHIFT 22 - - - program_sampler_data_t sdata; - int i = 0; - - (void) strcpy(sdata.name, "program_instr_sampler"); - /* - * If (((Reported Value ^ MATCH) & MASK) == 0) then sample is taken; - */ - sdata.asi_config[i].va = ASI_PERF_IS_PC_MASK; - sdata.asi_config[i].value = 0; - i++; - - sdata.asi_config[i].va = ASI_PERF_IS_PC_MATCH; - sdata.asi_config[i].value = 0; - i++; - - /* - * Set CLAT_MASK to 0xFFF, meaning, drop instruction samples - * whose latency is zero, means, sample all of them, because - * all instructions has at least a latency of 1 cycle. - */ - sdata.asi_config[i].va = ASI_PERF_IS_CONTEXT_FILTER; - sdata.asi_config[i].value = (uint64_t)(IS_CC_FILTER_TGTF_MASK | - IS_CC_FILTER_TOF_MASK | pic->sampler.flags); - i++; - - /* - * Even though frequency is set when started, it has to be - * specified here, because, if left zero, then a PET is - * immediately generated since the candidate counter is zero. 
- */ - sdata.asi_config[i].va = ASI_PERF_IS_CC_LATENCY_MASK; - sdata.asi_config[i].value = ((((uint64_t)pic->sampler.frequency) << - IS_CC_LATENCY_FREQ_SHIFT) | IS_BHR_LATENCY_CLAT_MASK); - i++; - - sdata.asi_config[i].va = ASI_PERF_IS_INFO_MASK; - sdata.asi_config[i].value = 0; - i++; - - sdata.asi_config[i].va = ASI_PERF_IS_INFO_MATCH; - sdata.asi_config[i].value = 0; - - sdata.asi_config_num = i + 1; - - sdata.asi_sample[0] = ASI_PERF_IS_INFO; - sdata.asi_sample[1] = ASI_PERF_IS_CONTEXT; - sdata.asi_sample_num = 2; - - return (program_a_sampler(pic, &sdata)); -} - -static int -sample_instr_sampler(rk_pcbe_config_t *pic, int64_t *diffp) -{ -#define I_MODE_SHIFT 34 -#define I_TYPE_SHIFT 0 -#define I_EVT_SHIFT 7 -#define I_MODE_MASK MAKE_MASK(3, 0) -#define I_TYPE_MASK MAKE_MASK(7, 0) -#define I_EVT_MASK MAKE_MASK(12, 0) - - rk_pcbe_ringbuf_t *ringbuf = pic->sampler.ring_buffer; - uint32_t size = pic->sampler.sample_size; - uint32_t value, target, shift, mask; - uint32_t sample_count = 0, sample_hit_count = 0; - uint64_t *head, *tail; - int ret; - - switch (GROUP(pic->sampler.syn_counter)) { - case I_GROUP_MODE: - mask = I_MODE_MASK; - shift = I_MODE_SHIFT; - break; - case I_GROUP_TYPE: - mask = I_TYPE_MASK; - shift = I_TYPE_SHIFT; - break; - case I_GROUP_EVT: - mask = I_EVT_MASK; - shift = I_EVT_SHIFT; - break; - default: - PRINT_PIC(pic, "No I_GROUP found"); - ASSERT(0); - break; - } - - head = RINGBUF_GET_HEAD(ringbuf); - tail = RINGBUF_GET_TAIL(ringbuf); - - if (head == tail) { - DBG_PRINT(("CPU-%d: HEAD eq TAIL to start with\n", - CPU->cpu_id)); - } - - /* Consume samples */ - while (head != tail) { - /* - * Data returned will be in the same order as the asi_list - * passed to hypervisor during hv_rk_perf_sample_start call. 
- */ - uint64_t rawvalue = *head; - uint64_t context = *(head + 1); - uint8_t tl = (uint8_t)((context >> 2) & 7); - int drop_sample = B_FALSE; - - if (rawvalue != 0) { - value = (rawvalue >> shift) & mask; - target = TYPE(pic->sampler.syn_counter); - DBG_PRINT(("CPU-%d: rawvalue=0x%lX, value=0x%X," - "target=0x%X\n", CPU->cpu_id, rawvalue, value, - target)); - - /* - * Several EVT fields are only valid for certain - * instruction types. Need to check TYP field - * before trusting what's in EVT. - */ - if (GROUP(pic->sampler.syn_counter) == I_GROUP_EVT) { - uint64_t type = rawvalue >> I_TYPE_SHIFT; - - switch (target) { - case EVT_DC_MISS: - case EVT_PRIOR_MISS: - case EVT_LDB_FULL: - case EVT_BYPASS_RAW: - case EVT_NONBYPASS_RAW: - if ((type & TYPE_LD) == 0) - drop_sample = B_TRUE; - break; - case EVT_STB_FULL: - if ((type & TYPE_ST) == 0) - drop_sample = B_TRUE; - break; - case EVT_DTLB_MISS: - if ((type & (TYPE_LD|TYPE_ST)) == 0) - drop_sample = B_TRUE; - break; - case EVT_CORRECT_BP: - case EVT_CTI_TAKEN: - if ((type & TYPE_CTI) == 0) - drop_sample = B_TRUE; - break; - } - DBG_PRINT(("CPU-%d: rawvalue=%lX, cleaned value" - "=%X, target=%X\n", CPU->cpu_id, rawvalue, - value, target)); - } - - /* - * If user does not want to count instructions in scout - * mode, and if the instruction sampled was in scout - * mode, drop the sample. - */ - if (pic->sampler.nohws == B_TRUE) { - uint64_t mode = (rawvalue >> I_MODE_SHIFT) & - I_MODE_MASK; - if (mode == MODE_HWS) - drop_sample = B_TRUE; - } - - /* - * If user wants to count instructions at a particular - * trap level (0 or >0), and the samples are in - * different trap level, drop the sample. 
- */ - switch (pic->sampler.tl) { - case TLZ: /* Sample ONLY instr at TL == 0 */ - if (tl != 0) - drop_sample = B_TRUE; - break; - case TLNZ: /* Sample ONLY instr at TL > 0 */ - if (tl == 0) - drop_sample = B_TRUE; - break; - } - - switch (GROUP(pic->sampler.syn_counter)) { - case I_GROUP_MODE: - /* Fields that are integers */ - if (value == target && drop_sample == B_FALSE) - sample_hit_count++; - break; - case I_GROUP_EVT: - case I_GROUP_TYPE: - /* Fields that are bit vectors */ - if (value & target && drop_sample == B_FALSE) - sample_hit_count++; - break; - default: - ASSERT(0); /* missing case statement */ - } - } - sample_count++; - RINGBUF_MOVE_HEAD(ringbuf, head, size); - } - RINGBUF_SET_HEAD(ringbuf, head); - - ret = synthesize_sample_count(pic, sample_count, sample_hit_count, - "sample_instr_sampler", diffp); - - return (ret); -} - -/* - * mccdesr counters are synthetic counters. Hypervisor maintains - * a 64 bit memory based counter. Therefore we can assume that - * this counter never overflows. 
- */ -static int -sample_mccdesr(rk_pcbe_config_t *pic, int64_t *diffp) -{ - uint64_t rc = 0; - uint64_t counter_value; - rc = hv_rk_perf_count_get((uint64_t)(pic->counter | - pic->src_type), &counter_value); - if (rc == H_EOK) { - counter_value &= COUNTER_MASK(pic); - *diffp = counter_value - pic->pcbe_pic; - pic->pcbe_pic = counter_value; - if (*diffp < 0) { - cmn_err(CE_WARN, "CPU-%d: Pic-%d, counter: %X overflow", - CPU->cpu_id, pic->pcbe_picno, pic->counter); - } - } else { - cmn_err(CE_WARN, "CPU-%d: Failed to sample pic-%d, counter-%X", - CPU->cpu_id, pic->pcbe_picno, pic->counter); - } - return ((int)rc); -} - -static int -program_a_sampler(rk_pcbe_config_t *pic, program_sampler_data_t *sdata) -{ - uint64_t ringbuf_pa, asi_list_pa, counter, rc; - int hv_call_cnt = 1, ret = 0, need_init = 0, i; - uint64_t temp_pcbe_pic = 0; - - counter = (uint64_t)(pic->counter | pic->src_type); - - if (pic->sampler.ring_buffer == NULL) { - pic->sampler.sample_size = sdata->asi_sample_num * - sizeof (uint64_t); - rc = alloc_ringbuffer(pic, pic->sampler.sample_size, - num_ringbuf_entries); - if (rc != 0) - return ((int)rc); - need_init = 1; - PRINT_PIC(pic, "After Configuration (S)"); - } - - if (need_init || pic->state == STATE_RELEASED) { - ringbuf_pa = va_to_pa(pic->sampler.ring_buffer); - rc = hv_rk_perf_sample_init(counter, ringbuf_pa); - print_hv_error(rc, &hv_call_cnt, sdata->name, pic); - if (rc != H_EOK) - return ((int)rc); - } - - /* - * If (((Reported Value ^ MATCH) & MASK) == 0) then sample is taken; - */ - for (i = 0; i < sdata->asi_config_num; i++) { - rc = hv_rk_perf_sample_config(counter, sdata->asi_config[i].va, - sdata->asi_config[i].value); - ret |= (int)rc; - print_hv_error(rc, &hv_call_cnt, sdata->name, pic); - } - - /* - * pic->pcbe_pic is used to hold preset value in case of synthetic - * counters - */ - if (pic->pcbe_pic > 0) { - temp_pcbe_pic = pic->pcbe_pic; - pic->pcbe_pic = 0; - } - ret |= rk_pcbe_program_normal(pic); /* Reset to zero & start 
counting */ - pic->pcbe_pic = temp_pcbe_pic; - - /* - * Start sampling - * - * Data returned in the ringbuffer by the hypervisor will be in the - * same order as it is programmed - */ - asi_list_pa = va_to_pa(sdata->asi_sample); - rc = hv_rk_perf_sample_start(counter, pic->sampler.frequency, - sdata->asi_sample_num * sizeof (uint64_t), asi_list_pa); - ret |= (int)rc; - print_hv_error(rc, &hv_call_cnt, sdata->name, pic); - return (ret); -} - -static int -synthesize_sample_count(rk_pcbe_config_t *pic, uint64_t sample_count, - uint64_t sample_hit_count, char *name, int64_t *diffp) -{ - - uint64_t total_count, rc, ovf_count, hit_count = 0; - int hv_call_cnt = 1, ret = 0; - /* - * Since ring buffer is consumed, clear pending sample count. - * Sample count is discarded, therefore reusing a variable. - */ - rc = hv_rk_perf_sample_pending((uint64_t)(pic->counter | - pic->src_type), &total_count); - ret |= (int)rc; - print_hv_error(rc, &hv_call_cnt, name, pic); - - /* Check if the counter overflowed */ - rc = hv_rk_perf_count_overflow((uint64_t)(pic->counter | - pic->src_type), &ovf_count); - ret |= (int)rc; - print_hv_error(rc, &hv_call_cnt, name, pic); - - if (rc != H_EOK) - ovf_count = 0; - - rc = hv_rk_perf_count_get((uint64_t)(pic->counter | - pic->src_type), &total_count); - ret |= (int)rc; - print_hv_error(rc, &hv_call_cnt, name, pic); - - if (rc != H_EOK) - total_count = 0; - - total_count &= COUNTER_MASK(pic); - - /* - * Reset it to zero so that we need not maintain old value - */ - rc = hv_rk_perf_count_set((uint64_t)(pic->counter | pic->src_type), 0); - ret |= (int)rc; - print_hv_error(rc, &hv_call_cnt, name, pic); - - /* - * ovf_count > 0 means, counter has hit max, ovf_count times - * before counting total_count of instructions. Therefore - * add total_count to ovf_count times max count value. 
- */ - if (ovf_count) - total_count += (ovf_count * (0x1ULL << pic->counter_bits)); - - if (sample_count > 0) - hit_count = (sample_hit_count * total_count) / sample_count; - - *diffp = (int64_t)hit_count; - DBG_PRINT(("CPU-%d: sample_instr_load. hit_count: %lu, *diffp: %ld\n", - CPU->cpu_id, hit_count, *diffp)); - if (*diffp < 0) { - cmn_err(CE_WARN, "CPU-%d Negative instr count. hit_count: %lu, " - "*diffp: %ld\n", CPU->cpu_id, hit_count, *diffp); - } - - if (pic->pcbe_pic) { - *diffp += pic->pcbe_pic; /* Add the preset value */ - /* - * pic->pcbe_pic is used to hold preset value in case of synthetic - * counters - */ - pic->pcbe_pic = 0; - } - return (ret); -} - -static int -alloc_ringbuffer(rk_pcbe_config_t *pic, uint32_t size, - uint32_t num_samples) -{ - uint32_t ringbuf_size; - uint32_t asize = 2; - rk_pcbe_ringbuf_t *ringbuf; - ASSERT(!(num_samples & 1)); /* Assert number of samples is even */ - - ringbuf_size = sizeof (rk_pcbe_ringbuf_t) + (size * num_samples); - - /* Size should be a power of 2 */ - while ((ringbuf_size & (asize - 1)) != ringbuf_size) - asize <<= 1; - - ringbuf = contig_mem_alloc_align_sleep(asize, 0); - if (ringbuf == NULL) { - cmn_err(CE_WARN, "CPU-%d: Ringbuffer memory allocation failed!", - CPU->cpu_id); - return (-1); - } - pic->sampler.ring_buffer = ringbuf; - ringbuf->head = NULL; - ringbuf->tail = NULL; - ringbuf->size = size * num_samples; - ringbuf->hwm = ringbuf->size >> 1; - return (0); -} - -static void -free_ringbuffer(rk_pcbe_config_t *pic) -{ - rk_pcbe_ringbuf_t *ringbuf = pic->sampler.ring_buffer; - /* - * When multiple pics are used and one of the pics was not configurable - * (eg: Bad attribute), then cpc calls rk_pcbe_free for the pics that - * were already configured. This results in calling this routine with - * NULL ringbuf, since ringbuf is allocated when the first sample is - * taken. To protect against this condition, we need do the following - * check before calling contig_mem_free since it uses ringbuf->size. 
- */ - if (ringbuf) { - uint32_t ringbuf_size; - uint32_t asize = 2; - DBG_PRINT(("CPU-%d: free_ringbuffer freeing %d bytes\n", - CPU->cpu_id, - (int)(sizeof (rk_pcbe_ringbuf_t) + ringbuf->size))); - ringbuf_size = sizeof (rk_pcbe_ringbuf_t) + ringbuf->size; - while ((ringbuf_size & (asize - 1)) != ringbuf_size) - asize <<= 1; - contig_mem_free(ringbuf, asize); - } -} - -static void -print_hv_error(uint64_t rc, int *cntp, char *funcname, rk_pcbe_config_t *pic) -{ - ASSERT(cntp && pic); - if (rc != H_EOK) { - cmn_err(CE_WARN, "{%d} pgm-hw call-%d in %s returned 0x%lX for " - "pic %d cntr %X", CPU->cpu_id, *cntp, funcname, rc, - pic->pcbe_picno, pic->counter); - } - (*cntp)++; -} - -static void -set_string_constants(void) -{ - if (strncmp(cpu_module_name, "SUNW,", 5) == 0) - rock_name = &cpu_module_name[5]; - else - rock_name = cpu_module_name; - (void) strcpy(rock_cpuref, "See the \""); - (void) strcat(rock_cpuref, rock_name); - (void) strcat(rock_cpuref, " User's Manual\" for descriptions of " - "these events. "CPU_REF_URL); - (void) strcat(pcbe_module_name, cpu_module_name); -} - -static uint64_t -bitmask(uint8_t bits) -{ - if (bits < 64) - return ((1ULL << bits) - 1); - return (-1); -} - -#ifdef RKPCBE_DBG -static void -set_pic_name(rk_pcbe_config_t *pic) -{ - uint32_t bits; - const struct nametable *n; - - /* - * For normal instruction counter, the 'bits' value is not saved. - */ - if (pic->counter_type == NORMAL_COUNTER) { - if (pic->counter == RK_PERF_INSTR) { - (void) strcpy(pic->name, "Instr_All"); - return; - } - bits = pic->flags; - } - else - bits = pic->sampler.syn_counter; - - for (n = events[pic->pcbe_picno]; n->bits != NT_END; n++) { - if (n->bits == bits) { - (void) strcpy(pic->name, n->name); - break; - } - } -} - -static void -print_pic(rk_pcbe_config_t *pic, char *heading) -{ - ASSERT(pic); - /* - * On multi strand system, the print gets clobberd. Therefore - * grab a lock so that the output is legible. 
- */ - mutex_enter(&print_pic_lock); - printf("{CPU-%d} %s:\n", CPU->cpu_id, heading); - printf("pic addr : %p\n", (void *)pic); - printf("name : %s\n", pic->name); - printf("pcbe_picno : %d\n", pic->pcbe_picno); - printf("counter_bits : 0x%X\n", pic->counter_bits); - printf("counter_type : 0x%X\n", pic->counter_type); - printf("toe : %d\n", pic->toe); - printf("counter : 0x%X\n", pic->counter); - printf("src_type : 0x%X\n", pic->src_type); - printf("flags : 0x%X\n", pic->flags); - printf("pcbe_pic : %ld\n", pic->pcbe_pic); - printf("inuse : %d\n", pic->inuse); - printf("state : 0x%X\n", pic->state); - printf("cpu : %d\n", pic->cpu); - if (pic->counter_type == SYNTHETIC_COUNTER) { - printf("Synthetic counter:\n"); - printf("\tsyn_pic: 0x%X\n", (int)pic->sampler.synthetic_pic); - printf("\tfreq : %d\n", pic->sampler.frequency); - printf("\tsyn_cnt: 0x%X\n", pic->sampler.syn_counter); - printf("\tsize : %d bytes\n", pic->sampler.sample_size); - printf("\tflags : 0x%X\n", pic->sampler.flags); - printf("\ttl : 0x%X\n", pic->sampler.tl); - printf("\tnohws : 0x%X\n", pic->sampler.nohws); - printf("\trbuf : 0x%p\n", (void *)pic->sampler.ring_buffer); - if (pic->sampler.ring_buffer) { - rk_pcbe_ringbuf_t *rb = pic->sampler.ring_buffer; - printf("\tRingbuffer:\n"); - printf("\t\tHead: 0x%X\n", rb->head); - printf("\t\tTail: 0x%X\n", rb->tail); - printf("\t\tSize: 0x%X\n", rb->size); - printf("\t\tHwm : 0x%X\n", rb->hwm); - } - } - printf("-----------------\n"); - mutex_exit(&print_pic_lock); -} -#endif
--- a/usr/src/uts/sun4v/rock/Makefile Thu Aug 06 17:19:00 2009 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,113 +0,0 @@ -# -# CDDL HEADER START -# -# The contents of this file are subject to the terms of the -# Common Development and Distribution License (the "License"). -# You may not use this file except in compliance with the License. -# -# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -# or http://www.opensolaris.org/os/licensing. -# See the License for the specific language governing permissions -# and limitations under the License. -# -# When distributing Covered Code, include this CDDL HEADER in each -# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -# If applicable, add the following below this CDDL HEADER, with the -# fields enclosed by brackets "[]" replaced with your own identifying -# information: Portions Copyright [yyyy] [name of copyright owner] -# -# CDDL HEADER END -# - -# -# Copyright 2009 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# -# This makefile drives the production of the UltraSPARC-AT10 cpu module. -# -# sun4v implementation architecture dependent -# - -# -# Path to the base of the uts directory tree (usually /usr/src/uts). -# -UTSBASE = ../.. - -# -# Define the module and object file sets. -# -MODULE = SUNW,UltraSPARC-AT10 -OBJECTS = $(ROCKCPU_OBJS:%=$(OBJS_DIR)/%) -LINTS = $(ROCKCPU_OBJS:%.o=$(LINTS_DIR)/%.ln) -ROOTMODULE = $(ROOT_PSM_CPU_DIR)/$(MODULE) - -CPU_DIR = . -HERE = ../rock - -# -# Include common rules. -# -include $(UTSBASE)/sun4v/Makefile.sun4v - -# -# Override defaults -# -CLEANFILES += $(CPULIB) $(SYM_MOD) - -# -# Define targets -# -ALL_TARGET = $(SYM_MOD) -LINT_TARGET = $(MODULE).lint -INSTALL_TARGET = def $(BINARY) $(ROOTMODULE) - -# -# The ATOMIC_BO_ENABLE_SHIFT enables backoff in atomic routines. 
-# ATOMIC_SIMPLE_BO_ENABLE enables simple backoff required for rock -# -ATOMIC_BO_FLAG = -DATOMIC_BO_ENABLE_SHIFT=14 -DATOMIC_SIMPLE_BO_ENABLE - -# -# lint pass one enforcement -# -CFLAGS += $(CCVERBOSE) $(ATOMIC_BO_FLAG) - -# -# cpu-module-specific flags -# -CPPFLAGS += -DCPU_MODULE $(ATOMIC_BO_FLAG) -AS_CPPFLAGS += -DCPU_MODULE -DCUSTOM_FPZERO $(ATOMIC_BO_FLAG) -LINTFLAGS += -DCUSTOM_FPZERO - -# -# Default build targets. -# -.KEEP_STATE: - -def: $(DEF_DEPS) - -all: $(ALL_DEPS) - -clean: $(CLEAN_DEPS) - -clobber: $(CLOBBER_DEPS) - -lint: $(LINT_DEPS) - -modlintlib: $(MODLINTLIB_DEPS) - -clean.lint: $(CLEAN_LINT_DEPS) - -install: $(INSTALL_DEPS) - -$(CPULIB): $(BINARY) - $(LD) -o $(CPULIB) -G $(BINARY) - -$(SYM_MOD): $(UNIX_O) $(CPULIB) - @echo "resolving symbols against unix.o" - @(cd $(UNIX_DIR); pwd; \ - CPU_DIR=$(HERE) SYM_MOD=$(HERE)/$(SYM_MOD) $(MAKE) symcheck) - -# Include common targets. -# -include $(UTSBASE)/$(PLATFORM)/Makefile.targ
--- a/usr/src/uts/sun4v/rock_pcbe/Makefile Thu Aug 06 17:19:00 2009 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ -# -# CDDL HEADER START -# -# The contents of this file are subject to the terms of the -# Common Development and Distribution License (the "License"). -# You may not use this file except in compliance with the License. -# -# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE -# or http://www.opensolaris.org/os/licensing. -# See the License for the specific language governing permissions -# and limitations under the License. -# -# When distributing Covered Code, include this CDDL HEADER in each -# file and include the License file at usr/src/OPENSOLARIS.LICENSE. -# If applicable, add the following below this CDDL HEADER, with the -# fields enclosed by brackets "[]" replaced with your own identifying -# information: Portions Copyright [yyyy] [name of copyright owner] -# -# CDDL HEADER END -# - -# -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# -# This Makefile builds the Rock Performance Counter BackEnd (PCBE). -# - -UTSBASE = ../.. - -# -# Define module and object file sets. -# -MODULE = pcbe.SUNW,UltraSPARC-AT10 -OBJECTS = $(RK_PCBE_OBJS:%=$(OBJS_DIR)/%) -LINTS = $(RK_PCBE_OBJS:%.o=$(LINTS_DIR)/%.ln) -ROOTMODULE = $(ROOT_PSM_PCBE_DIR)/$(MODULE) - -# -# Include common rules. -# -include $(UTSBASE)/sun4v/Makefile.sun4v - -# -# Define targets. -# -ALL_TARGET = $(BINARY) -LINT_MODULE = rock_pcbe -LINT_TARGET = $(LINT_MODULE).lint -INSTALL_TARGET = $(BINARY) $(ROOTMODULE) -EXTRA_OPTIONS += -URKPCBE_DBG - -# -# Default build targets. -# -.KEEP_STATE: - -def: $(DEF_DEPS) - -all: $(ALL_DEPS) - -clean: $(CLEAN_DEPS) - -clobber: $(CLOBBER_DEPS) - -lint: $(LINT_DEPS) - -modlintlib: $(MODLINTLIB_DEPS) - -clean.lint: $(CLEAN_LINT_DEPS) - -install: $(INSTALL_DEPS) - -# -# Include common targets. -# -include $(UTSBASE)/sun4v/Makefile.targ
--- a/usr/src/uts/sun4v/sys/error.h Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/sys/error.h Thu Aug 06 17:39:39 2009 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -118,7 +118,7 @@ uint64_t stick; /* Value of the %STICK register */ uint32_t desc; /* Error Descriptor */ uint32_t attr; /* error attributes bit field */ - uint64_t addr; /* va for ERRH_ATTR_ASI, otherwise ra */ + uint64_t ra; /* Real address */ uint32_t sz; /* Size of affected mem region */ uint16_t cpuid; /* Virtual ID of the affected CPU */ uint16_t secs; /* Seconds */
--- a/usr/src/uts/sun4v/sys/hsvc.h Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/sys/hsvc.h Thu Aug 06 17:39:39 2009 -0700 @@ -42,7 +42,6 @@ #define HSVC_GROUP_CORE 0x0001 #define HSVC_GROUP_INTR 0x0002 #define HSVC_GROUP_SOFT_STATE 0x0003 -#define HSVC_GROUP_MEM_IFLUSH 0x0010 #define HSVC_GROUP_TM 0x0080 #define HSVC_GROUP_VPCI 0x0100 #define HSVC_GROUP_LDC 0x0101 @@ -54,9 +53,6 @@ #define HSVC_GROUP_NIAGARA2_CPU 0x0202 #define HSVC_GROUP_NIU 0x0204 #define HSVC_GROUP_VFALLS_CPU 0x0205 -#define HSVC_GROUP_RKPERF 0x0206 -#define HSVC_GROUP_RKMMU_EXT 0x0207 -#define HSVC_GROUP_RKCPU 0x0208 #define HSVC_GROUP_DIAG 0x0300 #ifndef _ASM @@ -82,8 +78,6 @@ */ #define HSVC_REV_1 1 -extern int hsvc_kdi_mem_iflush_negotiated; - /* * External interface */
--- a/usr/src/uts/sun4v/sys/hypervisor_api.h Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/sys/hypervisor_api.h Thu Aug 06 17:39:39 2009 -0700 @@ -110,8 +110,6 @@ #define HV_MEM_SCRUB 0x31 #define HV_MEM_SYNC 0x32 -#define HV_MEM_IFLUSH 0x33 -#define HV_MEM_IFLUSH_ALL 0x34 #define HV_INTR_SEND 0x42 @@ -199,12 +197,6 @@ #define MAP_DTLB 0x1 #define MAP_ITLB 0x2 -/* - * Definitions for TLB Search Order functions - */ -#define TLB_SO_DATA 0x1 -#define TLB_SO_INS 0x2 -#define TLB_SO_ID TLB_SO_DATA | TLB_SO_INS /* * Interrupt state manipulation definitions. @@ -325,7 +317,6 @@ */ #define HVIO_DMA_SYNC_DIR_TO_DEV 0x01 #define HVIO_DMA_SYNC_DIR_FROM_DEV 0x02 -#define HVIO_DMA_SYNC_DIR_NO_ICACHE_FLUSH 0x04 /* * LDC Channel States @@ -360,9 +351,6 @@ uint64_t *scrubbed_len); extern uint64_t hv_mem_sync(uint64_t real_addr, uint64_t length, uint64_t *flushed_len); -extern uint64_t hv_mem_iflush(uint64_t real_addr, uint64_t length, - uint64_t *flushed_len); -extern uint64_t hv_mem_iflush_all(void); extern uint64_t hv_tm_enable(uint64_t enable); extern uint64_t hv_service_recv(uint64_t s_id, uint64_t buf_pa,
--- a/usr/src/uts/sun4v/sys/machcpuvar.h Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/sys/machcpuvar.h Thu Aug 06 17:39:39 2009 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -181,7 +181,6 @@ id_t cpu_core; /* cpu core id */ id_t cpu_chip; /* cpu chip id */ kthread_t *startup_thread; - uint64_t cpu_nre_error; /* nonresumable error */ }; typedef struct machcpu machcpu_t;
--- a/usr/src/uts/sun4v/sys/machsystm.h Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/sys/machsystm.h Thu Aug 06 17:39:39 2009 -0700 @@ -249,7 +249,6 @@ extern void *contig_mem_alloc(size_t); extern void *contig_mem_alloc_align(size_t, size_t); extern void contig_mem_free(void *, size_t); -extern void *contig_mem_alloc_align_sleep(size_t, size_t); /* * Caches
--- a/usr/src/uts/sun4v/sys/mmu.h Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/sys/mmu.h Thu Aug 06 17:39:39 2009 -0700 @@ -156,18 +156,6 @@ #define MIN_NSHCONTEXTS 1 #define MIN_NTSBS 4 -/* - * The number of shared contexts supported in search list entries for the - * pagesize register. - */ -#define NSEARCH_SHCONTEXTS 1 - -/* - * The maximum number of entries allowed in a search list for the pagesize - * register. - */ -#define MAX_PGSZ_SEARCH_ORDER 8 - #ifdef __cplusplus } #endif
--- a/usr/src/uts/sun4v/sys/pte.h Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/sys/pte.h Thu Aug 06 17:39:39 2009 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -61,7 +61,7 @@ unsigned int w:1; /* <6> write perm */ unsigned int ref:1; /* <5> sw - ref */ unsigned int wr_perm:1; /* <4> sw - write perm */ - unsigned int xsoft:1; /* <3> sw - soft execute */ + unsigned int rsvd:1; /* <3> reserved */ unsigned int sz:3; /* <2:0> pagesize */ } tte_bit; struct { @@ -83,7 +83,6 @@ #define tte_no_sync tte_bit.no_sync #define tte_suspend tte_bit.susp #define tte_exec_perm tte_bit.x -#define tte_soft_exec tte_bit.xsoft #define tte_lock tte_bit.lock #define tte_cp tte_bit.cp #define tte_cv tte_bit.cv @@ -163,7 +162,6 @@ #define TTE_HWWR_INT 0x00000040 #define TTE_REF_INT 0x00000020 #define TTE_WRPRM_INT 0x00000010 -#define TTE_SOFTEXEC_INT 0x00000008 #define TTE_PROT_INT (TTE_WRPRM_INT | TTE_PRIV_INT) @@ -245,7 +243,6 @@ #define TTE_IS_8K(ttep) (TTE_CSZ(ttep) == TTE8K) #define TTE_IS_WRITABLE(ttep) ((ttep)->tte_wr_perm) #define TTE_IS_EXECUTABLE(ttep) ((ttep)->tte_exec_perm) -#define TTE_IS_SOFTEXEC(ttep) ((ttep)->tte_soft_exec) #define TTE_IS_PRIVILEGED(ttep) ((ttep)->tte_priv) #define TTE_IS_NOSYNC(ttep) ((ttep)->tte_no_sync) #define TTE_IS_LOCKED(ttep) ((ttep)->tte_lock) @@ -275,8 +272,6 @@ #define TTE_CLR_WRT(ttep) ((ttep)->tte_wr_perm = 0) #define TTE_SET_EXEC(ttep) ((ttep)->tte_exec_perm = 1) #define TTE_CLR_EXEC(ttep) ((ttep)->tte_exec_perm = 0) -#define TTE_SET_SOFTEXEC(ttep) ((ttep)->tte_soft_exec = 1) -#define TTE_CLR_SOFTEXEC(ttep) ((ttep)->tte_soft_exec = 0) #define TTE_SET_PRIV(ttep) ((ttep)->tte_priv = 1) #define TTE_CLR_PRIV(ttep) ((ttep)->tte_priv = 0)
--- a/usr/src/uts/sun4v/sys/rock_hypervisor_api.h Thu Aug 06 17:19:00 2009 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,100 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_ROCK_HYPERVISOR_API_H -#define _SYS_ROCK_HYPERVISOR_API_H - -/* - * sun4v rock Hypervisor API - * - */ - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Function numbers for managing the Rock TLB page size register. 
- */ -#define MMU_GET_NONPRIV_SEARCH 0x13b -#define MMU_SET_NONPRIV_SEARCH 0x13c -#define MMU_GET_PRIV_SEARCH 0x13d -#define MMU_SET_PRIV_SEARCH 0x13e - -/* - * Function numbers for performance counters - */ -#define HV_RK_PERF_COUNT_INIT 0x108 -#define HV_RK_PERF_COUNT_RELEASE 0x109 -#define HV_RK_PERF_COUNT_SET 0x10A -#define HV_RK_PERF_COUNT_GET 0x10B -#define HV_RK_PERF_COUNT_START 0x10C -#define HV_RK_PERF_COUNT_OVERFLOW 0x10D -#define HV_RK_PERF_COUNT_STOP 0x10E - -#define HV_RK_PERF_SAMPLE_INIT 0x135 -#define HV_RK_PERF_SAMPLE_RELEASE 0x136 -#define HV_RK_PERF_SAMPLE_CONFIG 0x137 -#define HV_RK_PERF_SAMPLE_START 0x138 -#define HV_RK_PERF_SAMPLE_PENDING 0x139 -#define HV_RK_PERF_SAMPLE_STOP 0x13A - -#define HV_RK_PERF_SRC_STRAND 0x1 /* Local Strand */ -#define HV_RK_PERF_SRC_STRAND_M 0x2 /* Multiple Strands */ -#define HV_RK_PERF_SRC_SIU 0x4 /* L2 txn source */ -#define HV_RK_PERF_SRC_MMU 0x8 /* L2 txn source */ -#define HV_RK_PERF_SRC_MASK 0xF - -#define ROCK_HSVC_MAJOR 1 -#define ROCK_HSVC_MINOR 0 - -#ifndef _ASM - -/* Performance Counter API */ -extern uint64_t hv_rk_perf_count_init(uint64_t counter); -extern uint64_t hv_rk_perf_count_release(uint64_t counter); -extern uint64_t hv_rk_perf_count_set(uint64_t counter, uint64_t value); -extern uint64_t hv_rk_perf_count_get(uint64_t counter, uint64_t *value); -extern uint64_t hv_rk_perf_count_start(uint64_t counter, uint64_t value); -extern uint64_t hv_rk_perf_count_overflow(uint64_t counter, uint64_t *ovf_cnt); -extern uint64_t hv_rk_perf_count_stop(uint64_t counter); - -/* Performance Sampler API */ -extern uint64_t hv_rk_perf_sample_init(uint64_t sampler, uint64_t ringbuf_pa); -extern uint64_t hv_rk_perf_sample_release(uint64_t sampler); -extern uint64_t hv_rk_perf_sample_config(uint64_t sampler, uint64_t reg_va, - uint64_t reg_value); -extern uint64_t hv_rk_perf_sample_start(uint64_t sampler, uint64_t freq, - uint64_t list_size, uint64_t valist_pa); -extern uint64_t hv_rk_perf_sample_pending(uint64_t 
sampler, uint64_t *pend_cnt); -extern uint64_t hv_rk_perf_sample_stop(uint64_t counter); -#endif /* _ASM */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ROCK_HYPERVISOR_API_H */
--- a/usr/src/uts/sun4v/sys/rockasi.h Thu Aug 06 17:19:00 2009 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,68 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_ROCKASI_H -#define _SYS_ROCKASI_H - -/* - * alternate address space identifiers - * - * 0x00 - 0x2F are privileged - * 0x30 - 0x7f are hyperprivileged - * 0x80 - 0xFF can be used by non-privileged, privileged & hyperprivileged - */ - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * ROCK specific ASIs - */ -#define ASI_CACHE_SPARING_P 0xF4 /* Cache sparing */ - -#ifndef _ASM -struct cpsregs { - uint64_t fails; - uint64_t exog; - uint64_t coh; - uint64_t tcc; - uint64_t instr; - uint64_t precise; - uint64_t async; - uint64_t size; - uint64_t ld; - uint64_t st; - uint64_t cti; - uint64_t fp; - uint64_t zeros; -}; -#endif /* _ASM */ -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_ROCKASI_H */
--- a/usr/src/uts/sun4v/vm/mach_sfmmu.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/vm/mach_sfmmu.c Thu Aug 06 17:39:39 2009 -0700 @@ -45,7 +45,6 @@ #include <sys/vmsystm.h> #include <sys/bitmap.h> #include <vm/rm.h> -#include <vm/vm_dep.h> #include <sys/t_lock.h> #include <sys/vm_machparam.h> #include <sys/promif.h> @@ -60,7 +59,6 @@ #include <sys/reboot.h> #include <sys/kdi.h> #include <sys/hypervisor_api.h> -#include <sys/hsvc.h> /* * External routines and data structures @@ -169,7 +167,7 @@ prom_panic("can't find kernel text pfn"); pfn &= TTE_PFNMASK(TTE4M); - attr = PROC_TEXT | HAT_NOSYNC | HAT_ATTR_NOSOFTEXEC; + attr = PROC_TEXT | HAT_NOSYNC; flags = HAT_LOAD_LOCK | SFMMU_NO_TSBLOAD; sfmmu_memtte(&ktext_tte, pfn, attr, TTE4M); /* @@ -185,7 +183,7 @@ prom_panic("can't find kernel data pfn"); pfn &= TTE_PFNMASK(TTE4M); - attr = PROC_DATA | HAT_NOSYNC | HAT_ATTR_NOSOFTEXEC; + attr = PROC_DATA | HAT_NOSYNC; sfmmu_memtte(&kdata_tte, pfn, attr, TTE4M); /* * We set the lock bit in the tte to lock the translation in @@ -210,7 +208,7 @@ ASSERT(tsbsz >= MMU_PAGESIZE4M); ASSERT(IS_P2ALIGNED(tsbsz, tsbsz)); ASSERT(IS_P2ALIGNED(va, tsbsz)); - attr = PROC_DATA | HAT_NOSYNC | HAT_ATTR_NOSOFTEXEC; + attr = PROC_DATA | HAT_NOSYNC; while (tsbsz != 0) { ASSERT(i < MAX_BIGKTSB_TTES); pfn = va_to_pfn(va); @@ -294,8 +292,7 @@ pfn_t pfn = va_to_pfn(va); uint64_t ret; - sfmmu_memtte(&tte, pfn, PROC_TEXT | HAT_NOSYNC | HAT_ATTR_NOSOFTEXEC, - TTE8K); + sfmmu_memtte(&tte, pfn, (PROC_TEXT | HAT_NOSYNC), TTE8K); ret = hv_mmu_map_perm_addr(va, KCONTEXT, *(uint64_t *)&tte, MAP_ITLB | (do_dtlb ? MAP_DTLB : 0)); @@ -481,22 +478,3 @@ sfmmu_cache_flushall() { } - -/* - * Initialise the real address field in sfmmu_pgsz_order. - */ -void -sfmmu_init_pgsz_hv(sfmmu_t *sfmmup) -{ - int i; - - /* - * Initialize mmu counts for pagesize register programming. 
- */ - for (i = 0; i < max_mmu_page_sizes; i++) { - sfmmup->sfmmu_mmuttecnt[i] = 0; - } - - sfmmup->sfmmu_pgsz_order.hv_pgsz_order_pa = - va_to_pa(&sfmmup->sfmmu_pgsz_order.hv_pgsz_order); -}
--- a/usr/src/uts/sun4v/vm/mach_sfmmu.h Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/vm/mach_sfmmu.h Thu Aug 06 17:39:39 2009 -0700 @@ -36,7 +36,6 @@ #include <sys/x_call.h> #include <sys/hypervisor_api.h> -#include <sys/mmu.h> #ifdef __cplusplus extern "C" { @@ -61,29 +60,8 @@ hv_tsb_info_t hv_tsb_info[NHV_TSB_INFO]; /* hypervisor TSB info */ }; -/* - * Defines for hypervisor pagesize search API. - */ - -#define TLB_PGSZ_ENABLE_SHIFT 15 -#define TLB_PGSZ_CTX_SHIFT 7 -#define TLB_PGSZ_ENABLE (1<<TLB_PGSZ_ENABLE_SHIFT) -#define TLB_PGSZ_CONTEXT1 (1<<TLB_PGSZ_CTX_SHIFT) -#define TLB_PGSZ_CONTEXT1_ENABLE (TLB_PGSZ_ENABLE|TLB_PGSZ_CONTEXT1) - -struct hv_pgsz_order { - uint64_t hv_pgsz_order_pa; /* hypervisor pagesize order PA */ - /* hypervisor pagesize order */ - uint16_t hv_pgsz_order[MAX_PGSZ_SEARCH_ORDER]; -}; - -#define sfmmu_pgsz_order_hv sfmmu_pgsz_order.hv_pgsz_order - #endif /* _ASM */ -/* value for sfmmu_pgsz_map if all shared pagesizes are allowed */ -#define TLB_ALL_SHARED_PGSZ 0xff - #ifdef _ASM /* @@ -333,47 +311,6 @@ label/**/1: /* - * Support for non-coherent I$. - * - * In sun4v we use tte bit 3 as a software flag indicating whether - * execute permission is given. IMMU miss traps cause the real execute - * permission to be set. sfmmu_ttesync() will see if execute permission - * has been set, and then set P_EXEC in page_t. This causes I-cache - * flush when the page is freed. - * - * However, the hypervisor reserves bit 3 as part of a 4-bit page size. - * We allow this flag to be set in hme TTE, but never in TSB or TLB. - */ -#define TTE_CLR_SOFTEXEC_ML(tte) bclr TTE_SOFTEXEC_INT, tte -#define TTE_CHK_SOFTEXEC_ML(tte) andcc tte, TTE_SOFTEXEC_INT, %g0 - -/* - * TTE_SET_EXEC_ML is a macro that updates the exec bit if it is - * not already set. Will also set reference bit at the same time. - * - * Caller must check EXECPRM. Do not call if it is already set in the tte. 
- * - * Parameters: - * tte = reg containing tte - * ttepa = physical pointer to tte - * tmp1 = tmp reg - * label = temporary label - */ - -#define TTE_SET_EXEC_ML(tte, ttepa, tmp1, label) \ - /* BEGIN CSTYLED */ \ - /* update execprm bit */ \ -label/**/1: \ - or tte, (TTE_EXECPRM_INT | TTE_REF_INT), tmp1; \ - casxa [ttepa]ASI_MEM, tte, tmp1; /* update bits */ \ - cmp tte, tmp1; \ - bne,a,pn %xcc, label/**/1; \ - mov tmp1, tte; \ - or tte, (TTE_EXECPRM_INT | TTE_REF_INT), tte; \ - /* END CSTYLED */ - - -/* * TTE_SET_REF_ML is a macro that updates the reference bit if it is * not already set. * @@ -597,27 +534,6 @@ label: /* END CSTYLED */ -/* - * For shared context mappings, check against the page size bitmap in the - * tsbmiss area to decide if we should use private mappings instead to reduce - * the number of shared page size searches on Rock based platforms. - * In: - * tsbarea (not clobbered) - * tte (not clobbered) - * tmp (clobbered) - * Out: - * use_shctx - changed to 0 if page size bit is not set in mask. - */ -#define CHECK_SHARED_PGSZ(tsbarea, tte, tmp, use_shctx, label) \ - /* BEGIN CSTYLED */ \ - brz use_shctx, label/**/1 ;\ - and tte, TTE_SZ_BITS, tmp ;\ - ldub [tsbarea + TSBMISS_PGSZ_BITMAP], use_shctx ;\ - srlx use_shctx, tmp, use_shctx ;\ - and use_shctx, 0x1, use_shctx ;\ -label/**/1: - /* END CSTYLED */ - #endif /* _ASM */ #ifdef __cplusplus
--- a/usr/src/uts/sun4v/vm/mach_sfmmu_asm.s Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/vm/mach_sfmmu_asm.s Thu Aug 06 17:39:39 2009 -0700 @@ -41,7 +41,6 @@ #include <sys/pte.h> #include <sys/mmu.h> #include <vm/hat_sfmmu.h> -#include <vm/mach_sfmmu.h> #include <vm/seg_spt.h> #include <sys/machparam.h> #include <sys/privregs.h> @@ -50,7 +49,6 @@ #include <sys/machthread.h> #include <sys/clock.h> #include <sys/trapstat.h> -#include <sys/rock_hypervisor_api.h> /* * sfmmu related subroutines @@ -79,7 +77,8 @@ /* ARGSUSED */ void sfmmu_load_mmustate(sfmmu_t *sfmmup) -{} +{ +} #else /* lint */ @@ -282,7 +281,7 @@ sethi %hi(ksfmmup), %o3 ldx [%o3 + %lo(ksfmmup)], %o3 cmp %o3, %o0 - be,pn %xcc, 8f ! if kernel as, do nothing + be,pn %xcc, 7f ! if kernel as, do nothing nop set MMU_SCONTEXT, %o3 @@ -340,7 +339,7 @@ ldx [%g2 + SCD_SFMMUP], %g3 ! %g3 = scdp->scd_sfmmup ldx [%g3 + SFMMU_TSB], %o1 ! %o1 = first scd tsbinfo - brz,pn %o1, 1f + brz,pn %o1, 9f nop ! panic if no third TSB /* make 3rd UTSBREG */ @@ -383,26 +382,9 @@ mov MMU_TSB_CTXNON0, %o5 ta FAST_TRAP ! set TSB info for user process brnz,a,pn %o0, panic_bad_hcall - mov MMU_TSB_CTXNON0, %o1 - mov %o3, %o0 ! restore saved sfmmup to %o0 + mov MMU_TSB_CTXNON0, %o1 + mov %o3, %o0 ! restore %o0 6: - /* - * If the TLB pagesize register is supported and pgsz_search_on is set - * then we patch out the following branch instruction. - */ - .global sfmmu_pgsz_load_mmustate_patch -sfmmu_pgsz_load_mmustate_patch: - ba,a 7f ! branch around pgsz search hcall - mov %o0, %o3 ! preserve sfmmup in %o3 - ldx [%o3 + SFMMU_PGSZ_ORDER + HV_PGSZ_ORDER_PA], %o0 - mov TLB_SO_ID, %o1 ! flags apply to I and D - mov MMU_SET_NONPRIV_SEARCH, %o5 - ta FAST_TRAP ! set page size search order - brnz,a,pn %o0, panic_bad_hcall - mov MMU_SET_NONPRIV_SEARCH, %o1 - mov %o3, %o0 ! restore saved sfmmup to %o0 -7: - mov %o1, %o5 ! preserve pgsz_search_on ldx [%o0 + SFMMU_ISMBLKPA], %o1 ! copy members of sfmmu CPU_TSBMISS_AREA(%o2, %o3) ! 
%o2 = tsbmiss area stx %o1, [%o2 + TSBMISS_ISMBLKPA] ! sfmmu_tsb_miss into the @@ -413,7 +395,7 @@ stub %o3, [%o2 + TSBMISS_UTTEFLAGS] stub %o4, [%o2 + TSBMISS_URTTEFLAGS] stx %o1, [%o2 + TSBMISS_SHARED_UHATID] - brz,pn %o1, 8f ! check for sfmmu_srdp + brz,pn %o1, 7f ! check for sfmmu_srdp add %o0, SFMMU_HMERMAP, %o1 add %o2, TSBMISS_SHMERMAP, %o2 mov SFMMU_HMERGNMAP_WORDS, %o3 @@ -423,38 +405,31 @@ ldx [%o0 + SFMMU_SCDP], %o4 ! %o4 = sfmmu_scd CPU_TSBMISS_AREA(%o2, %o3) ! %o2 = tsbmiss area mov SFMMU_HMERGNMAP_WORDS, %o3 - brnz,pt %o4, 9f ! check for sfmmu_scdp else - nop - add %o2, TSBMISS_SCDSHMERMAP, %o2 ! zero tsbmiss scd_shmermap + brnz,pt %o4, 8f ! check for sfmmu_scdp else + add %o2, TSBMISS_SCDSHMERMAP, %o2 ! zero tsbmiss scd_shmermap ZERO_REGION_MAP(%o2, %o3, zero_scd_mmustate) -8: +7: retl nop -9: - brz,a %o5, 0f ! test pgsz_search_on - or %g0, TLB_ALL_SHARED_PGSZ, %o1 ! enable all page sizes - ldub [%o0 + SFMMU_PGSZ_MAP], %o1 -0: - stub %o1, [%o2 + TSBMISS_PGSZ_BITMAP] ! set tsbmiss pgsz bitmap - add %o2, TSBMISS_SCDSHMERMAP, %o2 ! set tsbmiss scd_shmermap - add %o4, SCD_HMERMAP, %o1 +8: ! set tsbmiss scd_shmermap + add %o4, SCD_HMERMAP, %o1 SET_REGION_MAP(%o1, %o2, %o3, %o4, load_scd_mmustate) - retl nop -1: +9: sethi %hi(panicstr), %g1 ! panic if no 3rd TSB ldx [%g1 + %lo(panicstr)], %g1 tst %g1 - bnz,pn %xcc, 8b + bnz,pn %xcc, 7b nop sethi %hi(sfmmu_panic10), %o0 call panic or %o0, %lo(sfmmu_panic10), %o0 + SET_SIZE(sfmmu_load_mmustate) - + #endif /* lint */ #if defined(lint)
--- a/usr/src/uts/sun4v/vm/mach_vm_dep.c Thu Aug 06 17:19:00 2009 -0700 +++ b/usr/src/uts/sun4v/vm/mach_vm_dep.c Thu Aug 06 17:39:39 2009 -0700 @@ -52,7 +52,6 @@ #include <sys/stack.h> #include <sys/atomic.h> #include <sys/promif.h> -#include <sys/hsvc.h> uint_t page_colors = 0; uint_t page_colors_mask = 0; @@ -150,7 +149,6 @@ static vmem_t *contig_mem_arena; static vmem_t *contig_mem_reloc_arena; static kmutex_t contig_mem_lock; -static kmutex_t contig_mem_sleep_lock; #define CONTIG_MEM_ARENA_QUANTUM 64 #define CONTIG_MEM_SLAB_ARENA_QUANTUM MMU_PAGESIZE64K @@ -617,15 +615,14 @@ } /* - * contig_mem_alloc_align_flag allocates real contiguous memory with the + * contig_mem_alloc_align allocates real contiguous memory with the * specified alignment up to contig_mem_import_size_max. The alignment must * be a power of 2 and no greater than contig_mem_import_size_max. We assert * the aligment is a power of 2. For non-debug, vmem_xalloc will panic * for non power of 2 alignments. */ -static void * -contig_mem_alloc_align_flag(size_t size, size_t align, int flag, - kmutex_t *lockp) +void * +contig_mem_alloc_align(size_t size, size_t align) { void *buf; @@ -644,48 +641,27 @@ * allocations also prevents us from trying to allocate * more spans than necessary. 
*/ - mutex_enter(lockp); + mutex_enter(&contig_mem_lock); buf = vmem_xalloc(contig_mem_arena, size, align, 0, 0, - NULL, NULL, flag | VM_NORELOC); + NULL, NULL, VM_NOSLEEP | VM_NORELOC); if ((buf == NULL) && (size <= MMU_PAGESIZE)) { - mutex_exit(lockp); + mutex_exit(&contig_mem_lock); return (vmem_xalloc(static_alloc_arena, size, align, 0, 0, - NULL, NULL, flag)); + NULL, NULL, VM_NOSLEEP)); } if (buf == NULL) { buf = vmem_xalloc(contig_mem_reloc_arena, size, align, 0, 0, - NULL, NULL, flag); + NULL, NULL, VM_NOSLEEP); } - mutex_exit(lockp); + mutex_exit(&contig_mem_lock); return (buf); } -void * -contig_mem_alloc_align(size_t size, size_t align) -{ - return (contig_mem_alloc_align_flag - (size, align, VM_NOSLEEP, &contig_mem_lock)); -} - -/* - * This function is provided for callers that need physically contiguous - * allocations but can sleep. We use the contig_mem_sleep_lock so that we - * don't interfere with contig_mem_alloc_align calls that should never sleep. - * Similarly to contig_mem_alloc_align, we use a lock to prevent allocating - * unnecessary spans when called in parallel. - */ -void * -contig_mem_alloc_align_sleep(size_t size, size_t align) -{ - return (contig_mem_alloc_align_flag - (size, align, VM_SLEEP, &contig_mem_sleep_lock)); -} - void contig_mem_free(void *vaddr, size_t size) { @@ -709,7 +685,6 @@ contig_mem_init(void) { mutex_init(&contig_mem_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&contig_mem_sleep_lock, NULL, MUTEX_DEFAULT, NULL); contig_mem_slab_arena = vmem_xcreate("contig_mem_slab_arena", NULL, 0, CONTIG_MEM_SLAB_ARENA_QUANTUM, contig_vmem_xalloc_aligned_wrapper, @@ -811,96 +786,3 @@ uint_t spcolor = atomic_inc_32_nv(&sp_current_color); return ((size_t)((spcolor & sp_color_mask) * SA(sp_color_stride))); } - -/* - * This flag may be set via /etc/system to force the synchronization - * of I-cache with memory after every bcopy. The default is 0, meaning - * that there is no need for an I-cache flush after each bcopy. 
This - * flag is relevant only on platforms that have non-coherent I-caches. - */ -uint_t force_sync_icache_after_bcopy = 0; - -/* - * This flag may be set via /etc/system to force the synchronization - * of I-cache to memory after every DMA. The default is 0, meaning - * that there is no need for an I-cache flush after each dma write to - * memory. This flag is relevant only on platforms that have - * non-coherent I-caches. - */ -uint_t force_sync_icache_after_dma = 0; - -/* - * This internal flag enables mach_sync_icache_pa, which is always - * called from common code if it is defined. However, not all - * platforms support the hv_mem_iflush firmware call. - */ -static uint_t do_mach_sync_icache_pa = 0; - -int hsvc_kdi_mem_iflush_negotiated = B_FALSE; - -#define MEM_IFLUSH_MAJOR 1 -#define MEM_IFLUSH_MINOR 0 -static hsvc_info_t kdi_mem_iflush_hsvc = { - HSVC_REV_1, /* HSVC rev num */ - NULL, /* Private */ - HSVC_GROUP_MEM_IFLUSH, /* Requested API Group */ - MEM_IFLUSH_MAJOR, /* Requested Major */ - MEM_IFLUSH_MINOR, /* Requested Minor */ - "kdi" /* Module name */ -}; - -/* - * Setup soft exec mode. - * Since /etc/system is read later on init, it - * may be used to override these flags. 
- */ -void -mach_setup_icache(uint_t coherency) -{ - int status; - uint64_t sup_minor; - - if (coherency == 0 && icache_is_coherent) { - extern void kdi_flush_caches(void); - status = hsvc_register(&kdi_mem_iflush_hsvc, &sup_minor); - if (status != 0) - cmn_err(CE_PANIC, "I$ flush not implemented on " - "I$ incoherent system"); - hsvc_kdi_mem_iflush_negotiated = B_TRUE; - kdi_flush_caches(); - icache_is_coherent = 0; - do_mach_sync_icache_pa = 1; - } -} - -/* - * Flush specified physical address range from I$ via hv_mem_iflush interface - */ -/*ARGSUSED*/ -void -mach_sync_icache_pa(caddr_t paddr, size_t size) -{ - if (do_mach_sync_icache_pa) { - uint64_t pa = (uint64_t)paddr; - uint64_t sz = (uint64_t)size; - uint64_t i, flushed; - - for (i = 0; i < sz; i += flushed) { - if (hv_mem_iflush(pa + i, sz - i, &flushed) != H_EOK) { - cmn_err(CE_PANIC, "Flushing the Icache failed"); - break; - } - } - } -} - -/* - * Flush the page if it has been marked as executed - */ -/*ARGSUSED*/ -void -mach_sync_icache_pp(page_t *pp) -{ - if (PP_ISEXEC(pp)) - mach_sync_icache_pa((caddr_t)ptob(pp->p_pagenum), PAGESIZE); -}