Mercurial > illumos > illumos-gate
changeset 4077:0b7e19932b34
6492718 bcopy can be improved on niagara2
6500001 Niagara2 copyin/copyout can be improved
6533524 niagara: hwblkpagecopy allocates unnecessarily large stack frame
author | wh94709 |
---|---|
date | Thu, 19 Apr 2007 21:55:12 -0700 |
parents | b8174b986feb |
children | 63cca4b3778c |
files | usr/src/lib/libc_psr/Makefile usr/src/lib/libc_psr/sun4v/common/memcpy.s usr/src/lib/libc_psr/sun4v/common/memset.s usr/src/lib/libc_psr/sun4v_hwcap1/Makefile.com usr/src/lib/libc_psr/sun4v_hwcap1/common/memcpy.s usr/src/lib/libc_psr/sun4v_hwcap1/common/memset.s usr/src/lib/libc_psr/sun4v_hwcap2/Makefile usr/src/lib/libc_psr/sun4v_hwcap2/Makefile.com usr/src/lib/libc_psr/sun4v_hwcap2/mapfile usr/src/lib/libc_psr/sun4v_hwcap2/sparc/Makefile usr/src/lib/libc_psr/sun4v_hwcap2/sparcv9/Makefile usr/src/pkgdefs/SUNWcar.v/prototype_com usr/src/uts/sun4v/cpu/niagara_copy.s |
diffstat | 13 files changed, 3355 insertions(+), 1065 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/lib/libc_psr/Makefile Thu Apr 19 21:15:35 2007 -0700 +++ b/usr/src/lib/libc_psr/Makefile Thu Apr 19 21:55:12 2007 -0700 @@ -19,7 +19,7 @@ # CDDL HEADER END # # -# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # ident "%Z%%M% %I% %E% SMI" @@ -27,7 +27,8 @@ include ../Makefile.lib -SUBDIRS= sun4u sun4u-opl sun4u-us3 sun4u_hwcap1 sun4u_hwcap2 sun4v sun4v_hwcap1 etc +SUBDIRS = etc sun4u sun4u-opl sun4u-us3 sun4u_hwcap1 sun4u_hwcap2 \ + sun4v sun4v_hwcap1 sun4v_hwcap2 all := TARGET= all install := TARGET= install
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/lib/libc_psr/sun4v/common/memcpy.s Thu Apr 19 21:55:12 2007 -0700 @@ -0,0 +1,1068 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +.ident "%Z%%M% %I% %E% SMI" + + .file "memcpy.s" +/* + * memcpy(s1, s2, len) + * + * Copy s2 to s1, always copy n bytes. + * Note: this does not work for overlapped copies, bcopy() does + * + * Fast assembler language version of the following C-program for memcpy + * which represents the `standard' for the C-library. 
+ * + * void * + * memcpy(void *s, const void *s0, size_t n) + * { + * if (n != 0) { + * char *s1 = s; + * const char *s2 = s0; + * do { + * *s1++ = *s2++; + * } while (--n != 0); + * } + * return ( s ); + * } + * + * Flow : + * + * if (count <= 17) { + * Do the byte copy + * Return destination address + * } + * if (count <= 128 || 0 <= (src - dst) < 64) { + * Is source aligned on word boundary + * If no then align source on word boundary then goto .ald + * If yes goto .ald + * .ald: + * Is destination aligned on word boundary + * Depending on destination offset (last 2 bits of destination) + * copy data by shifting and merging. + * Copy residue bytes as byte copy + * Return destination address + * } else { + * Align destination on block boundary + * Depending on the source offset (last 4 bits of source address) align + * the data and store to destination. Both the load and store are done + * using ASI_BLK_INIT_ST_QUAD_LDD_P. + * For remaining count copy as much data in 8-byte chunk from source to + * destination. + * Followed by trailing copy using byte copy. 
+ * Return saved destination address + * } + * + */ + +#include <sys/asm_linkage.h> +#include <sys/niagaraasi.h> +#include <sys/asi.h> +#include <sys/trap.h> + +#ifdef NIAGARA2_IMPL +#include <sys/sun4asi.h> + +#define ALIGN_OFF_1_7 \ + faligndata %d0, %d2, %d48 ;\ + faligndata %d2, %d4, %d50 ;\ + faligndata %d4, %d6, %d52 ;\ + faligndata %d6, %d8, %d54 ;\ + faligndata %d8, %d10, %d56 ;\ + faligndata %d10, %d12, %d58 ;\ + faligndata %d12, %d14, %d60 ;\ + faligndata %d14, %d16, %d62 + +#define ALIGN_OFF_8_15 \ + faligndata %d2, %d4, %d48 ;\ + faligndata %d4, %d6, %d50 ;\ + faligndata %d6, %d8, %d52 ;\ + faligndata %d8, %d10, %d54 ;\ + faligndata %d10, %d12, %d56 ;\ + faligndata %d12, %d14, %d58 ;\ + faligndata %d14, %d16, %d60 ;\ + faligndata %d16, %d18, %d62 + +#define ALIGN_OFF_16_23 \ + faligndata %d4, %d6, %d48 ;\ + faligndata %d6, %d8, %d50 ;\ + faligndata %d8, %d10, %d52 ;\ + faligndata %d10, %d12, %d54 ;\ + faligndata %d12, %d14, %d56 ;\ + faligndata %d14, %d16, %d58 ;\ + faligndata %d16, %d18, %d60 ;\ + faligndata %d18, %d20, %d62 + +#define ALIGN_OFF_24_31 \ + faligndata %d6, %d8, %d48 ;\ + faligndata %d8, %d10, %d50 ;\ + faligndata %d10, %d12, %d52 ;\ + faligndata %d12, %d14, %d54 ;\ + faligndata %d14, %d16, %d56 ;\ + faligndata %d16, %d18, %d58 ;\ + faligndata %d18, %d20, %d60 ;\ + faligndata %d20, %d22, %d62 + +#define ALIGN_OFF_32_39 \ + faligndata %d8, %d10, %d48 ;\ + faligndata %d10, %d12, %d50 ;\ + faligndata %d12, %d14, %d52 ;\ + faligndata %d14, %d16, %d54 ;\ + faligndata %d16, %d18, %d56 ;\ + faligndata %d18, %d20, %d58 ;\ + faligndata %d20, %d22, %d60 ;\ + faligndata %d22, %d24, %d62 + +#define ALIGN_OFF_40_47 \ + faligndata %d10, %d12, %d48 ;\ + faligndata %d12, %d14, %d50 ;\ + faligndata %d14, %d16, %d52 ;\ + faligndata %d16, %d18, %d54 ;\ + faligndata %d18, %d20, %d56 ;\ + faligndata %d20, %d22, %d58 ;\ + faligndata %d22, %d24, %d60 ;\ + faligndata %d24, %d26, %d62 + +#define ALIGN_OFF_48_55 \ + faligndata %d12, %d14, %d48 ;\ + faligndata 
%d14, %d16, %d50 ;\ + faligndata %d16, %d18, %d52 ;\ + faligndata %d18, %d20, %d54 ;\ + faligndata %d20, %d22, %d56 ;\ + faligndata %d22, %d24, %d58 ;\ + faligndata %d24, %d26, %d60 ;\ + faligndata %d26, %d28, %d62 + +#define ALIGN_OFF_56_63 \ + faligndata %d14, %d16, %d48 ;\ + faligndata %d16, %d18, %d50 ;\ + faligndata %d18, %d20, %d52 ;\ + faligndata %d20, %d22, %d54 ;\ + faligndata %d22, %d24, %d56 ;\ + faligndata %d24, %d26, %d58 ;\ + faligndata %d26, %d28, %d60 ;\ + faligndata %d28, %d30, %d62 + +#else /* NIAGARA2_IMPL */ +/* + * This define is to align data for the unaligned source cases. + * The data1, data2 and data3 is merged into data1 and data2. + * The data3 is preserved for next merge. + */ +#define ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp) \ + sllx data1, lshift, data1 ;\ + srlx data2, rshift, tmp ;\ + or data1, tmp, data1 ;\ + sllx data2, lshift, data2 ;\ + srlx data3, rshift, tmp ;\ + or data2, tmp, data2 +/* + * Align the data. Merge the data1 and data2 into data1. + */ +#define ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp) \ + sllx data1, lshift, data1 ;\ + srlx data2, rshift, tmp ;\ + or data1, tmp, data1 +#endif /* NIAGARA2_IMPL */ + +/* + * Align the data in case of backward copy. + */ +#define ALIGN_DATA_BC(data1, data2, rshift, lshift, tmp) \ + srlx data1, rshift, data1 ;\ + sllx data2, lshift, tmp ;\ + or data1, tmp, data1 + + ANSI_PRAGMA_WEAK(memmove,function) + ANSI_PRAGMA_WEAK(memcpy,function) + +#include "synonyms.h" + + ENTRY(memmove) + cmp %o1, %o0 ! if from address is >= to use forward copy + bgeu %ncc, forcpy ! else use backward if ... + sub %o0, %o1, %o4 ! get difference of two addresses + cmp %o2, %o4 ! compare size and difference of addresses + bleu %ncc, forcpy ! if size is bigger, do overlapped copy + nop + + ! + ! an overlapped copy that must be done "backwards" + ! +.ovbc: + mov %o0, %o5 ! save des address for return val + add %o1, %o2, %o1 ! get to end of source space + add %o0, %o2, %o0 ! 
get to end of destination space + +.chksize: + cmp %o2, 0x20 + bgu,pn %ncc, .dbalign + nop + +.bytecp: + tst %o2 + bleu,a,pn %ncc, exitovbc + nop + +1: + dec %o0 ! decrement to address + dec %o1 ! decrement from address + ldub [%o1], %o4 + deccc %o2 + bgu,pt %ncc, 1b + stb %o4, [%o0] +exitovbc: + retl + mov %o5, %o0 + +.dbalign: + andcc %o0, 7, %o3 + bz %ncc, .dbbck + nop + ! %o3 has bytes till dst 8 bytes aligned + sub %o2, %o3, %o2 ! update o2 with new count +2: + dec %o1 + dec %o0 + ldub [%o1], %o4 + deccc %o3 + bgu,pt %ncc, 2b + stb %o4, [%o0] + + ! Now Destination is 8 byte aligned +.dbbck: + save %sp, -SA(MINFRAME), %sp + + andn %i2, 0x7, %i3 ! %i3 count is multiple of 8 bytes size + sub %i2, %i3, %i2 ! Residue bytes in %i2 + + andcc %i1, 7, %g1 ! is src aligned on 8 bytes + ! %g1 has src offset + bz %ncc, .dbcopybc + nop + + sll %g1, 3, %o1 ! left shift + mov 0x40, %g5 + sub %g5, %o1, %g5 ! right shift = (64 - left shift) + +.cpy_dbwdbc: + sub %i1, %g1, %i1 ! align the src at 8 bytes. + ldx [%i1], %o2 +2: + sub %i0, 0x8, %i0 + ldx [%i1-0x8], %o4 ! we are at the end + ALIGN_DATA_BC(%o2, %o4, %g5, %o1, %o3) + stx %o2, [%i0] + mov %o4, %o2 + subcc %i3, 0x8, %i3 + bgu,pt %ncc, 2b + sub %i1, 0x8, %i1 + ba .bytebc + add %i1, %g1, %i1 + +.dbcopybc: + sub %i1, 8, %i1 + sub %i0, 8, %i0 ! we are at the end + ldx [%i1], %o2 + stx %o2, [%i0] + subcc %i3, 0x8, %i3 + bgu,pt %ncc, .dbcopybc + nop + +.bytebc: + tst %i2 + bleu,a,pn %ncc, exitbc + nop + +1: + dec %i0 ! decrement to address + dec %i1 ! decrement from address + ldub [%i1], %i4 + deccc %i2 + bgu,pt %ncc, 1b + stb %i4, [%i0] +exitbc: + ret + restore %i5, %g0, %o0 + + SET_SIZE(memmove) + + + ENTRY(memcpy) + ENTRY(__align_cpy_1) +forcpy: + mov %o0, %g5 ! save des address for return val + cmp %o2, 17 ! for small counts copy bytes + bleu,pt %ncc, .dbytecp + nop + + cmp %o2, 0x80 ! For lengths of 128 bytes or less, no + bleu,pn %ncc, .no_blkcpy ! 
copy using ASI_BLK_INIT_ST_QUAD_LDD_P + + /* + * Make sure that source and destination buffers are 64 bytes apart. + * If they are not, do not use ASI_BLK_INIT_ST_QUAD_LDD_P asi to copy + * the data. + */ + subcc %o1, %o0, %o3 + blu %ncc, .blkalgndst + cmp %o3, 0x40 ! if src - dst >= 0x40 + bgeu,pt %ncc, .blkalgndst ! then use ASI_BLK_INIT_ST_QUAD_LDD_P +.no_blkcpy: + andcc %o1, 3, %o5 ! is src word aligned + bz,pn %ncc, .aldst + cmp %o5, 2 ! is src half-word aligned + be,pt %ncc, .s2algn + cmp %o5, 3 ! src is byte aligned +.s1algn:ldub [%o1], %o3 ! move 1 or 3 bytes to align it + inc 1, %o1 + stb %o3, [%g5] ! move a byte to align src + inc 1, %g5 + bne,pt %ncc, .s2algn + dec %o2 + b .ald ! now go align dest + andcc %g5, 3, %o5 + +.s2algn:lduh [%o1], %o3 ! know src is 2 byte aligned + inc 2, %o1 + srl %o3, 8, %o4 + stb %o4, [%g5] ! have to do bytes, + stb %o3, [%g5 + 1] ! don't know dst alignment + inc 2, %g5 + dec 2, %o2 + +.aldst: andcc %g5, 3, %o5 ! align the destination address +.ald: bz,pn %ncc, .w4cp + cmp %o5, 2 + bz,pn %ncc, .w2cp + cmp %o5, 3 +.w3cp: lduw [%o1], %o4 + inc 4, %o1 + srl %o4, 24, %o5 + stb %o5, [%g5] + bne,pt %ncc, .w1cp + inc %g5 + dec 1, %o2 + andn %o2, 3, %o3 ! o3 is aligned word count + dec 4, %o3 ! avoid reading beyond tail of src + sub %o1, %g5, %o1 ! o1 gets the difference + +1: sll %o4, 8, %g1 ! save residual bytes + lduw [%o1+%g5], %o4 + deccc 4, %o3 + srl %o4, 24, %o5 ! merge with residual + or %o5, %g1, %g1 + st %g1, [%g5] + bnz,pt %ncc, 1b + inc 4, %g5 + sub %o1, 3, %o1 ! used one byte of last word read + and %o2, 3, %o2 + b 7f + inc 4, %o2 + +.w1cp: srl %o4, 8, %o5 + sth %o5, [%g5] + inc 2, %g5 + dec 3, %o2 + andn %o2, 3, %o3 ! o3 is aligned word count + dec 4, %o3 ! avoid reading beyond tail of src + sub %o1, %g5, %o1 ! o1 gets the difference + +2: sll %o4, 24, %g1 ! save residual bytes + lduw [%o1+%g5], %o4 + deccc 4, %o3 + srl %o4, 8, %o5 ! 
merge with residual + or %o5, %g1, %g1 + st %g1, [%g5] + bnz,pt %ncc, 2b + inc 4, %g5 + sub %o1, 1, %o1 ! used three bytes of last word read + and %o2, 3, %o2 + b 7f + inc 4, %o2 + +.w2cp: lduw [%o1], %o4 + inc 4, %o1 + srl %o4, 16, %o5 + sth %o5, [%g5] + inc 2, %g5 + dec 2, %o2 + andn %o2, 3, %o3 ! o3 is aligned word count + dec 4, %o3 ! avoid reading beyond tail of src + sub %o1, %g5, %o1 ! o1 gets the difference + +3: sll %o4, 16, %g1 ! save residual bytes + lduw [%o1+%g5], %o4 + deccc 4, %o3 + srl %o4, 16, %o5 ! merge with residual + or %o5, %g1, %g1 + st %g1, [%g5] + bnz,pt %ncc, 3b + inc 4, %g5 + sub %o1, 2, %o1 ! used two bytes of last word read + and %o2, 3, %o2 + b 7f + inc 4, %o2 + +.w4cp: andn %o2, 3, %o3 ! o3 is aligned word count + sub %o1, %g5, %o1 ! o1 gets the difference + +1: lduw [%o1+%g5], %o4 ! read from address + deccc 4, %o3 ! decrement count + st %o4, [%g5] ! write at destination address + bgu,pt %ncc, 1b + inc 4, %g5 ! increment to address + b 7f + and %o2, 3, %o2 ! number of leftover bytes, if any + + ! + ! differenced byte copy, works with any alignment + ! +.dbytecp: + b 7f + sub %o1, %g5, %o1 ! o1 gets the difference + +4: stb %o4, [%g5] ! write to address + inc %g5 ! inc to address +7: deccc %o2 ! decrement count + bgeu,a,pt %ncc,4b ! loop till done + ldub [%o1+%g5], %o4 ! read from address + retl ! %o0 was preserved + nop + +.blkalgndst: + save %sp, -SA(MINFRAME), %sp + +#ifdef NIAGARA2_IMPL + rd %fprs, %l7 ! save orig %fprs into %l7 + + ! if fprs.fef == 0, set it. Checking it, requires 2 instructions. + ! So set it anyway, without checking. + wr %g0, 0x4, %fprs ! fprs.fef = 1 +#endif /* NIAGARA2_IMPL */ + + ! Block (64 bytes) align the destination. + andcc %i0, 0x3f, %i3 ! is dst block aligned + bz %ncc, .chksrc ! dst already block aligned + sub %i3, 0x40, %i3 + neg %i3 ! bytes till dst 64 bytes aligned + sub %i2, %i3, %i2 ! update i2 with new count + + ! Based on source and destination alignment do + ! 
either 8 bytes, 4 bytes, 2 bytes or byte copy. + + ! Is dst & src 8B aligned + or %i0, %i1, %o2 + andcc %o2, 0x7, %g0 + bz %ncc, .alewdcp + nop + + ! Is dst & src 4B aligned + andcc %o2, 0x3, %g0 + bz %ncc, .alwdcp + nop + + ! Is dst & src 2B aligned + andcc %o2, 0x1, %g0 + bz %ncc, .alhlfwdcp + nop + + ! 1B aligned +1: ldub [%i1], %o2 + stb %o2, [%i0] + inc %i1 + deccc %i3 + bgu,pt %ncc, 1b + inc %i0 + + ba .chksrc + nop + + ! dst & src 4B aligned +.alwdcp: + ld [%i1], %o2 + st %o2, [%i0] + add %i1, 0x4, %i1 + subcc %i3, 0x4, %i3 + bgu,pt %ncc, .alwdcp + add %i0, 0x4, %i0 + + ba .chksrc + nop + + ! dst & src 2B aligned +.alhlfwdcp: + lduh [%i1], %o2 + stuh %o2, [%i0] + add %i1, 0x2, %i1 + subcc %i3, 0x2, %i3 + bgu,pt %ncc, .alhlfwdcp + add %i0, 0x2, %i0 + + ba .chksrc + nop + + ! dst & src 8B aligned +.alewdcp: + ldx [%i1], %o2 + stx %o2, [%i0] + add %i1, 0x8, %i1 + subcc %i3, 0x8, %i3 + bgu,pt %ncc, .alewdcp + add %i0, 0x8, %i0 + + ! Now Destination is block (64 bytes) aligned +.chksrc: + andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size + sub %i2, %i3, %i2 ! Residue bytes in %i2 + + mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi + +#ifdef NIAGARA2_IMPL + andn %i1, 0x3f, %l0 ! %l0 has block aligned src address + prefetch [%l0+0x0], #one_read + andcc %i1, 0x3f, %g0 ! is src 64B aligned + bz,pn %ncc, .blkcpy + nop + + ! handle misaligned source cases + alignaddr %i1, %g0, %g0 ! generate %gsr + + srl %i1, 0x3, %l1 ! src add bits 3, 4, 5 are now least + ! significant in %l1 + andcc %l1, 0x7, %l2 ! mask everything except bits 0, 1, 2 + add %i1, %i3, %i1 + + ! switch statement to get to right 8 byte block within + ! 
64 byte block + cmp %l2, 0x4 + bgeu,a hlf + cmp %l2, 0x6 + cmp %l2, 0x2 + bgeu,a sqtr + nop + cmp %l2, 0x1 + be,a off15 + nop + ba off7 + nop +sqtr: + be,a off23 + nop + ba,a off31 + nop + +hlf: + bgeu,a fqtr + nop + cmp %l2, 0x5 + be,a off47 + nop + ba off39 + nop +fqtr: + be,a off55 + nop + + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +7: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_56_63 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 7b + prefetch [%l0+0x80], #one_read + ba .blkdone + membar #Sync + +off7: + ldda [%l0]ASI_BLK_P, %d0 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +0: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_1_7 + fmovd %d16, %d0 + fmovd %d18, %d2 + fmovd %d20, %d4 + fmovd %d22, %d6 + fmovd %d24, %d8 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 0b + prefetch [%l0+0x80], #one_read + ba .blkdone + membar #Sync + +off15: + ldd [%l0+0x8], %d2 + ldd [%l0+0x10], %d4 + ldd [%l0+0x18], %d6 + ldd [%l0+0x20], %d8 + ldd [%l0+0x28], %d10 + ldd [%l0+0x30], %d12 + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +1: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! 
initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_8_15 + fmovd %d18, %d2 + fmovd %d20, %d4 + fmovd %d22, %d6 + fmovd %d24, %d8 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 1b + prefetch [%l0+0x80], #one_read + ba .blkdone + membar #Sync + +off23: + ldd [%l0+0x10], %d4 + ldd [%l0+0x18], %d6 + ldd [%l0+0x20], %d8 + ldd [%l0+0x28], %d10 + ldd [%l0+0x30], %d12 + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +2: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_16_23 + fmovd %d20, %d4 + fmovd %d22, %d6 + fmovd %d24, %d8 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 2b + prefetch [%l0+0x80], #one_read + ba .blkdone + membar #Sync + +off31: + ldd [%l0+0x18], %d6 + ldd [%l0+0x20], %d8 + ldd [%l0+0x28], %d10 + ldd [%l0+0x30], %d12 + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +3: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_24_31 + fmovd %d22, %d6 + fmovd %d24, %d8 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 3b + prefetch [%l0+0x80], #one_read + ba .blkdone + membar #Sync + +off39: + ldd [%l0+0x20], %d8 + ldd [%l0+0x28], %d10 + ldd [%l0+0x30], %d12 + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +4: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! 
initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_32_39 + fmovd %d24, %d8 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 4b + prefetch [%l0+0x80], #one_read + ba .blkdone + membar #Sync + +off47: + ldd [%l0+0x28], %d10 + ldd [%l0+0x30], %d12 + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +5: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_40_47 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 5b + prefetch [%l0+0x80], #one_read + ba .blkdone + membar #Sync + +off55: + ldd [%l0+0x30], %d12 + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +6: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_48_55 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 6b + prefetch [%l0+0x80], #one_read + ba .blkdone + membar #Sync + +.blkcpy: + prefetch [%i1+0x40], #one_read + prefetch [%i1+0x80], #one_read +8: + stxa %g0, [%i0]%asi ! initialize the cache line + ldda [%i1]ASI_BLK_P, %d0 + stda %d0, [%i0]ASI_BLK_P + + add %i1, 0x40, %i1 + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 8b + prefetch [%i1+0x80], #one_read + membar #Sync + +.blkdone: +#else /* NIAGARA2_IMPL */ + andcc %i1, 0xf, %l1 ! is src quadword aligned + bz,pn %ncc, .blkcpy ! src offset in %l1 + nop + cmp %l1, 0x8 + bgu %ncc, .cpy_upper_double + nop + blu %ncc, .cpy_lower_double + nop + + ! Falls through when source offset is equal to 8 i.e. + ! source is double word aligned. + ! In this case no shift/merge of data is required + sub %i1, %l1, %i1 ! align the src at 16 bytes. + andn %i1, 0x3f, %o0 ! 
%o0 has block aligned source + prefetch [%o0+0x0], #one_read + ldda [%i1+0x0]%asi, %o2 +loop0: + ldda [%i1+0x10]%asi, %o4 + prefetch [%o0+0x40], #one_read + + stxa %o3, [%i0+0x0]%asi + stxa %o4, [%i0+0x8]%asi + + ldda [%i1+0x20]%asi, %o2 + stxa %o5, [%i0+0x10]%asi + stxa %o2, [%i0+0x18]%asi + + ldda [%i1+0x30]%asi, %o4 + stxa %o3, [%i0+0x20]%asi + stxa %o4, [%i0+0x28]%asi + + ldda [%i1+0x40]%asi, %o2 + stxa %o5, [%i0+0x30]%asi + stxa %o2, [%i0+0x38]%asi + + add %o0, 0x40, %o0 + add %i1, 0x40, %i1 + subcc %i3, 0x40, %i3 + bgu,pt %ncc, loop0 + add %i0, 0x40, %i0 + ba .blkdone + add %i1, %l1, %i1 ! increment the source by src offset + +.cpy_lower_double: + sub %i1, %l1, %i1 ! align the src at 16 bytes. + sll %l1, 3, %l2 ! %l2 left shift + mov 0x40, %l3 + sub %l3, %l2, %l3 ! %l3 right shift = (64 - left shift) + andn %i1, 0x3f, %o0 ! %o0 has block aligned source + prefetch [%o0+0x0], #one_read + ldda [%i1+0x0]%asi, %o2 ! partial data in %o2 and %o3 has + ! complete data +loop1: + ldda [%i1+0x10]%asi, %o4 ! %o4 has partial data for this read. + ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1) ! merge %o2, %o3 and %o4 + ! into %o2 and %o3 + prefetch [%o0+0x40], #one_read + stxa %o2, [%i0+0x0]%asi + stxa %o3, [%i0+0x8]%asi + + ldda [%i1+0x20]%asi, %o2 + ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1) ! merge %o2 with %o5 and + stxa %o4, [%i0+0x10]%asi ! %o4 from previous read + stxa %o5, [%i0+0x18]%asi ! into %o4 and %o5 + + ! Repeat the same for next 32 bytes. + + ldda [%i1+0x30]%asi, %o4 + ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1) + stxa %o2, [%i0+0x20]%asi + stxa %o3, [%i0+0x28]%asi + + ldda [%i1+0x40]%asi, %o2 + ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1) + stxa %o4, [%i0+0x30]%asi + stxa %o5, [%i0+0x38]%asi + + add %o0, 0x40, %o0 + add %i1, 0x40, %i1 + subcc %i3, 0x40, %i3 + bgu,pt %ncc, loop1 + add %i0, 0x40, %i0 + ba .blkdone + add %i1, %l1, %i1 ! increment the source by src offset + +.cpy_upper_double: + sub %i1, %l1, %i1 ! align the src at 16 bytes. 
+ mov 0x8, %l2 + sub %l1, %l2, %l2 + sll %l2, 3, %l2 ! %l2 left shift + mov 0x40, %l3 + sub %l3, %l2, %l3 ! %l3 right shift = (64 - left shift) + andn %i1, 0x3f, %o0 ! %o0 has block aligned source + prefetch [%o0+0x0], #one_read + ldda [%i1+0x0]%asi, %o2 ! partial data in %o3 for this read and + ! no data in %o2 +loop2: + ldda [%i1+0x10]%asi, %o4 ! %o4 has complete data and %o5 has + ! partial + ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1) ! merge %o3, %o4 and %o5 + ! into %o3 and %o4 + prefetch [%o0+0x40], #one_read + stxa %o3, [%i0+0x0]%asi + stxa %o4, [%i0+0x8]%asi + + ldda [%i1+0x20]%asi, %o2 + ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1) ! merge %o2 and %o3 with + stxa %o5, [%i0+0x10]%asi ! %o5 from previous read + stxa %o2, [%i0+0x18]%asi ! into %o5 and %o2 + + ! Repeat the same for next 32 bytes. + + ldda [%i1+0x30]%asi, %o4 + ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1) + stxa %o3, [%i0+0x20]%asi + stxa %o4, [%i0+0x28]%asi + + ldda [%i1+0x40]%asi, %o2 + ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1) + stxa %o5, [%i0+0x30]%asi + stxa %o2, [%i0+0x38]%asi + + add %o0, 0x40, %o0 + add %i1, 0x40, %i1 + subcc %i3, 0x40, %i3 + bgu,pt %ncc, loop2 + add %i0, 0x40, %i0 + ba .blkdone + add %i1, %l1, %i1 ! increment the source by src offset + + + ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P +.blkcpy: + andn %i1, 0x3f, %o0 ! %o0 has block aligned source + prefetch [%o0+0x0], #one_read +1: + prefetch [%o0+0x40], #one_read + + ldda [%i1+0x0]%asi, %o2 + ldda [%i1+0x10]%asi, %o4 + + stxa %o2, [%i0+0x0]%asi + stxa %o3, [%i0+0x8]%asi + stxa %o4, [%i0+0x10]%asi + stxa %o5, [%i0+0x18]%asi + + ldda [%i1+0x20]%asi, %o2 + ldda [%i1+0x30]%asi, %o4 + + stxa %o2, [%i0+0x20]%asi + stxa %o3, [%i0+0x28]%asi + stxa %o4, [%i0+0x30]%asi + stxa %o5, [%i0+0x38]%asi + + add %o0, 0x40, %o0 + add %i1, 0x40, %i1 + subcc %i3, 0x40, %i3 + bgu,pt %ncc, 1b + add %i0, 0x40, %i0 + +.blkdone: + membar #Sync +#endif /* NIAGARA2_IMPL */ + + mov ASI_PNF, %asi ! restore %asi to default + ! 
ASI_PRIMARY_NOFAULT value + tst %i2 + bz,pt %ncc, .blkexit + nop + + ! Handle trailing bytes + cmp %i2, 0x8 + blu,pt %ncc, .residue + nop + + ! Can we do some 8B ops + or %i1, %i0, %o2 + andcc %o2, 0x7, %g0 + bnz %ncc, .last4 + nop + + ! Do 8byte ops as long as possible +.last8: + ldx [%i1], %o2 + stx %o2, [%i0] + add %i1, 0x8, %i1 + sub %i2, 0x8, %i2 + cmp %i2, 0x8 + bgu,pt %ncc, .last8 + add %i0, 0x8, %i0 + + tst %i2 + bz,pt %ncc, .blkexit + nop + + ba .residue + nop + +.last4: + ! Can we do 4B ops + andcc %o2, 0x3, %g0 + bnz %ncc, .last2 + nop +1: + ld [%i1], %o2 + st %o2, [%i0] + add %i1, 0x4, %i1 + sub %i2, 0x4, %i2 + cmp %i2, 0x4 + bgu,pt %ncc, 1b + add %i0, 0x4, %i0 + + cmp %i2, 0 + bz,pt %ncc, .blkexit + nop + + ba .residue + nop + +.last2: + ! Can we do 2B ops + andcc %o2, 0x1, %g0 + bnz %ncc, .residue + nop + +1: + lduh [%i1], %o2 + stuh %o2, [%i0] + add %i1, 0x2, %i1 + sub %i2, 0x2, %i2 + cmp %i2, 0x2 + bgu,pt %ncc, 1b + add %i0, 0x2, %i0 + + cmp %i2, 0 + bz,pt %ncc, .blkexit + nop + +.residue: + ldub [%i1], %o2 + stb %o2, [%i0] + inc %i1 + deccc %i2 + bgu,pt %ncc, .residue + inc %i0 + +.blkexit: +#ifdef NIAGARA2_IMPL + and %l7, 0x4, %l7 ! fprs.du = fprs.dl = 0 + wr %l7, %g0, %fprs ! fprs = %l7 - restore fprs.fef +#endif /* NIAGARA2_IMPL */ + ret + restore %g5, %g0, %o0 + SET_SIZE(memcpy) + SET_SIZE(__align_cpy_1)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/lib/libc_psr/sun4v/common/memset.s Thu Apr 19 21:55:12 2007 -0700 @@ -0,0 +1,250 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +.ident "%Z%%M% %I% %E% SMI" + + .file "memset.s" +/* + * char *memset(sp, c, n) + * + * Set an array of n chars starting at sp to the character c. + * Return sp. + * + * Fast assembler language version of the following C-program for memset + * which represents the `standard' for the C-library. + * + * void * + * memset(void *sp1, int c, size_t n) + * { + * if (n != 0) { + * char *sp = sp1; + * do { + * *sp++ = (char)c; + * } while (--n != 0); + * } + * return (sp1); + * } + * + * Flow : + * + * For small 6 or fewer bytes stores, bytes will be stored. + * + * For less than 32 bytes stores, align the address on 4 byte boundary. + * Then store as many 4-byte chunks, followed by trailing bytes. + * + * For sizes of 32 bytes or more, align the address on 8 byte boundary. 
+ * if (count >= 64) { + * store as many 8-bytes chunks to block align the address + * store using ASI_BLK_INIT_ST_QUAD_LDD_P + * } + * Store as many 8-byte chunks, followed by trailing bytes. + * + */ + +#include <sys/asm_linkage.h> +#include <sys/niagaraasi.h> +#include <sys/asi.h> + + ANSI_PRAGMA_WEAK(memset,function) + +#include "synonyms.h" + + .section ".text" + .align 32 + + ENTRY(memset) + + mov %o0, %o5 ! copy sp1 before using it + cmp %o2, 7 ! if small counts, just write bytes + blu,pn %ncc, .wrchar + and %o1, 0xff, %o1 ! o1 is (char)c + + sll %o1, 8, %o3 + or %o1, %o3, %o1 ! now o1 has 2 bytes of c + sll %o1, 16, %o3 + + cmp %o2, 0x20 + blu,pn %ncc, .wdalign + or %o1, %o3, %o1 ! now o1 has 4 bytes of c + + sllx %o1, 32, %o3 + or %o1, %o3, %o1 ! now o1 has 8 bytes of c + +.dbalign: + andcc %o5, 7, %o3 ! is sp1 aligned on a 8 byte bound + bz,pt %ncc, .blkalign ! already double aligned + sub %o3, 8, %o3 ! -(bytes till double aligned) + add %o2, %o3, %o2 ! update o2 with new count + + ! Set -(%o3) bytes till sp1 double aligned +1: stb %o1, [%o5] ! there is at least 1 byte to set + inccc %o3 ! byte clearing loop + bl,pt %ncc, 1b + inc %o5 + + ! Now sp1 is double aligned (sp1 is found in %o5) +.blkalign: + mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi + + cmp %o2, 0x40 ! check if there are 64 bytes to set + blu,pn %ncc, 5f + mov %o2, %o3 + + andcc %o5, 63, %o3 ! is sp1 block aligned? + bz,pt %ncc, .blkwr ! now block aligned + sub %o3, 64, %o3 ! o3 is -(bytes till block aligned) + add %o2, %o3, %o2 ! o2 is the remainder + + ! Store -(%o3) bytes till dst is block (64 byte) aligned. + ! Use double word stores. + ! Recall that dst is already double word aligned +1: + stx %o1, [%o5] + addcc %o3, 8, %o3 + bl,pt %ncc, 1b + add %o5, 8, %o5 + + ! Now sp1 is block aligned +.blkwr: + and %o2, 63, %o3 ! calc bytes left after blk store. + andn %o2, 63, %o4 ! calc size of blocks in bytes + + cmp %o4, 0x100 ! 
check if there are 256 bytes to set + blu,pn %ncc, 3f + nop +2: + stxa %o1, [%o5+0x0]%asi + stxa %o1, [%o5+0x40]%asi + stxa %o1, [%o5+0x80]%asi + stxa %o1, [%o5+0xc0]%asi + + stxa %o1, [%o5+0x8]%asi + stxa %o1, [%o5+0x10]%asi + stxa %o1, [%o5+0x18]%asi + stxa %o1, [%o5+0x20]%asi + stxa %o1, [%o5+0x28]%asi + stxa %o1, [%o5+0x30]%asi + stxa %o1, [%o5+0x38]%asi + + stxa %o1, [%o5+0x48]%asi + stxa %o1, [%o5+0x50]%asi + stxa %o1, [%o5+0x58]%asi + stxa %o1, [%o5+0x60]%asi + stxa %o1, [%o5+0x68]%asi + stxa %o1, [%o5+0x70]%asi + stxa %o1, [%o5+0x78]%asi + + stxa %o1, [%o5+0x88]%asi + stxa %o1, [%o5+0x90]%asi + stxa %o1, [%o5+0x98]%asi + stxa %o1, [%o5+0xa0]%asi + stxa %o1, [%o5+0xa8]%asi + stxa %o1, [%o5+0xb0]%asi + stxa %o1, [%o5+0xb8]%asi + + stxa %o1, [%o5+0xc8]%asi + stxa %o1, [%o5+0xd0]%asi + stxa %o1, [%o5+0xd8]%asi + stxa %o1, [%o5+0xe0]%asi + stxa %o1, [%o5+0xe8]%asi + stxa %o1, [%o5+0xf0]%asi + stxa %o1, [%o5+0xf8]%asi + + sub %o4, 0x100, %o4 + cmp %o4, 0x100 + bgu,pt %ncc, 2b + add %o5, 0x100, %o5 + +3: + cmp %o4, 0x40 ! check if 64 bytes to set + blu %ncc, 5f + nop +4: + stxa %o1, [%o5+0x0]%asi + stxa %o1, [%o5+0x8]%asi + stxa %o1, [%o5+0x10]%asi + stxa %o1, [%o5+0x18]%asi + stxa %o1, [%o5+0x20]%asi + stxa %o1, [%o5+0x28]%asi + stxa %o1, [%o5+0x30]%asi + stxa %o1, [%o5+0x38]%asi + + subcc %o4, 0x40, %o4 + bgu,pt %ncc, 4b + add %o5, 0x40, %o5 + +5: + ! Set the remaining doubles + membar #Sync + mov ASI_PNF, %asi ! restore %asi to default + ! ASI_PRIMARY_NOFAULT value + subcc %o3, 8, %o3 ! Can we store any doubles? + blu,pn %ncc, .wrchar + and %o2, 7, %o2 ! calc bytes left after doubles + +6: + stx %o1, [%o5] ! store the doubles + subcc %o3, 8, %o3 + bgeu,pt %ncc, 6b + add %o5, 8, %o5 + + ba .wrchar + nop + +.wdalign: + andcc %o5, 3, %o3 ! is sp1 aligned on a word boundary + bz,pn %ncc, .wrword + andn %o2, 3, %o3 ! create word sized count in %o3 + + dec %o2 ! decrement count + stb %o1, [%o5] ! clear a byte + b .wdalign + inc %o5 ! 
next byte + +.wrword: + st %o1, [%o5] ! 4-byte writing loop + subcc %o3, 4, %o3 + bnz,pt %ncc, .wrword + inc 4, %o5 + + and %o2, 3, %o2 ! leftover count, if any + +.wrchar: + ! Set the remaining bytes, if any + cmp %o2, 0 + be %ncc, .exit + nop + +7: + deccc %o2 + stb %o1, [%o5] + bgu,pt %ncc, 7b + inc %o5 + +.exit: + retl ! %o0 was preserved + nop + + SET_SIZE(memset)
--- a/usr/src/lib/libc_psr/sun4v_hwcap1/Makefile.com Thu Apr 19 21:15:35 2007 -0700
+++ b/usr/src/lib/libc_psr/sun4v_hwcap1/Makefile.com Thu Apr 19 21:55:12 2007 -0700
@@ -19,7 +19,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 # Use is subject to license terms.
 #
 # ident "%Z%%M% %I% %E% SMI"
@@ -47,6 +47,6 @@
 #
 # build rules
 #
-pics/%.o: ../../$(ALT_PLAT)/common/%.s
+pics/%.o: ../../$(PLATFORM)/common/%.s
 	$(AS) $(ASFLAGS) $< -o $@
 	$(POST_PROCESS_O)
--- a/usr/src/lib/libc_psr/sun4v_hwcap1/common/memcpy.s Thu Apr 19 21:15:35 2007 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,619 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -.ident "%Z%%M% %I% %E% SMI" - - .file "memcpy.s" -/* - * memcpy(s1, s2, len) - * - * Copy s2 to s1, always copy n bytes. - * Note: this does not work for overlapped copies, bcopy() does - * - * Fast assembler language version of the following C-program for memcpy - * which represents the `standard' for the C-library. 
- * - * void * - * memcpy(void *s, const void *s0, size_t n) - * { - * if (n != 0) { - * char *s1 = s; - * const char *s2 = s0; - * do { - * *s1++ = *s2++; - * } while (--n != 0); - * } - * return ( s ); - * } - * - * Flow : - * - * if (count < 17) { - * Do the byte copy - * Return destination address - * } - * if (count < 128) { - * Is source aligned on word boundary - * If no then align source on word boundary then goto .ald - * If yes goto .ald - * .ald: - * Is destination aligned on word boundary - * Depending on destination offset (last 2 bits of destination) - * copy data by shifting and merging. - * Copy residue bytes as byte copy - * Return destination address - * } else { - * Align destination on block boundary - * Depending on the source offset (last 4 bits of source address) align - * the data and store to destination. Both the load and store are done - * using ASI_BLK_INIT_ST_QUAD_LDD_P. - * For remaining count copy as much data in 8-byte chunk from source to - * destination. - * Followed by trailing copy using byte copy. - * Return saved destination address - * } - * - */ - -#include <sys/asm_linkage.h> -#include <sys/niagaraasi.h> -#include <sys/asi.h> -#include <sys/trap.h> - -/* - * This define is to align data for the unaligned source cases. - * The data1, data2 and data3 is merged into data1 and data2. - * The data3 is preserved for next merge. - */ -#define ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp) \ - sllx data1, lshift, data1 ;\ - srlx data2, rshift, tmp ;\ - or data1, tmp, data1 ;\ - sllx data2, lshift, data2 ;\ - srlx data3, rshift, tmp ;\ - or data2, tmp, data2 -/* - * Align the data. Merge the data1 and data2 into data1. - */ -#define ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp) \ - sllx data1, lshift, data1 ;\ - srlx data2, rshift, tmp ;\ - or data1, tmp, data1 -/* - * Align the data in case of backward copy. 
- */ -#define ALIGN_DATA_BC(data1, data2, rshift, lshift, tmp) \ - srlx data1, rshift, data1 ;\ - sllx data2, lshift, tmp ;\ - or data1, tmp, data1 - - ANSI_PRAGMA_WEAK(memmove,function) - ANSI_PRAGMA_WEAK(memcpy,function) - -#include "synonyms.h" - - ENTRY(memmove) - cmp %o1, %o0 ! if from address is >= to use forward copy - bgeu %ncc, forcpy ! else use backward if ... - sub %o0, %o1, %o4 ! get difference of two addresses - cmp %o2, %o4 ! compare size and difference of addresses - bleu %ncc, forcpy ! if size is bigger, do overlapped copy - nop - - ! - ! an overlapped copy that must be done "backwards" - ! -.ovbc: - mov %o0, %o5 ! save des address for return val - add %o1, %o2, %o1 ! get to end of source space - add %o0, %o2, %o0 ! get to end of destination space - -.chksize: - cmp %o2, 0x20 - bgu,pn %ncc, .dbalign - nop - -.bytecp: - tst %o2 - bleu,a,pn %ncc, exitovbc - nop - -1: - dec %o0 ! decrement to address - dec %o1 ! decrement from address - ldub [%o1], %o4 - deccc %o2 - bgu,pt %ncc, 1b - stb %o4, [%o0] -exitovbc: - retl - mov %o5, %o0 - -.dbalign: - andcc %o0, 7, %o3 - bz %ncc, .dbbck - nop - ! %o3 has bytes till dst 8 bytes aligned - sub %o2, %o3, %o2 ! update o2 with new count -2: - dec %o1 - dec %o0 - ldub [%o1], %o4 - deccc %o3 - bgu,pt %ncc, 2b - stb %o4, [%o0] - - ! Now Destination is 8 byte aligned -.dbbck: - save %sp, -SA(MINFRAME), %sp - - andn %i2, 0x7, %i3 ! %i3 count is multiple of 8 bytes size - sub %i2, %i3, %i2 ! Residue bytes in %i2 - - andcc %i1, 7, %g1 ! is src aligned on 8 bytes - ! %g1 has src offset - bz %ncc, .dbcopybc - nop - - sll %g1, 3, %o1 ! left shift - mov 0x40, %g5 - sub %g5, %o1, %g5 ! right shift = (64 - left shift) - -.cpy_dbwdbc: - sub %i1, %g1, %i1 ! align the src at 8 bytes. - ldx [%i1], %o2 -2: - sub %i0, 0x8, %i0 - ldx [%i1-0x8], %o4 ! 
we are at the end - ALIGN_DATA_BC(%o2, %o4, %g5, %o1, %o3) - stx %o2, [%i0] - mov %o4, %o2 - subcc %i3, 0x8, %i3 - bgu,pt %ncc, 2b - sub %i1, 0x8, %i1 - ba .bytebc - add %i1, %g1, %i1 - -.dbcopybc: - sub %i1, 8, %i1 - sub %i0, 8, %i0 ! we are at the end - ldx [%i1], %o2 - stx %o2, [%i0] - subcc %i3, 0x8, %i3 - bgu,pt %ncc, .dbcopybc - nop - -.bytebc: - tst %i2 - bleu,a,pn %ncc, exitbc - nop - -1: - dec %i0 ! decrement to address - dec %i1 ! decrement from address - ldub [%i1], %i4 - deccc %i2 - bgu,pt %ncc, 1b - stb %i4, [%i0] -exitbc: - ret - restore %i5, %g0, %o0 - - SET_SIZE(memmove) - - - ENTRY(memcpy) - ENTRY(__align_cpy_1) -forcpy: - mov %o0, %g5 ! save des address for return val - cmp %o2, 17 ! for small counts copy bytes - bleu,pt %ncc, .dbytecp - nop - - cmp %o2, 0x80 ! For lengths less than 128 bytes no - bleu,pn %ncc, .no_blkcpy ! copy using ASI_BLK_INIT_ST_QUAD_LDD_P - - /* - * Make sure that source and destination buffers are 64 bytes apart. - * If they are not, do not use ASI_BLK_INIT_ST_QUAD_LDD_P asi to copy - * the data. - */ - subcc %o1, %o0, %o3 - blu %ncc, .blkalgndst - cmp %o3, 0x40 ! if src - dst >= 0x40 - bgeu,pt %ncc, .blkalgndst ! then use ASI_BLK_INIT_ST_QUAD_LDD_P -.no_blkcpy: - andcc %o1, 3, %o5 ! is src word aligned - bz,pn %ncc, .aldst - cmp %o5, 2 ! is src half-word aligned - be,pt %ncc, .s2algn - cmp %o5, 3 ! src is byte aligned -.s1algn:ldub [%o1], %o3 ! move 1 or 3 bytes to align it - inc 1, %o1 - stb %o3, [%g5] ! move a byte to align src - inc 1, %g5 - bne,pt %ncc, .s2algn - dec %o2 - b .ald ! now go align dest - andcc %g5, 3, %o5 - -.s2algn:lduh [%o1], %o3 ! know src is 2 byte alinged - inc 2, %o1 - srl %o3, 8, %o4 - stb %o4, [%g5] ! have to do bytes, - stb %o3, [%g5 + 1] ! don't know dst alingment - inc 2, %g5 - dec 2, %o2 - -.aldst: andcc %g5, 3, %o5 ! 
align the destination address -.ald: bz,pn %ncc, .w4cp - cmp %o5, 2 - bz,pn %ncc, .w2cp - cmp %o5, 3 -.w3cp: lduw [%o1], %o4 - inc 4, %o1 - srl %o4, 24, %o5 - stb %o5, [%g5] - bne,pt %ncc, .w1cp - inc %g5 - dec 1, %o2 - andn %o2, 3, %o3 ! o3 is aligned word count - dec 4, %o3 ! avoid reading beyond tail of src - sub %o1, %g5, %o1 ! o1 gets the difference - -1: sll %o4, 8, %g1 ! save residual bytes - lduw [%o1+%g5], %o4 - deccc 4, %o3 - srl %o4, 24, %o5 ! merge with residual - or %o5, %g1, %g1 - st %g1, [%g5] - bnz,pt %ncc, 1b - inc 4, %g5 - sub %o1, 3, %o1 ! used one byte of last word read - and %o2, 3, %o2 - b 7f - inc 4, %o2 - -.w1cp: srl %o4, 8, %o5 - sth %o5, [%g5] - inc 2, %g5 - dec 3, %o2 - andn %o2, 3, %o3 ! o3 is aligned word count - dec 4, %o3 ! avoid reading beyond tail of src - sub %o1, %g5, %o1 ! o1 gets the difference - -2: sll %o4, 24, %g1 ! save residual bytes - lduw [%o1+%g5], %o4 - deccc 4, %o3 - srl %o4, 8, %o5 ! merge with residual - or %o5, %g1, %g1 - st %g1, [%g5] - bnz,pt %ncc, 2b - inc 4, %g5 - sub %o1, 1, %o1 ! used three bytes of last word read - and %o2, 3, %o2 - b 7f - inc 4, %o2 - -.w2cp: lduw [%o1], %o4 - inc 4, %o1 - srl %o4, 16, %o5 - sth %o5, [%g5] - inc 2, %g5 - dec 2, %o2 - andn %o2, 3, %o3 ! o3 is aligned word count - dec 4, %o3 ! avoid reading beyond tail of src - sub %o1, %g5, %o1 ! o1 gets the difference - -3: sll %o4, 16, %g1 ! save residual bytes - lduw [%o1+%g5], %o4 - deccc 4, %o3 - srl %o4, 16, %o5 ! merge with residual - or %o5, %g1, %g1 - st %g1, [%g5] - bnz,pt %ncc, 3b - inc 4, %g5 - sub %o1, 2, %o1 ! used two bytes of last word read - and %o2, 3, %o2 - b 7f - inc 4, %o2 - -.w4cp: andn %o2, 3, %o3 ! o3 is aligned word count - sub %o1, %g5, %o1 ! o1 gets the difference - -1: lduw [%o1+%g5], %o4 ! read from address - deccc 4, %o3 ! decrement count - st %o4, [%g5] ! write at destination address - bgu,pt %ncc, 1b - inc 4, %g5 ! increment to address - b 7f - and %o2, 3, %o2 ! number of leftover bytes, if any - - ! - ! 
differenced byte copy, works with any alignment - ! -.dbytecp: - b 7f - sub %o1, %g5, %o1 ! o1 gets the difference - -4: stb %o4, [%g5] ! write to address - inc %g5 ! inc to address -7: deccc %o2 ! decrement count - bgeu,a,pt %ncc,4b ! loop till done - ldub [%o1+%g5], %o4 ! read from address - retl ! %o0 was preserved - nop - -.blkalgndst: - save %sp, -SA(MINFRAME), %sp - - ! Block (64 bytes) align the destination. - ! Do not know the alignement of src at this time. - ! Therefore using byte copy. - - andcc %i0, 0x3f, %i3 ! is dst block aligned - bz %ncc, .chksrc ! dst already block aligned - sub %i3, 0x40, %i3 - neg %i3 ! bytes till dst 64 bytes aligned - sub %i2, %i3, %i2 ! update i2 with new count - -1: ldub [%i1], %i4 - stb %i4, [%i0] - inc %i1 - deccc %i3 - bgu,pt %ncc, 1b - inc %i0 - - ! Now Destination is block (64 bytes) aligned -.chksrc: - andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size - sub %i2, %i3, %i2 ! Residue bytes in %i2 - - mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi - - andcc %i1, 0xf, %l1 ! is src quadword aligned - bz,pn %ncc, .blkcpy ! src offset in %l1 - nop - cmp %l1, 0x8 - bgu %ncc, .cpy_upper_double - nop - blu %ncc, .cpy_lower_double - nop - - ! Falls through when source offset is equal to 8 i.e. - ! source is double word aligned. - ! In this case no shift/merge of data is required - sub %i1, %l1, %i1 ! align the src at 16 bytes. - andn %i1, 0x3f, %o0 ! 
%o0 has block aligned source - prefetch [%o0+0x0], #one_read - ldda [%i1+0x0]%asi, %o2 -loop0: - ldda [%i1+0x10]%asi, %o4 - prefetch [%o0+0x40], #one_read - - stxa %o3, [%i0+0x0]%asi - stxa %o4, [%i0+0x8]%asi - - ldda [%i1+0x20]%asi, %o2 - stxa %o5, [%i0+0x10]%asi - stxa %o2, [%i0+0x18]%asi - - ldda [%i1+0x30]%asi, %o4 - stxa %o3, [%i0+0x20]%asi - stxa %o4, [%i0+0x28]%asi - - ldda [%i1+0x40]%asi, %o2 - stxa %o5, [%i0+0x30]%asi - stxa %o2, [%i0+0x38]%asi - - add %o0, 0x40, %o0 - add %i1, 0x40, %i1 - subcc %i3, 0x40, %i3 - bgu,pt %ncc, loop0 - add %i0, 0x40, %i0 - ba .blkdone - add %i1, %l1, %i1 ! increment the source by src offset - -.cpy_lower_double: - sub %i1, %l1, %i1 ! align the src at 16 bytes. - sll %l1, 3, %l2 ! %l2 left shift - mov 0x40, %l3 - sub %l3, %l2, %l3 ! %l3 right shift = (64 - left shift) - andn %i1, 0x3f, %o0 ! %o0 has block aligned source - prefetch [%o0+0x0], #one_read - ldda [%i1+0x0]%asi, %o2 ! partial data in %o2 and %o3 has - ! complete data -loop1: - ldda [%i1+0x10]%asi, %o4 ! %o4 has partial data for this read. - ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1) ! merge %o2, %o3 and %o4 - ! into %o2 and %o3 - prefetch [%o0+0x40], #one_read - stxa %o2, [%i0+0x0]%asi - stxa %o3, [%i0+0x8]%asi - - ldda [%i1+0x20]%asi, %o2 - ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1) ! merge %o2 with %o5 and - stxa %o4, [%i0+0x10]%asi ! %o4 from previous read - stxa %o5, [%i0+0x18]%asi ! into %o4 and %o5 - - ! Repeat the same for next 32 bytes. - - ldda [%i1+0x30]%asi, %o4 - ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1) - stxa %o2, [%i0+0x20]%asi - stxa %o3, [%i0+0x28]%asi - - ldda [%i1+0x40]%asi, %o2 - ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1) - stxa %o4, [%i0+0x30]%asi - stxa %o5, [%i0+0x38]%asi - - add %o0, 0x40, %o0 - add %i1, 0x40, %i1 - subcc %i3, 0x40, %i3 - bgu,pt %ncc, loop1 - add %i0, 0x40, %i0 - ba .blkdone - add %i1, %l1, %i1 ! increment the source by src offset - -.cpy_upper_double: - sub %i1, %l1, %i1 ! align the src at 16 bytes. 
- mov 0x8, %l2 - sub %l1, %l2, %l2 - sll %l2, 3, %l2 ! %l2 left shift - mov 0x40, %l3 - sub %l3, %l2, %l3 ! %l3 right shift = (64 - left shift) - andn %i1, 0x3f, %o0 ! %o0 has block aligned source - prefetch [%o0+0x0], #one_read - ldda [%i1+0x0]%asi, %o2 ! partial data in %o3 for this read and - ! no data in %o2 -loop2: - ldda [%i1+0x10]%asi, %o4 ! %o4 has complete data and %o5 has - ! partial - ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1) ! merge %o3, %o4 and %o5 - ! into %o3 and %o4 - prefetch [%o0+0x40], #one_read - stxa %o3, [%i0+0x0]%asi - stxa %o4, [%i0+0x8]%asi - - ldda [%i1+0x20]%asi, %o2 - ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1) ! merge %o2 and %o3 with - stxa %o5, [%i0+0x10]%asi ! %o5 from previous read - stxa %o2, [%i0+0x18]%asi ! into %o5 and %o2 - - ! Repeat the same for next 32 bytes. - - ldda [%i1+0x30]%asi, %o4 - ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1) - stxa %o3, [%i0+0x20]%asi - stxa %o4, [%i0+0x28]%asi - - ldda [%i1+0x40]%asi, %o2 - ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1) - stxa %o5, [%i0+0x30]%asi - stxa %o2, [%i0+0x38]%asi - - add %o0, 0x40, %o0 - add %i1, 0x40, %i1 - subcc %i3, 0x40, %i3 - bgu,pt %ncc, loop2 - add %i0, 0x40, %i0 - ba .blkdone - add %i1, %l1, %i1 ! increment the source by src offset - - - ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P -.blkcpy: - andn %i1, 0x3f, %o0 ! %o0 has block aligned source - prefetch [%o0+0x0], #one_read -1: - prefetch [%o0+0x40], #one_read - - ldda [%i1+0x0]%asi, %o2 - ldda [%i1+0x10]%asi, %o4 - - stxa %o2, [%i0+0x0]%asi - stxa %o3, [%i0+0x8]%asi - stxa %o4, [%i0+0x10]%asi - stxa %o5, [%i0+0x18]%asi - - ldda [%i1+0x20]%asi, %o2 - ldda [%i1+0x30]%asi, %o4 - - stxa %o2, [%i0+0x20]%asi - stxa %o3, [%i0+0x28]%asi - stxa %o4, [%i0+0x30]%asi - stxa %o5, [%i0+0x38]%asi - - add %o0, 0x40, %o0 - add %i1, 0x40, %i1 - subcc %i3, 0x40, %i3 - bgu,pt %ncc, 1b - add %i0, 0x40, %i0 - -.blkdone: - membar #Sync - mov ASI_PNF, %asi ! restore %asi to default - ! ASI_PRIMARY_NOFAULT value - - ! 
Copy as much rest of the data as double word copy. -.cpy_wd: - cmp %i2, 0x8 - blu %ncc, .dbdone ! Not enough bytes to copy as double - nop - - andn %i2, 0x7, %i3 ! %i3 count is multiple of 8 bytes size - sub %i2, %i3, %i2 ! Residue bytes in %i2 - - andcc %i1, 7, %l1 ! is src aligned on a 8 bytes - bz %ncc, .dbcopy - nop - - sll %l1, 3, %l2 ! left shift - mov 0x40, %l3 - sub %l3, %l2, %l3 ! right shift = (64 - left shift) - -.copy_wd: - sub %i1, %l1, %i1 ! align the src at 8 bytes. - ldx [%i1], %o2 -2: - ldx [%i1+8], %o4 - ALIGN_DATA_EW(%o2, %o4, %l2, %l3, %o3) - stx %o2, [%i0] - mov %o4, %o2 - add %i1, 0x8, %i1 - subcc %i3, 0x8, %i3 - bgu,pt %ncc, 2b - add %i0, 0x8, %i0 - ba .dbdone - add %i1, %l1, %i1 - -.dbcopy: - ldx [%i1], %o2 - stx %o2, [%i0] - add %i1, 0x8, %i1 - subcc %i3, 0x8, %i3 - bgu,pt %ncc, .dbcopy - add %i0, 0x8, %i0 - -.dbdone: - tst %i2 - bz,pt %ncc, .blkexit - nop - -.residue: - ldub [%i1], %i4 - stb %i4, [%i0] - inc %i1 - deccc %i2 - bgu,pt %ncc, .residue - inc %i0 - -.blkexit: - ret - restore %g5, %g0, %o0 - SET_SIZE(memcpy) - SET_SIZE(__align_cpy_1)
--- a/usr/src/lib/libc_psr/sun4v_hwcap1/common/memset.s Thu Apr 19 21:15:35 2007 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,250 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -.ident "%Z%%M% %I% %E% SMI" - - .file "memset.s" -/* - * char *memset(sp, c, n) - * - * Set an array of n chars starting at sp to the character c. - * Return sp. - * - * Fast assembler language version of the following C-program for memset - * which represents the `standard' for the C-library. - * - * void * - * memset(void *sp1, int c, size_t n) - * { - * if (n != 0) { - * char *sp = sp1; - * do { - * *sp++ = (char)c; - * } while (--n != 0); - * } - * return (sp1); - * } - * - * Flow : - * - * For small 6 or fewer bytes stores, bytes will be stored. - * - * For less than 32 bytes stores, align the address on 4 byte boundary. - * Then store as many 4-byte chunks, followed by trailing bytes. - * - * For sizes greater than 32 bytes, align the address on 8 byte boundary. 
- * if (count > 64) { - * store as many 8-bytes chunks to block align the address - * store using ASI_BLK_INIT_ST_QUAD_LDD_P - * } - * Store as many 8-byte chunks, followed by trialing bytes. - * - */ - -#include <sys/asm_linkage.h> -#include <sys/niagaraasi.h> -#include <sys/asi.h> - - ANSI_PRAGMA_WEAK(memset,function) - -#include "synonyms.h" - - .section ".text" - .align 32 - - ENTRY(memset) - - mov %o0, %o5 ! copy sp1 before using it - cmp %o2, 7 ! if small counts, just write bytes - blu,pn %ncc, .wrchar - and %o1, 0xff, %o1 ! o1 is (char)c - - sll %o1, 8, %o3 - or %o1, %o3, %o1 ! now o1 has 2 bytes of c - sll %o1, 16, %o3 - - cmp %o2, 0x20 - blu,pn %ncc, .wdalign - or %o1, %o3, %o1 ! now o1 has 4 bytes of c - - sllx %o1, 32, %o3 - or %o1, %o3, %o1 ! now o1 has 8 bytes of c - -.dbalign: - andcc %o5, 7, %o3 ! is sp1 aligned on a 8 byte bound - bz,pt %ncc, .blkalign ! already double aligned - sub %o3, 8, %o3 ! -(bytes till double aligned) - add %o2, %o3, %o2 ! update o2 with new count - - ! Set -(%o3) bytes till sp1 double aligned -1: stb %o1, [%o5] ! there is at least 1 byte to set - inccc %o3 ! byte clearing loop - bl,pt %ncc, 1b - inc %o5 - - ! Now sp1 is double aligned (sp1 is found in %o5) -.blkalign: - mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi - - cmp %o2, 0x40 ! check if there are 64 bytes to set - blu,pn %ncc, 5f - mov %o2, %o3 - - andcc %o5, 63, %o3 ! is sp1 block aligned? - bz,pt %ncc, .blkwr ! now block aligned - sub %o3, 64, %o3 ! o3 is -(bytes till block aligned) - add %o2, %o3, %o2 ! o2 is the remainder - - ! Store -(%o3) bytes till dst is block (64 byte) aligned. - ! Use double word stores. - ! Recall that dst is already double word aligned -1: - stx %o1, [%o5] - addcc %o3, 8, %o3 - bl,pt %ncc, 1b - add %o5, 8, %o5 - - ! Now sp1 is block aligned -.blkwr: - and %o2, 63, %o3 ! calc bytes left after blk store. - andn %o2, 63, %o4 ! calc size of blocks in bytes - - cmp %o4, 0x100 ! 
check if there are 256 bytes to set - blu,pn %ncc, 3f - nop -2: - stxa %o1, [%o5+0x0]%asi - stxa %o1, [%o5+0x40]%asi - stxa %o1, [%o5+0x80]%asi - stxa %o1, [%o5+0xc0]%asi - - stxa %o1, [%o5+0x8]%asi - stxa %o1, [%o5+0x10]%asi - stxa %o1, [%o5+0x18]%asi - stxa %o1, [%o5+0x20]%asi - stxa %o1, [%o5+0x28]%asi - stxa %o1, [%o5+0x30]%asi - stxa %o1, [%o5+0x38]%asi - - stxa %o1, [%o5+0x48]%asi - stxa %o1, [%o5+0x50]%asi - stxa %o1, [%o5+0x58]%asi - stxa %o1, [%o5+0x60]%asi - stxa %o1, [%o5+0x68]%asi - stxa %o1, [%o5+0x70]%asi - stxa %o1, [%o5+0x78]%asi - - stxa %o1, [%o5+0x88]%asi - stxa %o1, [%o5+0x90]%asi - stxa %o1, [%o5+0x98]%asi - stxa %o1, [%o5+0xa0]%asi - stxa %o1, [%o5+0xa8]%asi - stxa %o1, [%o5+0xb0]%asi - stxa %o1, [%o5+0xb8]%asi - - stxa %o1, [%o5+0xc8]%asi - stxa %o1, [%o5+0xd0]%asi - stxa %o1, [%o5+0xd8]%asi - stxa %o1, [%o5+0xe0]%asi - stxa %o1, [%o5+0xe8]%asi - stxa %o1, [%o5+0xf0]%asi - stxa %o1, [%o5+0xf8]%asi - - sub %o4, 0x100, %o4 - cmp %o4, 0x100 - bgu,pt %ncc, 2b - add %o5, 0x100, %o5 - -3: - cmp %o4, 0x40 ! check if 64 bytes to set - blu %ncc, 5f - nop -4: - stxa %o1, [%o5+0x0]%asi - stxa %o1, [%o5+0x8]%asi - stxa %o1, [%o5+0x10]%asi - stxa %o1, [%o5+0x18]%asi - stxa %o1, [%o5+0x20]%asi - stxa %o1, [%o5+0x28]%asi - stxa %o1, [%o5+0x30]%asi - stxa %o1, [%o5+0x38]%asi - - subcc %o4, 0x40, %o4 - bgu,pt %ncc, 4b - add %o5, 0x40, %o5 - -5: - ! Set the remaining doubles - membar #Sync - mov ASI_PNF, %asi ! restore %asi to default - ! ASI_PRIMARY_NOFAULT value - subcc %o3, 8, %o3 ! Can we store any doubles? - blu,pn %ncc, .wrchar - and %o2, 7, %o2 ! calc bytes left after doubles - -6: - stx %o1, [%o5] ! store the doubles - subcc %o3, 8, %o3 - bgeu,pt %ncc, 6b - add %o5, 8, %o5 - - ba .wrchar - nop - -.wdalign: - andcc %o5, 3, %o3 ! is sp1 aligned on a word boundary - bz,pn %ncc, .wrword - andn %o2, 3, %o3 ! create word sized count in %o3 - - dec %o2 ! decrement count - stb %o1, [%o5] ! clear a byte - b .wdalign - inc %o5 ! 
next byte - -.wrword: - st %o1, [%o5] ! 4-byte writing loop - subcc %o3, 4, %o3 - bnz,pt %ncc, .wrword - inc 4, %o5 - - and %o2, 3, %o2 ! leftover count, if any - -.wrchar: - ! Set the remaining bytes, if any - cmp %o2, 0 - be %ncc, .exit - nop - -7: - deccc %o2 - stb %o1, [%o5] - bgu,pt %ncc, 7b - inc %o5 - -.exit: - retl ! %o0 was preserved - nop - - SET_SIZE(memset)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/lib/libc_psr/sun4v_hwcap2/Makefile Thu Apr 19 21:55:12 2007 -0700
@@ -0,0 +1,55 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+include ../../Makefile.lib
+
+PLATFORM= sun4v
+
+SUBDIRS= $(MACH)
+SUBDIRS64= $(MACH64)
+
+all := TARGET= all
+install := TARGET= install
+clean := TARGET= clean
+clobber := TARGET= clobber
+
+.KEEP_STATE:
+
+all install clean clobber : lib32 \
+	$(BUILD64) lib64
+
+lint:
+	@ $(ECHO) "Nothing to lint here: skipping"
+
+lib32: $(SUBDIRS)
+
+lib64: $(SUBDIRS64)
+
+$(SUBDIRS) $(SUBDIRS64): FRC
+	@cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/lib/libc_psr/sun4v_hwcap2/Makefile.com Thu Apr 19 21:55:12 2007 -0700
@@ -0,0 +1,52 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+LIBRARY = libc_psr_hwcap2.a
+VERS = .1
+
+include $(SRC)/lib/Makefile.lib
+include $(SRC)/Makefile.psm
+
+#
+# Since libc_psr is strictly assembly, deactivate the CTF build logic.
+#
+CTFCONVERT_POST = :
+CTFMERGE_LIB = :
+
+LIBS = $(DYNLIB)
+IFLAGS = -I$(SRC)/lib/libc/inc -I$(SRC)/uts/sun4v \
+	-I$(ROOT)/usr/platform/sun4v/include -I$(ROOT)/usr/include/v9
+CPPFLAGS = -D_REENTRANT -D$(MACH) -DNIAGARA2_IMPL $(IFLAGS) $(CPPFLAGS.master)
+ASDEFS = -D__STDC__ -D_ASM $(CPPFLAGS)
+ASFLAGS = -P $(ASDEFS)
+
+#
+# build rules
+#
+pics/%.o: ../../$(PLATFORM)/common/%.s
+	$(AS) $(ASFLAGS) $< -o $@
+	$(POST_PROCESS_O)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/lib/libc_psr/sun4v_hwcap2/mapfile Thu Apr 19 21:55:12 2007 -0700
@@ -0,0 +1,27 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+hwcap_1 = vis asi_blk_init;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/lib/libc_psr/sun4v_hwcap2/sparc/Makefile Thu Apr 19 21:55:12 2007 -0700
@@ -0,0 +1,65 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+#
+# Platform specific Makefile for libc_psr.
+#
+
+PLATFORM = sun4v
+CLASS = 32
+
+OBJECTS = memcpy.o memset.o
+
+include ../Makefile.com
+
+MAPFILES = ../../sun4v/mapfile-vers ../mapfile $(MAPFILE-FLTR)
+
+ASFLAGS += -xarch=v8plusa
+
+# Redefine shared object build rule to use $(LD) directly (this avoids .init
+# and .fini sections being added).
+
+BUILD.SO = $(LD) -o $@ -G $(DYNFLAGS) $(PICS) $(LDLIBS)
+
+.KEEP_STATE:
+
+MODULE = libc_psr_hwcap2.so.1
+LIBC_PSR_DIR = $(ROOT_PSM_LIB_DIR)/libc_psr
+LIBC_PSR_LIB = $(LIBC_PSR_DIR)/$(MODULE)
+INS.libc = $(RM) -r $@; $(INS) -s -m $(FILEMODE) -f $(@D) $(MODULE)
+
+$(LIBC_PSR_DIR):
+	-$(INS.dir.root.bin)
+
+$(LIBC_PSR_LIB): $(LIBC_PSR_DIR) $(MODULE)
+	-$(INS.libc)
+
+all: $(LIBS)
+
+install: all $(LIBC_PSR_LIB)
+
+include ../../Makefile.targ
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/lib/libc_psr/sun4v_hwcap2/sparcv9/Makefile Thu Apr 19 21:55:12 2007 -0700
@@ -0,0 +1,68 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# ident "%Z%%M% %I% %E% SMI"
+#
+
+#
+# Platform specific Makefile for libc_psr.
+#
+
+PLATFORM = sun4v
+CLASS = 64
+
+OBJECTS = memcpy.o memset.o
+
+include ../Makefile.com
+
+MAPFILES = ../../sun4v/mapfile-vers ../mapfile $(MAPFILE-FLTR)
+
+ASDEFS += -D__sparcv9
+ASFLAGS += -xarch=v9a
+
+include $(SRC)/Makefile.master.64
+
+# Redefine shared object build rule to use $(LD) directly (this avoids .init
+# and .fini sections being added).
+
+BUILD.SO = $(LD) -o $@ -G $(DYNFLAGS) $(PICS) $(LDLIBS)
+
+.KEEP_STATE:
+
+MODULE = libc_psr_hwcap2.so.1
+LIBC_PSR64_DIR = $(ROOT_PSM_LIB_DIR)/$(MACH64)/libc_psr
+LIBC_PSR64_LIB = $(LIBC_PSR64_DIR)/$(MODULE)
+INS.libc.64 = $(RM) -r $@; $(INS) -m $(FILEMODE) -f $(@D) $(MODULE)
+
+$(LIBC_PSR64_DIR):
+	-$(INS.dir.root.bin)
+
+$(LIBC_PSR64_LIB): $(LIBC_PSR64_DIR) $(MODULE)
+	-$(INS.libc.64)
+
+all: $(LIBS)
+
+install: all $(LIBC_PSR64_LIB)
+
+include ../../Makefile.targ
--- a/usr/src/pkgdefs/SUNWcar.v/prototype_com Thu Apr 19 21:15:35 2007 -0700
+++ b/usr/src/pkgdefs/SUNWcar.v/prototype_com Thu Apr 19 21:55:12 2007 -0700
@@ -19,7 +19,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 # Use is subject to license terms.
 #
 #pragma ident "%Z%%M% %I% %E% SMI"
@@ -55,9 +55,11 @@
 d none platform/sun4v/lib/libc_psr 755 root bin
 v none platform/sun4v/lib/libc_psr.so.1 644 root bin
 f none platform/sun4v/lib/libc_psr/libc_psr_hwcap1.so.1 644 root bin
+f none platform/sun4v/lib/libc_psr/libc_psr_hwcap2.so.1 644 root bin
 f none platform/sun4v/lib/libmd_psr.so.1 755 root bin
 d none platform/sun4v/lib/sparcv9 755 root bin
 d none platform/sun4v/lib/sparcv9/libc_psr 755 root bin
 v none platform/sun4v/lib/sparcv9/libc_psr.so.1 644 root bin
 f none platform/sun4v/lib/sparcv9/libc_psr/libc_psr_hwcap1.so.1 644 root bin
+f none platform/sun4v/lib/sparcv9/libc_psr/libc_psr_hwcap2.so.1 644 root bin
 f none platform/sun4v/lib/sparcv9/libmd_psr.so.1 755 root bin
--- a/usr/src/uts/sun4v/cpu/niagara_copy.s Thu Apr 19 21:15:35 2007 -0700 +++ b/usr/src/uts/sun4v/cpu/niagara_copy.s Thu Apr 19 21:55:12 2007 -0700 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -109,14 +108,192 @@ srlx data2, rshift, tmp ;\ or data1, tmp, data1 +#if !defined(NIAGARA_IMPL) +/* + * Flags set in the lower bits of the t_lofault address: + * FPUSED_FLAG: The FP registers were in use and must be restored + * BCOPY_FLAG: Set for bcopy calls, cleared for kcopy calls + * COPY_FLAGS: Both of the above + * + * Other flags: + * KPREEMPT_FLAG: kpreempt needs to be called + */ +#define FPUSED_FLAG 1 +#define BCOPY_FLAG 2 +#define COPY_FLAGS (FPUSED_FLAG | BCOPY_FLAG) +#define KPREEMPT_FLAG 4 + +#define ALIGN_OFF_1_7 \ + faligndata %d0, %d2, %d48 ;\ + faligndata %d2, %d4, %d50 ;\ + faligndata %d4, %d6, %d52 ;\ + faligndata %d6, %d8, %d54 ;\ + faligndata %d8, %d10, %d56 ;\ + faligndata %d10, %d12, %d58 ;\ + faligndata %d12, %d14, %d60 ;\ + faligndata %d14, %d16, %d62 + +#define ALIGN_OFF_8_15 \ + faligndata %d2, %d4, %d48 ;\ + faligndata %d4, %d6, %d50 ;\ + faligndata %d6, %d8, %d52 ;\ + faligndata %d8, %d10, %d54 ;\ + faligndata %d10, %d12, %d56 ;\ + faligndata %d12, %d14, %d58 ;\ + faligndata %d14, %d16, %d60 ;\ + faligndata %d16, %d18, %d62 + +#define ALIGN_OFF_16_23 \ + faligndata %d4, %d6, %d48 ;\ + 
faligndata %d6, %d8, %d50 ;\ + faligndata %d8, %d10, %d52 ;\ + faligndata %d10, %d12, %d54 ;\ + faligndata %d12, %d14, %d56 ;\ + faligndata %d14, %d16, %d58 ;\ + faligndata %d16, %d18, %d60 ;\ + faligndata %d18, %d20, %d62 + +#define ALIGN_OFF_24_31 \ + faligndata %d6, %d8, %d48 ;\ + faligndata %d8, %d10, %d50 ;\ + faligndata %d10, %d12, %d52 ;\ + faligndata %d12, %d14, %d54 ;\ + faligndata %d14, %d16, %d56 ;\ + faligndata %d16, %d18, %d58 ;\ + faligndata %d18, %d20, %d60 ;\ + faligndata %d20, %d22, %d62 + +#define ALIGN_OFF_32_39 \ + faligndata %d8, %d10, %d48 ;\ + faligndata %d10, %d12, %d50 ;\ + faligndata %d12, %d14, %d52 ;\ + faligndata %d14, %d16, %d54 ;\ + faligndata %d16, %d18, %d56 ;\ + faligndata %d18, %d20, %d58 ;\ + faligndata %d20, %d22, %d60 ;\ + faligndata %d22, %d24, %d62 + +#define ALIGN_OFF_40_47 \ + faligndata %d10, %d12, %d48 ;\ + faligndata %d12, %d14, %d50 ;\ + faligndata %d14, %d16, %d52 ;\ + faligndata %d16, %d18, %d54 ;\ + faligndata %d18, %d20, %d56 ;\ + faligndata %d20, %d22, %d58 ;\ + faligndata %d22, %d24, %d60 ;\ + faligndata %d24, %d26, %d62 + +#define ALIGN_OFF_48_55 \ + faligndata %d12, %d14, %d48 ;\ + faligndata %d14, %d16, %d50 ;\ + faligndata %d16, %d18, %d52 ;\ + faligndata %d18, %d20, %d54 ;\ + faligndata %d20, %d22, %d56 ;\ + faligndata %d22, %d24, %d58 ;\ + faligndata %d24, %d26, %d60 ;\ + faligndata %d26, %d28, %d62 + +#define ALIGN_OFF_56_63 \ + faligndata %d14, %d16, %d48 ;\ + faligndata %d16, %d18, %d50 ;\ + faligndata %d18, %d20, %d52 ;\ + faligndata %d20, %d22, %d54 ;\ + faligndata %d22, %d24, %d56 ;\ + faligndata %d24, %d26, %d58 ;\ + faligndata %d26, %d28, %d60 ;\ + faligndata %d28, %d30, %d62 + +#define VIS_BLOCKSIZE 64 + +/* + * Size of stack frame in order to accommodate a 64-byte aligned + * floating-point register save area and 2 64-bit temp locations. 
+ * All copy functions use three quadrants of fp registers; to assure a + * block-aligned three block buffer in which to save we must reserve + * four blocks on stack. + * + * _______________________________________ <-- %fp + STACK_BIAS + * | We may need to preserve 3 quadrants | + * | of fp regs, but since we do so with | + * | BST/BLD we need room in which to | + * | align to VIS_BLOCKSIZE bytes. So | + * | this area is 4 * VIS_BLOCKSIZE. | <-- - SAVED_FPREGS_OFFSET + * |-------------------------------------| + * | 8 bytes to save %fprs | <-- - SAVED_FPRS_OFFSET + * |-------------------------------------| + * | 8 bytes to save %gsr | <-- - SAVED_GSR_OFFSET + * --------------------------------------- + */ +#define HWCOPYFRAMESIZE ((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8)) +#define SAVED_FPREGS_OFFSET (VIS_BLOCKSIZE * 4) +#define SAVED_FPREGS_ADJUST ((VIS_BLOCKSIZE * 3) + 1) +#define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 8) +#define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 8) + +/* + * In FP copies if we do not have preserved data to restore over + * the fp regs we used then we must zero those regs to avoid + * exposing portions of the data to later threads (data security). + */ +#define FZERO \ + fzero %f0 ;\ + fzero %f2 ;\ + faddd %f0, %f2, %f4 ;\ + fmuld %f0, %f2, %f6 ;\ + faddd %f0, %f2, %f8 ;\ + fmuld %f0, %f2, %f10 ;\ + faddd %f0, %f2, %f12 ;\ + fmuld %f0, %f2, %f14 ;\ + faddd %f0, %f2, %f16 ;\ + fmuld %f0, %f2, %f18 ;\ + faddd %f0, %f2, %f20 ;\ + fmuld %f0, %f2, %f22 ;\ + faddd %f0, %f2, %f24 ;\ + fmuld %f0, %f2, %f26 ;\ + faddd %f0, %f2, %f28 ;\ + fmuld %f0, %f2, %f30 ;\ + faddd %f0, %f2, %f48 ;\ + fmuld %f0, %f2, %f50 ;\ + faddd %f0, %f2, %f52 ;\ + fmuld %f0, %f2, %f54 ;\ + faddd %f0, %f2, %f56 ;\ + fmuld %f0, %f2, %f58 ;\ + faddd %f0, %f2, %f60 ;\ + fmuld %f0, %f2, %f62 + +/* + * Macros to save and restore fp registers to/from the stack. + * Used to save and restore in-use fp registers when we want to use FP. 
+ */ +#define BST_FP_TOSTACK(tmp1) \ + /* membar #Sync */ ;\ + add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ + and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ + stda %f0, [tmp1]ASI_BLK_P ;\ + add tmp1, VIS_BLOCKSIZE, tmp1 ;\ + stda %f16, [tmp1]ASI_BLK_P ;\ + add tmp1, VIS_BLOCKSIZE, tmp1 ;\ + stda %f48, [tmp1]ASI_BLK_P ;\ + membar #Sync + +#define BLD_FP_FROMSTACK(tmp1) \ + /* membar #Sync - provided at copy completion */ ;\ + add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ + and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ + ldda [tmp1]ASI_BLK_P, %f0 ;\ + add tmp1, VIS_BLOCKSIZE, tmp1 ;\ + ldda [tmp1]ASI_BLK_P, %f16 ;\ + add tmp1, VIS_BLOCKSIZE, tmp1 ;\ + ldda [tmp1]ASI_BLK_P, %f48 ;\ + membar #Sync +#endif /* NIAGARA_IMPL */ + /* * Copy a block of storage, returning an error code if `from' or * `to' takes a kernel pagefault which cannot be resolved. * Returns errno value on pagefault error, 0 if all ok */ - - #if defined(lint) /* ARGSUSED */ @@ -131,6 +308,113 @@ ENTRY(kcopy) +#if !defined(NIAGARA_IMPL) + save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp + sethi %hi(.copyerr), %l7 ! copyerr is lofault value + or %l7, %lo(.copyerr), %l7 + ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler + ! Note that we carefully do *not* flag the setting of + ! t_lofault. + membar #Sync ! sync error barrier + b .do_copy ! common code + stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault + +/* + * We got here because of a fault during kcopy or bcopy if a fault + * handler existed when bcopy was called. + * Errno value is in %g1. + */ +.copyerr: + sethi %hi(.copyerr2), %l1 + or %l1, %lo(.copyerr2), %l1 + membar #Sync ! sync error barrier + stn %l1, [THREAD_REG + T_LOFAULT] ! set t_lofault + btst FPUSED_FLAG, %o5 + bz,pt %xcc, 1f + and %o5, BCOPY_FLAG, %l1 ! copy flag to %l1 + + membar #Sync ! sync error barrier + ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! 
restore gsr + wr %o2, 0, %gsr + + ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 + btst FPRS_FEF, %o3 + bz,pt %icc, 4f + nop + + ! restore fpregs from stack + BLD_FP_FROMSTACK(%o2) + + ba,pt %ncc, 2f + wr %o3, 0, %fprs ! restore fprs + +4: + FZERO + wr %o3, 0, %fprs ! restore fprs + +2: + ldn [THREAD_REG + T_LWP], %o2 + brnz,pt %o2, 1f + nop + + ldsb [THREAD_REG + T_PREEMPT], %l0 + deccc %l0 + bnz,pn %ncc, 1f + stb %l0, [THREAD_REG + T_PREEMPT] + + ldsb [THREAD_REG + T_PREEMPT], %l0 + dec %l0 + stb %l0, [THREAD_REG + T_PREEMPT] + + ! Check for a kernel preemption request + ldn [THREAD_REG + T_CPU], %l0 + ldub [%l0 + CPU_KPRUNRUN], %l0 + brnz,a,pt %l0, 1f ! Need to call kpreempt? + or %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag + + ! The kcopy will always set a t_lofault handler. If it fires, + ! we're expected to just return the error code and not to + ! invoke any existing error handler. As far as bcopy is concerned, + ! we only set t_lofault if there was an existing lofault handler. + ! In that case we're expected to invoke the previously existing + ! handler after resetting the t_lofault value. +1: + andn %o5, COPY_FLAGS, %o5 ! remove flags from lofault address + membar #Sync ! sync error barrier + stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + + ! call kpreempt if necessary + btst KPREEMPT_FLAG, %l1 + bz,pt %icc, 2f + nop + call kpreempt + rdpr %pil, %o0 ! pass %pil +2: + btst BCOPY_FLAG, %l1 + bnz,pn %ncc, 3f + nop + ret + restore %g1, 0, %o0 + +3: + ! We're here via bcopy. There must have been an error handler + ! in place otherwise we would have died a nasty death already. + jmp %o5 ! goto real handler + restore %g0, 0, %o0 ! dispose of copy window + +/* + * We got here because of a fault in .copyerr. We can't safely restore fp + * state, so we panic. 
+ */ +fp_panic_msg: + .asciz "Unable to restore fp state after copy operation" + + .align 4 +.copyerr2: + set fp_panic_msg, %o0 + call panic + nop +#else /* NIAGARA_IMPL */ save %sp, -SA(MINFRAME), %sp set .copyerr, %l7 ! copyerr is lofault value ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler @@ -152,6 +436,7 @@ stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault ret restore %g1, 0, %o0 +#endif /* NIAGARA_IMPL */ SET_SIZE(kcopy) #endif /* lint */ @@ -171,8 +456,25 @@ ENTRY(bcopy) +#if !defined(NIAGARA_IMPL) + save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp + ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler + brz,pt %o5, .do_copy + nop + sethi %hi(.copyerr), %l7 ! copyerr is lofault value + or %l7, %lo(.copyerr), %l7 + membar #Sync ! sync error barrier + stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault + ! We've already captured whether t_lofault was zero on entry. + ! We need to mark ourselves as being from bcopy since both + ! kcopy and bcopy use the same code path. If BCOPY_FLAG is + ! set and the saved lofault was zero, we won't reset lofault on + ! returning. + or %o5, BCOPY_FLAG, %o5 +#else /* NIAGARA_IMPL */ save %sp, -SA(MINFRAME), %sp clr %o5 ! flag LOFAULT_SET is not set for bcopy +#endif /* NIAGARA_IMPL */ .do_copy: cmp %i2, 12 ! for small counts @@ -185,8 +487,7 @@ set use_hw_bcopy, %o2 ld [%o2], %o2 - tst %o2 - bz .bcb_punt + brz,pn %o2, .bcb_punt nop subcc %i1, %i0, %i3 @@ -205,26 +506,111 @@ /* * Copy that reach here have at least 2 blocks of data to copy. */ +#if !defined(NIAGARA_IMPL) + ldn [THREAD_REG + T_LWP], %o3 + brnz,pt %o3, 1f + nop + + ! kpreempt_disable(); + ldsb [THREAD_REG + T_PREEMPT], %o2 + inc %o2 + stb %o2, [THREAD_REG + T_PREEMPT] + +1: + rd %fprs, %o2 ! check for unused fp + st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs + btst FPRS_FEF, %o2 + bz,a,pt %icc, .do_blockcopy + wr %g0, FPRS_FEF, %fprs + + ! 
save in-use fpregs on stack + BST_FP_TOSTACK(%o2) +#endif /* NIAGARA_IMPL */ + .do_blockcopy: + +#if !defined(NIAGARA_IMPL) + rd %gsr, %o2 + stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr + or %o5, FPUSED_FLAG, %o5 ! fp regs are in use +#endif /* NIAGARA_IMPL */ + ! Swap src/dst since the code below is memcpy code ! and memcpy/bcopy have different calling sequences mov %i1, %i5 mov %i0, %i1 mov %i5, %i0 + ! Block (64 bytes) align the destination. andcc %i0, 0x3f, %i3 ! is dst aligned on a 64 bytes bz %xcc, .chksrc ! dst is already double aligned sub %i3, 0x40, %i3 neg %i3 ! bytes till dst 64 bytes aligned sub %i2, %i3, %i2 ! update i2 with new count -1: ldub [%i1], %i4 - stb %i4, [%i0] + ! Based on source and destination alignment do + ! either 8 bytes, 4 bytes, 2 bytes or byte copy. + + ! Is dst & src 8B aligned + or %i0, %i1, %o2 + andcc %o2, 0x7, %g0 + bz %ncc, .alewdcp + nop + + ! Is dst & src 4B aligned + andcc %o2, 0x3, %g0 + bz %ncc, .alwdcp + nop + + ! Is dst & src 2B aligned + andcc %o2, 0x1, %g0 + bz %ncc, .alhlfwdcp + nop + + ! 1B aligned +1: ldub [%i1], %o2 + stb %o2, [%i0] inc %i1 deccc %i3 - bgu %xcc, 1b + bgu,pt %ncc, 1b inc %i0 + ba .chksrc + nop + + ! dst & src 4B aligned +.alwdcp: + ld [%i1], %o2 + st %o2, [%i0] + add %i1, 0x4, %i1 + subcc %i3, 0x4, %i3 + bgu,pt %ncc, .alwdcp + add %i0, 0x4, %i0 + + ba .chksrc + nop + + ! dst & src 2B aligned +.alhlfwdcp: + lduh [%i1], %o2 + stuh %o2, [%i0] + add %i1, 0x2, %i1 + subcc %i3, 0x2, %i3 + bgu,pt %ncc, .alhlfwdcp + add %i0, 0x2, %i0 + + ba .chksrc + nop + + ! dst & src 8B aligned +.alewdcp: + ldx [%i1], %o2 + stx %o2, [%i0] + add %i1, 0x8, %i1 + subcc %i3, 0x8, %i3 + bgu,pt %ncc, .alewdcp + add %i0, 0x8, %i0 + ! Now Destination is block (64 bytes) aligned .chksrc: andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size @@ -232,6 +618,286 @@ mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi +#if !defined(NIAGARA_IMPL) + andn %i1, 0x3f, %l0 ! 
%l0 has block aligned src address + prefetch [%l0+0x0], #one_read + andcc %i1, 0x3f, %g0 ! is src 64B aligned + bz,pn %ncc, .blkcpy + nop + + ! handle misaligned source cases + alignaddr %i1, %g0, %g0 ! generate %gsr + + srl %i1, 0x3, %l1 ! src add bits 3, 4, 5 are now least + ! significant in %l1 + andcc %l1, 0x7, %l2 ! mask everything except bits 1, 2, 3 + add %i1, %i3, %i1 + + ! switch statement to get to right 8 byte block within + ! 64 byte block + cmp %l2, 0x4 + bgeu,a hlf + cmp %l2, 0x6 + cmp %l2, 0x2 + bgeu,a sqtr + nop + cmp %l2, 0x1 + be,a off15 + nop + ba off7 + nop +sqtr: + be,a off23 + nop + ba,a off31 + nop + +hlf: + bgeu,a fqtr + nop + cmp %l2, 0x5 + be,a off47 + nop + ba off39 + nop +fqtr: + be,a off55 + nop + + ! Falls through when the source offset is greater than 56 + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +7: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_56_63 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 7b + prefetch [%l0+0x80], #one_read + ba .blkdone + membar #Sync + + ! This copy case for source offset between 1 and 7 +off7: + ldda [%l0]ASI_BLK_P, %d0 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +0: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_1_7 + fmovd %d16, %d0 + fmovd %d18, %d2 + fmovd %d20, %d4 + fmovd %d22, %d6 + fmovd %d24, %d8 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 0b + prefetch [%l0+0x80], #one_read + ba .blkdone + membar #Sync + + ! 
This copy case for source offset between 8 and 15 +off15: + ldd [%l0+0x8], %d2 + ldd [%l0+0x10], %d4 + ldd [%l0+0x18], %d6 + ldd [%l0+0x20], %d8 + ldd [%l0+0x28], %d10 + ldd [%l0+0x30], %d12 + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +1: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_8_15 + fmovd %d18, %d2 + fmovd %d20, %d4 + fmovd %d22, %d6 + fmovd %d24, %d8 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 1b + prefetch [%l0+0x80], #one_read + ba .blkdone + membar #Sync + + ! This copy case for source offset between 16 and 23 +off23: + ldd [%l0+0x10], %d4 + ldd [%l0+0x18], %d6 + ldd [%l0+0x20], %d8 + ldd [%l0+0x28], %d10 + ldd [%l0+0x30], %d12 + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +2: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_16_23 + fmovd %d20, %d4 + fmovd %d22, %d6 + fmovd %d24, %d8 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 2b + prefetch [%l0+0x80], #one_read + ba .blkdone + membar #Sync + + ! This copy case for source offset between 24 and 31 +off31: + ldd [%l0+0x18], %d6 + ldd [%l0+0x20], %d8 + ldd [%l0+0x28], %d10 + ldd [%l0+0x30], %d12 + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +3: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_24_31 + fmovd %d22, %d6 + fmovd %d24, %d8 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 3b + prefetch [%l0+0x80], #one_read + ba .blkdone + membar #Sync + + ! 
This copy case for source offset between 32 and 39 +off39: + ldd [%l0+0x20], %d8 + ldd [%l0+0x28], %d10 + ldd [%l0+0x30], %d12 + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +4: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_32_39 + fmovd %d24, %d8 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 4b + prefetch [%l0+0x80], #one_read + ba .blkdone + membar #Sync + + ! This copy case for source offset between 40 and 47 +off47: + ldd [%l0+0x28], %d10 + ldd [%l0+0x30], %d12 + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +5: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_40_47 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 5b + prefetch [%l0+0x80], #one_read + ba .blkdone + membar #Sync + + ! This copy case for source offset between 48 and 55 +off55: + ldd [%l0+0x30], %d12 + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +6: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_48_55 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 6b + prefetch [%l0+0x80], #one_read + ba .blkdone + membar #Sync + + ! Both source and destination are block aligned. +.blkcpy: + prefetch [%i1+0x40], #one_read + prefetch [%i1+0x80], #one_read +8: + stxa %g0, [%i0]%asi ! 
initialize the cache line + ldda [%i1]ASI_BLK_P, %d0 + stda %d0, [%i0]ASI_BLK_P + + add %i1, 0x40, %i1 + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 8b + prefetch [%i1+0x80], #one_read + membar #Sync + +.blkdone: +#else /* NIAGARA_IMPL */ andcc %i1, 0xf, %o2 ! is src quadword aligned bz,pn %xcc, .blkcpy ! src offset in %o2 nop @@ -393,19 +1059,167 @@ add %i0, 0x40, %i0 .blkdone: - tst %i2 - bz,pt %xcc, .blkexit + membar #Sync +#endif /* NIAGARA_IMPL */ + + brz,pt %i2, .blkexit + nop + + ! Handle trailing bytes + cmp %i2, 0x8 + blu,pt %ncc, .residue + nop + + ! Can we do some 8B ops + or %i1, %i0, %o2 + andcc %o2, 0x7, %g0 + bnz %ncc, .last4 + nop + + ! Do 8byte ops as long as possible +.last8: + ldx [%i1], %o2 + stx %o2, [%i0] + add %i1, 0x8, %i1 + sub %i2, 0x8, %i2 + cmp %i2, 0x8 + bgu,pt %ncc, .last8 + add %i0, 0x8, %i0 + + brz,pt %i2, .blkexit + nop + + ba .residue + nop + +.last4: + ! Can we do 4B ops + andcc %o2, 0x3, %g0 + bnz %ncc, .last2 + nop +1: + ld [%i1], %o2 + st %o2, [%i0] + add %i1, 0x4, %i1 + sub %i2, 0x4, %i2 + cmp %i2, 0x4 + bgu,pt %ncc, 1b + add %i0, 0x4, %i0 + + brz,pt %i2, .blkexit + nop + + ba .residue + nop + +.last2: + ! Can we do 2B ops + andcc %o2, 0x1, %g0 + bnz %ncc, .residue + nop + +1: + lduh [%i1], %o2 + stuh %o2, [%i0] + add %i1, 0x2, %i1 + sub %i2, 0x2, %i2 + cmp %i2, 0x2 + bgu,pt %ncc, 1b + add %i0, 0x2, %i0 + + brz,pt %i2, .blkexit nop .residue: - ldub [%i1], %i4 - stb %i4, [%i0] + ldub [%i1], %o2 + stb %o2, [%i0] inc %i1 deccc %i2 - bgu %xcc, .residue + bgu,pt %ncc, .residue inc %i0 .blkexit: +#if !defined(NIAGARA_IMPL) + btst FPUSED_FLAG, %o5 + bz %icc, 1f + and %o5, COPY_FLAGS, %l1 ! Store flags in %l1 + ! We can't clear the flags from %o5 yet + ! If there's an error, .copyerr will + ! need them + + ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr + wr %o2, 0, %gsr + + ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 + btst FPRS_FEF, %o3 + bz,pt %icc, 4f + nop + + ! 
restore fpregs from stack + BLD_FP_FROMSTACK(%o2) + + ba,pt %ncc, 2f + wr %o3, 0, %fprs ! restore fprs + +4: + FZERO + wr %o3, 0, %fprs ! restore fprs + +2: + ldn [THREAD_REG + T_LWP], %o2 + brnz,pt %o2, 1f + nop + + ldsb [THREAD_REG + T_PREEMPT], %l0 + deccc %l0 + bnz,pn %ncc, 1f + stb %l0, [THREAD_REG + T_PREEMPT] + + ! Check for a kernel preemption request + ldn [THREAD_REG + T_CPU], %l0 + ldub [%l0 + CPU_KPRUNRUN], %l0 + brnz,a,pt %l0, 1f ! Need to call kpreempt? + or %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag + + ldsb [THREAD_REG + T_PREEMPT], %l0 + dec %l0 + stb %l0, [THREAD_REG + T_PREEMPT] +1: + btst BCOPY_FLAG, %l1 + bz,pn %icc, 3f + andncc %o5, COPY_FLAGS, %o5 + + ! Here via bcopy. Check to see if the handler was NULL. + ! If so, just return quietly. Otherwise, reset the + ! handler and go home. + bnz,pn %ncc, 3f + nop + + ! Null handler. + btst KPREEMPT_FLAG, %l1 + bz,pt %icc, 2f + nop + call kpreempt + rdpr %pil, %o0 ! pass %pil +2: + + ret + restore %g0, 0, %o0 + + ! Here via kcopy or bcopy with a handler. + ! Reset the fault handler. +3: + membar #Sync + stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault + + ! call kpreempt if necessary + btst KPREEMPT_FLAG, %l1 + bz,pt %icc, 4f + nop + call kpreempt + rdpr %pil, %o0 +4: +#else /* NIAGARA_IMPL */ membar #Sync ! sync error barrier ! Restore t_lofault handler, if came here from kcopy(). tst %o5 @@ -413,6 +1227,7 @@ andn %o5, LOFAULT_SET, %o5 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1: +#endif /* NIAGARA_IMPL */ ret restore %g0, 0, %o0 @@ -634,6 +1449,28 @@ bgeu,a %ncc, 1b ! loop till done ldub [%i0+%i1], %o4 ! read from address .cpdone: +#if !defined(NIAGARA_IMPL) + ! FPUSED_FLAG will not have been set in any path leading to + ! this point. No need to deal with it. + btst BCOPY_FLAG, %o5 + bz,pn %icc, 2f + andcc %o5, BCOPY_FLAG, %o5 + ! Here via bcopy. Check to see if the handler was NULL. + ! If so, just return quietly. Otherwise, reset the + ! handler and go home. 
+ bnz,pn %ncc, 2f + nop + ! + ! Null handler. + ! + ret + restore %g0, 0, %o0 + ! Here via kcopy or bcopy with a handler. + ! Reset the fault handler. +2: + membar #Sync + stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault +#else /* NIAGARA_IMPL */ membar #Sync ! sync error barrier ! Restore t_lofault handler, if came here from kcopy(). tst %o5 @@ -641,6 +1478,7 @@ andn %o5, LOFAULT_SET, %o5 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1: +#endif /* NIAGARA_IMPL */ ret restore %g0, 0, %o0 ! return (0) @@ -736,7 +1574,7 @@ { } #else /* lint */ ENTRY(hwblkpagecopy) - save %sp, -SA(MINFRAME + 4*64), %sp + save %sp, -SA(MINFRAME), %sp ! %i0 - source address (arg) ! %i1 - destination address (arg) @@ -915,8 +1753,34 @@ * member of the t_copyop structure, if needed. */ ENTRY(copyio_fault) +#if !defined(NIAGARA_IMPL) + btst FPUSED_FLAG, SAVED_LOFAULT + bz 1f + andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT + + ld [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 + wr %o2, 0, %gsr ! restore gsr + + ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 + btst FPRS_FEF, %o3 + bz %icc, 4f + nop + + ! restore fpregs from stack + BLD_FP_FROMSTACK(%o2) + + ba,pt %ncc, 1f + wr %o3, 0, %fprs ! restore fprs + +4: + FZERO ! zero all of the fpregs + wr %o3, 0, %fprs ! restore fprs + +1: +#else /* NIAGARA_IMPL */ membar #Sync stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault +#endif /* NIAGARA_IMPL */ restore @@ -1237,89 +2101,394 @@ clr %o0 .big_copyout: - ! ! We're going to go off and do a block copy. ! Switch fault handlers and grab a window. We ! don't do a membar #Sync since we've done only ! kernel data to this point. - ! stn %o4, [THREAD_REG + T_LOFAULT] - save %sp, -SA(MINFRAME), %sp ! Copy out that reach here are larger than 256 bytes. The ! hw_copy_limit_1 is set to 256. Never set this limit less ! 128 bytes. +#if !defined(NIAGARA_IMPL) + save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp + + rd %fprs, %o2 ! 
check for unused fp + st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save %fprs + btst FPRS_FEF, %o2 + bz,a,pt %icc, .do_block_copyout + wr %g0, FPRS_FEF, %fprs + + ! save in-use fpregs on stack + BST_FP_TOSTACK(%o2) +#else /* NIAGARA_IMPL */ + save %sp, -SA(MINFRAME), %sp +#endif /* NIAGARA_IMPL */ + .do_block_copyout: +#if !defined(NIAGARA_IMPL) + rd %gsr, %o2 + stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr + ! set the lower bit saved t_lofault to indicate that we need + ! clear %fprs register on the way out + or SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT +#endif /* NIAGARA_IMPL */ + ! Swap src/dst since the code below is memcpy code ! and memcpy/bcopy have different calling sequences mov %i1, %i5 mov %i0, %i1 mov %i5, %i0 - andcc %i0, 7, %i3 ! is dst double aligned - bz %ncc, copyout_blkcpy - sub %i3, 8, %i3 - neg %i3 ! bytes till double aligned - sub %i2, %i3, %i2 ! update %i2 with new count - - ! Align Destination on double-word boundary - -1: ldub [%i1], %i4 + ! Block (64 bytes) align the destination. + andcc %i0, 0x3f, %i3 ! is dst block aligned + bz %ncc, copyout_blalign ! dst already block aligned + sub %i3, 0x40, %i3 + neg %i3 ! bytes till dst 64 bytes aligned + sub %i2, %i3, %i2 ! update i2 with new count + + ! Based on source and destination alignment do + ! either 8 bytes, 4 bytes, 2 bytes or byte copy. + + ! Is dst & src 8B aligned + or %i0, %i1, %o2 + andcc %o2, 0x7, %g0 + bz %ncc, .co_alewdcp + nop + + ! Is dst & src 4B aligned + andcc %o2, 0x3, %g0 + bz %ncc, .co_alwdcp + nop + + ! Is dst & src 2B aligned + andcc %o2, 0x1, %g0 + bz %ncc, .co_alhlfwdcp + nop + + ! 1B aligned +1: ldub [%i1], %o2 + stba %o2, [%i0]ASI_USER inc %i1 - stba %i4, [%i0]ASI_USER deccc %i3 - bgu %ncc, 1b - inc %i0 - -copyout_blkcpy: - andcc %i0, 63, %i3 - bz,pn %ncc, copyout_blalign ! now block aligned - sub %i3, 64, %i3 - neg %i3 ! bytes till block aligned - sub %i2, %i3, %i2 ! update %i2 with new count - - ! Copy %i3 bytes till dst is block (64 byte) aligned. 
use - ! double word copies. - - andcc %i1, 7, %g1 ! is src aligned on a 8 bytes - bz %ncc, .co_dbcopy ! %g1 has source offset (last 3-bits) - sll %g1, 3, %l1 ! left shift - mov 0x40, %l2 - sub %l2, %l1, %l2 ! right shift = (64 - left shift) - - ! Now use double word copies to align destination. -.co_double: - sub %i1, %g1, %i1 ! align the src at 8 bytes. - ldx [%i1], %o2 -2: - ldx [%i1+8], %o4 - ALIGN_DATA_EW(%o2, %o4, %l1, %l2, %o3) - stxa %o2, [%i0]ASI_USER - mov %o4, %o2 - add %i1, 0x8, %i1 - subcc %i3, 0x8, %i3 - bgu,pt %ncc, 2b - add %i0, 0x8, %i0 + bgu,pt %ncc, 1b + inc %i0 + + ba copyout_blalign + nop + + ! dst & src 4B aligned +.co_alwdcp: + ld [%i1], %o2 + sta %o2, [%i0]ASI_USER + add %i1, 0x4, %i1 + subcc %i3, 0x4, %i3 + bgu,pt %ncc, .co_alwdcp + add %i0, 0x4, %i0 + ba copyout_blalign - add %i1, %g1, %i1 - - ! Both source and destination are double aligned. - ! No shift and merge of data required in this case. -.co_dbcopy: + nop + + ! dst & src 2B aligned +.co_alhlfwdcp: + lduh [%i1], %o2 + stuha %o2, [%i0]ASI_USER + add %i1, 0x2, %i1 + subcc %i3, 0x2, %i3 + bgu,pt %ncc, .co_alhlfwdcp + add %i0, 0x2, %i0 + + ba copyout_blalign + nop + + ! dst & src 8B aligned +.co_alewdcp: ldx [%i1], %o2 stxa %o2, [%i0]ASI_USER add %i1, 0x8, %i1 subcc %i3, 0x8, %i3 - bgu,pt %ncc, .co_dbcopy + bgu,pt %ncc, .co_alewdcp add %i0, 0x8, %i0 + ! Now Destination is block (64 bytes) aligned copyout_blalign: andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size sub %i2, %i3, %i2 ! Residue bytes in %i2 mov ASI_BLK_INIT_QUAD_LDD_AIUS, %asi +#if !defined(NIAGARA_IMPL) + andn %i1, 0x3f, %l0 ! %l0 has block aligned src address + prefetch [%l0+0x0], #one_read + andcc %i1, 0x3f, %g0 ! is src 64B aligned + bz,pn %ncc, .co_blkcpy + nop + + ! handle misaligned source cases + alignaddr %i1, %g0, %g0 ! generate %gsr + + srl %i1, 0x3, %l1 ! src add bits 3, 4, 5 are now least + ! significant in %l1 + andcc %l1, 0x7, %l2 ! mask everything except bits 1, 2, 3 + add %i1, %i3, %i1 + + ! 
switch statement to get to right 8 byte block within + ! 64 byte block + cmp %l2, 0x4 + bgeu,a co_hlf + cmp %l2, 0x6 + cmp %l2, 0x2 + bgeu,a co_sqtr + nop + cmp %l2, 0x1 + be,a co_off15 + nop + ba co_off7 + nop +co_sqtr: + be,a co_off23 + nop + ba,a co_off31 + nop + +co_hlf: + bgeu,a co_fqtr + nop + cmp %l2, 0x5 + be,a co_off47 + nop + ba co_off39 + nop +co_fqtr: + be,a co_off55 + nop + + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +7: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_56_63 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_AIUS + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 7b + prefetch [%l0+0x80], #one_read + ba .co_blkdone + membar #Sync + +co_off7: + ldda [%l0]ASI_BLK_P, %d0 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +0: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_1_7 + fmovd %d16, %d0 + fmovd %d18, %d2 + fmovd %d20, %d4 + fmovd %d22, %d6 + fmovd %d24, %d8 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_AIUS + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 0b + prefetch [%l0+0x80], #one_read + ba .co_blkdone + membar #Sync + +co_off15: + ldd [%l0+0x8], %d2 + ldd [%l0+0x10], %d4 + ldd [%l0+0x18], %d6 + ldd [%l0+0x20], %d8 + ldd [%l0+0x28], %d10 + ldd [%l0+0x30], %d12 + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +1: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! 
initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_8_15 + fmovd %d18, %d2 + fmovd %d20, %d4 + fmovd %d22, %d6 + fmovd %d24, %d8 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_AIUS + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 1b + prefetch [%l0+0x80], #one_read + ba .co_blkdone + membar #Sync + +co_off23: + ldd [%l0+0x10], %d4 + ldd [%l0+0x18], %d6 + ldd [%l0+0x20], %d8 + ldd [%l0+0x28], %d10 + ldd [%l0+0x30], %d12 + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +2: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_16_23 + fmovd %d20, %d4 + fmovd %d22, %d6 + fmovd %d24, %d8 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_AIUS + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 2b + prefetch [%l0+0x80], #one_read + ba .co_blkdone + membar #Sync + +co_off31: + ldd [%l0+0x18], %d6 + ldd [%l0+0x20], %d8 + ldd [%l0+0x28], %d10 + ldd [%l0+0x30], %d12 + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +3: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_24_31 + fmovd %d22, %d6 + fmovd %d24, %d8 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_AIUS + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 3b + prefetch [%l0+0x80], #one_read + ba .co_blkdone + membar #Sync + +co_off39: + ldd [%l0+0x20], %d8 + ldd [%l0+0x28], %d10 + ldd [%l0+0x30], %d12 + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +4: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! 
initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_32_39 + fmovd %d24, %d8 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_AIUS + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 4b + prefetch [%l0+0x80], #one_read + ba .co_blkdone + membar #Sync + +co_off47: + ldd [%l0+0x28], %d10 + ldd [%l0+0x30], %d12 + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +5: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_40_47 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_AIUS + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 5b + prefetch [%l0+0x80], #one_read + ba .co_blkdone + membar #Sync + +co_off55: + ldd [%l0+0x30], %d12 + ldd [%l0+0x38], %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +6: + add %l0, 0x40, %l0 + stxa %g0, [%i0]%asi ! initialize the cache line + + ldda [%l0]ASI_BLK_P, %d16 + ALIGN_OFF_48_55 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_AIUS + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 6b + prefetch [%l0+0x80], #one_read + ba .co_blkdone + membar #Sync + +.co_blkcpy: + prefetch [%i1+0x40], #one_read + prefetch [%i1+0x80], #one_read +8: + stxa %g0, [%i0]%asi ! initialize the cache line + ldda [%i1]ASI_BLK_P, %d0 + stda %d0, [%i0]ASI_BLK_AIUS + + add %i1, 0x40, %i1 + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 8b + prefetch [%i1+0x80], #one_read + membar #Sync + +.co_blkdone: +#else /* NIAGARA_IMPL */ andcc %i1, 0xf, %o2 ! is src quadword aligned bz,pn %xcc, .co_blkcpy ! src offset in %o2 (last 4-bits) nop @@ -1516,50 +2685,74 @@ .co_blkdone: membar #Sync - - ! Copy as much rest of the data as double word copy. -.co_dwcp: - cmp %i2, 0x8 ! Not enough bytes to copy as double - blu %ncc, .co_dbdone +#endif /* NIAGARA_IMPL */ + + brz,pt %i2, .copyout_exit nop - andn %i2, 0x7, %i3 ! 
%i3 count is multiple of 8 bytes size - sub %i2, %i3, %i2 ! Residue bytes in %i2 - - andcc %i1, 7, %g1 ! is src aligned on a 8 bytes - bz %ncc, .co_cpy_db + ! Handle trailing bytes + cmp %i2, 0x8 + blu,pt %ncc, .co_residue nop - sll %g1, 3, %l0 ! left shift - mov 0x40, %l1 - sub %l1, %l0, %l1 ! right shift = (64 - left shift) - -.co_cpy_wd: - sub %i1, %g1, %i1 ! align the src at 8 bytes. - ldx [%i1], %o2 -3: - ldx [%i1+8], %o4 - ALIGN_DATA_EW(%o2, %o4, %l0, %l1, %o3) - stxa %o2, [%i0]ASI_USER - mov %o4, %o2 - add %i1, 0x8, %i1 - subcc %i3, 0x8, %i3 - bgu,pt %ncc, 3b - add %i0, 0x8, %i0 - ba .co_dbdone - add %i1, %g1, %i1 - -.co_cpy_db: + ! Can we do some 8B ops + or %i1, %i0, %o2 + andcc %o2, 0x7, %g0 + bnz %ncc, .co_last4 + nop + + ! Do 8byte ops as long as possible +.co_last8: ldx [%i1], %o2 stxa %o2, [%i0]ASI_USER add %i1, 0x8, %i1 - subcc %i3, 0x8, %i3 - bgu,pt %ncc, .co_cpy_db + sub %i2, 0x8, %i2 + cmp %i2, 0x8 + bgu,pt %ncc, .co_last8 add %i0, 0x8, %i0 -.co_dbdone: - tst %i2 - bz,pt %xcc, .copyout_exit + brz,pt %i2, .copyout_exit + nop + + ba .co_residue + nop + +.co_last4: + ! Can we do 4B ops + andcc %o2, 0x3, %g0 + bnz %ncc, .co_last2 + nop +1: + ld [%i1], %o2 + sta %o2, [%i0]ASI_USER + add %i1, 0x4, %i1 + sub %i2, 0x4, %i2 + cmp %i2, 0x4 + bgu,pt %ncc, 1b + add %i0, 0x4, %i0 + + brz,pt %i2, .copyout_exit + nop + + ba .co_residue + nop + +.co_last2: + ! Can we do 2B ops + andcc %o2, 0x1, %g0 + bnz %ncc, .co_residue + nop + +1: + lduh [%i1], %o2 + stuha %o2, [%i0]ASI_USER + add %i1, 0x2, %i1 + sub %i2, 0x2, %i2 + cmp %i2, 0x2 + bgu,pt %ncc, 1b + add %i0, 0x2, %i0 + + brz,pt %i2, .copyout_exit nop ! Copy the residue as byte copy @@ -1568,11 +2761,35 @@ stba %i4, [%i0]ASI_USER inc %i1 deccc %i2 - bgu %xcc, .co_residue + bgu,pt %xcc, .co_residue inc %i0 .copyout_exit: +#if !defined(NIAGARA_IMPL) + ld [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 + wr %o2, 0, %gsr ! 
restore gsr + + ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 + btst FPRS_FEF, %o3 + bz %icc, 4f + nop + + ! restore fpregs from stack + BLD_FP_FROMSTACK(%o2) + + ba,pt %ncc, 2f + wr %o3, 0, %fprs ! restore fprs + +4: + FZERO ! zero all of the fpregs + wr %o3, 0, %fprs ! restore fprs + +2: membar #Sync + andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT +#else /* NIAGARA_IMPL */ + membar #Sync +#endif /* NIAGARA_IMPL */ stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault ret restore %g0, 0, %o0 @@ -1956,87 +3173,393 @@ clr %o0 .big_copyin: - ! ! We're going off to do a block copy. ! Switch fault hendlers and grab a window. We ! don't do a membar #Sync since we've done only ! kernel data to this point. - ! stn %o4, [THREAD_REG + T_LOFAULT] - save %sp, -SA(MINFRAME), %sp ! Copy in that reach here are larger than 256 bytes. The ! hw_copy_limit_1 is set to 256. Never set this limit less ! 128 bytes. +#if !defined(NIAGARA_IMPL) + save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp + + rd %fprs, %o2 ! check for unused fp + st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save %fprs + btst FPRS_FEF, %o2 + bz,a,pt %icc, .do_blockcopyin + wr %g0, FPRS_FEF, %fprs + + ! save in-use fpregs on stack + BST_FP_TOSTACK(%o2) +#else /* NIAGARA_IMPL */ + save %sp, -SA(MINFRAME), %sp +#endif /* NIAGARA_IMPL */ + .do_blockcopyin: +#if !defined(NIAGARA_IMPL) + rd %gsr, %o2 + stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr + ! set the lower bit saved t_lofault to indicate that we need + ! clear %fprs register on the way out + or SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT +#endif /* NIAGARA_IMPL */ + ! Swap src/dst since the code below is memcpy code ! and memcpy/bcopy have different calling sequences mov %i1, %i5 mov %i0, %i1 mov %i5, %i0 - andcc %i0, 7, %i3 ! is dst double aligned - bz %ncc, copyin_blkcpy - sub %i3, 8, %i3 - neg %i3 ! bytes till double aligned - sub %i2, %i3, %i2 ! update %i2 with new count - - ! 
Align Destination on double-word boundary - -1: lduba [%i1]ASI_USER, %i4 + ! Block (64 bytes) align the destination. + andcc %i0, 0x3f, %i3 ! is dst block aligned + bz %ncc, copyin_blalign ! dst already block aligned + sub %i3, 0x40, %i3 + neg %i3 ! bytes till dst 64 bytes aligned + sub %i2, %i3, %i2 ! update i2 with new count + + ! Based on source and destination alignment do + ! either 8 bytes, 4 bytes, 2 bytes or byte copy. + + ! Is dst & src 8B aligned + or %i0, %i1, %o2 + andcc %o2, 0x7, %g0 + bz %ncc, .ci_alewdcp + nop + + ! Is dst & src 4B aligned + andcc %o2, 0x3, %g0 + bz %ncc, .ci_alwdcp + nop + + ! Is dst & src 2B aligned + andcc %o2, 0x1, %g0 + bz %ncc, .ci_alhlfwdcp + nop + + ! 1B aligned +1: lduba [%i1]ASI_USER, %o2 + stb %o2, [%i0] inc %i1 - stb %i4, [%i0] deccc %i3 - bgu %ncc, 1b - inc %i0 - -copyin_blkcpy: - andcc %i0, 63, %i3 - bz,pn %ncc, copyin_blalign ! now block aligned - sub %i3, 64, %i3 - neg %i3 ! bytes till block aligned - sub %i2, %i3, %i2 ! update %i2 with new count - - ! Copy %i3 bytes till dst is block (64 byte) aligned. use - ! double word copies. - - andcc %i1, 7, %g1 ! is src aligned on a 8 bytes - bz %ncc, .ci_dbcopy ! %g1 has source offset (last 3-bits) - sll %g1, 3, %l1 ! left shift - mov 0x40, %l2 - sub %l2, %l1, %l2 ! right shift = (64 - left shift) - - ! Now use double word copies to align destination. -.ci_double: - sub %i1, %g1, %i1 ! align the src at 8 bytes. - ldxa [%i1]ASI_USER, %o2 -2: - add %i1, 0x8, %i1 - ldxa [%i1]ASI_USER, %o4 - ALIGN_DATA_EW(%o2, %o4, %l1, %l2, %o3) - stx %o2, [%i0] - mov %o4, %o2 - subcc %i3, 0x8, %i3 - bgu,pt %ncc, 2b - add %i0, 0x8, %i0 + bgu,pt %ncc, 1b + inc %i0 + + ba copyin_blalign + nop + + ! dst & src 4B aligned +.ci_alwdcp: + lda [%i1]ASI_USER, %o2 + st %o2, [%i0] + add %i1, 0x4, %i1 + subcc %i3, 0x4, %i3 + bgu,pt %ncc, .ci_alwdcp + add %i0, 0x4, %i0 + ba copyin_blalign - add %i1, %g1, %i1 - - ! Both source and destination are double aligned. - ! 
No shift and merge of data required in this case. -.ci_dbcopy: + nop + + ! dst & src 2B aligned +.ci_alhlfwdcp: + lduha [%i1]ASI_USER, %o2 + stuh %o2, [%i0] + add %i1, 0x2, %i1 + subcc %i3, 0x2, %i3 + bgu,pt %ncc, .ci_alhlfwdcp + add %i0, 0x2, %i0 + + ba copyin_blalign + nop + + ! dst & src 8B aligned +.ci_alewdcp: ldxa [%i1]ASI_USER, %o2 stx %o2, [%i0] add %i1, 0x8, %i1 subcc %i3, 0x8, %i3 - bgu,pt %ncc, .ci_dbcopy + bgu,pt %ncc, .ci_alewdcp add %i0, 0x8, %i0 copyin_blalign: andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size sub %i2, %i3, %i2 ! Residue bytes in %i2 +#if !defined(NIAGARA_IMPL) + mov ASI_USER, %asi + + andn %i1, 0x3f, %l0 ! %l0 has block aligned src address + prefetch [%l0+0x0], #one_read + andcc %i1, 0x3f, %g0 ! is src 64B aligned + bz,pn %ncc, .ci_blkcpy + nop + + ! handle misaligned source cases + alignaddr %i1, %g0, %g0 ! generate %gsr + + srl %i1, 0x3, %l1 ! src add bits 3, 4, 5 are now least + ! significant in %l1 + andcc %l1, 0x7, %l2 ! mask everything except bits 1, 2, 3 + add %i1, %i3, %i1 + + ! switch statement to get to right 8 byte block within + ! 64 byte block + cmp %l2, 0x4 + bgeu,a ci_hlf + cmp %l2, 0x6 + cmp %l2, 0x2 + bgeu,a ci_sqtr + nop + cmp %l2, 0x1 + be,a ci_off15 + nop + ba ci_off7 + nop +ci_sqtr: + be,a ci_off23 + nop + ba,a ci_off31 + nop + +ci_hlf: + bgeu,a ci_fqtr + nop + cmp %l2, 0x5 + be,a ci_off47 + nop + ba ci_off39 + nop +ci_fqtr: + be,a ci_off55 + nop + + ldda [%l0+0x38]%asi, %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +7: + add %l0, 0x40, %l0 + stxa %g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! 
initialize the cache line + + ldda [%l0]ASI_BLK_AIUS, %d16 + ALIGN_OFF_56_63 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 7b + prefetch [%l0+0x80], #one_read + ba .ci_blkdone + membar #Sync + +ci_off7: + ldda [%l0]ASI_BLK_AIUS, %d0 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +0: + add %l0, 0x40, %l0 + stxa %g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line + + ldda [%l0]ASI_BLK_AIUS, %d16 + ALIGN_OFF_1_7 + fmovd %d16, %d0 + fmovd %d18, %d2 + fmovd %d20, %d4 + fmovd %d22, %d6 + fmovd %d24, %d8 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 0b + prefetch [%l0+0x80], #one_read + ba .ci_blkdone + membar #Sync + +ci_off15: + ldda [%l0+0x8]%asi, %d2 + ldda [%l0+0x10]%asi, %d4 + ldda [%l0+0x18]%asi, %d6 + ldda [%l0+0x20]%asi, %d8 + ldda [%l0+0x28]%asi, %d10 + ldda [%l0+0x30]%asi, %d12 + ldda [%l0+0x38]%asi, %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +1: + add %l0, 0x40, %l0 + stxa %g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line + + ldda [%l0]ASI_BLK_AIUS, %d16 + ALIGN_OFF_8_15 + fmovd %d18, %d2 + fmovd %d20, %d4 + fmovd %d22, %d6 + fmovd %d24, %d8 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 1b + prefetch [%l0+0x80], #one_read + ba .ci_blkdone + membar #Sync + +ci_off23: + ldda [%l0+0x10]%asi, %d4 + ldda [%l0+0x18]%asi, %d6 + ldda [%l0+0x20]%asi, %d8 + ldda [%l0+0x28]%asi, %d10 + ldda [%l0+0x30]%asi, %d12 + ldda [%l0+0x38]%asi, %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +2: + add %l0, 0x40, %l0 + stxa %g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! 
initialize the cache line + + ldda [%l0]ASI_BLK_AIUS, %d16 + ALIGN_OFF_16_23 + fmovd %d20, %d4 + fmovd %d22, %d6 + fmovd %d24, %d8 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 2b + prefetch [%l0+0x80], #one_read + ba .ci_blkdone + membar #Sync + +ci_off31: + ldda [%l0+0x18]%asi, %d6 + ldda [%l0+0x20]%asi, %d8 + ldda [%l0+0x28]%asi, %d10 + ldda [%l0+0x30]%asi, %d12 + ldda [%l0+0x38]%asi, %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +3: + add %l0, 0x40, %l0 + stxa %g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line + + ldda [%l0]ASI_BLK_AIUS, %d16 + ALIGN_OFF_24_31 + fmovd %d22, %d6 + fmovd %d24, %d8 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 3b + prefetch [%l0+0x80], #one_read + ba .ci_blkdone + membar #Sync + +ci_off39: + ldda [%l0+0x20]%asi, %d8 + ldda [%l0+0x28]%asi, %d10 + ldda [%l0+0x30]%asi, %d12 + ldda [%l0+0x38]%asi, %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +4: + add %l0, 0x40, %l0 + stxa %g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line + + ldda [%l0]ASI_BLK_AIUS, %d16 + ALIGN_OFF_32_39 + fmovd %d24, %d8 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 4b + prefetch [%l0+0x80], #one_read + ba .ci_blkdone + membar #Sync + +ci_off47: + ldda [%l0+0x28]%asi, %d10 + ldda [%l0+0x30]%asi, %d12 + ldda [%l0+0x38]%asi, %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +5: + add %l0, 0x40, %l0 + stxa %g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! 
initialize the cache line + + ldda [%l0]ASI_BLK_AIUS, %d16 + ALIGN_OFF_40_47 + fmovd %d26, %d10 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 5b + prefetch [%l0+0x80], #one_read + ba .ci_blkdone + membar #Sync + +ci_off55: + ldda [%l0+0x30]%asi, %d12 + ldda [%l0+0x38]%asi, %d14 + prefetch [%l0+0x40], #one_read + prefetch [%l0+0x80], #one_read +6: + add %l0, 0x40, %l0 + stxa %g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line + + ldda [%l0]ASI_BLK_AIUS, %d16 + ALIGN_OFF_48_55 + fmovd %d28, %d12 + fmovd %d30, %d14 + + stda %d48, [%i0]ASI_BLK_P + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 6b + prefetch [%l0+0x80], #one_read + ba .ci_blkdone + membar #Sync + +.ci_blkcpy: + prefetch [%i1+0x40], #one_read + prefetch [%i1+0x80], #one_read +8: + stxa %g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line + ldda [%i1]ASI_BLK_AIUS, %d0 + stda %d0, [%i0]ASI_BLK_P + + add %i1, 0x40, %i1 + subcc %i3, 0x40, %i3 + add %i0, 0x40, %i0 + bgu,pt %ncc, 8b + prefetch [%i1+0x80], #one_read + membar #Sync + +.ci_blkdone: +#else /* NIAGARA_IMPL */ mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi andcc %i1, 0xf, %o2 ! is src quadword aligned @@ -2238,50 +3761,74 @@ .ci_blkdone: membar #Sync - - ! Copy as much rest of the data as double word copy. -.ci_dwcp: - cmp %i2, 0x8 ! Not enough bytes to copy as double - blu %ncc, .ci_dbdone +#endif /* NIAGARA_IMPL */ + + brz,pt %i2, .copyin_exit nop - andn %i2, 0x7, %i3 ! %i3 count is multiple of 8 bytes size - sub %i2, %i3, %i2 ! Residue bytes in %i2 - - andcc %i1, 7, %g1 ! is src aligned on a 8 bytes - bz %ncc, .ci_cpy_db + ! Handle trailing bytes + cmp %i2, 0x8 + blu,pt %ncc, .ci_residue nop - sll %g1, 3, %l0 ! left shift - mov 0x40, %l1 - sub %l1, %l0, %l1 ! right shift = (64 - left shift) - -.ci_cpy_dbwd: - sub %i1, %g1, %i1 ! align the src at 8 bytes. 
- ldxa [%i1]ASI_USER, %o2 -3: - add %i1, 0x8, %i1 - ldxa [%i1]ASI_USER, %o4 - ALIGN_DATA_EW(%o2, %o4, %l0, %l1, %o3) - stx %o2, [%i0] - mov %o4, %o2 - subcc %i3, 0x8, %i3 - bgu,pt %ncc, 3b - add %i0, 0x8, %i0 - ba .ci_dbdone - add %i1, %g1, %i1 - -.ci_cpy_db: + ! Can we do some 8B ops + or %i1, %i0, %o2 + andcc %o2, 0x7, %g0 + bnz %ncc, .ci_last4 + nop + + ! Do 8byte ops as long as possible +.ci_last8: ldxa [%i1]ASI_USER, %o2 stx %o2, [%i0] add %i1, 0x8, %i1 - subcc %i3, 0x8, %i3 - bgu,pt %ncc, .ci_cpy_db + sub %i2, 0x8, %i2 + cmp %i2, 0x8 + bgu,pt %ncc, .ci_last8 add %i0, 0x8, %i0 -.ci_dbdone: - tst %i2 - bz,pt %xcc, .copyin_exit + brz,pt %i2, .copyin_exit + nop + + ba .ci_residue + nop + +.ci_last4: + ! Can we do 4B ops + andcc %o2, 0x3, %g0 + bnz %ncc, .ci_last2 + nop +1: + lda [%i1]ASI_USER, %o2 + st %o2, [%i0] + add %i1, 0x4, %i1 + sub %i2, 0x4, %i2 + cmp %i2, 0x4 + bgu,pt %ncc, 1b + add %i0, 0x4, %i0 + + brz,pt %i2, .copyin_exit + nop + + ba .ci_residue + nop + +.ci_last2: + ! Can we do 2B ops + andcc %o2, 0x1, %g0 + bnz %ncc, .ci_residue + nop + +1: + lduha [%i1]ASI_USER, %o2 + stuh %o2, [%i0] + add %i1, 0x2, %i1 + sub %i2, 0x2, %i2 + cmp %i2, 0x2 + bgu,pt %ncc, 1b + add %i0, 0x2, %i0 + + brz,pt %i2, .copyin_exit nop ! Copy the residue as byte copy @@ -2290,11 +3837,35 @@ stb %i4, [%i0] inc %i1 deccc %i2 - bgu %xcc, .ci_residue + bgu,pt %xcc, .ci_residue inc %i0 .copyin_exit: +#if !defined(NIAGARA_IMPL) + ld [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 + wr %o2, 0, %gsr ! restore gsr + + ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 + btst FPRS_FEF, %o3 + bz %icc, 4f + nop + + ! restore fpregs from stack + BLD_FP_FROMSTACK(%o2) + + ba,pt %ncc, 2f + wr %o3, 0, %fprs ! restore fprs + +4: + FZERO ! zero all of the fpregs + wr %o3, 0, %fprs ! restore fprs + +2: + membar #Sync ! sync error barrier + andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT +#else /* NIAGARA_IMPL */ membar #Sync +#endif /* NIAGARA_IMPL */ stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! 
restore old t_lofault ret restore %g0, 0, %o0