changeset 10024:2213a466547f

6847710 Solaris libc memcpy and memset are ignorant of AMD L3 caches
6688056 amd64 memset.s and memcpy.s are missing AMD comments
author bostrovs
date Thu, 02 Jul 2009 10:30:52 -0700
parents 71bf38dba3d6
children 9214f62864a1
files usr/src/lib/libc/amd64/Makefile usr/src/lib/libc/amd64/gen/cache.s usr/src/lib/libc/amd64/gen/memcpy.s usr/src/lib/libc/amd64/gen/memset.s usr/src/lib/libc/amd64/gen/proc64_id.c usr/src/lib/libc/amd64/gen/proc64_id.h usr/src/lib/libc/amd64/gen/proc64_support.s
diffstat 7 files changed, 115 insertions(+), 132 deletions(-)
--- a/usr/src/lib/libc/amd64/Makefile	Thu Jul 02 12:58:38 2009 -0400
+++ b/usr/src/lib/libc/amd64/Makefile	Thu Jul 02 10:30:52 2009 -0700
@@ -104,7 +104,6 @@
 	alloca.o		\
 	attrat.o		\
 	byteorder.o		\
-	cache.o			\
 	cuexit.o		\
 	ecvt.o			\
 	errlst.o		\
--- a/usr/src/lib/libc/amd64/gen/cache.s	Thu Jul 02 12:58:38 2009 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,108 +0,0 @@
-/*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2002 Advanced Micro Devices, Inc.
- * 
- * All rights reserved.
- * 
- * Redistribution and  use in source and binary  forms, with or
- * without  modification,  are   permitted  provided  that  the
- * following conditions are met:
- * 
- * + Redistributions  of source  code  must  retain  the  above
- *   copyright  notice,   this  list  of   conditions  and  the
- *   following disclaimer.
- * 
- * + Redistributions  in binary  form must reproduce  the above
- *   copyright  notice,   this  list  of   conditions  and  the
- *   following  disclaimer in  the  documentation and/or  other
- *   materials provided with the distribution.
- * 
- * + Neither the  name of Advanced Micro Devices,  Inc. nor the
- *   names  of  its contributors  may  be  used  to endorse  or
- *   promote  products  derived   from  this  software  without
- *   specific prior written permission.
- * 
- * THIS  SOFTWARE  IS PROVIDED  BY  THE  COPYRIGHT HOLDERS  AND
- * CONTRIBUTORS "AS IS" AND  ANY EXPRESS OR IMPLIED WARRANTIES,
- * INCLUDING,  BUT NOT  LIMITED TO,  THE IMPLIED  WARRANTIES OF
- * MERCHANTABILITY  AND FITNESS  FOR A  PARTICULAR  PURPOSE ARE
- * DISCLAIMED.  IN  NO  EVENT  SHALL  ADVANCED  MICRO  DEVICES,
- * INC.  OR CONTRIBUTORS  BE LIABLE  FOR ANY  DIRECT, INDIRECT,
- * INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR CONSEQUENTIAL  DAMAGES
- * (INCLUDING,  BUT NOT LIMITED  TO, PROCUREMENT  OF SUBSTITUTE
- * GOODS  OR  SERVICES;  LOSS  OF  USE, DATA,  OR  PROFITS;  OR
- * BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON  ANY THEORY OF
- * LIABILITY,  WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
- * (INCLUDING NEGLIGENCE  OR OTHERWISE) ARISING IN  ANY WAY OUT
- * OF THE  USE  OF  THIS  SOFTWARE, EVEN  IF  ADVISED  OF  THE
- * POSSIBILITY OF SUCH DAMAGE.
- * 
- * It is  licensee's responsibility  to comply with  any export
- * regulations applicable in licensee's jurisdiction.
- */
-
-	.file	"cache.s"
-
-#include "SYS.h"
-#include "cache.h"
-#include "proc64_id.h"
-
-        .global .amd64cache1, .amd64cache1half, .amd64cache2, .amd64cache2half
-	.global .largest_level_cache_size
-
-        .data
-
-        .align  8
-
-// defaults to SledgeHammer
-.amd64cache1:	.quad	AMD_DFLT_L1_CACHE_SIZE
-.amd64cache1half: .quad	AMD_DFLT_L1_HALF_CACHE_SIZE
-.amd64cache2:	.quad	AMD_DFLT_L2_CACHE_SIZE
-.amd64cache2half: .quad	AMD_DFLT_L2_HALF_CACHE_SIZE
-.largest_level_cache_size:
-		.int	AMD_DFLT_L2_CACHE_SIZE
-
-        .text
-
-// AMD cache size determination
-
-	ENTRY(__amd64id)
-
-        push    %rbx
-
-        mov     $CPUIDLARGESTFUNCTIONEX, %eax           # get highest level of support
-        cpuid
-
-        cmp     $AMDIDL2INFO, %eax                      # check for support of cache info
-        jb      1f
-
-        mov     $AMDIDL1INFO, %eax                      # get L1 info
-        cpuid
-
-        shr     $24, %ecx
-        shl     $10, %ecx
-        mov     %rcx, _sref_(.amd64cache1)
-
-        shr     $1, %ecx
-        mov     %rcx, _sref_(.amd64cache1half)
-
-        mov     $AMDIDL2INFO, %eax                      # get L2 info
-        cpuid
-
-        shr     $16, %ecx
-        shl     $10, %ecx
-        mov     %rcx, _sref_(.amd64cache2)
-	mov	%ecx, _sref_(.largest_level_cache_size)
-
-        shr     $1, %ecx
-        mov     %rcx, _sref_(.amd64cache2half)
-
-1:
-        pop	%rbx
-        ret
-
-	SET_SIZE(__amd64id)
--- a/usr/src/lib/libc/amd64/gen/memcpy.s	Thu Jul 02 12:58:38 2009 -0400
+++ b/usr/src/lib/libc/amd64/gen/memcpy.s	Thu Jul 02 10:30:52 2009 -0700
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -53,6 +53,11 @@
  *
  * Pseudo code:
  *
+ * NOTE: On AMD NO_SSE is always set.  Performance on Opteron did not improve
+ * using 16-byte stores.  Setting NO_SSE on AMD should be re-evaluated on
+ * future AMD processors.
+ *
+ *
  * If (size <= 128 bytes) {
  *	do unrolled code (primarily 8-byte loads/stores) regardless of
  *	alignment.
@@ -2274,7 +2279,7 @@
 	mov    .largest_level_cache_size(%rip),%r9d
 	shr    %r9		# take half of it
 	cmp    %r9,%r8  
-	jg     L(byte8_nt_top)
+	jge    L(byte8_nt_top)
 	# Find out whether to use rep movsq
 	cmp    $4096,%r8
 	jle    L(byte8_top)
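
The memcpy.s hunk above adjusts the non-temporal threshold: once the copy length reaches half of .largest_level_cache_size (now jge rather than jg), the data is unlikely to stay cache-resident, so the copy streams with movnti; otherwise lengths up to 4096 bytes take the unrolled 8-byte path and mid-sized copies fall through to rep movsq. Below is a minimal C sketch of that dispatch; the strategy enum, function name, and the default value standing in for the .largest_level_cache_size global are illustrative only, not part of libc.

#include <stddef.h>

/* Stand-in for the .largest_level_cache_size global set at startup. */
static size_t largest_level_cache_size = 1024 * 1024;

enum copy_strategy {
	COPY_8BYTE_LOOP,	/* unrolled 8-byte loads/stores */
	COPY_REP_MOVSQ,		/* mid-sized, cache-resident copies */
	COPY_NONTEMPORAL	/* movnti stores that bypass the caches */
};

static enum copy_strategy
pick_copy_strategy(size_t len)
{
	if (len >= largest_level_cache_size / 2)
		return (COPY_NONTEMPORAL);
	if (len <= 4096)
		return (COPY_8BYTE_LOOP);
	return (COPY_REP_MOVSQ);
}

The jg to jge change only affects lengths exactly equal to half the largest-level cache, steering that boundary case onto the non-temporal path.
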
--- a/usr/src/lib/libc/amd64/gen/memset.s	Thu Jul 02 12:58:38 2009 -0400
+++ b/usr/src/lib/libc/amd64/gen/memset.s	Thu Jul 02 10:30:52 2009 -0700
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -29,6 +29,10 @@
  * All rights reserved.
  */
 
+/*
+ * Portions Copyright 2009 Advanced Micro Devices, Inc.
+ */
+
 	.file	"memset.s"
 
 #include <sys/asm_linkage.h>
@@ -47,6 +51,11 @@
  *
  * Pseudo code:
  *
+ * NOTE: On AMD NO_SSE is always set.  Performance on Opteron did not improve
+ * using 16-byte stores.  Setting NO_SSE on AMD should be re-evaluated on
+ * future AMD processors.
+ *
+ *
  * If (size <= 144 bytes) {
  *	do unrolled code (primarily 8-byte stores) regardless of alignment.
  * } else {
@@ -914,17 +923,25 @@
 
 		.balign 16
 L(Loop8byte_nt_move):
-		lea    -0x40(%r8),%r8		# 64
+		lea    -0x80(%r8),%r8		# 128
 		movnti %rdx,(%rdi)
 		movnti %rdx,0x8(%rdi)
 		movnti %rdx,0x10(%rdi)
 		movnti %rdx,0x18(%rdi)
-		cmp    $0x40,%r8
 		movnti %rdx,0x20(%rdi)
 		movnti %rdx,0x28(%rdi)
 		movnti %rdx,0x30(%rdi)
 		movnti %rdx,0x38(%rdi)
-		lea    0x40(%rdi),%rdi
+		cmp    $0x80,%r8
+		movnti %rdx,0x40(%rdi)
+		movnti %rdx,0x48(%rdi)
+		movnti %rdx,0x50(%rdi)
+		movnti %rdx,0x58(%rdi)
+		movnti %rdx,0x60(%rdi)
+		movnti %rdx,0x68(%rdi)
+		movnti %rdx,0x70(%rdi)
+		movnti %rdx,0x78(%rdi)
+		lea    0x80(%rdi),%rdi  
 		jge    L(Loop8byte_nt_move)
 
 		sfence
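
The memset.s hunk widens the non-temporal inner loop from 64 to 128 bytes per iteration, keeping the cmp in the middle of the store burst as before. A minimal C sketch of the same pattern using the SSE2 movnti intrinsic follows; the function name is illustrative, it assumes dst is 8-byte aligned and len is a multiple of 128, and it uses an inner loop where the assembly is fully unrolled.

#include <emmintrin.h>	/* _mm_stream_si64() and _mm_sfence() */
#include <stddef.h>

static void
fill_nontemporal_128(long long *dst, long long pattern, size_t len)
{
	while (len >= 128) {
		/* 16 x 8-byte streaming stores = one 128-byte block. */
		for (int i = 0; i < 16; i++)
			_mm_stream_si64(dst + i, pattern);
		dst += 16;
		len -= 128;
	}
	/* Order the streaming stores, as the sfence above does. */
	_mm_sfence();
}
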
--- a/usr/src/lib/libc/amd64/gen/proc64_id.c	Thu Jul 02 12:58:38 2009 -0400
+++ b/usr/src/lib/libc/amd64/gen/proc64_id.c	Thu Jul 02 10:30:52 2009 -0700
@@ -24,7 +24,9 @@
  * All rights reserved.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
+/*
+ * Portions Copyright 2009 Advanced Micro Devices, Inc.
+ */
 
 #include <sys/types.h>
 #include "proc64_id.h"
@@ -44,8 +46,6 @@
 	uint_t edx;
 };
 
-extern void	__amd64id(void);
-
 /*
  * get_intel_cache_info()
  *	Get cpu cache sizes for optimized 64-bit libc functions mem* and str*.
@@ -103,8 +103,62 @@
 		}
 	}
 
-	__intel_set_cache_sizes(l1_cache_size, l2_cache_size,
-	    largest_level_cache);
+	__set_cache_sizes(l1_cache_size, l2_cache_size, largest_level_cache);
+}
+
+/*
+ * get_amd_cache_info()
+ *      Same as get_intel_cache_info() but for AMD processors
+ */
+static void
+get_amd_cache_info(void)
+{
+	uint_t l1_cache_size = AMD_DFLT_L1_CACHE_SIZE;
+	uint_t l2_cache_size = AMD_DFLT_L2_CACHE_SIZE;
+	uint_t l3_cache_size = 0;
+	uint_t largest_level_cache = 0;
+	struct cpuid_values cpuid_info;
+	uint_t maxeax;
+	int ncores;
+
+	cpuid_info.eax = 0;
+	__libc_get_cpuid(0x80000000, (uint_t *)&cpuid_info, -1);
+	maxeax = cpuid_info.eax;
+
+	if (maxeax >= 0x80000005) {	/* We have L1D info */
+		__libc_get_cpuid(0x80000005, (uint_t *)&cpuid_info, -1);
+		l1_cache_size = ((cpuid_info.ecx >> 24) & 0xff) * 1024;
+	}
+
+	if (maxeax >= 0x80000006) {	/* We have L2 and L3 info */
+		__libc_get_cpuid(0x80000006, (uint_t *)&cpuid_info, -1);
+		l2_cache_size = ((cpuid_info.ecx >> 16) & 0xffff) * 1024;
+		l3_cache_size = ((cpuid_info.edx >> 18) & 0x3fff) * 512 * 1024;
+	}
+
+	/*
+	 * L3 cache is shared between cores on the processor
+	 */
+	if (maxeax >= 0x80000008 && l3_cache_size != 0) {
+		largest_level_cache = l3_cache_size;
+
+		/*
+		 * Divide by number of cores on the processor
+		 */
+		__libc_get_cpuid(0x80000008, (uint_t *)&cpuid_info, -1);
+		ncores = (cpuid_info.ecx & 0xff) + 1;
+		if (ncores > 1)
+			largest_level_cache /= ncores;
+
+		/*
+		 * L3 is a victim cache for L2
+		 */
+		largest_level_cache += l2_cache_size;
+	} else
+		largest_level_cache = l2_cache_size;
+
+		__set_cache_sizes(l1_cache_size, l2_cache_size,
+		    largest_level_cache);
 }
 
 /*
@@ -126,7 +180,7 @@
 	if ((cpuid_info.ebx == 0x68747541) && /* Auth */
 	    (cpuid_info.edx == 0x69746e65) && /* enti */
 	    (cpuid_info.ecx == 0x444d4163)) { /* cAMD */
-		__amd64id();
+		get_amd_cache_info();
 		return;
 	}
 
@@ -174,7 +228,7 @@
 		}
 		__intel_set_memops_method(use_sse);
 	} else {
-		__intel_set_cache_sizes(INTEL_DFLT_L1_CACHE_SIZE,
+		__set_cache_sizes(INTEL_DFLT_L1_CACHE_SIZE,
 		    INTEL_DFLT_L2_CACHE_SIZE,
 		    INTEL_DFLT_LARGEST_CACHE_SIZE);
 		__intel_set_memops_method(use_sse);
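
get_amd_cache_info() above replaces the removed __amd64id assembly with C that walks the AMD extended CPUID leaves: 0x80000005 reports the L1D size in KB in ECX[31:24], 0x80000006 reports the L2 size in KB in ECX[31:16] and the L3 size in 512 KB units in EDX[31:18], and 0x80000008 reports the core count minus one in ECX[7:0]. Because the L3 is shared by the cores and acts as a victim cache for the L2, the largest-level figure becomes L3 divided by the core count plus L2. Below is a standalone sketch of the same decoding, using GCC's <cpuid.h> helper rather than libc's __libc_get_cpuid, purely to check the arithmetic outside the library; the output format is illustrative.

#include <cpuid.h>
#include <stdio.h>

int
main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int l1d = 0, l2 = 0, l3 = 0, ncores = 1, llc;

	if (__get_cpuid(0x80000005, &eax, &ebx, &ecx, &edx))
		l1d = ((ecx >> 24) & 0xff) * 1024;		/* KB units */
	if (__get_cpuid(0x80000006, &eax, &ebx, &ecx, &edx)) {
		l2 = ((ecx >> 16) & 0xffff) * 1024;		/* KB units */
		l3 = ((edx >> 18) & 0x3fff) * 512 * 1024;	/* 512 KB units */
	}
	if (__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
		ncores = (ecx & 0xff) + 1;

	/* L3 is shared by all cores and is a victim cache for L2. */
	llc = (l3 != 0) ? l3 / ncores + l2 : l2;
	(void) printf("L1D %u  L2 %u  LLC %u\n", l1d, l2, llc);
	return (0);
}
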
--- a/usr/src/lib/libc/amd64/gen/proc64_id.h	Thu Jul 02 12:58:38 2009 -0400
+++ b/usr/src/lib/libc/amd64/gen/proc64_id.h	Thu Jul 02 10:30:52 2009 -0700
@@ -24,11 +24,13 @@
  * All rights reserved.
  */
 
+/*
+ * Portions Copyright 2009 Advanced Micro Devices, Inc.
+ */
+
 #ifndef	_PROC64_ID_H
 #define	_PROC64_ID_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/x86_archext.h>
 
 #ifdef	__cplusplus
@@ -56,9 +58,7 @@
  * Cache size defaults for AMD SledgeHammer
  */
 #define	AMD_DFLT_L1_CACHE_SIZE		(64 * 1024)
-#define	AMD_DFLT_L1_HALF_CACHE_SIZE	(32 * 1024)
 #define	AMD_DFLT_L2_CACHE_SIZE		(1024 * 1024)
-#define	AMD_DFLT_L2_HALF_CACHE_SIZE	(512 * 1024)
 
 #ifdef _ASM
 	.extern .memops_method
@@ -66,7 +66,7 @@
 
 void __libc_get_cpuid(int cpuid_function, void *out_reg, int cache_index);
 void __intel_set_memops_method(long sse_level);
-void __intel_set_cache_sizes(long l1_cache_size, long l2_cache_size,
+void __set_cache_sizes(long l1_cache_size, long l2_cache_size,
     long largest_level_cache);
 
 #endif /* _ASM */
--- a/usr/src/lib/libc/amd64/gen/proc64_support.s	Thu Jul 02 12:58:38 2009 -0400
+++ b/usr/src/lib/libc/amd64/gen/proc64_support.s	Thu Jul 02 10:30:52 2009 -0700
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -30,6 +30,10 @@
  */
 
 /*
+ * Portions Copyright 2009 Advanced Micro Devices, Inc.
+ */
+
+/*
  * Assembler support routines to getcpuid information used to set
  * cache size information. Cache information used by memset, strcpy, etc..
  */
@@ -40,15 +44,26 @@
 #include "proc64_id.h"
 
 	.global .memops_method
+	.global .amd64cache1, .amd64cache1half, .amd64cache2, .amd64cache2half
+	.global .largest_level_cache_size
+
 
 /*
- * Defaults for Core 2 Duo
+ * Defaults for Core 2 Duo and AMD's SledgeHammer
  */
 	.data
 	.balign  8
 .memops_method:
 	.int	NO_SSE
 
+	.balign	8
+.amd64cache1:	.quad	AMD_DFLT_L1_CACHE_SIZE
+.amd64cache1half: .quad	AMD_DFLT_L1_CACHE_SIZE/2
+.amd64cache2:	.quad	AMD_DFLT_L2_CACHE_SIZE
+.amd64cache2half: .quad	AMD_DFLT_L2_CACHE_SIZE/2
+.largest_level_cache_size:
+		.int	AMD_DFLT_L2_CACHE_SIZE
+
 /*
  * Get cpuid data.
  * (void)__libc_get_cpuid(int cpuid_function, void *out_reg, int cache_index )
@@ -80,9 +95,10 @@
 
 /*
  * Set cache info global variables used by various libc primitives.
- * __intel_set_cache_sizes(long l1_cache_size, long l2_cache_size, long largest_level_cache);
+ * __set_cache_sizes(long l1_cache_size, long l2_cache_size,
+ *    long largest_level_cache);
  */
-	ENTRY(__intel_set_cache_sizes)
+	ENTRY(__set_cache_sizes)
 	# rdi = l1_cache_size, rsi = l2_cache_size, rdx = largest_level_cache
 
         mov     %rdi,.amd64cache1(%rip)
@@ -95,4 +111,4 @@
 
 	mov	%rdx,.largest_level_cache_size(%rip)
 	ret
-	SET_SIZE(__intel_set_cache_sizes)
+	SET_SIZE(__set_cache_sizes)
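
proc64_support.s now owns the cache-size globals, initialized to the SledgeHammer defaults, and the renamed __set_cache_sizes entry point fills them in, halving the L1 and L2 figures with shr as it stores them. A hypothetical C model of what the routine writes, using plain static variables in place of the .data symbols above:

#include <stdint.h>

/* Illustrative stand-ins for the .data symbols defined above. */
static uint64_t amd64cache1, amd64cache1half;	/* L1D size and half */
static uint64_t amd64cache2, amd64cache2half;	/* L2 size and half */
static uint32_t largest_level_cache_size;	/* threshold used by mem* */

static void
set_cache_sizes(long l1_cache_size, long l2_cache_size,
    long largest_level_cache)
{
	amd64cache1 = l1_cache_size;
	amd64cache1half = l1_cache_size / 2;
	amd64cache2 = l2_cache_size;
	amd64cache2half = l2_cache_size / 2;
	largest_level_cache_size = (uint32_t)largest_level_cache;
}
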