changeset 10583:1058268e7f53

6869408 64-bit libc string functions could be improved with SSE
Contributed by Ling Ma <ling.ma@intel.com>, Xinping Huang <xinping.huang@intel.com> and Robert Kasten <robert.a.kasten@intel.com>
author Edward Gillett <Edward.Gillett@Sun.COM>
date Fri, 18 Sep 2009 14:25:49 -0700
parents cdf5d98e419e
children b6eb77b5edec
files usr/src/lib/libc/amd64/gen/proc64_id.c usr/src/lib/libc/amd64/gen/proc64_id.h usr/src/lib/libc/amd64/gen/proc64_support.s usr/src/lib/libc/amd64/gen/strcmp.s usr/src/lib/libc/amd64/gen/strcpy.s usr/src/lib/libc/amd64/gen/strlen.s
diffstat 6 files changed, 4531 insertions(+), 1533 deletions(-)
--- a/usr/src/lib/libc/amd64/gen/proc64_id.c	Fri Sep 18 12:50:18 2009 -0700
+++ b/usr/src/lib/libc/amd64/gen/proc64_id.c	Fri Sep 18 14:25:49 2009 -0700
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright (c) 2008, Intel Corporation.
+ * Copyright (c) 2009, Intel Corporation.
  * All rights reserved.
  */
 
@@ -226,6 +226,7 @@
 		if (cpuid_info.edx & CPUID_INTC_EDX_SSE2) {
 			use_sse |= USE_SSE2;
 		}
+		use_sse |= USE_BSF;
 		__intel_set_memops_method(use_sse);
 	} else {
 		__set_cache_sizes(INTEL_DFLT_L1_CACHE_SIZE,
--- a/usr/src/lib/libc/amd64/gen/proc64_id.h	Fri Sep 18 12:50:18 2009 -0700
+++ b/usr/src/lib/libc/amd64/gen/proc64_id.h	Fri Sep 18 14:25:49 2009 -0700
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright (c) 2008, Intel Corporation
+ * Copyright (c) 2009, Intel Corporation
  * All rights reserved.
  */
 
@@ -38,7 +38,7 @@
 #endif
 
 /*
- * Defines to determine what SSE instructions can be used for memops or strops.
+ * Defines to determine what SSE instructions can be used for memops or strops
  */
 #define	NO_SSE		0x00	/* Default -- Don't use SSE instructions */
 #define	USE_SSE2	0x01	/* SSE2 */
@@ -46,6 +46,7 @@
 #define	USE_SSSE3	0x04	/* Supplemental SSE3 */
 #define	USE_SSE4_1	0x08	/* SSE 4.1 */
 #define	USE_SSE4_2	0x10	/* SSE 4.2 */
+#define	USE_BSF		0x20	/* Use BSF class of instructions */
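
A minimal C sketch (not part of this patch) of how such a flag word is built
and tested; select_method() is a hypothetical stand-in for the logic in
proc64_id.c, which ORs these bits into use_sse and later tests .memops_method:

	#define USE_SSE2 0x01
	#define USE_BSF  0x20

	static unsigned int
	select_method(int has_sse2, int is_intel)
	{
		unsigned int use_sse = 0;

		if (has_sse2)
			use_sse |= USE_SSE2;	/* mirrors the SSE2 CPUID test */
		if (is_intel)
			use_sse |= USE_BSF;	/* the bit this change adds */
		return (use_sse);
	}
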
 
 /*
  * Cache size defaults for Core 2 Duo
--- a/usr/src/lib/libc/amd64/gen/proc64_support.s	Fri Sep 18 12:50:18 2009 -0700
+++ b/usr/src/lib/libc/amd64/gen/proc64_support.s	Fri Sep 18 14:25:49 2009 -0700
@@ -25,7 +25,7 @@
  */
 
 /*
- * Copyright (c) 2008, Intel Corporation
+ * Copyright (c) 2009, Intel Corporation
  * All rights reserved.
  */
 
@@ -38,8 +38,6 @@
  * cache size information. Cache information used by memset, strcpy, etc..
  */
 
-	.file	"proc64_support.s"
-
 #include <sys/asm_linkage.h>
 #include "proc64_id.h"
 
--- a/usr/src/lib/libc/amd64/gen/strcmp.s	Fri Sep 18 12:50:18 2009 -0700
+++ b/usr/src/lib/libc/amd64/gen/strcmp.s	Fri Sep 18 14:25:49 2009 -0700
@@ -1,540 +1,2049 @@
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
  */
 
 /*
- * Copyright (c) 2002 Advanced Micro Devices, Inc.
- * 
+ * Copyright (c) 2009, Intel Corporation
  * All rights reserved.
- * 
- * Redistribution and  use in source and binary  forms, with or
- * without  modification,  are   permitted  provided  that  the
- * following conditions are met:
- * 
- * + Redistributions  of source  code  must  retain  the  above
- *   copyright  notice,   this  list  of   conditions  and  the
- *   following disclaimer.
- * 
- * + Redistributions  in binary  form must reproduce  the above
- *   copyright  notice,   this  list  of   conditions  and  the
- *   following  disclaimer in  the  documentation and/or  other
- *   materials provided with the distribution.
- * 
- * + Neither the  name of Advanced Micro Devices,  Inc. nor the
- *   names  of  its contributors  may  be  used  to endorse  or
- *   promote  products  derived   from  this  software  without
- *   specific prior written permission.
- * 
- * THIS  SOFTWARE  IS PROVIDED  BY  THE  COPYRIGHT HOLDERS  AND
- * CONTRIBUTORS AS IS AND  ANY EXPRESS OR IMPLIED WARRANTIES,
- * INCLUDING,  BUT NOT  LIMITED TO,  THE IMPLIED  WARRANTIES OF
- * MERCHANTABILITY  AND FITNESS  FOR A  PARTICULAR  PURPOSE ARE
- * DISCLAIMED.  IN  NO  EVENT  SHALL  ADVANCED  MICRO  DEVICES,
- * INC.  OR CONTRIBUTORS  BE LIABLE  FOR ANY  DIRECT, INDIRECT,
- * INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR CONSEQUENTIAL  DAMAGES
- * (INCLUDING,  BUT NOT LIMITED  TO, PROCUREMENT  OF SUBSTITUTE
- * GOODS  OR  SERVICES;  LOSS  OF  USE, DATA,  OR  PROFITS;  OR
- * BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON  ANY THEORY OF
- * LIABILITY,  WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
- * (INCLUDING NEGLIGENCE  OR OTHERWISE) ARISING IN  ANY WAY OUT
- * OF THE  USE  OF  THIS  SOFTWARE, EVEN  IF  ADVISED  OF  THE
- * POSSIBILITY OF SUCH DAMAGE.
- * 
- * It is  licensee's responsibility  to comply with  any export
- * regulations applicable in licensee's jurisdiction.
  */
 
-	.file	"strcmp.s"
+/*
+ *	str[n]cmp - compare chars between two strings
+ */
 
 #include "SYS.h"
-#include "cache.h"
+#include "proc64_id.h"
 
 #define LABEL(s) .strcmp/**/s
 
 #ifdef USE_AS_STRNCMP
+	/*
+	 * Since the counter, %r11, is unsigned, we branch to strcmp_exitz
+	 * if the new counter is greater than the old one (wrapped) or is 0.
+	 */
+#define UPDATE_STRNCMP_COUNTER				\
+	/* calculate left number to compare */		\
+	lea	-16(%rcx, %r11), %r9;			\
+	cmp	%r9, %r11;				\
+	jb	LABEL(strcmp_exitz);			\
+	test	%r9, %r9;				\
+	je	LABEL(strcmp_exitz);			\
+	mov	%r9, %r11
+#else
+#define UPDATE_STRNCMP_COUNTER
+#endif
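
The macro relies on unsigned wraparound; a C sketch of the same check
(update_counter() is a hypothetical helper: remaining is the strncmp length
limit and offset is the 0..15 in-block offset the macro sees in %rcx):

	#include <stdint.h>

	/* Returns 1 when the strncmp length limit is exhausted. */
	static int
	update_counter(uint64_t *remaining, uint64_t offset)
	{
		uint64_t left = *remaining + offset - 16; /* lea -16(%rcx,%r11) */

		if (left > *remaining || left == 0)	/* jb / je exit tests */
			return (1);
		*remaining = left;			/* mov %r9, %r11 */
		return (0);
	}
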
+
+	/*
+	 * This implementation uses SSE to compare up to 16 bytes at a time.
+	 */
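
A sketch of that 16-byte step with SSE2 intrinsics (hypothetical C, not part
of the patch): one pcmpeqb marks null bytes, another marks equal bytes, and
psubb/pmovmskb fold both tests into a single mask that is 0xffff only when
all 16 bytes match and none is '\0':

	#include <emmintrin.h>

	/* Returns nonzero while it is safe to advance 16 more bytes. */
	static int
	chunk16_equal_no_nul(const char *s1, const char *s2)
	{
		__m128i a     = _mm_loadu_si128((const __m128i *)s1);
		__m128i b     = _mm_loadu_si128((const __m128i *)s2);
		__m128i isnul = _mm_cmpeq_epi8(a, _mm_setzero_si128());
		__m128i iseq  = _mm_cmpeq_epi8(a, b);
		/* 0xff only where bytes are equal and not '\0' */
		__m128i ok    = _mm_sub_epi8(iseq, isnul);

		return (_mm_movemask_epi8(ok) == 0xffff);
	}
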
+#ifdef USE_AS_STRNCMP
 	ENTRY(strncmp)
+	test	%rdx, %rdx
+	je	LABEL(strcmp_exitz)
+	mov	%rdx, %r11
 #else
 	ENTRY(strcmp)			/* (const char *, const char *) */
 #endif
-        xor     %ecx, %ecx
+	mov	%esi, %ecx
+	mov	%edi, %eax
+	and	$0x3f, %rcx		/* rsi alignment in cache line */
+	and	$0x3f, %rax		/* rdi alignment in cache line */
+	cmp	$0x30, %ecx
+	ja	LABEL(crosscache)	/* rsi: 16-byte load will cross cache line */
+	cmp	$0x30, %eax
+	ja	LABEL(crosscache)	/* rdi: 16-byte load will cross cache line */
+	movlpd	(%rdi), %xmm1
+	movlpd	(%rsi), %xmm2
+	movhpd	8(%rdi), %xmm1
+	movhpd	8(%rsi), %xmm2
+	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
+	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
+	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
+	psubb	%xmm0, %xmm1		/* packed sub of comparison results*/
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
+	jnz	LABEL(less16bytes)	/* If not, found mismatch or null char */
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)	/* finished comparison */
+#endif
+	add	$16, %rsi		/* prepare to search next 16 bytes */
+	add	$16, %rdi		/* prepare to search next 16 bytes */
+
+	/*
+	 * Determine rdi and rsi string offsets from 16-byte alignment.
+	 * Use the relative offset difference between the two to determine which case
+	 * below to use.
+	 */
+	.p2align 4
+LABEL(crosscache):
+	and	$0xfffffffffffffff0, %rsi	/* force %rsi to be 16 byte aligned */
+	and	$0xfffffffffffffff0, %rdi	/* force %rdi to be 16 byte aligned */
+	mov	$0xffff, %edx			/* for equivalent offset */
+	xor	%r8d, %r8d
+	and	$0xf, %ecx			/* offset of rsi */
+	and	$0xf, %eax			/* offset of rdi */
+	cmp	%eax, %ecx
+	je	LABEL(ashr_0)			/* both strings have the same alignment */
+	ja	LABEL(bigger)
+	mov	%edx, %r8d			/* r8d is offset flag for exit tail */
+	xchg	%ecx, %eax
+	xchg	%rsi, %rdi
+LABEL(bigger):
+	mov	%rcx, %r9
+	sub	%rax, %r9
+	lea	LABEL(unaligned_table)(%rip), %r10
+	movslq	(%r10, %r9, 4), %r9
+	lea	(%r10, %r9), %r10
+	jmp	*%r10				/* jump to corresponding case */
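
A hypothetical C rendering of this dispatch: reduce both pointers to their
offsets within a 16-byte block, swap operands so the difference is
non-negative (the %r8d flag remembers the swap for the exit code), and let
the difference select one of the 16 ashr cases:

	#include <stdint.h>

	static unsigned int
	pick_ashr_case(uintptr_t rdi, uintptr_t rsi, int *swapped)
	{
		unsigned int a = rdi & 0xf;	/* offset of rdi */
		unsigned int c = rsi & 0xf;	/* offset of rsi */

		*swapped = 0;
		if (c < a) {			/* mirrors the xchg above */
			unsigned int t = a; a = c; c = t;
			*swapped = 1;
		}
		return (c - a);		/* 0..15: index into unaligned_table */
	}
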
+
+/*
+ * ashr_0 handles the following cases:
+ * 	str1 offset = str2 offset
+ */
+	.p2align 4
+LABEL(ashr_0):
+	movdqa	(%rsi), %xmm1
+	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char check */
+	pcmpeqb	%xmm1, %xmm0			/* Any null chars? */
+	pcmpeqb	(%rdi), %xmm1			/* compare 16 bytes for equality */
+	psubb	%xmm0, %xmm1			/* packed sub of comparison results*/
+	pmovmskb %xmm1, %r9d
+	shr	%cl, %edx			/* adjust 0xffff for offset */
+	shr	%cl, %r9d			/* adjust for 16-byte offset */
+	sub	%r9d, %edx
+	/*
+	 * %edx must equal %r9d if the remaining (16 - %rcx) bytes of one
+	 * string match the (16 - %rax) bytes of the other and no null char
+	 * was seen.
+	 */
+	jne	LABEL(less32bytes)		/* mismatch or null char */
+	UPDATE_STRNCMP_COUNTER
+	mov	$16, %rcx
+	mov	$16, %r9
+	pxor	%xmm0, %xmm0			/* clear xmm0, may have changed above */
+
+	/*
+	 * Now both strings are aligned at 16-byte boundary. Loop over strings
+	 * checking 32-bytes per iteration.
+	 */
+	.p2align 4
+LABEL(loop_ashr_0):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)		/* mismatch or null char seen */
 
 #ifdef USE_AS_STRNCMP
-	test	%rdx, %rdx		/* (const char *, const char *, size_t) */
-        mov	%r14, -8 (%rsp)
-	mov	%rdx, %r14
-	mov	%edx, %eax
-	jz	LABEL(exitz)		/* early exit */
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
+	add	$16, %rcx
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	add	$16, %rcx
+	jmp	LABEL(loop_ashr_0)
 
-LABEL(aligntry):
-        mov     %rsi, %r8		/* align by "source" */
-        and     $8 - 1, %r8		/* between 0 and 8 characters compared */
-	jz	LABEL(alignafter)
+/*
+ * ashr_1 handles the following cases: 
+ * 	abs(str1 offset - str2 offset) = 15
+ */
+	.p2align 4
+LABEL(ashr_1):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
+	pslldq	$15, %xmm2		/* shift first string to align with second */	
+	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
+	psubb	%xmm0, %xmm2		/* packed sub of comparison results*/
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx		/* adjust 0xffff for offset */
+	shr	%cl, %r9d		/* adjust for 16-byte offset */
+	sub	%r9d, %edx
+	jnz	LABEL(less32bytes)	/* mismatch or null char seen */
+	movdqa	(%rdi), %xmm3
+	UPDATE_STRNCMP_COUNTER
 
-LABEL(align):
-        sub     $8, %r8
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx		/* index for loads */	
+	mov	$1, %r9d		/* rdi bytes already examined. Used in exit code */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page boundary.
+	 * When %r10 goes positive we are crossing a page boundary and
+	 * need to take the nibble path.
+	 */
+	lea	1(%rdi), %r10	 
+	and	$0xfff, %r10		/* offset into 4K page */
+	sub	$0x1000, %r10		/* subtract 4K pagesize */
+	movdqa	%xmm3, %xmm4
 
-        .p2align 4
+	.p2align 4
+LABEL(loop_ashr_1):
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_1)	/* cross page boundary */	
+
+LABEL(gobble_ashr_1):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4		 /* store for next cycle */
 
-LABEL(alignloop):
-        mov     (%rsi, %rcx), %al
-        mov	(%rdi, %rcx), %dl
+	psrldq	$1, %xmm3		
+	pslldq	$15, %xmm2		
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
+
+	pcmpeqb	%xmm1, %xmm0	
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
 
 #ifdef USE_AS_STRNCMP
-	dec	%r14
-	jl	LABEL(exitafter)
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3	
+
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_1)	/* cross page boundary */	
 
-        cmp     %dl, %al		/* check if same character */
-        jne     LABEL(exitafter)
-        test    %al, %al		/* check if character a NUL */
-        jz      LABEL(exitafter)
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4		/* store for next cycle */
 
-        inc     %ecx
+	psrldq	$1, %xmm3			
+	pslldq 	$15, %xmm2		
+	por	%xmm3, %xmm2		/* merge into one 16byte value */
 
-        inc     %r8
-        jnz     LABEL(alignloop)
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
 
 #ifdef USE_AS_STRNCMP
-        test	%r14, %r14
-        jz	LABEL(exitafter)
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3		
+	jmp	LABEL(loop_ashr_1)		
+
+	/*
+	 * The nibble path avoids loads across a page boundary, preventing a
+	 * potential access into unmapped memory.
+	 */
+	.p2align 4
+LABEL(nibble_ashr_1):
+	psrldq	$1, %xmm4		
+	movdqa	(%rsi, %rcx), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm4, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0x7fff, %edx
+	jnz	LABEL(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$15, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10		/* subtract 4K from %r10 */
+	jmp	LABEL(gobble_ashr_1)	
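
The %r10 bookkeeping amounts to the following test (sketch in C, assuming the
4K page size the code hard-codes): the in-page offset is biased by -4096, so
adding 16 and checking for a positive result flags a 16-byte window that
would spill onto the next page:

	#include <stdint.h>

	static int
	crosses_page(uintptr_t next_window)	/* next 16-byte load address */
	{
		long r10 = (long)(next_window & 0xfff) - 0x1000 + 16;

		return (r10 > 0);	/* jg nibble_ashr_N: use byte-safe path */
	}
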
+
+/*
+ * ashr_2 handles the following cases: 
+ * 	abs(str1 offset - str2 offset) = 14
+ */
+	.p2align 4
+LABEL(ashr_2):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$14, %xmm2		
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	LABEL(less32bytes)
+	movdqa	(%rdi), %xmm3
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$2, %r9d	/* rdi bytes already examined. Used in exit code */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page boundary.
+	 * When %r10 goes positive we are crossing a page boundary and
+	 * need to take the nibble path.
+	 */
+	lea	2(%rdi), %r10	 
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	movdqa	%xmm3, %xmm4
+
+	.p2align 4
+LABEL(loop_ashr_2):
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_2)	
+
+LABEL(gobble_ashr_2):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	psrldq	$2, %xmm3		
+	pslldq	$14, %xmm2		
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0	
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
 
-        .p2align 4
-
-LABEL(alignafter):
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
 
-        mov	%r15, -32 (%rsp)
-        mov	%rbp, -24 (%rsp)
-        mov	%rbx, -16 (%rsp)
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_2)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
 
-LABEL(pagealigntry):			/* page align by "destination" */
-        lea	(%rdi, %rcx), %ebp
-	mov	$AMD64PAGESIZE, %r15d
-        and     $AMD64PAGEMASK, %ebp
-        sub	%r15d, %ebp
-	/*
-	 * When we go to 64gobble, %ebp was adjusted at the top of 64loop.
-	 * When we go to 64nibble(crossing page boundary), we'll compare
-	 * 128 byte since we'll fall through to 64gobble. Therefore, %ebp
-	 * needs to be re-adjusted(add 64) when we fall into 64nibble.
-	 * It can be done by adjusting %r15 since %r15 is only used to
-	 * rewind %ebp when crossing page boundary.
-	 */
-	sub	$64, %r15d
+	psrldq	$2, %xmm3			
+	pslldq 	$14, %xmm2		
+	por	%xmm3, %xmm2
 
-LABEL(64):                              /* 64-byte */
-	mov     $0xfefefefefefefeff, %rbx /* magic number */
-
-        .p2align 4
-
-LABEL(64loop):
-	add	$64, %ebp		/* check if "destination" crosses a page unevenly */
-	jle	LABEL(64gobble)
-
-        sub	%r15d, %ebp
-        lea	64 (%rcx), %r8
-
-        .p2align 4
-
-LABEL(64nibble):
-        mov     (%rsi, %rcx), %al
-        mov	(%rdi, %rcx), %dl
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
 
 #ifdef USE_AS_STRNCMP
-	dec	%r14
-	jle	LABEL(exit)
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
 
-        cmp     %dl, %al		/* check if same character */
-        jne     LABEL(exit)
-        test    %al, %al		/* check if character a NUL */
-        jz      LABEL(exit)
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3	
+	jmp	LABEL(loop_ashr_2)		
 
-        inc	%ecx
+	.p2align 4
+LABEL(nibble_ashr_2):
+	psrldq	$2, %xmm4		
+	movdqa	(%rsi, %rcx), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm4, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0x3fff, %edx
+	jnz	LABEL(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$14, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10		/* subtract 4K from %r10 */
+	jmp	LABEL(gobble_ashr_2)	
 
-        cmp	%ecx, %r8d
-        ja	LABEL(64nibble)
+/*
+ * ashr_3 handles the following cases: 
+ * 	abs(str1 offset - str2 offset) = 13
+ */
+	.p2align 4
+LABEL(ashr_3):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$13, %xmm2		
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	LABEL(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
 
-        .p2align 4
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$3, %r9d	/* rdi bytes already examined. Used in exit code */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page boundary.
+	 * When %r10 goes positive we are crossing a page boundary and
+	 * need to take the nibble path.
+	 */
+	lea	3(%rdi), %r10	 
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	movdqa	%xmm3, %xmm4
 
-LABEL(64gobble):
-        mov     (%rsi, %rcx), %rax
-        mov     (%rdi, %rcx), %rdx
+	.p2align 4
+LABEL(loop_ashr_3):
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_3)	
+
+LABEL(gobble_ashr_3):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	psrldq	$3, %xmm3		
+	pslldq	$13, %xmm2		
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
 
 #ifdef USE_AS_STRNCMP
-	sub	$8, %r14
-	jle	LABEL(tail)
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3	
+
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_3)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	psrldq	$3, %xmm3			
+	pslldq 	$13, %xmm2		
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
 
-        mov     %rbx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	LABEL(loop_ashr_3)		
+
+	.p2align 4
+LABEL(nibble_ashr_3):
+	psrldq	$3, %xmm4		
+	movdqa	(%rsi, %rcx), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm4, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0x1fff, %edx
+	jnz	LABEL(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$13, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10		/* subtract 4K from %r10 */
+	jmp	LABEL(gobble_ashr_3)	
 
-        mov     %rbx, %r9
-        add     %rdx, %r9
-        sbb     %r11, %r11
+/*
+ * ashr_4 handles the following cases: 
+ * 	abs(str1 offset - str2 offset) = 12
+ */
+	.p2align 4
+LABEL(ashr_4):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$12, %xmm2		
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	LABEL(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
 
-        xor     %rax, %r8
-        or      %rbx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$4, %r9d	/* rdi bytes already examined. Used in exit code */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page boundary.
+	 * When %r10 goes positive we are crossing a page boundary and
+	 * need to take the nibble path.
+	 */
+	lea	4(%rdi), %r10	 
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	movdqa	%xmm3, %xmm4
+
+	.p2align 4
+LABEL(loop_ashr_4):
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_4)	
+
+LABEL(gobble_ashr_4):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	psrldq	$4, %xmm3		
+	pslldq	$12, %xmm2		
+	por	%xmm3, %xmm2
 
-        xor     %rdx, %r9
-        or      %rbx, %r9
-        sub     %r11, %r9
-        jnz     LABEL(tail)
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
 
-        cmp     %rdx, %rax
-        jne     LABEL(tail)
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_4)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
 
-        mov     8 (%rsi, %rcx), %rax
-        mov     8 (%rdi, %rcx), %rdx
-        add     $8, %ecx
+	psrldq	$4, %xmm3			
+	pslldq 	$12, %xmm2		
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
 
 #ifdef USE_AS_STRNCMP
-	sub	$8, %r14
-	jle	LABEL(tail)
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
 
-        mov     %rbx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	LABEL(loop_ashr_4)		
+
+	.p2align 4
+LABEL(nibble_ashr_4):
+	psrldq	$4, %xmm4		
+	movdqa	(%rsi, %rcx), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm4, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0x0fff, %edx
+	jnz	LABEL(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$12, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10		/* subtract 4K from %r10 */
+	jmp	LABEL(gobble_ashr_4)	
 
-        mov     %rbx, %r9
-        add     %rdx, %r9
-        sbb     %r11, %r11
+/*
+ * ashr_5 handles the following cases: 
+ * 	abs(str1 offset - str2 offset) = 11
+ */
+	.p2align 4
+LABEL(ashr_5):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$11, %xmm2		
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	LABEL(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
 
-        xor     %rax, %r8
-        or      %rbx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$5, %r9d	/* rdi bytes already examined. Used in exit code */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page boundary.
+	 * When %r10 goes positive we are crossing a page boundary and
+	 * need to take the nibble path.
+	 */
+	lea	5(%rdi), %r10	 
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	movdqa	%xmm3, %xmm4
+
+	.p2align 4
+LABEL(loop_ashr_5):
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_5)	
+
+LABEL(gobble_ashr_5):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	psrldq	$5, %xmm3		
+	pslldq	$11, %xmm2		
+	por	%xmm3, %xmm2
 
-        xor     %rdx, %r9
-        or      %rbx, %r9
-        sub     %r11, %r9
-        jnz     LABEL(tail)
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
 
-        cmp     %rdx, %rax
-        jne     LABEL(tail)
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_5)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
 
-        mov     8 (%rsi, %rcx), %rax
-        mov     8 (%rdi, %rcx), %rdx
-        add     $8, %ecx
+	psrldq	$5, %xmm3			
+	pslldq 	$11, %xmm2		
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
 
 #ifdef USE_AS_STRNCMP
-	sub	$8, %r14
-	jle	LABEL(tail)
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
 
-        mov     %rbx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	LABEL(loop_ashr_5)		
+
+	.p2align 4
+LABEL(nibble_ashr_5):
+	psrldq	$5, %xmm4		
+	movdqa	(%rsi, %rcx), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm4, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0x07ff, %edx
+	jnz	LABEL(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$11, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+ 	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10		/* subtract 4K from %r10 */
+	jmp	LABEL(gobble_ashr_5)	
 
-        mov     %rbx, %r9
-        add     %rdx, %r9
-        sbb     %r11, %r11
+/*
+ * ashr_6 handles the following cases: 
+ * 	abs(str1 offset - str2 offset) = 10
+ */
+	.p2align 4
+LABEL(ashr_6):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$10, %xmm2		
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	LABEL(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
 
-        xor     %rax, %r8
-        or      %rbx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$6, %r9d	/* rdi bytes already examined. Used in exit code */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page boundary.
+	 * When %r10 goes positive we are crossing a page boundary and
+	 * need to take the nibble path.
+	 */
+	lea	6(%rdi), %r10	 
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	movdqa	%xmm3, %xmm4
+
+	.p2align 4
+LABEL(loop_ashr_6):
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_6)	
+
+LABEL(gobble_ashr_6):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	psrldq	$6, %xmm3		
+	pslldq	$10, %xmm2		
+	por	%xmm3, %xmm2
 
-        xor     %rdx, %r9
-        or      %rbx, %r9
-        sub     %r11, %r9
-        jnz     LABEL(tail)
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3	
 
-        cmp     %rdx, %rax
-        jne     LABEL(tail)
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_6)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
 
-        mov     8 (%rsi, %rcx), %rax
-        mov     8 (%rdi, %rcx), %rdx
-        add     $8, %ecx
+	psrldq	$6, %xmm3			
+	pslldq 	$10, %xmm2		
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
 
 #ifdef USE_AS_STRNCMP
-	sub	$8, %r14
-	jle	LABEL(tail)
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3	
+	jmp	LABEL(loop_ashr_6)		
+
+	.p2align 4
+LABEL(nibble_ashr_6):
+	psrldq	$6, %xmm4		
+	movdqa	(%rsi, %rcx), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm4, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0x03ff, %edx
+	jnz	LABEL(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$10, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+ 	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10		/* subtract 4K from %r10 */
+	jmp	LABEL(gobble_ashr_6)	
+
+/*
+ * ashr_7 handles the following cases: 
+ * 	abs(str1 offset - str2 offset) = 9
+ */
+	.p2align 4
+LABEL(ashr_7):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$9, %xmm2		
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	LABEL(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$7, %r9d	/* rdi bytes already examined. Used in exit code */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page boundary.
+	 * When %r10 goes positive we are crossing a page boundary and
+	 * need to take the nibble path.
+	 */
+	lea	7(%rdi), %r10	 
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	movdqa	%xmm3, %xmm4
+
+	.p2align 4
+LABEL(loop_ashr_7):
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_7)	
+
+LABEL(gobble_ashr_7):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	psrldq	$7, %xmm3		
+	pslldq	$9, %xmm2		
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
 
-        mov     %rbx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
 
-        mov     %rbx, %r9
-        add     %rdx, %r9
-        sbb     %r11, %r11
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_7)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
 
-        xor     %rax, %r8
-        or      %rbx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
+	psrldq	$7, %xmm3			
+	pslldq 	$9, %xmm2		
+	por	%xmm3, %xmm2
 
-        xor     %rdx, %r9
-        or      %rbx, %r9
-        sub     %r11, %r9
-        jnz     LABEL(tail)
-
-        cmp     %rdx, %rax
-        jne     LABEL(tail)
-
-        mov     8 (%rsi, %rcx), %rax
-        mov     8 (%rdi, %rcx), %rdx
-        add     $8, %ecx
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
 
 #ifdef USE_AS_STRNCMP
-	sub	$8, %r14
-	jle	LABEL(tail)
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	LABEL(loop_ashr_7)		
+
+	.p2align 4
+LABEL(nibble_ashr_7):
+	psrldq	$7, %xmm4		
+	movdqa	(%rsi, %rcx), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm4, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0x01ff, %edx
+	jnz	LABEL(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$9, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+ 	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10		/* subtract 4K from %r10 */
+	jmp	LABEL(gobble_ashr_7)	
+
+/*
+ * ashr_8 handles the following cases: 
+ * 	abs(str1 offset - str2 offset) = 8
+ */
+	.p2align 4
+LABEL(ashr_8):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$8, %xmm2		
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	LABEL(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$8, %r9d	/* rdi bytes already examined. Used in exit code */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page boundary.
+	 * When %r10 goes positive we are crossing a page boundary and
+	 * need to take the nibble path.
+	 */
+	lea	8(%rdi), %r10	 
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	movdqa	%xmm3, %xmm4
+
+	.p2align 4
+LABEL(loop_ashr_8):
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_8)	
+
+LABEL(gobble_ashr_8):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	psrldq	$8, %xmm3		
+	pslldq	$8, %xmm2		
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3	
+
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_8)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	psrldq	$8, %xmm3			
+	pslldq 	$8, %xmm2		
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
 
-        mov     %rbx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	LABEL(loop_ashr_8)		
 
-        mov     %rbx, %r9
-        add     %rdx, %r9
-        sbb     %r11, %r11
+	.p2align 4
+LABEL(nibble_ashr_8):
+	psrldq	$8, %xmm4		
+	movdqa	(%rsi, %rcx), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm4, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0x00ff, %edx
+	jnz	LABEL(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$8, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+ 	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10		/* subtract 4K from %r10 */
+	jmp	LABEL(gobble_ashr_8)	
 
-        xor     %rax, %r8
-        or      %rbx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
+/*
+ * ashr_9 handles the following cases: 
+ * 	abs(str1 offset - str2 offset) = 7
+ */
+	.p2align 4
+LABEL(ashr_9):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$7, %xmm2		
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	LABEL(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
 
-        xor     %rdx, %r9
-        or      %rbx, %r9
-        sub     %r11, %r9
-        jnz     LABEL(tail)
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$9, %r9d	/* rdi bytes already examined. Used in exit code */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page boundary.
+	 * When %r10 goes positive we are crossing a page boundary and
+	 * need to take the nibble path.
+	 */
+	lea	9(%rdi), %r10	 
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	movdqa	%xmm3, %xmm4
 
-        cmp     %rdx, %rax
-        jne     LABEL(tail)
+	.p2align 4
+LABEL(loop_ashr_9):
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_9)	
+
+LABEL(gobble_ashr_9):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
 
-        mov     8 (%rsi, %rcx), %rax
-        mov     8 (%rdi, %rcx), %rdx
-        add     $8, %ecx
+	psrldq	$9, %xmm3		
+	pslldq	$7, %xmm2		
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
 
 #ifdef USE_AS_STRNCMP
-	sub	$8, %r14
-	jle	LABEL(tail)
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_9)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	psrldq	$9, %xmm3			
+	pslldq 	$7, %xmm2		
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3		/* store for next cycle */
+	jmp	LABEL(loop_ashr_9)		
+
+	.p2align 4
+LABEL(nibble_ashr_9):
+	psrldq	$9, %xmm4		
+	movdqa	(%rsi, %rcx), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm4, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0x007f, %edx
+	jnz	LABEL(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$7, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+ 	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10		/* subtract 4K from %r10 */
+	jmp	LABEL(gobble_ashr_9)	
+
+/*
+ * ashr_10 handles the following cases: 
+ * 	abs(str1 offset - str2 offset) = 6
+ */
+	.p2align 4
+LABEL(ashr_10):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$6, %xmm2		
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	LABEL(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$10, %r9d	/* rdi bytes already examined. Used in exit code */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page boundary.
+	 * When %r10 goes positive we are crossing a page boundary and
+	 * need to take the nibble path.
+	 */
+	lea	10(%rdi), %r10	 
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	movdqa	%xmm3, %xmm4
+
+	.p2align 4
+LABEL(loop_ashr_10):
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_10)	
+
+LABEL(gobble_ashr_10):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	psrldq	$10, %xmm3		
+	pslldq	$6, %xmm2		
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
 
-        mov     %rbx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_10)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	psrldq	$10, %xmm3			
+	pslldq 	$6, %xmm2		
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
 
-        mov     %rbx, %r9
-        add     %rdx, %r9
-        sbb     %r11, %r11
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	LABEL(loop_ashr_10)		
+
+	.p2align 4
+LABEL(nibble_ashr_10):
+	psrldq	$10, %xmm4		
+	movdqa	(%rsi, %rcx), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm4, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0x003f, %edx
+	jnz	LABEL(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$6, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+ 	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10		/* subtract 4K from %r10 */
+	jmp	LABEL(gobble_ashr_10)	
 
-        xor     %rax, %r8
-        or      %rbx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
+/*
+ * ashr_11 handles the following cases: 
+ * 	abs(str1 offset - str2 offset) = 5
+ */
+	.p2align 4
+LABEL(ashr_11):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$5, %xmm2		
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	LABEL(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
 
-        xor     %rdx, %r9
-        or      %rbx, %r9
-        sub     %r11, %r9
-        jnz     LABEL(tail)
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$11, %r9d	/* rdi bytes already examined. Used in exit code */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page boundary.
+	 * When %r10 goes positive we are crossing a page boundary and
+	 * need to take the nibble path.
+	 */
+	lea	11(%rdi), %r10	 
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	movdqa	%xmm3, %xmm4
 
-        cmp     %rdx, %rax
-        jne     LABEL(tail)
+	.p2align 4
+LABEL(loop_ashr_11):
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_11)	
+
+LABEL(gobble_ashr_11):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
 
-        mov     8 (%rsi, %rcx), %rax
-        mov     8 (%rdi, %rcx), %rdx
-        add     $8, %ecx
+	psrldq	$11, %xmm3		
+	pslldq	$5, %xmm2		
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
 
 #ifdef USE_AS_STRNCMP
-	sub	$8, %r14
-	jle	LABEL(tail)
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_11)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	psrldq	$11, %xmm3			
+	pslldq 	$5, %xmm2		
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
 
-        mov     %rbx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	LABEL(loop_ashr_11)		
 
-        mov     %rbx, %r9
-        add     %rdx, %r9
-        sbb     %r11, %r11
+	.p2align 4
+LABEL(nibble_ashr_11):
+	psrldq	$11, %xmm4		
+	movdqa	(%rsi, %rcx), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm4, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0x001f, %edx
+	jnz	LABEL(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$5, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+ 	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10		/* subtract 4K from %r10 */
+	jmp	LABEL(gobble_ashr_11)	
 
-        xor     %rax, %r8
-        or      %rbx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
+/*
+ * ashr_12 handles the following cases: 
+ * 	abs(str1 offset - str2 offset) = 4
+ */
+	.p2align 4
+LABEL(ashr_12):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$4, %xmm2		
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	LABEL(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
 
-        xor     %rdx, %r9
-        or      %rbx, %r9
-        sub     %r11, %r9
-        jnz     LABEL(tail)
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$12, %r9d	/* rdi bytes already examined. Used in exit code */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page boundary.
+	 * When %r10 goes positive we are crossing a page boundary and
+	 * need to take the nibble path.
+	 */
+	lea	12(%rdi), %r10	 
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	movdqa	%xmm3, %xmm4
 
-        cmp     %rdx, %rax
-        jne     LABEL(tail)
+	.p2align 4
+LABEL(loop_ashr_12):
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_12)	
+
+LABEL(gobble_ashr_12):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
 
-        mov     8 (%rsi, %rcx), %rax
-        mov     8 (%rdi, %rcx), %rdx
-        add     $8, %ecx
+	psrldq	$12, %xmm3		
+	pslldq	$4, %xmm2		
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0	
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
 
 #ifdef USE_AS_STRNCMP
-	sub	$8, %r14
-	jle	LABEL(tail)
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_12)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	psrldq	$12, %xmm3			
+	pslldq 	$4, %xmm2		
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
+
+#ifdef USE_AS_STRNCMP
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
 
-        mov     %rbx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	LABEL(loop_ashr_12)		
 
-        mov     %rbx, %r9
-        add     %rdx, %r9
-        sbb     %r11, %r11
-
-        xor     %rax, %r8
-        or      %rbx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
+	.p2align 4
+LABEL(nibble_ashr_12):
+	psrldq	$12, %xmm4		
+	movdqa	(%rsi, %rcx), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm4, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0x000f, %edx
+	jnz	LABEL(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$4, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+ 	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10		/* subtract 4K from %r10 */
+	jmp	LABEL(gobble_ashr_12)	
 
-        xor     %rdx, %r9
-        or      %rbx, %r9
-        sub     %r11, %r9
-        jnz     LABEL(tail)
+/*
+ * ashr_13 handles the following cases: 
+ * 	abs(str1 offset - str2 offset) = 3
+ */
+	.p2align 4
+LABEL(ashr_13):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$3, %xmm2		
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	LABEL(less32bytes)
+	movdqa	(%rdi), %xmm3
 
-        cmp     %rdx, %rax
-        jne     LABEL(tail)
-
-        add	$8, %ecx
+	UPDATE_STRNCMP_COUNTER
 
-        jmp	LABEL(64loop)
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$13, %r9d	/* rdi bytes already examined. Used in exit code */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page boundary.
+	 * When %r10 goes positive we are crossing a page boundary and
+	 * need to take the nibble path.
+	 */
+	lea	13(%rdi), %r10	 
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	movdqa	%xmm3, %xmm4
 
-LABEL(64after):
+	.p2align 4
+LABEL(loop_ashr_13):
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_13)	
 
-LABEL(tailtry):
+LABEL(gobble_ashr_13):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	psrldq	$13, %xmm3		
+	pslldq	$3, %xmm2		
+	por	%xmm3, %xmm2
 
-LABEL(tail):				/* byte tail */
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
+
 #ifdef USE_AS_STRNCMP
-	add	$7, %r14
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
 
-        cmp     %dl, %al		/* check if same character */
-        jne     LABEL(exit)
-        test    %al, %al		/* check if character a NUL */
-        jz      LABEL(exit)
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_13)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
 
-        shr	$8, %rax
-        shr	$8, %rdx
+	psrldq	$13, %xmm3			
+	pslldq 	$3, %xmm2		
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
 
 #ifdef USE_AS_STRNCMP
-	dec	%r14
-	jl	LABEL(exit)
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
 
-        cmp     %dl, %al
-        jne     LABEL(exit)
-        test    %al, %al
-        jz      LABEL(exit)
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	LABEL(loop_ashr_13)		
+	
+	.p2align 4
+LABEL(nibble_ashr_13):
+	psrldq	$13, %xmm4		
+	movdqa	(%rsi, %rcx), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm4, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0x0007, %edx
+	jnz	LABEL(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$3, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+ 	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10		/* subtract 4K from %r10 */
+	jmp	LABEL(gobble_ashr_13)	
 
-        shr	$8, %rax
-        shr	$8, %rdx
+/*
+ * ashr_14 handles the following cases: 
+ * 	abs(str1 offset - str2 offset) = 2
+ */
+	.p2align 4
+LABEL(ashr_14):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq  $2, %xmm2		
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	LABEL(less32bytes)
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$14, %r9d	/* rdi bytes already examined. Used in exit code */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page boundary.
+	 * When %r10 goes positive we are crossing a page boundary and
+	 * need to take the nibble path.
+	 */
+	lea	14(%rdi), %r10  
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	movdqa	%xmm3, %xmm4
+
+	.p2align 4
+LABEL(loop_ashr_14):
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_14)	
+
+LABEL(gobble_ashr_14):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	psrldq	$14, %xmm3		
+	pslldq	$2, %xmm2		
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
 
 #ifdef USE_AS_STRNCMP
-	dec	%r14
-	jl	LABEL(exit)
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
 
-        cmp     %dl, %al
-        jne     LABEL(exit)
-        test    %al, %al
-        jz      LABEL(exit)
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_14)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
 
-        shr	$8, %rax
-        shr	$8, %rdx
+	psrldq	$14, %xmm3			
+	pslldq 	$2, %xmm2		
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
 
 #ifdef USE_AS_STRNCMP
-	dec	%r14
-	jl	LABEL(exit)
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
 
-        cmp     %dl, %al
-        jne     LABEL(exit)
-        test    %al, %al
-        jz      LABEL(exit)
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	LABEL(loop_ashr_14)		
+
+	.p2align 4
+LABEL(nibble_ashr_14):
+	psrldq	$14, %xmm4		
+	movdqa	(%rsi, %rcx), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm4, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0x0003, %edx
+	jnz	LABEL(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$2, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+ 	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10		/* subtract 4K from %r10 */
+	jmp	LABEL(gobble_ashr_14)	
 
-        shr	$8, %rax
-        shr	$8, %rdx
+/*
+ * ashr_15 handles the following cases: 
+ * 	abs(str1 offset - str2 offset) = 1
+ */
+	.p2align 4
+LABEL(ashr_15):
+	pxor	%xmm0, %xmm0
+	movdqa	(%rdi), %xmm2
+	movdqa	(%rsi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pslldq	$1, %xmm2		
+	pcmpeqb	%xmm1, %xmm2
+	psubb	%xmm0, %xmm2
+	pmovmskb %xmm2, %r9d
+	shr	%cl, %edx
+	shr	%cl, %r9d
+	sub	%r9d, %edx
+	jnz	LABEL(less32bytes)
+
+	movdqa	(%rdi), %xmm3
+
+	UPDATE_STRNCMP_COUNTER
+
+	pxor	%xmm0, %xmm0
+	mov	$16, %rcx	/* index for loads */
+	mov	$15, %r9d	/* rdi bytes already examined. Used in exit code */
+	/*
+	 * Set up %r10 so we can detect when a load crosses a page boundary.
+	 * When %r10 goes positive we are crossing a page boundary and
+	 * need to take the nibble path.
+	 */
+	lea	15(%rdi), %r10	
+	and	$0xfff, %r10	/* offset into 4K page */
+	sub	$0x1000, %r10	/* subtract 4K pagesize */
+	movdqa	%xmm3, %xmm4
+
+	.p2align 4
+LABEL(loop_ashr_15):
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_15)
+
+LABEL(gobble_ashr_15):
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
+
+	psrldq	$15, %xmm3
+	pslldq	$1, %xmm2
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
 
 #ifdef USE_AS_STRNCMP
-	dec	%r14
-	jl	LABEL(exit)
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
 
-        cmp     %dl, %al
-        jne     LABEL(exit)
-        test    %al, %al
-        jz      LABEL(exit)
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+
+	add	$16, %r10
+	jg	LABEL(nibble_ashr_15)	/* cross page boundary */
+
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	(%rdi, %rcx), %xmm2
+	movdqa	%xmm2, %xmm4
 
-        shr	$8, %eax
-        shr	$8, %edx
+	psrldq	$15, %xmm3
+	pslldq 	$1, %xmm2
+	por	%xmm3, %xmm2
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm2, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0xffff, %edx
+	jnz	LABEL(exit)
 
 #ifdef USE_AS_STRNCMP
-	dec	%r14
-	jl	LABEL(exit)
+	sub	$16, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
 
-        cmp     %dl, %al
-        jne     LABEL(exit)
-        test    %al, %al
-        jz      LABEL(exit)
+	add	$16, %rcx
+	movdqa	%xmm4, %xmm3
+	jmp	LABEL(loop_ashr_15)
 
-        shr	$8, %eax
-        shr	$8, %edx
+	.p2align 4
+LABEL(nibble_ashr_15):
+	psrldq	$15, %xmm4		
+	movdqa	(%rsi, %rcx), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm4, %xmm1
+	psubb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	sub	$0x0001, %edx
+	jnz	LABEL(exit)
+#ifdef USE_AS_STRNCMP
+	cmp	$1, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+ 	pxor	%xmm0, %xmm0
+	sub	$0x1000, %r10		/* subtract 4K from %r10 */
+	jmp	LABEL(gobble_ashr_15)	
+
+	.p2align 4
+LABEL(exit):
+	lea	-16(%r9, %rcx), %rax	/* locate the exact offset for rdi */
+LABEL(less32bytes):
+	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand (rdi) */
+	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand (rsi) */
+	test	%r8d, %r8d
+	jz	LABEL(ret)
+	xchg	%rsi, %rdi		/* recover original order according to flag (%r8d) */
+
+	.p2align 4
+LABEL(ret):
+LABEL(less16bytes):
+	/*
+	 * Check to see if BSF is fast on this processor. If not, use a different
+	 * exit tail.
+	 */
+	testl	$USE_BSF,.memops_method(%rip)
+	jz	LABEL(AMD_exit)
+	bsf	%rdx, %rdx		/* find and store bit index in %rdx */	
 
 #ifdef USE_AS_STRNCMP
-	dec	%r14
-	jl	LABEL(exit)
+	sub	%rdx, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif	
+	xor	%ecx, %ecx		/* clear %ecx */
+	xor	%eax, %eax		/* clear %eax */
+
+	movb	(%rsi, %rdx), %cl
+	movb	(%rdi, %rdx), %al
+
+	sub	%ecx, %eax
+	ret
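
A C sketch of this tail (hypothetical, with the GCC builtin standing in for
bsf): the mask from pmovmskb has its lowest set bit at the first byte that
differed or was '\0', so counting trailing zeros yields the byte index to
compare:

	static int
	bsf_exit_tail(const unsigned char *s1, const unsigned char *s2,
	    unsigned int mask)
	{
		int idx = __builtin_ctz(mask);	/* bsf %rdx, %rdx */

		return (s1[idx] - s2[idx]);	/* movb, movb, sub */
	}
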
+
+#ifdef USE_AS_STRNCMP
+LABEL(strcmp_exitz):
+	xor	%eax, %eax
+	ret
 #endif
 
-        cmp     %dl, %al
-        jne     LABEL(exit)
-        test    %al, %al
-        jz      LABEL(exit)
+	/*
+	 * This exit tail does not use the bsf instruction.
+	 */
+	.p2align 4
+LABEL(AMD_exit):
+	test	%dl, %dl
+	jz	LABEL(next_8_bytes)
+
+	test	$0x01, %dl
+	jnz	LABEL(Byte0)
+
+	test	$0x02, %dl
+	jnz	LABEL(Byte1)
+
+	test	$0x04, %dl
+	jnz	LABEL(Byte2)
+
+	test	$0x08, %dl
+	jnz	LABEL(Byte3)
+
+	test	$0x10, %dl
+	jnz	LABEL(Byte4)
+
+	test	$0x20, %dl
+	jnz	LABEL(Byte5)
+
+	test	$0x40, %dl
+	jnz	LABEL(Byte6)
+
+#ifdef USE_AS_STRNCMP
+	sub	$7, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	movzx	7(%rsi), %ecx
+	movzx	7(%rdi), %eax
+
+	sub	%ecx, %eax
+	ret
 
-        shr	$8, %eax
-        shr	$8, %edx
+	.p2align 4
+LABEL(Byte0):
+	/*
+	 * never need to handle byte 0 for strncmp
+#ifdef USE_AS_STRNCMP
+	sub	$0, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	*/
+	movzx	(%rsi), %ecx
+	movzx	(%rdi), %eax
+
+	sub	%ecx, %eax
+	ret
+
+	.p2align 4
+LABEL(Byte1):
+
+#ifdef USE_AS_STRNCMP
+	sub	$1, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	movzx	1(%rsi), %ecx
+	movzx	1(%rdi), %eax
+
+	sub	%ecx, %eax
+	ret
+
+	.p2align 4
+LABEL(Byte2):
+
+#ifdef USE_AS_STRNCMP
+	sub	$2, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	movzx	2(%rsi), %ecx
+	movzx	2(%rdi), %eax
+
+	sub	%ecx, %eax
+	ret
+
+	.p2align 4
+LABEL(Byte3):
 
 #ifdef USE_AS_STRNCMP
-	dec	%r14
-	jl	LABEL(exit)
+	sub	$3, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
+	movzx	3(%rsi), %ecx
+	movzx	3(%rdi), %eax
+
+	sub	%ecx, %eax
+	ret
 
-        cmp     %dl, %al
-        jne     LABEL(exit)
+	.p2align 4
+LABEL(Byte4):
 
-        .p2align 4,, 15
-
-LABEL(tailafter):
+#ifdef USE_AS_STRNCMP
+	sub	$4, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	movzx	4(%rsi), %ecx
+	movzx	4(%rdi), %eax
 
-LABEL(exit):
-	mov	-32 (%rsp), %r15
-	mov	-24 (%rsp), %rbp
-        mov	-16 (%rsp), %rbx
+	sub	%ecx, %eax
+	ret
 
-        .p2align 4,, 3
+	.p2align 4
+LABEL(Byte5):
 
-LABEL(exitafter):
 #ifdef USE_AS_STRNCMP
-	test	%r14, %r14
-	cmovl	%edx, %eax
+	sub	$5, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
+	movzx	5(%rsi), %ecx
+	movzx	5(%rdi), %eax
 
-	movzx	%al, %eax
-	movzx	%dl, %edx
-	sub	%eax, %edx
-	xchg	%edx, %eax
+	sub	%ecx, %eax
+	ret
+
+	.p2align 4
+LABEL(Byte6):
 
 #ifdef USE_AS_STRNCMP
-LABEL(exitz):
-	mov	-8 (%rsp), %r14
+	sub	$6, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	movzx	6(%rsi), %ecx
+	movzx	6(%rdi), %eax
+
+	sub	%ecx, %eax
+	ret
+
+	.p2align 4
+LABEL(next_8_bytes):
+	add	$8, %rdi
+	add	$8, %rsi
+#ifdef USE_AS_STRNCMP
+	sub	$8, %r11
+	jbe	LABEL(strcmp_exitz)
 #endif
-        ret
+	test	$0x01, %dh
+	jnz	LABEL(Byte0)
+
+	test	$0x02, %dh
+	jnz	LABEL(Byte1)
+
+	test	$0x04, %dh
+	jnz	LABEL(Byte2)
+
+	test	$0x08, %dh
+	jnz	LABEL(Byte3)
+
+	test	$0x10, %dh
+	jnz	LABEL(Byte4)
+
+	test	$0x20, %dh
+	jnz	LABEL(Byte5)
+
+	test	$0x40, %dh
+	jnz	LABEL(Byte6)
 
 #ifdef USE_AS_STRNCMP
+	sub	$7, %r11
+	jbe	LABEL(strcmp_exitz)
+#endif
+	movzx	7(%rsi), %ecx
+	movzx	7(%rdi), %eax
+
+	sub	%ecx, %eax
+	ret
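
The chain of test instructions above is the BSF-free equivalent of finding
the lowest set bit; a hypothetical C sketch of the same scan:

	static int
	first_set_bit16(unsigned int mask)	/* mask from pmovmskb */
	{
		int i;

		for (i = 0; i < 16; i++)
			if (mask & (1u << i))
				return (i);
		return (-1);		/* unreachable: mask is nonzero here */
	}
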
+
+	.pushsection .rodata
+	.p2align 4
+LABEL(unaligned_table):
+	.int	LABEL(ashr_0) - LABEL(unaligned_table)
+	.int	LABEL(ashr_15) - LABEL(unaligned_table)
+	.int	LABEL(ashr_14) - LABEL(unaligned_table)
+	.int	LABEL(ashr_13) - LABEL(unaligned_table)
+	.int	LABEL(ashr_12) - LABEL(unaligned_table)
+	.int	LABEL(ashr_11) - LABEL(unaligned_table)
+	.int	LABEL(ashr_10) - LABEL(unaligned_table)
+	.int	LABEL(ashr_9) - LABEL(unaligned_table)
+	.int	LABEL(ashr_8) - LABEL(unaligned_table)
+	.int	LABEL(ashr_7) - LABEL(unaligned_table)
+	.int	LABEL(ashr_6) - LABEL(unaligned_table)
+	.int	LABEL(ashr_5) - LABEL(unaligned_table)
+	.int	LABEL(ashr_4) - LABEL(unaligned_table)
+	.int	LABEL(ashr_3) - LABEL(unaligned_table)
+	.int	LABEL(ashr_2) - LABEL(unaligned_table)
+	.int	LABEL(ashr_1) - LABEL(unaligned_table)
+	.popsection
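
The table stores 32-bit offsets relative to its own address rather than
absolute pointers, which keeps the dispatch position-independent. A rough C
analogue (hypothetical; the pointer arithmetic is only a sketch of what the
movslq/lea/jmp sequence does above):

	#include <stdint.h>

	typedef void (*handler_t)(void);

	static handler_t
	lookup(const int32_t *table, unsigned int idx)
	{
		/* movslq (%r10,%r9,4), %r9; lea (%r10,%r9), %r10; jmp *%r10 */
		return ((handler_t)((intptr_t)table + table[idx]));
	}
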
+#ifdef USE_AS_STRNCMP
 	SET_SIZE(strncmp)
 #else
 	SET_SIZE(strcmp)		/* (const char *, const char *) */
--- a/usr/src/lib/libc/amd64/gen/strcpy.s	Fri Sep 18 12:50:18 2009 -0700
+++ b/usr/src/lib/libc/amd64/gen/strcpy.s	Fri Sep 18 14:25:49 2009 -0700
@@ -1,862 +1,2582 @@
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
  */
 
 /*
- * Copyright (c) 2002 Advanced Micro Devices, Inc.
- * 
+ * Copyright (c) 2009, Intel Corporation
  * All rights reserved.
- * 
- * Redistribution and  use in source and binary  forms, with or
- * without  modification,  are   permitted  provided  that  the
- * following conditions are met:
- * 
- * + Redistributions  of source  code  must  retain  the  above
- *   copyright  notice,   this  list  of   conditions  and  the
- *   following disclaimer.
- * 
- * + Redistributions  in binary  form must reproduce  the above
- *   copyright  notice,   this  list  of   conditions  and  the
- *   following  disclaimer in  the  documentation and/or  other
- *   materials provided with the distribution.
- * 
- * + Neither the  name of Advanced Micro Devices,  Inc. nor the
- *   names  of  its contributors  may  be  used  to endorse  or
- *   promote  products  derived   from  this  software  without
- *   specific prior written permission.
- * 
- * THIS  SOFTWARE  IS PROVIDED  BY  THE  COPYRIGHT HOLDERS  AND
- * CONTRIBUTORS AS IS AND  ANY EXPRESS OR IMPLIED WARRANTIES,
- * INCLUDING,  BUT NOT  LIMITED TO,  THE IMPLIED  WARRANTIES OF
- * MERCHANTABILITY  AND FITNESS  FOR A  PARTICULAR  PURPOSE ARE
- * DISCLAIMED.  IN  NO  EVENT  SHALL  ADVANCED  MICRO  DEVICES,
- * INC.  OR CONTRIBUTORS  BE LIABLE  FOR ANY  DIRECT, INDIRECT,
- * INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR CONSEQUENTIAL  DAMAGES
- * (INCLUDING,  BUT NOT LIMITED  TO, PROCUREMENT  OF SUBSTITUTE
- * GOODS  OR  SERVICES;  LOSS  OF  USE, DATA,  OR  PROFITS;  OR
- * BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON  ANY THEORY OF
- * LIABILITY,  WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
- * (INCLUDING NEGLIGENCE  OR OTHERWISE) ARISING IN  ANY WAY OUT
- * OF THE  USE  OF  THIS  SOFTWARE, EVEN  IF  ADVISED  OF  THE
- * POSSIBILITY OF SUCH DAMAGE.
- * 
- * It is  licensee's responsibility  to comply with  any export
- * regulations applicable in licensee's jurisdiction.
  */
 
-	.file	"strcpy.s"
-
+/*
+ *	str[n]cpy - copy [n] chars from second operand into first operand
+ */
 #include "SYS.h"
-#include "cache.h"
+#include "proc64_id.h"
 
 #define LABEL(s) .strcpy/**/s
 
 #ifdef USE_AS_STRNCPY
 	ENTRY(strncpy)
+	test	%rdx, %rdx			/* count is 64-bit; check all of it */
+	jz	LABEL(strncpy_exitz)
+	mov	%rdx, %r8
 #else
-	ENTRY(strcpy)                        /* (char *, const char *) */
+	ENTRY(strcpy)				/* (char *, const char *) */
+	xor	%rdx, %rdx
+#endif
+	mov	%esi, %ecx
+	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */
+	and	$0xf, %rcx
+	mov	%rdi, %rax			/* save destination address for return value */
+
+
+	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char checks */
+	pcmpeqb	(%rsi), %xmm0			/* check 16 bytes in src for null */
+	pmovmskb %xmm0, %edx
+	shr	%cl, %edx			/* adjust for offset from 16-byte boundary */
+	test	%edx, %edx			/* edx will be 0 if all chars are non-null */
+	jnz	LABEL(less16bytes)		/* null char found in first 16 bytes examined */
+#ifdef USE_AS_STRNCPY
+	/*
+	 * Check if the count is satisfied in the first 16 bytes examined.
+	 */
+	lea	-16(%r8, %rcx), %r11
+	cmp	$0, %r11
+	jle	LABEL(less16bytes)
+#endif
+	mov	%rcx, %r9			/* rsi alignment offset */
+	or	%edi, %ecx
+	and	$0xf, %ecx
+	lea	-16(%r9), %r10
+	jz	LABEL(ashr_0)			/* src and dest are both 16 byte aligned */
+
+	neg	%r10				/* max src bytes remaining in current dqword */
+
+	pxor	%xmm0, %xmm0			/* re-clear %xmm0; the earlier pcmpeqb may have left result bytes in it */
+	pcmpeqb	16(%rsi), %xmm0			/* check next 16 bytes in src for a null */
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(less32bytes)		/* null char found in first 32 bytes examined */
+
+#ifdef USE_AS_STRNCPY
+	/*
+	 * If strncpy count <= 16 go to exit case
+	 */
+	sub	$16, %r8
+	jbe	LABEL(less32bytes_strncpy_truncation)
+#endif
+	/*
+	 * At least 16 bytes to copy to destination string. Move them now.
+	 * Don't worry about alignment.
+	 */
+	mov	(%rsi, %r9), %rdx
+	mov	%rdx, (%rdi)
+	mov	8(%rsi, %r9), %rdx
+	mov	%rdx, 8(%rdi)
+
+	/*
+	 * The destination rdi is about to be forced to 16-byte alignment.
+	 * Re-calculate rsi and jump to the corresponding src/dest relative
+	 * offset case.
+	 * 	rcx is offset of rsi
+	 * 	rdx is offset of rdi
+	 */
+	and	$0xfffffffffffffff0, %rdi	/* force rdi 16 byte align */
+	mov	%rax, %rdx			/* rax contains original rdi */
+	xor	%rdi, %rdx			/* same effect as "and $0xf, %rdx" */
+#ifdef USE_AS_STRNCPY
+	/*
+	 * Will now do 16-byte aligned stores. Stores may overlap some bytes
+	 * (i.e., be stored twice) if the destination was unaligned. Compensate here.
+	 */
+	add	%rdx, %r8			/* compensate for overlap */
+#endif
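+	/*
+	 * Illustration: if the original rdi was 3 bytes past a 16-byte
+	 * boundary, the first aligned store re-writes those 3 already-copied
+	 * bytes, so in the strncpy case 3 is added back to the remaining
+	 * count above.
+	 */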
+
+	add	$16, %rdi			/* next 16 bytes for dest */
+
+	/*
+	 * Align src to a 16-byte boundary. The adjustment may be up or down
+	 * depending on whether src offset - dest offset > 0 (up) or
+	 * src offset - dest offset < 0 (down).
+	 */
+	sub	%rdx, %r9			/* src offset - dest offset */
+
+	lea	16(%r9, %rsi), %rsi
+	mov	%esi, %ecx			/* for new src offset */
+	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */
+
+	and	$0xf, %ecx			/* new src offset is 0 if rsi/rdi have same alignment */
+	jz	LABEL(ashr_0)
+
+#ifdef USE_AS_STRNCPY
+	xor	%edx, %edx			/* In case unaligned_exit is taken */
+#endif
+	/*
+	 * Jump to case corresponding to source/dest string relative offsets
+	 * Index = (16 + (src offset - dest offset)) % 16
+	 */
+	lea	-16(%rcx), %r10
+	mov	%rcx, %r9
+	neg	%r10				/* max src bytes remaining in current dqword */
+	lea	LABEL(unaligned_table)(%rip), %r11
+	movslq	(%r11, %rcx, 4), %rcx
+	lea	(%r11, %rcx), %rcx
+	jmp	*%rcx
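+
+	/*
+	 * In C terms the dispatch above is roughly (a sketch; shift,
+	 * src_off, dst_off are illustrative names):
+	 *
+	 *	int shift = (16 + (src_off - dst_off)) % 16;
+	 *	void *target = (char *)unaligned_table +
+	 *	    unaligned_table[shift];
+	 *	goto *target;
+	 */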
+
+/*
+ * ashr_0 handles the following cases:
+ * 	src alignment offset = dest alignment offset
+ */
+	.p2align 5
+LABEL(ashr_0):
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa	(%rsi), %xmm1		/* fetch 16 bytes from src string */
+	movdqa	%xmm1, (%rdi)		/* store 16 bytes into dest string */
+	add	$16, %rsi
+	add	$16, %rdi
+	pcmpeqb	(%rsi), %xmm0		/* check 16 bytes in src for a null */
+	pmovmskb %xmm0, %edx
+
+	test	%edx, %edx		/* edx will be 0 if all chars are non-null */
+	jnz	LABEL(aligned_16bytes)	/* exit tail */
+
+LABEL(ashr_0_loop):
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	%xmm1, (%rdi, %rcx)
+	add	$16, %rcx
+	pcmpeqb	(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa  (%rsi, %rcx), %xmm1
+	movdqa  %xmm1, (%rdi, %rcx)
+	add	$16, %rcx
+	pcmpeqb  (%rsi, %rcx), %xmm0
+	pmovmskb  %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa  (%rsi, %rcx), %xmm1
+	movdqa  %xmm1, (%rdi, %rcx)
+
+	add	$16, %rcx
+	pcmpeqb  (%rsi, %rcx), %xmm0
+	pmovmskb  %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa  (%rsi, %rcx), %xmm1
+	movdqa  %xmm1, (%rdi, %rcx)
+	add	$16, %rcx
+	pcmpeqb  (%rsi, %rcx), %xmm0
+	pmovmskb  %xmm0, %edx
+	test	%edx, %edx
+	jz	LABEL(ashr_0_loop)
+	jmp	LABEL(aligned_exit)
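+
+/*
+ * Note the invariant in the loop above: when no null is found, every byte
+ * of the pcmpeqb result is zero, so %xmm0 is all-zero again for the next
+ * compare. One iteration is roughly this C sketch (SSE2 intrinsics from
+ * <emmintrin.h>; strncpy count checks omitted; src, dst, i are
+ * illustrative names for %rsi, %rdi, %rcx):
+ *
+ *	__m128i chunk = _mm_load_si128((const __m128i *)(src + i));
+ *	_mm_store_si128((__m128i *)(dst + i), chunk);
+ *	i += 16;
+ *	__m128i nulls = _mm_cmpeq_epi8(
+ *	    _mm_load_si128((const __m128i *)(src + i)),
+ *	    _mm_setzero_si128());
+ *	if (_mm_movemask_epi8(nulls))
+ *		break;
+ */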
+
+
+/*
+ * ashr_15 handles the following cases:
+ * 	(16 + (src offset - dest offset)) % 16 = 15 
+ *
+ * Based on the above operation, there is no null byte from (%r9 + %rsi)
+ * back to the left edge of this 16-byte block.
+ */
+	.p2align 4
+LABEL(ashr_15):
+	xor	%ecx, %ecx				/* clear index */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
+	jz	LABEL(ashr_15_use_sse2)
+
+	.p2align 4
+LABEL(ashr_15_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $15, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x0f
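+	/*
+	 * The .byte sequence hand-encodes palignr $15, (%rsi, %rcx), %xmm3
+	 * (66 0F 3A 0F /r ib), presumably so the file still assembles with
+	 * assemblers that lack SSSE3 mnemonics.
+	 */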
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $15, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x0f
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_15_use_ssse3)
+
+	.p2align 4
+LABEL(ashr_15_use_sse2):
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$15, %xmm2
+	pslldq	$1, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$15, %xmm2
+	pslldq	$1, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_15_use_sse2)
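+
+/*
+ * The SSE2 paths synthesize palignr from byte shifts: for a realignment
+ * of N, the low block contributes its top N bytes (psrldq $N) and the
+ * high block its low 16 - N bytes (pslldq $(16 - N)). For ashr_15, in
+ * intrinsics terms (a sketch; lo, hi, src, dst, i are illustrative names):
+ *
+ *	__m128i lo = _mm_load_si128((const __m128i *)(src + i));
+ *	__m128i hi = _mm_load_si128((const __m128i *)(src + i + 16));
+ *	__m128i merged = _mm_or_si128(_mm_srli_si128(lo, 15),
+ *	    _mm_slli_si128(hi, 1));
+ *	_mm_store_si128((__m128i *)(dst + i), merged);
+ */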
+
+
+/*
+ * ashr_14 handles the following cases:
+ * 	(16 + (src offset - dest offset)) % 16 = 14 
+ *
+ * Based on the above operation, there is no null byte from (%r9 + %rsi)
+ * back to the left edge of this 16-byte block.
+ */
+	.p2align 4
+LABEL(ashr_14):
+	xor	%ecx, %ecx				/* clear index */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
+	jz	LABEL(ashr_14_use_sse2)
+
+	.p2align 4
+LABEL(ashr_14_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $14, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x0e
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $14, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x0e
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_14_use_ssse3)
+
+	.p2align 4
+LABEL(ashr_14_use_sse2):
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$14, %xmm2
+	pslldq	$2, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$14, %xmm2
+	pslldq	$2, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_14_use_sse2)
+
+
+/*
+ * ashr_13 handles the following cases:
+ * 	(16 + (src offset - dest offset)) % 16 = 13 
+ *
+ * Based on the above operation, there is no null byte from (%r9 + %rsi)
+ * back to the left edge of this 16-byte block.
+ */
+	.p2align 4
+LABEL(ashr_13):
+	xor	%ecx, %ecx				/* clear index */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
+	jz	LABEL(ashr_13_use_sse2)
+
+	.p2align 4
+LABEL(ashr_13_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $13, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x0d
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
+	#palignr $13, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x0d
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
 #ifdef USE_AS_STRNCPY
-	test	%rdx, %rdx		/* (char *, const char *, size_t) */
-	mov	%rdx, %r11
-	jz	LABEL(exitn)		/* early exit */
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_13_use_ssse3)
+
+	.p2align 4
+LABEL(ashr_13_use_sse2):
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$13, %xmm2
+	pslldq	$3, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$13, %xmm2
+	pslldq	$3, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_13_use_sse2)
+
+
+/*
+ * ashr_12 handles the following cases:
+ * 	(16 + (src offset - dest offset)) % 16 = 12 
+ *
+ * Based on the above operation, there is no null byte from (%r9 + %rsi)
+ * back to the left edge of this 16-byte block.
+ */
+	.p2align 4
+LABEL(ashr_12):
+	xor	%ecx, %ecx				/* clear index */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
+	jz	LABEL(ashr_12_use_sse2)
+
+	.p2align 4
+LABEL(ashr_12_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $12, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x0c
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $12, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x0c
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_12_use_ssse3)
+
+	.p2align 4
+LABEL(ashr_12_use_sse2):
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$12, %xmm2
+	pslldq	$4, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$12, %xmm2
+	pslldq	$4, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_12_use_sse2)
+
+
+/*
+ * ashr_11 handles the following cases:
+ * 	(16 + (src offset - dest offset)) % 16 = 11 
+ *
+ * Based on the above operation, there is no null byte from (%r9 + %rsi)
+ * back to the left edge of this 16-byte block.
+ */
+	.p2align 4
+LABEL(ashr_11):
+	xor	%ecx, %ecx				/* clear index */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
+	jz	LABEL(ashr_11_use_sse2)
+
+	.p2align 4
+LABEL(ashr_11_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $11, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x0b
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $11, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x0b
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_11_use_ssse3)
+
+	.p2align 4
+LABEL(ashr_11_use_sse2):
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$11, %xmm2
+	pslldq	$5, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$11, %xmm2
+	pslldq	$5, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_11_use_sse2)
+
+
+/*
+ * ashr_10 handles the following cases:
+ * 	(16 + (src offset - dest offset)) % 16 = 10
+ *
+ * Based on the above operation, there is no null byte from (%r9 + %rsi)
+ * back to the left edge of this 16-byte block.
+ */
+	.p2align 4
+LABEL(ashr_10):
+	xor	%ecx, %ecx				/* clear index */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
+	jz	LABEL(ashr_10_use_sse2)
+
+	.p2align 4
+LABEL(ashr_10_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $10, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x0a
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $10, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x0a
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_10_use_ssse3)
+
+	.p2align 4
+LABEL(ashr_10_use_sse2):
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$10, %xmm2
+	pslldq	$6, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$10, %xmm2
+	pslldq	$6, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_10_use_sse2)
+
+
+/*
+ * ashr_9 handles the following cases:
+ * 	(16 + (src offset - dest offset)) % 16 = 9
+ *
+ * Based on the above operation, there is no null byte from (%r9 + %rsi)
+ * back to the left edge of this 16-byte block.
+ */
+	.p2align 4
+LABEL(ashr_9):
+	xor	%ecx, %ecx				/* clear index */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
+	jz	LABEL(ashr_9_use_sse2)
+
+	.p2align 4
+LABEL(ashr_9_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $9, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x09
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $9, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x09
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_9_use_ssse3)
+
+	.p2align 4
+LABEL(ashr_9_use_sse2):
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$9, %xmm2
+	pslldq	$7, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
-        xor     %edx, %edx
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$9, %xmm2
+	pslldq	$7, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_9_use_sse2)
+
+
+/*
+ * ashr_8 handles the following cases:
+ * 	(16 + (src offset - dest offset)) % 16 = 8
+ *
+ * Based on the above operation, there is no null byte from (%r9 + %rsi)
+ * back to the left edge of this 16-byte block.
+ */
+	.p2align 4
+LABEL(ashr_8):
+	xor	%ecx, %ecx				/* clear index */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
+	jz	LABEL(ashr_8_use_sse2)
+
+	.p2align 4
+LABEL(ashr_8_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $8, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x08
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $8, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x08
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_8_use_ssse3)
+
+	.p2align 4
+LABEL(ashr_8_use_sse2):
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$8, %xmm2
+	pslldq	$8, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$8, %xmm2
+	pslldq	$8, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_8_use_sse2)
+
+
+/*
+ * ashr_7 handles the following cases:
+ * 	(16 + (src offset - dest offset)) % 16 = 7
+ *
+ * Based on the above operation, there is no null byte from (%r9 + %rsi)
+ * back to the left edge of this 16-byte block.
+ */
+	.p2align 4
+LABEL(ashr_7):
+	xor	%ecx, %ecx				/* clear index */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
+	jz	LABEL(ashr_7_use_sse2)
+
+	.p2align 4
+LABEL(ashr_7_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $7, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x07
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $7, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x07
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_7_use_ssse3)
+
+	.p2align 4
+LABEL(ashr_7_use_sse2):
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$7, %xmm2
+	pslldq	$9, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$7, %xmm2
+	pslldq	$9, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_7_use_sse2)
+
 
-LABEL(aligntry):
-        mov     %rsi, %r8		/* align by source */
-        and     $7, %r8
-	jz	LABEL(alignafter)
+/*
+ * ashr_6 handles the following cases:
+ * 	(16 + (src offset - dest offset)) % 16 = 6
+ *
+ * Based on the above operation, there is no null byte from (%r9 + %rsi)
+ * back to the left edge of this 16-byte block.
+ */
+	.p2align 4
+LABEL(ashr_6):
+	xor	%ecx, %ecx				/* clear index */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
+	jz	LABEL(ashr_6_use_sse2)
+
+	.p2align 4
+LABEL(ashr_6_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $6, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x06
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $6, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x06
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_6_use_ssse3)
+
+	.p2align 4
+LABEL(ashr_6_use_sse2):
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$6, %xmm2
+	pslldq	$10, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$6, %xmm2
+	pslldq	$10, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_6_use_sse2)
+
 
-LABEL(align):				/* 8-byte align */
-        sub     $8, %r8
+/*
+ * ashr_5 handles the following cases:
+ * 	(16 + (src offset - dest offset)) % 16 = 5
+ *
+ * Based on the above operation, there is no null byte from (%r9 + %rsi)
+ * back to the left edge of this 16-byte block.
+ */
+	.p2align 4
+LABEL(ashr_5):
+	xor	%ecx, %ecx				/* clear index */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
+	jz	LABEL(ashr_5_use_sse2)
+
+	.p2align 4
+LABEL(ashr_5_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $5, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x05
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $5, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x05
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_5_use_ssse3)
+
+	.p2align 4
+LABEL(ashr_5_use_sse2):
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$5, %xmm2
+	pslldq	$11, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$5, %xmm2
+	pslldq	$11, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_5_use_sse2)
+
+
+/*
+ * ashr_4 handles the following cases:
+ * 	(16 + (src offset - dest offset)) % 16 = 4
+ *
+ * Based on the above operation, there is no null byte from (%r9 + %rsi)
+ * back to the left edge of this 16-byte block.
+ */
+	.p2align 4
+LABEL(ashr_4):
+	xor	%ecx, %ecx				/* clear index */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
+	jz	LABEL(ashr_4_use_sse2)
 
 	.p2align 4
+LABEL(ashr_4_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
 
-LABEL(alignloop):
+	#palignr $4, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x04
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $4, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x04
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_4_use_ssse3)
+
+	.p2align 4
+LABEL(ashr_4_use_sse2):
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$4, %xmm2
+	pslldq	$12, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$4, %xmm2
+	pslldq	$12, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_4_use_sse2)
+
+
+/*
+ * ashr_3 handles the following cases:
+ * 	(16 + (src offset - dest offset)) % 16 = 3
+ *
+ * Based on the above operation, there is no null byte from (%r9 + %rsi)
+ * back to the left edge of this 16-byte block.
+ */
+	.p2align 4
+LABEL(ashr_3):
+	xor	%ecx, %ecx				/* clear index */
 #ifdef USE_AS_STRNCPY
-	dec	%r11
-	jl	LABEL(exitn)
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
+	jz	LABEL(ashr_3_use_sse2)
+
+	.p2align 4
+LABEL(ashr_3_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $3, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x03
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $3, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x03
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_3_use_ssse3)
+
+	.p2align 4
+LABEL(ashr_3_use_sse2):
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$3, %xmm2
+	pslldq	$13, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$3, %xmm2
+	pslldq	$13, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_3_use_sse2)
+
+
+/*
+ * ashr_2 handles the following cases:
+ * 	(16 + (src offset - dest offset)) % 16 = 2
+ *
+ * Based on the above operation, there is no null byte from (%r9 + %rsi)
+ * back to the left edge of this 16-byte block.
+ */
+	.p2align 4
+LABEL(ashr_2):
+	xor	%ecx, %ecx				/* clear index */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
+	jz	LABEL(ashr_2_use_sse2)
+
+	.p2align 4
+LABEL(ashr_2_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
 #endif
 
-        mov     (%rsi, %rdx), %al       /* check if same character */
-        test    %al, %al                /* check if character a NUL */
-        mov     %al, (%rdi, %rdx)
-        jz      LABEL(exit)
+	#palignr $2, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x02
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $2, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x02
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_2_use_ssse3)
+
+	.p2align 4
+LABEL(ashr_2_use_sse2):
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$2, %xmm2
+	pslldq	$14, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
 
-        inc     %edx
-        inc     %r8
-        jnz     LABEL(alignloop)
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$2, %xmm2
+	pslldq	$14, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_2_use_sse2)
+
+
+/*
+ * ashr_1 handles the following cases:
+ * 	(16 + (src offset - dest offset)) % 16 = 1
+ *
+ * Based on the above operation, there is no null byte from (%r9 + %rsi)
+ * back to the left edge of this 16-byte block.
+ */
+	.p2align 4
+LABEL(ashr_1):
+	xor	%ecx, %ecx				/* clear index */
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
+	jz	LABEL(ashr_1_use_sse2)
+
+	.p2align 4
+LABEL(ashr_1_use_ssse3):
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	#palignr $1, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x01
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
 
 #ifdef USE_AS_STRNCPY
-	test	%r11, %r11		/* must check remaining size */
-	jz	LABEL(exitn)		/* If we've already done, exit */
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	movdqa	16(%rsi, %rcx), %xmm3
+	pcmpeqb %xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+	#palignr $1, (%rsi, %rcx), %xmm3
+	.byte	0x66, 0x0F, 0x3A ,0x0F
+	.byte	0x1c, 0x0e, 0x01
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_1_use_ssse3)
+
+	.p2align 4
+LABEL(ashr_1_use_sse2):
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$1, %xmm2
+	pslldq	$15, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	pcmpeqb 16(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(unaligned_exit)
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+ 	jbe	LABEL(strncpy_truncation_unaligned)
+#endif
+
+	movdqa	16(%rsi, %rcx), %xmm3
+	movdqa	(%rsi, %rcx), %xmm2
+
+	psrldq	$1, %xmm2
+	pslldq	$15, %xmm3
+	por	%xmm2, %xmm3
+
+	movdqa	%xmm3, (%rdi, %rcx)
+	add	$16, %rcx
+#ifdef USE_AS_STRNCPY
+	cmp	%r10, %r8
+	jbe	LABEL(unaligned_exit)
+#endif
+	jmp	LABEL(ashr_1_use_sse2)
+
+
+	/*
+	 * Exit tail code:
+	 * Up to 32 bytes are copied in the case of strcpy.
+	 */
+	.p2align 4
+LABEL(less32bytes):
+	xor	%ecx, %ecx
+LABEL(unaligned_exit):
+	add	%r9, %rsi		/* r9 holds offset of rsi */
+	mov	%rcx, %r9
+	mov	%r10, %rcx
+	shl	%cl, %edx		/* after the shl, edx marks the exact bytes to be filled */
+	mov	%r9, %rcx
+	.p2align 4
+LABEL(aligned_exit):
+	add	%rcx, %rdi		/* locate exact address for rdi */
+LABEL(less16bytes):
+	add	%rcx, %rsi		/* locate exact address for rsi */
+LABEL(aligned_16bytes):
+#ifdef USE_AS_STRNCPY
+	/*
+	 * A null was found in the 16 bytes checked. Set the bit in the
+	 * bitmask corresponding to the strncpy count argument, so we copy
+	 * up to the null (inclusive) or the count, whichever comes first.
+	 */
+	mov	$1, %r9d
+	lea	-1(%r8), %rcx
+	shl	%cl, %r9d
+	cmp	$32, %r8
+	ja	LABEL(strncpy_tail)
+	or	%r9d, %edx
+LABEL(strncpy_tail):
+#endif
+	/*
+	 * Check to see if BSF is fast on this processor. If not, use a
+	 * different exit tail.
+	 */
+	testb	$USE_BSF, .memops_method(%rip)
+	jz	LABEL(AMD_exit)
+	bsf	%rdx, %rcx		/* Find byte with null char */
+	lea	LABEL(tail_table)(%rip), %r11
+	movslq	(%r11, %rcx, 4), %rcx
+	lea	(%r11, %rcx), %rcx
+	jmp	*%rcx
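+
+	/*
+	 * bsf returns the index of the lowest set bit in the null mask,
+	 * i.e. the position of the first null byte (for strncpy, capped by
+	 * the count-limit bit OR-ed in above). That index selects a
+	 * fixed-size copy tail. Roughly, as a C sketch (mask and n are
+	 * illustrative names):
+	 *
+	 *	if (n <= 32)
+	 *		mask |= 1u << (n - 1);
+	 *	idx = __builtin_ctz(mask);
+	 *	tail_<idx> then copies idx + 1 bytes and returns
+	 */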
+
+#ifdef USE_AS_STRNCPY
+	/*
+	 * Count reached before null found.
+	 */
+	.p2align 4
+LABEL(less32bytes_strncpy_truncation):
+	xor	%ecx, %ecx
+LABEL(strncpy_truncation_unaligned):
+	add	%r9, %rsi		/* next src char to copy */
+LABEL(strncpy_truncation_aligned):
+	add	%rcx, %rdi
+	add	%rcx, %rsi
+	add	$16, %r8		/* undo the last sub $16; r8 = bytes remaining (1..16) */
+	lea	-1(%r8), %rcx
+	lea	LABEL(tail_table)(%rip), %r11
+	movslq	(%r11, %rcx, 4), %rcx
+	lea	(%r11, %rcx), %rcx
+	jmp	*%rcx
+
+	.p2align 4
+LABEL(strncpy_exitz):
+	mov	%rdi, %rax
+	ret
 #endif
 
 	.p2align 4
-
-LABEL(alignafter):
-
-LABEL(8try):
-        mov     $0xfefefefefefefeff, %rcx
-
-LABEL(8):                               /* 8-byte */
-        mov     (%rsi, %rdx), %rax
-
-LABEL(8loop):
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
-LABEL(8after):
-
-LABEL(64try):
-        mov     _sref_(.amd64cache1half), %r9
-
-LABEL(64):				/* 64-byte */
-
-        .p2align 4
-
-LABEL(64loop):
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
+LABEL(AMD_exit):
+	test	%dl, %dl
+	jz	LABEL(AMD_exit_more_8)
+	test	$0x01, %dl
+	jnz	LABEL(tail_0)
+	test	$0x02, %dl
+	jnz	LABEL(tail_1)
+	test	$0x04, %dl
+	jnz	LABEL(tail_2)
+	test	$0x08, %dl
+	jnz	LABEL(tail_3)
+	test	$0x10, %dl
+	jnz	LABEL(tail_4)
+	test	$0x20, %dl
+	jnz	LABEL(tail_5)
+	test	$0x40, %dl
+	jnz	LABEL(tail_6)
 
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
+	.p2align 4
+LABEL(tail_7):				/* 8 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
 #ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
+	mov	$8, %cl
+	sub	$8, %r8
+	jnz	LABEL(strncpy_fill_tail)
 #endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        cmp     %r9, %rdx
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        lea     8 (%rdx), %rdx
-
-        jbe     LABEL(64loop)
-
-LABEL(64after):
-
-LABEL(pretry):
-        mov     _sref_(.amd64cache2half), %r9
-
-LABEL(pre):                              /* 64-byte prefetch */
-
-        .p2align 4
-
-LABEL(preloop):
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
+	ret
 
 #ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        mov     %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %edx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(tail)
-
-        cmp     %r9, %rdx
+	/*
+	 * The null-terminated src string is shorter than the count. Fill the
+	 * rest of the destination with null chars.
+	 */
+	.p2align 4
+LABEL(strncpy_fill_tail):
+	mov	%rax, %rdx
+	movzx	%cl, %rax
+	mov	%r8, %rcx
+	add	%rax, %rdi
+	xor	%eax, %eax
+	shr	$3, %ecx
+	jz	LABEL(strncpy_fill_less_8)
 
-        mov     %rax, (%rdi, %rdx)
-        prefetchnta 512 + 8 (%rdi, %rdx)	/* 3DNow: use prefetchw */
-        mov     8 (%rsi, %rdx), %rax
-        prefetchnta 512 + 8 (%rsi, %rdx)	/* 3DNow: use prefetch */
-        lea     8 (%rdx), %rdx
-
-        jb	LABEL(preloop)
-
-        .p2align 4
-
-LABEL(preafter):
-
-LABEL(NTtry):
-	mfence
-
-LABEL(NT):				/* 64-byte NT */
-
-        .p2align 4
-
-LABEL(NTloop):
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(NTtail)
-
-        movnti  %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %rdx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(NTtail)
-
-        movnti  %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %rdx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(NTtail)
-
-        movnti  %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %rdx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
-#endif
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(NTtail)
-
-        movnti  %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %rdx
-
-#ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
+	rep	stosq
+LABEL(strncpy_fill_less_8):
+	mov	%r8, %rcx
+	and	$7, %rcx
+	jz	LABEL(strncpy_fill_return)
+LABEL(strncpy_fill_less_7):
+	sub	$1, %ecx
+	mov	%al, (%rdi, %rcx)
+	jnz	LABEL(strncpy_fill_less_7)
+LABEL(strncpy_fill_return):
+	mov	%rdx, %rax
+	ret
 #endif
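
For reference, the fill loop above implements the strncpy(3C) padding rule:
when the source string (terminating null included) is shorter than the count,
every remaining destination byte is set to zero. A minimal editorial C sketch
of the same step (hypothetical helper name; "copied" counts the bytes already
written, null included):

	/*
	 * Editorial sketch, not the library source: pad the rest of the
	 * n-byte destination with null bytes, as strncpy(3C) requires.
	 */
	#include <string.h>

	static char *
	strncpy_fill_tail_sketch(char *dst, size_t copied, size_t n)
	{
		if (n > copied)
			(void) memset(dst + copied, '\0', n - copied);
		return (dst);
	}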
 
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
+	.p2align 4
+LABEL(tail_0):				/* 1 byte */
+	mov	(%rsi), %cl
+	mov	%cl, (%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$1, %cl
+	sub	$1, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(NTtail)
+	.p2align 4
+LABEL(tail_1):				/* 2 bytes */
+	mov	(%rsi), %cx
+	mov	%cx, (%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$2, %cl
+	sub	$2, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
-        movnti  %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %rdx
+	.p2align 4
+LABEL(tail_2):				/* 3 bytes */
+	mov	(%rsi), %cx
+	mov	%cx, (%rdi)
+	mov	1(%rsi), %cx
+	mov	%cx, 1(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$3, %cl
+	sub	$3, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
+	.p2align 4
+LABEL(tail_3):				/* 4 bytes */
+	mov	(%rsi), %ecx
+	mov	%ecx, (%rdi)
 #ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
+	mov	$4, %cl
+	sub	$4, %r8
+	jnz	LABEL(strncpy_fill_tail)
 #endif
+	ret
 
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
+	.p2align 4
+LABEL(tail_4):				/* 5 bytes */
+	mov	(%rsi), %ecx
+	mov	%ecx, (%rdi)
+	mov	1(%rsi), %edx
+	mov	%edx, 1(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$5, %cl
+	sub	$5, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(NTtail)
+	.p2align 4
+LABEL(tail_5):				/* 6 bytes */
+	mov	(%rsi), %ecx
+	mov	%ecx, (%rdi)
+	mov	2(%rsi), %edx
+	mov	%edx, 2(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$6, %cl
+	sub	$6, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
-        movnti  %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %rdx
-
+	.p2align 4
+LABEL(tail_6):				/* 7 bytes */
+	mov	(%rsi), %ecx
+	mov	%ecx, (%rdi)
+	mov	3(%rsi), %edx
+	mov	%edx, 3(%rdi)
 #ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
+	mov	$7, %cl
+	sub	$7, %r8
+	jnz	LABEL(strncpy_fill_tail)
 #endif
+	ret
 
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
+	.p2align 4
+LABEL(tail_8):				/* 9 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	5(%rsi), %edx
+	mov	%edx, 5(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$9, %cl
+	sub	$9, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
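
The tail_N routines above and below avoid byte loops by covering each length
with a few possibly overlapping stores (tail_8, for instance, writes 8 bytes
at offset 0 and 4 bytes at offset 5 to cover 9 bytes). An editorial C sketch
of the overlapping-store idea for lengths between 9 and 16 (hypothetical
helper, not the routines' exact store pattern):

	#include <string.h>

	/*
	 * Cover len bytes (9 <= len <= 16) with one 8-byte store at the
	 * front and one at the back; the two stores may overlap.
	 */
	static void
	overlap_copy_sketch(char *dst, const char *src, size_t len)
	{
		(void) memcpy(dst, src, 8);
		(void) memcpy(dst + len - 8, src + len - 8, 8);
	}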
 
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(NTtail)
+	.p2align 4
+LABEL(AMD_exit_more_8):
+	test	%dh, %dh
+	jz	LABEL(AMD_exit_more_16)
+	test	$0x01, %dh
+	jnz	LABEL(tail_8)
+	test	$0x02, %dh
+	jnz	LABEL(tail_9)
+	test	$0x04, %dh
+	jnz	LABEL(tail_10)
+	test	$0x08, %dh
+	jnz	LABEL(tail_11)
+	test	$0x10, %dh
+	jnz	LABEL(tail_12)
+	test	$0x20, %dh
+	jnz	LABEL(tail_13)
+	test	$0x40, %dh
+	jnz	LABEL(tail_14)
 
-        movnti  %rax, (%rdi, %rdx)
-        mov     8 (%rsi, %rdx), %rax
-        add     $8, %rdx
+	.p2align 4
+LABEL(tail_15):				/* 16 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$16, %cl
+	sub	$16, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
+	.p2align 4
+LABEL(tail_9):				/* 10 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	6(%rsi), %edx
+	mov	%edx, 6(%rdi)
 #ifdef USE_AS_STRNCPY
-	sub	$8, %r11
-	jle	LABEL(tail)
+	mov	$10, %cl
+	sub	$10, %r8
+	jnz	LABEL(strncpy_fill_tail)
 #endif
+	ret
 
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %r10, %r10
+	.p2align 4
+LABEL(tail_10):				/* 11 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	7(%rsi), %edx
+	mov	%edx, 7(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$11, %cl
+	sub	$11, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %r10, %r8
-        jnz     LABEL(NTtail)
+	.p2align 4
+LABEL(tail_11):				/* 12 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %edx
+	mov	%edx, 8(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$12, %cl
+	sub	$12, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
-        movnti  %rax, (%rdi, %rdx)
-	mov     8 (%rsi, %rdx), %rax
-	prefetchnta 768 + 8 (%rsi, %rdx)
-        add     $8, %rdx
+	.p2align 4
+LABEL(tail_12):				/* 13 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	5(%rsi), %rcx
+	mov	%rcx, 5(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$13, %cl
+	sub	$13, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
-        jmp     LABEL(NTloop)
-
-        .p2align 4
+	.p2align 4
+LABEL(tail_13):				/* 14 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	6(%rsi), %rcx
+	mov	%rcx, 6(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$14, %cl
+	sub	$14, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
-LABEL(NTtail):
-	mfence
-
-        .p2align 4
-
-LABEL(NTafter):
-
-LABEL(tailtry):
-
-LABEL(tail):                             /* 1-byte tail */
+	.p2align 4
+LABEL(tail_14):				/* 15 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	7(%rsi), %rcx
+	mov	%rcx, 7(%rdi)
 #ifdef USE_AS_STRNCPY
-	add	$8, %r11
+	mov	$15, %cl
+	sub	$15, %r8
+	jnz	LABEL(strncpy_fill_tail)
 #endif
+	ret
 
-        .p2align 4
+	.p2align 4
+LABEL(AMD_exit_more_16):
+	shr	$16, %edx
+	test	%dl, %dl
+	jz	LABEL(AMD_exit_more_24)
+	test	$0x01, %dl
+	jnz	LABEL(tail_16)
+	test	$0x02, %dl
+	jnz	LABEL(tail_17)
+	test	$0x04, %dl
+	jnz	LABEL(tail_18)
+	test	$0x08, %dl
+	jnz	LABEL(tail_19)
+	test	$0x10, %dl
+	jnz	LABEL(tail_20)
+	test	$0x20, %dl
+	jnz	LABEL(tail_21)
+	test	$0x40, %dl
+	jnz	LABEL(tail_22)
 
-LABEL(tailloop):
+	.p2align 4
+LABEL(tail_23):				/* 24 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
 #ifdef USE_AS_STRNCPY
-	dec	%r11
-	jl	LABEL(exitn)
+	mov	$24, %cl
+	sub	$24, %r8
+	jnz	LABEL(strncpy_fill_tail)
 #endif
+	ret
 
-        test    %al, %al
-        mov     %al, (%rdi, %rdx)
-        jz      LABEL(exit)
+	.p2align 4
+LABEL(tail_16):				/* 17 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %cl
+	mov	%cl, 16(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$17, %cl
+	sub	$17, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
+
+	.p2align 4
+LABEL(tail_17):				/* 18 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %cx
+	mov	%cx, 16(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$18, %cl
+	sub	$18, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
-        inc     %rdx
-
+	.p2align 4
+LABEL(tail_18):				/* 19 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	15(%rsi), %ecx
+	mov	%ecx, 15(%rdi)
 #ifdef USE_AS_STRNCPY
-	dec	%r11
-	jl	LABEL(exitn)
+	mov	$19, %cl
+	sub	$19, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
-	mov	%ah, %al
+	.p2align 4
+LABEL(tail_19):				/* 20 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %ecx
+	mov	%ecx, 16(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$20, %cl
+	sub	$20, %r8
+	jnz	LABEL(strncpy_fill_tail)
 #endif
+	ret
 
-        test    %ah, %ah
-        mov     %ah, (%rdi, %rdx)
-        jz      LABEL(exit)
-
-        inc     %rdx
+	.p2align 4
+LABEL(tail_20):				/* 21 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	13(%rsi), %rcx
+	mov	%rcx, 13(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$21, %cl
+	sub	$21, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
+	.p2align 4
+LABEL(tail_21):				/* 22 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	14(%rsi), %rcx
+	mov	%rcx, 14(%rdi)
 #ifdef USE_AS_STRNCPY
-	dec	%r11
-	jl	LABEL(exitn)
+	mov	$22, %cl
+	sub	$22, %r8
+	jnz	LABEL(strncpy_fill_tail)
 #endif
-
-        shr     $16, %rax
+	ret
 
-        test    %al, %al
-        mov     %al, (%rdi, %rdx)
-        jz      LABEL(exit)
-
-        inc     %rdx
+	.p2align 4
+LABEL(tail_22):				/* 23 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	15(%rsi), %rcx
+	mov	%rcx, 15(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$23, %cl
+	sub	$23, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
-#ifdef USE_AS_STRNCPY
-	dec	%r11
-	jl	LABEL(exitn)
-
-	mov	%ah, %al
-#endif
+	.p2align 4
+LABEL(AMD_exit_more_24):
+	test	$0x01, %dh
+	jnz	LABEL(tail_24)
+	test	$0x02, %dh
+	jnz	LABEL(tail_25)
+	test	$0x04, %dh
+	jnz	LABEL(tail_26)
+	test	$0x08, %dh
+	jnz	LABEL(tail_27)
+	test	$0x10, %dh
+	jnz	LABEL(tail_28)
+	test	$0x20, %dh
+	jnz	LABEL(tail_29)
+	test	$0x40, %dh
+	jnz	LABEL(tail_30)
 
-        test    %ah, %ah
-        mov     %ah, (%rdi, %rdx)
-        jz      LABEL(exit)
+	.p2align 4
+LABEL(tail_31):				/* 32 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	24(%rsi), %rdx
+	mov	%rdx, 24(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$32, %cl
+	sub	$32, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
-        shr     $16, %rax
-        inc     %rdx
-
-        jmp     LABEL(tailloop)
-
-        .p2align 4
-
-LABEL(tailafter):
+	.p2align 4
+LABEL(tail_24):				/* 25 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	21(%rsi), %edx
+	mov	%edx, 21(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$25, %cl
+	sub	$25, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
-LABEL(exit):
+	.p2align 4
+LABEL(tail_25):				/* 26 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	22(%rsi), %edx
+	mov	%edx, 22(%rdi)
 #ifdef USE_AS_STRNCPY
-	test	%r11, %r11
-	mov	%r11, %rcx
+	mov	$26, %cl
+	sub	$26, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
-#ifdef USE_AS_STPCPY
-        lea     (%rdi, %rdx), %r8
-#else
-        mov     %rdi, %r8
+	.p2align 4
+LABEL(tail_26):				/* 27 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	23(%rsi), %edx
+	mov	%edx, 23(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$27, %cl
+	sub	$27, %r8
+	jnz	LABEL(strncpy_fill_tail)
 #endif
+	ret
 
-	jz	2f
-
-	xor	%eax, %eax		/* bzero () would do too, but usually there are only a handfull of bytes left */
-	shr	$3, %rcx
-        lea     1 (%rdi, %rdx), %rdi
-	jz	1f
+	.p2align 4
+LABEL(tail_27):				/* 28 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	24(%rsi), %edx
+	mov	%edx, 24(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$28, %cl
+	sub	$28, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
-	rep	stosq
-
-1:
-	mov	%r11d, %ecx
-	and	$7, %ecx
-	jz	2f
+	.p2align 4
+LABEL(tail_28):				/* 29 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	21(%rsi), %rdx
+	mov	%rdx, 21(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$29, %cl
+	sub	$29, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
-        .p2align 4,, 3
+	.p2align 4
+LABEL(tail_29):				/* 30 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	22(%rsi), %rdx
+	mov	%rdx, 22(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$30, %cl
+	sub	$30, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
-3:
-	dec	%ecx
-	mov	%al, (%rdi, %rcx)
-	jnz	3b
-
-        .p2align 4,, 3
-
-2:
-	mov	%r8, %rax
-        ret
+	.p2align 4
+LABEL(tail_30):				/* 31 bytes */
+	mov	(%rsi), %rcx
+	mov	%rcx, (%rdi)
+	mov	8(%rsi), %rdx
+	mov	%rdx, 8(%rdi)
+	mov	16(%rsi), %rcx
+	mov	%rcx, 16(%rdi)
+	mov	23(%rsi), %rdx
+	mov	%rdx, 23(%rdi)
+#ifdef USE_AS_STRNCPY
+	mov	$31, %cl
+	sub	$31, %r8
+	jnz	LABEL(strncpy_fill_tail)
+#endif
+	ret
 
-#endif
-
-        .p2align 4
+	.pushsection .rodata
+	.p2align 4
+LABEL(tail_table):
+	.int	LABEL(tail_0) - LABEL(tail_table)	/* 1 byte */
+	.int	LABEL(tail_1) - LABEL(tail_table)
+	.int	LABEL(tail_2) - LABEL(tail_table)
+	.int	LABEL(tail_3) - LABEL(tail_table)
+	.int	LABEL(tail_4) - LABEL(tail_table)
+	.int	LABEL(tail_5) - LABEL(tail_table)
+	.int	LABEL(tail_6) - LABEL(tail_table)
+	.int	LABEL(tail_7) - LABEL(tail_table)
+	.int	LABEL(tail_8) - LABEL(tail_table)
+	.int	LABEL(tail_9) - LABEL(tail_table)
+	.int	LABEL(tail_10) - LABEL(tail_table)
+	.int	LABEL(tail_11) - LABEL(tail_table)
+	.int	LABEL(tail_12) - LABEL(tail_table)
+	.int	LABEL(tail_13) - LABEL(tail_table)
+	.int	LABEL(tail_14) - LABEL(tail_table)
+	.int	LABEL(tail_15) - LABEL(tail_table)
+	.int	LABEL(tail_16) - LABEL(tail_table)
+	.int	LABEL(tail_17) - LABEL(tail_table)
+	.int	LABEL(tail_18) - LABEL(tail_table)
+	.int	LABEL(tail_19) - LABEL(tail_table)
+	.int	LABEL(tail_20) - LABEL(tail_table)
+	.int	LABEL(tail_21) - LABEL(tail_table)
+	.int	LABEL(tail_22) - LABEL(tail_table)
+	.int	LABEL(tail_23) - LABEL(tail_table)
+	.int	LABEL(tail_24) - LABEL(tail_table)
+	.int	LABEL(tail_25) - LABEL(tail_table)
+	.int	LABEL(tail_26) - LABEL(tail_table)
+	.int	LABEL(tail_27) - LABEL(tail_table)
+	.int	LABEL(tail_28) - LABEL(tail_table)
+	.int	LABEL(tail_29) - LABEL(tail_table)
+	.int	LABEL(tail_30) - LABEL(tail_table)
+	.int	LABEL(tail_31) - LABEL(tail_table)	/* 32 bytes */
 
-LABEL(exitn):
-#ifdef USE_AS_STPCPY
-        lea     (%rdi, %rdx), %rax
-#else
-        mov     %rdi, %rax
-#endif
-
-        ret
+	.p2align 4
+LABEL(unaligned_table):
+	.int	LABEL(ashr_0) - LABEL(unaligned_table)
+	.int	LABEL(ashr_1) - LABEL(unaligned_table)
+	.int	LABEL(ashr_2) - LABEL(unaligned_table)
+	.int	LABEL(ashr_3) - LABEL(unaligned_table)
+	.int	LABEL(ashr_4) - LABEL(unaligned_table)
+	.int	LABEL(ashr_5) - LABEL(unaligned_table)
+	.int	LABEL(ashr_6) - LABEL(unaligned_table)
+	.int	LABEL(ashr_7) - LABEL(unaligned_table)
+	.int	LABEL(ashr_8) - LABEL(unaligned_table)
+	.int	LABEL(ashr_9) - LABEL(unaligned_table)
+	.int	LABEL(ashr_10) - LABEL(unaligned_table)
+	.int	LABEL(ashr_11) - LABEL(unaligned_table)
+	.int	LABEL(ashr_12) - LABEL(unaligned_table)
+	.int	LABEL(ashr_13) - LABEL(unaligned_table)
+	.int	LABEL(ashr_14) - LABEL(unaligned_table)
+	.int	LABEL(ashr_15) - LABEL(unaligned_table)
+	.popsection
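
Both tables above hold 32-bit self-relative offsets (.int label - table)
rather than absolute addresses, which keeps the data position independent.
A dispatcher recovers a target by adding the selected entry back to the
table's own address; sketched in C with a hypothetical helper (not the
sequence used elsewhere in this file):

	/*
	 * Editorial sketch of self-relative dispatch: entry i stores
	 * (target_i - &table), so target_i = (char *)&table + table[i].
	 */
	extern const int tail_table[32];

	static const void *
	tail_target_sketch(unsigned int i)
	{
		return ((const char *)tail_table + tail_table[i]);
	}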
 
 #ifdef USE_AS_STRNCPY
 	SET_SIZE(strncpy)
 #else
-	SET_SIZE(strcpy)                        /* (char *, const char *) */
+	SET_SIZE(strcpy)			/* (char *, const char *) */
 #endif
--- a/usr/src/lib/libc/amd64/gen/strlen.s	Fri Sep 18 12:50:18 2009 -0700
+++ b/usr/src/lib/libc/amd64/gen/strlen.s	Fri Sep 18 14:25:49 2009 -0700
@@ -1,430 +1,199 @@
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-	
-/*
- * Copyright (c) 2002 Advanced Micro Devices, Inc.
- * 
- * All rights reserved.
- * 
- * Redistribution and  use in source and binary  forms, with or
- * without  modification,  are   permitted  provided  that  the
- * following conditions are met:
- * 
- * + Redistributions  of source  code  must  retain  the  above
- *   copyright  notice,   this  list  of   conditions  and  the
- *   following disclaimer.
- * 
- * + Redistributions  in binary  form must reproduce  the above
- *   copyright  notice,   this  list  of   conditions  and  the
- *   following  disclaimer in  the  documentation and/or  other
- *   materials provided with the distribution.
- * 
- * + Neither the  name of Advanced Micro Devices,  Inc. nor the
- *   names  of  its contributors  may  be  used  to endorse  or
- *   promote  products  derived   from  this  software  without
- *   specific prior written permission.
- * 
- * THIS  SOFTWARE  IS PROVIDED  BY  THE  COPYRIGHT HOLDERS  AND
- * CONTRIBUTORS AS IS AND  ANY EXPRESS OR IMPLIED WARRANTIES,
- * INCLUDING,  BUT NOT  LIMITED TO,  THE IMPLIED  WARRANTIES OF
- * MERCHANTABILITY  AND FITNESS  FOR A  PARTICULAR  PURPOSE ARE
- * DISCLAIMED.  IN  NO  EVENT  SHALL  ADVANCED  MICRO  DEVICES,
- * INC.  OR CONTRIBUTORS  BE LIABLE  FOR ANY  DIRECT, INDIRECT,
- * INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR CONSEQUENTIAL  DAMAGES
- * (INCLUDING,  BUT NOT LIMITED  TO, PROCUREMENT  OF SUBSTITUTE
- * GOODS  OR  SERVICES;  LOSS  OF  USE, DATA,  OR  PROFITS;  OR
- * BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON  ANY THEORY OF
- * LIABILITY,  WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
- * (INCLUDING NEGLIGENCE  OR OTHERWISE) ARISING IN  ANY WAY OUT
- * OF THE  USE  OF  THIS  SOFTWARE, EVEN  IF  ADVISED  OF  THE
- * POSSIBILITY OF SUCH DAMAGE.
- * 
- * It is  licensee's responsibility  to comply with  any export
- * regulations applicable in licensee's jurisdiction.
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
  */
 
-	.file	"strlen.s"
+/*
+ * Copyright (c) 2009, Intel Corporation
+ * All rights reserved.
+ */
+
+/*
+ *	strlen - calculate the length of a string
+ */
 
 #include "SYS.h"
-#include "cache.h"
+#include "proc64_id.h"
 
 #define LABEL(s) .strlen/**/s
 
-	ENTRY(strlen)                /* (const char *s) */
-
-        mov     %rdi, %rsi
-        neg     %rdi
-
-LABEL(aligntry):
-        mov     %rsi , %r8
-        and     $7, %r8d
-	jz	LABEL(alignafter)
-
-LABEL(align):                            /* 8-byte align */
-        sub     $8, %r8
-
-        .p2align 4
-
-LABEL(alignloop):
-        cmpb    $0, (%rsi)
-        je      LABEL(exit)
+	/*
+	 * This implementation uses SSE instructions to compare up to 16 bytes
+	 * at a time looking for the end of the string (null char).
+	 */
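
The same idea as a C sketch with SSE2 intrinsics (an editorial illustration,
not the build's source; it assumes a 16-byte-aligned pointer and a compiler
providing __builtin_ctz, while the assembly below also handles an unaligned
head):

	#include <emmintrin.h>
	#include <stddef.h>

	/*
	 * Compare 16 bytes at a time against zero; the byte mask from
	 * pmovmskb locates the terminating null within a block.
	 */
	static size_t
	strlen_sse2_aligned_sketch(const char *s)
	{
		const char *p = s;
		for (;;) {
			__m128i chunk = _mm_load_si128((const __m128i *)p);
			int mask = _mm_movemask_epi8(
			    _mm_cmpeq_epi8(chunk, _mm_setzero_si128()));
			if (mask != 0)	/* __builtin_ctz ~ bsf */
				return ((size_t)(p - s) +
				    (size_t)__builtin_ctz(mask));
			p += 16;
		}
	}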
+	ENTRY(strlen)			/* (const char *s) */
+	mov	%rdi, %rsi		/* keep original %rdi value */
+	mov	%rsi, %rcx
+	pxor	%xmm0, %xmm0		/* 16 null chars */
+	and	$15, %rcx
+	jz	LABEL(align16_loop)	/* string is 16-byte aligned */
 
-        inc     %rsi
-        inc     %r8
-        jnz     LABEL(alignloop)
-
-        .p2align 4
-
-LABEL(alignafter):
-
-LABEL(56try):
-
-LABEL(56):                               /* 56-byte */
-        mov     (%rsi), %rax
-        mov     $0xfefefefefefefeff, %rcx
-
-LABEL(56loop):
-        mov     %rcx, %r8
-        add     %rax, %r8
-        jnc     LABEL(tail)
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        inc     %r8
-        jnz     LABEL(tail)
+	/*
+	 * Unaligned case. Round down to a 16-byte boundary before comparing
+	 * 16 bytes for a null char. The code then compensates for any extra
+	 * chars preceding the start of the string.
+	 */
+LABEL(unalign16):
+	and	$0xfffffffffffffff0, %rsi
 
-        mov     8 (%rsi), %rax
-        lea     8 (%rsi), %rsi
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        jnc     LABEL(tail)
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        inc     %r8
-        jnz     LABEL(tail)
-
-        mov     8 (%rsi), %rax
-        lea     8 (%rsi), %rsi
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        jnc     LABEL(tail)
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        inc     %r8
-        jnz     LABEL(tail)
+	pcmpeqb	(%rsi), %xmm0
+	lea	16(%rdi), %rsi
+	pmovmskb %xmm0, %edx
 
-        mov     8 (%rsi), %rax
-        lea     8 (%rsi), %rsi
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        jnc     LABEL(tail)
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        inc     %r8
-        jnz     LABEL(tail)
+	shr	%cl, %edx		/* Compensate for bytes preceding the string */
+	test	%edx, %edx
+	jnz	LABEL(exit)
+	sub	%rcx, %rsi		/* no null, adjust to next 16-byte boundary */
+	pxor	%xmm0, %xmm0		/* clear xmm0; pcmpeqb above may have set bytes */
+
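
In C, the round-down-and-shift compensation above amounts to the following
editorial sketch (misalign is the low four address bits; bit i of the result
then corresponds to s[i]):

	#include <emmintrin.h>
	#include <stdint.h>

	static int
	head_mask_sketch(const char *s)
	{
		uintptr_t misalign = (uintptr_t)s & 15;
		const __m128i *blk =
		    (const __m128i *)((uintptr_t)s & ~(uintptr_t)15);
		int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(
		    _mm_load_si128(blk), _mm_setzero_si128()));
		return (mask >> misalign);
	}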
+	.p2align 4
+LABEL(align16_loop):			/* 16-byte aligned */
+	pcmpeqb	(%rsi), %xmm0		/* look for null bytes */
+	pmovmskb %xmm0, %edx		/* one mask bit per byte of %xmm0 into %edx */
 
-        mov     8 (%rsi), %rax
-        lea     8 (%rsi), %rsi
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        jnc     LABEL(tail)
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        inc     %r8
-        jnz     LABEL(tail)
+	add	$16, %rsi		/* prepare to search next 16 bytes */
+	test	%edx, %edx		/* if no null byte, %edx must be 0 */
+	jnz	LABEL(exit)		/* found a null */
 
-        mov     8 (%rsi), %rax
-        lea     8 (%rsi), %rsi
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        jnc     LABEL(tail)
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        inc     %r8
-        jnz     LABEL(tail)
-
-        mov     8 (%rsi), %rax
-        lea     8 (%rsi), %rsi
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        jnc     LABEL(tail)
+	pcmpeqb	(%rsi), %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %rsi
+	test	%edx, %edx
+	jnz	LABEL(exit)
 
-        xor     %rax, %r8
-        or      %rcx, %r8
-        inc     %r8
-        jnz     LABEL(tail)
-
-        mov     8 (%rsi), %rax
-        lea     8 (%rsi), %rsi
-
-LABEL(56after):
-
-LABEL(32):                               /* 32-byte */
-        mov     _sref_(.amd64cache1), %r9
+	pcmpeqb	(%rsi), %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %rsi
+	test	%edx, %edx
+	jnz	LABEL(exit)
 
-        .p2align 4
-
-LABEL(32loop):
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %rdx, %rdx
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %rdx, %r8
-        jnz     LABEL(tail)
-
-        mov     8 (%rsi), %rax
-        add     $8, %rsi
+	pcmpeqb	(%rsi), %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %rsi
+	test	%edx, %edx
+	jz	LABEL(align16_loop)
 
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %rdx, %rdx
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %rdx, %r8
-        jnz     LABEL(tail)
-
-        mov     8 (%rsi), %rax
-        add     $8, %rsi
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %rdx, %rdx
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %rdx, %r8
-        jnz     LABEL(tail)
-
-        mov     8 (%rsi), %rax
-        add     $8, %rsi
+	.p2align 4
+LABEL(exit):
+	neg	%rdi
+	/*
+	 * Check whether BSF is fast on this processor. If not, use a
+	 * different exit tail that finds the first set bit (the null byte
+	 * match) with explicit bit tests.
+	 */
+	testl	$USE_BSF, .memops_method(%rip)
+	jz	LABEL(AMD_exit)
 
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %rdx, %rdx
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %rdx, %r8
-        jnz     LABEL(tail)
-
-        mov     8 (%rsi), %rax
-        add     $8, %rsi
+	lea	-16(%rdi, %rsi), %rax	/* calculate exact offset */
+	bsf	%edx, %ecx		/* Least significant 1 bit is index of null */
+	lea	(%rax, %rcx), %rax
+	ret
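
Both exit tails reduce the pcmpeqb byte mask to the index of its least
significant set bit: bsf does it in one instruction, while the AMD_exit path
below walks the bits explicitly. The two strategies as an editorial C sketch
(mask is assumed nonzero; __builtin_ctz is the bsf analogue):

	static int
	null_index_bsf_sketch(int mask)
	{
		return (__builtin_ctz(mask));
	}

	static int
	null_index_ladder_sketch(int mask)
	{
		int i = 0;
		while ((mask & 1) == 0) {	/* mirrors the bit tests below */
			mask >>= 1;
			i++;
		}
		return (i);
	}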
 
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %rdx, %rdx
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %rdx, %r8
-        jnz     LABEL(tail)
-
-        mov     8 (%rsi), %rax
-        add     $8, %rsi
+	/*
+	 * This exit tail does not use the bsf instruction.
+	 */
+	.p2align 4
+LABEL(AMD_exit):
+	lea	-16(%rdi, %rsi), %rax
+	test	%dl, %dl
+	jz	LABEL(exit_high)
+	test	$0x01, %dl
+	jnz	LABEL(exit_tail0)
 
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %rdx, %rdx
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %rdx, %r8
-        jnz     LABEL(tail)
+	test	$0x02, %dl
+	jnz	LABEL(exit_tail1)
 
-        mov     8 (%rsi), %rax
-        add     $8, %rsi
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %rdx, %rdx
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %rdx, %r8
-        jnz     LABEL(tail)
+	test	$0x04, %dl
+	jnz	LABEL(exit_tail2)
 
-        mov     8 (%rsi), %rax
-        add     $8, %rsi
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %rdx, %rdx
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %rdx, %r8
-        jnz     LABEL(tail)
+	test	$0x08, %dl
+	jnz	LABEL(exit_tail3)
 
-        sub     $32, %r9
-
-        mov     8 (%rsi), %rax
-        lea     8 (%rsi), %rsi
+	test	$0x10, %dl
+	jnz	LABEL(exit_tail4)
 
-        jbe     LABEL(32loop)
-
-LABEL(32after):
-
-LABEL(pretry):
-
-LABEL(pre):                              /* 64-byte prefetch */
+	test	$0x20, %dl
+	jnz	LABEL(exit_tail5)
 
-        .p2align 4
-
-LABEL(preloop):
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %rdx, %rdx
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %rdx, %r8
-        jnz     LABEL(tail)
+	test	$0x40, %dl
+	jnz	LABEL(exit_tail6)
+	add	$7, %rax
+	ret
 
-        mov     8 (%rsi), %rax
-        add     $8, %rsi
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %rdx, %rdx
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %rdx, %r8
-        jnz     LABEL(tail)
+	.p2align 4
+LABEL(exit_high):
+	add	$8, %rax
+	test	$0x01, %dh
+	jnz	LABEL(exit_tail0)
 
-        mov     8 (%rsi), %rax
-        add     $8, %rsi
+	test	$0x02, %dh
+	jnz	LABEL(exit_tail1)
 
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %rdx, %rdx
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %rdx, %r8
-        jnz     LABEL(tail)
+	test	$0x04, %dh
+	jnz	LABEL(exit_tail2)
 
-        mov     8 (%rsi), %rax
-        add     $8, %rsi
+	test	$0x08, %dh
+	jnz	LABEL(exit_tail3)
 
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %rdx, %rdx
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %rdx, %r8
-        jnz     LABEL(tail)
+	test	$0x10, %dh
+	jnz	LABEL(exit_tail4)
 
-        mov     8 (%rsi), %rax
-        add     $8, %rsi
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %rdx, %rdx
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %rdx, %r8
-        jnz     LABEL(tail)
+	test	$0x20, %dh
+	jnz	LABEL(exit_tail5)
 
-        mov     8 (%rsi), %rax
-        add     $8, %rsi
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %rdx, %rdx
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %rdx, %r8
-        jnz     LABEL(tail)
+	test	$0x40, %dh
+	jnz	LABEL(exit_tail6)
+	add	$7, %rax
+	ret
 
-        mov     8 (%rsi), %rax
-        add     $8, %rsi
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %rdx, %rdx
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %rdx, %r8
-        jnz     LABEL(tail)
+	.p2align 4
+LABEL(exit_tail0):
+	xor	%ecx, %ecx
+	ret
 
-        mov     8 (%rsi), %rax
-        add     $8, %rsi
-
-        mov     %rcx, %r8
-        add     %rax, %r8
-        sbb     %rdx, %rdx
-
-        xor     %rax, %r8
-        or      %rcx, %r8
-        sub     %rdx, %r8
-        jnz     LABEL(tail)
+	.p2align 4
+LABEL(exit_tail1):
+	add	$1, %rax
+	ret
 
-        prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
-
-        mov     8 (%rsi), %rax
-        add     $8, %rsi
-
-        jmp     LABEL(preloop)
-
-        .p2align 4
-
-LABEL(preafter):
+	.p2align 4
+LABEL(exit_tail2):
+	add	$2, %rax
+	ret
 
-LABEL(tailtry):
-
-LABEL(tail):                             /* 4-byte tail */
-
-LABEL(tailloop):
-        test    %al, %al
-        jz      LABEL(exit)
-
-        inc     %rsi
-
-        test    %ah, %ah
-        jz      LABEL(exit)
+	.p2align 4
+LABEL(exit_tail3):
+	add	$3, %rax
+	ret
 
-        inc     %rsi
-
-        test    $0x00ff0000, %eax
-        jz      LABEL(exit)
-
-        inc     %rsi
-
-        test    $0xff000000, %eax
-        jz      LABEL(exit)
-
-        inc     %rsi
+	.p2align 4
+LABEL(exit_tail4):
+	add	$4, %rax
+	ret
 
-        shr     $32, %rax
-        jmp     LABEL(tailloop)
-
-LABEL(tailafter):
+	.p2align 4
+LABEL(exit_tail5):
+	add	$5, %rax
+	ret
 
-        .p2align 4
-
-LABEL(exit):
-        lea     (%rdi, %rsi), %rax
-        ret
-
+	.p2align 4
+LABEL(exit_tail6):
+	add	$6, %rax
+	ret
 	SET_SIZE(strlen)