view usr/src/cmd/sgs/rtld/amd64/boot_elf.s @ 13675:a9ae30c28ee4

2413 %ymm* need to be preserved on way through PLT Reviewed by: Richard Lowe <richlowe@richlowe.net> Reviewed by: Joshua M. Clulow <josh@sysmgr.org> Reviewed by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> Approved by: Albert Lee <trisk@nexenta.com>
author Robert Mustacchi <rm@joyent.com>
date Wed, 25 Apr 2012 00:27:21 -0400
parents ae3aa141e3fa
children
line wrap: on
line source

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2012 Joyent, Inc. All rights reserved.
 */

#if	defined(lint)

#include	<sys/types.h>
#include	<_rtld.h>
#include	<_audit.h>
#include	<_elf.h>
#include	<sys/regset.h>
#include	<sys/auxv_386.h>

/*
 * Lint-only stand-in for elf_plt_trace().  It exists solely so that lint
 * has a C definition to check against; the real implementation is the
 * assembly in the non-lint branch of this #if.  The stub always returns 0.
 */
/* ARGSUSED0 */
int
elf_plt_trace()
{
	return (0);
}
#else

#include	<link.h>
#include	<_audit.h>
#include	<sys/asm_linkage.h>
#include	<sys/auxv_386.h>

	.file	"boot_elf.s"
	.text

/*
 * On entry the 'glue code' has already  done the following:
 *
 *	pushq	%rbp
 *	movq	%rsp, %rbp
 *	subq	$0x10, %rsp
 *	leaq	trace_fields(%rip), %r11
 *	movq	%r11, -0x8(%rbp)
 *	movq	$elf_plt_trace, %r11
 *	jmp	*%r11
 *
 * so - -8(%rbp) contains the dyndata ptr
 *
 *	0x0	Addr		*reflmp
 *	0x8	Addr		*deflmp
 *	0x10	Word		symndx
 *	0x14	Word		sb_flags
 *	0x18	Sym		symdef.st_name
 *	0x1c			symdef.st_info
 *	0x1d			symdef.st_other
 *	0x1e			symdef.st_shndx
 *	0x20			symdef.st_value
 *	0x28			symdef.st_size
 *
 * Also note - on entry 16 bytes have already been subtracted
 * from the %rsp.  The first 8 bytes is for the dyn_data_ptr,
 * the second 8 bytes are to align the stack and are available
 * for use.
 */
#define	REFLMP_OFF		0x0	/* Addr *reflmp */
#define	DEFLMP_OFF		0x8	/* Addr *deflmp */
#define	SYMNDX_OFF		0x10	/* Word symndx */
#define	SBFLAGS_OFF		0x14	/* Word sb_flags */
#define	SYMDEF_OFF		0x18	/* start of Sym symdef (st_name) */
#define	SYMDEF_VALUE_OFF	0x20	/* symdef.st_value */
/*
 * Local stack space storage for elf_plt_trace is allocated
 * as follows:
 *
 *  First - before we got here - %rsp has been decremented
 *  by 0x10 to make space for the dyndata ptr (and another
 *  free word).  In addition to that, we create space
 *  for the following:
 *
 *	La_amd64_regs	    8 * 8:	64
 *	prev_stack_size	    8		 8
 *	Saved regs:
 *	    %rdi			 8
 *	    %rsi			 8
 *	    %rdx			 8
 *	    %rcx			 8
 *	    %r8				 8
 *	    %r9				 8
 *	    %r10			 8
 *	    %r11			 8
 *	    %rax			 8
 *				    =======
 *			    Subtotal:	144
 *
 *	Saved Media Regs (used to pass floating point args):
 *	    %ymm0 - %ymm7   32 * 8:	256
 *				    =======
 *			    Subtotal:	400
 *
 *	Pad up to a multiple of 32:	 16
 *				    =======
 *			    Total:	416 (32byte aligned)
 *
 *  The total must be a multiple of 32.  %rsp is rounded down to a 32-byte
 *  boundary before the subtraction and every media register slot sits at a
 *  multiple of 32 from the resulting %rsp, so a frame size that is not a
 *  multiple of 32 would leave each vmovdqa of a %ymm register pointing at
 *  a misaligned address.  The 16 pad bytes lie between the %rsp relative
 *  save area and the %rbp relative fields, so no offset below moves.
 *
 *  So - will subtract the following to create enough space
 *
 *	-8(%rbp)	store dyndata ptr
 *	-16(%rbp)	store call destination
 *	-80(%rbp)	space for La_amd64_regs
 *	-88(%rbp)	prev stack size
 *  The next %rbp offsets are only true if the alignment of %rsp at
 *  function entry did not have to move %rsp (that is, %rbp - 16 was
 *  already 32-byte aligned).  See note above SPRDIOFF for why we use
 *  %rsp alignment to access these stack fields.
 *	-112(%rbp)	entering %rdi
 *	-120(%rbp)	entering %rsi
 *	-128(%rbp)	entering %rdx
 *	-136(%rbp)	entering %rcx
 *	-144(%rbp)	entering %r8
 *	-152(%rbp)	entering %r9
 *	-160(%rbp)	entering %r10
 *	-168(%rbp)	entering %r11
 *	-176(%rbp)	entering %rax
 *	-208(%rbp)	entering %xmm0
 *	-240(%rbp)	entering %xmm1
 *	-272(%rbp)	entering %xmm2
 *	-304(%rbp)	entering %xmm3
 *	-336(%rbp)	entering %xmm4
 *	-368(%rbp)	entering %xmm5
 *	-400(%rbp)	entering %xmm6
 *	-432(%rbp)	entering %xmm7
 *
 */
#define	SPDYNOFF    -8
#define	SPDESTOFF   -16
#define	SPLAREGOFF  -80
#define	SPPRVSTKOFF -88

/*
 * The next set of offsets are relative to %rsp.
 * We guarantee %rsp is 32-byte aligned.  This guarantees the
 * ymm registers are saved to 32-byte aligned addresses.
 * %rbp may only be 8 byte aligned if we came in from non-ABI compliant code.
 *
 * As the registers may either be xmm or ymm, the names say xmm, but the
 * slots are 32 bytes apart so that either fits.
 */
#define	SPLOCALSZ	$416	/* local frame size; must be a multiple of 32 */
#define	SPRDIOFF	320
#define	SPRSIOFF	312
#define	SPRDXOFF	304
#define	SPRCXOFF	296
#define	SPR8OFF		288
#define	SPR9OFF		280
#define	SPR10OFF	272
#define	SPR11OFF	264
#define	SPRAXOFF	256
#define	SPXMM0OFF	224
#define	SPXMM1OFF	192
#define	SPXMM2OFF	160
#define	SPXMM3OFF	128
#define	SPXMM4OFF	96
#define	SPXMM5OFF	64
#define	SPXMM6OFF	32
#define	SPXMM7OFF	0

	/* See elf_rtbndr for explanation behind org_scapset */
	.extern org_scapset
	.globl	elf_plt_trace
	.type	elf_plt_trace,@function
	.align 16
elf_plt_trace:
	/*
	 * Enforce 32-byte stack alignment here.
	 * The andq rounds %rsp down to the nearest multiple of 32, and
	 * SPLOCALSZ is itself a multiple of 32, so %rsp is still 32-byte
	 * aligned once the local storage has been carved out.  The %ymm
	 * saves below depend on that.
	 */
	andq    $-32, %rsp	/* enforce 32-byte stack alignment */
	subq	SPLOCALSZ, %rsp	/* create some local storage */

	movq	%rdi, SPRDIOFF(%rsp)
	movq	%rsi, SPRSIOFF(%rsp)
	movq	%rdx, SPRDXOFF(%rsp)
	movq	%rcx, SPRCXOFF(%rsp)
	movq	%r8, SPR8OFF(%rsp)
	movq	%r9, SPR9OFF(%rsp)
	movq	%r10, SPR10OFF(%rsp)
	movq	%r11, SPR11OFF(%rsp)
	movq	%rax, SPRAXOFF(%rsp)

	/*
	 * %r9 and %edx are free to use as scratch from here on, their
	 * entry values have just been saved.  Pick the 256-bit save path
	 * when the AVX capability bit is set.
	 */
	movq	org_scapset@GOTPCREL(%rip),%r9
	movq	(%r9),%r9
	movl	(%r9),%edx
	testl	$AV_386_AVX,%edx
	jne	.trace_save_ymm

.trace_save_xmm:
	movdqa	%xmm0, SPXMM0OFF(%rsp)
	movdqa	%xmm1, SPXMM1OFF(%rsp)
	movdqa	%xmm2, SPXMM2OFF(%rsp)
	movdqa	%xmm3, SPXMM3OFF(%rsp)
	movdqa	%xmm4, SPXMM4OFF(%rsp)
	movdqa	%xmm5, SPXMM5OFF(%rsp)
	movdqa	%xmm6, SPXMM6OFF(%rsp)
	movdqa	%xmm7, SPXMM7OFF(%rsp)
	jmp	.trace_save_finish

.trace_save_ymm:
	vmovdqa	%ymm0, SPXMM0OFF(%rsp)
	vmovdqa	%ymm1, SPXMM1OFF(%rsp)
	vmovdqa	%ymm2, SPXMM2OFF(%rsp)
	vmovdqa	%ymm3, SPXMM3OFF(%rsp)
	vmovdqa	%ymm4, SPXMM4OFF(%rsp)
	vmovdqa	%ymm5, SPXMM5OFF(%rsp)
	vmovdqa	%ymm6, SPXMM6OFF(%rsp)
	vmovdqa	%ymm7, SPXMM7OFF(%rsp)

.trace_save_finish:

	movq	SPDYNOFF(%rbp), %rax			/ %rax = dyndata
	testb	$LA_SYMB_NOPLTENTER, SBFLAGS_OFF(%rax)	/ <link.h>
	je	.start_pltenter
	/*
	 * pltenter auditing is disabled for this symbol: the destination
	 * is simply the value of the symbol definition.
	 */
	movq	SYMDEF_VALUE_OFF(%rax), %rdi
	movq	%rdi, SPDESTOFF(%rbp)		/ save destination address
	jmp	.end_pltenter

.start_pltenter:
	/*
	 * save all registers into La_amd64_regs
	 */
	leaq	SPLAREGOFF(%rbp), %rsi	/ %rsi = &La_amd64_regs
	leaq	8(%rbp), %rdi
	movq	%rdi, 0(%rsi)		/ la_rsp
	movq	0(%rbp), %rdi
	movq	%rdi, 8(%rsi)		/ la_rbp
	movq	SPRDIOFF(%rsp), %rdi
	movq	%rdi, 16(%rsi)		/ la_rdi
	movq	SPRSIOFF(%rsp), %rdi
	movq	%rdi, 24(%rsi)		/ la_rsi
	movq	SPRDXOFF(%rsp), %rdi
	movq	%rdi, 32(%rsi)		/ la_rdx
	movq	SPRCXOFF(%rsp), %rdi
	movq	%rdi, 40(%rsi)		/ la_rcx
	movq	SPR8OFF(%rsp), %rdi
	movq	%rdi, 48(%rsi)		/ la_r8
	movq	SPR9OFF(%rsp), %rdi
	movq	%rdi, 56(%rsi)		/ la_r9

	/*
	 * prepare for call to la_pltenter
	 */
	movq	SPDYNOFF(%rbp), %r11		/ %r11 = &dyndata
	leaq	SBFLAGS_OFF(%r11), %r9		/ arg6 (&sb_flags)
	leaq	SPLAREGOFF(%rbp), %r8		/ arg5 (&La_amd64_regs)
	movl	SYMNDX_OFF(%r11), %ecx		/ arg4 (symndx)
	leaq	SYMDEF_OFF(%r11), %rdx		/ arg3 (&Sym)
	movq	DEFLMP_OFF(%r11), %rsi		/ arg2 (dlmp)
	movq	REFLMP_OFF(%r11), %rdi		/ arg1 (rlmp)
	call	audit_pltenter@PLT
	movq	%rax, SPDESTOFF(%rbp)		/ save calling address
.end_pltenter:

	/*
	 * If *no* la_pltexit() routines exist
	 * we do not need to keep the stack frame
	 * before we call the actual routine.  Instead we
	 * jump to it and remove our stack from the stack
	 * at the same time.
	 */
	movl	audit_flags(%rip), %eax
	andl	$AF_PLTEXIT, %eax		/ value of audit.h:AF_PLTEXIT
	cmpl	$0, %eax
	je	.bypass_pltexit
	/*
	 * Has the *nopltexit* flag been set for this entry point
	 */
	movq	SPDYNOFF(%rbp), %r11		/ %r11 = &dyndata
	testb	$LA_SYMB_NOPLTEXIT, SBFLAGS_OFF(%r11)
	je	.start_pltexit

.bypass_pltexit:
	/*
	 * No PLTEXIT processing required.
	 *
	 * Shuffle the two words at the top of our frame so that a
	 * popq/ret pair restores the previous %rbp and then transfers
	 * to the destination.
	 */
	movq	0(%rbp), %r11
	movq	%r11, -8(%rbp)			/ move prev %rbp
	movq	SPDESTOFF(%rbp), %r11		/ r11 == calling destination
	movq	%r11, 0(%rbp)			/ store destination at top

	/
	/ Restore registers
	/
	/* %r9 and %edx are scratch here; both are reloaded below. */
	movq	org_scapset@GOTPCREL(%rip),%r9
	movq	(%r9),%r9
	movl	(%r9),%edx
	testl	$AV_386_AVX,%edx
	jne	.trace_restore_ymm

.trace_restore_xmm:
	movdqa	SPXMM0OFF(%rsp), %xmm0
	movdqa	SPXMM1OFF(%rsp), %xmm1
	movdqa	SPXMM2OFF(%rsp), %xmm2
	movdqa	SPXMM3OFF(%rsp), %xmm3
	movdqa	SPXMM4OFF(%rsp), %xmm4
	movdqa	SPXMM5OFF(%rsp), %xmm5
	movdqa	SPXMM6OFF(%rsp), %xmm6
	movdqa	SPXMM7OFF(%rsp), %xmm7
	jmp	.trace_restore_finish

.trace_restore_ymm:
	vmovdqa	SPXMM0OFF(%rsp), %ymm0
	vmovdqa	SPXMM1OFF(%rsp), %ymm1
	vmovdqa	SPXMM2OFF(%rsp), %ymm2
	vmovdqa	SPXMM3OFF(%rsp), %ymm3
	vmovdqa	SPXMM4OFF(%rsp), %ymm4
	vmovdqa	SPXMM5OFF(%rsp), %ymm5
	vmovdqa	SPXMM6OFF(%rsp), %ymm6
	vmovdqa	SPXMM7OFF(%rsp), %ymm7

.trace_restore_finish:
	movq	SPRDIOFF(%rsp), %rdi
	movq	SPRSIOFF(%rsp), %rsi
	movq	SPRDXOFF(%rsp), %rdx
	movq	SPRCXOFF(%rsp), %rcx
	movq	SPR8OFF(%rsp), %r8
	movq	SPR9OFF(%rsp), %r9
	movq	SPR10OFF(%rsp), %r10
	movq	SPR11OFF(%rsp), %r11
	movq	SPRAXOFF(%rsp), %rax

	subq	$8, %rbp			/ adjust %rbp for 'ret'
	movq	%rbp, %rsp			/
	/*
	 * At this point, after a little doctoring, we should
	 * have the following on the stack:
	 *
	 *	16(%rsp):  ret addr
	 *	8(%rsp):  dest_addr
	 *	0(%rsp):  Previous %rbp
	 *
	 * So - we pop the previous %rbp, and then
	 * ret to our final destination.
	 */
	popq	%rbp				/
	ret					/ jmp to final destination
						/ and clean up stack :)

.start_pltexit:
	/*
	 * In order to call the destination procedure and then return
	 * to audit_pltexit() for post analysis we must first grow
	 * our stack frame and then duplicate the original callers
	 * stack state.  This duplicates all of the arguments
	 * that were to be passed to the destination procedure.
	 */
	movq	%rbp, %rdi			/
	addq	$16, %rdi			/    %rdi = src
	movq	(%rbp), %rdx			/
	subq	%rdi, %rdx			/    %rdx == prev frame sz
	/*
	 * If audit_argcnt > 0 then we limit the number of
	 * arguments that will be duplicated to audit_argcnt.
	 *
	 * If (prev_stack_size > (audit_argcnt * 8))
	 *	prev_stack_size = audit_argcnt * 8;
	 */
	movl	audit_argcnt(%rip),%eax		/   %eax = audit_argcnt
	cmpl	$0, %eax
	jle	.grow_stack
	leaq	(,%rax,8), %rax			/    %rax = %rax * 8
	cmpq	%rax,%rdx
	jle	.grow_stack
	movq	%rax, %rdx
	/*
	 * Grow the stack and duplicate the arguments of the
	 * original caller.
	 *
	 * We save %rsp in %r11 since we need to use the current rsp for
	 * accessing the registers saved in our stack frame.
	 */
.grow_stack:
	movq	%rsp, %r11
	subq	%rdx, %rsp			/    grow the stack
	movq	%rdx, SPPRVSTKOFF(%rbp)		/    -88(%rbp) == prev frame sz
	movq	%rsp, %rcx			/    %rcx = dest
	addq	%rcx, %rdx			/    %rdx == tail of dest
.while_base:
	cmpq	%rdx, %rcx			/   while (dest < tail) {
	jge	.end_while			/
	movq	(%rdi), %rsi
	movq	%rsi,(%rcx)			/        *dest = *src
	addq	$8, %rdi			/	 src++
	addq	$8, %rcx			/        dest++
	jmp	.while_base			/    }

	/*
	 * The above stack is now an exact duplicate of
	 * the stack of the original calling procedure.
	 */
.end_while:
	/
	/ Restore registers using %r11 which contains our old %rsp value
	/ before growing the stack.
	/

	/ Yes, we have to do this dance again. Sorry.
	movq	org_scapset@GOTPCREL(%rip),%r9
	movq	(%r9),%r9
	movl	(%r9),%edx
	testl	$AV_386_AVX,%edx
	jne	.trace_r2_ymm

.trace_r2_xmm:
	movdqa	SPXMM0OFF(%r11), %xmm0
	movdqa	SPXMM1OFF(%r11), %xmm1
	movdqa	SPXMM2OFF(%r11), %xmm2
	movdqa	SPXMM3OFF(%r11), %xmm3
	movdqa	SPXMM4OFF(%r11), %xmm4
	movdqa	SPXMM5OFF(%r11), %xmm5
	movdqa	SPXMM6OFF(%r11), %xmm6
	movdqa	SPXMM7OFF(%r11), %xmm7
	jmp	.trace_r2_finish

.trace_r2_ymm:
	vmovdqa	SPXMM0OFF(%r11), %ymm0
	vmovdqa	SPXMM1OFF(%r11), %ymm1
	vmovdqa	SPXMM2OFF(%r11), %ymm2
	vmovdqa	SPXMM3OFF(%r11), %ymm3
	vmovdqa	SPXMM4OFF(%r11), %ymm4
	vmovdqa	SPXMM5OFF(%r11), %ymm5
	vmovdqa	SPXMM6OFF(%r11), %ymm6
	vmovdqa	SPXMM7OFF(%r11), %ymm7

.trace_r2_finish:
	movq	SPRDIOFF(%r11), %rdi
	movq	SPRSIOFF(%r11), %rsi
	movq	SPRDXOFF(%r11), %rdx
	movq	SPRCXOFF(%r11), %rcx
	movq	SPR8OFF(%r11), %r8
	movq	SPR9OFF(%r11), %r9
	movq	SPR10OFF(%r11), %r10
	movq	SPRAXOFF(%r11), %rax
	movq	SPR11OFF(%r11), %r11		/ restore %r11 last

	/*
	 * Call to destination function - we will return here
	 * for pltexit monitoring.
	 */
	call	*SPDESTOFF(%rbp)

	addq	SPPRVSTKOFF(%rbp), %rsp	/ cleanup dupped stack

	/
	/ prepare for call to audit_pltexit()
	/
	/*
	 * symndx is a 32-bit Word that is immediately followed by
	 * sb_flags, so it is loaded with a 32-bit move (as the pltenter
	 * path does) to keep sb_flags out of the upper half of %r8.
	 */
	movq	SPDYNOFF(%rbp), %r11		/ %r11 = &dyndata
	movl	SYMNDX_OFF(%r11), %r8d		/ arg5 (symndx)
	leaq	SYMDEF_OFF(%r11), %rcx		/ arg4 (&Sym)
	movq	DEFLMP_OFF(%r11), %rdx		/ arg3 (dlmp)
	movq	REFLMP_OFF(%r11), %rsi		/ arg2 (rlmp)
	movq	%rax, %rdi			/ arg1 (returnval)
	call	audit_pltexit@PLT

	/*
	 * Clean up after ourselves and return to the
	 * original calling procedure.
	 *
	 * NOTE(review): the media registers are reloaded here with the
	 * 128-bit movdqa only, even when the entry path saved %ymm, and
	 * %rdx, %xmm0 and %xmm1 are overwritten with their entry-time
	 * values rather than whatever the destination left in them.
	 * Confirm whether destinations that return values in those
	 * registers are expected to work under pltexit auditing.
	 */

	/
	/ Restore registers
	/
	movq	SPRDIOFF(%rsp), %rdi
	movq	SPRSIOFF(%rsp), %rsi
	movq	SPRDXOFF(%rsp), %rdx
	movq	SPRCXOFF(%rsp), %rcx
	movq	SPR8OFF(%rsp), %r8
	movq	SPR9OFF(%rsp), %r9
	movq	SPR10OFF(%rsp), %r10
	movq	SPR11OFF(%rsp), %r11
	// rax already contains return value
	movdqa	SPXMM0OFF(%rsp), %xmm0
	movdqa	SPXMM1OFF(%rsp), %xmm1
	movdqa	SPXMM2OFF(%rsp), %xmm2
	movdqa	SPXMM3OFF(%rsp), %xmm3
	movdqa	SPXMM4OFF(%rsp), %xmm4
	movdqa	SPXMM5OFF(%rsp), %xmm5
	movdqa	SPXMM6OFF(%rsp), %xmm6
	movdqa	SPXMM7OFF(%rsp), %xmm7

	movq	%rbp, %rsp			/
	popq	%rbp				/
	ret					/ return to caller
	.size	elf_plt_trace, .-elf_plt_trace
#endif

/*
 * We got here because a call to a function resolved to a procedure
 * linkage table entry.  That entry did a jmp to the first PLT entry, which
 * in turn pushed GOT[1] and did an indirect jmp to elf_rtbndr.
 *
 * the code sequence that got us here was:
 *
 * .PLT0:
 *	pushq	GOT+8(%rip)	#GOT[1]
 *	jmp	*GOT+16(%rip)	#GOT[2]
 *	nop
 *	nop
 *	nop
 *	nop
 *	...
 * PLT entry for foo:
 *	jmp	*name1@GOTPCREL(%rip)
 *	pushq	$rel.plt.foo
 *	jmp	PLT0
 *
 * At entry, the stack looks like this:
 *
 *	return address			16(%rsp)
 *	$rel.plt.foo	(plt index)	8(%rsp)
 *	lmp				0(%rsp)
 *
 */
#if defined(lint)

/*
 * Lint-only C rendering of elf_rtbndr().  It forwards its three arguments
 * to elf_bndr() and discards the result; the real implementation is the
 * assembly in the non-lint branch of this #if.
 */
extern unsigned long	elf_bndr(Rt_map *, unsigned long, caddr_t);

void
elf_rtbndr(Rt_map * lmp, unsigned long reloc, caddr_t pc)
{
	(void) elf_bndr(lmp, reloc, pc);
}

#else

/*
 * The PLT code that landed us here placed 2 arguments on the stack as
 * arguments to elf_rtbndr.
 * Additionally the pc of caller is below these 2 args.
 * Our stack will look like this after we establish a stack frame with
 * push %rbp; movq %rsp, %rbp sequence:
 *
 *	8(%rbp)			arg1 - *lmp
 *	16(%rbp), %rsi		arg2 - reloc index
 *	24(%rbp), %rdx		arg3 - pc of caller
 */
#define	LBPLMPOFF	8	/* arg1 - *lmp */
#define	LBPRELOCOFF	16	/* arg2 - reloc index */
#define	LBRPCOFF	24	/* arg3 - pc of caller */

/*
 * Possible arguments for the resolved function are in registers as per
 * the AMD64 ABI.  We must save on the local stack all possible register
 * arguments before interposing functions to resolve the called function. 
 * Possible arguments must be restored before invoking the resolved function.
 * 
 * Before the AVX instruction set enhancements to AMD64 there were no changes in
 * the set of registers and their sizes across different processors. With AVX,
 * the xmm registers became the lower 128 bits of the ymm registers. Because of
 * this, we need to conditionally save 256 bits instead of 128 bits. Regardless
 * of whether we have ymm registers or not, we're always going to push the stack
 * space assuming that we do to simplify the code.
 * 
 * Local stack space storage for elf_rtbndr is allocated as follows:
 *
 *	Saved regs:
 *	    %rax			 8
 *	    %rdi			 8
 *	    %rsi			 8
 *	    %rdx			 8
 *	    %rcx			 8
 *	    %r8				 8
 *	    %r9				 8
 *	    %r10			 8
 *				    =======
 *			    Subtotal:   64 (32byte aligned)
 *
 *	Saved Media Regs (used to pass floating point args):
 *	    %ymm0 - %ymm7   32 * 8     256
 *				    =======
 *			    Total:     320 (32byte aligned)
 *  
 *  So - will subtract the following to create enough space
 *
 *	0(%rsp)		save %rax
 *	8(%rsp)		save %rdi
 *	16(%rsp)	save %rsi
 *	24(%rsp)	save %rdx
 *	32(%rsp)	save %rcx
 *	40(%rsp)	save %r8
 *	48(%rsp)	save %r9
 *	56(%rsp)	save %r10
 *	64(%rsp)	save %ymm0
 *	96(%rsp)	save %ymm1
 *	128(%rsp)	save %ymm2
 *	160(%rsp)	save %ymm3
 *	192(%rsp)	save %ymm4
 *	224(%rsp)	save %ymm5
 *	256(%rsp)	save %ymm6
 *	288(%rsp)	save %ymm7
 *
 * Note: Some callers may use 8-byte stack alignment instead of the
 * ABI required 16-byte alignment.  We use %rsp offsets to save/restore
 * registers because %rbp may not be 16-byte aligned.  We guarantee %rsp
 * is 32-byte aligned in the function preamble, which is what the aligned
 * 32-byte %ymm saves require.
 */
/*
 * As the registers may either be xmm or ymm, we've left the name as xmm, but
 * increased the offset between them to always cover the xmm and ymm cases.
 * All offsets below are relative to %rsp after LS_SIZE has been subtracted.
 * LS_SIZE and every LSXMMnOFF are multiples of 32.
 */
#define	LS_SIZE	$320	/* local stack space to save all possible arguments */
#define	LSRAXOFF	0	/* for SSE register count */
#define	LSRDIOFF	8	/* arg 0 ... */
#define	LSRSIOFF	16
#define	LSRDXOFF	24
#define	LSRCXOFF	32
#define	LSR8OFF		40
#define	LSR9OFF		48	/* ... arg 5 */
#define	LSR10OFF	56	/* call chain reg */
#define	LSXMM0OFF	64	/* SSE arg 0 ... */
#define	LSXMM1OFF	96
#define	LSXMM2OFF	128
#define	LSXMM3OFF	160
#define	LSXMM4OFF	192
#define	LSXMM5OFF	224
#define	LSXMM6OFF	256
#define	LSXMM7OFF	288	/* ... SSE arg 7 */

	/*
	 * The org_scapset is a global variable that is a part of rtld. It
	 * contains the capabilities that the kernel has told us are supported
	 * (auxv_hwcap). This is necessary for determining whether or not we
	 * need to save and restore AVX registers or simple SSE registers. Note,
	 * that the field we care about is currently at offset 0, if that
	 * changes, this code will have to be updated.
	 */
	.extern org_scapset
	.weak	_elf_rtbndr
	_elf_rtbndr = elf_rtbndr

	ENTRY(elf_rtbndr)

	pushq	%rbp
	movq	%rsp, %rbp

	/*
	 * Some libraries may (incorrectly) use non-ABI compliant 8-byte stack
	 * alignment.  Enforce 32-byte stack alignment here so that the
	 * aligned %ymm saves below are legal.
	 * The next andq instruction rounds %rsp down to the nearest multiple
	 * of 32.  LS_SIZE is a multiple of 32, so the alignment is kept after
	 * the subq.
	 */
	andq	$-32, %rsp	/* enforce ABI 32-byte stack alignment */

	subq	LS_SIZE, %rsp	/* save all ABI defined argument registers */

	movq	%rax, LSRAXOFF(%rsp)	/* for SSE register count */
	movq	%rdi, LSRDIOFF(%rsp)	/*  arg 0 .. */
	movq	%rsi, LSRSIOFF(%rsp)
	movq	%rdx, LSRDXOFF(%rsp)
	movq	%rcx, LSRCXOFF(%rsp)
	movq	%r8, LSR8OFF(%rsp)
	movq	%r9, LSR9OFF(%rsp)	/* .. arg 5 */
	movq	%r10, LSR10OFF(%rsp)	/* call chain reg */

	/*
	 * Our xmm registers could secretly be ymm registers in disguise.
	 * %r9 and %edx are used as scratch for the capability check; their
	 * entry values have already been saved above.
	 */
	movq	org_scapset@GOTPCREL(%rip),%r9
	movq	(%r9),%r9
	movl	(%r9),%edx
	testl	$AV_386_AVX,%edx
	jne	.save_ymm

.save_xmm:
	movdqa	%xmm0, LSXMM0OFF(%rsp)	/* SSE arg 0 ... */
	movdqa	%xmm1, LSXMM1OFF(%rsp)
	movdqa	%xmm2, LSXMM2OFF(%rsp)
	movdqa	%xmm3, LSXMM3OFF(%rsp)
	movdqa	%xmm4, LSXMM4OFF(%rsp)
	movdqa	%xmm5, LSXMM5OFF(%rsp)
	movdqa	%xmm6, LSXMM6OFF(%rsp)
	movdqa	%xmm7, LSXMM7OFF(%rsp)	/* ... SSE arg 7 */
	jmp	.save_finish	

.save_ymm:
	vmovdqa	%ymm0, LSXMM0OFF(%rsp)	/* SSE arg 0 ... */
	vmovdqa	%ymm1, LSXMM1OFF(%rsp)
	vmovdqa	%ymm2, LSXMM2OFF(%rsp)
	vmovdqa	%ymm3, LSXMM3OFF(%rsp)
	vmovdqa	%ymm4, LSXMM4OFF(%rsp)
	vmovdqa	%ymm5, LSXMM5OFF(%rsp)
	vmovdqa	%ymm6, LSXMM6OFF(%rsp)
	vmovdqa	%ymm7, LSXMM7OFF(%rsp)	/* ... SSE arg 7 */

.save_finish:
	/*
	 * The three arguments for elf_bndr() are the two words the PLT
	 * pushed plus the return address of the original caller, all
	 * addressed off %rbp.
	 */
	movq	LBPLMPOFF(%rbp), %rdi	/* arg1 - *lmp */
	movq	LBPRELOCOFF(%rbp), %rsi	/* arg2 - reloc index */
	movq	LBRPCOFF(%rbp), %rdx	/* arg3 - pc of caller */
	call	elf_bndr@PLT		/* call elf_bndr(lmp, relndx, pc) */
	/*
	 * Overwrite the reloc index slot with the resolved address; the
	 * final ret below pops it as its return address.
	 */
	movq	%rax, LBPRELOCOFF(%rbp)	/* store final destination */

	/*
	 * Restore possible arguments before invoking resolved function. We
	 * check the xmm vs. ymm regs first so we can use the others.
	 * (%r9 and %edx are scratch here and are reloaded at .restore_finish.)
	 */
	movq	org_scapset@GOTPCREL(%rip),%r9
	movq	(%r9),%r9
	movl	(%r9),%edx
	testl	$AV_386_AVX,%edx
	jne	.restore_ymm

.restore_xmm:
	movdqa	LSXMM0OFF(%rsp), %xmm0
	movdqa	LSXMM1OFF(%rsp), %xmm1
	movdqa	LSXMM2OFF(%rsp), %xmm2
	movdqa	LSXMM3OFF(%rsp), %xmm3
	movdqa	LSXMM4OFF(%rsp), %xmm4
	movdqa	LSXMM5OFF(%rsp), %xmm5
	movdqa	LSXMM6OFF(%rsp), %xmm6
	movdqa	LSXMM7OFF(%rsp), %xmm7
	jmp .restore_finish

.restore_ymm:
	vmovdqa	LSXMM0OFF(%rsp), %ymm0
	vmovdqa	LSXMM1OFF(%rsp), %ymm1
	vmovdqa	LSXMM2OFF(%rsp), %ymm2
	vmovdqa	LSXMM3OFF(%rsp), %ymm3
	vmovdqa	LSXMM4OFF(%rsp), %ymm4
	vmovdqa	LSXMM5OFF(%rsp), %ymm5
	vmovdqa	LSXMM6OFF(%rsp), %ymm6
	vmovdqa	LSXMM7OFF(%rsp), %ymm7

.restore_finish:
	movq	LSRAXOFF(%rsp), %rax
	movq	LSRDIOFF(%rsp), %rdi
	movq	LSRSIOFF(%rsp), %rsi
	movq	LSRDXOFF(%rsp), %rdx
	movq	LSRCXOFF(%rsp), %rcx
	movq	LSR8OFF(%rsp), %r8
	movq	LSR9OFF(%rsp), %r9
	movq	LSR10OFF(%rsp), %r10

	/* Drop the local save area and undo the frame set up at entry. */
	movq	%rbp, %rsp
	popq	%rbp

	addq	$8, %rsp	/* pop 1st plt-pushed args */
				/* the second argument is used */
				/* for the 'return' address to our */
				/* final destination */

	ret			/* invoke resolved function */
	.size 	elf_rtbndr, .-elf_rtbndr
#endif