comparison usr/src/cmd/sgs/rtld/amd64/boot_elf.s @ 13675:a9ae30c28ee4

2413 %ymm* need to be preserved on way through PLT
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Joshua M. Clulow <josh@sysmgr.org>
Reviewed by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Approved by: Albert Lee <trisk@nexenta.com>
author Robert Mustacchi <rm@joyent.com>
date Wed, 25 Apr 2012 00:27:21 -0400
parents ae3aa141e3fa
children
comparison: 13674:181ba6c41bee -> 13675:a9ae30c28ee4
20 */ 20 */
21 21
22 /* 22 /*
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms. 24 * Use is subject to license terms.
25 * Copyright (c) 2012 Joyent, Inc. All rights reserved.
25 */ 26 */
26
27 #pragma ident "%Z%%M% %I% %E% SMI"
28 27
29 #if defined(lint) 28 #if defined(lint)
30 29
31 #include <sys/types.h> 30 #include <sys/types.h>
32 #include <_rtld.h> 31 #include <_rtld.h>
33 #include <_audit.h> 32 #include <_audit.h>
34 #include <_elf.h> 33 #include <_elf.h>
35 #include <sys/regset.h> 34 #include <sys/regset.h>
35 #include <sys/auxv_386.h>
36 36
37 /* ARGSUSED0 */ 37 /* ARGSUSED0 */
38 int 38 int
39 elf_plt_trace() 39 elf_plt_trace()
40 { 40 {
43 #else 43 #else
44 44
45 #include <link.h> 45 #include <link.h>
46 #include <_audit.h> 46 #include <_audit.h>
47 #include <sys/asm_linkage.h> 47 #include <sys/asm_linkage.h>
48 #include <sys/auxv_386.h>
48 49
49 .file "boot_elf.s" 50 .file "boot_elf.s"
50 .text 51 .text
51 52
52 /* 53 /*
104 * %r9 8 105 * %r9 8
105 * %r10 8 106 * %r10 8
106 * %r11 8 107 * %r11 8
107 * %rax 8 108 * %rax 8
108 * ======= 109 * =======
109 * Subtotal: 144 (16byte aligned) 110 * Subtotal: 144 (32byte aligned)
110 * 111 *
111 * Saved Media Regs (used to pass floating point args): 112 * Saved Media Regs (used to pass floating point args):
112 * %xmm0 - %xmm7 16 * 8: 128 113 * %xmm0 - %xmm7 32 * 8: 256
113 * ======= 114 * =======
114 * Total: 272 (16byte aligned) 115 * Total: 400 (32byte aligned)
115 * 116 *
116 * So - will subtract the following to create enough space 117 * So - will subtract the following to create enough space
117 * 118 *
118 * -8(%rbp) store dyndata ptr 119 * -8(%rbp) store dyndata ptr
119 * -16(%rbp) store call destination 120 * -16(%rbp) store call destination
129 * -128(%rbp) entering %r8 130 * -128(%rbp) entering %r8
130 * -136(%rbp) entering %r9 131 * -136(%rbp) entering %r9
131 * -144(%rbp) entering %r10 132 * -144(%rbp) entering %r10
132 * -152(%rbp) entering %r11 133 * -152(%rbp) entering %r11
133 * -160(%rbp) entering %rax 134 * -160(%rbp) entering %rax
134 * -176(%rbp) entering %xmm0 135 * -192(%rbp) entering %xmm0
135 * -192(%rbp) entering %xmm1 136 * -224(%rbp) entering %xmm1
136 * -208(%rbp) entering %xmm2 137 * -256(%rbp) entering %xmm2
137 * -224(%rbp) entering %xmm3 138 * -288(%rbp) entering %xmm3
138 * -240(%rbp) entering %xmm4 139 * -320(%rbp) entering %xmm4
139 * -256(%rbp) entering %xmm5 140 * -352(%rbp) entering %xmm5
140 * -272(%rbp) entering %xmm6 141 * -384(%rbp) entering %xmm6
141 * -288(%rbp) entering %xmm7 142 * -416(%rbp) entering %xmm7
142 * 143 *
143 */ 144 */
144 #define SPDYNOFF -8 145 #define SPDYNOFF -8
145 #define SPDESTOFF -16 146 #define SPDESTOFF -16
146 #define SPLAREGOFF -80 147 #define SPLAREGOFF -80
147 #define SPPRVSTKOFF -88 148 #define SPPRVSTKOFF -88
148 149
149 /* 150 /*
150 * The next set of offsets are relative to %rsp. 151 * The next set of offsets are relative to %rsp.
151 * We guarantee %rsp is ABI compliant 16-byte aligned. This guarantees the 152 * We guarantee %rsp is ABI compliant 32-byte aligned. This guarantees the
152 * xmm registers are saved to 16-byte aligned addresses. 153 * ymm registers are saved to 32-byte aligned addresses.
153 * %rbp may only be 8 byte aligned if we came in from non-ABI compliant code. 154 * %rbp may only be 8 byte aligned if we came in from non-ABI compliant code.
154 */ 155 */
155 #define SPRDIOFF 192 156 #define SPRDIOFF 320
156 #define SPRSIOFF 184 157 #define SPRSIOFF 312
157 #define SPRDXOFF 176 158 #define SPRDXOFF 304
158 #define SPRCXOFF 168 159 #define SPRCXOFF 296
159 #define SPR8OFF 160 160 #define SPR8OFF 288
160 #define SPR9OFF 152 161 #define SPR9OFF 280
161 #define SPR10OFF 144 162 #define SPR10OFF 272
162 #define SPR11OFF 136 163 #define SPR11OFF 264
163 #define SPRAXOFF 128 164 #define SPRAXOFF 256
164 #define SPXMM0OFF 112 165 #define SPXMM0OFF 224
165 #define SPXMM1OFF 96 166 #define SPXMM1OFF 192
166 #define SPXMM2OFF 80 167 #define SPXMM2OFF 160
167 #define SPXMM3OFF 64 168 #define SPXMM3OFF 128
168 #define SPXMM4OFF 48 169 #define SPXMM4OFF 96
169 #define SPXMM5OFF 32 170 #define SPXMM5OFF 64
170 #define SPXMM6OFF 16 171 #define SPXMM6OFF 32
171 #define SPXMM7OFF 0 172 #define SPXMM7OFF 0
172 173
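The SPXMM*OFF and SPR*OFF defines above describe the %rsp-relative save area as eight 32-byte vector slots (each slot can hold either an xmm or a ymm value) followed by nine 8-byte general-purpose slots. As a quick sanity check of that arithmetic, here is a minimal C sketch; the struct and the pts_* names are hypothetical and are not part of rtld:

#include <stddef.h>

/* Hypothetical mirror of the %rsp-relative save area used by elf_plt_trace. */
typedef struct plt_trace_save {
        unsigned char   pts_vec[8][32]; /* %xmm7/%ymm7 slot at 0 ... %xmm0/%ymm0 at 224 */
        unsigned long   pts_rax;        /* SPRAXOFF 256 */
        unsigned long   pts_r11;        /* SPR11OFF 264 */
        unsigned long   pts_r10;        /* SPR10OFF 272 */
        unsigned long   pts_r9;         /* SPR9OFF  280 */
        unsigned long   pts_r8;         /* SPR8OFF  288 */
        unsigned long   pts_rcx;        /* SPRCXOFF 296 */
        unsigned long   pts_rdx;        /* SPRDXOFF 304 */
        unsigned long   pts_rsi;        /* SPRSIOFF 312 */
        unsigned long   pts_rdi;        /* SPRDIOFF 320 */
} plt_trace_save_t;

_Static_assert(offsetof(plt_trace_save_t, pts_vec[7]) == 224, "SPXMM0OFF");
_Static_assert(offsetof(plt_trace_save_t, pts_rax) == 256, "SPRAXOFF");
_Static_assert(offsetof(plt_trace_save_t, pts_rdi) == 320, "SPRDIOFF");

int
main(void)
{
        return (0);
}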
174 /* See elf_rtbndr for an explanation of org_scapset */
175 .extern org_scapset
173 .globl elf_plt_trace 176 .globl elf_plt_trace
174 .type elf_plt_trace,@function 177 .type elf_plt_trace,@function
175 .align 16 178 .align 16
176 elf_plt_trace: 179 elf_plt_trace:
177 /* 180 /*
178 * Enforce ABI 16-byte stack alignment here. 181 * Enforce ABI 32-byte stack alignment here.
179 * The next andq instruction does this pseudo code: 182 * The next andq instruction does this pseudo code:
180 * If %rsp is 8 byte aligned then subtract 8 from %rsp. 183 * If %rsp is 8 byte aligned then subtract 8 from %rsp.
181 */ 184 */
182 andq $-16, %rsp /* enforce ABI 16-byte stack alignment */ 185 andq $-32, %rsp /* enforce ABI 32-byte stack alignment */
183 subq $272,%rsp / create some local storage 186 subq $400,%rsp / create some local storage
184 187
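The vmovdqa stores in the ymm path below fault unless their memory operand is 32-byte aligned, which is why the mask in the andq above changed from $-16 to $-32. A small standalone C illustration (not rtld code; the sample address is arbitrary) of what that mask does to a stack-pointer value:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uintptr_t rsp = 0x7fffbfffe998UL;       /* arbitrary 8-byte aligned example */

        rsp &= (uintptr_t)-32;                  /* same effect as: andq $-32, %rsp */

        (void) printf("0x%lx is %s\n", (unsigned long)rsp,
            (rsp % 32) == 0 ? "32-byte aligned" : "misaligned");
        return (0);
}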
185 movq %rdi, SPRDIOFF(%rsp) 188 movq %rdi, SPRDIOFF(%rsp)
186 movq %rsi, SPRSIOFF(%rsp) 189 movq %rsi, SPRSIOFF(%rsp)
187 movq %rdx, SPRDXOFF(%rsp) 190 movq %rdx, SPRDXOFF(%rsp)
188 movq %rcx, SPRCXOFF(%rsp) 191 movq %rcx, SPRCXOFF(%rsp)
189 movq %r8, SPR8OFF(%rsp) 192 movq %r8, SPR8OFF(%rsp)
190 movq %r9, SPR9OFF(%rsp) 193 movq %r9, SPR9OFF(%rsp)
191 movq %r10, SPR10OFF(%rsp) 194 movq %r10, SPR10OFF(%rsp)
192 movq %r11, SPR11OFF(%rsp) 195 movq %r11, SPR11OFF(%rsp)
193 movq %rax, SPRAXOFF(%rsp) 196 movq %rax, SPRAXOFF(%rsp)
197
198 movq org_scapset@GOTPCREL(%rip),%r9
199 movq (%r9),%r9
200 movl (%r9),%edx
201 testl $AV_386_AVX,%edx
202 jne .trace_save_ymm
203
204 .trace_save_xmm:
194 movdqa %xmm0, SPXMM0OFF(%rsp) 205 movdqa %xmm0, SPXMM0OFF(%rsp)
195 movdqa %xmm1, SPXMM1OFF(%rsp) 206 movdqa %xmm1, SPXMM1OFF(%rsp)
196 movdqa %xmm2, SPXMM2OFF(%rsp) 207 movdqa %xmm2, SPXMM2OFF(%rsp)
197 movdqa %xmm3, SPXMM3OFF(%rsp) 208 movdqa %xmm3, SPXMM3OFF(%rsp)
198 movdqa %xmm4, SPXMM4OFF(%rsp) 209 movdqa %xmm4, SPXMM4OFF(%rsp)
199 movdqa %xmm5, SPXMM5OFF(%rsp) 210 movdqa %xmm5, SPXMM5OFF(%rsp)
200 movdqa %xmm6, SPXMM6OFF(%rsp) 211 movdqa %xmm6, SPXMM6OFF(%rsp)
201 movdqa %xmm7, SPXMM7OFF(%rsp) 212 movdqa %xmm7, SPXMM7OFF(%rsp)
213 jmp .trace_save_finish
214
215 .trace_save_ymm:
216 vmovdqa %ymm0, SPXMM0OFF(%rsp)
217 vmovdqa %ymm1, SPXMM1OFF(%rsp)
218 vmovdqa %ymm2, SPXMM2OFF(%rsp)
219 vmovdqa %ymm3, SPXMM3OFF(%rsp)
220 vmovdqa %ymm4, SPXMM4OFF(%rsp)
221 vmovdqa %ymm5, SPXMM5OFF(%rsp)
222 vmovdqa %ymm6, SPXMM6OFF(%rsp)
223 vmovdqa %ymm7, SPXMM7OFF(%rsp)
224
225 .trace_save_finish:
202 226
203 movq SPDYNOFF(%rbp), %rax / %rax = dyndata 227 movq SPDYNOFF(%rbp), %rax / %rax = dyndata
204 testb $LA_SYMB_NOPLTENTER, SBFLAGS_OFF(%rax) / <link.h> 228 testb $LA_SYMB_NOPLTENTER, SBFLAGS_OFF(%rax) / <link.h>
205 je .start_pltenter 229 je .start_pltenter
206 movq SYMDEF_VALUE_OFF(%rax), %rdi 230 movq SYMDEF_VALUE_OFF(%rax), %rdi
271 movq %r11, 0(%rbp) / store destination at top 295 movq %r11, 0(%rbp) / store destination at top
272 296
273 / 297 /
274 / Restore registers 298 / Restore registers
275 / 299 /
300 movq org_scapset@GOTPCREL(%rip),%r9
301 movq (%r9),%r9
302 movl (%r9),%edx
303 testl $AV_386_AVX,%edx
304 jne .trace_restore_ymm
305
306 .trace_restore_xmm:
307 movdqa SPXMM0OFF(%rsp), %xmm0
308 movdqa SPXMM1OFF(%rsp), %xmm1
309 movdqa SPXMM2OFF(%rsp), %xmm2
310 movdqa SPXMM3OFF(%rsp), %xmm3
311 movdqa SPXMM4OFF(%rsp), %xmm4
312 movdqa SPXMM5OFF(%rsp), %xmm5
313 movdqa SPXMM6OFF(%rsp), %xmm6
314 movdqa SPXMM7OFF(%rsp), %xmm7
315 jmp .trace_restore_finish
316
317 .trace_restore_ymm:
318 vmovdqa SPXMM0OFF(%rsp), %ymm0
319 vmovdqa SPXMM1OFF(%rsp), %ymm1
320 vmovdqa SPXMM2OFF(%rsp), %ymm2
321 vmovdqa SPXMM3OFF(%rsp), %ymm3
322 vmovdqa SPXMM4OFF(%rsp), %ymm4
323 vmovdqa SPXMM5OFF(%rsp), %ymm5
324 vmovdqa SPXMM6OFF(%rsp), %ymm6
325 vmovdqa SPXMM7OFF(%rsp), %ymm7
326
327 .trace_restore_finish:
276 movq SPRDIOFF(%rsp), %rdi 328 movq SPRDIOFF(%rsp), %rdi
277 movq SPRSIOFF(%rsp), %rsi 329 movq SPRSIOFF(%rsp), %rsi
278 movq SPRDXOFF(%rsp), %rdx 330 movq SPRDXOFF(%rsp), %rdx
279 movq SPRCXOFF(%rsp), %rcx 331 movq SPRCXOFF(%rsp), %rcx
280 movq SPR8OFF(%rsp), %r8 332 movq SPR8OFF(%rsp), %r8
281 movq SPR9OFF(%rsp), %r9 333 movq SPR9OFF(%rsp), %r9
282 movq SPR10OFF(%rsp), %r10 334 movq SPR10OFF(%rsp), %r10
283 movq SPR11OFF(%rsp), %r11 335 movq SPR11OFF(%rsp), %r11
284 movq SPRAXOFF(%rsp), %rax 336 movq SPRAXOFF(%rsp), %rax
285 movdqa SPXMM0OFF(%rsp), %xmm0
286 movdqa SPXMM1OFF(%rsp), %xmm1
287 movdqa SPXMM2OFF(%rsp), %xmm2
288 movdqa SPXMM3OFF(%rsp), %xmm3
289 movdqa SPXMM4OFF(%rsp), %xmm4
290 movdqa SPXMM5OFF(%rsp), %xmm5
291 movdqa SPXMM6OFF(%rsp), %xmm6
292 movdqa SPXMM7OFF(%rsp), %xmm7
293 337
294 subq $8, %rbp / adjust %rbp for 'ret' 338 subq $8, %rbp / adjust %rbp for 'ret'
295 movq %rbp, %rsp / 339 movq %rbp, %rsp /
296 /* 340 /*
297 * At this point, after a little doctoring, we should 341 * At this point, after a little doctoring, we should
363 .end_while: 407 .end_while:
364 / 408 /
365 / Restore registers using %r11 which contains our old %rsp value 409 / Restore registers using %r11 which contains our old %rsp value
366 / before growing the stack. 410 / before growing the stack.
367 / 411 /
412
413 / Yes, we have to do this dance again. Sorry.
414 movq org_scapset@GOTPCREL(%rip),%r9
415 movq (%r9),%r9
416 movl (%r9),%edx
417 testl $AV_386_AVX,%edx
418 jne .trace_r2_ymm
419
420 .trace_r2_xmm:
421 movdqa SPXMM0OFF(%r11), %xmm0
422 movdqa SPXMM1OFF(%r11), %xmm1
423 movdqa SPXMM2OFF(%r11), %xmm2
424 movdqa SPXMM3OFF(%r11), %xmm3
425 movdqa SPXMM4OFF(%r11), %xmm4
426 movdqa SPXMM5OFF(%r11), %xmm5
427 movdqa SPXMM6OFF(%r11), %xmm6
428 movdqa SPXMM7OFF(%r11), %xmm7
429 jmp .trace_r2_finish
430
431 .trace_r2_ymm:
432 vmovdqa SPXMM0OFF(%r11), %ymm0
433 vmovdqa SPXMM1OFF(%r11), %ymm1
434 vmovdqa SPXMM2OFF(%r11), %ymm2
435 vmovdqa SPXMM3OFF(%r11), %ymm3
436 vmovdqa SPXMM4OFF(%r11), %ymm4
437 vmovdqa SPXMM5OFF(%r11), %ymm5
438 vmovdqa SPXMM6OFF(%r11), %ymm6
439 vmovdqa SPXMM7OFF(%r11), %ymm7
440
441 .trace_r2_finish:
368 movq SPRDIOFF(%r11), %rdi 442 movq SPRDIOFF(%r11), %rdi
369 movq SPRSIOFF(%r11), %rsi 443 movq SPRSIOFF(%r11), %rsi
370 movq SPRDXOFF(%r11), %rdx 444 movq SPRDXOFF(%r11), %rdx
371 movq SPRCXOFF(%r11), %rcx 445 movq SPRCXOFF(%r11), %rcx
372 movq SPR8OFF(%r11), %r8 446 movq SPR8OFF(%r11), %r8
373 movq SPR9OFF(%r11), %r9 447 movq SPR9OFF(%r11), %r9
374 movq SPR10OFF(%r11), %r10 448 movq SPR10OFF(%r11), %r10
375 movq SPRAXOFF(%r11), %rax 449 movq SPRAXOFF(%r11), %rax
376 movdqa SPXMM0OFF(%r11), %xmm0
377 movdqa SPXMM1OFF(%r11), %xmm1
378 movdqa SPXMM2OFF(%r11), %xmm2
379 movdqa SPXMM3OFF(%r11), %xmm3
380 movdqa SPXMM4OFF(%r11), %xmm4
381 movdqa SPXMM5OFF(%r11), %xmm5
382 movdqa SPXMM6OFF(%r11), %xmm6
383 movdqa SPXMM7OFF(%r11), %xmm7
384 movq SPR11OFF(%r11), %r11 / restore %r11 last 450 movq SPR11OFF(%r11), %r11 / restore %r11 last
385 451
386 /* 452 /*
387 * Call to destination function - we'll return here 453 * Call to destination function - we'll return here
388 * for pltexit monitoring. 454 * for pltexit monitoring.
491 /* 557 /*
492 * Possible arguments for the resolved function are in registers as per 558 * Possible arguments for the resolved function are in registers as per
493 * the AMD64 ABI. We must save on the local stack all possible register 559 * the AMD64 ABI. We must save on the local stack all possible register
494 * arguments before interposing functions to resolve the called function. 560 * arguments before interposing functions to resolve the called function.
495 * Possible arguments must be restored before invoking the resolved function. 561 * Possible arguments must be restored before invoking the resolved function.
496 * 562 *
563 * Before the AVX extensions to AMD64, the set of argument registers and their
564 * sizes was the same across processors. With AVX, the xmm registers became the
565 * lower 128 bits of the ymm registers, so we now need to conditionally save
566 * 256 bits per register instead of 128 bits. Regardless of whether ymm
567 * registers are present, we always reserve stack space as though they are, to
568 * keep the code simple.
569 *
497 * Local stack space storage for elf_rtbndr is allocated as follows: 570 * Local stack space storage for elf_rtbndr is allocated as follows:
498 * 571 *
499 * Saved regs: 572 * Saved regs:
500 * %rax 8 573 * %rax 8
501 * %rdi 8 574 * %rdi 8
504 * %rcx 8 577 * %rcx 8
505 * %r8 8 578 * %r8 8
506 * %r9 8 579 * %r9 8
507 * %r10 8 580 * %r10 8
508 * ======= 581 * =======
509 * Subtotal: 64 (16byte aligned) 582 * Subtotal: 64 (32byte aligned)
510 * 583 *
511 * Saved Media Regs (used to pass floating point args): 584 * Saved Media Regs (used to pass floating point args):
512 * %xmm0 - %xmm7 16 * 8: 128 585 * %ymm0 - %ymm7 32 * 8: 256
513 * ======= 586 * =======
514 * Total: 192 (16byte aligned) 587 * Total: 320 (32byte aligned)
515 * 588 *
516 * So - will subtract the following to create enough space 589 * So - will subtract the following to create enough space
517 * 590 *
518 * 0(%rsp) save %rax 591 * 0(%rsp) save %rax
519 * 8(%rsp) save %rdi 592 * 8(%rsp) save %rdi
521 * 24(%rsp) save %rdx 594 * 24(%rsp) save %rdx
522 * 32(%rsp) save %rcx 595 * 32(%rsp) save %rcx
523 * 40(%rsp) save %r8 596 * 40(%rsp) save %r8
524 * 48(%rsp) save %r9 597 * 48(%rsp) save %r9
525 * 56(%rsp) save %r10 598 * 56(%rsp) save %r10
526 * 64(%rsp) save %xmm0 599 * 64(%rsp) save %ymm0
527 * 80(%rsp) save %xmm1 600 * 96(%rsp) save %ymm1
528 * 96(%rsp) save %xmm2 601 * 128(%rsp) save %ymm2
529 * 112(%rsp) save %xmm3 602 * 160(%rsp) save %ymm3
530 * 128(%rsp) save %xmm4 603 * 192(%rsp) save %ymm4
531 * 144(%rsp) save %xmm5 604 * 224(%rsp) save %ymm5
532 * 160(%rsp) save %xmm6 605 * 256(%rsp) save %ymm6
533 * 176(%rsp) save %xmm7 606 * 288(%rsp) save %ymm7
534 * 607 *
535 * Note: Some callers may use 8-byte stack alignment instead of the 608 * Note: Some callers may use 8-byte stack alignment instead of the
536 * ABI required 16-byte alignment. We use %rsp offsets to save/restore 609 * ABI required 16-byte alignment. We use %rsp offsets to save/restore
537 * registers because %rbp may not be 16-byte aligned. We guarantee %rsp 610 * registers because %rbp may not be 16-byte aligned. We guarantee %rsp
538 * is 16-byte aligned in the function preamble. 611 * is 16-byte aligned in the function preamble.
539 */ 612 */
540 #define LS_SIZE $192 /* local stack space to save all possible arguments */ 613 /*
614 * As the registers may be either xmm or ymm, we've kept the xmm names but
615 * increased the spacing between the slots so each covers the ymm case as well.
616 */
617 #define LS_SIZE $320 /* local stack space to save all possible arguments */
541 #define LSRAXOFF 0 /* for SSE register count */ 618 #define LSRAXOFF 0 /* for SSE register count */
542 #define LSRDIOFF 8 /* arg 0 ... */ 619 #define LSRDIOFF 8 /* arg 0 ... */
543 #define LSRSIOFF 16 620 #define LSRSIOFF 16
544 #define LSRDXOFF 24 621 #define LSRDXOFF 24
545 #define LSRCXOFF 32 622 #define LSRCXOFF 32
546 #define LSR8OFF 40 623 #define LSR8OFF 40
547 #define LSR9OFF 48 624 #define LSR9OFF 48
548 #define LSR10OFF 56 /* ... arg 5 */ 625 #define LSR10OFF 56 /* ... arg 5 */
549 #define LSXMM0OFF 64 /* SSE arg 0 ... */ 626 #define LSXMM0OFF 64 /* SSE arg 0 ... */
550 #define LSXMM1OFF 80 627 #define LSXMM1OFF 96
551 #define LSXMM2OFF 96 628 #define LSXMM2OFF 128
552 #define LSXMM3OFF 112 629 #define LSXMM3OFF 160
553 #define LSXMM4OFF 128 630 #define LSXMM4OFF 192
554 #define LSXMM5OFF 144 631 #define LSXMM5OFF 224
555 #define LSXMM6OFF 160 632 #define LSXMM6OFF 256
556 #define LSXMM7OFF 176 /* ... SSE arg 7 */ 633 #define LSXMM7OFF 288 /* ... SSE arg 7 */
557 634
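The LS_* offsets above put the eight general-purpose argument slots in the first 64 bytes and then give every vector argument register a full 32-byte slot, so the same offsets work whether a slot is written with movdqa (16 bytes) or vmovdqa (32 bytes). A compile-time sketch of that layout; the rtbndr_save struct and rs_* names are hypothetical, not rtld's:

#include <stddef.h>

typedef struct rtbndr_save {
        unsigned long   rs_rax;         /* LSRAXOFF   0 */
        unsigned long   rs_rdi;         /* LSRDIOFF   8 */
        unsigned long   rs_rsi;         /* LSRSIOFF  16 */
        unsigned long   rs_rdx;         /* LSRDXOFF  24 */
        unsigned long   rs_rcx;         /* LSRCXOFF  32 */
        unsigned long   rs_r8;          /* LSR8OFF   40 */
        unsigned long   rs_r9;          /* LSR9OFF   48 */
        unsigned long   rs_r10;         /* LSR10OFF  56 */
        unsigned char   rs_vec[8][32];  /* LSXMM0OFF 64 ... LSXMM7OFF 288 */
} rtbndr_save_t;

_Static_assert(offsetof(rtbndr_save_t, rs_vec) == 64, "LSXMM0OFF");
_Static_assert(offsetof(rtbndr_save_t, rs_vec[7]) == 288, "LSXMM7OFF");
_Static_assert(sizeof (rtbndr_save_t) == 320, "LS_SIZE");

int
main(void)
{
        return (0);
}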
635 /*
636 * org_scapset is a global variable that is part of rtld. It contains
637 * the capabilities that the kernel has told us are supported
638 * (auxv_hwcap). We use it to determine whether we need to save and
639 * restore the AVX (ymm) registers or just the SSE (xmm) registers.
640 * Note that the field we care about is currently at offset 0; if that
641 * changes, this code will have to be updated.
642 */
643 .extern org_scapset
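The AV_386_AVX bit that the code tests in org_scapset is the same hardware-capability word the kernel exports to userland. A minimal standalone sketch (not rtld code) of the same test using the illumos getisax(2) interface and <sys/auxv_386.h>:

#include <stdint.h>
#include <stdio.h>
#include <sys/auxv.h>           /* getisax(2) */
#include <sys/auxv_386.h>       /* AV_386_AVX */

int
main(void)
{
        uint32_t hw = 0;

        (void) getisax(&hw, 1);         /* first hwcap word holds the AV_386_* bits */

        if (hw & AV_386_AVX)
                (void) printf("AVX present: preserve %%ymm0-%%ymm7 across the PLT\n");
        else
                (void) printf("no AVX: preserving %%xmm0-%%xmm7 is enough\n");
        return (0);
}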
558 .weak _elf_rtbndr 644 .weak _elf_rtbndr
559 _elf_rtbndr = elf_rtbndr 645 _elf_rtbndr = elf_rtbndr
560 646
561 ENTRY(elf_rtbndr) 647 ENTRY(elf_rtbndr)
562 648
567 * Some libraries may (incorrectly) use non-ABI compliant 8-byte stack 653 * Some libraries may (incorrectly) use non-ABI compliant 8-byte stack
568 * alignment. Enforce ABI 16-byte stack alignment here. 654 * alignment. Enforce ABI 16-byte stack alignment here.
569 * The next andq instruction does this pseudo code: 655 * The next andq instruction does this pseudo code:
570 * If %rsp is 8 byte aligned then subtract 8 from %rsp. 656 * If %rsp is 8 byte aligned then subtract 8 from %rsp.
571 */ 657 */
572 andq $-16, %rsp /* enforce ABI 16-byte stack alignment */ 658 andq $-32, %rsp /* enforce ABI 32-byte stack alignment */
573 659
574 subq LS_SIZE, %rsp /* save all ABI defined argument registers */ 660 subq LS_SIZE, %rsp /* save all ABI defined argument registers */
575 661
576 movq %rax, LSRAXOFF(%rsp) /* for SSE register count */ 662 movq %rax, LSRAXOFF(%rsp) /* for SSE register count */
577 movq %rdi, LSRDIOFF(%rsp) /* arg 0 .. */ 663 movq %rdi, LSRDIOFF(%rsp) /* arg 0 .. */
580 movq %rcx, LSRCXOFF(%rsp) 666 movq %rcx, LSRCXOFF(%rsp)
581 movq %r8, LSR8OFF(%rsp) 667 movq %r8, LSR8OFF(%rsp)
582 movq %r9, LSR9OFF(%rsp) /* .. arg 5 */ 668 movq %r9, LSR9OFF(%rsp) /* .. arg 5 */
583 movq %r10, LSR10OFF(%rsp) /* call chain reg */ 669 movq %r10, LSR10OFF(%rsp) /* call chain reg */
584 670
671 /*
672 * Our xmm registers could secretly be ymm registers in disguise.
673 */
674 movq org_scapset@GOTPCREL(%rip),%r9
675 movq (%r9),%r9
676 movl (%r9),%edx
677 testl $AV_386_AVX,%edx
678 jne .save_ymm
679
680 .save_xmm:
585 movdqa %xmm0, LSXMM0OFF(%rsp) /* SSE arg 0 ... */ 681 movdqa %xmm0, LSXMM0OFF(%rsp) /* SSE arg 0 ... */
586 movdqa %xmm1, LSXMM1OFF(%rsp) 682 movdqa %xmm1, LSXMM1OFF(%rsp)
587 movdqa %xmm2, LSXMM2OFF(%rsp) 683 movdqa %xmm2, LSXMM2OFF(%rsp)
588 movdqa %xmm3, LSXMM3OFF(%rsp) 684 movdqa %xmm3, LSXMM3OFF(%rsp)
589 movdqa %xmm4, LSXMM4OFF(%rsp) 685 movdqa %xmm4, LSXMM4OFF(%rsp)
590 movdqa %xmm5, LSXMM5OFF(%rsp) 686 movdqa %xmm5, LSXMM5OFF(%rsp)
591 movdqa %xmm6, LSXMM6OFF(%rsp) 687 movdqa %xmm6, LSXMM6OFF(%rsp)
592 movdqa %xmm7, LSXMM7OFF(%rsp) /* ... SSE arg 7 */ 688 movdqa %xmm7, LSXMM7OFF(%rsp) /* ... SSE arg 7 */
593 689 jmp .save_finish
690
691 .save_ymm:
692 vmovdqa %ymm0, LSXMM0OFF(%rsp) /* SSE arg 0 ... */
693 vmovdqa %ymm1, LSXMM1OFF(%rsp)
694 vmovdqa %ymm2, LSXMM2OFF(%rsp)
695 vmovdqa %ymm3, LSXMM3OFF(%rsp)
696 vmovdqa %ymm4, LSXMM4OFF(%rsp)
697 vmovdqa %ymm5, LSXMM5OFF(%rsp)
698 vmovdqa %ymm6, LSXMM6OFF(%rsp)
699 vmovdqa %ymm7, LSXMM7OFF(%rsp) /* ... SSE arg 7 */
700
701 .save_finish:
594 movq LBPLMPOFF(%rbp), %rdi /* arg1 - *lmp */ 702 movq LBPLMPOFF(%rbp), %rdi /* arg1 - *lmp */
595 movq LBPRELOCOFF(%rbp), %rsi /* arg2 - reloc index */ 703 movq LBPRELOCOFF(%rbp), %rsi /* arg2 - reloc index */
596 movq LBRPCOFF(%rbp), %rdx /* arg3 - pc of caller */ 704 movq LBRPCOFF(%rbp), %rdx /* arg3 - pc of caller */
597 call elf_bndr@PLT /* call elf_bndr(lmp, relndx, pc) */ 705 call elf_bndr@PLT /* call elf_bndr(lmp, relndx, pc) */
598 movq %rax, LBPRELOCOFF(%rbp) /* store final destination */ 706 movq %rax, LBPRELOCOFF(%rbp) /* store final destination */
599 707
600 /* restore possible arguments before invoking resolved function */ 708 /*
709 * Restore possible arguments before invoking the resolved function. The
710 * vector regs are restored first, while %r9 and %rdx are still free for the hwcap check.
711 */
712 movq org_scapset@GOTPCREL(%rip),%r9
713 movq (%r9),%r9
714 movl (%r9),%edx
715 testl $AV_386_AVX,%edx
716 jne .restore_ymm
717
718 .restore_xmm:
719 movdqa LSXMM0OFF(%rsp), %xmm0
720 movdqa LSXMM1OFF(%rsp), %xmm1
721 movdqa LSXMM2OFF(%rsp), %xmm2
722 movdqa LSXMM3OFF(%rsp), %xmm3
723 movdqa LSXMM4OFF(%rsp), %xmm4
724 movdqa LSXMM5OFF(%rsp), %xmm5
725 movdqa LSXMM6OFF(%rsp), %xmm6
726 movdqa LSXMM7OFF(%rsp), %xmm7
727 jmp .restore_finish
728
729 .restore_ymm:
730 vmovdqa LSXMM0OFF(%rsp), %ymm0
731 vmovdqa LSXMM1OFF(%rsp), %ymm1
732 vmovdqa LSXMM2OFF(%rsp), %ymm2
733 vmovdqa LSXMM3OFF(%rsp), %ymm3
734 vmovdqa LSXMM4OFF(%rsp), %ymm4
735 vmovdqa LSXMM5OFF(%rsp), %ymm5
736 vmovdqa LSXMM6OFF(%rsp), %ymm6
737 vmovdqa LSXMM7OFF(%rsp), %ymm7
738
739 .restore_finish:
601 movq LSRAXOFF(%rsp), %rax 740 movq LSRAXOFF(%rsp), %rax
602 movq LSRDIOFF(%rsp), %rdi 741 movq LSRDIOFF(%rsp), %rdi
603 movq LSRSIOFF(%rsp), %rsi 742 movq LSRSIOFF(%rsp), %rsi
604 movq LSRDXOFF(%rsp), %rdx 743 movq LSRDXOFF(%rsp), %rdx
605 movq LSRCXOFF(%rsp), %rcx 744 movq LSRCXOFF(%rsp), %rcx
606 movq LSR8OFF(%rsp), %r8 745 movq LSR8OFF(%rsp), %r8
607 movq LSR9OFF(%rsp), %r9 746 movq LSR9OFF(%rsp), %r9
608 movq LSR10OFF(%rsp), %r10 747 movq LSR10OFF(%rsp), %r10
609 748
610 movdqa LSXMM0OFF(%rsp), %xmm0
611 movdqa LSXMM1OFF(%rsp), %xmm1
612 movdqa LSXMM2OFF(%rsp), %xmm2
613 movdqa LSXMM3OFF(%rsp), %xmm3
614 movdqa LSXMM4OFF(%rsp), %xmm4
615 movdqa LSXMM5OFF(%rsp), %xmm5
616 movdqa LSXMM6OFF(%rsp), %xmm6
617 movdqa LSXMM7OFF(%rsp), %xmm7
618
619 movq %rbp, %rsp 749 movq %rbp, %rsp
620 popq %rbp 750 popq %rbp
621 751
622 addq $8, %rsp /* pop 1st plt-pushed args */ 752 addq $8, %rsp /* pop 1st plt-pushed args */
623 /* the second argument is used */ 753 /* the second argument is used */