changeset 13675:a9ae30c28ee4

2413 %ymm* need to be preserved on way through PLT
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Joshua M. Clulow <josh@sysmgr.org>
Reviewed by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Approved by: Albert Lee <trisk@nexenta.com>
author Robert Mustacchi <rm@joyent.com>
date Wed, 25 Apr 2012 00:27:21 -0400
parents 181ba6c41bee
children 98ca40df9171
files usr/src/cmd/sgs/include/rtld.h usr/src/cmd/sgs/rtld/amd64/boot_elf.s usr/src/cmd/sgs/rtld/common/globals.c usr/src/uts/i86pc/os/cpuid.c
diffstat 4 files changed, 224 insertions(+), 83 deletions(-) [+]
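For context (not part of the changeset): the problem shows up whenever a caller passes a vector argument through a not-yet-resolved PLT entry. The first call is routed through rtld's elf_rtbndr (or elf_plt_trace when auditing), which previously saved and restored only the low 128 bits (%xmm) of each argument register, so anything the binder did to the upper halves leaked through to the callee. A minimal sketch of the failure mode, assuming a hypothetical shared-library function scale():

	#include <immintrin.h>

	/* Hypothetical function in a shared library; the first call from this
	 * program is resolved lazily through the PLT and elf_rtbndr. */
	extern __m256d scale(__m256d v, double factor);

	int
	main(void)
	{
		double out[4];
		__m256d v = _mm256_set1_pd(1.0);

		/* v travels in %ymm0; before this fix, lazy binding could
		 * clobber its upper 128 bits while resolving scale(). */
		v = scale(v, 2.0);
		_mm256_storeu_pd(out, v);
		return (out[0] == 2.0 ? 0 : 1);
	}

Built with AVX enabled (e.g. -mavx), only the first, lazily bound call is exposed; later calls bypass the binder entirely.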
--- a/usr/src/cmd/sgs/include/rtld.h	Tue Apr 24 22:00:12 2012 -0400
+++ b/usr/src/cmd/sgs/include/rtld.h	Wed Apr 25 00:27:21 2012 -0400
@@ -1106,6 +1106,10 @@
  * capabilities of the system.  This structure follows the Objcapset definition
  * from libld.h, however the system can only have one platform or machine
  * hardware name, thus this structure is a little simpler.
+ *
+ * Note: the amd64 version of elf_rtbndr assumes that the sc_hw_1 value is at
+ * offset zero. If you change this structure in a way that invalidates that
+ * assumption, you must update that code as well.
  */
 typedef	struct {
 	elfcap_mask_t	sc_hw_1;	/* CA_SUNW_HW_1 capabilities */
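The layout assumption documented above can be made mechanically checkable. A minimal sketch (not part of the change) using a C11 static assertion; illumos's CTASSERT() serves the same purpose where it is available, and the Syscapset stand-in below only mirrors the fragment shown:

	#include <assert.h>		/* static_assert */
	#include <stddef.h>		/* offsetof */

	/* Stand-in for the Syscapset definition in rtld.h. */
	typedef struct {
		unsigned int	sc_hw_1;	/* CA_SUNW_HW_1 capabilities */
		/* ... remaining capability members ... */
	} Syscapset;

	/* The amd64 elf_rtbndr dereferences org_scapset as if it pointed
	 * directly at sc_hw_1, so the member must stay at offset zero. */
	static_assert(offsetof(Syscapset, sc_hw_1) == 0,
	    "sc_hw_1 must remain the first member of Syscapset");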
--- a/usr/src/cmd/sgs/rtld/amd64/boot_elf.s	Tue Apr 24 22:00:12 2012 -0400
+++ b/usr/src/cmd/sgs/rtld/amd64/boot_elf.s	Wed Apr 25 00:27:21 2012 -0400
@@ -22,10 +22,9 @@
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright (c) 2012 Joyent, Inc. All rights reserved.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #if	defined(lint)
 
 #include	<sys/types.h>
@@ -33,6 +32,7 @@
 #include	<_audit.h>
 #include	<_elf.h>
 #include	<sys/regset.h>
+#include	<sys/auxv_386.h>
 
 /* ARGSUSED0 */
 int
@@ -45,6 +45,7 @@
 #include	<link.h>
 #include	<_audit.h>
 #include	<sys/asm_linkage.h>
+#include	<sys/auxv_386.h>
 
 	.file	"boot_elf.s"
 	.text
@@ -106,12 +107,12 @@
  *	    %r11			 8
  *	    %rax			 8
  *				    =======
- *			    Subtotal:	144 (16byte aligned)
+ *			    Subtotal:	144 (32byte aligned)
  *
  *	Saved Media Regs (used to pass floating point args):
- *	    %xmm0 - %xmm7   16 * 8:	128
+ *	    %ymm0 - %ymm7   32 * 8:	256
  *				    =======
- *			    Total:	272 (16byte aligned)
+ *			    Total:	400 (32byte aligned)
  *  
  *  So - will subtract the following to create enough space
  *
@@ -131,14 +132,14 @@
  *	-144(%rbp)	entering %r10
  *	-152(%rbp)	entering %r11
  *	-160(%rbp)	entering %rax
- *	-176(%rbp)	entering %xmm0
- *	-192(%rbp)	entering %xmm1
- *	-208(%rbp)	entering %xmm2
- *	-224(%rbp)	entering %xmm3
- *	-240(%rbp)	entering %xmm4
- *	-256(%rbp)	entering %xmm5
- *	-272(%rbp)	entering %xmm6
- *	-288(%rbp)	entering %xmm7
+ *	-192(%rbp)	entering %xmm0
+ *	-224(%rbp)	entering %xmm1
+ *	-256(%rbp)	entering %xmm2
+ *	-288(%rbp)	entering %xmm3
+ *	-320(%rbp)	entering %xmm4
+ *	-352(%rbp)	entering %xmm5
+ *	-384(%rbp)	entering %xmm6
+ *	-416(%rbp)	entering %xmm7
  *
  */
 #define	SPDYNOFF    -8
@@ -148,39 +149,41 @@
 
 /*
  * The next set of offsets are relative to %rsp.
- * We guarantee %rsp is ABI compliant 16-byte aligned.  This guarantees the
- * xmm registers are saved to 16-byte aligned addresses.
+ * We guarantee %rsp is ABI compliant 32-byte aligned.  This guarantees the
+ * ymm registers are saved to 32-byte aligned addresses.
  * %rbp may only be 8 byte aligned if we came in from non-ABI compliant code.
  */ 
-#define	SPRDIOFF	192
-#define	SPRSIOFF	184
-#define	SPRDXOFF	176
-#define	SPRCXOFF	168
-#define	SPR8OFF		160
-#define	SPR9OFF		152
-#define	SPR10OFF	144
-#define	SPR11OFF	136
-#define	SPRAXOFF	128
-#define	SPXMM0OFF	112
-#define	SPXMM1OFF	96
-#define	SPXMM2OFF	80
-#define	SPXMM3OFF	64
-#define	SPXMM4OFF	48
-#define	SPXMM5OFF	32
-#define	SPXMM6OFF	16
+#define	SPRDIOFF	320
+#define	SPRSIOFF	312
+#define	SPRDXOFF	304
+#define	SPRCXOFF	296
+#define	SPR8OFF		288
+#define	SPR9OFF		280
+#define	SPR10OFF	272
+#define	SPR11OFF	264
+#define	SPRAXOFF	256
+#define	SPXMM0OFF	224
+#define	SPXMM1OFF	192
+#define	SPXMM2OFF	160
+#define	SPXMM3OFF	128
+#define	SPXMM4OFF	96
+#define	SPXMM5OFF	64
+#define	SPXMM6OFF	32
 #define	SPXMM7OFF	0
 
+	/* See elf_rtbndr for explanation behind org_scapset */
+	.extern org_scapset
 	.globl	elf_plt_trace
 	.type	elf_plt_trace,@function
 	.align 16
 elf_plt_trace:
 	/*
-	 * Enforce ABI 16-byte stack alignment here.
+	 * Enforce ABI 32-byte stack alignment here.
 	 * The next andq instruction does this pseudo code:
 	 * If %rsp is 8 byte aligned then subtract 8 from %rsp.
 	 */
-	andq    $-16, %rsp	/* enforce ABI 16-byte stack alignment */
-	subq	$272,%rsp	/ create some local storage
+	andq    $-32, %rsp	/* enforce ABI 32-byte stack alignment */
+	subq	$400,%rsp	/ create some local storage
 
 	movq	%rdi, SPRDIOFF(%rsp)
 	movq	%rsi, SPRSIOFF(%rsp)
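The 32-byte alignment enforced above is the usual mask trick; a C sketch of what the single andq $-32 accomplishes (illustration only):

	#include <stdint.h>

	/* Round a stack pointer value down to a 32-byte boundary. */
	static inline uintptr_t
	align32_down(uintptr_t sp)
	{
		return (sp & ~(uintptr_t)31);	/* -32 is ~31 in two's complement */
	}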
@@ -191,6 +194,14 @@
 	movq	%r10, SPR10OFF(%rsp)
 	movq	%r11, SPR11OFF(%rsp)
 	movq	%rax, SPRAXOFF(%rsp)
+
+	movq	org_scapset@GOTPCREL(%rip),%r9
+	movq	(%r9),%r9
+	movl	(%r9),%edx
+	testl	$AV_386_AVX,%edx
+	jne	.trace_save_ymm
+
+.trace_save_xmm:
 	movdqa	%xmm0, SPXMM0OFF(%rsp)
 	movdqa	%xmm1, SPXMM1OFF(%rsp)
 	movdqa	%xmm2, SPXMM2OFF(%rsp)
@@ -199,6 +210,19 @@
 	movdqa	%xmm5, SPXMM5OFF(%rsp)
 	movdqa	%xmm6, SPXMM6OFF(%rsp)
 	movdqa	%xmm7, SPXMM7OFF(%rsp)
+	jmp	.trace_save_finish	
+
+.trace_save_ymm:
+	vmovdqa	%ymm0, SPXMM0OFF(%rsp)
+	vmovdqa	%ymm1, SPXMM1OFF(%rsp)
+	vmovdqa	%ymm2, SPXMM2OFF(%rsp)
+	vmovdqa	%ymm3, SPXMM3OFF(%rsp)
+	vmovdqa	%ymm4, SPXMM4OFF(%rsp)
+	vmovdqa	%ymm5, SPXMM5OFF(%rsp)
+	vmovdqa	%ymm6, SPXMM6OFF(%rsp)
+	vmovdqa	%ymm7, SPXMM7OFF(%rsp)
+
+.trace_save_finish:
 
 	movq	SPDYNOFF(%rbp), %rax			/ %rax = dyndata
 	testb	$LA_SYMB_NOPLTENTER, SBFLAGS_OFF(%rax)	/ <link.h>
@@ -273,6 +297,34 @@
 	/
 	/ Restore registers
 	/
+	movq	org_scapset@GOTPCREL(%rip),%r9
+	movq	(%r9),%r9
+	movl	(%r9),%edx
+	testl	$AV_386_AVX,%edx
+	jne	.trace_restore_ymm
+
+.trace_restore_xmm:
+	movdqa	SPXMM0OFF(%rsp), %xmm0
+	movdqa	SPXMM1OFF(%rsp), %xmm1
+	movdqa	SPXMM2OFF(%rsp), %xmm2
+	movdqa	SPXMM3OFF(%rsp), %xmm3
+	movdqa	SPXMM4OFF(%rsp), %xmm4
+	movdqa	SPXMM5OFF(%rsp), %xmm5
+	movdqa	SPXMM6OFF(%rsp), %xmm6
+	movdqa	SPXMM7OFF(%rsp), %xmm7
+	jmp	.trace_restore_finish
+
+.trace_restore_ymm:
+	vmovdqa	SPXMM0OFF(%rsp), %ymm0
+	vmovdqa	SPXMM1OFF(%rsp), %ymm1
+	vmovdqa	SPXMM2OFF(%rsp), %ymm2
+	vmovdqa	SPXMM3OFF(%rsp), %ymm3
+	vmovdqa	SPXMM4OFF(%rsp), %ymm4
+	vmovdqa	SPXMM5OFF(%rsp), %ymm5
+	vmovdqa	SPXMM6OFF(%rsp), %ymm6
+	vmovdqa	SPXMM7OFF(%rsp), %ymm7
+
+.trace_restore_finish:
 	movq	SPRDIOFF(%rsp), %rdi
 	movq	SPRSIOFF(%rsp), %rsi
 	movq	SPRDXOFF(%rsp), %rdx
@@ -282,14 +334,6 @@
 	movq	SPR10OFF(%rsp), %r10
 	movq	SPR11OFF(%rsp), %r11
 	movq	SPRAXOFF(%rsp), %rax
-	movdqa	SPXMM0OFF(%rsp), %xmm0
-	movdqa	SPXMM1OFF(%rsp), %xmm1
-	movdqa	SPXMM2OFF(%rsp), %xmm2
-	movdqa	SPXMM3OFF(%rsp), %xmm3
-	movdqa	SPXMM4OFF(%rsp), %xmm4
-	movdqa	SPXMM5OFF(%rsp), %xmm5
-	movdqa	SPXMM6OFF(%rsp), %xmm6
-	movdqa	SPXMM7OFF(%rsp), %xmm7
 
 	subq	$8, %rbp			/ adjust %rbp for 'ret'
 	movq	%rbp, %rsp			/
@@ -365,6 +409,36 @@
 	/ Restore registers using %r11 which contains our old %rsp value
 	/ before growing the stack.
 	/
+
+	/ Yes, we have to do this dance again. Sorry.
+	movq	org_scapset@GOTPCREL(%rip),%r9
+	movq	(%r9),%r9
+	movl	(%r9),%edx
+	testl	$AV_386_AVX,%edx
+	jne	.trace_r2_ymm
+
+.trace_r2_xmm:
+	movdqa	SPXMM0OFF(%r11), %xmm0
+	movdqa	SPXMM1OFF(%r11), %xmm1
+	movdqa	SPXMM2OFF(%r11), %xmm2
+	movdqa	SPXMM3OFF(%r11), %xmm3
+	movdqa	SPXMM4OFF(%r11), %xmm4
+	movdqa	SPXMM5OFF(%r11), %xmm5
+	movdqa	SPXMM6OFF(%r11), %xmm6
+	movdqa	SPXMM7OFF(%r11), %xmm7
+	jmp	.trace_r2_finish
+
+.trace_r2_ymm:
+	vmovdqa	SPXMM0OFF(%r11), %ymm0
+	vmovdqa	SPXMM1OFF(%r11), %ymm1
+	vmovdqa	SPXMM2OFF(%r11), %ymm2
+	vmovdqa	SPXMM3OFF(%r11), %ymm3
+	vmovdqa	SPXMM4OFF(%r11), %ymm4
+	vmovdqa	SPXMM5OFF(%r11), %ymm5
+	vmovdqa	SPXMM6OFF(%r11), %ymm6
+	vmovdqa	SPXMM7OFF(%r11), %ymm7
+
+.trace_r2_finish:
 	movq	SPRDIOFF(%r11), %rdi
 	movq	SPRSIOFF(%r11), %rsi
 	movq	SPRDXOFF(%r11), %rdx
@@ -373,14 +447,6 @@
 	movq	SPR9OFF(%r11), %r9
 	movq	SPR10OFF(%r11), %r10
 	movq	SPRAXOFF(%r11), %rax
-	movdqa	SPXMM0OFF(%r11), %xmm0
-	movdqa	SPXMM1OFF(%r11), %xmm1
-	movdqa	SPXMM2OFF(%r11), %xmm2
-	movdqa	SPXMM3OFF(%r11), %xmm3
-	movdqa	SPXMM4OFF(%r11), %xmm4
-	movdqa	SPXMM5OFF(%r11), %xmm5
-	movdqa	SPXMM6OFF(%r11), %xmm6
-	movdqa	SPXMM7OFF(%r11), %xmm7
 	movq	SPR11OFF(%r11), %r11		/ restore %r11 last
 
 	/*
@@ -493,7 +559,14 @@
  * the AMD64 ABI.  We must save on the local stack all possible register
  * arguments before interposing functions to resolve the called function. 
  * Possible arguments must be restored before invoking the resolved function.
- *
+ *
+ * Before the AVX extensions to AMD64, the register set and register sizes
+ * did not vary across processors. With AVX, the xmm registers became the
+ * lower 128 bits of the ymm registers. Because of this, we need to
+ * conditionally save 256 bits per register instead of 128 bits. Regardless
+ * of whether ymm registers are present, we always reserve stack space for
+ * the larger save, which keeps the offsets and the code simple.
+ *
  * Local stack space storage for elf_rtbndr is allocated as follows:
  *
  *	Saved regs:
@@ -506,12 +579,12 @@
  *	    %r9				 8
  *	    %r10			 8
  *				    =======
- *			    Subtotal:   64 (16byte aligned)
+ *			    Subtotal:   64 (32byte aligned)
  *
  *	Saved Media Regs (used to pass floating point args):
- *	    %xmm0 - %xmm7   16 * 8:    128
+ *	    %ymm0 - %ymm7   32 * 8:    256
  *				    =======
- *			    Total:     192 (16byte aligned)
+ *			    Total:     320 (32byte aligned)
  *  
  *  So - will subtract the following to create enough space
  *
@@ -523,21 +596,25 @@
  *	40(%rsp)	save %r8
  *	48(%rsp)	save %r9
  *	56(%rsp)	save %r10
- *	64(%rsp)	save %xmm0
- *	80(%rsp)	save %xmm1
- *	96(%rsp)	save %xmm2
- *	112(%rsp)	save %xmm3
- *	128(%rsp)	save %xmm4
- *	144(%rsp)	save %xmm5
- *	160(%rsp)	save %xmm6
- *	176(%rsp)	save %xmm7
+ *	64(%rsp)	save %ymm0
+ *	96(%rsp)	save %ymm1
+ *	128(%rsp)	save %ymm2
+ *	160(%rsp)	save %ymm3
+ *	192(%rsp)	save %ymm4
+ *	224(%rsp)	save %ymm5
+ *	256(%rsp)	save %ymm6
+ *	288(%rsp)	save %ymm7
  *
  * Note: Some callers may use 8-byte stack alignment instead of the
  * ABI required 16-byte alignment.  We use %rsp offsets to save/restore
  * registers because %rbp may not be 16-byte aligned.  We guarantee %rsp
  * is 16-byte aligned in the function preamble.
  */
-#define	LS_SIZE	$192	/* local stack space to save all possible arguments */
+/*
+ * As the registers may be either xmm or ymm, we've kept the xmm names but
+ * widened the spacing between the offsets so each slot can hold a full ymm.
+ */
+#define	LS_SIZE	$320	/* local stack space to save all possible arguments */
 #define	LSRAXOFF	0	/* for SSE register count */
 #define	LSRDIOFF	8	/* arg 0 ... */
 #define	LSRSIOFF	16
@@ -547,14 +624,23 @@
 #define	LSR9OFF		48
 #define	LSR10OFF	56	/* ... arg 5 */
 #define	LSXMM0OFF	64	/* SSE arg 0 ... */
-#define	LSXMM1OFF	80
-#define	LSXMM2OFF	96
-#define	LSXMM3OFF	112
-#define	LSXMM4OFF	128
-#define	LSXMM5OFF	144
-#define	LSXMM6OFF	160
-#define	LSXMM7OFF	176	/* ... SSE arg 7 */
+#define	LSXMM1OFF	96
+#define	LSXMM2OFF	128
+#define	LSXMM3OFF	160
+#define	LSXMM4OFF	192
+#define	LSXMM5OFF	224
+#define	LSXMM6OFF	256
+#define	LSXMM7OFF	288	/* ... SSE arg 7 */
 
+	/*
+	 * org_scapset is a global variable that is part of rtld. It contains
+	 * the capabilities that the kernel has told us are supported
+	 * (auxv_hwcap). We use it to decide whether we need to save and
+	 * restore the AVX (ymm) registers or just the SSE (xmm) registers.
+	 * Note that the field we care about is currently at offset 0; if
+	 * that changes, this code will have to be updated.
+	 */
+	.extern org_scapset
 	.weak	_elf_rtbndr
 	_elf_rtbndr = elf_rtbndr
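In C terms, the movq/movl/testl sequence used below (and in elf_plt_trace above) amounts to the check sketched here; this is only a sketch, and it assumes the Syscapset definition from rtld.h with sc_hw_1 at offset zero:

	#include <sys/auxv_386.h>		/* AV_386_AVX */

	extern Syscapset *org_scapset;		/* rtld's system capability set */

	/* Roughly what the assembly decides before saving argument registers. */
	static int
	need_ymm_save(void)
	{
		/* sc_hw_1 is the CA_SUNW_HW_1 word seeded from auxv_hwcap. */
		return ((org_scapset->sc_hw_1 & AV_386_AVX) != 0);
	}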
 
@@ -569,7 +655,7 @@
 	 * The next andq instruction does this pseudo code:
 	 * If %rsp is 8 byte aligned then subtract 8 from %rsp.
 	 */
-	andq	$-16, %rsp	/* enforce ABI 16-byte stack alignment */
+	andq	$-32, %rsp	/* enforce ABI 32-byte stack alignment */
 
 	subq	LS_SIZE, %rsp	/* save all ABI defined argument registers */
 
@@ -582,6 +668,16 @@
 	movq	%r9, LSR9OFF(%rsp)	/* .. arg 5 */
 	movq	%r10, LSR10OFF(%rsp)	/* call chain reg */
 
+	/*
+	 * Our xmm registers could secretly be ymm registers in disguise.
+	 */
+	movq	org_scapset@GOTPCREL(%rip),%r9
+	movq	(%r9),%r9
+	movl	(%r9),%edx
+	testl	$AV_386_AVX,%edx
+	jne	.save_ymm
+
+.save_xmm:
 	movdqa	%xmm0, LSXMM0OFF(%rsp)	/* SSE arg 0 ... */
 	movdqa	%xmm1, LSXMM1OFF(%rsp)
 	movdqa	%xmm2, LSXMM2OFF(%rsp)
@@ -590,14 +686,57 @@
 	movdqa	%xmm5, LSXMM5OFF(%rsp)
 	movdqa	%xmm6, LSXMM6OFF(%rsp)
 	movdqa	%xmm7, LSXMM7OFF(%rsp)	/* ... SSE arg 7 */
+	jmp	.save_finish	
 
+.save_ymm:
+	vmovdqa	%ymm0, LSXMM0OFF(%rsp)	/* SSE arg 0 ... */
+	vmovdqa	%ymm1, LSXMM1OFF(%rsp)
+	vmovdqa	%ymm2, LSXMM2OFF(%rsp)
+	vmovdqa	%ymm3, LSXMM3OFF(%rsp)
+	vmovdqa	%ymm4, LSXMM4OFF(%rsp)
+	vmovdqa	%ymm5, LSXMM5OFF(%rsp)
+	vmovdqa	%ymm6, LSXMM6OFF(%rsp)
+	vmovdqa	%ymm7, LSXMM7OFF(%rsp)	/* ... SSE arg 7 */
+
+.save_finish:
 	movq	LBPLMPOFF(%rbp), %rdi	/* arg1 - *lmp */
 	movq	LBPRELOCOFF(%rbp), %rsi	/* arg2 - reloc index */
 	movq	LBRPCOFF(%rbp), %rdx	/* arg3 - pc of caller */
 	call	elf_bndr@PLT		/* call elf_rtbndr(lmp, relndx, pc) */
 	movq	%rax, LBPRELOCOFF(%rbp)	/* store final destination */
 
-	/* restore possible arguments before invoking resolved function */
+	/*
+	 * Restore the possible arguments before invoking the resolved
+	 * function. Do the xmm vs. ymm check first: it clobbers %r9 and
+	 * %rdx, which are restored with the rest of the registers below.
+	 */
+	movq	org_scapset@GOTPCREL(%rip),%r9
+	movq	(%r9),%r9
+	movl	(%r9),%edx
+	testl	$AV_386_AVX,%edx
+	jne	.restore_ymm
+
+.restore_xmm:
+	movdqa	LSXMM0OFF(%rsp), %xmm0
+	movdqa	LSXMM1OFF(%rsp), %xmm1
+	movdqa	LSXMM2OFF(%rsp), %xmm2
+	movdqa	LSXMM3OFF(%rsp), %xmm3
+	movdqa	LSXMM4OFF(%rsp), %xmm4
+	movdqa	LSXMM5OFF(%rsp), %xmm5
+	movdqa	LSXMM6OFF(%rsp), %xmm6
+	movdqa	LSXMM7OFF(%rsp), %xmm7
+	jmp .restore_finish
+
+.restore_ymm:
+	vmovdqa	LSXMM0OFF(%rsp), %ymm0
+	vmovdqa	LSXMM1OFF(%rsp), %ymm1
+	vmovdqa	LSXMM2OFF(%rsp), %ymm2
+	vmovdqa	LSXMM3OFF(%rsp), %ymm3
+	vmovdqa	LSXMM4OFF(%rsp), %ymm4
+	vmovdqa	LSXMM5OFF(%rsp), %ymm5
+	vmovdqa	LSXMM6OFF(%rsp), %ymm6
+	vmovdqa	LSXMM7OFF(%rsp), %ymm7
+
+.restore_finish:
 	movq	LSRAXOFF(%rsp), %rax
 	movq	LSRDIOFF(%rsp), %rdi
 	movq	LSRSIOFF(%rsp), %rsi
@@ -607,15 +746,6 @@
 	movq	LSR9OFF(%rsp), %r9
 	movq	LSR10OFF(%rsp), %r10
 
-	movdqa	LSXMM0OFF(%rsp), %xmm0
-	movdqa	LSXMM1OFF(%rsp), %xmm1
-	movdqa	LSXMM2OFF(%rsp), %xmm2
-	movdqa	LSXMM3OFF(%rsp), %xmm3
-	movdqa	LSXMM4OFF(%rsp), %xmm4
-	movdqa	LSXMM5OFF(%rsp), %xmm5
-	movdqa	LSXMM6OFF(%rsp), %xmm6
-	movdqa	LSXMM7OFF(%rsp), %xmm7
-
 	movq	%rbp, %rsp
 	popq	%rbp
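For reference (not part of the change), the widened save areas in both elf_plt_trace and elf_rtbndr can be pictured as eight 32-byte, 32-byte-aligned slots, one per %xmm/%ymm argument register; that is why the *XMM*OFF offsets now step by 32 even on non-AVX hardware. A hypothetical C rendering of the 320-byte LS_SIZE layout:

	#include <stdint.h>

	typedef struct {
		uint64_t	gp[8];		/* LSRAXOFF..LSR10OFF: 8 * 8 = 64 */
		union {
			uint8_t	xmm[16];	/* movdqa path: low 128 bits */
			uint8_t	ymm[32];	/* vmovdqa path: all 256 bits */
		} media[8] __attribute__((aligned(32)));
						/* LSXMM0OFF..LSXMM7OFF: 8 * 32 = 256 */
	} rtbndr_save_t;			/* sizeof == 64 + 256 == 320 == LS_SIZE */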
 
--- a/usr/src/cmd/sgs/rtld/common/globals.c	Tue Apr 24 22:00:12 2012 -0400
+++ b/usr/src/cmd/sgs/rtld/common/globals.c	Wed Apr 25 00:27:21 2012 -0400
@@ -151,6 +151,9 @@
  * override the system capabilities for testing purposes.  Furthermore, these
  * alternative capabilities can be specified such that they only apply to
  * specified files rather than to all objects.
+ *
+ * The amd64 version of elf_rtbndr relies on org_scapset to determine
+ * whether AVX registers are present on the system.
  */
 static Syscapset	scapset = { 0 };
 Syscapset	*org_scapset = &scapset;	/* original system and */
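org_scapset->sc_hw_1 starts out as the CA_SUNW_HW_1 word built from the hardware capabilities the kernel hands to the process (auxv_hwcap). Outside of rtld, the same AV_386_AVX bit can be read with getisax(2); a minimal sketch, assuming an amd64 illumos system:

	#include <sys/types.h>
	#include <sys/auxv.h>
	#include <sys/auxv_386.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint32_t hw = 0;

		(void) getisax(&hw, 1);
		(void) printf("AVX %s\n",
		    (hw & AV_386_AVX) ? "supported" : "not supported");
		return (0);
	}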
--- a/usr/src/uts/i86pc/os/cpuid.c	Tue Apr 24 22:00:12 2012 -0400
+++ b/usr/src/uts/i86pc/os/cpuid.c	Wed Apr 25 00:27:21 2012 -0400
@@ -30,7 +30,7 @@
  * Portions Copyright 2009 Advanced Micro Devices, Inc.
  */
 /*
- * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  */
 /*
  * Various routines to handle identification
@@ -2576,8 +2576,12 @@
 		if (*ecx & CPUID_INTC_ECX_PCLMULQDQ)
 			hwcap_flags |= AV_386_PCLMULQDQ;
 		if ((*ecx & CPUID_INTC_ECX_XSAVE) &&
-		    (*ecx & CPUID_INTC_ECX_OSXSAVE))
+		    (*ecx & CPUID_INTC_ECX_OSXSAVE)) {
 			hwcap_flags |= AV_386_XSAVE;
+
+			if (*ecx & CPUID_INTC_ECX_AVX)
+				hwcap_flags |= AV_386_AVX;
+		}
 		if (*ecx & CPUID_INTC_ECX_VMX)
 			hwcap_flags |= AV_386_VMX;
 		if (*ecx & CPUID_INTC_ECX_POPCNT)
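The ordering in the hunk above is deliberate: AV_386_AVX is only advertised when XSAVE and OSXSAVE are also present, since the OS must have enabled extended (%ymm) state management before AVX can be used. A rough userland equivalent of the same CPUID leaf-1 %ecx test, written against GCC's cpuid.h rather than the kernel's probe (an illustration only):

	#include <cpuid.h>
	#include <stdio.h>

	int
	main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) == 0)
			return (1);

		/* Mirror the kernel's check: XSAVE and OSXSAVE before AVX. */
		if ((ecx & bit_XSAVE) && (ecx & bit_OSXSAVE) && (ecx & bit_AVX))
			(void) printf("AVX usable\n");
		else
			(void) printf("AVX not usable\n");
		return (0);
	}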