changeset 10627:e56a44d6d742

6826942 Need an optimized GCM leveraging Intel's PCMULQDQ instruction
author Dan OpenSolaris Anderson <opensolaris@drydog.com>
date Wed, 23 Sep 2009 15:55:57 -0700
parents 9c09f5dd637e
children cddb35f5bfa6
files usr/src/common/crypto/modes/amd64/gcm_intel.s usr/src/common/crypto/modes/gcm.c usr/src/common/crypto/modes/modes.c usr/src/uts/common/crypto/api/kcf_random.c usr/src/uts/intel/kcf/Makefile
diffstat 5 files changed, 483 insertions(+), 38 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/common/crypto/modes/amd64/gcm_intel.s	Wed Sep 23 15:55:57 2009 -0700
@@ -0,0 +1,348 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009 Intel Corporation
+ * All Rights Reserved.
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
+ * instructions.  This file contains an accelerated
+ * Galois Field Multiplication implementation.
+ *
+ * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
+ * carry-less multiplication. More information about PCLMULQDQ can be
+ * found at:
+ * http://software.intel.com/en-us/articles/
+ * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
+ *
+ */
+
+/*
+ * ====================================================================
+ * OpenSolaris OS modifications
+ *
+ * This source originates as file galois_hash_asm.c from
+ * Intel Corporation dated September 21, 2009.
+ *
+ * This OpenSolaris version has these major changes from the original source:
+ *
+ * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
+ * definition for lint.
+ *
+ * 2. Formatted code, added comments, and added #includes and #defines.
+ *
+ * 3. Commented out pclmulqdq and pshufb instructions and replaced with
+ * .byte sequences (as pclmulqdq isn't supported yet by all of the gas, as,
+ * and aw assemblers).
+ *
+ * 4. If bit CR0.TS is set, clear it on entry and set it again on exit
+ * (that is, inside the region the caller has bracketed with
+ * kpreempt_disable() and kpreempt_enable()).  If the TS bit is not set,
+ * save and restore the %xmm registers at the beginning and end of the
+ * function (%xmm registers are not saved and restored during kernel
+ * thread preemption).
+ *
+ * 5. Removed code to perform hashing.  This is already done with C macro
+ * GHASH in gcm.c.  For better performance, this removed code should be
+ * reintegrated in the future to replace the C GHASH macro.
+ *
+ * 6. Added code to byte swap 16-byte input and output.
+ *
+ * 7. Folded in comments from the original C source with embedded assembly
+ * (SB_w_shift_xor.c)
+ *
+ * 8. Renamed function and reordered parameters to match OpenSolaris:
+ * Intel interface:
+ *	void galois_hash_asm(unsigned char *hk, unsigned char *s,
+ *		unsigned char *d, int length)
+ * OpenSolaris OS interface:
+ *	void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
+ * ====================================================================
+ */
+
+
+#if defined(lint) || defined(__lint)
+
+#include <sys/types.h>
+
+/* ARGSUSED */
+void
+gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
+}
+
+#else	/* lint */
+
+#include <sys/asm_linkage.h>
+#include <sys/controlregs.h>
+#ifdef _KERNEL
+#include <sys/machprivregs.h>
+#endif
+
+#ifdef _KERNEL
+	/*
+	 * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv.  That is,
+	 * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
+	 * uses it to pass P2 to syscall.
+	 * This also occurs with the STTS macro, but we don't care if
+	 * P2 (%rsi) is modified just before function exit.
+	 * The CLTS and STTS macros push and pop P1 (%rdi) already.
+	 */
+#ifdef __xpv
+#define	PROTECTED_CLTS \
+	push	%rsi; \
+	CLTS; \
+	pop	%rsi
+#else
+#define	PROTECTED_CLTS \
+	CLTS
+#endif	/* __xpv */
+
+	/*
+	 * Save the frame pointer, then: if CR0_TS is not set, align the
+	 * stack and save %xmm0 - %xmm10 on it; otherwise clear CR0_TS.
+	 */
+#define	CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) \
+	push	%rbp; \
+	mov	%rsp, %rbp; \
+	movq    %cr0, tmpreg; \
+	testq	$CR0_TS, tmpreg; \
+	jnz	1f; \
+	and	$-XMM_ALIGN, %rsp; \
+	sub	$[XMM_SIZE * 11], %rsp; \
+	movaps	%xmm0, 160(%rsp); \
+	movaps	%xmm1, 144(%rsp); \
+	movaps	%xmm2, 128(%rsp); \
+	movaps	%xmm3, 112(%rsp); \
+	movaps	%xmm4, 96(%rsp); \
+	movaps	%xmm5, 80(%rsp); \
+	movaps	%xmm6, 64(%rsp); \
+	movaps	%xmm7, 48(%rsp); \
+	movaps	%xmm8, 32(%rsp); \
+	movaps	%xmm9, 16(%rsp); \
+	movaps	%xmm10, (%rsp); \
+	jmp	2f; \
+1: \
+	PROTECTED_CLTS; \
+2:
+
+
+	/*
+	 * If CR0_TS was not set above, pop %xmm0 - %xmm10 off stack,
+	 * otherwise set CR0_TS.
+	 */
+#define	SET_TS_OR_POP_XMM_REGISTERS(tmpreg) \
+	testq	$CR0_TS, tmpreg; \
+	jnz	1f; \
+	movaps	(%rsp), %xmm10; \
+	movaps	16(%rsp), %xmm9; \
+	movaps	32(%rsp), %xmm8; \
+	movaps	48(%rsp), %xmm7; \
+	movaps	64(%rsp), %xmm6; \
+	movaps	80(%rsp), %xmm5; \
+	movaps	96(%rsp), %xmm4; \
+	movaps	112(%rsp), %xmm3; \
+	movaps	128(%rsp), %xmm2; \
+	movaps	144(%rsp), %xmm1; \
+	movaps	160(%rsp), %xmm0; \
+	jmp	2f; \
+1: \
+	STTS(tmpreg); \
+2: \
+	mov	%rbp, %rsp; \
+	pop	%rbp
+
+
+#else
+#define	PROTECTED_CLTS
+#define	CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg)
+#define	SET_TS_OR_POP_XMM_REGISTERS(tmpreg)
+#endif	/* _KERNEL */
+
+/*
+ * Use this mask to byte-swap a 16-byte integer with the pshufb instruction
+ */
+
+// static uint8_t byte_swap16_mask[] = {
+//	 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 };
+.text
+.align XMM_ALIGN
+.Lbyte_swap16_mask:
+	.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+
+
+/*
+ * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
+ *
+ * Perform a carry-less multiplication (that is, a multiplication whose
+ * partial products are combined with XOR instead of addition) on P1 and
+ * P2 and place the result in P3.
+ *
+ * Byte swap the input and the output.
+ *
+ * Note: x_in, y, and res all point to a block of 16-byte numbers
+ * (an array of two 64-bit integers).
+ *
+ * Note2: For kernel code, caller is responsible for ensuring
+ * kpreempt_disable() has been called.  This is because %xmm registers are
+ * not saved/restored.  Clear and set the CR0.TS bit on entry and exit,
+ * respectively,  if TS is set on entry.  Otherwise, if TS is not set,
+ * save and restore %xmm registers on the stack.
+ *
+ * Note3: Original Intel definition:
+ * void galois_hash_asm(unsigned char *hk, unsigned char *s,
+ *	unsigned char *d, int length)
+ *
+ * Note4: Register/parameter mapping:
+ * Intel:
+ *	Parameter 1: %rcx (copied to %xmm0)	hk or x_in
+ *	Parameter 2: %rdx (copied to %xmm1)	s or y
+ *	Parameter 3: %rdi (result)		d or res
+ * OpenSolaris:
+ *	Parameter 1: %rdi (copied to %xmm0)	x_in
+ *	Parameter 2: %rsi (copied to %xmm1)	y
+ *	Parameter 3: %rdx (result)		res
+ */
+
+ENTRY_NP(gcm_mul_pclmulqdq)
+	CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10)
+
+	//
+	// Copy Parameters
+	//
+	movdqu	(%rdi), %xmm0 // P1
+	movdqu	(%rsi), %xmm1 // P2
+
+	//
+	// Byte swap 16-byte input
+	//
+	lea	.Lbyte_swap16_mask(%rip), %rax
+	movaps	(%rax), %xmm10
+	//pshufb	%xmm10, %xmm0
+	.byte	0x66, 0x41, 0x0f, 0x38, 0x00, 0xc2
+	//pshufb	%xmm10, %xmm1
+	.byte	0x66, 0x41, 0x0f, 0x38, 0x00, 0xca
+
+
+	//
+	// Multiply with the hash key
+	//
+	movdqu	%xmm0, %xmm3
+	//pclmulqdq $0, %xmm1, %xmm3	// xmm3 holds a0*b0
+	.byte	0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x00
+
+	movdqu	%xmm0, %xmm4
+	//pclmulqdq $16, %xmm1, %xmm4	// xmm4 holds a0*b1
+	.byte	0x66, 0x0f, 0x3a, 0x44, 0xe1, 0x10
+
+	movdqu	%xmm0, %xmm5
+	//pclmulqdq $1, %xmm1, %xmm5	// xmm5 holds a1*b0
+	.byte	0x66, 0x0f, 0x3a, 0x44, 0xe9, 0x01
+	movdqu	%xmm0, %xmm6
+	//pclmulqdq $17, %xmm1, %xmm6	// xmm6 holds a1*b1
+	.byte	0x66, 0x0f, 0x3a, 0x44, 0xf1, 0x11
+
+	pxor	%xmm5, %xmm4	// xmm4 holds a0*b1 + a1*b0
+
+	movdqu	%xmm4, %xmm5	// move the contents of xmm4 to xmm5
+	psrldq	$8, %xmm4	// shift xmm4 right by 64 bits (8 bytes)
+	pslldq	$8, %xmm5	// shift xmm5 left by 64 bits (8 bytes)
+	pxor	%xmm5, %xmm3
+	pxor	%xmm4, %xmm6	// Register pair <xmm6:xmm3> holds the result
+				// of the carry-less multiplication of
+				// xmm0 by xmm1.
+
+	// We shift the result of the multiplication by one bit position
+	// to the left to compensate for the fact that the bits are reversed.
+	movdqu	%xmm3, %xmm7
+	movdqu	%xmm6, %xmm8
+	pslld	$1, %xmm3
+	pslld	$1, %xmm6
+	psrld	$31, %xmm7
+	psrld	$31, %xmm8
+	movdqu	%xmm7, %xmm9
+	pslldq	$4, %xmm8
+	pslldq	$4, %xmm7
+	psrldq	$12, %xmm9
+	por	%xmm7, %xmm3
+	por	%xmm8, %xmm6
+	por	%xmm9, %xmm6
+
+	//
+	// First phase of the reduction
+	//
+	// Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
+	// independently.
+	movdqu	%xmm3, %xmm7
+	movdqu	%xmm3, %xmm8
+	movdqu	%xmm3, %xmm9
+	pslld	$31, %xmm7	// packed left shift by 31 bits
+	pslld	$30, %xmm8	// packed left shift by 30 bits
+	pslld	$25, %xmm9	// packed left shift by 25 bits
+	pxor	%xmm8, %xmm7	// xor the shifted versions
+	pxor	%xmm9, %xmm7
+	movdqu	%xmm7, %xmm8
+	pslldq	$12, %xmm7
+	psrldq	$4, %xmm8
+	pxor	%xmm7, %xmm3	// first phase of the reduction complete
+
+	//
+	// Second phase of the reduction
+	//
+	// Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
+	// shift operations.
+	movdqu	%xmm3, %xmm2
+	movdqu	%xmm3, %xmm4
+	movdqu	%xmm3, %xmm5
+	psrld	$1, %xmm2	// packed right shift by 1 bit
+	psrld	$2, %xmm4	// packed right shift by 2 bits
+	psrld	$7, %xmm5	// packed right shift by 7 bits
+	pxor	%xmm4, %xmm2	// xor the shifted versions
+	pxor	%xmm5, %xmm2
+	pxor	%xmm8, %xmm2
+	pxor	%xmm2, %xmm3
+	pxor	%xmm3, %xmm6	// the result is in xmm6
+
+	//
+	// Byte swap 16-byte result
+	//
+	//pshufb	%xmm10, %xmm6	// %xmm10 has the swap mask
+	.byte	0x66, 0x41, 0x0f, 0x38, 0x00, 0xf2
+
+	//
+	// Store the result
+	//
+	movdqu	%xmm6, (%rdx) // P3
+
+
+	//
+	// Cleanup and Return
+	//
+	SET_TS_OR_POP_XMM_REGISTERS(%r10)
+	ret
+	SET_SIZE(gcm_mul_pclmulqdq)
+
+#endif  /* lint || __lint */
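
Note2 above leaves preemption control to the caller: the %xmm state used by
gcm_mul_pclmulqdq() is only safe while kernel preemption is disabled.  A
minimal caller sketch follows (assuming a _KERNEL build; ghash_mul_example()
is a hypothetical wrapper, not part of this changeset; gcm.c below does the
same thing through its KPREEMPT_DISABLE/KPREEMPT_ENABLE macros):

#include <sys/types.h>
#include <sys/disp.h>		/* kpreempt_disable(), kpreempt_enable() */

extern void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);

static void
ghash_mul_example(uint64_t x[2], uint64_t h[2], uint64_t res[2])
{
	kpreempt_disable();		/* caller's responsibility (Note2) */
	gcm_mul_pclmulqdq(x, h, res);	/* res = x * h in GF(2^128) */
	kpreempt_enable();
}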
--- a/usr/src/common/crypto/modes/gcm.c	Wed Sep 23 18:41:35 2009 -0400
+++ b/usr/src/common/crypto/modes/gcm.c	Wed Sep 23 15:55:57 2009 -0700
@@ -23,12 +23,14 @@
  * Use is subject to license terms.
  */
 
+
 #ifndef _KERNEL
 #include <strings.h>
 #include <limits.h>
 #include <assert.h>
 #include <security/cryptoki.h>
-#endif
+#endif	/* _KERNEL */
+
 
 #include <sys/types.h>
 #include <sys/kmem.h>
@@ -37,43 +39,85 @@
 #include <sys/crypto/impl.h>
 #include <sys/byteorder.h>
 
+#ifdef __amd64
+#include <sys/x86_archext.h>	/* x86_feature, X86_*, CPUID_* */
+
+#ifdef _KERNEL
+#include <sys/cpuvar.h>		/* cpu_t, CPU */
+#include <sys/disp.h>		/* kpreempt_disable(), kpreempt_enable() */
+/* Workaround for no XMM kernel thread save/restore */
+#define	KPREEMPT_DISABLE	kpreempt_disable()
+#define	KPREEMPT_ENABLE		kpreempt_enable()
+
+#else
+#include <sys/auxv.h>		/* getisax() */
+#include <sys/auxv_386.h>	/* AV_386_PCLMULQDQ bit */
+#define	KPREEMPT_DISABLE
+#define	KPREEMPT_ENABLE
+#endif	/* _KERNEL */
+
+extern void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
+static int intel_pclmulqdq_instruction_present(void);
+#endif	/* __amd64 */
+
 struct aes_block {
 	uint64_t a;
 	uint64_t b;
 };
 
+
+/*
+ * gcm_mul()
+ * Perform a carry-less multiplication (that is, a multiplication whose
+ * partial products are combined with XOR instead of addition) on *x_in
+ * and *y and place the result in *res.
+ *
+ * Byte swap the input (*x_in and *y) and the output (*res).
+ *
+ * Note: x_in, y, and res all point to 16-byte numbers (an array of two
+ * 64-bit integers).
+ */
 void
 gcm_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
 {
-	uint64_t R = { 0xe100000000000000ULL };
-	struct aes_block z = { 0, 0 };
-	struct aes_block v;
-	uint64_t x;
-	int i, j;
-
-	v.a = ntohll(y[0]);
-	v.b = ntohll(y[1]);
+#ifdef __amd64
+	if (intel_pclmulqdq_instruction_present()) {
+		KPREEMPT_DISABLE;
+		gcm_mul_pclmulqdq(x_in, y, res);
+		KPREEMPT_ENABLE;
+	} else
+#endif	/* __amd64 */
+	{
+		static const uint64_t R = 0xe100000000000000ULL;
+		struct aes_block z = {0, 0};
+		struct aes_block v;
+		uint64_t x;
+		int i, j;
 
-	for (j = 0; j < 2; j++) {
-		x = ntohll(x_in[j]);
-		for (i = 0; i < 64; i++, x <<= 1) {
-			if (x & 0x8000000000000000ULL) {
-				z.a ^= v.a;
-				z.b ^= v.b;
-			}
-			if (v.b & 1ULL) {
-				v.b = (v.a << 63)|(v.b >> 1);
-				v.a = (v.a >> 1) ^ R;
-			} else {
-				v.b = (v.a << 63)|(v.b >> 1);
-				v.a = v.a >> 1;
+		v.a = ntohll(y[0]);
+		v.b = ntohll(y[1]);
+
+		for (j = 0; j < 2; j++) {
+			x = ntohll(x_in[j]);
+			for (i = 0; i < 64; i++, x <<= 1) {
+				if (x & 0x8000000000000000ULL) {
+					z.a ^= v.a;
+					z.b ^= v.b;
+				}
+				if (v.b & 1ULL) {
+					v.b = (v.a << 63)|(v.b >> 1);
+					v.a = (v.a >> 1) ^ R;
+				} else {
+					v.b = (v.a << 63)|(v.b >> 1);
+					v.a = v.a >> 1;
+				}
 			}
 		}
+		res[0] = htonll(z.a);
+		res[1] = htonll(z.b);
 	}
-	res[0] = htonll(z.a);
-	res[1] = htonll(z.b);
 }
 
+
 #define	GHASH(c, d, t) \
 	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
 	gcm_mul((uint64_t *)(c)->gcm_ghash, (c)->gcm_H, (uint64_t *)(t));
@@ -674,3 +718,47 @@
 {
 	ctx->gcm_kmflag = kmflag;
 }
+
+
+#ifdef __amd64
+/*
+ * Return 1 if executing on Intel with PCLMULQDQ instructions,
+ * otherwise 0 (i.e., Intel without PCLMULQDQ or AMD64).
+ * Cache the result, as the CPU can't change.
+ *
+ * Note: the userland version uses getisax().  The kernel version uses
+ * global variable x86_feature or the output of cpuid_insn().
+ */
+static int
+intel_pclmulqdq_instruction_present(void)
+{
+	static int	cached_result = -1;
+
+	if (cached_result == -1) { /* first time */
+#ifdef _KERNEL
+#ifdef X86_PCLMULQDQ
+		cached_result = (x86_feature & X86_PCLMULQDQ) != 0;
+#else
+		if (cpuid_getvendor(CPU) == X86_VENDOR_Intel) {
+			struct cpuid_regs	cpr;
+			cpu_t			*cp = CPU;
+
+			cpr.cp_eax = 1; /* Function 1: get processor info */
+			(void) cpuid_insn(cp, &cpr);
+			cached_result = ((cpr.cp_ecx &
+			    CPUID_INTC_ECX_PCLMULQDQ) != 0);
+		} else {
+			cached_result = 0;
+		}
+#endif	/* X86_PCLMULQDQ */
+#else
+		uint_t		ui = 0;
+
+		(void) getisax(&ui, 1);
+		cached_result = (ui & AV_386_PCLMULQDQ) != 0;
+#endif	/* _KERNEL */
+	}
+
+	return (cached_result);
+}
+#endif	/* __amd64 */
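
The GHASH macro above folds one block at a time: XOR the 16-byte block into
the running digest, then multiply by the hash subkey H with gcm_mul().  A
minimal sketch of that pattern over a run of contiguous 16-byte blocks
(ghash_blocks() and its parameters are hypothetical names, not part of this
changeset):

#include <sys/types.h>
#include <strings.h>		/* bcopy() */

extern void gcm_mul(uint64_t *x_in, uint64_t *y, uint64_t *res);

static void
ghash_blocks(uint64_t ghash[2], uint64_t h[2], const uint8_t *data,
    size_t nblocks)
{
	uint64_t	block[2];
	size_t		i;

	for (i = 0; i < nblocks; i++) {
		bcopy(data + (i << 4), block, sizeof (block));
		ghash[0] ^= block[0];		/* the xor_block() step */
		ghash[1] ^= block[1];
		gcm_mul(ghash, h, ghash);	/* in place, as GHASH does */
	}
}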
--- a/usr/src/common/crypto/modes/modes.c	Wed Sep 23 18:41:35 2009 -0400
+++ b/usr/src/common/crypto/modes/modes.c	Wed Sep 23 15:55:57 2009 -0700
@@ -153,7 +153,7 @@
 			*current_offset = offset + amt;
 		} else {
 			/* one block spans two mblks */
-			*out_data_1_len = mp->b_wptr - p;
+			*out_data_1_len = _PTRDIFF(mp->b_wptr, p);
 			if ((mp = mp->b_cont) == NULL)
 				return;
 			*out_data_2 = mp->b_rptr;
--- a/usr/src/uts/common/crypto/api/kcf_random.c	Wed Sep 23 18:41:35 2009 -0400
+++ b/usr/src/uts/common/crypto/api/kcf_random.c	Wed Sep 23 15:55:57 2009 -0700
@@ -110,6 +110,7 @@
 static int num_waiters;		/* #threads waiting to read from /dev/random */
 
 static struct pollhead rnd_pollhead;
+/* LINTED E_STATIC_UNUSED */
 static timeout_id_t kcf_rndtimeout_id;
 static crypto_mech_type_t rngmech_type = CRYPTO_MECH_INVALID;
 rnd_stats_t rnd_stats;
@@ -287,7 +288,8 @@
 static int
 rngprov_getbytes_nblk(uint8_t *ptr, size_t len)
 {
-	int rv, blen, total_bytes;
+	int rv, total_bytes;
+	size_t blen;
 	uchar_t *rndbuf;
 	kcf_provider_desc_t *pd;
 	kcf_req_params_t params;
@@ -425,7 +427,7 @@
 static int
 rnd_get_bytes(uint8_t *ptr, size_t len, extract_type_t how)
 {
-	int bytes;
+	size_t bytes;
 	size_t got;
 
 	ASSERT(mutex_owned(&rndpool_lock));
@@ -581,8 +583,8 @@
 static int
 rnd_generate_pseudo_bytes(rndmag_pad_t *rmp, uint8_t *ptr, size_t len)
 {
-	size_t bytes = len;
-	int nblock, size;
+	size_t bytes = len, size;
+	int nblock;
 	uint32_t oblocks;
 	uint32_t tempout[HASHSIZE/BYTES_IN_WORD];
 	uint32_t seed[HASHSIZE/BYTES_IN_WORD];
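
The int-to-size_t conversions in the hunks above, together with the
_PTRDIFF() change in modes.c and the LINTED annotation on kcf_rndtimeout_id,
line up with dropping the E_ASSIGN_NARROW_CONV, E_STATIC_UNUSED, and
E_PTRDIFF_OVERFLOW overrides from the kcf Makefile below.  A small
illustration of the narrowing case (hypothetical fragment, not part of this
changeset):

#include <sys/types.h>

void
narrowing_example(size_t len)
{
	int	blen;
	size_t	blen_ok;

	blen = len;	/* LP64: 64-bit value narrowed to 32 bits; lint warns */
	blen_ok = len;	/* matching width; no E_ASSIGN_NARROW_CONV warning */

	(void) blen;
	(void) blen_ok;
}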
--- a/usr/src/uts/intel/kcf/Makefile	Wed Sep 23 18:41:35 2009 -0400
+++ b/usr/src/uts/intel/kcf/Makefile	Wed Sep 23 15:55:57 2009 -0700
@@ -19,15 +19,13 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
-#ident	"%Z%%M%	%I%	%E% SMI"
-#
 #	This makefile drives the production of the Kernel Cryptographic
 #	Framework core module.
 #
-#	intel implementation architecture dependent
+#	Intel implementation architecture dependent
 #
 
 #
@@ -40,8 +38,11 @@
 #	Define the module and object file sets.
 #
 MODULE		= kcf
+LINTS		= $(KCF_OBJS:%.o=$(LINTS_DIR)/%.ln)
+KCF_OBJS_32	=
+KCF_OBJS_64	= gcm_intel.o
+KCF_OBJS	+= $(KCF_OBJS_$(CLASS))
 OBJECTS		= $(KCF_OBJS:%=$(OBJS_DIR)/%)
-LINTS		= $(KCF_OBJS:%.o=$(LINTS_DIR)/%.ln)
 ROOTMODULE	= $(ROOT_MISC_DIR)/$(MODULE)
 
 #
@@ -63,16 +64,15 @@
 #
 LINTTAGS	+= -erroff=E_SUSPICIOUS_COMPARISON
 LINTTAGS	+= -erroff=E_BAD_PTR_CAST_ALIGN
-LINTTAGS	+= -erroff=E_ASSIGN_NARROW_CONV
-LINTTAGS	+= -erroff=E_STATIC_UNUSED
-LINTTAGS	+= -erroff=E_PTRDIFF_OVERFLOW
+
 LINTTAGS	+= -I$(COM_DIR)
 
 #
 # Linkage dependencies
 #
 
-CFLAGS += $(CCVERBOSE) -I$(COM_DIR)
+CFLAGS		+= $(CCVERBOSE) -I$(COM_DIR)
+AS_CPPFLAGS	+= -I../../$(PLATFORM)
 
 #
 #	Default build targets.
@@ -99,3 +99,10 @@
 #	Include common targets.
 #
 include $(UTSBASE)/intel/Makefile.targ
+
+$(OBJS_DIR)/%.o: $(COM_DIR)/modes/amd64/%.s
+	$(COMPILE.s) -o $@ $(COM_DIR)/modes/amd64/${@F:.o=.s}
+	$(POST_PROCESS_O)
+
+$(OBJS_DIR)/%.ln: $(COM_DIR)/modes/amd64/%.s
+	@($(LHEAD) $(LINT.s) $(COM_DIR)/modes/amd64/${@F:.ln=.s} $(LTAIL))