Mercurial > illumos > illumos-gate
changeset 10627:e56a44d6d742
6826942 Need an optimized GCM leveraging Intel's PCLMULQDQ instruction
author | Dan OpenSolaris Anderson <opensolaris@drydog.com> |
---|---|
date | Wed, 23 Sep 2009 15:55:57 -0700 |
parents | 9c09f5dd637e |
children | cddb35f5bfa6 |
files | usr/src/common/crypto/modes/amd64/gcm_intel.s usr/src/common/crypto/modes/gcm.c usr/src/common/crypto/modes/modes.c usr/src/uts/common/crypto/api/kcf_random.c usr/src/uts/intel/kcf/Makefile |
diffstat | 5 files changed, 483 insertions(+), 38 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/common/crypto/modes/amd64/gcm_intel.s Wed Sep 23 15:55:57 2009 -0700 @@ -0,0 +1,348 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2009 Intel Corporation + * All Rights Reserved. + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Accelerated GHASH implementation with Intel PCLMULQDQ-NI + * instructions. This file contains an accelerated + * Galois Field Multiplication implementation. + * + * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH, + * carry-less multiplication. More information about PCLMULQDQ can be + * found at: + * http://software.intel.com/en-us/articles/ + * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ + * + */ + +/* + * ==================================================================== + * OpenSolaris OS modifications + * + * This source originates as file galois_hash_asm.c from + * Intel Corporation dated September 21, 2009. + * + * This OpenSolaris version has these major changes from the original source: + * + * 1. 
Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function + * definition for lint. + * + * 2. Formatted code, added comments, and added #includes and #defines. + * + * 3. Commented out pclmulqdq and pshufb instructions and replaced with + * .byte sequences (as pclmulqdq isn't supported yet by all of the gas, as, + * and aw assemblers). + * + * 4. If bit CR0.TS is set, clear and set the TS bit, after and before + * calling kpreempt_disable() and kpreempt_enable(). + * If the TS bit is not set, save and restore %xmm registers at the beginning + * and end of function calls (%xmm* registers are not saved and restored + * during kernel thread preemption). + * + * 5. Removed code to perform hashing. This is already done with C macro + * GHASH in gcm.c. For better performance, this removed code should be + * reintegrated in the future to replace the C GHASH macro. + * + * 6. Added code to byte swap 16-byte input and output. + * + * 7. Folded in comments from the original C source with embedded assembly + * (SB_w_shift_xor.c) + * + * 8. Renamed function and reordered parameters to match OpenSolaris: + * Intel interface: + * void galois_hash_asm(unsigned char *hk, unsigned char *s, + * unsigned char *d, int length) + * OpenSolaris OS interface: + * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); + * ==================================================================== + */ + + +#if defined(lint) || defined(__lint) + +#include <sys/types.h> + +/* ARGSUSED */ +void +gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) { +} + +#else /* lint */ + +#include <sys/asm_linkage.h> +#include <sys/controlregs.h> +#ifdef _KERNEL +#include <sys/machprivregs.h> +#endif + +#ifdef _KERNEL + /* + * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv. That is, + * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it + * uses it to pass P2 to syscall. 
+ * This also occurs with the STTS macro, but we don't care if + * P2 (%rsi) is modified just before function exit. + * The CLTS and STTS macros push and pop P1 (%rdi) already. + */ +#ifdef __xpv +#define PROTECTED_CLTS \ + push %rsi; \ + CLTS; \ + pop %rsi +#else +#define PROTECTED_CLTS \ + CLTS +#endif /* __xpv */ + + /* + * If CR0_TS is not set, align stack (with push %rbp) and push + * %xmm0 - %xmm10 on stack, otherwise clear CR0_TS + */ +#define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) \ + push %rbp; \ + mov %rsp, %rbp; \ + movq %cr0, tmpreg; \ + testq $CR0_TS, tmpreg; \ + jnz 1f; \ + and $-XMM_ALIGN, %rsp; \ + sub $[XMM_SIZE * 11], %rsp; \ + movaps %xmm0, 160(%rsp); \ + movaps %xmm1, 144(%rsp); \ + movaps %xmm2, 128(%rsp); \ + movaps %xmm3, 112(%rsp); \ + movaps %xmm4, 96(%rsp); \ + movaps %xmm5, 80(%rsp); \ + movaps %xmm6, 64(%rsp); \ + movaps %xmm7, 48(%rsp); \ + movaps %xmm8, 32(%rsp); \ + movaps %xmm9, 16(%rsp); \ + movaps %xmm10, (%rsp); \ + jmp 2f; \ +1: \ + PROTECTED_CLTS; \ +2: + + + /* + * If CR0_TS was not set above, pop %xmm0 - %xmm10 off stack, + * otherwise set CR0_TS. 
+ */ +#define SET_TS_OR_POP_XMM_REGISTERS(tmpreg) \ + testq $CR0_TS, tmpreg; \ + jnz 1f; \ + movaps (%rsp), %xmm10; \ + movaps 16(%rsp), %xmm9; \ + movaps 32(%rsp), %xmm8; \ + movaps 48(%rsp), %xmm7; \ + movaps 64(%rsp), %xmm6; \ + movaps 80(%rsp), %xmm5; \ + movaps 96(%rsp), %xmm4; \ + movaps 112(%rsp), %xmm3; \ + movaps 128(%rsp), %xmm2; \ + movaps 144(%rsp), %xmm1; \ + movaps 160(%rsp), %xmm0; \ + jmp 2f; \ +1: \ + STTS(tmpreg); \ +2: \ + mov %rbp, %rsp; \ + pop %rbp + + +#else +#define PROTECTED_CLTS +#define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) +#define SET_TS_OR_POP_XMM_REGISTERS(tmpreg) +#endif /* _KERNEL */ + +/* + * Use this mask to byte-swap a 16-byte integer with the pshufb instruction + */ + +// static uint8_t byte_swap16_mask[] = { +// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; +.text +.align XMM_ALIGN +.Lbyte_swap16_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + + + +/* + * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); + * + * Perform a carry-less multiplication (that is, use XOR instead of the + * multiply operator) on P1 and P2 and place the result in P3. + * + * Byte swap the input and the output. + * + * Note: x_in, y, and res all point to 16-byte numbers + * (an array of two 64-bit integers). + * + * Note2: For kernel code, caller is responsible for ensuring + * kpreempt_disable() has been called. This is because %xmm registers are + * not saved/restored. Clear and set the CR0.TS bit on entry and exit, + * respectively, if TS is set on entry. Otherwise, if TS is not set, + * save and restore %xmm registers on the stack. 
+ * + * Note3: Original Intel definition: + * void galois_hash_asm(unsigned char *hk, unsigned char *s, + * unsigned char *d, int length) + * + * Note4: Register/parameter mapping: + * Intel: + * Parameter 1: %rcx (copied to %xmm0) hk or x_in + * Parameter 2: %rdx (copied to %xmm1) s or y + * Parameter 3: %rdi (result) d or res + * OpenSolaris: + * Parameter 1: %rdi (copied to %xmm0) x_in + * Parameter 2: %rsi (copied to %xmm1) y + * Parameter 3: %rdx (result) res + */ + +ENTRY_NP(gcm_mul_pclmulqdq) + CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10) + + // + // Copy Parameters + // + movdqu (%rdi), %xmm0 // P1 + movdqu (%rsi), %xmm1 // P2 + + // + // Byte swap 16-byte input + // + lea .Lbyte_swap16_mask(%rip), %rax + movaps (%rax), %xmm10 + //pshufb %xmm10, %xmm0 + .byte 0x66, 0x41, 0x0f, 0x38, 0x00, 0xc2 + //pshufb %xmm10, %xmm1 + .byte 0x66, 0x41, 0x0f, 0x38, 0x00, 0xca + + + // + // Multiply with the hash key + // + movdqu %xmm0, %xmm3 + //pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0 + .byte 0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x00 + + movdqu %xmm0, %xmm4 + //pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1 + .byte 0x66, 0x0f, 0x3a, 0x44, 0xe1, 0x10 + + movdqu %xmm0, %xmm5 + //pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0 + .byte 0x66, 0x0f, 0x3a, 0x44, 0xe9, 0x01 + movdqu %xmm0, %xmm6 + //pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1 + .byte 0x66, 0x0f, 0x3a, 0x44, 0xf1, 0x11 + + pxor %xmm5, %xmm4 // xmm4 holds a0*b1 + a1*b0 + + movdqu %xmm4, %xmm5 // move the contents of xmm4 to xmm5 + psrldq $8, %xmm4 // shift by xmm4 64 bits to the right + pslldq $8, %xmm5 // shift by xmm5 64 bits to the left + pxor %xmm5, %xmm3 + pxor %xmm4, %xmm6 // Register pair <xmm6:xmm3> holds the result + // of the carry-less multiplication of + // xmm0 by xmm1. + + // We shift the result of the multiplication by one bit position + // to the left to cope for the fact that the bits are reversed. 
+ movdqu %xmm3, %xmm7 + movdqu %xmm6, %xmm8 + pslld $1, %xmm3 + pslld $1, %xmm6 + psrld $31, %xmm7 + psrld $31, %xmm8 + movdqu %xmm7, %xmm9 + pslldq $4, %xmm8 + pslldq $4, %xmm7 + psrldq $12, %xmm9 + por %xmm7, %xmm3 + por %xmm8, %xmm6 + por %xmm9, %xmm6 + + // + // First phase of the reduction + // + // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts + // independently. + movdqu %xmm3, %xmm7 + movdqu %xmm3, %xmm8 + movdqu %xmm3, %xmm9 + pslld $31, %xmm7 // packed right shift shifting << 31 + pslld $30, %xmm8 // packed right shift shifting << 30 + pslld $25, %xmm9 // packed right shift shifting << 25 + pxor %xmm8, %xmm7 // xor the shifted versions + pxor %xmm9, %xmm7 + movdqu %xmm7, %xmm8 + pslldq $12, %xmm7 + psrldq $4, %xmm8 + pxor %xmm7, %xmm3 // first phase of the reduction complete + + // + // Second phase of the reduction + // + // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these + // shift operations. + movdqu %xmm3, %xmm2 + movdqu %xmm3, %xmm4 // packed left shifting >> 1 + movdqu %xmm3, %xmm5 + psrld $1, %xmm2 + psrld $2, %xmm4 // packed left shifting >> 2 + psrld $7, %xmm5 // packed left shifting >> 7 + pxor %xmm4, %xmm2 // xor the shifted versions + pxor %xmm5, %xmm2 + pxor %xmm8, %xmm2 + pxor %xmm2, %xmm3 + pxor %xmm3, %xmm6 // the result is in xmm6 + + // + // Byte swap 16-byte result + // + //pshufb %xmm10, %xmm6 // %xmm10 has the swap mask + .byte 0x66, 0x41, 0x0f, 0x38, 0x00, 0xf2 + + // + // Store the result + // + movdqu %xmm6, (%rdx) // P3 + + + // + // Cleanup and Return + // + SET_TS_OR_POP_XMM_REGISTERS(%r10) + ret + SET_SIZE(gcm_mul_pclmulqdq) + +#endif /* lint || __lint */
--- a/usr/src/common/crypto/modes/gcm.c Wed Sep 23 18:41:35 2009 -0400 +++ b/usr/src/common/crypto/modes/gcm.c Wed Sep 23 15:55:57 2009 -0700 @@ -23,12 +23,14 @@ * Use is subject to license terms. */ + #ifndef _KERNEL #include <strings.h> #include <limits.h> #include <assert.h> #include <security/cryptoki.h> -#endif +#endif /* _KERNEL */ + #include <sys/types.h> #include <sys/kmem.h> @@ -37,43 +39,85 @@ #include <sys/crypto/impl.h> #include <sys/byteorder.h> +#ifdef __amd64 +#include <sys/x86_archext.h> /* x86_feature, X86_*, CPUID_* */ + +#ifdef _KERNEL +#include <sys/cpuvar.h> /* cpu_t, CPU */ +#include <sys/disp.h> /* kpreempt_disable(), kpreempt_enable() */ +/* Workaround for no XMM kernel thread save/restore */ +#define KPREEMPT_DISABLE kpreempt_disable() +#define KPREEMPT_ENABLE kpreempt_enable() + +#else /* userland: KPREEMPT_* expand to nothing */ +#include <sys/auxv.h> /* getisax() */ +#include <sys/auxv_386.h> /* AV_386_PCLMULQDQ bit */ +#define KPREEMPT_DISABLE +#define KPREEMPT_ENABLE +#endif /* _KERNEL */ + +extern void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); +static int intel_pclmulqdq_instruction_present(void); +#endif /* __amd64 */ + struct aes_block { uint64_t a; uint64_t b; }; + +/* + * gcm_mul() + * Perform a carry-less multiplication (that is, use XOR instead of the + * multiply operator) on *x_in and *y and place the result in *res. + * + * Byte swap the input (*x_in and *y) and the output (*res). + * + * Note: x_in, y, and res all point to 16-byte numbers (an array of two + * 64-bit integers). 
+ */ void gcm_mul(uint64_t *x_in, uint64_t *y, uint64_t *res) { - uint64_t R = { 0xe100000000000000ULL }; - struct aes_block z = { 0, 0 }; - struct aes_block v; - uint64_t x; - int i, j; - - v.a = ntohll(y[0]); - v.b = ntohll(y[1]); +#ifdef __amd64 + if (intel_pclmulqdq_instruction_present()) { + KPREEMPT_DISABLE; + gcm_mul_pclmulqdq(x_in, y, res); + KPREEMPT_ENABLE; + } else +#endif /* __amd64 */ + { + static const uint64_t R = 0xe100000000000000ULL; + struct aes_block z = {0, 0}; + struct aes_block v; + uint64_t x; + int i, j; - for (j = 0; j < 2; j++) { - x = ntohll(x_in[j]); - for (i = 0; i < 64; i++, x <<= 1) { - if (x & 0x8000000000000000ULL) { - z.a ^= v.a; - z.b ^= v.b; - } - if (v.b & 1ULL) { - v.b = (v.a << 63)|(v.b >> 1); - v.a = (v.a >> 1) ^ R; - } else { - v.b = (v.a << 63)|(v.b >> 1); - v.a = v.a >> 1; + v.a = ntohll(y[0]); + v.b = ntohll(y[1]); + + for (j = 0; j < 2; j++) { + x = ntohll(x_in[j]); + for (i = 0; i < 64; i++, x <<= 1) { + if (x & 0x8000000000000000ULL) { + z.a ^= v.a; + z.b ^= v.b; + } + if (v.b & 1ULL) { + v.b = (v.a << 63)|(v.b >> 1); + v.a = (v.a >> 1) ^ R; + } else { + v.b = (v.a << 63)|(v.b >> 1); + v.a = v.a >> 1; + } } } + res[0] = htonll(z.a); + res[1] = htonll(z.b); } - res[0] = htonll(z.a); - res[1] = htonll(z.b); } + #define GHASH(c, d, t) \ xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \ gcm_mul((uint64_t *)(c)->gcm_ghash, (c)->gcm_H, (uint64_t *)(t)); @@ -674,3 +718,47 @@ { ctx->gcm_kmflag = kmflag; } + + +#ifdef __amd64 +/* + * Return 1 if executing on Intel with PCLMULQDQ instructions, + * otherwise 0 (i.e., Intel without PCLMULQDQ or AMD64). + * Cache the result, as the CPU can't change. + * + * Note: the userland version uses getisax(). The kernel version uses + * global variable x86_feature or the output of cpuid_insn(). 
+ */ +static int +intel_pclmulqdq_instruction_present(void) +{ + static int cached_result = -1; + + if (cached_result == -1) { /* first time */ +#ifdef _KERNEL +#ifdef X86_PCLMULQDQ + cached_result = (x86_feature & X86_PCLMULQDQ) != 0; +#else + if (cpuid_getvendor(CPU) == X86_VENDOR_Intel) { + struct cpuid_regs cpr; + cpu_t *cp = CPU; + + cpr.cp_eax = 1; /* Function 1: get processor info */ + (void) cpuid_insn(cp, &cpr); + cached_result = ((cpr.cp_ecx & + CPUID_INTC_ECX_PCLMULQDQ) != 0); + } else { + cached_result = 0; + } +#endif /* X86_PCLMULQDQ */ +#else + uint_t ui = 0; + + (void) getisax(&ui, 1); + cached_result = (ui & AV_386_PCLMULQDQ) != 0; +#endif /* _KERNEL */ + } + + return (cached_result); +} +#endif /* __amd64 */
--- a/usr/src/common/crypto/modes/modes.c Wed Sep 23 18:41:35 2009 -0400 +++ b/usr/src/common/crypto/modes/modes.c Wed Sep 23 15:55:57 2009 -0700 @@ -153,7 +153,7 @@ *current_offset = offset + amt; } else { /* one block spans two mblks */ - *out_data_1_len = mp->b_wptr - p; + *out_data_1_len = _PTRDIFF(mp->b_wptr, p); if ((mp = mp->b_cont) == NULL) return; *out_data_2 = mp->b_rptr;
--- a/usr/src/uts/common/crypto/api/kcf_random.c Wed Sep 23 18:41:35 2009 -0400 +++ b/usr/src/uts/common/crypto/api/kcf_random.c Wed Sep 23 15:55:57 2009 -0700 @@ -110,6 +110,7 @@ static int num_waiters; /* #threads waiting to read from /dev/random */ static struct pollhead rnd_pollhead; +/* LINTED E_STATIC_UNUSED */ static timeout_id_t kcf_rndtimeout_id; static crypto_mech_type_t rngmech_type = CRYPTO_MECH_INVALID; rnd_stats_t rnd_stats; @@ -287,7 +288,8 @@ static int rngprov_getbytes_nblk(uint8_t *ptr, size_t len) { - int rv, blen, total_bytes; + int rv, total_bytes; + size_t blen; uchar_t *rndbuf; kcf_provider_desc_t *pd; kcf_req_params_t params; @@ -425,7 +427,7 @@ static int rnd_get_bytes(uint8_t *ptr, size_t len, extract_type_t how) { - int bytes; + size_t bytes; size_t got; ASSERT(mutex_owned(&rndpool_lock)); @@ -581,8 +583,8 @@ static int rnd_generate_pseudo_bytes(rndmag_pad_t *rmp, uint8_t *ptr, size_t len) { - size_t bytes = len; - int nblock, size; + size_t bytes = len, size; + int nblock; uint32_t oblocks; uint32_t tempout[HASHSIZE/BYTES_IN_WORD]; uint32_t seed[HASHSIZE/BYTES_IN_WORD];
--- a/usr/src/uts/intel/kcf/Makefile Wed Sep 23 18:41:35 2009 -0400 +++ b/usr/src/uts/intel/kcf/Makefile Wed Sep 23 15:55:57 2009 -0700 @@ -19,15 +19,13 @@ # CDDL HEADER END # # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" -# # This makefile drives the production of the Kernel Cryptographic # Framework core module. # -# intel implementation architecture dependent +# Intel implementation architecture dependent # # @@ -40,8 +38,11 @@ # Define the module and object file sets. # MODULE = kcf +LINTS = $(KCF_OBJS:%.o=$(LINTS_DIR)/%.ln) +KCF_OBJS_32 = +KCF_OBJS_64 = gcm_intel.o +KCF_OBJS += $(KCF_OBJS_$(CLASS)) OBJECTS = $(KCF_OBJS:%=$(OBJS_DIR)/%) -LINTS = $(KCF_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(ROOT_MISC_DIR)/$(MODULE) # @@ -63,16 +64,15 @@ # LINTTAGS += -erroff=E_SUSPICIOUS_COMPARISON LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN -LINTTAGS += -erroff=E_ASSIGN_NARROW_CONV -LINTTAGS += -erroff=E_STATIC_UNUSED -LINTTAGS += -erroff=E_PTRDIFF_OVERFLOW + LINTTAGS += -I$(COM_DIR) # # Linkage dependencies # -CFLAGS += $(CCVERBOSE) -I$(COM_DIR) +CFLAGS += $(CCVERBOSE) -I$(COM_DIR) +AS_CPPFLAGS += -I../../$(PLATFORM) # # Default build targets. @@ -99,3 +99,10 @@ # Include common targets. # include $(UTSBASE)/intel/Makefile.targ + +$(OBJS_DIR)/%.o: $(COM_DIR)/modes/amd64/%.s + $(COMPILE.s) -o $@ $(COM_DIR)/modes/amd64/${@F:.o=.s} + $(POST_PROCESS_O) + +$(OBJS_DIR)/%.ln: $(COM_DIR)/modes/amd64/%.s + @($(LHEAD) $(LINT.s) $(COM_DIR)/modes/amd64/${@F:.ln=.s} $(LTAIL))