
Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto update from Herbert Xu:

 - XTS mode optimisation for twofish/cast6/camellia/aes on x86

 - AVX2/x86_64 implementation for blowfish/twofish/serpent/camellia

 - SSSE3/AVX/AVX2 optimisations for sha256/sha512

 - Added driver for SAHARA2 crypto accelerator

 - Fix for GMAC when used in non-IPsec scenarios

 - Added generic CMAC implementation (including IPsec glue)

 - IP update for crypto/atmel

 - Support for more than one device in hwrng/timeriomem

 - Added Broadcom BCM2835 RNG driver

 - Misc fixes
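
The new CMAC template registers under the usual algorithm naming, so "cmac(aes)" can be driven through the synchronous hash (shash) interface. A minimal sketch (error paths shortened; key, data and output names are illustrative):

#include <crypto/hash.h>
#include <linux/slab.h>

/* Hedged sketch: compute a 16-byte AES-CMAC tag over a buffer using the
 * new "cmac(aes)" template.  Illustrative only, not code from this pull. */
static int cmac_digest_example(const u8 *key, unsigned int keylen,
			       const u8 *data, unsigned int len, u8 *out)
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	int err;

	tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_shash_setkey(tfm, key, keylen);
	if (err)
		goto out_free;

	desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	if (!desc) {
		err = -ENOMEM;
		goto out_free;
	}
	desc->tfm = tfm;

	err = crypto_shash_digest(desc, data, len, out);
	kfree(desc);
out_free:
	crypto_free_shash(tfm);
	return err;
}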

* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (59 commits)
  crypto: caam - fix job ring cleanup code
  crypto: camellia - add AVX2/AES-NI/x86_64 assembler implementation of camellia cipher
  crypto: serpent - add AVX2/x86_64 assembler implementation of serpent cipher
  crypto: twofish - add AVX2/x86_64 assembler implementation of twofish cipher
  crypto: blowfish - add AVX2/x86_64 implementation of blowfish cipher
  crypto: tcrypt - add async cipher speed tests for blowfish
  crypto: testmgr - extend camellia test-vectors for camellia-aesni/avx2
  crypto: aesni_intel - fix Kconfig problem with CRYPTO_GLUE_HELPER_X86
  crypto: aesni_intel - add more optimized XTS mode for x86-64
  crypto: x86/camellia-aesni-avx - add more optimized XTS code
  crypto: cast6-avx: use new optimized XTS code
  crypto: x86/twofish-avx - use optimized XTS code
  crypto: x86 - add more optimized XTS-mode for serpent-avx
  xfrm: add rfc4494 AES-CMAC-96 support
  crypto: add CMAC support to CryptoAPI
  crypto: testmgr - add empty test vectors for null ciphers
  crypto: testmgr - add AES GMAC test vectors
  crypto: gcm - fix rfc4543 to handle async crypto correctly
  crypto: gcm - make GMAC work when dst and src are different
  hwrng: timeriomem - added devicetree hooks
  ...
Committed by Linus Torvalds 12 years ago, commit 797994f81a.

88 changed files with 15,378 additions and 744 deletions:
   1. Documentation/devicetree/bindings/crypto/fsl-imx-sahara.txt (+15, -0)
   2. Documentation/devicetree/bindings/hwrng/timeriomem_rng.txt (+18, -0)
   3. Documentation/devicetree/bindings/rng/brcm,bcm2835.txt (+13, -0)
   4. Documentation/hw_random.txt (+1, -1)
   5. arch/arm/mach-at91/at91sam9g45_devices.c (+6, -8)
   6. arch/x86/crypto/Makefile (+45, -12)
   7. arch/x86/crypto/aesni-intel_asm.S (+117, -0)
   8. arch/x86/crypto/aesni-intel_glue.c (+80, -0)
   9. arch/x86/crypto/blowfish-avx2-asm_64.S (+449, -0)
  10. arch/x86/crypto/blowfish_avx2_glue.c (+585, -0)
  11. arch/x86/crypto/blowfish_glue.c (+8, -24)
  12. arch/x86/crypto/camellia-aesni-avx-asm_64.S (+179, -1)
  13. arch/x86/crypto/camellia-aesni-avx2-asm_64.S (+1368, -0)
  14. arch/x86/crypto/camellia_aesni_avx2_glue.c (+586, -0)
  15. arch/x86/crypto/camellia_aesni_avx_glue.c (+62, -42)
  16. arch/x86/crypto/cast6-avx-x86_64-asm_64.S (+47, -1)
  17. arch/x86/crypto/cast6_avx_glue.c (+51, -40)
  18. arch/x86/crypto/crc32-pclmul_asm.S (+3, -3)
  19. arch/x86/crypto/crc32c-pcl-intel-asm_64.S (+6, -4)
  20. arch/x86/crypto/glue_helper-asm-avx.S (+60, -1)
  21. arch/x86/crypto/glue_helper-asm-avx2.S (+180, -0)
  22. arch/x86/crypto/glue_helper.c (+96, -1)
  23. arch/x86/crypto/serpent-avx-x86_64-asm_64.S (+43, -2)
  24. arch/x86/crypto/serpent-avx2-asm_64.S (+800, -0)
  25. arch/x86/crypto/serpent_avx2_glue.c (+562, -0)
  26. arch/x86/crypto/serpent_avx_glue.c (+85, -60)
  27. arch/x86/crypto/sha256-avx-asm.S (+496, -0)
  28. arch/x86/crypto/sha256-avx2-asm.S (+772, -0)
  29. arch/x86/crypto/sha256-ssse3-asm.S (+506, -0)
  30. arch/x86/crypto/sha256_ssse3_glue.c (+275, -0)
  31. arch/x86/crypto/sha512-avx-asm.S (+423, -0)
  32. arch/x86/crypto/sha512-avx2-asm.S (+743, -0)
  33. arch/x86/crypto/sha512-ssse3-asm.S (+421, -0)
  34. arch/x86/crypto/sha512_ssse3_glue.c (+282, -0)
  35. arch/x86/crypto/twofish-avx-x86_64-asm_64.S (+47, -1)
  36. arch/x86/crypto/twofish-avx2-asm_64.S (+600, -0)
  37. arch/x86/crypto/twofish_avx2_glue.c (+584, -0)
  38. arch/x86/crypto/twofish_avx_glue.c (+61, -40)
  39. arch/x86/include/asm/cpufeature.h (+1, -0)
  40. arch/x86/include/asm/crypto/blowfish.h (+43, -0)
  41. arch/x86/include/asm/crypto/camellia.h (+19, -0)
  42. arch/x86/include/asm/crypto/glue_helper.h (+24, -0)
  43. arch/x86/include/asm/crypto/serpent-avx.h (+29, -0)
  44. arch/x86/include/asm/crypto/twofish.h (+18, -0)
  45. crypto/Kconfig (+123, -10)
  46. crypto/Makefile (+1, -0)
  47. crypto/cmac.c (+315, -0)
  48. crypto/crypto_user.c (+2, -2)
  49. crypto/gcm.c (+97, -19)
  50. crypto/sha256_generic.c (+6, -5)
  51. crypto/sha512_generic.c (+7, -6)
  52. crypto/tcrypt.c (+29, -1)
  53. crypto/testmgr.c (+93, -2)
  54. crypto/testmgr.h (+1276, -38)
  55. drivers/char/hw_random/Kconfig (+12, -0)
  56. drivers/char/hw_random/Makefile (+1, -0)
  57. drivers/char/hw_random/bcm2835-rng.c (+113, -0)
  58. drivers/char/hw_random/exynos-rng.c (+2, -1)
  59. drivers/char/hw_random/mxc-rnga.c (+4, -17)
  60. drivers/char/hw_random/timeriomem-rng.c (+136, -54)
  61. drivers/crypto/Kconfig (+15, -3)
  62. drivers/crypto/Makefile (+1, -0)
  63. drivers/crypto/atmel-aes.c (+353, -118)
  64. drivers/crypto/atmel-sha-regs.h (+6, -1)
  65. drivers/crypto/atmel-sha.c (+486, -100)
  66. drivers/crypto/atmel-tdes-regs.h (+2, -0)
  67. drivers/crypto/atmel-tdes.c (+341, -53)
  68. drivers/crypto/bfin_crc.c (+3, -3)
  69. drivers/crypto/caam/Kconfig (+1, -1)
  70. drivers/crypto/caam/caamalg.c (+6, -0)
  71. drivers/crypto/caam/caamhash.c (+2, -2)
  72. drivers/crypto/caam/ctrl.c (+3, -0)
  73. drivers/crypto/caam/error.c (+5, -5)
  74. drivers/crypto/caam/intern.h (+1, -0)
  75. drivers/crypto/caam/jr.c (+4, -0)
  76. drivers/crypto/caam/key_gen.c (+1, -1)
  77. drivers/crypto/caam/key_gen.h (+1, -1)
  78. drivers/crypto/caam/regs.h (+3, -1)
  79. drivers/crypto/omap-aes.c (+2, -13)
  80. drivers/crypto/omap-sham.c (+2, -13)
  81. drivers/crypto/picoxcell_crypto.c (+1, -3)
  82. drivers/crypto/sahara.c (+1070, -0)
  83. drivers/crypto/ux500/hash/hash_core.c (+3, -3)
  84. include/crypto/sha.h (+5, -0)
  85. include/linux/platform_data/atmel-aes.h (+0, -22)
  86. include/linux/platform_data/crypto-atmel.h (+22, -0)
  87. include/linux/timeriomem-rng.h (+0, -5)
  88. net/xfrm/xfrm_algo.c (+13, -0)

+ 15 - 0
Documentation/devicetree/bindings/crypto/fsl-imx-sahara.txt

@@ -0,0 +1,15 @@
+Freescale SAHARA Cryptographic Accelerator included in some i.MX chips.
+Currently only i.MX27 is supported.
+
+Required properties:
+- compatible : Should be "fsl,<soc>-sahara"
+- reg : Should contain SAHARA registers location and length
+- interrupts : Should contain SAHARA interrupt number
+
+Example:
+
+sah@10025000 {
+	compatible = "fsl,imx27-sahara";
+	reg = <	0x10025000 0x800>;
+	interrupts = <75>;
+};

+ 18 - 0
Documentation/devicetree/bindings/hwrng/timeriomem_rng.txt

@@ -0,0 +1,18 @@
+HWRNG support for the timeriomem_rng driver
+
+Required properties:
+- compatible : "timeriomem_rng"
+- reg : base address to sample from
+- period : wait time in microseconds to use between samples
+
+N.B. currently 'reg' must be four bytes wide and aligned
+
+Example:
+
+hwrng@44 {
+	#address-cells = <1>;
+	#size-cells = <1>;
+	compatible = "timeriomem_rng";
+	reg = <0x44 0x04>;
+	period = <1000000>;
+};
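
As a consumer-side illustration, a hedged sketch (not the actual timeriomem-rng probe code; names and error handling are simplified) of how a platform driver could pick up these two properties with the standard OF/platform helpers:

#include <linux/platform_device.h>
#include <linux/of.h>
#include <linux/io.h>
#include <linux/err.h>

static int example_rng_probe(struct platform_device *pdev)
{
	struct resource *res;
	void __iomem *base;
	u32 period;

	/* "reg" becomes an IORESOURCE_MEM resource on the platform device */
	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	base = devm_ioremap_resource(&pdev->dev, res);
	if (IS_ERR(base))
		return PTR_ERR(base);

	/* "period" is required by the binding above */
	if (of_property_read_u32(pdev->dev.of_node, "period", &period))
		return -EINVAL;

	/* sample one 32-bit word from 'base', at most every 'period' us */
	return 0;
}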

+ 13 - 0
Documentation/devicetree/bindings/rng/brcm,bcm2835.txt

@@ -0,0 +1,13 @@
+BCM2835 Random number generator
+
+Required properties:
+
+- compatible : should be "brcm,bcm2835-rng"
+- reg : Specifies base physical address and size of the registers.
+
+Example:
+
+rng {
+        compatible = "brcm,bcm2835-rng";
+        reg = <0x7e104000 0x10>;
+};

+ 1 - 1
Documentation/hw_random.txt

@@ -63,7 +63,7 @@ Intel RNG Driver notes:
 
 	* FIXME: support poll(2)
 
-	NOTE: request_mem_region was removed, for two reasons:
+	NOTE: request_mem_region was removed, for three reasons:
 	1) Only one RNG is supported by this driver, 2) The location
 	used by the RNG is a fixed location in MMIO-addressable memory,
 	3) users with properly working BIOS e820 handling will always

+ 6 - 8
arch/arm/mach-at91/at91sam9g45_devices.c

@@ -18,7 +18,7 @@
 #include <linux/platform_device.h>
 #include <linux/i2c-gpio.h>
 #include <linux/atmel-mci.h>
-#include <linux/platform_data/atmel-aes.h>
+#include <linux/platform_data/crypto-atmel.h>
 
 #include <linux/platform_data/at91_adc.h>
 
@@ -1900,7 +1900,8 @@ static void __init at91_add_device_tdes(void) {}
  * -------------------------------------------------------------------- */
 
 #if defined(CONFIG_CRYPTO_DEV_ATMEL_AES) || defined(CONFIG_CRYPTO_DEV_ATMEL_AES_MODULE)
-static struct aes_platform_data aes_data;
+static struct crypto_platform_data aes_data;
+static struct crypto_dma_data alt_atslave;
 static u64 aes_dmamask = DMA_BIT_MASK(32);
 
 static struct resource aes_resources[] = {
@@ -1931,23 +1932,20 @@ static struct platform_device at91sam9g45_aes_device = {
 static void __init at91_add_device_aes(void)
 {
 	struct at_dma_slave	*atslave;
-	struct aes_dma_data	*alt_atslave;
-
-	alt_atslave = kzalloc(sizeof(struct aes_dma_data), GFP_KERNEL);
 
 	/* DMA TX slave channel configuration */
-	atslave = &alt_atslave->txdata;
+	atslave = &alt_atslave.txdata;
 	atslave->dma_dev = &at_hdmac_device.dev;
 	atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE	| ATC_SRC_H2SEL_HW |
 						ATC_SRC_PER(AT_DMA_ID_AES_RX);
 
 	/* DMA RX slave channel configuration */
-	atslave = &alt_atslave->rxdata;
+	atslave = &alt_atslave.rxdata;
 	atslave->dma_dev = &at_hdmac_device.dev;
 	atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE	| ATC_DST_H2SEL_HW |
 						ATC_DST_PER(AT_DMA_ID_AES_TX);
 
-	aes_data.dma_slave = alt_atslave;
+	aes_data.dma_slave = &alt_atslave;
 	platform_device_register(&at91sam9g45_aes_device);
 }
 #else

+ 45 - 12
arch/x86/crypto/Makefile

@@ -2,6 +2,10 @@
 # Arch-specific CryptoAPI modules.
 #
 
+avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
+avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
+					$(comma)4)$(comma)%ymm2,yes,no)
+
 obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o
 obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
 
@@ -12,22 +16,37 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
-obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o
-obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
-obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
-obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
-obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 
 obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
 obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
 obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
+obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
+obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
+
+# These modules require assembler to support AVX.
+ifeq ($(avx_supported),yes)
+	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
+						camellia-aesni-avx-x86_64.o
+	obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
+	obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
+	obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
+	obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
+endif
+
+# These modules require assembler to support AVX2.
+ifeq ($(avx2_supported),yes)
+	obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o
+	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
+	obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
+	obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o
+endif
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
 twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
@@ -36,21 +55,35 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
 camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
-camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
-			       camellia_aesni_avx_glue.o
-cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
-cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
-twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o
 salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
 serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
-serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
+
+ifeq ($(avx_supported),yes)
+	camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
+					camellia_aesni_avx_glue.o
+	cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
+	cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
+	twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o \
+				twofish_avx_glue.o
+	serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o \
+				serpent_avx_glue.o
+endif
+
+ifeq ($(avx2_supported),yes)
+	blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o
+	camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
+	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
+	twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o
+endif
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
 crc32c-intel-y := crc32c-intel_glue.o
-crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o
+crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
 crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
+sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
+sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o

+ 117 - 0
arch/x86/crypto/aesni-intel_asm.S

@@ -34,6 +34,10 @@
 
 #ifdef __x86_64__
 .data
+.align 16
+.Lgf128mul_x_ble_mask:
+	.octa 0x00000000000000010000000000000087
+
 POLY:   .octa 0xC2000000000000000000000000000001
 TWOONE: .octa 0x00000001000000000000000000000001
 
@@ -105,6 +109,8 @@ enc:        .octa 0x2
 #define CTR	%xmm11
 #define INC	%xmm12
 
+#define GF128MUL_MASK %xmm10
+
 #ifdef __x86_64__
 #define AREG	%rax
 #define KEYP	%rdi
@@ -2636,4 +2642,115 @@ ENTRY(aesni_ctr_enc)
 .Lctr_enc_just_ret:
 	ret
 ENDPROC(aesni_ctr_enc)
+
+/*
+ * _aesni_gf128mul_x_ble:		internal ABI
+ *	Multiply in GF(2^128) for XTS IVs
+ * input:
+ *	IV:	current IV
+ *	GF128MUL_MASK == mask with 0x87 and 0x01
+ * output:
+ *	IV:	next IV
+ * changed:
+ *	CTR:	== temporary value
+ */
+#define _aesni_gf128mul_x_ble() \
+	pshufd $0x13, IV, CTR; \
+	paddq IV, IV; \
+	psrad $31, CTR; \
+	pand GF128MUL_MASK, CTR; \
+	pxor CTR, IV;
+
+/*
+ * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ *			 bool enc, u8 *iv)
+ */
+ENTRY(aesni_xts_crypt8)
+	cmpb $0, %cl
+	movl $0, %ecx
+	movl $240, %r10d
+	leaq _aesni_enc4, %r11
+	leaq _aesni_dec4, %rax
+	cmovel %r10d, %ecx
+	cmoveq %rax, %r11
+
+	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+	movups (IVP), IV
+
+	mov 480(KEYP), KLEN
+	addq %rcx, KEYP
+
+	movdqa IV, STATE1
+	pxor 0x00(INP), STATE1
+	movdqu IV, 0x00(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE2
+	pxor 0x10(INP), STATE2
+	movdqu IV, 0x10(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE3
+	pxor 0x20(INP), STATE3
+	movdqu IV, 0x20(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE4
+	pxor 0x30(INP), STATE4
+	movdqu IV, 0x30(OUTP)
+
+	call *%r11
+
+	pxor 0x00(OUTP), STATE1
+	movdqu STATE1, 0x00(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE1
+	pxor 0x40(INP), STATE1
+	movdqu IV, 0x40(OUTP)
+
+	pxor 0x10(OUTP), STATE2
+	movdqu STATE2, 0x10(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE2
+	pxor 0x50(INP), STATE2
+	movdqu IV, 0x50(OUTP)
+
+	pxor 0x20(OUTP), STATE3
+	movdqu STATE3, 0x20(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE3
+	pxor 0x60(INP), STATE3
+	movdqu IV, 0x60(OUTP)
+
+	pxor 0x30(OUTP), STATE4
+	movdqu STATE4, 0x30(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE4
+	pxor 0x70(INP), STATE4
+	movdqu IV, 0x70(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movups IV, (IVP)
+
+	call *%r11
+
+	pxor 0x40(OUTP), STATE1
+	movdqu STATE1, 0x40(OUTP)
+
+	pxor 0x50(OUTP), STATE2
+	movdqu STATE2, 0x50(OUTP)
+
+	pxor 0x60(OUTP), STATE3
+	movdqu STATE3, 0x60(OUTP)
+
+	pxor 0x70(OUTP), STATE4
+	movdqu STATE4, 0x70(OUTP)
+
+	ret
+ENDPROC(aesni_xts_crypt8)
+
 #endif

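The _aesni_gf128mul_x_ble() macro above steps the XTS tweak from block n to block n+1: the 128-bit tweak is multiplied by x in GF(2^128) using the bit-little-endian ("ble") convention, with the 0x87/0x01 mask folding the carry back in. A plain-C model of the same computation (illustrative only; the kernel's generic gf128mul helpers are separate code):

#include <stdint.h>

/* Illustrative model of the tweak update done by _aesni_gf128mul_x_ble():
 * shift the 128-bit tweak left one bit; if a bit falls off the top, xor
 * 0x87 (x^7 + x^2 + x + 1) into the low byte.  t[0] holds bytes 0..7,
 * t[1] bytes 8..15, little endian. */
static void gf128mul_x_ble_model(uint64_t t[2])
{
	uint64_t carry = t[1] >> 63;		/* bit shifted out of the top */

	t[1] = (t[1] << 1) | (t[0] >> 63);	/* 128-bit left shift */
	t[0] = (t[0] << 1) ^ (carry * 0x87);	/* reduce by the polynomial */
}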
+ 80 - 0
arch/x86/crypto/aesni-intel_glue.c

@@ -39,6 +39,9 @@
 #include <crypto/internal/aead.h>
 #include <linux/workqueue.h>
 #include <linux/spinlock.h>
+#ifdef CONFIG_X86_64
+#include <asm/crypto/glue_helper.h>
+#endif
 
 #if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE)
 #define HAS_PCBC
@@ -102,6 +105,9 @@ void crypto_fpu_exit(void);
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 
+asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out,
+				 const u8 *in, bool enc, u8 *iv);
+
 /* asmlinkage void aesni_gcm_enc()
  * void *ctx,  AES Key schedule. Starts on a 16 byte boundary.
  * u8 *out, Ciphertext output. Encrypt in-place is allowed.
@@ -510,6 +516,78 @@ static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in)
 	aesni_enc(ctx, out, in);
 }
 
+#ifdef CONFIG_X86_64
+
+static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc));
+}
+
+static void aesni_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_dec));
+}
+
+static void aesni_xts_enc8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, true, (u8 *)iv);
+}
+
+static void aesni_xts_dec8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, false, (u8 *)iv);
+}
+
+static const struct common_glue_ctx aesni_enc_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = 1,
+
+	.funcs = { {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc8) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc) }
+	} }
+};
+
+static const struct common_glue_ctx aesni_dec_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = 1,
+
+	.funcs = { {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec8) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec) }
+	} }
+};
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&aesni_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(aesni_xts_tweak),
+				     aes_ctx(ctx->raw_tweak_ctx),
+				     aes_ctx(ctx->raw_crypt_ctx));
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&aesni_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(aesni_xts_tweak),
+				     aes_ctx(ctx->raw_tweak_ctx),
+				     aes_ctx(ctx->raw_crypt_ctx));
+}
+
+#else
+
 static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
@@ -560,6 +638,8 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return ret;
 }
 
+#endif
+
 #ifdef CONFIG_X86_64
 static int rfc4106_init(struct crypto_tfm *tfm)
 {

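The aesni_enc_xts/aesni_dec_xts tables above feed the generic glue helper, which tries the widest batch first (8 blocks via aesni_xts_crypt8) and falls back to single blocks for the tail. A simplified sketch of that dispatch idea (types and names are illustrative; the real glue_xts_crypt_128bit also walks scatterlists, manages the FPU and computes the tweaks):

#include <linux/types.h>

/* Simplified model of the common_glue_ctx dispatch: consume as many full
 * batches as possible at each width before moving to the next entry. */
struct xts_func_model {
	unsigned int num_blocks;
	void (*fn)(void *ctx, u8 *dst, const u8 *src, u8 *iv);
};

static void xts_dispatch_model(const struct xts_func_model *funcs,
			       unsigned int nfuncs, void *ctx, u8 *dst,
			       const u8 *src, unsigned int nblocks, u8 *iv)
{
	unsigned int i;

	for (i = 0; i < nfuncs; i++) {
		while (nblocks >= funcs[i].num_blocks) {
			funcs[i].fn(ctx, dst, src, iv);
			dst += funcs[i].num_blocks * 16;	/* 16-byte AES blocks */
			src += funcs[i].num_blocks * 16;
			nblocks -= funcs[i].num_blocks;
		}
	}
}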
+ 449 - 0
arch/x86/crypto/blowfish-avx2-asm_64.S

@@ -0,0 +1,449 @@
+/*
+ * x86_64/AVX2 assembler optimized version of Blowfish
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/linkage.h>
+
+.file "blowfish-avx2-asm_64.S"
+
+.data
+.align 32
+
+.Lprefetch_mask:
+.long 0*64
+.long 1*64
+.long 2*64
+.long 3*64
+.long 4*64
+.long 5*64
+.long 6*64
+.long 7*64
+
+.Lbswap32_mask:
+.long 0x00010203
+.long 0x04050607
+.long 0x08090a0b
+.long 0x0c0d0e0f
+
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lbswap_iv_mask:
+	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
+
+.text
+/* structure of crypto context */
+#define p	0
+#define s0	((16 + 2) * 4)
+#define s1	((16 + 2 + (1 * 256)) * 4)
+#define s2	((16 + 2 + (2 * 256)) * 4)
+#define s3	((16 + 2 + (3 * 256)) * 4)
+
+/* register macros */
+#define CTX	%rdi
+#define RIO	 %rdx
+
+#define RS0	%rax
+#define RS1	%r8
+#define RS2	%r9
+#define RS3	%r10
+
+#define RLOOP	%r11
+#define RLOOPd	%r11d
+
+#define RXr0	%ymm8
+#define RXr1	%ymm9
+#define RXr2	%ymm10
+#define RXr3	%ymm11
+#define RXl0	%ymm12
+#define RXl1	%ymm13
+#define RXl2	%ymm14
+#define RXl3	%ymm15
+
+/* temp regs */
+#define RT0	%ymm0
+#define RT0x	%xmm0
+#define RT1	%ymm1
+#define RT1x	%xmm1
+#define RIDX0	%ymm2
+#define RIDX1	%ymm3
+#define RIDX1x	%xmm3
+#define RIDX2	%ymm4
+#define RIDX3	%ymm5
+
+/* vpgatherdd mask and '-1' */
+#define RNOT	%ymm6
+
+/* byte mask, (-1 >> 24) */
+#define RBYTE	%ymm7
+
+/***********************************************************************
+ * 32-way AVX2 blowfish
+ ***********************************************************************/
+#define F(xl, xr) \
+	vpsrld $24, xl, RIDX0; \
+	vpsrld $16, xl, RIDX1; \
+	vpsrld $8, xl, RIDX2; \
+	vpand RBYTE, RIDX1, RIDX1; \
+	vpand RBYTE, RIDX2, RIDX2; \
+	vpand RBYTE, xl, RIDX3; \
+	\
+	vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	vpcmpeqd RIDX0, RIDX0, RIDX0; \
+	\
+	vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \
+	vpcmpeqd RIDX1, RIDX1, RIDX1; \
+	vpaddd RT0, RT1, RT0; \
+	\
+	vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \
+	vpxor RT0, RT1, RT0; \
+	\
+	vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	vpaddd RT0, RT1, RT0; \
+	\
+	vpxor RT0, xr, xr;
+
+#define add_roundkey(xl, nmem) \
+	vpbroadcastd nmem, RT0; \
+	vpxor RT0, xl ## 0, xl ## 0; \
+	vpxor RT0, xl ## 1, xl ## 1; \
+	vpxor RT0, xl ## 2, xl ## 2; \
+	vpxor RT0, xl ## 3, xl ## 3;
+
+#define round_enc() \
+	add_roundkey(RXr, p(CTX,RLOOP,4)); \
+	F(RXl0, RXr0); \
+	F(RXl1, RXr1); \
+	F(RXl2, RXr2); \
+	F(RXl3, RXr3); \
+	\
+	add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
+	F(RXr0, RXl0); \
+	F(RXr1, RXl1); \
+	F(RXr2, RXl2); \
+	F(RXr3, RXl3);
+
+#define round_dec() \
+	add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \
+	F(RXl0, RXr0); \
+	F(RXl1, RXr1); \
+	F(RXl2, RXr2); \
+	F(RXl3, RXr3); \
+	\
+	add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
+	F(RXr0, RXl0); \
+	F(RXr1, RXl1); \
+	F(RXr2, RXl2); \
+	F(RXr3, RXl3);
+
+#define init_round_constants() \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	leaq s0(CTX), RS0; \
+	leaq s1(CTX), RS1; \
+	leaq s2(CTX), RS2; \
+	leaq s3(CTX), RS3; \
+	vpsrld $24, RNOT, RBYTE;
+
+#define transpose_2x2(x0, x1, t0) \
+	vpunpckldq x0, x1, t0; \
+	vpunpckhdq x0, x1, x1; \
+	\
+	vpunpcklqdq t0, x1, x0; \
+	vpunpckhqdq t0, x1, x1;
+
+#define read_block(xl, xr) \
+	vbroadcasti128 .Lbswap32_mask, RT1; \
+	\
+	vpshufb RT1, xl ## 0, xl ## 0; \
+	vpshufb RT1, xr ## 0, xr ## 0; \
+	vpshufb RT1, xl ## 1, xl ## 1; \
+	vpshufb RT1, xr ## 1, xr ## 1; \
+	vpshufb RT1, xl ## 2, xl ## 2; \
+	vpshufb RT1, xr ## 2, xr ## 2; \
+	vpshufb RT1, xl ## 3, xl ## 3; \
+	vpshufb RT1, xr ## 3, xr ## 3; \
+	\
+	transpose_2x2(xl ## 0, xr ## 0, RT0); \
+	transpose_2x2(xl ## 1, xr ## 1, RT0); \
+	transpose_2x2(xl ## 2, xr ## 2, RT0); \
+	transpose_2x2(xl ## 3, xr ## 3, RT0);
+
+#define write_block(xl, xr) \
+	vbroadcasti128 .Lbswap32_mask, RT1; \
+	\
+	transpose_2x2(xl ## 0, xr ## 0, RT0); \
+	transpose_2x2(xl ## 1, xr ## 1, RT0); \
+	transpose_2x2(xl ## 2, xr ## 2, RT0); \
+	transpose_2x2(xl ## 3, xr ## 3, RT0); \
+	\
+	vpshufb RT1, xl ## 0, xl ## 0; \
+	vpshufb RT1, xr ## 0, xr ## 0; \
+	vpshufb RT1, xl ## 1, xl ## 1; \
+	vpshufb RT1, xr ## 1, xr ## 1; \
+	vpshufb RT1, xl ## 2, xl ## 2; \
+	vpshufb RT1, xr ## 2, xr ## 2; \
+	vpshufb RT1, xl ## 3, xl ## 3; \
+	vpshufb RT1, xr ## 3, xr ## 3;
+
+.align 8
+__blowfish_enc_blk32:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RXl0..4, RXr0..4: plaintext
+	 * output:
+	 *	RXl0..4, RXr0..4: ciphertext (RXl <=> RXr swapped)
+	 */
+	init_round_constants();
+
+	read_block(RXl, RXr);
+
+	movl $1, RLOOPd;
+	add_roundkey(RXl, p+4*(0)(CTX));
+
+.align 4
+.L__enc_loop:
+	round_enc();
+
+	leal 2(RLOOPd), RLOOPd;
+	cmpl $17, RLOOPd;
+	jne .L__enc_loop;
+
+	add_roundkey(RXr, p+4*(17)(CTX));
+
+	write_block(RXl, RXr);
+
+	ret;
+ENDPROC(__blowfish_enc_blk32)
+
+.align 8
+__blowfish_dec_blk32:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RXl0..4, RXr0..4: ciphertext
+	 * output:
+	 *	RXl0..4, RXr0..4: plaintext (RXl <=> RXr swapped)
+	 */
+	init_round_constants();
+
+	read_block(RXl, RXr);
+
+	movl $14, RLOOPd;
+	add_roundkey(RXl, p+4*(17)(CTX));
+
+.align 4
+.L__dec_loop:
+	round_dec();
+
+	addl $-2, RLOOPd;
+	jns .L__dec_loop;
+
+	add_roundkey(RXr, p+4*(0)(CTX));
+
+	write_block(RXl, RXr);
+
+	ret;
+ENDPROC(__blowfish_dec_blk32)
+
+ENTRY(blowfish_ecb_enc_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	vmovdqu 0*32(%rdx), RXl0;
+	vmovdqu 1*32(%rdx), RXr0;
+	vmovdqu 2*32(%rdx), RXl1;
+	vmovdqu 3*32(%rdx), RXr1;
+	vmovdqu 4*32(%rdx), RXl2;
+	vmovdqu 5*32(%rdx), RXr2;
+	vmovdqu 6*32(%rdx), RXl3;
+	vmovdqu 7*32(%rdx), RXr3;
+
+	call __blowfish_enc_blk32;
+
+	vmovdqu RXr0, 0*32(%rsi);
+	vmovdqu RXl0, 1*32(%rsi);
+	vmovdqu RXr1, 2*32(%rsi);
+	vmovdqu RXl1, 3*32(%rsi);
+	vmovdqu RXr2, 4*32(%rsi);
+	vmovdqu RXl2, 5*32(%rsi);
+	vmovdqu RXr3, 6*32(%rsi);
+	vmovdqu RXl3, 7*32(%rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(blowfish_ecb_enc_32way)
+
+ENTRY(blowfish_ecb_dec_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	vmovdqu 0*32(%rdx), RXl0;
+	vmovdqu 1*32(%rdx), RXr0;
+	vmovdqu 2*32(%rdx), RXl1;
+	vmovdqu 3*32(%rdx), RXr1;
+	vmovdqu 4*32(%rdx), RXl2;
+	vmovdqu 5*32(%rdx), RXr2;
+	vmovdqu 6*32(%rdx), RXl3;
+	vmovdqu 7*32(%rdx), RXr3;
+
+	call __blowfish_dec_blk32;
+
+	vmovdqu RXr0, 0*32(%rsi);
+	vmovdqu RXl0, 1*32(%rsi);
+	vmovdqu RXr1, 2*32(%rsi);
+	vmovdqu RXl1, 3*32(%rsi);
+	vmovdqu RXr2, 4*32(%rsi);
+	vmovdqu RXl2, 5*32(%rsi);
+	vmovdqu RXr3, 6*32(%rsi);
+	vmovdqu RXl3, 7*32(%rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(blowfish_ecb_dec_32way)
+
+ENTRY(blowfish_cbc_dec_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	vmovdqu 0*32(%rdx), RXl0;
+	vmovdqu 1*32(%rdx), RXr0;
+	vmovdqu 2*32(%rdx), RXl1;
+	vmovdqu 3*32(%rdx), RXr1;
+	vmovdqu 4*32(%rdx), RXl2;
+	vmovdqu 5*32(%rdx), RXr2;
+	vmovdqu 6*32(%rdx), RXl3;
+	vmovdqu 7*32(%rdx), RXr3;
+
+	call __blowfish_dec_blk32;
+
+	/* xor with src */
+	vmovq (%rdx), RT0x;
+	vpshufd $0x4f, RT0x, RT0x;
+	vinserti128 $1, 8(%rdx), RT0, RT0;
+	vpxor RT0, RXr0, RXr0;
+	vpxor 0*32+24(%rdx), RXl0, RXl0;
+	vpxor 1*32+24(%rdx), RXr1, RXr1;
+	vpxor 2*32+24(%rdx), RXl1, RXl1;
+	vpxor 3*32+24(%rdx), RXr2, RXr2;
+	vpxor 4*32+24(%rdx), RXl2, RXl2;
+	vpxor 5*32+24(%rdx), RXr3, RXr3;
+	vpxor 6*32+24(%rdx), RXl3, RXl3;
+
+	vmovdqu RXr0, (0*32)(%rsi);
+	vmovdqu RXl0, (1*32)(%rsi);
+	vmovdqu RXr1, (2*32)(%rsi);
+	vmovdqu RXl1, (3*32)(%rsi);
+	vmovdqu RXr2, (4*32)(%rsi);
+	vmovdqu RXl2, (5*32)(%rsi);
+	vmovdqu RXr3, (6*32)(%rsi);
+	vmovdqu RXl3, (7*32)(%rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(blowfish_cbc_dec_32way)
+
+ENTRY(blowfish_ctr_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (big endian, 64bit)
+	 */
+
+	vzeroupper;
+
+	vpcmpeqd RT0, RT0, RT0;
+	vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */
+
+	vpcmpeqd RT1x, RT1x, RT1x;
+	vpaddq RT1x, RT1x, RT1x; /* a: -2, b: -2 */
+	vpxor RIDX0, RIDX0, RIDX0;
+	vinserti128 $1, RT1x, RIDX0, RIDX0; /* a: 0, b: 0, c: -2, d: -2 */
+
+	vpaddq RIDX0, RT0, RT0; /* a: -1, b: 0, c: -3, d: -2 */
+
+	vpcmpeqd RT1, RT1, RT1;
+	vpaddq RT1, RT1, RT1; /* a: -2, b: -2, c: -2, d: -2 */
+	vpaddq RT1, RT1, RIDX2; /* a: -4, b: -4, c: -4, d: -4 */
+
+	vbroadcasti128 .Lbswap_iv_mask, RIDX0;
+	vbroadcasti128 .Lbswap128_mask, RIDX1;
+
+	/* load IV and byteswap */
+	vmovq (%rcx), RT1x;
+	vinserti128 $1, RT1x, RT1, RT1; /* a: BE, b: 0, c: BE, d: 0 */
+	vpshufb RIDX0, RT1, RT1; /* a: LE, b: LE, c: LE, d: LE */
+
+	/* construct IVs */
+	vpsubq RT0, RT1, RT1;		/* a: le1, b: le0, c: le3, d: le2 */
+	vpshufb RIDX1, RT1, RXl0;	/* a: be0, b: be1, c: be2, d: be3 */
+	vpsubq RIDX2, RT1, RT1;		/* le5, le4, le7, le6 */
+	vpshufb RIDX1, RT1, RXr0;	/* be4, be5, be6, be7 */
+	vpsubq RIDX2, RT1, RT1;
+	vpshufb RIDX1, RT1, RXl1;
+	vpsubq RIDX2, RT1, RT1;
+	vpshufb RIDX1, RT1, RXr1;
+	vpsubq RIDX2, RT1, RT1;
+	vpshufb RIDX1, RT1, RXl2;
+	vpsubq RIDX2, RT1, RT1;
+	vpshufb RIDX1, RT1, RXr2;
+	vpsubq RIDX2, RT1, RT1;
+	vpshufb RIDX1, RT1, RXl3;
+	vpsubq RIDX2, RT1, RT1;
+	vpshufb RIDX1, RT1, RXr3;
+
+	/* store last IV */
+	vpsubq RIDX2, RT1, RT1; /* a: le33, b: le32, ... */
+	vpshufb RIDX1x, RT1x, RT1x; /* a: be32, ... */
+	vmovq RT1x, (%rcx);
+
+	call __blowfish_enc_blk32;
+
+	/* dst = src ^ iv */
+	vpxor 0*32(%rdx), RXr0, RXr0;
+	vpxor 1*32(%rdx), RXl0, RXl0;
+	vpxor 2*32(%rdx), RXr1, RXr1;
+	vpxor 3*32(%rdx), RXl1, RXl1;
+	vpxor 4*32(%rdx), RXr2, RXr2;
+	vpxor 5*32(%rdx), RXl2, RXl2;
+	vpxor 6*32(%rdx), RXr3, RXr3;
+	vpxor 7*32(%rdx), RXl3, RXl3;
+	vmovdqu RXr0, (0*32)(%rsi);
+	vmovdqu RXl0, (1*32)(%rsi);
+	vmovdqu RXr1, (2*32)(%rsi);
+	vmovdqu RXl1, (3*32)(%rsi);
+	vmovdqu RXr2, (4*32)(%rsi);
+	vmovdqu RXl2, (5*32)(%rsi);
+	vmovdqu RXr3, (6*32)(%rsi);
+	vmovdqu RXl3, (7*32)(%rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(blowfish_ctr_32way)

+ 585 - 0
arch/x86/crypto/blowfish_avx2_glue.c

@@ -0,0 +1,585 @@
+/*
+ * Glue Code for x86_64/AVX2 assembler optimized version of Blowfish
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ *   (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/blowfish.h>
+#include <crypto/cryptd.h>
+#include <crypto/ctr.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <asm/crypto/blowfish.h>
+#include <asm/crypto/ablk_helper.h>
+#include <crypto/scatterwalk.h>
+
+#define BF_AVX2_PARALLEL_BLOCKS 32
+
+/* 32-way AVX2 parallel cipher functions */
+asmlinkage void blowfish_ecb_enc_32way(struct bf_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void blowfish_ecb_dec_32way(struct bf_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void blowfish_cbc_dec_32way(struct bf_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void blowfish_ctr_32way(struct bf_ctx *ctx, u8 *dst, const u8 *src,
+				   __be64 *iv);
+
+static inline bool bf_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	if (fpu_enabled)
+		return true;
+
+	/* FPU is only used when chunk to be processed is large enough, so
+	 * do not enable FPU until it is necessary.
+	 */
+	if (nbytes < BF_BLOCK_SIZE * BF_AVX2_PARALLEL_BLOCKS)
+		return false;
+
+	kernel_fpu_begin();
+	return true;
+}
+
+static inline void bf_fpu_end(bool fpu_enabled)
+{
+	if (fpu_enabled)
+		kernel_fpu_end();
+}
+
+static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
+		     bool enc)
+{
+	bool fpu_enabled = false;
+	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = BF_BLOCK_SIZE;
+	unsigned int nbytes;
+	int err;
+
+	err = blkcipher_walk_virt(desc, walk);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	while ((nbytes = walk->nbytes)) {
+		u8 *wsrc = walk->src.virt.addr;
+		u8 *wdst = walk->dst.virt.addr;
+
+		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
+
+		/* Process multi-block AVX2 batch */
+		if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
+			do {
+				if (enc)
+					blowfish_ecb_enc_32way(ctx, wdst, wsrc);
+				else
+					blowfish_ecb_dec_32way(ctx, wdst, wsrc);
+
+				wsrc += bsize * BF_AVX2_PARALLEL_BLOCKS;
+				wdst += bsize * BF_AVX2_PARALLEL_BLOCKS;
+				nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS;
+			} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
+
+			if (nbytes < bsize)
+				goto done;
+		}
+
+		/* Process multi-block batch */
+		if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
+			do {
+				if (enc)
+					blowfish_enc_blk_4way(ctx, wdst, wsrc);
+				else
+					blowfish_dec_blk_4way(ctx, wdst, wsrc);
+
+				wsrc += bsize * BF_PARALLEL_BLOCKS;
+				wdst += bsize * BF_PARALLEL_BLOCKS;
+				nbytes -= bsize * BF_PARALLEL_BLOCKS;
+			} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
+
+			if (nbytes < bsize)
+				goto done;
+		}
+
+		/* Handle leftovers */
+		do {
+			if (enc)
+				blowfish_enc_blk(ctx, wdst, wsrc);
+			else
+				blowfish_dec_blk(ctx, wdst, wsrc);
+
+			wsrc += bsize;
+			wdst += bsize;
+			nbytes -= bsize;
+		} while (nbytes >= bsize);
+
+done:
+		err = blkcipher_walk_done(desc, walk, nbytes);
+	}
+
+	bf_fpu_end(fpu_enabled);
+	return err;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return ecb_crypt(desc, &walk, true);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return ecb_crypt(desc, &walk, false);
+}
+
+static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
+				  struct blkcipher_walk *walk)
+{
+	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	unsigned int bsize = BF_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u64 *src = (u64 *)walk->src.virt.addr;
+	u64 *dst = (u64 *)walk->dst.virt.addr;
+	u64 *iv = (u64 *)walk->iv;
+
+	do {
+		*dst = *src ^ *iv;
+		blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
+		iv = dst;
+
+		src += 1;
+		dst += 1;
+		nbytes -= bsize;
+	} while (nbytes >= bsize);
+
+	*(u64 *)walk->iv = *iv;
+	return nbytes;
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	while ((nbytes = walk.nbytes)) {
+		nbytes = __cbc_encrypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	return err;
+}
+
+static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
+				  struct blkcipher_walk *walk)
+{
+	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = BF_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u64 *src = (u64 *)walk->src.virt.addr;
+	u64 *dst = (u64 *)walk->dst.virt.addr;
+	u64 last_iv;
+	int i;
+
+	/* Start of the last block. */
+	src += nbytes / bsize - 1;
+	dst += nbytes / bsize - 1;
+
+	last_iv = *src;
+
+	/* Process multi-block AVX2 batch */
+	if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
+		do {
+			nbytes -= bsize * (BF_AVX2_PARALLEL_BLOCKS - 1);
+			src -= BF_AVX2_PARALLEL_BLOCKS - 1;
+			dst -= BF_AVX2_PARALLEL_BLOCKS - 1;
+
+			blowfish_cbc_dec_32way(ctx, (u8 *)dst, (u8 *)src);
+
+			nbytes -= bsize;
+			if (nbytes < bsize)
+				goto done;
+
+			*dst ^= *(src - 1);
+			src -= 1;
+			dst -= 1;
+		} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* Process multi-block batch */
+	if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
+		u64 ivs[BF_PARALLEL_BLOCKS - 1];
+
+		do {
+			nbytes -= bsize * (BF_PARALLEL_BLOCKS - 1);
+			src -= BF_PARALLEL_BLOCKS - 1;
+			dst -= BF_PARALLEL_BLOCKS - 1;
+
+			for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++)
+				ivs[i] = src[i];
+
+			blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src);
+
+			for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++)
+				dst[i + 1] ^= ivs[i];
+
+			nbytes -= bsize;
+			if (nbytes < bsize)
+				goto done;
+
+			*dst ^= *(src - 1);
+			src -= 1;
+			dst -= 1;
+		} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* Handle leftovers */
+	for (;;) {
+		blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src);
+
+		nbytes -= bsize;
+		if (nbytes < bsize)
+			break;
+
+		*dst ^= *(src - 1);
+		src -= 1;
+		dst -= 1;
+	}
+
+done:
+	*dst ^= *(u64 *)walk->iv;
+	*(u64 *)walk->iv = last_iv;
+
+	return nbytes;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	bool fpu_enabled = false;
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	while ((nbytes = walk.nbytes)) {
+		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
+		nbytes = __cbc_decrypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	bf_fpu_end(fpu_enabled);
+	return err;
+}
+
+static void ctr_crypt_final(struct blkcipher_desc *desc,
+			    struct blkcipher_walk *walk)
+{
+	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	u8 *ctrblk = walk->iv;
+	u8 keystream[BF_BLOCK_SIZE];
+	u8 *src = walk->src.virt.addr;
+	u8 *dst = walk->dst.virt.addr;
+	unsigned int nbytes = walk->nbytes;
+
+	blowfish_enc_blk(ctx, keystream, ctrblk);
+	crypto_xor(keystream, src, nbytes);
+	memcpy(dst, keystream, nbytes);
+
+	crypto_inc(ctrblk, BF_BLOCK_SIZE);
+}
+
+static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
+				struct blkcipher_walk *walk)
+{
+	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	unsigned int bsize = BF_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u64 *src = (u64 *)walk->src.virt.addr;
+	u64 *dst = (u64 *)walk->dst.virt.addr;
+	int i;
+
+	/* Process multi-block AVX2 batch */
+	if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
+		do {
+			blowfish_ctr_32way(ctx, (u8 *)dst, (u8 *)src,
+					   (__be64 *)walk->iv);
+
+			src += BF_AVX2_PARALLEL_BLOCKS;
+			dst += BF_AVX2_PARALLEL_BLOCKS;
+			nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS;
+		} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* Process four block batch */
+	if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
+		__be64 ctrblocks[BF_PARALLEL_BLOCKS];
+		u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
+
+		do {
+			/* create ctrblks for parallel encrypt */
+			for (i = 0; i < BF_PARALLEL_BLOCKS; i++) {
+				if (dst != src)
+					dst[i] = src[i];
+
+				ctrblocks[i] = cpu_to_be64(ctrblk++);
+			}
+
+			blowfish_enc_blk_xor_4way(ctx, (u8 *)dst,
+						  (u8 *)ctrblocks);
+
+			src += BF_PARALLEL_BLOCKS;
+			dst += BF_PARALLEL_BLOCKS;
+			nbytes -= bsize * BF_PARALLEL_BLOCKS;
+		} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
+
+		*(__be64 *)walk->iv = cpu_to_be64(ctrblk);
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* Handle leftovers */
+	do {
+		u64 ctrblk;
+
+		if (dst != src)
+			*dst = *src;
+
+		ctrblk = *(u64 *)walk->iv;
+		be64_add_cpu((__be64 *)walk->iv, 1);
+
+		blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
+
+		src += 1;
+		dst += 1;
+	} while ((nbytes -= bsize) >= bsize);
+
+done:
+	return nbytes;
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	bool fpu_enabled = false;
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) {
+		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
+		nbytes = __ctr_crypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	bf_fpu_end(fpu_enabled);
+
+	if (walk.nbytes) {
+		ctr_crypt_final(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+
+	return err;
+}
+
+static struct crypto_alg bf_algs[6] = { {
+	.cra_name		= "__ecb-blowfish-avx2",
+	.cra_driver_name	= "__driver-ecb-blowfish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= BF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct bf_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= BF_MIN_KEY_SIZE,
+			.max_keysize	= BF_MAX_KEY_SIZE,
+			.setkey		= blowfish_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__cbc-blowfish-avx2",
+	.cra_driver_name	= "__driver-cbc-blowfish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= BF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct bf_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= BF_MIN_KEY_SIZE,
+			.max_keysize	= BF_MAX_KEY_SIZE,
+			.setkey		= blowfish_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__ctr-blowfish-avx2",
+	.cra_driver_name	= "__driver-ctr-blowfish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct bf_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= BF_MIN_KEY_SIZE,
+			.max_keysize	= BF_MAX_KEY_SIZE,
+			.ivsize		= BF_BLOCK_SIZE,
+			.setkey		= blowfish_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+}, {
+	.cra_name		= "ecb(blowfish)",
+	.cra_driver_name	= "ecb-blowfish-avx2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= BF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= BF_MIN_KEY_SIZE,
+			.max_keysize	= BF_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(blowfish)",
+	.cra_driver_name	= "cbc-blowfish-avx2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= BF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= BF_MIN_KEY_SIZE,
+			.max_keysize	= BF_MAX_KEY_SIZE,
+			.ivsize		= BF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= __ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(blowfish)",
+	.cra_driver_name	= "ctr-blowfish-avx2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= BF_MIN_KEY_SIZE,
+			.max_keysize	= BF_MAX_KEY_SIZE,
+			.ivsize		= BF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt,
+			.geniv		= "chainiv",
+		},
+	},
+} };
+
+
+static int __init init(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx2 || !cpu_has_osxsave) {
+		pr_info("AVX2 instructions are not detected.\n");
+		return -ENODEV;
+	}
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX detected but unusable.\n");
+		return -ENODEV;
+	}
+
+	return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs));
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs));
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Blowfish Cipher Algorithm, AVX2 optimized");
+MODULE_ALIAS("blowfish");
+MODULE_ALIAS("blowfish-asm");

+ 8 - 24
arch/x86/crypto/blowfish_glue.c

@@ -1,7 +1,7 @@
 /*
  * Glue Code for assembler optimized version of Blowfish
  *
- * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
  *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
@@ -32,40 +32,24 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <crypto/algapi.h>
+#include <asm/crypto/blowfish.h>
 
 /* regular block cipher functions */
 asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
 				   bool xor);
+EXPORT_SYMBOL_GPL(__blowfish_enc_blk);
+
 asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
+EXPORT_SYMBOL_GPL(blowfish_dec_blk);
 
 /* 4-way parallel cipher functions */
 asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
 					const u8 *src, bool xor);
+EXPORT_SYMBOL_GPL(__blowfish_enc_blk_4way);
+
 asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
 				      const u8 *src);
-
-static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
-{
-	__blowfish_enc_blk(ctx, dst, src, false);
-}
-
-static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
-					const u8 *src)
-{
-	__blowfish_enc_blk(ctx, dst, src, true);
-}
-
-static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
-					 const u8 *src)
-{
-	__blowfish_enc_blk_4way(ctx, dst, src, false);
-}
-
-static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
-				      const u8 *src)
-{
-	__blowfish_enc_blk_4way(ctx, dst, src, true);
-}
+EXPORT_SYMBOL_GPL(blowfish_dec_blk_4way);
 
 static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {

+ 179 - 1
arch/x86/crypto/camellia-aesni-avx-asm_64.S

@@ -1,7 +1,7 @@
 /*
  * x86_64/AVX/AES-NI assembler implementation of Camellia
  *
- * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -589,6 +589,10 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
+/* For XTS mode IV generation */
+.Lxts_gf128mul_and_shl1_mask:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+
 /*
  * pre-SubByte transform
  *
@@ -1090,3 +1094,177 @@ ENTRY(camellia_ctr_16way)
 
 	ret;
 ENDPROC(camellia_ctr_16way)
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+	vpsrad $31, iv, tmp; \
+	vpaddq iv, iv, iv; \
+	vpshufd $0x13, tmp, tmp; \
+	vpand mask, tmp, tmp; \
+	vpxor tmp, iv, iv;
+
+.align 8
+camellia_xts_crypt_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 *	%r8: index for input whitening key
+	 *	%r9: pointer to  __camellia_enc_blk16 or __camellia_dec_blk16
+	 */
+
+	subq $(16 * 16), %rsp;
+	movq %rsp, %rax;
+
+	vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;
+
+	/* load IV */
+	vmovdqu (%rcx), %xmm0;
+	vpxor 0 * 16(%rdx), %xmm0, %xmm15;
+	vmovdqu %xmm15, 15 * 16(%rax);
+	vmovdqu %xmm0, 0 * 16(%rsi);
+
+	/* construct IVs */
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 1 * 16(%rdx), %xmm0, %xmm15;
+	vmovdqu %xmm15, 14 * 16(%rax);
+	vmovdqu %xmm0, 1 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 2 * 16(%rdx), %xmm0, %xmm13;
+	vmovdqu %xmm0, 2 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 3 * 16(%rdx), %xmm0, %xmm12;
+	vmovdqu %xmm0, 3 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 4 * 16(%rdx), %xmm0, %xmm11;
+	vmovdqu %xmm0, 4 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 5 * 16(%rdx), %xmm0, %xmm10;
+	vmovdqu %xmm0, 5 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 6 * 16(%rdx), %xmm0, %xmm9;
+	vmovdqu %xmm0, 6 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 7 * 16(%rdx), %xmm0, %xmm8;
+	vmovdqu %xmm0, 7 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 8 * 16(%rdx), %xmm0, %xmm7;
+	vmovdqu %xmm0, 8 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 9 * 16(%rdx), %xmm0, %xmm6;
+	vmovdqu %xmm0, 9 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 10 * 16(%rdx), %xmm0, %xmm5;
+	vmovdqu %xmm0, 10 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 11 * 16(%rdx), %xmm0, %xmm4;
+	vmovdqu %xmm0, 11 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 12 * 16(%rdx), %xmm0, %xmm3;
+	vmovdqu %xmm0, 12 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 13 * 16(%rdx), %xmm0, %xmm2;
+	vmovdqu %xmm0, 13 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 14 * 16(%rdx), %xmm0, %xmm1;
+	vmovdqu %xmm0, 14 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 15 * 16(%rdx), %xmm0, %xmm15;
+	vmovdqu %xmm15, 0 * 16(%rax);
+	vmovdqu %xmm0, 15 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vmovdqu %xmm0, (%rcx);
+
+	/* inpack16_pre: */
+	vmovq (key_table)(CTX, %r8, 8), %xmm15;
+	vpshufb .Lpack_bswap, %xmm15, %xmm15;
+	vpxor 0 * 16(%rax), %xmm15, %xmm0;
+	vpxor %xmm1, %xmm15, %xmm1;
+	vpxor %xmm2, %xmm15, %xmm2;
+	vpxor %xmm3, %xmm15, %xmm3;
+	vpxor %xmm4, %xmm15, %xmm4;
+	vpxor %xmm5, %xmm15, %xmm5;
+	vpxor %xmm6, %xmm15, %xmm6;
+	vpxor %xmm7, %xmm15, %xmm7;
+	vpxor %xmm8, %xmm15, %xmm8;
+	vpxor %xmm9, %xmm15, %xmm9;
+	vpxor %xmm10, %xmm15, %xmm10;
+	vpxor %xmm11, %xmm15, %xmm11;
+	vpxor %xmm12, %xmm15, %xmm12;
+	vpxor %xmm13, %xmm15, %xmm13;
+	vpxor 14 * 16(%rax), %xmm15, %xmm14;
+	vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+	call *%r9;
+
+	addq $(16 * 16), %rsp;
+
+	vpxor 0 * 16(%rsi), %xmm7, %xmm7;
+	vpxor 1 * 16(%rsi), %xmm6, %xmm6;
+	vpxor 2 * 16(%rsi), %xmm5, %xmm5;
+	vpxor 3 * 16(%rsi), %xmm4, %xmm4;
+	vpxor 4 * 16(%rsi), %xmm3, %xmm3;
+	vpxor 5 * 16(%rsi), %xmm2, %xmm2;
+	vpxor 6 * 16(%rsi), %xmm1, %xmm1;
+	vpxor 7 * 16(%rsi), %xmm0, %xmm0;
+	vpxor 8 * 16(%rsi), %xmm15, %xmm15;
+	vpxor 9 * 16(%rsi), %xmm14, %xmm14;
+	vpxor 10 * 16(%rsi), %xmm13, %xmm13;
+	vpxor 11 * 16(%rsi), %xmm12, %xmm12;
+	vpxor 12 * 16(%rsi), %xmm11, %xmm11;
+	vpxor 13 * 16(%rsi), %xmm10, %xmm10;
+	vpxor 14 * 16(%rsi), %xmm9, %xmm9;
+	vpxor 15 * 16(%rsi), %xmm8, %xmm8;
+	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+		     %xmm8, %rsi);
+
+	ret;
+ENDPROC(camellia_xts_crypt_16way)
+
+ENTRY(camellia_xts_enc_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+	xorl %r8d, %r8d; /* input whitening key, 0 for enc */
+
+	leaq __camellia_enc_blk16, %r9;
+
+	jmp camellia_xts_crypt_16way;
+ENDPROC(camellia_xts_enc_16way)
+
+ENTRY(camellia_xts_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	cmpl $16, key_length(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d;  /* input whitening key, last for dec */
+
+	leaq __camellia_dec_blk16, %r9;
+
+	jmp camellia_xts_crypt_16way;
+ENDPROC(camellia_xts_dec_16way)

+ 1368 - 0
arch/x86/crypto/camellia-aesni-avx2-asm_64.S

@@ -0,0 +1,1368 @@
+/*
+ * x86_64/AVX2/AES-NI assembler implementation of Camellia
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct camellia_ctx: */
+#define key_table 0
+#define key_length CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %rdi
+#define RIO %r8
+
+/**********************************************************************
+  helper macros
+ **********************************************************************/
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+	vpand x, mask4bit, tmp0; \
+	vpandn x, mask4bit, x; \
+	vpsrld $4, x, x; \
+	\
+	vpshufb tmp0, lo_t, tmp0; \
+	vpshufb x, hi_t, x; \
+	vpxor tmp0, x, x;
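
filter_8bit evaluates an 8-bit-to-8-bit transform 32 bytes at a time with two vpshufb table lookups: each byte is split into low and high nibbles, each nibble indexes a 16-entry table, and the results are XORed. This works because the pre/post-lookup transforms used here are affine over GF(2), so the transform of a byte equals the XOR of the transforms of its nibble halves, with the constant folded into one table. A scalar C sketch of the same idea (illustrative; names are hypothetical):

#include <stddef.h>
#include <stdint.h>

/* Scalar model of filter_8bit: for a GF(2)-affine byte transform f,
 * precompute lo_t[n] = f_linear(n) ^ c and hi_t[n] = f_linear(n << 4),
 * so that f(x) = lo_t[x & 0xf] ^ hi_t[x >> 4]. */
static void filter_8bit_model(uint8_t *buf, size_t len,
			      const uint8_t lo_t[16], const uint8_t hi_t[16])
{
	for (size_t i = 0; i < len; i++)
		buf[i] = lo_t[buf[i] & 0x0f] ^ hi_t[buf[i] >> 4];
}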
+
+#define ymm0_x xmm0
+#define ymm1_x xmm1
+#define ymm2_x xmm2
+#define ymm3_x xmm3
+#define ymm4_x xmm4
+#define ymm5_x xmm5
+#define ymm6_x xmm6
+#define ymm7_x xmm7
+#define ymm8_x xmm8
+#define ymm9_x xmm9
+#define ymm10_x xmm10
+#define ymm11_x xmm11
+#define ymm12_x xmm12
+#define ymm13_x xmm13
+#define ymm14_x xmm14
+#define ymm15_x xmm15
+
+/*
+ * AES-NI instructions do not support ymmX registers, so we need splitting and
+ * merging.
+ */
+#define vaesenclast256(zero, yreg, tmp) \
+	vextracti128 $1, yreg, tmp##_x; \
+	vaesenclast zero##_x, yreg##_x, yreg##_x; \
+	vaesenclast zero##_x, tmp##_x, tmp##_x; \
+	vinserti128 $1, tmp##_x, yreg, yreg;
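
Because AESENCLAST accepts only 128-bit xmm operands on this hardware, each ymm register is processed as two independent halves. With an all-zero round key, AESENCLAST degenerates to SubBytes followed by ShiftRows, which is exactly the piece of AES the Camellia S-function borrows. A sketch of the same split/encrypt/merge using AVX2 and AES-NI intrinsics (illustrative only; assumes a compiler with <immintrin.h>, built with -mavx2 -maes):

#include <immintrin.h>

/* Illustrative model of vaesenclast256: split the 256-bit value into
 * its two 128-bit lanes, run AESENCLAST with a zero round key on each
 * (leaving pure SubBytes + ShiftRows), then merge the lanes back. */
static __m256i vaesenclast256_model(__m256i y)
{
	__m128i zero = _mm_setzero_si128();
	__m128i lo = _mm256_castsi256_si128(y);
	__m128i hi = _mm256_extracti128_si256(y, 1);

	lo = _mm_aesenclast_si128(lo, zero);
	hi = _mm_aesenclast_si128(hi, zero);

	return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
}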
+
+/**********************************************************************
+  32-way camellia
+ **********************************************************************/
+
+/*
+ * IN:
+ *   x0..x7: byte-sliced AB state
+ *   mem_cd: register pointer storing CD state
+ *   key: index for key material
+ * OUT:
+ *   x0..x7: new byte-sliced CD state
+ */
+#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
+		  t7, mem_cd, key) \
+	/* \
+	 * S-function with AES subbytes \
+	 */ \
+	vbroadcasti128 .Linv_shift_row, t4; \
+	vpbroadcastb .L0f0f0f0f, t7; \
+	vbroadcasti128 .Lpre_tf_lo_s1, t0; \
+	vbroadcasti128 .Lpre_tf_hi_s1, t1; \
+	\
+	/* AES inverse shift rows */ \
+	vpshufb t4, x0, x0; \
+	vpshufb t4, x7, x7; \
+	vpshufb t4, x1, x1; \
+	vpshufb t4, x4, x4; \
+	vpshufb t4, x2, x2; \
+	vpshufb t4, x5, x5; \
+	vpshufb t4, x3, x3; \
+	vpshufb t4, x6, x6; \
+	\
+	/* prefilter sboxes 1, 2 and 3 */ \
+	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
+	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
+	filter_8bit(x0, t0, t1, t7, t6); \
+	filter_8bit(x7, t0, t1, t7, t6); \
+	filter_8bit(x1, t0, t1, t7, t6); \
+	filter_8bit(x4, t0, t1, t7, t6); \
+	filter_8bit(x2, t0, t1, t7, t6); \
+	filter_8bit(x5, t0, t1, t7, t6); \
+	\
+	/* prefilter sbox 4 */ \
+	vpxor t4##_x, t4##_x, t4##_x; \
+	filter_8bit(x3, t2, t3, t7, t6); \
+	filter_8bit(x6, t2, t3, t7, t6); \
+	\
+	/* AES subbytes + AES shift rows */ \
+	vbroadcasti128 .Lpost_tf_lo_s1, t0; \
+	vbroadcasti128 .Lpost_tf_hi_s1, t1; \
+	vaesenclast256(t4, x0, t5); \
+	vaesenclast256(t4, x7, t5); \
+	vaesenclast256(t4, x1, t5); \
+	vaesenclast256(t4, x4, t5); \
+	vaesenclast256(t4, x2, t5); \
+	vaesenclast256(t4, x5, t5); \
+	vaesenclast256(t4, x3, t5); \
+	vaesenclast256(t4, x6, t5); \
+	\
+	/* postfilter sboxes 1 and 4 */ \
+	vbroadcasti128 .Lpost_tf_lo_s3, t2; \
+	vbroadcasti128 .Lpost_tf_hi_s3, t3; \
+	filter_8bit(x0, t0, t1, t7, t6); \
+	filter_8bit(x7, t0, t1, t7, t6); \
+	filter_8bit(x3, t0, t1, t7, t6); \
+	filter_8bit(x6, t0, t1, t7, t6); \
+	\
+	/* postfilter sbox 3 */ \
+	vbroadcasti128 .Lpost_tf_lo_s2, t4; \
+	vbroadcasti128 .Lpost_tf_hi_s2, t5; \
+	filter_8bit(x2, t2, t3, t7, t6); \
+	filter_8bit(x5, t2, t3, t7, t6); \
+	\
+	vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
+	\
+	/* postfilter sbox 2 */ \
+	filter_8bit(x1, t4, t5, t7, t2); \
+	filter_8bit(x4, t4, t5, t7, t2); \
+	\
+	vpsrldq $1, t0, t1; \
+	vpsrldq $2, t0, t2; \
+	vpsrldq $3, t0, t3; \
+	vpsrldq $4, t0, t4; \
+	vpsrldq $5, t0, t5; \
+	vpsrldq $6, t0, t6; \
+	vpsrldq $7, t0, t7; \
+	vpbroadcastb t0##_x, t0; \
+	vpbroadcastb t1##_x, t1; \
+	vpbroadcastb t2##_x, t2; \
+	vpbroadcastb t3##_x, t3; \
+	vpbroadcastb t4##_x, t4; \
+	vpbroadcastb t6##_x, t6; \
+	vpbroadcastb t5##_x, t5; \
+	vpbroadcastb t7##_x, t7; \
+	\
+	/* P-function */ \
+	vpxor x5, x0, x0; \
+	vpxor x6, x1, x1; \
+	vpxor x7, x2, x2; \
+	vpxor x4, x3, x3; \
+	\
+	vpxor x2, x4, x4; \
+	vpxor x3, x5, x5; \
+	vpxor x0, x6, x6; \
+	vpxor x1, x7, x7; \
+	\
+	vpxor x7, x0, x0; \
+	vpxor x4, x1, x1; \
+	vpxor x5, x2, x2; \
+	vpxor x6, x3, x3; \
+	\
+	vpxor x3, x4, x4; \
+	vpxor x0, x5, x5; \
+	vpxor x1, x6, x6; \
+	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
+	\
+	/* Add key material and result to CD (x becomes new CD) */ \
+	\
+	vpxor t7, x0, x0; \
+	vpxor 4 * 32(mem_cd), x0, x0; \
+	\
+	vpxor t6, x1, x1; \
+	vpxor 5 * 32(mem_cd), x1, x1; \
+	\
+	vpxor t5, x2, x2; \
+	vpxor 6 * 32(mem_cd), x2, x2; \
+	\
+	vpxor t4, x3, x3; \
+	vpxor 7 * 32(mem_cd), x3, x3; \
+	\
+	vpxor t3, x4, x4; \
+	vpxor 0 * 32(mem_cd), x4, x4; \
+	\
+	vpxor t2, x5, x5; \
+	vpxor 1 * 32(mem_cd), x5, x5; \
+	\
+	vpxor t1, x6, x6; \
+	vpxor 2 * 32(mem_cd), x6, x6; \
+	\
+	vpxor t0, x7, x7; \
+	vpxor 3 * 32(mem_cd), x7, x7;
+
+/*
+ * Size optimization... with inlined roundsm32, the binary would be over
+ * 5 times larger and only marginally faster.
+ */
+.align 8
+roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
+	roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		  %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
+		  %rcx, (%r9));
+	ret;
+ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
+
+.align 8
+roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
+	roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
+		  %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
+		  %rax, (%r9));
+	ret;
+ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
+
+/*
+ * IN/OUT:
+ *  x0..x7: byte-sliced AB state preloaded
+ *  mem_ab: byte-sliced AB state in memory
+ *  mem_cd: byte-sliced CD state in memory
+ */
+#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
+	leaq (key_table + (i) * 8)(CTX), %r9; \
+	call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
+	\
+	vmovdqu x0, 4 * 32(mem_cd); \
+	vmovdqu x1, 5 * 32(mem_cd); \
+	vmovdqu x2, 6 * 32(mem_cd); \
+	vmovdqu x3, 7 * 32(mem_cd); \
+	vmovdqu x4, 0 * 32(mem_cd); \
+	vmovdqu x5, 1 * 32(mem_cd); \
+	vmovdqu x6, 2 * 32(mem_cd); \
+	vmovdqu x7, 3 * 32(mem_cd); \
+	\
+	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
+	call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
+	\
+	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
+
+#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
+
+#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
+	/* Store new AB state */ \
+	vmovdqu x4, 4 * 32(mem_ab); \
+	vmovdqu x5, 5 * 32(mem_ab); \
+	vmovdqu x6, 6 * 32(mem_ab); \
+	vmovdqu x7, 7 * 32(mem_ab); \
+	vmovdqu x0, 0 * 32(mem_ab); \
+	vmovdqu x1, 1 * 32(mem_ab); \
+	vmovdqu x2, 2 * 32(mem_ab); \
+	vmovdqu x3, 3 * 32(mem_ab);
+
+#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, i) \
+	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
+	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
+	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
+
+#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, i) \
+	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
+	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
+	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
+
+/*
+ * IN:
+ *  v0..3: byte-sliced 32-bit integers
+ * OUT:
+ *  v0..3: (IN <<< 1)
+ */
+#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
+	vpcmpgtb v0, zero, t0; \
+	vpaddb v0, v0, v0; \
+	vpabsb t0, t0; \
+	\
+	vpcmpgtb v1, zero, t1; \
+	vpaddb v1, v1, v1; \
+	vpabsb t1, t1; \
+	\
+	vpcmpgtb v2, zero, t2; \
+	vpaddb v2, v2, v2; \
+	vpabsb t2, t2; \
+	\
+	vpor t0, v1, v1; \
+	\
+	vpcmpgtb v3, zero, t0; \
+	vpaddb v3, v3, v3; \
+	vpabsb t0, t0; \
+	\
+	vpor t1, v2, v2; \
+	vpor t2, v3, v3; \
+	vpor t0, v0, v0;
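
rol32_1_32 rotates byte-sliced 32-bit words left by one bit. Each register holds one byte plane of the word, so the bit shifted out of a plane (extracted as 0xff by vpcmpgtb against zero, then reduced to 0x01 by vpabsb) must be ORed into the next-more-significant plane, with v3's carry wrapping around into v0. A scalar sketch of the byte-plane view for a single word (hypothetical helper):

#include <stdint.h>

/* One lane of rol32_1_32: v0..v3 are the byte planes of a 32-bit word,
 * v0 least significant.  Rotating the word left by one shifts every
 * plane left and feeds each outgoing MSB into the next plane; the top
 * plane's bit wraps around to the bottom plane. */
static void rol32_1_byteplanes(uint8_t *v0, uint8_t *v1,
			       uint8_t *v2, uint8_t *v3)
{
	uint8_t c0 = *v0 >> 7, c1 = *v1 >> 7, c2 = *v2 >> 7, c3 = *v3 >> 7;

	*v0 = (*v0 << 1) | c3;
	*v1 = (*v1 << 1) | c0;
	*v2 = (*v2 << 1) | c1;
	*v3 = (*v3 << 1) | c2;
}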
+
+/*
+ * IN:
+ *   l: byte-sliced AB state in memory
+ *   r: byte-sliced CD state in memory
+ * OUT:
+ *   x0..x7: new byte-sliced CD state
+ */
+#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
+	      tt1, tt2, tt3, kll, klr, krl, krr) \
+	/* \
+	 * t0 = kll; \
+	 * t0 &= ll; \
+	 * lr ^= rol32(t0, 1); \
+	 */ \
+	vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
+	vpxor tt0, tt0, tt0; \
+	vpbroadcastb t0##_x, t3; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t2; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t1; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t0; \
+	\
+	vpand l0, t0, t0; \
+	vpand l1, t1, t1; \
+	vpand l2, t2, t2; \
+	vpand l3, t3, t3; \
+	\
+	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+	\
+	vpxor l4, t0, l4; \
+	vmovdqu l4, 4 * 32(l); \
+	vpxor l5, t1, l5; \
+	vmovdqu l5, 5 * 32(l); \
+	vpxor l6, t2, l6; \
+	vmovdqu l6, 6 * 32(l); \
+	vpxor l7, t3, l7; \
+	vmovdqu l7, 7 * 32(l); \
+	\
+	/* \
+	 * t2 = krr; \
+	 * t2 |= rr; \
+	 * rl ^= t2; \
+	 */ \
+	\
+	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
+	vpbroadcastb t0##_x, t3; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t2; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t1; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t0; \
+	\
+	vpor 4 * 32(r), t0, t0; \
+	vpor 5 * 32(r), t1, t1; \
+	vpor 6 * 32(r), t2, t2; \
+	vpor 7 * 32(r), t3, t3; \
+	\
+	vpxor 0 * 32(r), t0, t0; \
+	vpxor 1 * 32(r), t1, t1; \
+	vpxor 2 * 32(r), t2, t2; \
+	vpxor 3 * 32(r), t3, t3; \
+	vmovdqu t0, 0 * 32(r); \
+	vmovdqu t1, 1 * 32(r); \
+	vmovdqu t2, 2 * 32(r); \
+	vmovdqu t3, 3 * 32(r); \
+	\
+	/* \
+	 * t2 = krl; \
+	 * t2 &= rl; \
+	 * rr ^= rol32(t2, 1); \
+	 */ \
+	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
+	vpbroadcastb t0##_x, t3; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t2; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t1; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t0; \
+	\
+	vpand 0 * 32(r), t0, t0; \
+	vpand 1 * 32(r), t1, t1; \
+	vpand 2 * 32(r), t2, t2; \
+	vpand 3 * 32(r), t3, t3; \
+	\
+	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+	\
+	vpxor 4 * 32(r), t0, t0; \
+	vpxor 5 * 32(r), t1, t1; \
+	vpxor 6 * 32(r), t2, t2; \
+	vpxor 7 * 32(r), t3, t3; \
+	vmovdqu t0, 4 * 32(r); \
+	vmovdqu t1, 5 * 32(r); \
+	vmovdqu t2, 6 * 32(r); \
+	vmovdqu t3, 7 * 32(r); \
+	\
+	/* \
+	 * t0 = klr; \
+	 * t0 |= lr; \
+	 * ll ^= t0; \
+	 */ \
+	\
+	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
+	vpbroadcastb t0##_x, t3; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t2; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t1; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t0; \
+	\
+	vpor l4, t0, t0; \
+	vpor l5, t1, t1; \
+	vpor l6, t2, t2; \
+	vpor l7, t3, t3; \
+	\
+	vpxor l0, t0, l0; \
+	vmovdqu l0, 0 * 32(l); \
+	vpxor l1, t1, l1; \
+	vmovdqu l1, 1 * 32(l); \
+	vpxor l2, t2, l2; \
+	vmovdqu l2, 2 * 32(l); \
+	vpxor l3, t3, l3; \
+	vmovdqu l3, 3 * 32(l);
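
The interleaved comment fragments above spell out the scalar FL/FL⁻¹ layer that fls32 applies to the byte-sliced halves. Collected in one place, the four steps look as follows (a scalar sketch assuming the conventional Camellia FL definition; function and parameter names are hypothetical):

#include <stdint.h>

static uint32_t rol32_1(uint32_t x)
{
	return (x << 1) | (x >> 31);
}

/* Scalar model of the fls32 layer, as read from the in-line comments:
 * FL on the (ll, lr) pair and FL^-1 on the (rl, rr) pair, using the
 * four 32-bit subkeys kll/klr/krl/krr. */
static void camellia_fl_layer(uint32_t *ll, uint32_t *lr,
			      uint32_t *rl, uint32_t *rr,
			      uint32_t kll, uint32_t klr,
			      uint32_t krl, uint32_t krr)
{
	*lr ^= rol32_1(kll & *ll);	/* t0 = kll & ll; lr ^= rol32(t0, 1) */
	*rl ^= krr | *rr;		/* t2 = krr | rr; rl ^= t2           */
	*rr ^= rol32_1(krl & *rl);	/* t2 = krl & rl; rr ^= rol32(t2, 1) */
	*ll ^= klr | *lr;		/* t0 = klr | lr; ll ^= t0           */
}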
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+	vpunpckhdq x1, x0, t2; \
+	vpunpckldq x1, x0, x0; \
+	\
+	vpunpckldq x3, x2, t1; \
+	vpunpckhdq x3, x2, x2; \
+	\
+	vpunpckhqdq t1, x0, x1; \
+	vpunpcklqdq t1, x0, x0; \
+	\
+	vpunpckhqdq x2, t2, x3; \
+	vpunpcklqdq x2, t2, x2;
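
transpose_4x4 is a standard 4×4 transpose of 32-bit elements built from dword and qword unpacks; per 128-bit lane, the four input vectors are the rows of a 4×4 matrix and the outputs are its columns. The scalar equivalent (illustrative):

#include <stdint.h>

/* Scalar equivalent of transpose_4x4: swap rows and columns of a 4x4
 * matrix of 32-bit words (the vector version does this once per
 * 128-bit lane). */
static void transpose_4x4_model(uint32_t m[4][4])
{
	for (int i = 0; i < 4; i++)
		for (int j = i + 1; j < 4; j++) {
			uint32_t t = m[i][j];
			m[i][j] = m[j][i];
			m[j][i] = t;
		}
}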
+
+#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
+			      a3, b3, c3, d3, st0, st1) \
+	vmovdqu d2, st0; \
+	vmovdqu d3, st1; \
+	transpose_4x4(a0, a1, a2, a3, d2, d3); \
+	transpose_4x4(b0, b1, b2, b3, d2, d3); \
+	vmovdqu st0, d2; \
+	vmovdqu st1, d3; \
+	\
+	vmovdqu a0, st0; \
+	vmovdqu a1, st1; \
+	transpose_4x4(c0, c1, c2, c3, a0, a1); \
+	transpose_4x4(d0, d1, d2, d3, a0, a1); \
+	\
+	vbroadcasti128 .Lshufb_16x16b, a0; \
+	vmovdqu st1, a1; \
+	vpshufb a0, a2, a2; \
+	vpshufb a0, a3, a3; \
+	vpshufb a0, b0, b0; \
+	vpshufb a0, b1, b1; \
+	vpshufb a0, b2, b2; \
+	vpshufb a0, b3, b3; \
+	vpshufb a0, a1, a1; \
+	vpshufb a0, c0, c0; \
+	vpshufb a0, c1, c1; \
+	vpshufb a0, c2, c2; \
+	vpshufb a0, c3, c3; \
+	vpshufb a0, d0, d0; \
+	vpshufb a0, d1, d1; \
+	vpshufb a0, d2, d2; \
+	vpshufb a0, d3, d3; \
+	vmovdqu d3, st1; \
+	vmovdqu st0, d3; \
+	vpshufb a0, d3, a0; \
+	vmovdqu d2, st0; \
+	\
+	transpose_4x4(a0, b0, c0, d0, d2, d3); \
+	transpose_4x4(a1, b1, c1, d1, d2, d3); \
+	vmovdqu st0, d2; \
+	vmovdqu st1, d3; \
+	\
+	vmovdqu b0, st0; \
+	vmovdqu b1, st1; \
+	transpose_4x4(a2, b2, c2, d2, b0, b1); \
+	transpose_4x4(a3, b3, c3, d3, b0, b1); \
+	vmovdqu st0, b0; \
+	vmovdqu st1, b1; \
+	/* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		     y6, y7, rio, key) \
+	vpbroadcastq key, x0; \
+	vpshufb .Lpack_bswap, x0, x0; \
+	\
+	vpxor 0 * 32(rio), x0, y7; \
+	vpxor 1 * 32(rio), x0, y6; \
+	vpxor 2 * 32(rio), x0, y5; \
+	vpxor 3 * 32(rio), x0, y4; \
+	vpxor 4 * 32(rio), x0, y3; \
+	vpxor 5 * 32(rio), x0, y2; \
+	vpxor 6 * 32(rio), x0, y1; \
+	vpxor 7 * 32(rio), x0, y0; \
+	vpxor 8 * 32(rio), x0, x7; \
+	vpxor 9 * 32(rio), x0, x6; \
+	vpxor 10 * 32(rio), x0, x5; \
+	vpxor 11 * 32(rio), x0, x4; \
+	vpxor 12 * 32(rio), x0, x3; \
+	vpxor 13 * 32(rio), x0, x2; \
+	vpxor 14 * 32(rio), x0, x1; \
+	vpxor 15 * 32(rio), x0, x0;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd) \
+	byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
+			      y4, y5, y6, y7, (mem_ab), (mem_cd)); \
+	\
+	vmovdqu x0, 0 * 32(mem_ab); \
+	vmovdqu x1, 1 * 32(mem_ab); \
+	vmovdqu x2, 2 * 32(mem_ab); \
+	vmovdqu x3, 3 * 32(mem_ab); \
+	vmovdqu x4, 4 * 32(mem_ab); \
+	vmovdqu x5, 5 * 32(mem_ab); \
+	vmovdqu x6, 6 * 32(mem_ab); \
+	vmovdqu x7, 7 * 32(mem_ab); \
+	vmovdqu y0, 0 * 32(mem_cd); \
+	vmovdqu y1, 1 * 32(mem_cd); \
+	vmovdqu y2, 2 * 32(mem_cd); \
+	vmovdqu y3, 3 * 32(mem_cd); \
+	vmovdqu y4, 4 * 32(mem_cd); \
+	vmovdqu y5, 5 * 32(mem_cd); \
+	vmovdqu y6, 6 * 32(mem_cd); \
+	vmovdqu y7, 7 * 32(mem_cd);
+
+/* de-byteslice, apply post-whitening and store blocks */
+#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
+	byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
+			      y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
+	\
+	vmovdqu x0, stack_tmp0; \
+	\
+	vpbroadcastq key, x0; \
+	vpshufb .Lpack_bswap, x0, x0; \
+	\
+	vpxor x0, y7, y7; \
+	vpxor x0, y6, y6; \
+	vpxor x0, y5, y5; \
+	vpxor x0, y4, y4; \
+	vpxor x0, y3, y3; \
+	vpxor x0, y2, y2; \
+	vpxor x0, y1, y1; \
+	vpxor x0, y0, y0; \
+	vpxor x0, x7, x7; \
+	vpxor x0, x6, x6; \
+	vpxor x0, x5, x5; \
+	vpxor x0, x4, x4; \
+	vpxor x0, x3, x3; \
+	vpxor x0, x2, x2; \
+	vpxor x0, x1, x1; \
+	vpxor stack_tmp0, x0, x0;
+
+#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		     y6, y7, rio) \
+	vmovdqu x0, 0 * 32(rio); \
+	vmovdqu x1, 1 * 32(rio); \
+	vmovdqu x2, 2 * 32(rio); \
+	vmovdqu x3, 3 * 32(rio); \
+	vmovdqu x4, 4 * 32(rio); \
+	vmovdqu x5, 5 * 32(rio); \
+	vmovdqu x6, 6 * 32(rio); \
+	vmovdqu x7, 7 * 32(rio); \
+	vmovdqu y0, 8 * 32(rio); \
+	vmovdqu y1, 9 * 32(rio); \
+	vmovdqu y2, 10 * 32(rio); \
+	vmovdqu y3, 11 * 32(rio); \
+	vmovdqu y4, 12 * 32(rio); \
+	vmovdqu y5, 13 * 32(rio); \
+	vmovdqu y6, 14 * 32(rio); \
+	vmovdqu y7, 15 * 32(rio);
+
+.data
+.align 32
+
+#define SHUFB_BYTES(idx) \
+	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+
+.Lshufb_16x16b:
+	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+.Lpack_bswap:
+	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For XTS mode */
+.Lxts_gf128mul_and_shl1_mask_0:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+.Lxts_gf128mul_and_shl1_mask_1:
+	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox1, sbox2, sbox3:
+ *   swap_bitendianness(
+ *       isom_map_camellia_to_aes(
+ *           camellia_f(
+ *               swap_bitendianness(in)
+ *           )
+ *       )
+ *   )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s1:
+	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
+	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
+.Lpre_tf_hi_s1:
+	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
+	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox4:
+ *   swap_bitendianness(
+ *       isom_map_camellia_to_aes(
+ *           camellia_f(
+ *               swap_bitendianness(in <<< 1)
+ *           )
+ *       )
+ *   )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s4:
+	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
+	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
+.Lpre_tf_hi_s4:
+	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
+	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox1, sbox4:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  )
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s1:
+	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
+	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
+.Lpost_tf_hi_s1:
+	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
+	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox2:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  ) <<< 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s2:
+	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
+	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
+.Lpost_tf_hi_s2:
+	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
+	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox3:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  ) >>> 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s3:
+	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
+	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
+.Lpost_tf_hi_s3:
+	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
+	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+.align 4
+/* 4-bit mask */
+.L0f0f0f0f:
+	.long 0x0f0f0f0f
+
+.text
+
+.align 8
+__camellia_enc_blk32:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rax: temporary storage, 512 bytes
+	 *	%ymm0..%ymm15: 32 plaintext blocks
+	 * output:
+	 *	%ymm0..%ymm15: 32 encrypted blocks, order swapped:
+	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+	 */
+
+	leaq 8 * 32(%rax), %rcx;
+
+	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		      %ymm15, %rax, %rcx);
+
+	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 0);
+
+	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+	      %ymm15,
+	      ((key_table + (8) * 8) + 0)(CTX),
+	      ((key_table + (8) * 8) + 4)(CTX),
+	      ((key_table + (8) * 8) + 8)(CTX),
+	      ((key_table + (8) * 8) + 12)(CTX));
+
+	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 8);
+
+	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+	      %ymm15,
+	      ((key_table + (16) * 8) + 0)(CTX),
+	      ((key_table + (16) * 8) + 4)(CTX),
+	      ((key_table + (16) * 8) + 8)(CTX),
+	      ((key_table + (16) * 8) + 12)(CTX));
+
+	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 16);
+
+	movl $24, %r8d;
+	cmpl $16, key_length(CTX);
+	jne .Lenc_max32;
+
+.Lenc_done:
+	/* load CD for output */
+	vmovdqu 0 * 32(%rcx), %ymm8;
+	vmovdqu 1 * 32(%rcx), %ymm9;
+	vmovdqu 2 * 32(%rcx), %ymm10;
+	vmovdqu 3 * 32(%rcx), %ymm11;
+	vmovdqu 4 * 32(%rcx), %ymm12;
+	vmovdqu 5 * 32(%rcx), %ymm13;
+	vmovdqu 6 * 32(%rcx), %ymm14;
+	vmovdqu 7 * 32(%rcx), %ymm15;
+
+	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		    %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
+
+	ret;
+
+.align 8
+.Lenc_max32:
+	movl $32, %r8d;
+
+	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+	      %ymm15,
+	      ((key_table + (24) * 8) + 0)(CTX),
+	      ((key_table + (24) * 8) + 4)(CTX),
+	      ((key_table + (24) * 8) + 8)(CTX),
+	      ((key_table + (24) * 8) + 12)(CTX));
+
+	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 24);
+
+	jmp .Lenc_done;
+ENDPROC(__camellia_enc_blk32)
+
+.align 8
+__camellia_dec_blk32:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rax: temporary storage, 512 bytes
+	 *	%r8d: 24 for 16 byte key, 32 for larger
+	 *	%ymm0..%ymm15: 32 encrypted blocks
+	 * output:
+	 *	%ymm0..%ymm15: 32 plaintext blocks, order swapped:
+	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+	 */
+
+	leaq 8 * 32(%rax), %rcx;
+
+	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		      %ymm15, %rax, %rcx);
+
+	cmpl $32, %r8d;
+	je .Ldec_max32;
+
+.Ldec_max24:
+	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 16);
+
+	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+	      %ymm15,
+	      ((key_table + (16) * 8) + 8)(CTX),
+	      ((key_table + (16) * 8) + 12)(CTX),
+	      ((key_table + (16) * 8) + 0)(CTX),
+	      ((key_table + (16) * 8) + 4)(CTX));
+
+	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 8);
+
+	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+	      %ymm15,
+	      ((key_table + (8) * 8) + 8)(CTX),
+	      ((key_table + (8) * 8) + 12)(CTX),
+	      ((key_table + (8) * 8) + 0)(CTX),
+	      ((key_table + (8) * 8) + 4)(CTX));
+
+	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 0);
+
+	/* load CD for output */
+	vmovdqu 0 * 32(%rcx), %ymm8;
+	vmovdqu 1 * 32(%rcx), %ymm9;
+	vmovdqu 2 * 32(%rcx), %ymm10;
+	vmovdqu 3 * 32(%rcx), %ymm11;
+	vmovdqu 4 * 32(%rcx), %ymm12;
+	vmovdqu 5 * 32(%rcx), %ymm13;
+	vmovdqu 6 * 32(%rcx), %ymm14;
+	vmovdqu 7 * 32(%rcx), %ymm15;
+
+	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		    %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
+
+	ret;
+
+.align 8
+.Ldec_max32:
+	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 24);
+
+	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+	      %ymm15,
+	      ((key_table + (24) * 8) + 8)(CTX),
+	      ((key_table + (24) * 8) + 12)(CTX),
+	      ((key_table + (24) * 8) + 0)(CTX),
+	      ((key_table + (24) * 8) + 4)(CTX));
+
+	jmp .Ldec_max24;
+ENDPROC(__camellia_dec_blk32)
+
+ENTRY(camellia_ecb_enc_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 */
+
+	vzeroupper;
+
+	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rdx, (key_table)(CTX));
+
+	/* now dst can be used as temporary buffer (even in src == dst case) */
+	movq	%rsi, %rax;
+
+	call __camellia_enc_blk32;
+
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(camellia_ecb_enc_32way)
+
+ENTRY(camellia_ecb_dec_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 */
+
+	vzeroupper;
+
+	cmpl $16, key_length(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d; /* max */
+
+	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));
+
+	/* now dst can be used as temporary buffer (even in src == dst case) */
+	movq	%rsi, %rax;
+
+	call __camellia_dec_blk32;
+
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(camellia_ecb_dec_32way)
+
+ENTRY(camellia_cbc_dec_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 */
+
+	vzeroupper;
+
+	cmpl $16, key_length(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d; /* max */
+
+	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));
+
+	movq %rsp, %r10;
+	cmpq %rsi, %rdx;
+	je .Lcbc_dec_use_stack;
+
+	/* dst can be used as temporary storage, src is not overwritten. */
+	movq %rsi, %rax;
+	jmp .Lcbc_dec_continue;
+
+.Lcbc_dec_use_stack:
+	/*
+	 * dst still in-use (because dst == src), so use stack for temporary
+	 * storage.
+	 */
+	subq $(16 * 32), %rsp;
+	movq %rsp, %rax;
+
+.Lcbc_dec_continue:
+	call __camellia_dec_blk32;
+
+	vmovdqu %ymm7, (%rax);
+	vpxor %ymm7, %ymm7, %ymm7;
+	vinserti128 $1, (%rdx), %ymm7, %ymm7;
+	vpxor (%rax), %ymm7, %ymm7;
+	movq %r10, %rsp;
+	vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
+	vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
+	vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
+	vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
+	vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
+	vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
+	vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
+	vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
+	vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
+	vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
+	vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
+	vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
+	vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
+	vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
+	vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(camellia_cbc_dec_32way)
+
+#define inc_le128(x, minus_one, tmp) \
+	vpcmpeqq minus_one, x, tmp; \
+	vpsubq minus_one, x, x; \
+	vpslldq $8, tmp, tmp; \
+	vpsubq tmp, x, x;
+
+#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
+	vpcmpeqq minus_one, x, tmp1; \
+	vpcmpeqq minus_two, x, tmp2; \
+	vpsubq minus_two, x, x; \
+	vpor tmp2, tmp1, tmp1; \
+	vpslldq $8, tmp1, tmp1; \
+	vpsubq tmp1, x, x;
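
inc_le128 and add2_le128 advance the 128-bit little-endian CTR counter. SSE/AVX have no 128-bit add, so the carry out of the low qword is detected with vpcmpeqq against -1 (or against -1 and -2 for the two-step version) and then subtracted from the high qword. The scalar arithmetic being emulated, sketched with a hypothetical two-qword counter type:

#include <stdint.h>

struct ctr128 { uint64_t lo, hi; };	/* little-endian 128-bit counter */

/* Scalar model of inc_le128: increment, carrying into the high qword
 * exactly when the low qword wraps (i.e. was all-ones before). */
static void inc_le128_model(struct ctr128 *x)
{
	if (++x->lo == 0)
		x->hi++;
}

/* Scalar model of add2_le128: add 2, carrying when the low qword was
 * -1 or -2, the two cases the vector code matches with vpcmpeqq. */
static void add2_le128_model(struct ctr128 *x)
{
	uint64_t carry = (x->lo >= (uint64_t)-2);

	x->lo += 2;
	x->hi += carry;
}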
+
+ENTRY(camellia_ctr_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 *	%rcx: iv (little endian, 128bit)
+	 */
+
+	vzeroupper;
+
+	movq %rsp, %r10;
+	cmpq %rsi, %rdx;
+	je .Lctr_use_stack;
+
+	/* dst can be used as temporary storage, src is not overwritten. */
+	movq %rsi, %rax;
+	jmp .Lctr_continue;
+
+.Lctr_use_stack:
+	subq $(16 * 32), %rsp;
+	movq %rsp, %rax;
+
+.Lctr_continue:
+	vpcmpeqd %ymm15, %ymm15, %ymm15;
+	vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
+	vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */
+
+	/* load IV and byteswap */
+	vmovdqu (%rcx), %xmm0;
+	vmovdqa %xmm0, %xmm1;
+	inc_le128(%xmm0, %xmm15, %xmm14);
+	vbroadcasti128 .Lbswap128_mask, %ymm14;
+	vinserti128 $1, %xmm0, %ymm1, %ymm0;
+	vpshufb %ymm14, %ymm0, %ymm13;
+	vmovdqu %ymm13, 15 * 32(%rax);
+
+	/* construct IVs */
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
+	vpshufb %ymm14, %ymm0, %ymm13;
+	vmovdqu %ymm13, 14 * 32(%rax);
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm13;
+	vmovdqu %ymm13, 13 * 32(%rax);
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm13;
+	vmovdqu %ymm13, 12 * 32(%rax);
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm13;
+	vmovdqu %ymm13, 11 * 32(%rax);
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm10;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm9;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm8;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm7;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm6;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm5;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm4;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm3;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm2;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm1;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vextracti128 $1, %ymm0, %xmm13;
+	vpshufb %ymm14, %ymm0, %ymm0;
+	inc_le128(%xmm13, %xmm15, %xmm14);
+	vmovdqu %xmm13, (%rcx);
+
+	/* inpack32_pre: */
+	vpbroadcastq (key_table)(CTX), %ymm15;
+	vpshufb .Lpack_bswap, %ymm15, %ymm15;
+	vpxor %ymm0, %ymm15, %ymm0;
+	vpxor %ymm1, %ymm15, %ymm1;
+	vpxor %ymm2, %ymm15, %ymm2;
+	vpxor %ymm3, %ymm15, %ymm3;
+	vpxor %ymm4, %ymm15, %ymm4;
+	vpxor %ymm5, %ymm15, %ymm5;
+	vpxor %ymm6, %ymm15, %ymm6;
+	vpxor %ymm7, %ymm15, %ymm7;
+	vpxor %ymm8, %ymm15, %ymm8;
+	vpxor %ymm9, %ymm15, %ymm9;
+	vpxor %ymm10, %ymm15, %ymm10;
+	vpxor 11 * 32(%rax), %ymm15, %ymm11;
+	vpxor 12 * 32(%rax), %ymm15, %ymm12;
+	vpxor 13 * 32(%rax), %ymm15, %ymm13;
+	vpxor 14 * 32(%rax), %ymm15, %ymm14;
+	vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+	call __camellia_enc_blk32;
+
+	movq %r10, %rsp;
+
+	vpxor 0 * 32(%rdx), %ymm7, %ymm7;
+	vpxor 1 * 32(%rdx), %ymm6, %ymm6;
+	vpxor 2 * 32(%rdx), %ymm5, %ymm5;
+	vpxor 3 * 32(%rdx), %ymm4, %ymm4;
+	vpxor 4 * 32(%rdx), %ymm3, %ymm3;
+	vpxor 5 * 32(%rdx), %ymm2, %ymm2;
+	vpxor 6 * 32(%rdx), %ymm1, %ymm1;
+	vpxor 7 * 32(%rdx), %ymm0, %ymm0;
+	vpxor 8 * 32(%rdx), %ymm15, %ymm15;
+	vpxor 9 * 32(%rdx), %ymm14, %ymm14;
+	vpxor 10 * 32(%rdx), %ymm13, %ymm13;
+	vpxor 11 * 32(%rdx), %ymm12, %ymm12;
+	vpxor 12 * 32(%rdx), %ymm11, %ymm11;
+	vpxor 13 * 32(%rdx), %ymm10, %ymm10;
+	vpxor 14 * 32(%rdx), %ymm9, %ymm9;
+	vpxor 15 * 32(%rdx), %ymm8, %ymm8;
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(camellia_ctr_32way)
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+	vpsrad $31, iv, tmp; \
+	vpaddq iv, iv, iv; \
+	vpshufd $0x13, tmp, tmp; \
+	vpand mask, tmp, tmp; \
+	vpxor tmp, iv, iv;
+
+#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
+	vpsrad $31, iv, tmp0; \
+	vpaddq iv, iv, tmp1; \
+	vpsllq $2, iv, iv; \
+	vpshufd $0x13, tmp0, tmp0; \
+	vpsrad $31, tmp1, tmp1; \
+	vpand mask2, tmp0, tmp0; \
+	vpshufd $0x13, tmp1, tmp1; \
+	vpxor tmp0, iv, iv; \
+	vpand mask1, tmp1, tmp1; \
+	vpxor tmp1, iv, iv;
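
gf128mul_x2_ble doubles the tweak twice in one step: in the 32-way code each ymm lane holds two consecutive tweaks, so the sequence advances by α² per lane. Shifting left by two can expel two bits, and each is reduced separately: bit 127 by 0x87·x = 0x10e (the .Lxts_gf128mul_and_shl1_mask_1 constant) and bit 126 by 0x87. A scalar sketch, reusing the hypothetical lo/hi layout from the earlier gf128mul_x_ble sketch:

#include <stdint.h>

struct tweak128 { uint64_t lo, hi; };

/* Scalar model of gf128mul_x2_ble: multiply by x twice in one step.
 * The bit at position 127 reduces by x * 0x87 = 0x10e, the bit at
 * position 126 by 0x87. */
static void gf128mul_x2_ble_model(struct tweak128 *t)
{
	uint64_t out = t->hi >> 62;	/* the two bits leaving the block */

	t->hi = (t->hi << 2) | (t->lo >> 62);
	t->lo <<= 2;
	if (out & 2)
		t->lo ^= 0x10e;
	if (out & 1)
		t->lo ^= 0x87;
}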
+
+.align 8
+camellia_xts_crypt_32way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 *	%r8: index for input whitening key
+	 *	%r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
+	 */
+
+	vzeroupper;
+
+	subq $(16 * 32), %rsp;
+	movq %rsp, %rax;
+
+	vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;
+
+	/* load IV and construct second IV */
+	vmovdqu (%rcx), %xmm0;
+	vmovdqa %xmm0, %xmm15;
+	gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
+	vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
+	vinserti128 $1, %xmm0, %ymm15, %ymm0;
+	vpxor 0 * 32(%rdx), %ymm0, %ymm15;
+	vmovdqu %ymm15, 15 * 32(%rax);
+	vmovdqu %ymm0, 0 * 32(%rsi);
+
+	/* construct IVs */
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 1 * 32(%rdx), %ymm0, %ymm15;
+	vmovdqu %ymm15, 14 * 32(%rax);
+	vmovdqu %ymm0, 1 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 2 * 32(%rdx), %ymm0, %ymm15;
+	vmovdqu %ymm15, 13 * 32(%rax);
+	vmovdqu %ymm0, 2 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 3 * 32(%rdx), %ymm0, %ymm15;
+	vmovdqu %ymm15, 12 * 32(%rax);
+	vmovdqu %ymm0, 3 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 4 * 32(%rdx), %ymm0, %ymm11;
+	vmovdqu %ymm0, 4 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 5 * 32(%rdx), %ymm0, %ymm10;
+	vmovdqu %ymm0, 5 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 6 * 32(%rdx), %ymm0, %ymm9;
+	vmovdqu %ymm0, 6 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 7 * 32(%rdx), %ymm0, %ymm8;
+	vmovdqu %ymm0, 7 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 8 * 32(%rdx), %ymm0, %ymm7;
+	vmovdqu %ymm0, 8 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 9 * 32(%rdx), %ymm0, %ymm6;
+	vmovdqu %ymm0, 9 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 10 * 32(%rdx), %ymm0, %ymm5;
+	vmovdqu %ymm0, 10 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 11 * 32(%rdx), %ymm0, %ymm4;
+	vmovdqu %ymm0, 11 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 12 * 32(%rdx), %ymm0, %ymm3;
+	vmovdqu %ymm0, 12 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 13 * 32(%rdx), %ymm0, %ymm2;
+	vmovdqu %ymm0, 13 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 14 * 32(%rdx), %ymm0, %ymm1;
+	vmovdqu %ymm0, 14 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 15 * 32(%rdx), %ymm0, %ymm15;
+	vmovdqu %ymm15, 0 * 32(%rax);
+	vmovdqu %ymm0, 15 * 32(%rsi);
+
+	vextracti128 $1, %ymm0, %xmm0;
+	gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
+	vmovdqu %xmm0, (%rcx);
+
+	/* inpack32_pre: */
+	vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
+	vpshufb .Lpack_bswap, %ymm15, %ymm15;
+	vpxor 0 * 32(%rax), %ymm15, %ymm0;
+	vpxor %ymm1, %ymm15, %ymm1;
+	vpxor %ymm2, %ymm15, %ymm2;
+	vpxor %ymm3, %ymm15, %ymm3;
+	vpxor %ymm4, %ymm15, %ymm4;
+	vpxor %ymm5, %ymm15, %ymm5;
+	vpxor %ymm6, %ymm15, %ymm6;
+	vpxor %ymm7, %ymm15, %ymm7;
+	vpxor %ymm8, %ymm15, %ymm8;
+	vpxor %ymm9, %ymm15, %ymm9;
+	vpxor %ymm10, %ymm15, %ymm10;
+	vpxor %ymm11, %ymm15, %ymm11;
+	vpxor 12 * 32(%rax), %ymm15, %ymm12;
+	vpxor 13 * 32(%rax), %ymm15, %ymm13;
+	vpxor 14 * 32(%rax), %ymm15, %ymm14;
+	vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+	call *%r9;
+
+	addq $(16 * 32), %rsp;
+
+	vpxor 0 * 32(%rsi), %ymm7, %ymm7;
+	vpxor 1 * 32(%rsi), %ymm6, %ymm6;
+	vpxor 2 * 32(%rsi), %ymm5, %ymm5;
+	vpxor 3 * 32(%rsi), %ymm4, %ymm4;
+	vpxor 4 * 32(%rsi), %ymm3, %ymm3;
+	vpxor 5 * 32(%rsi), %ymm2, %ymm2;
+	vpxor 6 * 32(%rsi), %ymm1, %ymm1;
+	vpxor 7 * 32(%rsi), %ymm0, %ymm0;
+	vpxor 8 * 32(%rsi), %ymm15, %ymm15;
+	vpxor 9 * 32(%rsi), %ymm14, %ymm14;
+	vpxor 10 * 32(%rsi), %ymm13, %ymm13;
+	vpxor 11 * 32(%rsi), %ymm12, %ymm12;
+	vpxor 12 * 32(%rsi), %ymm11, %ymm11;
+	vpxor 13 * 32(%rsi), %ymm10, %ymm10;
+	vpxor 14 * 32(%rsi), %ymm9, %ymm9;
+	vpxor 15 * 32(%rsi), %ymm8, %ymm8;
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(camellia_xts_crypt_32way)
+
+ENTRY(camellia_xts_enc_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	xorl %r8d, %r8d; /* input whitening key, 0 for enc */
+
+	leaq __camellia_enc_blk32, %r9;
+
+	jmp camellia_xts_crypt_32way;
+ENDPROC(camellia_xts_enc_32way)
+
+ENTRY(camellia_xts_dec_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	cmpl $16, key_length(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d;  /* input whitening key, last for dec */
+
+	leaq __camellia_dec_blk32, %r9;
+
+	jmp camellia_xts_crypt_32way;
+ENDPROC(camellia_xts_dec_32way)

+ 586 - 0
arch/x86/crypto/camellia_aesni_avx2_glue.c

@@ -0,0 +1,586 @@
+/*
+ * Glue Code for x86_64/AVX2/AES-NI assembler optimized version of Camellia
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/ctr.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <asm/crypto/camellia.h>
+#include <asm/crypto/ablk_helper.h>
+#include <asm/crypto/glue_helper.h>
+
+#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
+#define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32
+
+/* 32-way AVX2/AES-NI parallel cipher functions */
+asmlinkage void camellia_ecb_enc_32way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void camellia_ecb_dec_32way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+
+asmlinkage void camellia_cbc_dec_32way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void camellia_ctr_32way(struct camellia_ctx *ctx, u8 *dst,
+				   const u8 *src, le128 *iv);
+
+asmlinkage void camellia_xts_enc_32way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src, le128 *iv);
+asmlinkage void camellia_xts_dec_32way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src, le128 *iv);
+
+static const struct common_glue_ctx camellia_enc = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_32way) }
+	}, {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) }
+	}, {
+		.num_blocks = 2,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
+	} }
+};
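
The common_glue_ctx tables order their entries widest first; the glue layer walks them and uses the largest num_blocks that still fits the data left in the current scatterlist segment, so bulk data takes the 32-way AVX2 path and the tail falls through to the 2-way and single-block routines. A hypothetical, self-contained model of that dispatch (not the kernel's glue_ecb_crypt_128bit, just the shape of it):

#include <stddef.h>
#include <stdint.h>

typedef void (*ecb_fn_t)(void *ctx, uint8_t *dst, const uint8_t *src);

struct ecb_entry { unsigned int num_blocks; ecb_fn_t fn; };

/* Hypothetical model of fn_u dispatch: entries ordered 32, 16, 2, 1;
 * each is applied while enough whole blocks remain, so the final
 * 1-block entry always finishes the tail. */
static void ecb_dispatch(const struct ecb_entry *tbl, size_t n,
			 void *ctx, uint8_t *dst, const uint8_t *src,
			 size_t nblocks, size_t bsize)
{
	for (size_t i = 0; i < n; i++)
		while (nblocks >= tbl[i].num_blocks) {
			tbl[i].fn(ctx, dst, src);
			dst += (size_t)tbl[i].num_blocks * bsize;
			src += (size_t)tbl[i].num_blocks * bsize;
			nblocks -= tbl[i].num_blocks;
		}
}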
+
+static const struct common_glue_ctx camellia_ctr = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_32way) }
+	}, {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) }
+	}, {
+		.num_blocks = 2,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
+	} }
+};
+
+static const struct common_glue_ctx camellia_enc_xts = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_32way) }
+	}, {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) }
+	} }
+};
+
+static const struct common_glue_ctx camellia_dec = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_32way) }
+	}, {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) }
+	}, {
+		.num_blocks = 2,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
+	} }
+};
+
+static const struct common_glue_ctx camellia_dec_cbc = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_32way) }
+	}, {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) }
+	}, {
+		.num_blocks = 2,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
+	} }
+};
+
+static const struct common_glue_ctx camellia_dec_xts = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_32way) }
+	}, {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) }
+	} }
+};
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc,
+				       dst, src, nbytes);
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src,
+				       nbytes);
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);
+}
+
+static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	return glue_fpu_begin(CAMELLIA_BLOCK_SIZE,
+			      CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled,
+			      nbytes);
+}
+
+static inline void camellia_fpu_end(bool fpu_enabled)
+{
+	glue_fpu_end(fpu_enabled);
+}
+
+static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
+			   unsigned int key_len)
+{
+	return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len,
+				 &tfm->crt_flags);
+}
+
+struct crypt_priv {
+	struct camellia_ctx *ctx;
+	bool fpu_enabled;
+};
+
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) {
+		camellia_ecb_enc_32way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS;
+	}
+
+	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+		camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		camellia_enc_blk(ctx->ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) {
+		camellia_ecb_dec_32way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS;
+	}
+
+	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+		camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		camellia_dec_blk(ctx->ctx, srcdst, srcdst);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->camellia_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	camellia_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->camellia_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	camellia_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(camellia_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(camellia_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static struct crypto_alg cmll_algs[10] = { {
+	.cra_name		= "__ecb-camellia-aesni-avx2",
+	.cra_driver_name	= "__driver-ecb-camellia-aesni-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.setkey		= camellia_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__cbc-camellia-aesni-avx2",
+	.cra_driver_name	= "__driver-cbc-camellia-aesni-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.setkey		= camellia_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__ctr-camellia-aesni-avx2",
+	.cra_driver_name	= "__driver-ctr-camellia-aesni-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct camellia_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= camellia_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+}, {
+	.cra_name		= "__lrw-camellia-aesni-avx2",
+	.cra_driver_name	= "__driver-lrw-camellia-aesni-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_exit		= lrw_camellia_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE +
+					  CAMELLIA_BLOCK_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE +
+					  CAMELLIA_BLOCK_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= lrw_camellia_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__xts-camellia-aesni-avx2",
+	.cra_driver_name	= "__driver-xts-camellia-aesni-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_xts_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE * 2,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE * 2,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= xts_camellia_setkey,
+			.encrypt	= xts_encrypt,
+			.decrypt	= xts_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ecb(camellia)",
+	.cra_driver_name	= "ecb-camellia-aesni-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(camellia)",
+	.cra_driver_name	= "cbc-camellia-aesni-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= __ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(camellia)",
+	.cra_driver_name	= "ctr-camellia-aesni-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt,
+			.geniv		= "chainiv",
+		},
+	},
+}, {
+	.cra_name		= "lrw(camellia)",
+	.cra_driver_name	= "lrw-camellia-aesni-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE +
+					  CAMELLIA_BLOCK_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE +
+					  CAMELLIA_BLOCK_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "xts(camellia)",
+	.cra_driver_name	= "xts-camellia-aesni-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE * 2,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE * 2,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+} };
+
+static int __init camellia_aesni_init(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) {
+		pr_info("AVX2 or AES-NI instructions are not detected.\n");
+		return -ENODEV;
+	}
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX2 detected but unusable.\n");
+		return -ENODEV;
+	}
+
+	return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
+}
+
+static void __exit camellia_aesni_fini(void)
+{
+	crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
+}
+
+module_init(camellia_aesni_init);
+module_exit(camellia_aesni_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX2 optimized");
+MODULE_ALIAS("camellia");
+MODULE_ALIAS("camellia-asm");

+ 62 - 42
arch/x86/crypto/camellia_aesni_avx_glue.c

@@ -1,7 +1,7 @@
 /*
  * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia
  *
- * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -26,16 +26,44 @@

 #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16

-/* 16-way AES-NI parallel cipher functions */
+/* 16-way parallel cipher functions (avx/aes-ni) */
 asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst,
 				       const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_ecb_enc_16way);
+
 asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst,
 				       const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way);

 asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
 				       const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way);
+
 asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
 				   const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(camellia_ctr_16way);
+
+asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(camellia_xts_enc_16way);
+
+asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(camellia_xts_dec_16way);
+
+void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+				  GLUE_FUNC_CAST(camellia_enc_blk));
+}
+EXPORT_SYMBOL_GPL(camellia_xts_enc);
+
+void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+				  GLUE_FUNC_CAST(camellia_dec_blk));
+}
+EXPORT_SYMBOL_GPL(camellia_xts_dec);

 static const struct common_glue_ctx camellia_enc = {
 	.num_funcs = 3,
@@ -69,6 +97,19 @@ static const struct common_glue_ctx camellia_ctr = {
 	} }
 };

+static const struct common_glue_ctx camellia_enc_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) }
+	} }
+};
+
 static const struct common_glue_ctx camellia_dec = {
 	.num_funcs = 3,
 	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
@@ -101,6 +142,19 @@ static const struct common_glue_ctx camellia_dec_cbc = {
 	} }
 };

+static const struct common_glue_ctx camellia_dec_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) }
+	} }
+};
+
 static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
@@ -261,54 +315,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
-	struct crypt_priv crypt_ctx = {
-		.ctx = &ctx->crypt_ctx,
-		.fpu_enabled = false,
-	};
-	struct xts_crypt_req req = {
-		.tbuf = buf,
-		.tbuflen = sizeof(buf),
-
-		.tweak_ctx = &ctx->tweak_ctx,
-		.tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
-		.crypt_ctx = &crypt_ctx,
-		.crypt_fn = encrypt_callback,
-	};
-	int ret;
-
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-	ret = xts_crypt(desc, dst, src, nbytes, &req);
-	camellia_fpu_end(crypt_ctx.fpu_enabled);
-
-	return ret;
+	return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(camellia_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
 }

 static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
-	struct crypt_priv crypt_ctx = {
-		.ctx = &ctx->crypt_ctx,
-		.fpu_enabled = false,
-	};
-	struct xts_crypt_req req = {
-		.tbuf = buf,
-		.tbuflen = sizeof(buf),
-
-		.tweak_ctx = &ctx->tweak_ctx,
-		.tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
-		.crypt_ctx = &crypt_ctx,
-		.crypt_fn = decrypt_callback,
-	};
-	int ret;
-
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-	ret = xts_crypt(desc, dst, src, nbytes, &req);
-	camellia_fpu_end(crypt_ctx.fpu_enabled);
-
-	return ret;
+	return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(camellia_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
 }

 static struct crypto_alg cmll_algs[10] = { {
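
The camellia_enc_xts and camellia_dec_xts tables above follow the common_glue_ctx convention of listing routines widest-first; the dispatcher added to glue_helper.c in this same series keeps calling the widest routine while a full batch of blocks remains, then falls through to the narrower entries. A worked example with illustrative numbers:

	#include <stdio.h>

	/* Worked example, illustrative numbers only: a 50-block XTS request
	 * against the {16-way, 1-way} function table above. */
	int main(void)
	{
		unsigned int nblocks = 50;

		printf("16-way calls: %u (%u blocks)\n",
		       nblocks / 16, (nblocks / 16) * 16);	/* 3 calls, 48 blocks */
		printf("1-way calls:  %u\n", nblocks % 16);	/* 2 tail blocks */
		return 0;
	}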

+ 47 - 1
arch/x86/crypto/cast6-avx-x86_64-asm_64.S

@@ -4,7 +4,7 @@
  * Copyright (C) 2012 Johannes Goetzfried
  *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
  *
- * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -227,6 +227,8 @@
 .data

 .align 16
+.Lxts_gf128mul_and_shl1_mask:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
 .Lbswap_mask:
 	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 .Lbswap128_mask:
@@ -424,3 +426,47 @@ ENTRY(cast6_ctr_8way)

 	ret;
 ENDPROC(cast6_ctr_8way)
+
+ENTRY(cast6_xts_enc_8way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	movq %rsi, %r11;
+
+	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
+	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
+
+	call __cast6_enc_blk8;
+
+	/* dst <= regs xor IVs(in dst) */
+	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+ENDPROC(cast6_xts_enc_8way)
+
+ENTRY(cast6_xts_dec_8way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	movq %rsi, %r11;
+
+	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
+	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
+
+	call __cast6_dec_blk8;
+
+	/* dst <= regs xor IVs(in dst) */
+	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+ENDPROC(cast6_xts_dec_8way)
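
Both routines above share one trick: load_xts_8way computes the eight per-block tweaks, parks them in dst, and leaves src xor tweak in registers; after the 8-way cipher call, store_xts_8way xors the registers against the tweaks parked in dst. A scalar model of that pairing, under illustrative types (this is a sketch, not kernel code):

	#include <stdint.h>

	typedef struct { uint64_t lo, hi; } u128x;	/* illustrative 128-bit block */

	static u128x xor128(u128x a, u128x b)
	{
		a.lo ^= b.lo;
		a.hi ^= b.hi;
		return a;
	}

	static u128x mulx_ble(u128x t)	/* tweak * x in GF(2^128), LE layout */
	{
		uint64_t carry = t.hi >> 63;

		t.hi = (t.hi << 1) | (t.lo >> 63);
		t.lo = (t.lo << 1) ^ (carry * 0x87);
		return t;
	}

	/* Scalar model of the load_xts_8way / store_xts_8way pairing. */
	static void xts8_model(u128x *dst, const u128x *src, u128x *t,
			       void (*cipher8)(u128x blocks[8]))
	{
		u128x regs[8];
		int i;

		for (i = 0; i < 8; i++) {
			dst[i] = *t;			/* dst <= IVs          */
			regs[i] = xor128(src[i], *t);	/* regs <= src xor IVs */
			*t = mulx_ble(*t);		/* next tweak          */
		}
		cipher8(regs);				/* 8-way enc or dec    */
		for (i = 0; i < 8; i++)
			dst[i] = xor128(regs[i], dst[i]); /* dst <= regs xor IVs */
	}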

+ 51 - 40
arch/x86/crypto/cast6_avx_glue.c

@@ -4,6 +4,8 @@
  * Copyright (C) 2012 Johannes Goetzfried
  *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
  *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -50,6 +52,23 @@ asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst,
 asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src,
 			       le128 *iv);

+asmlinkage void cast6_xts_enc_8way(struct cast6_ctx *ctx, u8 *dst,
+				   const u8 *src, le128 *iv);
+asmlinkage void cast6_xts_dec_8way(struct cast6_ctx *ctx, u8 *dst,
+				   const u8 *src, le128 *iv);
+
+static void cast6_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+				  GLUE_FUNC_CAST(__cast6_encrypt));
+}
+
+static void cast6_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+				  GLUE_FUNC_CAST(__cast6_decrypt));
+}
+
 static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	be128 ctrblk;
@@ -87,6 +106,19 @@ static const struct common_glue_ctx cast6_ctr = {
 	} }
 };

+static const struct common_glue_ctx cast6_enc_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAST6_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc_8way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc) }
+	} }
+};
+
 static const struct common_glue_ctx cast6_dec = {
 	.num_funcs = 2,
 	.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
@@ -113,6 +145,19 @@ static const struct common_glue_ctx cast6_dec_cbc = {
 	} }
 };

+static const struct common_glue_ctx cast6_dec_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAST6_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec_8way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec) }
+	} }
+};
+
 static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
@@ -307,54 +352,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[CAST6_PARALLEL_BLOCKS];
-	struct crypt_priv crypt_ctx = {
-		.ctx = &ctx->crypt_ctx,
-		.fpu_enabled = false,
-	};
-	struct xts_crypt_req req = {
-		.tbuf = buf,
-		.tbuflen = sizeof(buf),
-
-		.tweak_ctx = &ctx->tweak_ctx,
-		.tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt),
-		.crypt_ctx = &crypt_ctx,
-		.crypt_fn = encrypt_callback,
-	};
-	int ret;
-
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-	ret = xts_crypt(desc, dst, src, nbytes, &req);
-	cast6_fpu_end(crypt_ctx.fpu_enabled);
-
-	return ret;
+	return glue_xts_crypt_128bit(&cast6_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(__cast6_encrypt),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
 }

 static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[CAST6_PARALLEL_BLOCKS];
-	struct crypt_priv crypt_ctx = {
-		.ctx = &ctx->crypt_ctx,
-		.fpu_enabled = false,
-	};
-	struct xts_crypt_req req = {
-		.tbuf = buf,
-		.tbuflen = sizeof(buf),
-
-		.tweak_ctx = &ctx->tweak_ctx,
-		.tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt),
-		.crypt_ctx = &crypt_ctx,
-		.crypt_fn = decrypt_callback,
-	};
-	int ret;
-
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-	ret = xts_crypt(desc, dst, src, nbytes, &req);
-	cast6_fpu_end(crypt_ctx.fpu_enabled);
-
-	return ret;
+	return glue_xts_crypt_128bit(&cast6_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(__cast6_encrypt),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
 }

 static struct crypto_alg cast6_algs[10] = { {

+ 3 - 3
arch/x86/crypto/crc32-pclmul_asm.S

@@ -101,9 +101,8 @@
  *      uint crc32_pclmul_le_16(unsigned char const *buffer,
  *	                     size_t len, uint crc32)
  */
-.globl crc32_pclmul_le_16
-.align 4, 0x90
-crc32_pclmul_le_16:/* buffer and buffer size are 16 bytes aligned */
+
+ENTRY(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
 	movdqa  (BUF), %xmm1
 	movdqa  0x10(BUF), %xmm2
 	movdqa  0x20(BUF), %xmm3
@@ -244,3 +243,4 @@ fold_64:
 	pextrd  $0x01, %xmm1, %eax

 	ret
+ENDPROC(crc32_pclmul_le_16)

+ 6 - 4
arch/x86/crypto/crc32c-pcl-intel-asm_64.S

@@ -1,9 +1,10 @@
 /*
  * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
  *
- * The white paper on CRC32C calculations with PCLMULQDQ instruction can be
+ * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
  * downloaded from:
- * http://download.intel.com/design/intarch/papers/323405.pdf
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
  *
  * Copyright (C) 2012 Intel Corporation.
  *
@@ -42,6 +43,7 @@
  * SOFTWARE.
  */

+#include <asm/inst.h>
 #include <linux/linkage.h>

 ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
@@ -225,10 +227,10 @@ LABEL crc_ %i
 	movdqa  (bufp), %xmm0			# 2 consts: K1:K2

 	movq    crc_init, %xmm1			# CRC for block 1
-	pclmulqdq $0x00,%xmm0,%xmm1		# Multiply by K2
+	PCLMULQDQ 0x00,%xmm0,%xmm1		# Multiply by K2

 	movq    crc1, %xmm2			# CRC for block 2
-	pclmulqdq $0x10, %xmm0, %xmm2		# Multiply by K1
+	PCLMULQDQ 0x10, %xmm0, %xmm2		# Multiply by K1

 	pxor    %xmm2,%xmm1
 	movq    %xmm1, %rax

+ 60 - 1
arch/x86/crypto/glue_helper-asm-avx.S

@@ -1,7 +1,7 @@
 /*
  * Shared glue code for 128bit block ciphers, AVX assembler macros
  *
- * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -89,3 +89,62 @@
 	vpxor (6*16)(src), x6, x6; \
 	vpxor (7*16)(src), x7, x7; \
 	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+	vpsrad $31, iv, tmp; \
+	vpaddq iv, iv, iv; \
+	vpshufd $0x13, tmp, tmp; \
+	vpand mask, tmp, tmp; \
+	vpxor tmp, iv, iv;
+
+#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \
+		      t1, xts_gf128mul_and_shl1_mask) \
+	vmovdqa xts_gf128mul_and_shl1_mask, t0; \
+	\
+	/* load IV */ \
+	vmovdqu (iv), tiv; \
+	vpxor (0*16)(src), tiv, x0; \
+	vmovdqu tiv, (0*16)(dst); \
+	\
+	/* construct and store IVs, also xor with source */ \
+	gf128mul_x_ble(tiv, t0, t1); \
+	vpxor (1*16)(src), tiv, x1; \
+	vmovdqu tiv, (1*16)(dst); \
+	\
+	gf128mul_x_ble(tiv, t0, t1); \
+	vpxor (2*16)(src), tiv, x2; \
+	vmovdqu tiv, (2*16)(dst); \
+	\
+	gf128mul_x_ble(tiv, t0, t1); \
+	vpxor (3*16)(src), tiv, x3; \
+	vmovdqu tiv, (3*16)(dst); \
+	\
+	gf128mul_x_ble(tiv, t0, t1); \
+	vpxor (4*16)(src), tiv, x4; \
+	vmovdqu tiv, (4*16)(dst); \
+	\
+	gf128mul_x_ble(tiv, t0, t1); \
+	vpxor (5*16)(src), tiv, x5; \
+	vmovdqu tiv, (5*16)(dst); \
+	\
+	gf128mul_x_ble(tiv, t0, t1); \
+	vpxor (6*16)(src), tiv, x6; \
+	vmovdqu tiv, (6*16)(dst); \
+	\
+	gf128mul_x_ble(tiv, t0, t1); \
+	vpxor (7*16)(src), tiv, x7; \
+	vmovdqu tiv, (7*16)(dst); \
+	\
+	gf128mul_x_ble(tiv, t0, t1); \
+	vmovdqu tiv, (iv);
+
+#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vpxor (0*16)(dst), x0, x0; \
+	vpxor (1*16)(dst), x1, x1; \
+	vpxor (2*16)(dst), x2, x2; \
+	vpxor (3*16)(dst), x3, x3; \
+	vpxor (4*16)(dst), x4, x4; \
+	vpxor (5*16)(dst), x5, x5; \
+	vpxor (6*16)(dst), x6, x6; \
+	vpxor (7*16)(dst), x7, x7; \
+	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

+ 180 - 0
arch/x86/crypto/glue_helper-asm-avx2.S

@@ -0,0 +1,180 @@
+/*
+ * Shared glue code for 128bit block ciphers, AVX2 assembler macros
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vmovdqu (0*32)(src), x0; \
+	vmovdqu (1*32)(src), x1; \
+	vmovdqu (2*32)(src), x2; \
+	vmovdqu (3*32)(src), x3; \
+	vmovdqu (4*32)(src), x4; \
+	vmovdqu (5*32)(src), x5; \
+	vmovdqu (6*32)(src), x6; \
+	vmovdqu (7*32)(src), x7;
+
+#define store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vmovdqu x0, (0*32)(dst); \
+	vmovdqu x1, (1*32)(dst); \
+	vmovdqu x2, (2*32)(dst); \
+	vmovdqu x3, (3*32)(dst); \
+	vmovdqu x4, (4*32)(dst); \
+	vmovdqu x5, (5*32)(dst); \
+	vmovdqu x6, (6*32)(dst); \
+	vmovdqu x7, (7*32)(dst);
+
+#define store_cbc_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7, t0) \
+	vpxor t0, t0, t0; \
+	vinserti128 $1, (src), t0, t0; \
+	vpxor t0, x0, x0; \
+	vpxor (0*32+16)(src), x1, x1; \
+	vpxor (1*32+16)(src), x2, x2; \
+	vpxor (2*32+16)(src), x3, x3; \
+	vpxor (3*32+16)(src), x4, x4; \
+	vpxor (4*32+16)(src), x5, x5; \
+	vpxor (5*32+16)(src), x6, x6; \
+	vpxor (6*32+16)(src), x7, x7; \
+	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define inc_le128(x, minus_one, tmp) \
+	vpcmpeqq minus_one, x, tmp; \
+	vpsubq minus_one, x, x; \
+	vpslldq $8, tmp, tmp; \
+	vpsubq tmp, x, x;
+
+#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
+	vpcmpeqq minus_one, x, tmp1; \
+	vpcmpeqq minus_two, x, tmp2; \
+	vpsubq minus_two, x, x; \
+	vpor tmp2, tmp1, tmp1; \
+	vpslldq $8, tmp1, tmp1; \
+	vpsubq tmp1, x, x;
+
+#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \
+		       t1x, t2, t2x, t3, t3x, t4, t5) \
+	vpcmpeqd t0, t0, t0; \
+	vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \
+	vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */\
+	\
+	/* load IV and byteswap */ \
+	vmovdqu (iv), t2x; \
+	vmovdqa t2x, t3x; \
+	inc_le128(t2x, t0x, t1x); \
+	vbroadcasti128 bswap, t1; \
+	vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \
+	vpshufb t1, t2, x0; \
+	\
+	/* construct IVs */ \
+	add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \
+	vpshufb t1, t2, x1; \
+	add2_le128(t2, t0, t4, t3, t5); \
+	vpshufb t1, t2, x2; \
+	add2_le128(t2, t0, t4, t3, t5); \
+	vpshufb t1, t2, x3; \
+	add2_le128(t2, t0, t4, t3, t5); \
+	vpshufb t1, t2, x4; \
+	add2_le128(t2, t0, t4, t3, t5); \
+	vpshufb t1, t2, x5; \
+	add2_le128(t2, t0, t4, t3, t5); \
+	vpshufb t1, t2, x6; \
+	add2_le128(t2, t0, t4, t3, t5); \
+	vpshufb t1, t2, x7; \
+	vextracti128 $1, t2, t2x; \
+	inc_le128(t2x, t0x, t3x); \
+	vmovdqu t2x, (iv);
+
+#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vpxor (0*32)(src), x0, x0; \
+	vpxor (1*32)(src), x1, x1; \
+	vpxor (2*32)(src), x2, x2; \
+	vpxor (3*32)(src), x3, x3; \
+	vpxor (4*32)(src), x4, x4; \
+	vpxor (5*32)(src), x5, x5; \
+	vpxor (6*32)(src), x6, x6; \
+	vpxor (7*32)(src), x7, x7; \
+	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+	vpsrad $31, iv, tmp; \
+	vpaddq iv, iv, iv; \
+	vpshufd $0x13, tmp, tmp; \
+	vpand mask, tmp, tmp; \
+	vpxor tmp, iv, iv;
+
+#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
+	vpsrad $31, iv, tmp0; \
+	vpaddq iv, iv, tmp1; \
+	vpsllq $2, iv, iv; \
+	vpshufd $0x13, tmp0, tmp0; \
+	vpsrad $31, tmp1, tmp1; \
+	vpand mask2, tmp0, tmp0; \
+	vpshufd $0x13, tmp1, tmp1; \
+	vpxor tmp0, iv, iv; \
+	vpand mask1, tmp1, tmp1; \
+	vpxor tmp1, iv, iv;
+
+#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \
+		       tivx, t0, t0x, t1, t1x, t2, t2x, t3, \
+		       xts_gf128mul_and_shl1_mask_0, \
+		       xts_gf128mul_and_shl1_mask_1) \
+	vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \
+	\
+	/* load IV and construct second IV */ \
+	vmovdqu (iv), tivx; \
+	vmovdqa tivx, t0x; \
+	gf128mul_x_ble(tivx, t1x, t2x); \
+	vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \
+	vinserti128 $1, tivx, t0, tiv; \
+	vpxor (0*32)(src), tiv, x0; \
+	vmovdqu tiv, (0*32)(dst); \
+	\
+	/* construct and store IVs, also xor with source */ \
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+	vpxor (1*32)(src), tiv, x1; \
+	vmovdqu tiv, (1*32)(dst); \
+	\
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+	vpxor (2*32)(src), tiv, x2; \
+	vmovdqu tiv, (2*32)(dst); \
+	\
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+	vpxor (3*32)(src), tiv, x3; \
+	vmovdqu tiv, (3*32)(dst); \
+	\
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+	vpxor (4*32)(src), tiv, x4; \
+	vmovdqu tiv, (4*32)(dst); \
+	\
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+	vpxor (5*32)(src), tiv, x5; \
+	vmovdqu tiv, (5*32)(dst); \
+	\
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+	vpxor (6*32)(src), tiv, x6; \
+	vmovdqu tiv, (6*32)(dst); \
+	\
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+	vpxor (7*32)(src), tiv, x7; \
+	vmovdqu tiv, (7*32)(dst); \
+	\
+	vextracti128 $1, tiv, tivx; \
+	gf128mul_x_ble(tivx, t1x, t2x); \
+	vmovdqu tivx, (iv);
+
+#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vpxor (0*32)(dst), x0, x0; \
+	vpxor (1*32)(dst), x1, x1; \
+	vpxor (2*32)(dst), x2, x2; \
+	vpxor (3*32)(dst), x3, x3; \
+	vpxor (4*32)(dst), x4, x4; \
+	vpxor (5*32)(dst), x5, x5; \
+	vpxor (6*32)(dst), x6, x6; \
+	vpxor (7*32)(dst), x7, x7; \
+	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
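
In these AVX2 variants each ymm register carries two consecutive 128-bit tweaks, so gf128mul_x2_ble has to multiply by x squared per step; that is why a second mask exists: 0x87 times x is 0x10e, which gives the {0x0e, 0x01, ..., 0x02} byte pattern, with the cross-qword carry doubled to 2. The counter macros, by contrast, are plain 128-bit little-endian arithmetic; a scalar equivalent of inc_le128:

	#include <stdint.h>

	/* Scalar equivalent of inc_le128: +1 on a 128-bit little-endian
	 * counter held as two 64-bit halves. add2_le128 is the same idea
	 * with a step of two and a merged carry check. */
	static void inc_le128_scalar(uint64_t *hi, uint64_t *lo)
	{
		if (++(*lo) == 0)	/* carry out of the low qword */
			(*hi)++;
	}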

+ 96 - 1
arch/x86/crypto/glue_helper.c

@@ -1,7 +1,7 @@
 /*
  * Shared glue code for 128bit block ciphers
  *
- * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
  *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
@@ -304,4 +304,99 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
 }
 EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit);

+static unsigned int __glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
+					    void *ctx,
+					    struct blkcipher_desc *desc,
+					    struct blkcipher_walk *walk)
+{
+	const unsigned int bsize = 128 / 8;
+	unsigned int nbytes = walk->nbytes;
+	u128 *src = (u128 *)walk->src.virt.addr;
+	u128 *dst = (u128 *)walk->dst.virt.addr;
+	unsigned int num_blocks, func_bytes;
+	unsigned int i;
+
+	/* Process multi-block batch */
+	for (i = 0; i < gctx->num_funcs; i++) {
+		num_blocks = gctx->funcs[i].num_blocks;
+		func_bytes = bsize * num_blocks;
+
+		if (nbytes >= func_bytes) {
+			do {
+				gctx->funcs[i].fn_u.xts(ctx, dst, src,
+							(le128 *)walk->iv);
+
+				src += num_blocks;
+				dst += num_blocks;
+				nbytes -= func_bytes;
+			} while (nbytes >= func_bytes);
+
+			if (nbytes < bsize)
+				goto done;
+		}
+	}
+
+done:
+	return nbytes;
+}
+
+/* for implementations implementing faster XTS IV generator */
+int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
+			  struct blkcipher_desc *desc, struct scatterlist *dst,
+			  struct scatterlist *src, unsigned int nbytes,
+			  void (*tweak_fn)(void *ctx, u8 *dst, const u8 *src),
+			  void *tweak_ctx, void *crypt_ctx)
+{
+	const unsigned int bsize = 128 / 8;
+	bool fpu_enabled = false;
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+
+	err = blkcipher_walk_virt(desc, &walk);
+	nbytes = walk.nbytes;
+	if (!nbytes)
+		return err;
+
+	/* set minimum length to bsize, for tweak_fn */
+	fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+				     desc, fpu_enabled,
+				     nbytes < bsize ? bsize : nbytes);
+
+	/* calculate first value of T */
+	tweak_fn(tweak_ctx, walk.iv, walk.iv);
+
+	while (nbytes) {
+		nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
+
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+		nbytes = walk.nbytes;
+	}
+
+	glue_fpu_end(fpu_enabled);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
+
+void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv,
+			       common_glue_func_t fn)
+{
+	le128 ivblk = *iv;
+
+	/* generate next IV */
+	le128_gf128mul_x_ble(iv, &ivblk);
+
+	/* CC <- T xor C */
+	u128_xor(dst, src, (u128 *)&ivblk);
+
+	/* PP <- D(Key2,CC) */
+	fn(ctx, (u8 *)dst, (u8 *)dst);
+
+	/* P <- T xor PP */
+	u128_xor(dst, dst, (u128 *)&ivblk);
+}
+EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit_one);
+
 MODULE_LICENSE("GPL");
 MODULE_LICENSE("GPL");

+ 43 - 2
arch/x86/crypto/serpent-avx-x86_64-asm_64.S

@@ -4,8 +4,7 @@
  * Copyright (C) 2012 Johannes Goetzfried
  *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
  *
- * Based on arch/x86/crypto/serpent-sse2-x86_64-asm_64.S by
- *  Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -34,6 +33,8 @@

 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lxts_gf128mul_and_shl1_mask:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

 .text

@@ -739,3 +740,43 @@ ENTRY(serpent_ctr_8way_avx)

 	ret;
 ENDPROC(serpent_ctr_8way_avx)
+
+ENTRY(serpent_xts_enc_8way_avx)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
+	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
+
+	call __serpent_enc_blk8_avx;
+
+	/* dst <= regs xor IVs(in dst) */
+	store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+ENDPROC(serpent_xts_enc_8way_avx)
+
+ENTRY(serpent_xts_dec_8way_avx)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
+	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
+
+	call __serpent_dec_blk8_avx;
+
+	/* dst <= regs xor IVs(in dst) */
+	store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+	ret;
+ENDPROC(serpent_xts_dec_8way_avx)

+ 800 - 0
arch/x86/crypto/serpent-avx2-asm_64.S

@@ -0,0 +1,800 @@
+/*
+ * x86_64/AVX2 assembler optimized version of Serpent
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on AVX assembler implementation of Serpent by:
+ *  Copyright © 2012 Johannes Goetzfried
+ *      <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/linkage.h>
+#include "glue_helper-asm-avx2.S"
+
+.file "serpent-avx2-asm_64.S"
+
+.data
+.align 16
+
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lxts_gf128mul_and_shl1_mask_0:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+.Lxts_gf128mul_and_shl1_mask_1:
+	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
+
+.text
+
+#define CTX %rdi
+
+#define RNOT %ymm0
+#define tp  %ymm1
+
+#define RA1 %ymm2
+#define RA2 %ymm3
+#define RB1 %ymm4
+#define RB2 %ymm5
+#define RC1 %ymm6
+#define RC2 %ymm7
+#define RD1 %ymm8
+#define RD2 %ymm9
+#define RE1 %ymm10
+#define RE2 %ymm11
+
+#define RK0 %ymm12
+#define RK1 %ymm13
+#define RK2 %ymm14
+#define RK3 %ymm15
+
+#define RK0x %xmm12
+#define RK1x %xmm13
+#define RK2x %xmm14
+#define RK3x %xmm15
+
+#define S0_1(x0, x1, x2, x3, x4)      \
+	vpor		x0,   x3, tp; \
+	vpxor		x3,   x0, x0; \
+	vpxor		x2,   x3, x4; \
+	vpxor		RNOT, x4, x4; \
+	vpxor		x1,   tp, x3; \
+	vpand		x0,   x1, x1; \
+	vpxor		x4,   x1, x1; \
+	vpxor		x0,   x2, x2;
+#define S0_2(x0, x1, x2, x3, x4)      \
+	vpxor		x3,   x0, x0; \
+	vpor		x0,   x4, x4; \
+	vpxor		x2,   x0, x0; \
+	vpand		x1,   x2, x2; \
+	vpxor		x2,   x3, x3; \
+	vpxor		RNOT, x1, x1; \
+	vpxor		x4,   x2, x2; \
+	vpxor		x2,   x1, x1;
+
+#define S1_1(x0, x1, x2, x3, x4)      \
+	vpxor		x0,   x1, tp; \
+	vpxor		x3,   x0, x0; \
+	vpxor		RNOT, x3, x3; \
+	vpand		tp,   x1, x4; \
+	vpor		tp,   x0, x0; \
+	vpxor		x2,   x3, x3; \
+	vpxor		x3,   x0, x0; \
+	vpxor		x3,   tp, x1;
+#define S1_2(x0, x1, x2, x3, x4)      \
+	vpxor		x4,   x3, x3; \
+	vpor		x4,   x1, x1; \
+	vpxor		x2,   x4, x4; \
+	vpand		x0,   x2, x2; \
+	vpxor		x1,   x2, x2; \
+	vpor		x0,   x1, x1; \
+	vpxor		RNOT, x0, x0; \
+	vpxor		x2,   x0, x0; \
+	vpxor		x1,   x4, x4;
+
+#define S2_1(x0, x1, x2, x3, x4)      \
+	vpxor		RNOT, x3, x3; \
+	vpxor		x0,   x1, x1; \
+	vpand		x2,   x0, tp; \
+	vpxor		x3,   tp, tp; \
+	vpor		x0,   x3, x3; \
+	vpxor		x1,   x2, x2; \
+	vpxor		x1,   x3, x3; \
+	vpand		tp,   x1, x1;
+#define S2_2(x0, x1, x2, x3, x4)      \
+	vpxor		x2,   tp, tp; \
+	vpand		x3,   x2, x2; \
+	vpor		x1,   x3, x3; \
+	vpxor		RNOT, tp, tp; \
+	vpxor		tp,   x3, x3; \
+	vpxor		tp,   x0, x4; \
+	vpxor		x2,   tp, x0; \
+	vpor		x2,   x1, x1;
+
+#define S3_1(x0, x1, x2, x3, x4)      \
+	vpxor		x3,   x1, tp; \
+	vpor		x0,   x3, x3; \
+	vpand		x0,   x1, x4; \
+	vpxor		x2,   x0, x0; \
+	vpxor		tp,   x2, x2; \
+	vpand		x3,   tp, x1; \
+	vpxor		x3,   x2, x2; \
+	vpor		x4,   x0, x0; \
+	vpxor		x3,   x4, x4;
+#define S3_2(x0, x1, x2, x3, x4)      \
+	vpxor		x0,   x1, x1; \
+	vpand		x3,   x0, x0; \
+	vpand		x4,   x3, x3; \
+	vpxor		x2,   x3, x3; \
+	vpor		x1,   x4, x4; \
+	vpand		x1,   x2, x2; \
+	vpxor		x3,   x4, x4; \
+	vpxor		x3,   x0, x0; \
+	vpxor		x2,   x3, x3;
+
+#define S4_1(x0, x1, x2, x3, x4)      \
+	vpand		x0,   x3, tp; \
+	vpxor		x3,   x0, x0; \
+	vpxor		x2,   tp, tp; \
+	vpor		x3,   x2, x2; \
+	vpxor		x1,   x0, x0; \
+	vpxor		tp,   x3, x4; \
+	vpor		x0,   x2, x2; \
+	vpxor		x1,   x2, x2;
+#define S4_2(x0, x1, x2, x3, x4)      \
+	vpand		x0,   x1, x1; \
+	vpxor		x4,   x1, x1; \
+	vpand		x2,   x4, x4; \
+	vpxor		tp,   x2, x2; \
+	vpxor		x0,   x4, x4; \
+	vpor		x1,   tp, x3; \
+	vpxor		RNOT, x1, x1; \
+	vpxor		x0,   x3, x3;
+
+#define S5_1(x0, x1, x2, x3, x4)      \
+	vpor		x0,   x1, tp; \
+	vpxor		tp,   x2, x2; \
+	vpxor		RNOT, x3, x3; \
+	vpxor		x0,   x1, x4; \
+	vpxor		x2,   x0, x0; \
+	vpand		x4,   tp, x1; \
+	vpor		x3,   x4, x4; \
+	vpxor		x0,   x4, x4;
+#define S5_2(x0, x1, x2, x3, x4)      \
+	vpand		x3,   x0, x0; \
+	vpxor		x3,   x1, x1; \
+	vpxor		x2,   x3, x3; \
+	vpxor		x1,   x0, x0; \
+	vpand		x4,   x2, x2; \
+	vpxor		x2,   x1, x1; \
+	vpand		x0,   x2, x2; \
+	vpxor		x2,   x3, x3;
+
+#define S6_1(x0, x1, x2, x3, x4)      \
+	vpxor		x0,   x3, x3; \
+	vpxor		x2,   x1, tp; \
+	vpxor		x0,   x2, x2; \
+	vpand		x3,   x0, x0; \
+	vpor		x3,   tp, tp; \
+	vpxor		RNOT, x1, x4; \
+	vpxor		tp,   x0, x0; \
+	vpxor		x2,   tp, x1;
+#define S6_2(x0, x1, x2, x3, x4)      \
+	vpxor		x4,   x3, x3; \
+	vpxor		x0,   x4, x4; \
+	vpand		x0,   x2, x2; \
+	vpxor		x1,   x4, x4; \
+	vpxor		x3,   x2, x2; \
+	vpand		x1,   x3, x3; \
+	vpxor		x0,   x3, x3; \
+	vpxor		x2,   x1, x1;
+
+#define S7_1(x0, x1, x2, x3, x4)      \
+	vpxor		RNOT, x1, tp; \
+	vpxor		RNOT, x0, x0; \
+	vpand		x2,   tp, x1; \
+	vpxor		x3,   x1, x1; \
+	vpor		tp,   x3, x3; \
+	vpxor		x2,   tp, x4; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x0,   x3, x3; \
+	vpor		x1,   x0, x0;
+#define S7_2(x0, x1, x2, x3, x4)      \
+	vpand		x0,   x2, x2; \
+	vpxor		x4,   x0, x0; \
+	vpxor		x3,   x4, x4; \
+	vpand		x0,   x3, x3; \
+	vpxor		x1,   x4, x4; \
+	vpxor		x4,   x2, x2; \
+	vpxor		x1,   x3, x3; \
+	vpor		x0,   x4, x4; \
+	vpxor		x1,   x4, x4;
+
+#define SI0_1(x0, x1, x2, x3, x4)     \
+	vpxor		x0,   x1, x1; \
+	vpor		x1,   x3, tp; \
+	vpxor		x1,   x3, x4; \
+	vpxor		RNOT, x0, x0; \
+	vpxor		tp,   x2, x2; \
+	vpxor		x0,   tp, x3; \
+	vpand		x1,   x0, x0; \
+	vpxor		x2,   x0, x0;
+#define SI0_2(x0, x1, x2, x3, x4)     \
+	vpand		x3,   x2, x2; \
+	vpxor		x4,   x3, x3; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x3,   x1, x1; \
+	vpand		x0,   x3, x3; \
+	vpxor		x0,   x1, x1; \
+	vpxor		x2,   x0, x0; \
+	vpxor		x3,   x4, x4;
+
+#define SI1_1(x0, x1, x2, x3, x4)     \
+	vpxor		x3,   x1, x1; \
+	vpxor		x2,   x0, tp; \
+	vpxor		RNOT, x2, x2; \
+	vpor		x1,   x0, x4; \
+	vpxor		x3,   x4, x4; \
+	vpand		x1,   x3, x3; \
+	vpxor		x2,   x1, x1; \
+	vpand		x4,   x2, x2;
+#define SI1_2(x0, x1, x2, x3, x4)     \
+	vpxor		x1,   x4, x4; \
+	vpor		x3,   x1, x1; \
+	vpxor		tp,   x3, x3; \
+	vpxor		tp,   x2, x2; \
+	vpor		x4,   tp, x0; \
+	vpxor		x4,   x2, x2; \
+	vpxor		x0,   x1, x1; \
+	vpxor		x1,   x4, x4;
+
+#define SI2_1(x0, x1, x2, x3, x4)     \
+	vpxor		x1,   x2, x2; \
+	vpxor		RNOT, x3, tp; \
+	vpor		x2,   tp, tp; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x0,   x3, x4; \
+	vpxor		x1,   tp, x3; \
+	vpor		x2,   x1, x1; \
+	vpxor		x0,   x2, x2;
+#define SI2_2(x0, x1, x2, x3, x4)     \
+	vpxor		x4,   x1, x1; \
+	vpor		x3,   x4, x4; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x2,   x4, x4; \
+	vpand		x1,   x2, x2; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x4,   x3, x3; \
+	vpxor		x0,   x4, x4;
+
+#define SI3_1(x0, x1, x2, x3, x4)     \
+	vpxor		x1,   x2, x2; \
+	vpand		x2,   x1, tp; \
+	vpxor		x0,   tp, tp; \
+	vpor		x1,   x0, x0; \
+	vpxor		x3,   x1, x4; \
+	vpxor		x3,   x0, x0; \
+	vpor		tp,   x3, x3; \
+	vpxor		x2,   tp, x1;
+#define SI3_2(x0, x1, x2, x3, x4)     \
+	vpxor		x3,   x1, x1; \
+	vpxor		x2,   x0, x0; \
+	vpxor		x3,   x2, x2; \
+	vpand		x1,   x3, x3; \
+	vpxor		x0,   x1, x1; \
+	vpand		x2,   x0, x0; \
+	vpxor		x3,   x4, x4; \
+	vpxor		x0,   x3, x3; \
+	vpxor		x1,   x0, x0;
+
+#define SI4_1(x0, x1, x2, x3, x4)     \
+	vpxor		x3,   x2, x2; \
+	vpand		x1,   x0, tp; \
+	vpxor		x2,   tp, tp; \
+	vpor		x3,   x2, x2; \
+	vpxor		RNOT, x0, x4; \
+	vpxor		tp,   x1, x1; \
+	vpxor		x2,   tp, x0; \
+	vpand		x4,   x2, x2;
+#define SI4_2(x0, x1, x2, x3, x4)     \
+	vpxor		x0,   x2, x2; \
+	vpor		x4,   x0, x0; \
+	vpxor		x3,   x0, x0; \
+	vpand		x2,   x3, x3; \
+	vpxor		x3,   x4, x4; \
+	vpxor		x1,   x3, x3; \
+	vpand		x0,   x1, x1; \
+	vpxor		x1,   x4, x4; \
+	vpxor		x3,   x0, x0;
+
+#define SI5_1(x0, x1, x2, x3, x4)     \
+	vpor		x2,   x1, tp; \
+	vpxor		x1,   x2, x2; \
+	vpxor		x3,   tp, tp; \
+	vpand		x1,   x3, x3; \
+	vpxor		x3,   x2, x2; \
+	vpor		x0,   x3, x3; \
+	vpxor		RNOT, x0, x0; \
+	vpxor		x2,   x3, x3; \
+	vpor		x0,   x2, x2;
+#define SI5_2(x0, x1, x2, x3, x4)     \
+	vpxor		tp,   x1, x4; \
+	vpxor		x4,   x2, x2; \
+	vpand		x0,   x4, x4; \
+	vpxor		tp,   x0, x0; \
+	vpxor		x3,   tp, x1; \
+	vpand		x2,   x0, x0; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x2,   x0, x0; \
+	vpxor		x4,   x2, x2; \
+	vpxor		x3,   x4, x4;
+
+#define SI6_1(x0, x1, x2, x3, x4)     \
+	vpxor		x2,   x0, x0; \
+	vpand		x3,   x0, tp; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x2,   tp, tp; \
+	vpxor		x1,   x3, x3; \
+	vpor		x0,   x2, x2; \
+	vpxor		x3,   x2, x2; \
+	vpand		tp,   x3, x3;
+#define SI6_2(x0, x1, x2, x3, x4)     \
+	vpxor		RNOT, tp, tp; \
+	vpxor		x1,   x3, x3; \
+	vpand		x2,   x1, x1; \
+	vpxor		tp,   x0, x4; \
+	vpxor		x4,   x3, x3; \
+	vpxor		x2,   x4, x4; \
+	vpxor		x1,   tp, x0; \
+	vpxor		x0,   x2, x2;
+
+#define SI7_1(x0, x1, x2, x3, x4)     \
+	vpand		x0,   x3, tp; \
+	vpxor		x2,   x0, x0; \
+	vpor		x3,   x2, x2; \
+	vpxor		x1,   x3, x4; \
+	vpxor		RNOT, x0, x0; \
+	vpor		tp,   x1, x1; \
+	vpxor		x0,   x4, x4; \
+	vpand		x2,   x0, x0; \
+	vpxor		x1,   x0, x0;
+#define SI7_2(x0, x1, x2, x3, x4)     \
+	vpand		x2,   x1, x1; \
+	vpxor		x2,   tp, x3; \
+	vpxor		x3,   x4, x4; \
+	vpand		x3,   x2, x2; \
+	vpor		x0,   x3, x3; \
+	vpxor		x4,   x1, x1; \
+	vpxor		x4,   x3, x3; \
+	vpand		x0,   x4, x4; \
+	vpxor		x2,   x4, x4;
+
+#define get_key(i,j,t) \
+	vpbroadcastd (4*(i)+(j))*4(CTX), t;
+
+#define K2(x0, x1, x2, x3, x4, i) \
+	get_key(i, 0, RK0); \
+	get_key(i, 1, RK1); \
+	get_key(i, 2, RK2); \
+	get_key(i, 3, RK3); \
+	vpxor RK0,	x0 ## 1, x0 ## 1; \
+	vpxor RK1,	x1 ## 1, x1 ## 1; \
+	vpxor RK2,	x2 ## 1, x2 ## 1; \
+	vpxor RK3,	x3 ## 1, x3 ## 1; \
+		vpxor RK0,	x0 ## 2, x0 ## 2; \
+		vpxor RK1,	x1 ## 2, x1 ## 2; \
+		vpxor RK2,	x2 ## 2, x2 ## 2; \
+		vpxor RK3,	x3 ## 2, x3 ## 2;
+
+#define LK2(x0, x1, x2, x3, x4, i) \
+	vpslld $13,		x0 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 13),	x0 ## 1, x0 ## 1;          \
+	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
+	vpslld $3,		x2 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 3),	x2 ## 1, x2 ## 1;          \
+	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
+		vpslld $13,		x0 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 13),	x0 ## 2, x0 ## 2;          \
+		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
+		vpslld $3,		x2 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 3),	x2 ## 2, x2 ## 2;          \
+		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
+		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
+	vpslld $1,		x1 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 1),	x1 ## 1, x1 ## 1;          \
+	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
+	vpslld $3,		x0 ## 1, x4 ## 1;          \
+	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
+	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
+	get_key(i, 1, RK1); \
+		vpslld $1,		x1 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 1),	x1 ## 2, x1 ## 2;          \
+		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
+		vpslld $3,		x0 ## 2, x4 ## 2;          \
+		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
+		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
+		get_key(i, 3, RK3); \
+	vpslld $7,		x3 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 7),	x3 ## 1, x3 ## 1;          \
+	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
+	vpslld $7,		x1 ## 1, x4 ## 1;          \
+	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
+	get_key(i, 0, RK0); \
+		vpslld $7,		x3 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 7),	x3 ## 2, x3 ## 2;          \
+		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
+		vpslld $7,		x1 ## 2, x4 ## 2;          \
+		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
+		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
+		get_key(i, 2, RK2); \
+	vpxor			RK1, x1 ## 1, x1 ## 1;     \
+	vpxor			RK3, x3 ## 1, x3 ## 1;     \
+	vpslld $5,		x0 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 5),	x0 ## 1, x0 ## 1;          \
+	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
+	vpslld $22,		x2 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 22),	x2 ## 1, x2 ## 1;          \
+	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor			RK0, x0 ## 1, x0 ## 1;     \
+	vpxor			RK2, x2 ## 1, x2 ## 1;     \
+		vpxor			RK1, x1 ## 2, x1 ## 2;     \
+		vpxor			RK3, x3 ## 2, x3 ## 2;     \
+		vpslld $5,		x0 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 5),	x0 ## 2, x0 ## 2;          \
+		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
+		vpslld $22,		x2 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 22),	x2 ## 2, x2 ## 2;          \
+		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
+		vpxor			RK0, x0 ## 2, x0 ## 2;     \
+		vpxor			RK2, x2 ## 2, x2 ## 2;
+
+#define KL2(x0, x1, x2, x3, x4, i) \
+	vpxor			RK0, x0 ## 1, x0 ## 1;     \
+	vpxor			RK2, x2 ## 1, x2 ## 1;     \
+	vpsrld $5,		x0 ## 1, x4 ## 1;          \
+	vpslld $(32 - 5),	x0 ## 1, x0 ## 1;          \
+	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			RK3, x3 ## 1, x3 ## 1;     \
+	vpxor			RK1, x1 ## 1, x1 ## 1;     \
+	vpsrld $22,		x2 ## 1, x4 ## 1;          \
+	vpslld $(32 - 22),	x2 ## 1, x2 ## 1;          \
+	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
+		vpxor			RK0, x0 ## 2, x0 ## 2;     \
+		vpxor			RK2, x2 ## 2, x2 ## 2;     \
+		vpsrld $5,		x0 ## 2, x4 ## 2;          \
+		vpslld $(32 - 5),	x0 ## 2, x0 ## 2;          \
+		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			RK3, x3 ## 2, x3 ## 2;     \
+		vpxor			RK1, x1 ## 2, x1 ## 2;     \
+		vpsrld $22,		x2 ## 2, x4 ## 2;          \
+		vpslld $(32 - 22),	x2 ## 2, x2 ## 2;          \
+		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
+		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
+	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
+	vpslld $7,		x1 ## 1, x4 ## 1;          \
+	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpsrld $1,		x1 ## 1, x4 ## 1;          \
+	vpslld $(32 - 1),	x1 ## 1, x1 ## 1;          \
+	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
+		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
+		vpslld $7,		x1 ## 2, x4 ## 2;          \
+		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
+		vpsrld $1,		x1 ## 2, x4 ## 2;          \
+		vpslld $(32 - 1),	x1 ## 2, x1 ## 2;          \
+		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
+	vpsrld $7,		x3 ## 1, x4 ## 1;          \
+	vpslld $(32 - 7),	x3 ## 1, x3 ## 1;          \
+	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
+	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
+	vpslld $3,		x0 ## 1, x4 ## 1;          \
+	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
+		vpsrld $7,		x3 ## 2, x4 ## 2;          \
+		vpslld $(32 - 7),	x3 ## 2, x3 ## 2;          \
+		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
+		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
+		vpslld $3,		x0 ## 2, x4 ## 2;          \
+		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
+	vpsrld $13,		x0 ## 1, x4 ## 1;          \
+	vpslld $(32 - 13),	x0 ## 1, x0 ## 1;          \
+	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
+	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
+	vpsrld $3,		x2 ## 1, x4 ## 1;          \
+	vpslld $(32 - 3),	x2 ## 1, x2 ## 1;          \
+	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
+		vpsrld $13,		x0 ## 2, x4 ## 2;          \
+		vpslld $(32 - 13),	x0 ## 2, x0 ## 2;          \
+		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
+		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
+		vpsrld $3,		x2 ## 2, x4 ## 2;          \
+		vpslld $(32 - 3),	x2 ## 2, x2 ## 2;          \
+		vpor			x4 ## 2, x2 ## 2, x2 ## 2;
+
+#define S(SBOX, x0, x1, x2, x3, x4) \
+	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
+#define SP(SBOX, x0, x1, x2, x3, x4, i) \
+	get_key(i, 0, RK0); \
+	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	get_key(i, 2, RK2); \
+	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	get_key(i, 3, RK3); \
+	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+	get_key(i, 1, RK1); \
+	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	vpunpckldq		x1, x0, t0; \
+	vpunpckhdq		x1, x0, t2; \
+	vpunpckldq		x3, x2, t1; \
+	vpunpckhdq		x3, x2, x3; \
+	\
+	vpunpcklqdq		t1, t0, x0; \
+	vpunpckhqdq		t1, t0, x1; \
+	vpunpcklqdq		x3, t2, x2; \
+	vpunpckhqdq		x3, t2, x3;
+
+#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+.align 8
+__serpent_enc_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
+	 * output:
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
+	 */
+
+	vpcmpeqd RNOT, RNOT, RNOT;
+
+	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+						 K2(RA, RB, RC, RD, RE, 0);
+	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
+	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
+	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
+	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
+	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
+	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
+	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
+	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
+	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9);
+	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
+	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
+	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
+	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
+	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
+	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
+	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
+	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
+	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
+	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
+	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
+	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
+	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
+	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
+	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
+	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
+	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
+	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
+	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
+	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
+	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
+	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
+	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);
+
+	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+	ret;
+ENDPROC(__serpent_enc_blk16)
+
+.align 8
+__serpent_dec_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
+	 * output:
+	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
+	 */
+
+	vpcmpeqd RNOT, RNOT, RNOT;
+
+	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+						 K2(RA, RB, RC, RD, RE, 32);
+	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
+	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
+	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
+	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
+	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
+	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
+	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
+	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
+	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
+	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
+	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
+	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
+	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
+	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
+	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
+	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
+	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
+	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
+	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
+	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
+	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
+	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
+	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
+	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
+	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
+	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
+	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
+	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
+	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
+	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
+	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
+	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);
+
+	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+	ret;
+ENDPROC(__serpent_dec_blk16)
+
+ENTRY(serpent_ecb_enc_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_enc_blk16;
+
+	store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_ecb_enc_16way)
+
+ENTRY(serpent_ecb_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_dec_blk16;
+
+	store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_ecb_dec_16way)
+
+ENTRY(serpent_cbc_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_dec_blk16;
+
+	store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
+			RK0);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_cbc_dec_16way)
+
+ENTRY(serpent_ctr_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (little endian, 128bit)
+	 */
+
+	vzeroupper;
+
+	load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
+		       tp);
+
+	call __serpent_enc_blk16;
+
+	store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_ctr_16way)
+
+ENTRY(serpent_xts_enc_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	vzeroupper;
+
+	load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
+		       .Lxts_gf128mul_and_shl1_mask_0,
+		       .Lxts_gf128mul_and_shl1_mask_1);
+
+	call __serpent_enc_blk16;
+
+	store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_xts_enc_16way)
+
+ENTRY(serpent_xts_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	vzeroupper;
+
+	load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
+		       .Lxts_gf128mul_and_shl1_mask_0,
+		       .Lxts_gf128mul_and_shl1_mask_1);
+
+	call __serpent_dec_blk16;
+
+	store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_xts_dec_16way)
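
Throughout this file each ymm register holds the same Serpent state word for eight blocks, split across the x##1/x##2 halves, so every round needs its key words replicated to all lanes. That is all get_key does; in scalar terms:

	#include <stdint.h>

	/* Scalar view of the get_key(i, j, t) macro above: vpbroadcastd
	 * fetches 32-bit word number 4*i + j of the expanded key (byte
	 * offset (4*(i)+(j))*4 from CTX) and replicates it across all
	 * eight dword lanes, so one round-key word serves eight blocks. */
	static uint32_t get_key_word(const uint32_t *expkey,
				     unsigned int round, unsigned int word)
	{
		return expkey[4 * round + word];
	}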

+ 562 - 0
arch/x86/crypto/serpent_avx2_glue.c

@@ -0,0 +1,562 @@
+/*
+ * Glue Code for x86_64/AVX2 assembler optimized version of Serpent
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/ctr.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
+#include <crypto/serpent.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <asm/crypto/serpent-avx.h>
+#include <asm/crypto/ablk_helper.h>
+#include <asm/crypto/glue_helper.h>
+
+#define SERPENT_AVX2_PARALLEL_BLOCKS 16
+
+/* 16-way AVX2 parallel cipher functions */
+asmlinkage void serpent_ecb_enc_16way(struct serpent_ctx *ctx, u8 *dst,
+				      const u8 *src);
+asmlinkage void serpent_ecb_dec_16way(struct serpent_ctx *ctx, u8 *dst,
+				      const u8 *src);
+asmlinkage void serpent_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src);
+
+asmlinkage void serpent_ctr_16way(void *ctx, u128 *dst, const u128 *src,
+				  le128 *iv);
+asmlinkage void serpent_xts_enc_16way(struct serpent_ctx *ctx, u8 *dst,
+				      const u8 *src, le128 *iv);
+asmlinkage void serpent_xts_dec_16way(struct serpent_ctx *ctx, u8 *dst,
+				      const u8 *src, le128 *iv);
+
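+/*
+ * Each glue context lists implementations widest-first; the glue helper
+ * picks the largest entry that still fits the remaining data: 16 blocks
+ * via AVX2, then 8 via AVX, then one block at a time for the tail.
+ */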
+static const struct common_glue_ctx serpent_enc = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_ctr = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_16way) }
+	},  {
+		.num_blocks = 8,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_enc_xts = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_dec = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_dec_cbc = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_dec_xts = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) }
+	} }
+};
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes);
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc,
+				       dst, src, nbytes);
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src,
+				       nbytes);
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes);
+}
+
+static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	/* the 8-way AVX functions are reused, so the FPU is enabled starting at 8 parallel blocks */
+	return glue_fpu_begin(SERPENT_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes);
+}
+
+static inline void serpent_fpu_end(bool fpu_enabled)
+{
+	glue_fpu_end(fpu_enabled);
+}
+
+struct crypt_priv {
+	struct serpent_ctx *ctx;
+	bool fpu_enabled;
+};
+
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
+
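+	/* lrw_crypt() hands over at most tbuflen bytes (16 blocks), so one 16-way pass suffices */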
+	if (nbytes >= SERPENT_AVX2_PARALLEL_BLOCKS * bsize) {
+		serpent_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * SERPENT_AVX2_PARALLEL_BLOCKS;
+		nbytes -= bsize * SERPENT_AVX2_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
+		serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
+		nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		__serpent_encrypt(ctx->ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes >= SERPENT_AVX2_PARALLEL_BLOCKS * bsize) {
+		serpent_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * SERPENT_AVX2_PARALLEL_BLOCKS;
+		nbytes -= bsize * SERPENT_AVX2_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
+		serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
+		nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		__serpent_decrypt(ctx->ctx, srcdst, srcdst);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[SERPENT_AVX2_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->serpent_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
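+	/* the FPU region disables preemption, so the LRW walk must not sleep */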
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[SERPENT_AVX2_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->serpent_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&serpent_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(__serpent_encrypt),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&serpent_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(__serpent_encrypt),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static struct crypto_alg srp_algs[10] = { {
+	.cra_name		= "__ecb-serpent-avx2",
+	.cra_driver_name	= "__driver-ecb-serpent-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[0].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__cbc-serpent-avx2",
+	.cra_driver_name	= "__driver-cbc-serpent-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[1].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__ctr-serpent-avx2",
+	.cra_driver_name	= "__driver-ctr-serpent-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[2].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+}, {
+	.cra_name		= "__lrw-serpent-avx2",
+	.cra_driver_name	= "__driver-lrw-serpent-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[3].cra_list),
+	.cra_exit		= lrw_serpent_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= lrw_serpent_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__xts-serpent-avx2",
+	.cra_driver_name	= "__driver-xts-serpent-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_xts_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[4].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE * 2,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE * 2,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= xts_serpent_setkey,
+			.encrypt	= xts_encrypt,
+			.decrypt	= xts_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ecb(serpent)",
+	.cra_driver_name	= "ecb-serpent-avx2",
+	.cra_priority		= 600,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[5].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(serpent)",
+	.cra_driver_name	= "cbc-serpent-avx2",
+	.cra_priority		= 600,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[6].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= __ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(serpent)",
+	.cra_driver_name	= "ctr-serpent-avx2",
+	.cra_priority		= 600,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[7].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt,
+			.geniv		= "chainiv",
+		},
+	},
+}, {
+	.cra_name		= "lrw(serpent)",
+	.cra_driver_name	= "lrw-serpent-avx2",
+	.cra_priority		= 600,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[8].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "xts(serpent)",
+	.cra_driver_name	= "xts-serpent-avx2",
+	.cra_priority		= 600,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[9].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE * 2,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE * 2,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+} };
+
+static int __init init(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx2 || !cpu_has_osxsave) {
+		pr_info("AVX2 instructions are not detected.\n");
+		return -ENODEV;
+	}
+
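+	/* verify that the OS saves both SSE and YMM state on context switch */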
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX detected but unusable.\n");
+		return -ENODEV;
+	}
+
+	return crypto_register_algs(srp_algs, ARRAY_SIZE(srp_algs));
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_algs(srp_algs, ARRAY_SIZE(srp_algs));
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized");
+MODULE_ALIAS("serpent");
+MODULE_ALIAS("serpent-asm");

+ 85 - 60
arch/x86/crypto/serpent_avx_glue.c

@@ -4,8 +4,7 @@
  * Copyright (C) 2012 Johannes Goetzfried
  *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
  *
 - * Glue code based on serpent_sse2_glue.c by:
 - *  Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 + * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -42,7 +41,32 @@
  #include <asm/crypto/ablk_helper.h>
  #include <asm/crypto/glue_helper.h>
 
 -static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 +/* 8-way parallel cipher functions */
 +asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 +					 const u8 *src);
 +EXPORT_SYMBOL_GPL(serpent_ecb_enc_8way_avx);
 +
 +asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 +					 const u8 *src);
 +EXPORT_SYMBOL_GPL(serpent_ecb_dec_8way_avx);
 +
 +asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 +					 const u8 *src);
 +EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx);
 +
 +asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 +				     const u8 *src, le128 *iv);
 +EXPORT_SYMBOL_GPL(serpent_ctr_8way_avx);
 +
 +asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 +					 const u8 *src, le128 *iv);
 +EXPORT_SYMBOL_GPL(serpent_xts_enc_8way_avx);
 +
 +asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 +					 const u8 *src, le128 *iv);
 +EXPORT_SYMBOL_GPL(serpent_xts_dec_8way_avx);
 +
 +void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
  {
  	be128 ctrblk;
 
@@ -52,6 +76,22 @@ static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
  	__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
  	u128_xor(dst, src, (u128 *)&ctrblk);
  }
 +EXPORT_SYMBOL_GPL(__serpent_crypt_ctr);
 +
 +void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 +{
 +	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
 +				  GLUE_FUNC_CAST(__serpent_encrypt));
 +}
 +EXPORT_SYMBOL_GPL(serpent_xts_enc);
 +
 +void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 +{
 +	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
 +				  GLUE_FUNC_CAST(__serpent_decrypt));
 +}
 +EXPORT_SYMBOL_GPL(serpent_xts_dec);
 +
 
  static const struct common_glue_ctx serpent_enc = {
  	.num_funcs = 2,
@@ -75,7 +115,20 @@ static const struct common_glue_ctx serpent_ctr = {
  		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
  	}, {
  		.num_blocks = 1,
 -		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
 +		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) }
 +	} }
 +};
 +
 +static const struct common_glue_ctx serpent_enc_xts = {
 +	.num_funcs = 2,
 +	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
 +
 +	.funcs = { {
 +		.num_blocks = SERPENT_PARALLEL_BLOCKS,
 +		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) }
 +	}, {
 +		.num_blocks = 1,
 +		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) }
  	} }
  };
 
@@ -105,6 +158,19 @@ static const struct common_glue_ctx serpent_dec_cbc = {
  	} }
  };
 
 +static const struct common_glue_ctx serpent_dec_xts = {
 +	.num_funcs = 2,
 +	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
 +
 +	.funcs = { {
 +		.num_blocks = SERPENT_PARALLEL_BLOCKS,
 +		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) }
 +	}, {
 +		.num_blocks = 1,
 +		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) }
 +	} }
 +};
 +
  static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  		       struct scatterlist *src, unsigned int nbytes)
  {
@@ -187,13 +253,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
  		__serpent_decrypt(ctx->ctx, srcdst, srcdst);
  }
 
 -struct serpent_lrw_ctx {
 -	struct lrw_table_ctx lrw_table;
 -	struct serpent_ctx serpent_ctx;
 -};
 -
 -static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
 -			      unsigned int keylen)
 +int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
 +		       unsigned int keylen)
  {
  	struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
  	int err;
@@ -206,6 +267,7 @@ static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
  	return lrw_init_table(&ctx->lrw_table, key + keylen -
  						SERPENT_BLOCK_SIZE);
  }
 +EXPORT_SYMBOL_GPL(lrw_serpent_setkey);
 
  static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  		       struct scatterlist *src, unsigned int nbytes)
@@ -259,20 +321,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  	return ret;
  }
 
 -static void lrw_exit_tfm(struct crypto_tfm *tfm)
 +void lrw_serpent_exit_tfm(struct crypto_tfm *tfm)
  {
  	struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
 
  	lrw_free_table(&ctx->lrw_table);
  }
 +EXPORT_SYMBOL_GPL(lrw_serpent_exit_tfm);
 
 -struct serpent_xts_ctx {
 -	struct serpent_ctx tweak_ctx;
 -	struct serpent_ctx crypt_ctx;
 -};
 -
 -static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
 -			      unsigned int keylen)
 +int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
 +		       unsigned int keylen)
  {
  	struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
  	u32 *flags = &tfm->crt_flags;
@@ -294,59 +352,26 @@ static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
  	/* second half of xts-key is for tweak */
  	return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
  }
 +EXPORT_SYMBOL_GPL(xts_serpent_setkey);
 
  static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  		       struct scatterlist *src, unsigned int nbytes)
  {
  	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
 -	be128 buf[SERPENT_PARALLEL_BLOCKS];
 -	struct crypt_priv crypt_ctx = {
 -		.ctx = &ctx->crypt_ctx,
 -		.fpu_enabled = false,
 -	};
 -	struct xts_crypt_req req = {
 -		.tbuf = buf,
 -		.tbuflen = sizeof(buf),
 -
 -		.tweak_ctx = &ctx->tweak_ctx,
 -		.tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
 -		.crypt_ctx = &crypt_ctx,
 -		.crypt_fn = encrypt_callback,
 -	};
 -	int ret;
 -
 -	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
 -	ret = xts_crypt(desc, dst, src, nbytes, &req);
 -	serpent_fpu_end(crypt_ctx.fpu_enabled);
 
 -	return ret;
 +	return glue_xts_crypt_128bit(&serpent_enc_xts, desc, dst, src, nbytes,
 +				     XTS_TWEAK_CAST(__serpent_encrypt),
 +				     &ctx->tweak_ctx, &ctx->crypt_ctx);
  }
 
  static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  		       struct scatterlist *src, unsigned int nbytes)
  {
  	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
 -	be128 buf[SERPENT_PARALLEL_BLOCKS];
 -	struct crypt_priv crypt_ctx = {
 -		.ctx = &ctx->crypt_ctx,
 -		.fpu_enabled = false,
 -	};
 -	struct xts_crypt_req req = {
 -		.tbuf = buf,
 -		.tbuflen = sizeof(buf),
 -
 -		.tweak_ctx = &ctx->tweak_ctx,
 -		.tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
 -		.crypt_ctx = &crypt_ctx,
 -		.crypt_fn = decrypt_callback,
 -	};
 -	int ret;
 
 -	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
 -	ret = xts_crypt(desc, dst, src, nbytes, &req);
 -	serpent_fpu_end(crypt_ctx.fpu_enabled);
 -
 -	return ret;
 +	return glue_xts_crypt_128bit(&serpent_dec_xts, desc, dst, src, nbytes,
 +				     XTS_TWEAK_CAST(__serpent_encrypt),
 +				     &ctx->tweak_ctx, &ctx->crypt_ctx);
  }
 
  static struct crypto_alg serpent_algs[10] = { {
@@ -417,7 +442,7 @@ static struct crypto_alg serpent_algs[10] = { {
  	.cra_alignmask		= 0,
  	.cra_type		= &crypto_blkcipher_type,
  	.cra_module		= THIS_MODULE,
 -	.cra_exit		= lrw_exit_tfm,
 +	.cra_exit		= lrw_serpent_exit_tfm,
  	.cra_u = {
  		.blkcipher = {
  			.min_keysize	= SERPENT_MIN_KEY_SIZE +

+ 496 - 0
arch/x86/crypto/sha256-avx-asm.S

@@ -0,0 +1,496 @@
+########################################################################
+# Implement fast SHA-256 with AVX1 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 1 block at a time, with 4 lanes per block
+########################################################################
+
+#ifdef CONFIG_AS_AVX
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define    VMOVDQ vmovdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+	add     \p1, \p2
+	mov     \p2, \p1
+.endm
+
+
+.macro MY_ROR p1 p2
+	shld    $(32-(\p1)), \p2, \p2
+.endm
+
+################################
+
+# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+# Load xmm with mem and byte swap each dword
+.macro COPY_XMM_AND_BSWAP p1 p2 p3
+	VMOVDQ \p2, \p1
+	vpshufb \p3, \p1, \p1
+.endm
+
+################################
+
+X0 = %xmm4
+X1 = %xmm5
+X2 = %xmm6
+X3 = %xmm7
+
+XTMP0 = %xmm0
+XTMP1 = %xmm1
+XTMP2 = %xmm2
+XTMP3 = %xmm3
+XTMP4 = %xmm8
+XFER = %xmm9
+XTMP5 = %xmm11
+
+SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
+SHUF_DC00 = %xmm12      # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %xmm13
+
+NUM_BLKS = %rdx   # 3rd arg
+CTX = %rsi        # 2nd arg
+INP = %rdi        # 1st arg
+
+SRND = %rdi       # clobbers INP
+c = %ecx
+d = %r8d
+e = %edx
+TBL = %rbp
+a = %eax
+b = %ebx
+
+f = %r9d
+g = %r10d
+h = %r11d
+
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_XFER_SIZE = 8
+_XMM_SAVE_SIZE = 0
+
+_INP_END = 0
+_INP            = _INP_END  + _INP_END_SIZE
+_XFER           = _INP      + _INP_SIZE
+_XMM_SAVE       = _XFER     + _XFER_SIZE
+STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+X_ = X0
+X0 = X1
+X1 = X2
+X2 = X3
+X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+	## compute s0 four at a time and s1 two at a time
+	## compute W[-16] + W[-7] 4 at a time
+
+	mov     e, y0			# y0 = e
+	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
+	mov     a, y1                   # y1 = a
+	vpalignr $4, X2, X3, XTMP0      # XTMP0 = W[-7]
+	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	xor     g, y2                   # y2 = f^g
+	vpaddd  X0, XTMP0, XTMP0        # XTMP0 = W[-7] + W[-16]
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	## compute s0
+	vpalignr $4, X0, X1, XTMP1      # XTMP1 = W[-15]
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     y0, y2                  # y2 = S1 + CH
+	add     _XFER(%rsp), y2         # y2 = k + w + S1 + CH
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	vpsrld  $7, XTMP1, XTMP2
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	vpslld  $(32-7), XTMP1, XTMP3
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	vpor    XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+	mov     e, y0                   # y0 = e
+	mov     a, y1                   # y1 = a
+	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
+	vpsrld  $18, XTMP1, XTMP2       #
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor     g, y2                   # y2 = f^g
+	vpsrld  $3, XTMP1, XTMP4        # XTMP4 = W[-15] >> 3
+	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	vpslld  $(32-18), XTMP1, XTMP1
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	vpxor   XTMP1, XTMP3, XTMP3     #
+	add     y0, y2                  # y2 = S1 + CH
+	add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	vpxor   XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	vpxor   XTMP4, XTMP3, XTMP1     # XTMP1 = s0
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	## compute low s1
+	vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	vpaddd  XTMP1, XTMP0, XTMP0     # XTMP0 = W[-16] + W[-7] + s0
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+	mov     e, y0                   # y0 = e
+	mov     a, y1                   # y1 = a
+	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
+	mov     f, y2                   # y2 = f
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	vpsrld  $10, XTMP2, XTMP4       # XTMP4 = W[-2] >> 10 {BBAA}
+	xor     g, y2                   # y2 = f^g
+	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xBxA}
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xBxA}
+	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	vpxor   XTMP3, XTMP2, XTMP2     #
+	add     y0, y2                  # y2 = S1 + CH
+	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	vpxor   XTMP2, XTMP4, XTMP4     # XTMP4 = s1 {xBxA}
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	vpaddd  XTMP4, XTMP0, XTMP0     # XTMP0 = {..., ..., W[1], W[0]}
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	## compute high s1
+	vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+	mov     e, y0                   # y0 = e
+	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
+	mov     a, y1                   # y1 = a
+	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	vpsrld  $10, XTMP2, XTMP5       # XTMP5 = W[-2] >> 10 {DDCC}
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	xor     g, y2                   # y2 = f^g
+	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xDxC}
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xDxC}
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	vpxor   XTMP3, XTMP2, XTMP2
+	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     y0, y2                  # y2 = S1 + CH
+	add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	vpxor   XTMP2, XTMP5, XTMP5     # XTMP5 = s1 {xDxC}
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	vpaddd  XTMP0, XTMP5, X0        # X0 = {W[3], W[2], W[1], W[0]}
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+	rotate_Xs
+.endm
+
+## input is [rsp + _XFER + \round * 4]
+.macro DO_ROUND round
+	mov	e, y0			# y0 = e
+        MY_ROR  (25-11), y0             # y0 = e >> (25-11)
+        mov     a, y1                   # y1 = a
+        xor     e, y0                   # y0 = e ^ (e >> (25-11))
+        MY_ROR  (22-13), y1             # y1 = a >> (22-13)
+        mov     f, y2                   # y2 = f
+        xor     a, y1                   # y1 = a ^ (a >> (22-13))
+        MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
+        xor     g, y2                   # y2 = f^g
+        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+        MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
+        and     e, y2                   # y2 = (f^g)&e
+        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+        MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+        add     y0, y2                  # y2 = S1 + CH
+        MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+        offset = \round * 4 + _XFER     #
+        add     offset(%rsp), y2	# y2 = k + w + S1 + CH
+        mov     a, y0			# y0 = a
+        add     y2, h                   # h = h + S1 + CH + k + w
+        mov     a, y2                   # y2 = a
+        or      c, y0                   # y0 = a|c
+        add     h, d                    # d = d + h + S1 + CH + k + w
+        and     c, y2                   # y2 = a&c
+        and     b, y0                   # y0 = (a|c)&b
+        add     y1, h                   # h = h + S1 + CH + k + w + S0
+        or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+        ROTATE_ARGS
+.endm
+
+########################################################################
+## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
+## arg 1 : pointer to input data
+## arg 2 : pointer to digest
+## arg 3 : Num blocks
+########################################################################
+.text
+ENTRY(sha256_transform_avx)
+.align 32
+	pushq   %rbx
+	pushq   %rbp
+	pushq   %r13
+	pushq   %r14
+	pushq   %r15
+	pushq   %r12
+
+	mov	%rsp, %r12
+	subq    $STACK_SIZE, %rsp	# allocate stack space
+	and	$~15, %rsp		# align stack pointer
+
+	shl     $6, NUM_BLKS		# convert to bytes
+	jz      done_hash
+	add     INP, NUM_BLKS		# pointer to end of data
+	mov     NUM_BLKS, _INP_END(%rsp)
+
+	## load initial digest
+	mov     4*0(CTX), a
+	mov     4*1(CTX), b
+	mov     4*2(CTX), c
+	mov     4*3(CTX), d
+	mov     4*4(CTX), e
+	mov     4*5(CTX), f
+	mov     4*6(CTX), g
+	mov     4*7(CTX), h
+
+	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
+	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00
+loop0:
+	lea     K256(%rip), TBL
+
+	## byte swap first 16 dwords
+	COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK
+
+	mov     INP, _INP(%rsp)
+
+	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
+	mov     $3, SRND
+.align 16
+loop1:
+	vpaddd  (TBL), X0, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddd  1*16(TBL), X0, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddd  2*16(TBL), X0, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddd  3*16(TBL), X0, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	add	$4*16, TBL
+	FOUR_ROUNDS_AND_SCHED
+
+	sub     $1, SRND
+	jne     loop1
+
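+	## the message schedule is complete; the last 16 rounds consume X0..X3 directly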
+	mov     $2, SRND
+loop2:
+	vpaddd  (TBL), X0, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	DO_ROUND        0
+	DO_ROUND        1
+	DO_ROUND        2
+	DO_ROUND        3
+
+	vpaddd  1*16(TBL), X1, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	add     $2*16, TBL
+	DO_ROUND        0
+	DO_ROUND        1
+	DO_ROUND        2
+	DO_ROUND        3
+
+	vmovdqa X2, X0
+	vmovdqa X3, X1
+
+	sub     $1, SRND
+	jne     loop2
+
+	addm    (4*0)(CTX),a
+	addm    (4*1)(CTX),b
+	addm    (4*2)(CTX),c
+	addm    (4*3)(CTX),d
+	addm    (4*4)(CTX),e
+	addm    (4*5)(CTX),f
+	addm    (4*6)(CTX),g
+	addm    (4*7)(CTX),h
+
+	mov     _INP(%rsp), INP
+	add     $64, INP
+	cmp     _INP_END(%rsp), INP
+	jne     loop0
+
+done_hash:
+
+	mov	%r12, %rsp
+
+	popq	%r12
+	popq    %r15
+	popq    %r14
+	popq    %r13
+	popq    %rbp
+	popq    %rbx
+	ret
+ENDPROC(sha256_transform_avx)
+
+.data
+.align 64
+K256:
+	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
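+# vpshufb mask that byte-swaps each 32-bit dword of the input block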
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa 0x0c0d0e0f08090a0b0405060700010203
+
+# shuffle xBxA -> 00BA
+_SHUF_00BA:
+	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+# shuffle xDxC -> DC00
+_SHUF_DC00:
+	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
+#endif

+ 772 - 0
arch/x86/crypto/sha256-avx2-asm.S

@@ -0,0 +1,772 @@
+########################################################################
+# Implement fast SHA-256 with AVX2 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 2 blocks at a time, with 4 lanes per block
+########################################################################
+
+#ifdef CONFIG_AS_AVX2
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define	VMOVDQ vmovdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+	add	\p1, \p2
+	mov	\p2, \p1
+.endm
+
+################################
+
+X0 = %ymm4
+X1 = %ymm5
+X2 = %ymm6
+X3 = %ymm7
+
+# XMM versions of above
+XWORD0 = %xmm4
+XWORD1 = %xmm5
+XWORD2 = %xmm6
+XWORD3 = %xmm7
+
+XTMP0 = %ymm0
+XTMP1 = %ymm1
+XTMP2 = %ymm2
+XTMP3 = %ymm3
+XTMP4 = %ymm8
+XFER  = %ymm9
+XTMP5 = %ymm11
+
+SHUF_00BA =	%ymm10 # shuffle xBxA -> 00BA
+SHUF_DC00 =	%ymm12 # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %ymm13
+
+X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
+
+NUM_BLKS = %rdx	# 3rd arg
+CTX	= %rsi  # 2nd arg
+INP	= %rdi	# 1st arg
+c	= %ecx
+d	= %r8d
+e       = %edx	# clobbers NUM_BLKS
+y3	= %edi	# clobbers INP
+
+
+TBL	= %rbp
+SRND	= CTX	# SRND is same register as CTX
+
+a = %eax
+b = %ebx
+f = %r9d
+g = %r10d
+h = %r11d
+old_h = %r11d
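+# old_h aliases the pre-rotation h; DO_4ROUNDS defers its final additions
+# into the start of the next round to shorten the dependency chain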
+
+T1 = %r12d
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
+_XMM_SAVE_SIZE	= 0
+_INP_END_SIZE	= 8
+_INP_SIZE	= 8
+_CTX_SIZE	= 8
+_RSP_SIZE	= 8
+
+_XFER		= 0
+_XMM_SAVE	= _XFER     + _XFER_SIZE
+_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
+_INP		= _INP_END  + _INP_END_SIZE
+_CTX		= _INP      + _INP_SIZE
+_RSP		= _CTX      + _CTX_SIZE
+STACK_SIZE	= _RSP      + _RSP_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+	X_ = X0
+	X0 = X1
+	X1 = X2
+	X2 = X3
+	X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+	old_h = h
+	TMP_ = h
+	h = g
+	g = f
+	f = e
+	e = d
+	d = c
+	c = b
+	b = a
+	a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED disp
+################################### RND N + 0 ############################
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+
+	addl	\disp(%rsp, SRND), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+	vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	xor	g, y2		# y2 = f^g                              # CH
+	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	add	h, d		# d = k + w + h + d                     # --
+
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	vpsrld	$7, XTMP1, XTMP2
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+
+	add	y0, y2		# y2 = S1 + CH                          # --
+	vpslld	$(32-7), XTMP1, XTMP3
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7
+
+	vpsrld	$18, XTMP1, XTMP2
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+
+	ROTATE_ARGS
+
+################################### RND N + 1 ############################
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	offset = \disp + 1*4
+	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+
+	vpsrld	$3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	xor	g, y2		# y2 = f^g                              # CH
+
+
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	h, d		# d = k + w + h + d                     # --
+
+	vpslld	$(32-18), XTMP1, XTMP1
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+
+	vpxor	XTMP1, XTMP3, XTMP3
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+
+	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
+	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	vpsrld	$10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
+
+
+	ROTATE_ARGS
+
+################################### RND N + 2 ############################
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	offset = \disp + 2*4
+	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
+
+	vpsrlq	$19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	or	c, y3		# y3 = a|c                              # MAJA
+	mov	f, y2		# y2 = f                                # CH
+	xor	g, y2		# y2 = f^g                              # CH
+
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
+	and	e, y2		# y2 = (f^g)&e                          # CH
+
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	vpxor	XTMP3, XTMP2, XTMP2
+	add	h, d		# d = k + w + h + d                     # --
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+
+	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}
+
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1,h		# h = k + w + h + S0                    # --
+	add	y2,d		# d = k + w + h + d + S1 + CH = d + t1  # --
+	add	y2,h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+	add	y3,h		# h = t1 + S0 + MAJ                     # --
+
+
+	ROTATE_ARGS
+
+################################### RND N + 3 ############################
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	offset = \disp + 3*4
+	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+
+	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	xor	g, y2		# y2 = f^g                              # CH
+
+
+	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	h, d		# d = k + w + h + d                     # --
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+
+	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+
+	vpxor	XTMP3, XTMP2, XTMP2
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}
+
+	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+
+	add	y1, h		# h = k + w + h + S0                    # --
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	ROTATE_ARGS
+	rotate_Xs
+.endm
+
+.macro DO_4ROUNDS disp
+################################### RND N + 0 ###########################
+
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+	addl	\disp(%rsp, SRND), h		# h = k + w + h # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	ROTATE_ARGS
+
+################################### RND N + 1 ###########################
+
+	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+	offset = 4*1 + \disp
+	addl	offset(%rsp, SRND), h		# h = k + w + h # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	ROTATE_ARGS
+
+################################### RND N + 2 ##############################
+
+	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+	offset = 4*2 + \disp
+	addl	offset(%rsp, SRND), h		# h = k + w + h # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	ROTATE_ARGS
+
+################################### RND N + 3 ###########################
+
+	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+	offset = 4*3 + \disp
+	addl	offset(%rsp, SRND), h		# h = k + w + h # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	ROTATE_ARGS
+
+.endm
+
+########################################################################
+## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
+## arg 1 : pointer to input data
+## arg 2 : pointer to digest
+## arg 3 : Num blocks
+########################################################################
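+## A minimal C-side sketch of a call (using the asmlinkage prototype the
+## sha256 glue code in this series declares for this routine):
+##   sha256_transform_rorx(data, sctx->state, num_blks);
+## where data points to num_blks 64-byte blocks and sctx->state holds the
+## eight u32 words of the running digest.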
+.text
+ENTRY(sha256_transform_rorx)
+.align 32
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+	mov	%rsp, %rax
+	subq	$STACK_SIZE, %rsp
+	and	$-32, %rsp	# align rsp to 32 byte boundary
+	mov	%rax, _RSP(%rsp)
+
+
+	shl	$6, NUM_BLKS	# convert to bytes
+	jz	done_hash
+	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
+	mov	NUM_BLKS, _INP_END(%rsp)
+
+	cmp	NUM_BLKS, INP
+	je	only_one_block
+
+	## load initial digest
+	mov	(CTX), a
+	mov	4*1(CTX), b
+	mov	4*2(CTX), c
+	mov	4*3(CTX), d
+	mov	4*4(CTX), e
+	mov	4*5(CTX), f
+	mov	4*6(CTX), g
+	mov	4*7(CTX), h
+
+	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
+	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00
+
+	mov	CTX, _CTX(%rsp)
+
+loop0:
+	lea     K256(%rip), TBL
+
+	## Load first 16 dwords from two blocks
+	VMOVDQ	0*32(INP),XTMP0
+	VMOVDQ	1*32(INP),XTMP1
+	VMOVDQ	2*32(INP),XTMP2
+	VMOVDQ	3*32(INP),XTMP3
+
+	## byte swap data
+	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
+	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
+	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
+	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3
+
+	## transpose data into high/low halves
+	vperm2i128	$0x20, XTMP2, XTMP0, X0
+	vperm2i128	$0x31, XTMP2, XTMP0, X1
+	vperm2i128	$0x20, XTMP3, XTMP1, X2
+	vperm2i128	$0x31, XTMP3, XTMP1, X3
+
+last_block_enter:
+	add	$64, INP
+	mov	INP, _INP(%rsp)
+
+	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
+	xor	SRND, SRND
+
+.align 16
+loop1:
+	vpaddd	0*32(TBL, SRND), X0, XFER
+	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
+	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32
+
+	vpaddd	1*32(TBL, SRND), X0, XFER
+	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
+	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32
+
+	vpaddd	2*32(TBL, SRND), X0, XFER
+	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
+	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32
+
+	vpaddd	3*32(TBL, SRND), X0, XFER
+	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
+	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32
+
+	add	$4*32, SRND
+	cmp	$3*4*32, SRND
+	jb	loop1
+
+loop2:
+	## Do last 16 rounds with no scheduling
+	vpaddd	0*32(TBL, SRND), X0, XFER
+	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
+	DO_4ROUNDS	_XFER + 0*32
+	vpaddd	1*32(TBL, SRND), X1, XFER
+	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
+	DO_4ROUNDS	_XFER + 1*32
+	add	$2*32, SRND
+
+	vmovdqa	X2, X0
+	vmovdqa	X3, X1
+
+	cmp	$4*4*32, SRND
+	jb	loop2
+
+	mov	_CTX(%rsp), CTX
+	mov	_INP(%rsp), INP
+
+	addm    (4*0)(CTX),a
+	addm    (4*1)(CTX),b
+	addm    (4*2)(CTX),c
+	addm    (4*3)(CTX),d
+	addm    (4*4)(CTX),e
+	addm    (4*5)(CTX),f
+	addm    (4*6)(CTX),g
+	addm    (4*7)(CTX),h
+
+	cmp	_INP_END(%rsp), INP
+	ja	done_hash
+
+	#### Do second block using previously scheduled results
+	xor	SRND, SRND
+.align 16
+loop3:
+	DO_4ROUNDS	 _XFER + 0*32 + 16
+	DO_4ROUNDS	 _XFER + 1*32 + 16
+	add	$2*32, SRND
+	cmp	$4*4*32, SRND
+	jb	loop3
+
+	mov	_CTX(%rsp), CTX
+	mov	_INP(%rsp), INP
+	add	$64, INP
+
+	addm    (4*0)(CTX),a
+	addm    (4*1)(CTX),b
+	addm    (4*2)(CTX),c
+	addm    (4*3)(CTX),d
+	addm    (4*4)(CTX),e
+	addm    (4*5)(CTX),f
+	addm    (4*6)(CTX),g
+	addm    (4*7)(CTX),h
+
+	cmp	_INP_END(%rsp), INP
+	jb	loop0
+	ja	done_hash
+
+do_last_block:
+	#### do last block
+	lea	K256(%rip), TBL
+
+	VMOVDQ	0*16(INP),XWORD0
+	VMOVDQ	1*16(INP),XWORD1
+	VMOVDQ	2*16(INP),XWORD2
+	VMOVDQ	3*16(INP),XWORD3
+
+	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
+	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
+	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
+	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3
+
+	jmp	last_block_enter
+
+only_one_block:
+
+	## load initial digest
+	mov	(4*0)(CTX),a
+	mov	(4*1)(CTX),b
+	mov	(4*2)(CTX),c
+	mov	(4*3)(CTX),d
+	mov	(4*4)(CTX),e
+	mov	(4*5)(CTX),f
+	mov	(4*6)(CTX),g
+	mov	(4*7)(CTX),h
+
+	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
+	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00
+
+	mov	CTX, _CTX(%rsp)
+	jmp	do_last_block
+
+done_hash:
+
+	mov	_RSP(%rsp), %rsp
+
+	popq	%r15
+	popq	%r14
+	popq	%r13
+	popq	%r12
+	popq	%rbp
+	popq	%rbx
+	ret
+ENDPROC(sha256_transform_rorx)
+
+.data
+.align 64
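+# SHA-256 round constants; each group of four is stored twice so a single
+# 256-bit load supplies both 128-bit lanes of the two interleaved blocks.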
+K256:
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
+
+# shuffle xBxA -> 00BA
+_SHUF_00BA:
+	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+# shuffle xDxC -> DC00
+_SHUF_DC00:
+	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
+#endif

+ 506 - 0
arch/x86/crypto/sha256-ssse3-asm.S

@@ -0,0 +1,506 @@
+########################################################################
+# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define    MOVDQ movdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+        add     \p1, \p2
+        mov     \p2, \p1
+.endm
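+# e.g. "addm (4*0)(CTX), a" computes a += [CTX], then stores the sum back
+# to [CTX]; used below to merge each block's result into the digest.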
+
+################################
+
+# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+# Load xmm with mem and byte swap each dword
+.macro COPY_XMM_AND_BSWAP p1 p2 p3
+        MOVDQ \p2, \p1
+        pshufb \p3, \p1
+.endm
+
+################################
+
+X0 = %xmm4
+X1 = %xmm5
+X2 = %xmm6
+X3 = %xmm7
+
+XTMP0 = %xmm0
+XTMP1 = %xmm1
+XTMP2 = %xmm2
+XTMP3 = %xmm3
+XTMP4 = %xmm8
+XFER = %xmm9
+
+SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
+SHUF_DC00 = %xmm11      # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %xmm12
+
+NUM_BLKS = %rdx   # 3rd arg
+CTX = %rsi        # 2nd arg
+INP = %rdi        # 1st arg
+
+SRND = %rdi       # clobbers INP
+c = %ecx
+d = %r8d
+e = %edx
+TBL = %rbp
+a = %eax
+b = %ebx
+
+f = %r9d
+g = %r10d
+h = %r11d
+
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_XFER_SIZE = 8
+_XMM_SAVE_SIZE = 0
+
+_INP_END = 0
+_INP            = _INP_END  + _INP_END_SIZE
+_XFER           = _INP      + _INP_SIZE
+_XMM_SAVE       = _XFER     + _XFER_SIZE
+STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+X_ = X0
+X0 = X1
+X1 = X2
+X2 = X3
+X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+	## compute s0 four at a time and s1 two at a time
+	## compute W[-16] + W[-7] 4 at a time
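+	##
+	## SHA-256 message schedule, for reference:
+	##   s0(x) = (x ror 7) ^ (x ror 18) ^ (x >> 3)
+	##   s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
+	##   W[t]  = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]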
+	movdqa  X3, XTMP0
+	mov     e, y0			# y0 = e
+	ror     $(25-11), y0            # y0 = e >> (25-11)
+	mov     a, y1                   # y1 = a
+	palignr $4, X2, XTMP0           # XTMP0 = W[-7]
+	ror     $(22-13), y1            # y1 = a >> (22-13)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	movdqa  X1, XTMP1
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	xor     g, y2                   # y2 = f^g
+	paddd   X0, XTMP0               # XTMP0 = W[-7] + W[-16]
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	## compute s0
+	palignr $4, X0, XTMP1           # XTMP1 = W[-15]
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	movdqa  XTMP1, XTMP2            # XTMP2 = W[-15]
+	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     y0, y2                  # y2 = S1 + CH
+	add     _XFER(%rsp), y2         # y2 = k + w + S1 + CH
+	movdqa  XTMP1, XTMP3            # XTMP3 = W[-15]
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	pslld   $(32-7), XTMP1          #
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	psrld   $7, XTMP2               #
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	por     XTMP2, XTMP1            # XTMP1 = W[-15] ror 7
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+					#
+	ROTATE_ARGS                     #
+	movdqa  XTMP3, XTMP2            # XTMP2 = W[-15]
+	mov     e, y0                   # y0 = e
+	mov     a, y1                   # y1 = a
+	movdqa  XTMP3, XTMP4            # XTMP4 = W[-15]
+	ror     $(25-11), y0            # y0 = e >> (25-11)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	ror     $(22-13), y1            # y1 = a >> (22-13)
+	pslld   $(32-18), XTMP3         #
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor     g, y2                   # y2 = f^g
+	psrld   $18, XTMP2              #
+	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	pxor    XTMP3, XTMP1
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	psrld   $3, XTMP4               # XTMP4 = W[-15] >> 3
+	add     y0, y2                  # y2 = S1 + CH
+	add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	pxor    XTMP2, XTMP1            # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	pxor    XTMP4, XTMP1            # XTMP1 = s0
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	## compute low s1
+	pshufd  $0b11111010, X3, XTMP2   # XTMP2 = W[-2] {BBAA}
+	and     b, y0			# y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	paddd   XTMP1, XTMP0            # XTMP0 = W[-16] + W[-7] + s0
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+
+	ROTATE_ARGS
+	movdqa  XTMP2, XTMP3            # XTMP3 = W[-2] {BBAA}
+	mov     e, y0                   # y0 = e
+	mov     a, y1                   # y1 = a
+	ror     $(25-11), y0            # y0 = e >> (25-11)
+	movdqa  XTMP2, XTMP4            # XTMP4 = W[-2] {BBAA}
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	ror     $(22-13), y1            # y1 = a >> (22-13)
+	mov     f, y2                   # y2 = f
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	psrlq   $17, XTMP2              # XTMP2 = W[-2] ror 17 {xBxA}
+	xor     g, y2                   # y2 = f^g
+	psrlq   $19, XTMP3              # XTMP3 = W[-2] ror 19 {xBxA}
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	psrld   $10, XTMP4              # XTMP4 = W[-2] >> 10 {BBAA}
+	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	pxor    XTMP3, XTMP2
+	add     y0, y2                  # y2 = S1 + CH
+	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	pxor    XTMP2, XTMP4            # XTMP4 = s1 {xBxA}
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	pshufb  SHUF_00BA, XTMP4        # XTMP4 = s1 {00BA}
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	paddd   XTMP4, XTMP0            # XTMP0 = {..., ..., W[1], W[0]}
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	## compute high s1
+	pshufd  $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+					#
+	ROTATE_ARGS                     #
+	movdqa  XTMP2, XTMP3            # XTMP3 = W[-2] {DDCC}
+	mov     e, y0                   # y0 = e
+	ror     $(25-11), y0            # y0 = e >> (25-11)
+	mov     a, y1                   # y1 = a
+	movdqa  XTMP2, X0               # X0    = W[-2] {DDCC}
+	ror     $(22-13), y1            # y1 = a >> (22-13)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	psrlq   $17, XTMP2              # XTMP2 = W[-2] ror 17 {xDxC}
+	xor	a, y1                   # y1 = a ^ (a >> (22-13))
+	xor     g, y2                   # y2 = f^g
+	psrlq   $19, XTMP3              # XTMP3 = W[-2] ror 19 {xDxC}
+	xor	e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	psrld   $10, X0                 # X0 = W[-2] >> 10 {DDCC}
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	pxor    XTMP3, XTMP2            #
+	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     y0, y2                  # y2 = S1 + CH
+	add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	pxor    XTMP2, X0               # X0 = s1 {xDxC}
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	pshufb  SHUF_DC00, X0           # X0 = s1 {DC00}
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	paddd   XTMP0, X0               # X0 = {W[3], W[2], W[1], W[0]}
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+
+	ROTATE_ARGS
+	rotate_Xs
+.endm
+
+## input is [rsp + _XFER + \round * 4]
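+## One SHA-256 round, for reference (computed below on y0/y1/y2):
+##   S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
+##   CH  = (e & f) ^ (~e & g)          (computed here as ((f^g)&e)^g)
+##   S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
+##   MAJ = (a & b) ^ (a & c) ^ (b & c) (computed here as ((a|c)&b)|(a&c))
+##   t1 = h + S1 + CH + K[t] + W[t]; t2 = S0 + MAJ; d += t1; h = t1 + t2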
+.macro DO_ROUND round
+	mov     e, y0                 # y0 = e
+	ror     $(25-11), y0          # y0 = e >> (25-11)
+	mov     a, y1                 # y1 = a
+	xor     e, y0                 # y0 = e ^ (e >> (25-11))
+	ror     $(22-13), y1          # y1 = a >> (22-13)
+	mov     f, y2                 # y2 = f
+	xor     a, y1                 # y1 = a ^ (a >> (22-13))
+	ror     $(11-6), y0           # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor     g, y2                 # y2 = f^g
+	xor     e, y0                 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	ror     $(13-2), y1           # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	and     e, y2                 # y2 = (f^g)&e
+	xor     a, y1                 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror     $6, y0                # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	xor     g, y2                 # y2 = CH = ((f^g)&e)^g
+	add     y0, y2                # y2 = S1 + CH
+	ror     $2, y1                # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	offset = \round * 4 + _XFER
+	add     offset(%rsp), y2      # y2 = k + w + S1 + CH
+	mov     a, y0                 # y0 = a
+	add     y2, h                 # h = h + S1 + CH + k + w
+	mov     a, y2                 # y2 = a
+	or      c, y0                 # y0 = a|c
+	add     h, d                  # d = d + h + S1 + CH + k + w
+	and     c, y2                 # y2 = a&c
+	and     b, y0                 # y0 = (a|c)&b
+	add     y1, h                 # h = h + S1 + CH + k + w + S0
+	or      y2, y0		      # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h		      # h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+.endm
+
+########################################################################
+## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks)
+## arg 1 : pointer to input data
+## arg 2 : pointer to digest
+## arg 3 : Num blocks
+########################################################################
+.text
+ENTRY(sha256_transform_ssse3)
+.align 32
+	pushq   %rbx
+	pushq   %rbp
+	pushq   %r13
+	pushq   %r14
+	pushq   %r15
+	pushq   %r12
+
+	mov	%rsp, %r12
+	subq    $STACK_SIZE, %rsp
+	and	$~15, %rsp
+
+	shl     $6, NUM_BLKS		 # convert to bytes
+	jz      done_hash
+	add     INP, NUM_BLKS
+	mov     NUM_BLKS, _INP_END(%rsp) # pointer to end of data
+
+	## load initial digest
+	mov     4*0(CTX), a
+	mov     4*1(CTX), b
+	mov     4*2(CTX), c
+	mov     4*3(CTX), d
+	mov     4*4(CTX), e
+	mov     4*5(CTX), f
+	mov     4*6(CTX), g
+	mov     4*7(CTX), h
+
+	movdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+	movdqa  _SHUF_00BA(%rip), SHUF_00BA
+	movdqa  _SHUF_DC00(%rip), SHUF_DC00
+
+loop0:
+	lea     K256(%rip), TBL
+
+	## byte swap first 16 dwords
+	COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK
+
+	mov     INP, _INP(%rsp)
+
+	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
+	mov     $3, SRND
+.align 16
+loop1:
+	movdqa  (TBL), XFER
+	paddd   X0, XFER
+	movdqa  XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa  1*16(TBL), XFER
+	paddd   X0, XFER
+	movdqa  XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa  2*16(TBL), XFER
+	paddd   X0, XFER
+	movdqa  XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa  3*16(TBL), XFER
+	paddd   X0, XFER
+	movdqa  XFER, _XFER(%rsp)
+	add     $4*16, TBL
+	FOUR_ROUNDS_AND_SCHED
+
+	sub     $1, SRND
+	jne     loop1
+
+	mov     $2, SRND
+loop2:
+	paddd   (TBL), X0
+	movdqa  X0, _XFER(%rsp)
+	DO_ROUND        0
+	DO_ROUND        1
+	DO_ROUND        2
+	DO_ROUND        3
+	paddd   1*16(TBL), X1
+	movdqa  X1, _XFER(%rsp)
+	add     $2*16, TBL
+	DO_ROUND        0
+	DO_ROUND        1
+	DO_ROUND        2
+	DO_ROUND        3
+
+	movdqa  X2, X0
+	movdqa  X3, X1
+
+	sub     $1, SRND
+	jne     loop2
+
+	addm    (4*0)(CTX),a
+	addm    (4*1)(CTX),b
+	addm    (4*2)(CTX),c
+	addm    (4*3)(CTX),d
+	addm    (4*4)(CTX),e
+	addm    (4*5)(CTX),f
+	addm    (4*6)(CTX),g
+	addm    (4*7)(CTX),h
+
+	mov     _INP(%rsp), INP
+	add     $64, INP
+	cmp     _INP_END(%rsp), INP
+	jne     loop0
+
+done_hash:
+
+	mov	%r12, %rsp
+
+	popq    %r12
+	popq    %r15
+	popq    %r14
+	popq    %r13
+	popq    %rbp
+	popq    %rbx
+
+	ret
+ENDPROC(sha256_transform_ssse3)
+
+.data
+.align 64
+K256:
+        .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+        .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+        .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+        .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+        .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+        .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+        .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+        .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+        .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+        .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+        .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+        .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+        .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+        .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+        .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+        .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa 0x0c0d0e0f08090a0b0405060700010203
+
+# shuffle xBxA -> 00BA
+_SHUF_00BA:
+	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+# shuffle xDxC -> DC00
+_SHUF_DC00:
+	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF

+ 275 - 0
arch/x86/crypto/sha256_ssse3_glue.c

@@ -0,0 +1,275 @@
+/*
+ * Cryptographic API.
+ *
+ * Glue code for the SHA256 Secure Hash Algorithm assembler
+ * implementation using supplemental SSE3 / AVX / AVX2 instructions.
+ *
+ * This file is based on sha256_generic.c
+ *
+ * Copyright (C) 2013 Intel Corporation.
+ *
+ * Author:
+ *     Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <linux/string.h>
+
+asmlinkage void sha256_transform_ssse3(const char *data, u32 *digest,
+				     u64 rounds);
+#ifdef CONFIG_AS_AVX
+asmlinkage void sha256_transform_avx(const char *data, u32 *digest,
+				     u64 rounds);
+#endif
+#ifdef CONFIG_AS_AVX2
+asmlinkage void sha256_transform_rorx(const char *data, u32 *digest,
+				     u64 rounds);
+#endif
+
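+/*
+ * Dispatch pointer: sha256_ssse3_mod_init() below points this at the
+ * fastest transform (SSSE3, AVX or AVX2) the CPU and assembler support.
+ */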
+static asmlinkage void (*sha256_transform_asm)(const char *, u32 *, u64);
+
+
+static int sha256_ssse3_init(struct shash_desc *desc)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	sctx->state[0] = SHA256_H0;
+	sctx->state[1] = SHA256_H1;
+	sctx->state[2] = SHA256_H2;
+	sctx->state[3] = SHA256_H3;
+	sctx->state[4] = SHA256_H4;
+	sctx->state[5] = SHA256_H5;
+	sctx->state[6] = SHA256_H6;
+	sctx->state[7] = SHA256_H7;
+	sctx->count = 0;
+
+	return 0;
+}
+
+static int __sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int len, unsigned int partial)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int done = 0;
+
+	sctx->count += len;
+
+	if (partial) {
+		done = SHA256_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha256_transform_asm(sctx->buf, sctx->state, 1);
+	}
+
+	if (len - done >= SHA256_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+		sha256_transform_asm(data + done, sctx->state, (u64) rounds);
+
+		done += rounds * SHA256_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buf, data + done, len - done);
+
+	return 0;
+}
+
+static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
+			     unsigned int len)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+	int res;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA256_BLOCK_SIZE) {
+		sctx->count += len;
+		memcpy(sctx->buf + partial, data, len);
+
+		return 0;
+	}
+
+	if (!irq_fpu_usable()) {
+		res = crypto_sha256_update(desc, data, len);
+	} else {
+		kernel_fpu_begin();
+		res = __sha256_ssse3_update(desc, data, len, partial);
+		kernel_fpu_end();
+	}
+
+	return res;
+}
+
+
+/* Add padding and return the message digest. */
+static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be32 *dst = (__be32 *)out;
+	__be64 bits;
+	static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+	bits = cpu_to_be64(sctx->count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->count % SHA256_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
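+	/* e.g. index 50 -> padlen = 6; index 60 -> padlen = 64+56-60 = 60,
+	 * so the padding spills into a second block */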
+
+	if (!irq_fpu_usable()) {
+		crypto_sha256_update(desc, padding, padlen);
+		crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits));
+	} else {
+		kernel_fpu_begin();
+		/* We need to fill a whole block for __sha256_ssse3_update() */
+		if (padlen <= 56) {
+			sctx->count += padlen;
+			memcpy(sctx->buf + index, padding, padlen);
+		} else {
+			__sha256_ssse3_update(desc, padding, padlen, index);
+		}
+		__sha256_ssse3_update(desc, (const u8 *)&bits,
+					sizeof(bits), 56);
+		kernel_fpu_end();
+	}
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha256_ssse3_export(struct shash_desc *desc, void *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha256_ssse3_import(struct shash_desc *desc, const void *in)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static struct shash_alg alg = {
+	.digestsize	=	SHA256_DIGEST_SIZE,
+	.init		=	sha256_ssse3_init,
+	.update		=	sha256_ssse3_update,
+	.final		=	sha256_ssse3_final,
+	.export		=	sha256_ssse3_export,
+	.import		=	sha256_ssse3_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha256",
+		.cra_driver_name =	"sha256-ssse3",
+		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA256_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+#ifdef CONFIG_AS_AVX
+static bool __init avx_usable(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx || !cpu_has_osxsave)
+		return false;
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX detected but unusable.\n");
+
+		return false;
+	}
+
+	return true;
+}
+#endif
+
+static int __init sha256_ssse3_mod_init(void)
+{
+	/* test for SSE3 first */
+	/* test for SSSE3 first */
+		sha256_transform_asm = sha256_transform_ssse3;
+
+#ifdef CONFIG_AS_AVX
+	/* allow AVX to override SSSE3, it's a little faster */
+	if (avx_usable()) {
+#ifdef CONFIG_AS_AVX2
+		if (boot_cpu_has(X86_FEATURE_AVX2))
+			sha256_transform_asm = sha256_transform_rorx;
+		else
+#endif
+			sha256_transform_asm = sha256_transform_avx;
+	}
+#endif
+
+	if (sha256_transform_asm) {
+#ifdef CONFIG_AS_AVX
+		if (sha256_transform_asm == sha256_transform_avx)
+			pr_info("Using AVX optimized SHA-256 implementation\n");
+#ifdef CONFIG_AS_AVX2
+		else if (sha256_transform_asm == sha256_transform_rorx)
+			pr_info("Using AVX2 optimized SHA-256 implementation\n");
+#endif
+		else
+#endif
+			pr_info("Using SSSE3 optimized SHA-256 implementation\n");
+		return crypto_register_shash(&alg);
+	}
+	pr_info("Neither AVX nor SSSE3 is available/usable.\n");
+
+	return -ENODEV;
+}
+
+static void __exit sha256_ssse3_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(sha256_ssse3_mod_init);
+module_exit(sha256_ssse3_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated");
+
+MODULE_ALIAS("sha256");

+ 423 - 0
arch/x86/crypto/sha512-avx-asm.S

@@ -0,0 +1,423 @@
+########################################################################
+# Implement fast SHA-512 with AVX instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     David Cote <david.m.cote@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#ifdef CONFIG_AS_AVX
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+# ARG1
+msg	= %rdi
+# ARG2
+digest	= %rsi
+# ARG3
+msglen	= %rdx
+T1	= %rcx
+T2	= %r8
+a_64	= %r9
+b_64	= %r10
+c_64	= %r11
+d_64	= %r12
+e_64	= %r13
+f_64	= %r14
+g_64	= %r15
+h_64	= %rbx
+tmp0	= %rax
+
+# Local variables (stack frame)
+
+# Message Schedule
+W_SIZE = 80*8
+# W[t] + K[t] | W[t+1] + K[t+1]
+WK_SIZE = 2*8
+RSPSAVE_SIZE = 1*8
+GPRSAVE_SIZE = 5*8
+
+frame_W = 0
+frame_WK = frame_W + W_SIZE
+frame_RSPSAVE = frame_WK + WK_SIZE
+frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
+frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+
+# Useful QWORD "arrays" for simpler memory references
+# MSG, DIGEST, K_t, W_t are arrays
+# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
+
+# Input message (arg1)
+#define MSG(i)    8*i(msg)
+
+# Output Digest (arg2)
+#define DIGEST(i) 8*i(digest)
+
+# SHA Constants (static mem)
+#define K_t(i)    8*i+K512(%rip)
+
+# Message Schedule (stack frame)
+#define W_t(i)    8*i+frame_W(%rsp)
+
+# W[t]+K[t] (stack frame)
+#define WK_2(i)   8*((i%2))+frame_WK(%rsp)
+
+.macro RotateState
+	# Rotate symbols a..h right
+	TMP   = h_64
+	h_64  = g_64
+	g_64  = f_64
+	f_64  = e_64
+	e_64  = d_64
+	d_64  = c_64
+	c_64  = b_64
+	b_64  = a_64
+	a_64  = TMP
+.endm
+
+.macro RORQ p1 p2
+	# shld is faster than ror on Sandybridge
+	shld	$(64-\p2), \p1, \p1
+.endm
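+# e.g. "RORQ tmp0, 23" rotates tmp0 right by 23: shld-ing a register with
+# itself by (64-n) rotates it left by 64-n, which equals ror by n.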
+
+.macro SHA512_Round rnd
+	# Compute Round %%t
+	mov     f_64, T1          # T1 = f
+	mov     e_64, tmp0        # tmp = e
+	xor     g_64, T1          # T1 = f ^ g
+	RORQ    tmp0, 23   # 41    # tmp = e ror 23
+	and     e_64, T1          # T1 = (f ^ g) & e
+	xor     e_64, tmp0        # tmp = (e ror 23) ^ e
+	xor     g_64, T1          # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+	idx = \rnd
+	add     WK_2(idx), T1     # W[t] + K[t] from message scheduler
+	RORQ    tmp0, 4   # 18    # tmp = ((e ror 23) ^ e) ror 4
+	xor     e_64, tmp0        # tmp = (((e ror 23) ^ e) ror 4) ^ e
+	mov     a_64, T2          # T2 = a
+	add     h_64, T1          # T1 = CH(e,f,g) + W[t] + K[t] + h
+	RORQ    tmp0, 14  # 14    # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+	add     tmp0, T1          # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+	mov     a_64, tmp0        # tmp = a
+	xor     c_64, T2          # T2 = a ^ c
+	and     c_64, tmp0        # tmp = a & c
+	and     b_64, T2          # T2 = (a ^ c) & b
+	xor     tmp0, T2          # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+	mov     a_64, tmp0        # tmp = a
+	RORQ    tmp0, 5  # 39     # tmp = a ror 5
+	xor     a_64, tmp0        # tmp = (a ror 5) ^ a
+	add     T1, d_64          # e(next_state) = d + T1
+	RORQ    tmp0, 6  # 34     # tmp = ((a ror 5) ^ a) ror 6
+	xor     a_64, tmp0        # tmp = (((a ror 5) ^ a) ror 6) ^ a
+	lea     (T1, T2), h_64    # a(next_state) = T1 + Maj(a,b,c)
+	RORQ    tmp0, 28  # 28    # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+	add     tmp0, h_64        # a(next_state) = T1 + Maj(a,b,c) S0(a)
+	RotateState
+.endm
+
+.macro SHA512_2Sched_2Round_avx rnd
+	# Compute rounds t-2 and t-1
+	# Compute message schedule QWORDS t and t+1
+
+	#   Two rounds are computed based on the values for K[t-2]+W[t-2] and
+	# K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+	# scheduler.
+	#   The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
+	# They are then added to their respective SHA512 constants at
+	# [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)]
+	#   For brevity, the comments following vectored instructions only refer to
+	# the first of a pair of QWORDS.
+	# Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
+	#   The computation of the message schedule and the rounds are tightly
+	# stitched to take advantage of instruction-level parallelism.
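+	#   SHA-512 message schedule, for reference:
+	#     s0(x) = (x ror 1) ^ (x ror 8) ^ (x >> 7)
+	#     s1(x) = (x ror 19) ^ (x ror 61) ^ (x >> 6)
+	#     W[t]  = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]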
+
+	idx = \rnd - 2
+	vmovdqa	W_t(idx), %xmm4		# XMM4 = W[t-2]
+	idx = \rnd - 15
+	vmovdqu	W_t(idx), %xmm5		# XMM5 = W[t-15]
+	mov	f_64, T1
+	vpsrlq	$61, %xmm4, %xmm0	# XMM0 = W[t-2]>>61
+	mov	e_64, tmp0
+	vpsrlq	$1, %xmm5, %xmm6	# XMM6 = W[t-15]>>1
+	xor	g_64, T1
+	RORQ	tmp0, 23 # 41
+	vpsrlq	$19, %xmm4, %xmm1	# XMM1 = W[t-2]>>19
+	and	e_64, T1
+	xor	e_64, tmp0
+	vpxor	%xmm1, %xmm0, %xmm0	# XMM0 = W[t-2]>>61 ^ W[t-2]>>19
+	xor	g_64, T1
+	idx = \rnd
+	add	WK_2(idx), T1
+	vpsrlq	$8, %xmm5, %xmm7	# XMM7 = W[t-15]>>8
+	RORQ	tmp0, 4 # 18
+	vpsrlq	$6, %xmm4, %xmm2	# XMM2 = W[t-2]>>6
+	xor	e_64, tmp0
+	mov	a_64, T2
+	add	h_64, T1
+	vpxor	%xmm7, %xmm6, %xmm6	# XMM6 = W[t-15]>>1 ^ W[t-15]>>8
+	RORQ	tmp0, 14 # 14
+	add	tmp0, T1
+	vpsrlq	$7, %xmm5, %xmm8	# XMM8 = W[t-15]>>7
+	mov	a_64, tmp0
+	xor	c_64, T2
+	vpsllq	$(64-61), %xmm4, %xmm3  # XMM3 = W[t-2]<<3
+	and	c_64, tmp0
+	and	b_64, T2
+	vpxor	%xmm3, %xmm2, %xmm2	# XMM2 = W[t-2]>>6 ^ W[t-2]<<3
+	xor	tmp0, T2
+	mov	a_64, tmp0
+	vpsllq	$(64-1), %xmm5, %xmm9	# XMM9 = W[t-15]<<63
+	RORQ	tmp0, 5 # 39
+	vpxor	%xmm9, %xmm8, %xmm8	# XMM8 = W[t-15]>>7 ^ W[t-15]<<63
+	xor	a_64, tmp0
+	add	T1, d_64
+	RORQ	tmp0, 6 # 34
+	xor	a_64, tmp0
+	vpxor	%xmm8, %xmm6, %xmm6	# XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^
+					#  W[t-15]>>7 ^ W[t-15]<<63
+	lea	(T1, T2), h_64
+	RORQ	tmp0, 28 # 28
+	vpsllq	$(64-19), %xmm4, %xmm4  # XMM4 = W[t-2]<<25
+	add	tmp0, h_64
+	RotateState
+	vpxor	%xmm4, %xmm0, %xmm0     # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^
+					#        W[t-2]<<25
+	mov	f_64, T1
+	vpxor	%xmm2, %xmm0, %xmm0     # XMM0 = s1(W[t-2])
+	mov	e_64, tmp0
+	xor	g_64, T1
+	idx = \rnd - 16
+	vpaddq	W_t(idx), %xmm0, %xmm0  # XMM0 = s1(W[t-2]) + W[t-16]
+	idx = \rnd - 7
+	vmovdqu	W_t(idx), %xmm1		# XMM1 = W[t-7]
+	RORQ	tmp0, 23 # 41
+	and	e_64, T1
+	xor	e_64, tmp0
+	xor	g_64, T1
+	vpsllq	$(64-8), %xmm5, %xmm5   # XMM5 = W[t-15]<<56
+	idx = \rnd + 1
+	add	WK_2(idx), T1
+	vpxor	%xmm5, %xmm6, %xmm6     # XMM6 = s0(W[t-15])
+	RORQ	tmp0, 4 # 18
+	vpaddq	%xmm6, %xmm0, %xmm0     # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
+	xor	e_64, tmp0
+	vpaddq	%xmm1, %xmm0, %xmm0     # XMM0 = W[t] = s1(W[t-2]) + W[t-7] +
+					#               s0(W[t-15]) + W[t-16]
+	mov	a_64, T2
+	add	h_64, T1
+	RORQ	tmp0, 14 # 14
+	add	tmp0, T1
+	idx = \rnd
+	vmovdqa	%xmm0, W_t(idx)		# Store W[t]
+	vpaddq	K_t(idx), %xmm0, %xmm0  # Compute W[t]+K[t]
+	vmovdqa	%xmm0, WK_2(idx)	# Store W[t]+K[t] for next rounds
+	mov	a_64, tmp0
+	xor	c_64, T2
+	and	c_64, tmp0
+	and	b_64, T2
+	xor	tmp0, T2
+	mov	a_64, tmp0
+	RORQ	tmp0, 5 # 39
+	xor	a_64, tmp0
+	add	T1, d_64
+	RORQ	tmp0, 6 # 34
+	xor	a_64, tmp0
+	lea	(T1, T2), h_64
+	RORQ	tmp0, 28 # 28
+	add	tmp0, h_64
+	RotateState
+.endm
+
+########################################################################
+# void sha512_transform_avx(const void* M, void* D, u64 L)
+# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+# The size of the message pointed to by M must be an integer multiple of the
+# SHA512 block size (128 bytes).
+# L is the message length in SHA512 blocks
+########################################################################
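+# A minimal C-side sketch of a call (assuming an asmlinkage prototype
+# matching the signature above):
+#   sha512_transform_avx(block, sctx->state, 1);  /* one 128-byte block */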
+ENTRY(sha512_transform_avx)
+	cmp $0, msglen
+	je nowork
+
+	# Allocate Stack Space
+	mov	%rsp, %rax
+	sub     $frame_size, %rsp
+	and	$~(0x20 - 1), %rsp
+	mov	%rax, frame_RSPSAVE(%rsp)
+
+	# Save GPRs
+	mov     %rbx, frame_GPRSAVE(%rsp)
+	mov     %r12, frame_GPRSAVE +8*1(%rsp)
+	mov     %r13, frame_GPRSAVE +8*2(%rsp)
+	mov     %r14, frame_GPRSAVE +8*3(%rsp)
+	mov     %r15, frame_GPRSAVE +8*4(%rsp)
+
+updateblock:
+
+	# Load state variables
+	mov     DIGEST(0), a_64
+	mov     DIGEST(1), b_64
+	mov     DIGEST(2), c_64
+	mov     DIGEST(3), d_64
+	mov     DIGEST(4), e_64
+	mov     DIGEST(5), f_64
+	mov     DIGEST(6), g_64
+	mov     DIGEST(7), h_64
+
+	t = 0
+	.rept 80/2 + 1
+	# (80 rounds) / (2 rounds/iteration) + (1 iteration)
+	# +1 iteration because the scheduler leads hashing by 1 iteration
+		.if t < 2
+			# BSWAP 2 QWORDS
+			vmovdqa  XMM_QWORD_BSWAP(%rip), %xmm1
+			vmovdqu  MSG(t), %xmm0
+			vpshufb  %xmm1, %xmm0, %xmm0    # BSWAP
+			vmovdqa  %xmm0, W_t(t) # Store Scheduled Pair
+			vpaddq   K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
+			vmovdqa  %xmm0, WK_2(t) # Store into WK for rounds
+		.elseif t < 16
+			# BSWAP 2 QWORDS, Compute 2 Rounds
+			vmovdqu  MSG(t), %xmm0
+			vpshufb  %xmm1, %xmm0, %xmm0    # BSWAP
+			SHA512_Round t-2    # Round t-2
+			vmovdqa  %xmm0, W_t(t) # Store Scheduled Pair
+			vpaddq   K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
+			SHA512_Round t-1    # Round t-1
+			vmovdqa  %xmm0, WK_2(t) # Store W[t]+K[t] into WK
+		.elseif t < 79
+			# Schedule 2 QWORDS, Compute 2 Rounds
+			SHA512_2Sched_2Round_avx t
+		.else
+			# Compute 2 Rounds
+			SHA512_Round t-2
+			SHA512_Round t-1
+		.endif
+		t = t+2
+	.endr
+
+	# Update digest
+	add     a_64, DIGEST(0)
+	add     b_64, DIGEST(1)
+	add     c_64, DIGEST(2)
+	add     d_64, DIGEST(3)
+	add     e_64, DIGEST(4)
+	add     f_64, DIGEST(5)
+	add     g_64, DIGEST(6)
+	add     h_64, DIGEST(7)
+
+	# Advance to next message block
+	add     $16*8, msg
+	dec     msglen
+	jnz     updateblock
+
+	# Restore GPRs
+	mov     frame_GPRSAVE(%rsp),      %rbx
+	mov     frame_GPRSAVE +8*1(%rsp), %r12
+	mov     frame_GPRSAVE +8*2(%rsp), %r13
+	mov     frame_GPRSAVE +8*3(%rsp), %r14
+	mov     frame_GPRSAVE +8*4(%rsp), %r15
+
+	# Restore Stack Pointer
+	mov	frame_RSPSAVE(%rsp), %rsp
+
+nowork:
+	ret
+ENDPROC(sha512_transform_avx)
+
+########################################################################
+### Binary Data
+
+.data
+
+.align 16
+
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+	.octa 0x08090a0b0c0d0e0f0001020304050607
+
+# K[t] used in SHA512 hashing
+K512:
+	.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad 0x3956c25bf348b538,0x59f111f1b605d019
+	.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad 0xd807aa98a3030242,0x12835b0145706fbe
+	.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad 0x06ca6351e003826f,0x142929670a0e6e70
+	.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad 0x81c2c92e47edaee6,0x92722c851482353b
+	.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad 0xd192e819d6ef5218,0xd69906245565a910
+	.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad 0x90befffa23631e28,0xa4506cebde82bde9
+	.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad 0xca273eceea26619c,0xd186b8c721c0c207
+	.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad 0x113f9804bef90dae,0x1b710b35131c471b
+	.quad 0x28db77f523047d84,0x32caab7b40c72493
+	.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+#endif

+ 743 - 0
arch/x86/crypto/sha512-avx2-asm.S

@@ -0,0 +1,743 @@
+########################################################################
+# Implement fast SHA-512 with AVX2 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     David Cote <david.m.cote@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 1 block at a time, with 4 lanes per block
+########################################################################
+
+#ifdef CONFIG_AS_AVX2
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+Y_0 = %ymm4
+Y_1 = %ymm5
+Y_2 = %ymm6
+Y_3 = %ymm7
+
+YTMP0 = %ymm0
+YTMP1 = %ymm1
+YTMP2 = %ymm2
+YTMP3 = %ymm3
+YTMP4 = %ymm8
+XFER  = YTMP0
+
+BYTE_FLIP_MASK  = %ymm9
+
+# 1st arg
+INP         = %rdi
+# 2nd arg
+CTX         = %rsi
+# 3rd arg
+NUM_BLKS    = %rdx
+
+c           = %rcx
+d           = %r8
+e           = %rdx
+y3          = %rdi
+
+TBL   = %rbp
+
+a     = %rax
+b     = %rbx
+
+f     = %r9
+g     = %r10
+h     = %r11
+old_h = %r11
+
+T1    = %r12
+y0    = %r13
+y1    = %r14
+y2    = %r15
+
+y4    = %r12
+
+# Local variables (stack frame)
+XFER_SIZE = 4*8
+SRND_SIZE = 1*8
+INP_SIZE = 1*8
+INPEND_SIZE = 1*8
+RSPSAVE_SIZE = 1*8
+GPRSAVE_SIZE = 6*8
+
+frame_XFER = 0
+frame_SRND = frame_XFER + XFER_SIZE
+frame_INP = frame_SRND + SRND_SIZE
+frame_INPEND = frame_INP + INP_SIZE
+frame_RSPSAVE = frame_INPEND + INPEND_SIZE
+frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
+frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+
+## assume buffers not aligned
+#define	VMOVDQ vmovdqu
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+	add	\p1, \p2
+	mov	\p2, \p1
+.endm
+
+
+# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
+# Load ymm with mem and byte swap each qword
+.macro COPY_YMM_AND_BSWAP p1 p2 p3
+	VMOVDQ \p2, \p1
+	vpshufb \p3, \p1, \p1
+.endm
+# rotate_Ys
+# Rotate values of symbols Y0...Y3
+.macro rotate_Ys
+	Y_ = Y_0
+	Y_0 = Y_1
+	Y_1 = Y_2
+	Y_2 = Y_3
+	Y_3 = Y_
+.endm
+
+# RotateState
+.macro RotateState
+	# Rotate symbols a..h right
+	old_h  = h
+	TMP_   = h
+	h      = g
+	g      = f
+	f      = e
+	e      = d
+	d      = c
+	c      = b
+	b      = a
+	a      = TMP_
+.endm
+
+# macro MY_VPALIGNR	YDST, YSRC1, YSRC2, RVAL
+# YDST = {YSRC1, YSRC2} >> RVAL*8
+.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
+	vperm2f128      $0x3, \YSRC2, \YSRC1, \YDST     # YDST = {YS1_LO, YS2_HI}
+	vpalignr        $\RVAL, \YSRC2, \YDST, \YDST    # YDST = {YSRC1, YSRC2} >> RVAL*8
+.endm
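+# e.g. "MY_VPALIGNR YTMP0, Y_3, Y_2, 8" below extracts the W[-7] qwords by
+# shifting the concatenation of the two newest schedule registers right by
+# 8 bytes.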
+
+.macro FOUR_ROUNDS_AND_SCHED
+################################### RND N + 0 #########################################
+
+	# Extract w[t-7]
+	MY_VPALIGNR	YTMP0, Y_3, Y_2, 8		# YTMP0 = W[-7]
+	# Calculate w[t-16] + w[t-7]
+	vpaddq		Y_0, YTMP0, YTMP0		# YTMP0 = W[-7] + W[-16]
+	# Extract w[t-15]
+	MY_VPALIGNR	YTMP1, Y_1, Y_0, 8		# YTMP1 = W[-15]
+
+	# Calculate sigma0
+
+	# Calculate w[t-15] ror 1
+	vpsrlq		$1, YTMP1, YTMP2
+	vpsllq		$(64-1), YTMP1, YTMP3
+	vpor		YTMP2, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1
+	# Calculate w[t-15] shr 7
+	vpsrlq		$7, YTMP1, YTMP4		# YTMP4 = W[-15] >> 7
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	add	frame_XFER(%rsp),h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	xor	g, y2		# y2 = f^g                              # CH
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	add	h, d		# d = k + w + h + d                     # --
+
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+
+	add	y0, y2		# y2 = S1 + CH                          # --
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	RotateState
+
+################################### RND N + 1 #########################################
+
+	# Calculate w[t-15] ror 8
+	vpsrlq		$8, YTMP1, YTMP2
+	vpsllq		$(64-8), YTMP1, YTMP1
+	vpor		YTMP2, YTMP1, YTMP1		# YTMP1 = W[-15] ror 8
+	# XOR the three components
+	vpxor		YTMP4, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
+	vpxor		YTMP1, YTMP3, YTMP1		# YTMP1 = s0
+
+
+	# Add three components, w[t-16], w[t-7] and sigma0
+	vpaddq		YTMP1, YTMP0, YTMP0		# YTMP0 = W[-16] + W[-7] + s0
+	# Move to appropriate lanes for calculating w[16] and w[17]
+	vperm2f128	$0x0, YTMP0, YTMP0, Y_0		# Y_0 = W[-16] + W[-7] + s0 {BABA}
+	# Move to appropriate lanes for calculating w[18] and w[19]
+	vpand		MASK_YMM_LO(%rip), YTMP0, YTMP0	# YTMP0 = W[-16] + W[-7] + s0 {DC00}
+
+	# Calculate w[16] and w[17] in both 128 bit lanes
+
+	# Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
+	vperm2f128	$0x11, Y_3, Y_3, YTMP2		# YTMP2 = W[-2] {BABA}
+	vpsrlq		$6, YTMP2, YTMP4		# YTMP4 = W[-2] >> 6 {BABA}
+
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	add	1*8+frame_XFER(%rsp), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	xor	g, y2		# y2 = f^g                              # CH
+
+
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	h, d		# d = k + w + h + d                     # --
+
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	RotateState
+
+
+################################### RND N + 2 #########################################
+
+	vpsrlq		$19, YTMP2, YTMP3		# YTMP3 = W[-2] >> 19 {BABA}
+	vpsllq		$(64-19), YTMP2, YTMP1		# YTMP1 = W[-2] << 19 {BABA}
+	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {BABA}
+	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
+	vpsrlq		$61, YTMP2, YTMP3		# YTMP3 = W[-2] >> 61 {BABA}
+	vpsllq		$(64-61), YTMP2, YTMP1		# YTMP1 = W[-2] << 61 {BABA}
+	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {BABA}
+	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
+							#  (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
+
+	# Add sigma1 to the other components to get w[16] and w[17]
+	vpaddq		YTMP4, Y_0, Y_0			# Y_0 = {W[1], W[0], W[1], W[0]}
+
+	# Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
+	vpsrlq		$6, Y_0, YTMP4			# YTMP4 = W[-2] >> 6 {DC--}
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	add	2*8+frame_XFER(%rsp), h		# h = k + w + h         # --
+
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	or	c, y3		# y3 = a|c                              # MAJA
+	mov	f, y2		# y2 = f                                # CH
+	xor	g, y2		# y2 = f^g                              # CH
+
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	add	h, d		# d = k + w + h + d                     # --
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	RotateState
+
+################################### RND N + 3 #########################################
+
+	vpsrlq		$19, Y_0, YTMP3			# YTMP3 = W[-2] >> 19 {DC--}
+	vpsllq		$(64-19), Y_0, YTMP1		# YTMP1 = W[-2] << 19 {DC--}
+	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {DC--}
+	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
+	vpsrlq		$61, Y_0, YTMP3			# YTMP3 = W[-2] >> 61 {DC--}
+	vpsllq		$(64-61), Y_0, YTMP1		# YTMP1 = W[-2] << 61 {DC--}
+	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {DC--}
+	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
+							#  (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
+
+	# Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
+	# to newly calculated sigma1 to get w[18] and w[19]
+	vpaddq		YTMP4, YTMP0, YTMP2		# YTMP2 = {W[3], W[2], --, --}
+
+	# Form w[19], w[18], w[17], w[16]
+	vpblendd		$0xF0, YTMP2, Y_0, Y_0		# Y_0 = {W[3], W[2], W[1], W[0]}
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	add	3*8+frame_XFER(%rsp), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	xor	g, y2		# y2 = f^g                              # CH
+
+
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	h, d		# d = k + w + h + d                     # --
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+
+	add	y1, h		# h = k + w + h + S0                    # --
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	RotateState
+
+	rotate_Ys
+.endm
+
+.macro DO_4ROUNDS
+
+################################### RND N + 0 #########################################
+
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+	add	frame_XFER(%rsp), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	RotateState
+
+################################### RND N + 1 #########################################
+
+	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+	add	8*1+frame_XFER(%rsp), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	RotateState
+
+################################### RND N + 2 #########################################
+
+	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+	add	8*2+frame_XFER(%rsp), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	RotateState
+
+################################### RND N + 3 #########################################
+
+	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+	add	8*3+frame_XFER(%rsp), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	RotateState
+
+.endm
+
+########################################################################
+# void sha512_transform_rorx(const void* M, void* D, uint64_t L)
+# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+# The size of the message pointed to by M must be an integer multiple of SHA512
+#   message blocks.
+# L is the message length in SHA512 blocks.
+########################################################################
+ENTRY(sha512_transform_rorx)
+	# Allocate Stack Space
+	mov	%rsp, %rax
+	sub	$frame_size, %rsp
+	and	$~(0x20 - 1), %rsp
+	mov	%rax, frame_RSPSAVE(%rsp)
+
+	# Save GPRs
+	mov	%rbp, frame_GPRSAVE(%rsp)
+	mov	%rbx, 8*1+frame_GPRSAVE(%rsp)
+	mov	%r12, 8*2+frame_GPRSAVE(%rsp)
+	mov	%r13, 8*3+frame_GPRSAVE(%rsp)
+	mov	%r14, 8*4+frame_GPRSAVE(%rsp)
+	mov	%r15, 8*5+frame_GPRSAVE(%rsp)
+
+	shl	$7, NUM_BLKS	# convert to bytes
+	jz	done_hash
+	add	INP, NUM_BLKS	# pointer to end of data
+	mov	NUM_BLKS, frame_INPEND(%rsp)
+
+	## load initial digest
+	mov	8*0(CTX),a
+	mov	8*1(CTX),b
+	mov	8*2(CTX),c
+	mov	8*3(CTX),d
+	mov	8*4(CTX),e
+	mov	8*5(CTX),f
+	mov	8*6(CTX),g
+	mov	8*7(CTX),h
+
+	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+
+loop0:
+	lea	K512(%rip), TBL
+
+	## byte swap first 16 qwords
+	COPY_YMM_AND_BSWAP	Y_0, (INP), BYTE_FLIP_MASK
+	COPY_YMM_AND_BSWAP	Y_1, 1*32(INP), BYTE_FLIP_MASK
+	COPY_YMM_AND_BSWAP	Y_2, 2*32(INP), BYTE_FLIP_MASK
+	COPY_YMM_AND_BSWAP	Y_3, 3*32(INP), BYTE_FLIP_MASK
+
+	mov	INP, frame_INP(%rsp)
+
+	## schedule 64 input qwords, by doing 4 iterations of 16 rounds each
+	movq	$4, frame_SRND(%rsp)
+
+.align 16
+loop1:
+	vpaddq	(TBL), Y_0, XFER
+	vmovdqa XFER, frame_XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddq	1*32(TBL), Y_0, XFER
+	vmovdqa XFER, frame_XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddq	2*32(TBL), Y_0, XFER
+	vmovdqa XFER, frame_XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddq	3*32(TBL), Y_0, XFER
+	vmovdqa XFER, frame_XFER(%rsp)
+	add	$(4*32), TBL
+	FOUR_ROUNDS_AND_SCHED
+
+	subq	$1, frame_SRND(%rsp)
+	jne	loop1
+
+	movq	$2, frame_SRND(%rsp)
+loop2:
+	vpaddq	(TBL), Y_0, XFER
+	vmovdqa XFER, frame_XFER(%rsp)
+	DO_4ROUNDS
+	vpaddq	1*32(TBL), Y_1, XFER
+	vmovdqa XFER, frame_XFER(%rsp)
+	add	$(2*32), TBL
+	DO_4ROUNDS
+
+	vmovdqa	Y_2, Y_0
+	vmovdqa	Y_3, Y_1
+
+	subq	$1, frame_SRND(%rsp)
+	jne	loop2
+
+	addm	8*0(CTX),a
+	addm	8*1(CTX),b
+	addm	8*2(CTX),c
+	addm	8*3(CTX),d
+	addm	8*4(CTX),e
+	addm	8*5(CTX),f
+	addm	8*6(CTX),g
+	addm	8*7(CTX),h
+
+	mov	frame_INP(%rsp), INP
+	add	$128, INP
+	cmp	frame_INPEND(%rsp), INP
+	jne	loop0
+
+done_hash:
+
+# Restore GPRs
+	mov	frame_GPRSAVE(%rsp)     ,%rbp
+	mov	8*1+frame_GPRSAVE(%rsp) ,%rbx
+	mov	8*2+frame_GPRSAVE(%rsp) ,%r12
+	mov	8*3+frame_GPRSAVE(%rsp) ,%r13
+	mov	8*4+frame_GPRSAVE(%rsp) ,%r14
+	mov	8*5+frame_GPRSAVE(%rsp) ,%r15
+
+	# Restore Stack Pointer
+	mov	frame_RSPSAVE(%rsp), %rsp
+	ret
+ENDPROC(sha512_transform_rorx)
+
+########################################################################
+### Binary Data
+
+.data
+
+.align 64
+# K[t] used in SHA512 hashing
+K512:
+	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad	0x3956c25bf348b538,0x59f111f1b605d019
+	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad	0xd807aa98a3030242,0x12835b0145706fbe
+	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad	0x06ca6351e003826f,0x142929670a0e6e70
+	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad	0x81c2c92e47edaee6,0x92722c851482353b
+	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad	0xd192e819d6ef5218,0xd69906245565a910
+	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad	0x90befffa23631e28,0xa4506cebde82bde9
+	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad	0xca273eceea26619c,0xd186b8c721c0c207
+	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad	0x113f9804bef90dae,0x1b710b35131c471b
+	.quad	0x28db77f523047d84,0x32caab7b40c72493
+	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+.align 32
+
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa 0x08090a0b0c0d0e0f0001020304050607
+	.octa 0x18191a1b1c1d1e1f1011121314151617
+
+MASK_YMM_LO:
+	.octa 0x00000000000000000000000000000000
+	.octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
+#endif

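For reference, the per-round arithmetic that FOUR_ROUNDS_AND_SCHED and DO_4ROUNDS interleave with the message schedule above maps onto a few lines of C. The sketch below is illustrative only, not part of the patch; the names mirror the assembler comments (a..h, y0..y3), and "wk" stands for the W[t]+K[t] value staged in frame_XFER:

#include <stdint.h>

static inline uint64_t ror64(uint64_t x, unsigned int n)
{
	return (x >> n) | (x << (64 - n));
}

/* One SHA-512 round; wk = W[t] + K[t], as pre-added into frame_XFER. */
static void sha512_round(uint64_t s[8], uint64_t wk)
{
	uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
	uint64_t e = s[4], f = s[5], g = s[6], h = s[7];

	uint64_t y0 = ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41);	/* S1  */
	uint64_t y1 = ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39);	/* S0  */
	uint64_t y2 = ((f ^ g) & e) ^ g;				/* CH  */
	uint64_t y3 = ((a | c) & b) | (a & c);				/* MAJ */
	uint64_t t1 = h + wk + y0 + y2;

	/* RotateState: shift a..h right by one and fold in t1. */
	s[7] = g; s[6] = f; s[5] = e;
	s[4] = d + t1;
	s[3] = c; s[2] = b; s[1] = a;
	s[0] = t1 + y1 + y3;
}

The assembler gains its speed by expressing S0/S1 with the flag-preserving rorx instruction (hence the function name) and spreading these dependency chains across four unrolled rounds.
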
+ 421 - 0
arch/x86/crypto/sha512-ssse3-asm.S

@@ -0,0 +1,421 @@
+########################################################################
+# Implement fast SHA-512 with SSSE3 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     David Cote <david.m.cote@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+# ARG1
+msg =		%rdi
+# ARG2
+digest =	%rsi
+# ARG3
+msglen =	%rdx
+T1 =		%rcx
+T2 =		%r8
+a_64 =		%r9
+b_64 =		%r10
+c_64 =		%r11
+d_64 =		%r12
+e_64 =		%r13
+f_64 =		%r14
+g_64 =		%r15
+h_64 =		%rbx
+tmp0 =		%rax
+
+# Local variables (stack frame)
+
+W_SIZE = 80*8
+WK_SIZE = 2*8
+RSPSAVE_SIZE = 1*8
+GPRSAVE_SIZE = 5*8
+
+frame_W = 0
+frame_WK = frame_W + W_SIZE
+frame_RSPSAVE = frame_WK + WK_SIZE
+frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
+frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+
+# Useful QWORD "arrays" for simpler memory references
+# MSG, DIGEST, K_t, W_t are arrays
+# WK_2(t) points to 1 of 2 qwords at frame_WK depending on t being odd/even
+
+# Input message (arg1)
+#define MSG(i)    8*i(msg)
+
+# Output Digest (arg2)
+#define DIGEST(i) 8*i(digest)
+
+# SHA Constants (static mem)
+#define K_t(i)    8*i+K512(%rip)
+
+# Message Schedule (stack frame)
+#define W_t(i)    8*i+frame_W(%rsp)
+
+# W[t]+K[t] (stack frame)
+#define WK_2(i)   8*((i%2))+frame_WK(%rsp)
+
+.macro RotateState
+	# Rotate symbols a..h right
+	TMP   = h_64
+	h_64  = g_64
+	g_64  = f_64
+	f_64  = e_64
+	e_64  = d_64
+	d_64  = c_64
+	c_64  = b_64
+	b_64  = a_64
+	a_64  = TMP
+.endm
+
+.macro SHA512_Round rnd
+
+	# Compute Round %%t
+	mov	f_64, T1          # T1 = f
+	mov	e_64, tmp0        # tmp = e
+	xor	g_64, T1          # T1 = f ^ g
+	ror	$23, tmp0 # 41    # tmp = e ror 23
+	and	e_64, T1          # T1 = (f ^ g) & e
+	xor	e_64, tmp0        # tmp = (e ror 23) ^ e
+	xor	g_64, T1          # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+	idx = \rnd
+	add	WK_2(idx), T1     # W[t] + K[t] from message scheduler
+	ror	$4, tmp0  # 18    # tmp = ((e ror 23) ^ e) ror 4
+	xor	e_64, tmp0        # tmp = (((e ror 23) ^ e) ror 4) ^ e
+	mov	a_64, T2          # T2 = a
+	add	h_64, T1          # T1 = CH(e,f,g) + W[t] + K[t] + h
+	ror	$14, tmp0 # 14    # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+	add	tmp0, T1          # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+	mov	a_64, tmp0        # tmp = a
+	xor	c_64, T2          # T2 = a ^ c
+	and	c_64, tmp0        # tmp = a & c
+	and	b_64, T2          # T2 = (a ^ c) & b
+	xor	tmp0, T2          # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+	mov	a_64, tmp0        # tmp = a
+	ror	$5, tmp0 # 39     # tmp = a ror 5
+	xor	a_64, tmp0        # tmp = (a ror 5) ^ a
+	add	T1, d_64          # e(next_state) = d + T1
+	ror	$6, tmp0 # 34     # tmp = ((a ror 5) ^ a) ror 6
+	xor	a_64, tmp0        # tmp = (((a ror 5) ^ a) ror 6) ^ a
+	lea	(T1, T2), h_64    # a(next_state) = T1 + Maj(a,b,c)
+	ror	$28, tmp0 # 28    # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+	add	tmp0, h_64        # a(next_state) = T1 + Maj(a,b,c) + S0(a)
+	RotateState
+.endm
+
+.macro SHA512_2Sched_2Round_sse rnd
+
+	# Compute rounds t-2 and t-1
+	# Compute message schedule QWORDS t and t+1
+
+	#   Two rounds are computed based on the values for K[t-2]+W[t-2] and
+	# K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+	# scheduler.
+	#   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
+	# They are then added to their respective SHA512 constants at
+	# [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
+	#   For brevity, the comments following vectored instructions only refer to
+	# the first of a pair of QWORDS.
+	# E.g. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
+	#   The computation of the message schedule and the rounds are tightly
+	# stitched to take advantage of instruction-level parallelism.
+	# For clarity, integer instructions (for the rounds calculation) are indented
+	# by one tab. Vectored instructions (for the message scheduler) are indented
+	# by two tabs.
+
+	mov	f_64, T1
+	idx = \rnd -2
+	movdqa	W_t(idx), %xmm2		    # XMM2 = W[t-2]
+	xor	g_64, T1
+	and	e_64, T1
+	movdqa	%xmm2, %xmm0	            # XMM0 = W[t-2]
+	xor	g_64, T1
+	idx = \rnd
+	add	WK_2(idx), T1
+	idx = \rnd - 15
+	movdqu	W_t(idx), %xmm5		    # XMM5 = W[t-15]
+	mov	e_64, tmp0
+	ror	$23, tmp0 # 41
+	movdqa	%xmm5, %xmm3	            # XMM3 = W[t-15]
+	xor	e_64, tmp0
+	ror	$4, tmp0 # 18
+	psrlq	$61-19, %xmm0		    # XMM0 = W[t-2] >> 42
+	xor	e_64, tmp0
+	ror	$14, tmp0 # 14
+	psrlq	$(8-7), %xmm3		    # XMM3 = W[t-15] >> 1
+	add	tmp0, T1
+	add	h_64, T1
+	pxor	%xmm2, %xmm0                # XMM0 = (W[t-2] >> 42) ^ W[t-2]
+	mov	a_64, T2
+	xor	c_64, T2
+	pxor	%xmm5, %xmm3                # XMM3 = (W[t-15] >> 1) ^ W[t-15]
+	and	b_64, T2
+	mov	a_64, tmp0
+	psrlq	$(19-6), %xmm0		    # XMM0 = ((W[t-2]>>42)^W[t-2])>>13
+	and	c_64, tmp0
+	xor	tmp0, T2
+	psrlq	$(7-1), %xmm3		    # XMM3 = ((W[t-15]>>1)^W[t-15])>>6
+	mov	a_64, tmp0
+	ror	$5, tmp0 # 39
+	pxor	%xmm2, %xmm0	            # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
+	xor	a_64, tmp0
+	ror	$6, tmp0 # 34
+	pxor	%xmm5, %xmm3                # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
+	xor	a_64, tmp0
+	ror	$28, tmp0 # 28
+	psrlq	$6, %xmm0                   # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
+	add	tmp0, T2
+	add	T1, d_64
+	psrlq	$1, %xmm3                   # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
+	lea	(T1, T2), h_64
+	RotateState
+	movdqa	%xmm2, %xmm1	            # XMM1 = W[t-2]
+	mov	f_64, T1
+	xor	g_64, T1
+	movdqa	%xmm5, %xmm4		    # XMM4 = W[t-15]
+	and	e_64, T1
+	xor	g_64, T1
+	psllq	$(64-19)-(64-61) , %xmm1    # XMM1 = W[t-2] << 42
+	idx = \rnd + 1
+	add	WK_2(idx), T1
+	mov	e_64, tmp0
+	psllq	$(64-1)-(64-8), %xmm4	    # XMM4 = W[t-15] << 7
+	ror	$23, tmp0 # 41
+	xor	e_64, tmp0
+	pxor	%xmm2, %xmm1		    # XMM1 = (W[t-2] << 42)^W[t-2]
+	ror	$4, tmp0 # 18
+	xor	e_64, tmp0
+	pxor	%xmm5, %xmm4		    # XMM4 = (W[t-15]<<7)^W[t-15]
+	ror	$14, tmp0 # 14
+	add	tmp0, T1
+	psllq	$(64-61), %xmm1		    # XMM1 = ((W[t-2] << 42)^W[t-2])<<3
+	add	h_64, T1
+	mov	a_64, T2
+	psllq	$(64-8), %xmm4		    # XMM4 = ((W[t-15]<<7)^W[t-15])<<56
+	xor	c_64, T2
+	and	b_64, T2
+	pxor	%xmm1, %xmm0		    # XMM0 = s1(W[t-2])
+	mov	a_64, tmp0
+	and	c_64, tmp0
+	idx = \rnd - 7
+	movdqu	W_t(idx), %xmm1		    # XMM1 = W[t-7]
+	xor	tmp0, T2
+	pxor	%xmm4, %xmm3                # XMM3 = s0(W[t-15])
+	mov	a_64, tmp0
+	paddq	%xmm3, %xmm0		    # XMM0 = s1(W[t-2]) + s0(W[t-15])
+	ror	$5, tmp0 # 39
+	idx = \rnd - 16
+	paddq	W_t(idx), %xmm0		    # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
+	xor	a_64, tmp0
+	paddq	%xmm1, %xmm0	            # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
+	ror	$6, tmp0 # 34
+	movdqa	%xmm0, W_t(\rnd)	    # Store scheduled qwords
+	xor	a_64, tmp0
+	paddq	K_t(\rnd), %xmm0	    # Compute W[t]+K[t]
+	ror	$28, tmp0 # 28
+	idx = \rnd
+	movdqa	%xmm0, WK_2(idx)	    # Store W[t]+K[t] for next rounds
+	add	tmp0, T2
+	add	T1, d_64
+	lea	(T1, T2), h_64
+	RotateState
+.endm
+
+########################################################################
+# void sha512_transform_ssse3(const void* M, void* D, u64 L)
+# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+# The size of the message pointed to by M must be an integer multiple of SHA512
+#   message blocks.
+# L is the message length in SHA512 blocks.
+########################################################################
+ENTRY(sha512_transform_ssse3)
+
+	cmp $0, msglen
+	je nowork
+
+	# Allocate Stack Space
+	mov	%rsp, %rax
+	sub	$frame_size, %rsp
+	and	$~(0x20 - 1), %rsp
+	mov	%rax, frame_RSPSAVE(%rsp)
+
+	# Save GPRs
+	mov	%rbx, frame_GPRSAVE(%rsp)
+	mov	%r12, frame_GPRSAVE +8*1(%rsp)
+	mov	%r13, frame_GPRSAVE +8*2(%rsp)
+	mov	%r14, frame_GPRSAVE +8*3(%rsp)
+	mov	%r15, frame_GPRSAVE +8*4(%rsp)
+
+updateblock:
+
+# Load state variables
+	mov	DIGEST(0), a_64
+	mov	DIGEST(1), b_64
+	mov	DIGEST(2), c_64
+	mov	DIGEST(3), d_64
+	mov	DIGEST(4), e_64
+	mov	DIGEST(5), f_64
+	mov	DIGEST(6), g_64
+	mov	DIGEST(7), h_64
+
+	t = 0
+	.rept 80/2 + 1
+	# (80 rounds) / (2 rounds/iteration) + (1 iteration)
+	# +1 iteration because the scheduler leads hashing by 1 iteration
+		.if t < 2
+			# BSWAP 2 QWORDS
+			movdqa	XMM_QWORD_BSWAP(%rip), %xmm1
+			movdqu	MSG(t), %xmm0
+			pshufb	%xmm1, %xmm0	# BSWAP
+			movdqa	%xmm0, W_t(t)	# Store Scheduled Pair
+			paddq	K_t(t), %xmm0	# Compute W[t]+K[t]
+			movdqa	%xmm0, WK_2(t)	# Store into WK for rounds
+		.elseif t < 16
+			# BSWAP 2 QWORDS; Compute 2 Rounds
+			movdqu	MSG(t), %xmm0
+			pshufb	%xmm1, %xmm0	# BSWAP
+			SHA512_Round t-2	# Round t-2
+			movdqa	%xmm0, W_t(t)	# Store Scheduled Pair
+			paddq	K_t(t), %xmm0	# Compute W[t]+K[t]
+			SHA512_Round t-1	# Round t-1
+			movdqa	%xmm0, WK_2(t)	# Store W[t]+K[t] into WK
+		.elseif t < 79
+			# Schedule 2 QWORDS; Compute 2 Rounds
+			SHA512_2Sched_2Round_sse t
+		.else
+			# Compute 2 Rounds
+			SHA512_Round t-2
+			SHA512_Round t-1
+		.endif
+		t = t+2
+	.endr
+
+	# Update digest
+	add	a_64, DIGEST(0)
+	add	b_64, DIGEST(1)
+	add	c_64, DIGEST(2)
+	add	d_64, DIGEST(3)
+	add	e_64, DIGEST(4)
+	add	f_64, DIGEST(5)
+	add	g_64, DIGEST(6)
+	add	h_64, DIGEST(7)
+
+	# Advance to next message block
+	add	$16*8, msg
+	dec	msglen
+	jnz	updateblock
+
+	# Restore GPRs
+	mov	frame_GPRSAVE(%rsp),      %rbx
+	mov	frame_GPRSAVE +8*1(%rsp), %r12
+	mov	frame_GPRSAVE +8*2(%rsp), %r13
+	mov	frame_GPRSAVE +8*3(%rsp), %r14
+	mov	frame_GPRSAVE +8*4(%rsp), %r15
+
+	# Restore Stack Pointer
+	mov	frame_RSPSAVE(%rsp), %rsp
+
+nowork:
+	ret
+ENDPROC(sha512_transform_ssse3)
+
+########################################################################
+### Binary Data
+
+.data
+
+.align 16
+
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+	.octa 0x08090a0b0c0d0e0f0001020304050607
+
+# K[t] used in SHA512 hashing
+K512:
+	.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad 0x3956c25bf348b538,0x59f111f1b605d019
+	.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad 0xd807aa98a3030242,0x12835b0145706fbe
+	.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad 0x06ca6351e003826f,0x142929670a0e6e70
+	.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad 0x81c2c92e47edaee6,0x92722c851482353b
+	.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad 0xd192e819d6ef5218,0xd69906245565a910
+	.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad 0x90befffa23631e28,0xa4506cebde82bde9
+	.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad 0xca273eceea26619c,0xd186b8c721c0c207
+	.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad 0x113f9804bef90dae,0x1b710b35131c471b
+	.quad 0x28db77f523047d84,0x32caab7b40c72493
+	.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817

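The SSSE3 path above computes the same schedule recurrence two entries at a time in XMM registers; the psrlq/psllq/pxor ladders implement the sigma functions whose rotation counts (1, 8, 7 and 19, 61, 6) appear in the instruction comments. A scalar sketch of the recurrence, illustrative only:

#include <stdint.h>

static inline uint64_t ror64(uint64_t x, unsigned int n)
{
	return (x >> n) | (x << (64 - n));
}

/* sigma0 = ror1 ^ ror8 ^ shr7, sigma1 = ror19 ^ ror61 ^ shr6,
 * matching the shift constants used in SHA512_2Sched_2Round_sse. */
static inline uint64_t sigma0(uint64_t x)
{
	return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7);
}

static inline uint64_t sigma1(uint64_t x)
{
	return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6);
}

/* Extend the first 16 qwords of a block to the full 80-entry schedule. */
static void sha512_schedule(uint64_t W[80])
{
	int t;

	for (t = 16; t < 80; t++)
		W[t] = sigma1(W[t - 2]) + W[t - 7] +
		       sigma0(W[t - 15]) + W[t - 16];
}
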
+ 282 - 0
arch/x86/crypto/sha512_ssse3_glue.c

@@ -0,0 +1,282 @@
+/*
+ * Cryptographic API.
+ *
+ * Glue code for the SHA512 Secure Hash Algorithm assembler
+ * implementation using supplemental SSE3 / AVX / AVX2 instructions.
+ *
+ * This file is based on sha512_generic.c
+ *
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+
+#include <linux/string.h>
+
+asmlinkage void sha512_transform_ssse3(const char *data, u64 *digest,
+				     u64 rounds);
+#ifdef CONFIG_AS_AVX
+asmlinkage void sha512_transform_avx(const char *data, u64 *digest,
+				     u64 rounds);
+#endif
+#ifdef CONFIG_AS_AVX2
+asmlinkage void sha512_transform_rorx(const char *data, u64 *digest,
+				     u64 rounds);
+#endif
+
+static asmlinkage void (*sha512_transform_asm)(const char *, u64 *, u64);
+
+
+static int sha512_ssse3_init(struct shash_desc *desc)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	sctx->state[0] = SHA512_H0;
+	sctx->state[1] = SHA512_H1;
+	sctx->state[2] = SHA512_H2;
+	sctx->state[3] = SHA512_H3;
+	sctx->state[4] = SHA512_H4;
+	sctx->state[5] = SHA512_H5;
+	sctx->state[6] = SHA512_H6;
+	sctx->state[7] = SHA512_H7;
+	sctx->count[0] = sctx->count[1] = 0;
+
+	return 0;
+}
+
+static int __sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int len, unsigned int partial)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	unsigned int done = 0;
+
+	sctx->count[0] += len;
+	if (sctx->count[0] < len)
+		sctx->count[1]++;
+
+	if (partial) {
+		done = SHA512_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha512_transform_asm(sctx->buf, sctx->state, 1);
+	}
+
+	if (len - done >= SHA512_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE;
+
+		sha512_transform_asm(data + done, sctx->state, (u64) rounds);
+
+		done += rounds * SHA512_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buf, data + done, len - done);
+
+	return 0;
+}
+
+static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
+			     unsigned int len)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;
+	int res;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA512_BLOCK_SIZE) {
+		sctx->count[0] += len;
+		if (sctx->count[0] < len)
+			sctx->count[1]++;
+		memcpy(sctx->buf + partial, data, len);
+
+		return 0;
+	}
+
+	if (!irq_fpu_usable()) {
+		res = crypto_sha512_update(desc, data, len);
+	} else {
+		kernel_fpu_begin();
+		res = __sha512_ssse3_update(desc, data, len, partial);
+		kernel_fpu_end();
+	}
+
+	return res;
+}
+
+
+/* Add padding and return the message digest. */
+static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be64 *dst = (__be64 *)out;
+	__be64 bits[2];
+	static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
+
+	/* save number of bits */
+	bits[1] = cpu_to_be64(sctx->count[0] << 3);
+	bits[0] = cpu_to_be64((sctx->count[1] << 3) | (sctx->count[0] >> 61));
+
+	/* Pad out to 112 mod 128 and append length */
+	index = sctx->count[0] & 0x7f;
+	padlen = (index < 112) ? (112 - index) : ((128+112) - index);
+
+	if (!irq_fpu_usable()) {
+		crypto_sha512_update(desc, padding, padlen);
+		crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits));
+	} else {
+		kernel_fpu_begin();
+		/* We need to fill a whole block for __sha512_ssse3_update() */
+		if (padlen <= 112) {
+			sctx->count[0] += padlen;
+			if (sctx->count[0] < padlen)
+				sctx->count[1]++;
+			memcpy(sctx->buf + index, padding, padlen);
+		} else {
+			__sha512_ssse3_update(desc, padding, padlen, index);
+		}
+		__sha512_ssse3_update(desc, (const u8 *)&bits,
+					sizeof(bits), 112);
+		kernel_fpu_end();
+	}
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be64(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha512_ssse3_export(struct shash_desc *desc, void *out)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha512_ssse3_import(struct shash_desc *desc, const void *in)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static struct shash_alg alg = {
+	.digestsize	=	SHA512_DIGEST_SIZE,
+	.init		=	sha512_ssse3_init,
+	.update		=	sha512_ssse3_update,
+	.final		=	sha512_ssse3_final,
+	.export		=	sha512_ssse3_export,
+	.import		=	sha512_ssse3_import,
+	.descsize	=	sizeof(struct sha512_state),
+	.statesize	=	sizeof(struct sha512_state),
+	.base		=	{
+		.cra_name	=	"sha512",
+		.cra_driver_name =	"sha512-ssse3",
+		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA512_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+#ifdef CONFIG_AS_AVX
+static bool __init avx_usable(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx || !cpu_has_osxsave)
+		return false;
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX detected but unusable.\n");
+
+		return false;
+	}
+
+	return true;
+}
+#endif
+
+static int __init sha512_ssse3_mod_init(void)
+{
+	/* test for SSSE3 first */
+	if (cpu_has_ssse3)
+		sha512_transform_asm = sha512_transform_ssse3;
+
+#ifdef CONFIG_AS_AVX
+	/* allow AVX to override SSSE3, it's a little faster */
+	if (avx_usable()) {
+#ifdef CONFIG_AS_AVX2
+		if (boot_cpu_has(X86_FEATURE_AVX2))
+			sha512_transform_asm = sha512_transform_rorx;
+		else
+#endif
+			sha512_transform_asm = sha512_transform_avx;
+	}
+#endif
+
+	if (sha512_transform_asm) {
+#ifdef CONFIG_AS_AVX
+		if (sha512_transform_asm == sha512_transform_avx)
+			pr_info("Using AVX optimized SHA-512 implementation\n");
+#ifdef CONFIG_AS_AVX2
+		else if (sha512_transform_asm == sha512_transform_rorx)
+			pr_info("Using AVX2 optimized SHA-512 implementation\n");
+#endif
+		else
+#endif
+			pr_info("Using SSSE3 optimized SHA-512 implementation\n");
+		return crypto_register_shash(&alg);
+	}
+	pr_info("Neither AVX nor SSSE3 is available/usable.\n");
+
+	return -ENODEV;
+}
+
+static void __exit sha512_ssse3_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(sha512_ssse3_mod_init);
+module_exit(sha512_ssse3_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated");
+
+MODULE_ALIAS("sha512");

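Not part of the patch, but for orientation: a caller reaches whichever transform sha512_ssse3_mod_init() selected simply by allocating "sha512" through the crypto API, since cra_priority 150 ranks this driver above sha512-generic. A minimal kernel-side sketch (sha512_digest_example and its error handling are made up for illustration):

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/types.h>

static int sha512_digest_example(const u8 *data, unsigned int len, u8 *out)
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	int ret;

	/* Picks the highest-priority registered "sha512" implementation. */
	tfm = crypto_alloc_shash("sha512", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	if (!desc) {
		crypto_free_shash(tfm);
		return -ENOMEM;
	}
	desc->tfm = tfm;
	desc->flags = 0;

	ret = crypto_shash_digest(desc, data, len, out);

	kfree(desc);
	crypto_free_shash(tfm);
	return ret;
}

On a CPU with neither SSSE3 nor usable AVX the module refuses to load (-ENODEV), so the generic implementation keeps serving the "sha512" name.
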
+ 47 - 1
arch/x86/crypto/twofish-avx-x86_64-asm_64.S

@@ -4,7 +4,7 @@
  * Copyright (C) 2012 Johannes Goetzfried
  *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
  *
- * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -33,6 +33,8 @@
 
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lxts_gf128mul_and_shl1_mask:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
 
 .text
 
@@ -408,3 +410,47 @@ ENTRY(twofish_ctr_8way)
 
 	ret;
 ENDPROC(twofish_ctr_8way)
+
+ENTRY(twofish_xts_enc_8way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	movq %rsi, %r11;
+
+	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
+	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+		      RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);
+
+	call __twofish_enc_blk8;
+
+	/* dst <= regs xor IVs(in dst) */
+	store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+	ret;
+ENDPROC(twofish_xts_enc_8way)
+
+ENTRY(twofish_xts_dec_8way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	movq %rsi, %r11;
+
+	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
+	load_xts_8way(%rcx, %rdx, %rsi, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2,
+		      RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);
+
+	call __twofish_dec_blk8;
+
+	/* dst <= regs xor IVs(in dst) */
+	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+ENDPROC(twofish_xts_dec_8way)

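The .Lxts_gf128mul_and_shl1_mask constant added above parameterises the tweak computation inside load_xts_8way() (from glue_helper-asm-avx.S): between blocks the XTS tweak is multiplied by α in GF(2¹²⁸), a one-bit left shift with conditional reduction by 0x87 (the first byte of the mask). A scalar sketch of that step, with xts_mul_alpha a name made up for illustration:

#include <stdint.h>

/* Multiply the 128-bit XTS tweak by alpha: shift left one bit and, if a
 * bit fell off the top, reduce by x^128 + x^7 + x^2 + x + 1, i.e. XOR
 * the low byte with 0x87.  t[0] holds the low qword (little endian). */
static void xts_mul_alpha(uint64_t t[2])
{
	uint64_t carry = t[1] >> 63;		/* bit that falls off the top */

	t[1] = (t[1] << 1) | (t[0] >> 63);
	t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
}

The SIMD version performs the equivalent shift-and-reduce on whole registers, using this mask to supply both the 0x87 reduction term and the cross-qword carry.
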
+ 600 - 0
arch/x86/crypto/twofish-avx2-asm_64.S

@@ -0,0 +1,600 @@
+/*
+ * x86_64/AVX2 assembler optimized version of Twofish
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/linkage.h>
+#include "glue_helper-asm-avx2.S"
+
+.file "twofish-avx2-asm_64.S"
+
+.data
+.align 16
+
+.Lvpshufb_mask0:
+.long 0x80808000
+.long 0x80808004
+.long 0x80808008
+.long 0x8080800c
+
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lxts_gf128mul_and_shl1_mask_0:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+.Lxts_gf128mul_and_shl1_mask_1:
+	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
+
+.text
+
+/* structure of crypto context */
+#define s0	0
+#define s1	1024
+#define s2	2048
+#define s3	3072
+#define w	4096
+#define	k	4128
+
+/* register macros */
+#define CTX	%rdi
+
+#define RS0	CTX
+#define RS1	%r8
+#define RS2	%r9
+#define RS3	%r10
+#define RK	%r11
+#define RW	%rax
+#define RROUND  %r12
+#define RROUNDd %r12d
+
+#define RA0	%ymm8
+#define RB0	%ymm9
+#define RC0	%ymm10
+#define RD0	%ymm11
+#define RA1	%ymm12
+#define RB1	%ymm13
+#define RC1	%ymm14
+#define RD1	%ymm15
+
+/* temp regs */
+#define RX0	%ymm0
+#define RY0	%ymm1
+#define RX1	%ymm2
+#define RY1	%ymm3
+#define RT0	%ymm4
+#define RIDX	%ymm5
+
+#define RX0x	%xmm0
+#define RY0x	%xmm1
+#define RX1x	%xmm2
+#define RY1x	%xmm3
+#define RT0x	%xmm4
+
+/* vpgatherdd mask and '-1' */
+#define RNOT	%ymm6
+
+/* byte mask, (-1 >> 24) */
+#define RBYTE	%ymm7
+
+/**********************************************************************
+  16-way AVX2 twofish
+ **********************************************************************/
+#define init_round_constants() \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	vpsrld $24, RNOT, RBYTE; \
+	leaq k(CTX), RK; \
+	leaq w(CTX), RW; \
+	leaq s1(CTX), RS1; \
+	leaq s2(CTX), RS2; \
+	leaq s3(CTX), RS3; \
+
+#define g16(ab, rs0, rs1, rs2, rs3, xy) \
+	vpand RBYTE, ab ## 0, RIDX; \
+	vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+		\
+		vpand RBYTE, ab ## 1, RIDX; \
+		vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \
+		vpcmpeqd RNOT, RNOT, RNOT; \
+	\
+	vpsrld $8, ab ## 0, RIDX; \
+	vpand RBYTE, RIDX, RIDX; \
+	vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	vpxor RT0, xy ## 0, xy ## 0; \
+		\
+		vpsrld $8, ab ## 1, RIDX; \
+		vpand RBYTE, RIDX, RIDX; \
+		vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
+		vpcmpeqd RNOT, RNOT, RNOT; \
+		vpxor RT0, xy ## 1, xy ## 1; \
+	\
+	vpsrld $16, ab ## 0, RIDX; \
+	vpand RBYTE, RIDX, RIDX; \
+	vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	vpxor RT0, xy ## 0, xy ## 0; \
+		\
+		vpsrld $16, ab ## 1, RIDX; \
+		vpand RBYTE, RIDX, RIDX; \
+		vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
+		vpcmpeqd RNOT, RNOT, RNOT; \
+		vpxor RT0, xy ## 1, xy ## 1; \
+	\
+	vpsrld $24, ab ## 0, RIDX; \
+	vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	vpxor RT0, xy ## 0, xy ## 0; \
+		\
+		vpsrld $24, ab ## 1, RIDX; \
+		vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
+		vpcmpeqd RNOT, RNOT, RNOT; \
+		vpxor RT0, xy ## 1, xy ## 1;
+
+#define g1_16(a, x) \
+	g16(a, RS0, RS1, RS2, RS3, x);
+
+#define g2_16(b, y) \
+	g16(b, RS1, RS2, RS3, RS0, y);
+
+#define encrypt_round_end16(a, b, c, d, nk) \
+	vpaddd RY0, RX0, RX0; \
+	vpaddd RX0, RY0, RY0; \
+	vpbroadcastd nk(RK,RROUND,8), RT0; \
+	vpaddd RT0, RX0, RX0; \
+	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
+	vpaddd RT0, RY0, RY0; \
+	\
+	vpxor RY0, d ## 0, d ## 0; \
+	\
+	vpxor RX0, c ## 0, c ## 0; \
+	vpsrld $1, c ## 0, RT0; \
+	vpslld $31, c ## 0, c ## 0; \
+	vpor RT0, c ## 0, c ## 0; \
+	\
+		vpaddd RY1, RX1, RX1; \
+		vpaddd RX1, RY1, RY1; \
+		vpbroadcastd nk(RK,RROUND,8), RT0; \
+		vpaddd RT0, RX1, RX1; \
+		vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
+		vpaddd RT0, RY1, RY1; \
+		\
+		vpxor RY1, d ## 1, d ## 1; \
+		\
+		vpxor RX1, c ## 1, c ## 1; \
+		vpsrld $1, c ## 1, RT0; \
+		vpslld $31, c ## 1, c ## 1; \
+		vpor RT0, c ## 1, c ## 1; \
+
+#define encrypt_round16(a, b, c, d, nk) \
+	g2_16(b, RY); \
+	\
+	vpslld $1, b ## 0, RT0; \
+	vpsrld $31, b ## 0, b ## 0; \
+	vpor RT0, b ## 0, b ## 0; \
+	\
+		vpslld $1, b ## 1, RT0; \
+		vpsrld $31, b ## 1, b ## 1; \
+		vpor RT0, b ## 1, b ## 1; \
+	\
+	g1_16(a, RX); \
+	\
+	encrypt_round_end16(a, b, c, d, nk);
+
+#define encrypt_round_first16(a, b, c, d, nk) \
+	vpslld $1, d ## 0, RT0; \
+	vpsrld $31, d ## 0, d ## 0; \
+	vpor RT0, d ## 0, d ## 0; \
+	\
+		vpslld $1, d ## 1, RT0; \
+		vpsrld $31, d ## 1, d ## 1; \
+		vpor RT0, d ## 1, d ## 1; \
+	\
+	encrypt_round16(a, b, c, d, nk);
+
+#define encrypt_round_last16(a, b, c, d, nk) \
+	g2_16(b, RY); \
+	\
+	g1_16(a, RX); \
+	\
+	encrypt_round_end16(a, b, c, d, nk);
+
+#define decrypt_round_end16(a, b, c, d, nk) \
+	vpaddd RY0, RX0, RX0; \
+	vpaddd RX0, RY0, RY0; \
+	vpbroadcastd nk(RK,RROUND,8), RT0; \
+	vpaddd RT0, RX0, RX0; \
+	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
+	vpaddd RT0, RY0, RY0; \
+	\
+	vpxor RX0, c ## 0, c ## 0; \
+	\
+	vpxor RY0, d ## 0, d ## 0; \
+	vpsrld $1, d ## 0, RT0; \
+	vpslld $31, d ## 0, d ## 0; \
+	vpor RT0, d ## 0, d ## 0; \
+	\
+		vpaddd RY1, RX1, RX1; \
+		vpaddd RX1, RY1, RY1; \
+		vpbroadcastd nk(RK,RROUND,8), RT0; \
+		vpaddd RT0, RX1, RX1; \
+		vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
+		vpaddd RT0, RY1, RY1; \
+		\
+		vpxor RX1, c ## 1, c ## 1; \
+		\
+		vpxor RY1, d ## 1, d ## 1; \
+		vpsrld $1, d ## 1, RT0; \
+		vpslld $31, d ## 1, d ## 1; \
+		vpor RT0, d ## 1, d ## 1;
+
+#define decrypt_round16(a, b, c, d, nk) \
+	g1_16(a, RX); \
+	\
+	vpslld $1, a ## 0, RT0; \
+	vpsrld $31, a ## 0, a ## 0; \
+	vpor RT0, a ## 0, a ## 0; \
+	\
+		vpslld $1, a ## 1, RT0; \
+		vpsrld $31, a ## 1, a ## 1; \
+		vpor RT0, a ## 1, a ## 1; \
+	\
+	g2_16(b, RY); \
+	\
+	decrypt_round_end16(a, b, c, d, nk);
+
+#define decrypt_round_first16(a, b, c, d, nk) \
+	vpslld $1, c ## 0, RT0; \
+	vpsrld $31, c ## 0, c ## 0; \
+	vpor RT0, c ## 0, c ## 0; \
+	\
+		vpslld $1, c ## 1, RT0; \
+		vpsrld $31, c ## 1, c ## 1; \
+		vpor RT0, c ## 1, c ## 1; \
+	\
+	decrypt_round16(a, b, c, d, nk)
+
+#define decrypt_round_last16(a, b, c, d, nk) \
+	g1_16(a, RX); \
+	\
+	g2_16(b, RY); \
+	\
+	decrypt_round_end16(a, b, c, d, nk);
+
+#define encrypt_cycle16() \
+	encrypt_round16(RA, RB, RC, RD, 0); \
+	encrypt_round16(RC, RD, RA, RB, 8);
+
+#define encrypt_cycle_first16() \
+	encrypt_round_first16(RA, RB, RC, RD, 0); \
+	encrypt_round16(RC, RD, RA, RB, 8);
+
+#define encrypt_cycle_last16() \
+	encrypt_round16(RA, RB, RC, RD, 0); \
+	encrypt_round_last16(RC, RD, RA, RB, 8);
+
+#define decrypt_cycle16(n) \
+	decrypt_round16(RC, RD, RA, RB, 8); \
+	decrypt_round16(RA, RB, RC, RD, 0);
+
+#define decrypt_cycle_first16(n) \
+	decrypt_round_first16(RC, RD, RA, RB, 8); \
+	decrypt_round16(RA, RB, RC, RD, 0);
+
+#define decrypt_cycle_last16(n) \
+	decrypt_round16(RC, RD, RA, RB, 8); \
+	decrypt_round_last16(RA, RB, RC, RD, 0);
+
+#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
+	vpunpckhdq x1, x0, t2; \
+	vpunpckldq x1, x0, x0; \
+	\
+	vpunpckldq x3, x2, t1; \
+	vpunpckhdq x3, x2, x2; \
+	\
+	vpunpckhqdq t1,	x0, x1; \
+	vpunpcklqdq t1,	x0, x0; \
+	\
+	vpunpckhqdq x2, t2, x3; \
+	vpunpcklqdq x2,	t2, x2;
+
+#define read_blocks8(offs,a,b,c,d) \
+	transpose_4x4(a, b, c, d, RX0, RY0);
+
+#define write_blocks8(offs,a,b,c,d) \
+	transpose_4x4(a, b, c, d, RX0, RY0);
+
+#define inpack_enc8(a,b,c,d) \
+	vpbroadcastd 4*0(RW), RT0; \
+	vpxor RT0, a, a; \
+	\
+	vpbroadcastd 4*1(RW), RT0; \
+	vpxor RT0, b, b; \
+	\
+	vpbroadcastd 4*2(RW), RT0; \
+	vpxor RT0, c, c; \
+	\
+	vpbroadcastd 4*3(RW), RT0; \
+	vpxor RT0, d, d;
+
+#define outunpack_enc8(a,b,c,d) \
+	vpbroadcastd 4*4(RW), RX0; \
+	vpbroadcastd 4*5(RW), RY0; \
+	vpxor RX0, c, RX0; \
+	vpxor RY0, d, RY0; \
+	\
+	vpbroadcastd 4*6(RW), RT0; \
+	vpxor RT0, a, c; \
+	vpbroadcastd 4*7(RW), RT0; \
+	vpxor RT0, b, d; \
+	\
+	vmovdqa RX0, a; \
+	vmovdqa RY0, b;
+
+#define inpack_dec8(a,b,c,d) \
+	vpbroadcastd 4*4(RW), RX0; \
+	vpbroadcastd 4*5(RW), RY0; \
+	vpxor RX0, a, RX0; \
+	vpxor RY0, b, RY0; \
+	\
+	vpbroadcastd 4*6(RW), RT0; \
+	vpxor RT0, c, a; \
+	vpbroadcastd 4*7(RW), RT0; \
+	vpxor RT0, d, b; \
+	\
+	vmovdqa RX0, c; \
+	vmovdqa RY0, d;
+
+#define outunpack_dec8(a,b,c,d) \
+	vpbroadcastd 4*0(RW), RT0; \
+	vpxor RT0, a, a; \
+	\
+	vpbroadcastd 4*1(RW), RT0; \
+	vpxor RT0, b, b; \
+	\
+	vpbroadcastd 4*2(RW), RT0; \
+	vpxor RT0, c, c; \
+	\
+	vpbroadcastd 4*3(RW), RT0; \
+	vpxor RT0, d, d;
+
+#define read_blocks16(a,b,c,d) \
+	read_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
+	read_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define write_blocks16(a,b,c,d) \
+	write_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
+	write_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define xor_blocks16(a,b,c,d) \
+	xor_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
+	xor_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define inpack_enc16(a,b,c,d) \
+	inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
+	inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define outunpack_enc16(a,b,c,d) \
+	outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
+	outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define inpack_dec16(a,b,c,d) \
+	inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
+	inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define outunpack_dec16(a,b,c,d) \
+	outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
+	outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+.align 8
+__twofish_enc_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
+	 * output:
+	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
+	 */
+	init_round_constants();
+
+	read_blocks16(RA, RB, RC, RD);
+	inpack_enc16(RA, RB, RC, RD);
+
+	xorl RROUNDd, RROUNDd;
+	encrypt_cycle_first16();
+	movl $2, RROUNDd;
+
+.align 4
+.L__enc_loop:
+	encrypt_cycle16();
+
+	addl $2, RROUNDd;
+	cmpl $14, RROUNDd;
+	jne .L__enc_loop;
+
+	encrypt_cycle_last16();
+
+	outunpack_enc16(RA, RB, RC, RD);
+	write_blocks16(RA, RB, RC, RD);
+
+	ret;
+ENDPROC(__twofish_enc_blk16)
+
+.align 8
+__twofish_dec_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
+	 * output:
+	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
+	 */
+	init_round_constants();
+
+	read_blocks16(RA, RB, RC, RD);
+	inpack_dec16(RA, RB, RC, RD);
+
+	movl $14, RROUNDd;
+	decrypt_cycle_first16();
+	movl $12, RROUNDd;
+
+.align 4
+.L__dec_loop:
+	decrypt_cycle16();
+
+	addl $-2, RROUNDd;
+	jnz .L__dec_loop;
+
+	decrypt_cycle_last16();
+
+	outunpack_dec16(RA, RB, RC, RD);
+	write_blocks16(RA, RB, RC, RD);
+
+	ret;
+ENDPROC(__twofish_dec_blk16)
+
+ENTRY(twofish_ecb_enc_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+	pushq %r12;
+
+	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	call __twofish_enc_blk16;
+
+	store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	popq %r12;
+	vzeroupper;
+
+	ret;
+ENDPROC(twofish_ecb_enc_16way)
+
+ENTRY(twofish_ecb_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+	pushq %r12;
+
+	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	call __twofish_dec_blk16;
+
+	store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	popq %r12;
+	vzeroupper;
+
+	ret;
+ENDPROC(twofish_ecb_dec_16way)
+
+ENTRY(twofish_cbc_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+	pushq %r12;
+
+	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	call __twofish_dec_blk16;
+
+	store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1,
+			RX0);
+
+	popq %r12;
+	vzeroupper;
+
+	ret;
+ENDPROC(twofish_cbc_dec_16way)
+
+ENTRY(twofish_ctr_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (little endian, 128bit)
+	 */
+
+	vzeroupper;
+	pushq %r12;
+
+	load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
+		       RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
+		       RBYTE);
+
+	call __twofish_enc_blk16;
+
+	store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	popq %r12;
+	vzeroupper;
+
+	ret;
+ENDPROC(twofish_ctr_16way)
+
+.align 8
+twofish_xts_crypt_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 *	%r8: pointer to __twofish_enc_blk16 or __twofish_dec_blk16
+	 */
+
+	vzeroupper;
+	pushq %r12;
+
+	load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
+		       RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
+		       .Lxts_gf128mul_and_shl1_mask_0,
+		       .Lxts_gf128mul_and_shl1_mask_1);
+
+	call *%r8;
+
+	store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	popq %r12;
+	vzeroupper;
+
+	ret;
+ENDPROC(twofish_xts_crypt_16way)
+
+ENTRY(twofish_xts_enc_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+	leaq __twofish_enc_blk16, %r8;
+	jmp twofish_xts_crypt_16way;
+ENDPROC(twofish_xts_enc_16way)
+
+ENTRY(twofish_xts_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+	leaq __twofish_dec_blk16, %r8;
+	jmp twofish_xts_crypt_16way;
+ENDPROC(twofish_xts_dec_16way)

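A note on the gathers above: vpgatherdd consumes its mask register, which is why every gather in g16 is followed by vpcmpeqd to rebuild the all-ones mask in RNOT. Functionally, each gather ladder is the Twofish g function, four key-dependent byte-indexed table lookups XORed together. A scalar sketch for one 32-bit word (twofish_g is an illustrative name; the tables correspond to the s0..s3 offsets #defined at the top of the file):

#include <stdint.h>

/* Scalar equivalent of the lookups g16/g1_16/g2_16 perform for 16
 * blocks at once: index each key-dependent 256-entry u32 table with
 * one byte of the input word and XOR the results. */
static uint32_t twofish_g(const uint32_t s0[256], const uint32_t s1[256],
			  const uint32_t s2[256], const uint32_t s3[256],
			  uint32_t x)
{
	return s0[x & 0xff] ^		/* vpand RBYTE + gather from rs0 */
	       s1[(x >> 8) & 0xff] ^	/* vpsrld $8  + gather from rs1  */
	       s2[(x >> 16) & 0xff] ^	/* vpsrld $16 + gather from rs2  */
	       s3[x >> 24];		/* vpsrld $24 + gather from rs3  */
}
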
+ 584 - 0
arch/x86/crypto/twofish_avx2_glue.c

@@ -0,0 +1,584 @@
+/*
+ * Glue Code for x86_64/AVX2 assembler optimized version of Twofish
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/ctr.h>
+#include <crypto/twofish.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <asm/crypto/twofish.h>
+#include <asm/crypto/ablk_helper.h>
+#include <asm/crypto/glue_helper.h>
+#include <crypto/scatterwalk.h>
+
+#define TF_AVX2_PARALLEL_BLOCKS 16
+
+/* 16-way AVX2 parallel cipher functions */
+asmlinkage void twofish_ecb_enc_16way(struct twofish_ctx *ctx, u8 *dst,
+				      const u8 *src);
+asmlinkage void twofish_ecb_dec_16way(struct twofish_ctx *ctx, u8 *dst,
+				      const u8 *src);
+asmlinkage void twofish_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src);
+
+asmlinkage void twofish_ctr_16way(void *ctx, u128 *dst, const u128 *src,
+				  le128 *iv);
+
+asmlinkage void twofish_xts_enc_16way(struct twofish_ctx *ctx, u8 *dst,
+				      const u8 *src, le128 *iv);
+asmlinkage void twofish_xts_dec_16way(struct twofish_ctx *ctx, u8 *dst,
+				      const u8 *src, le128 *iv);
+
+static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	__twofish_enc_blk_3way(ctx, dst, src, false);
+}
+
+static const struct common_glue_ctx twofish_enc = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
+	}, {
+		.num_blocks = 3,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
+	} }
+};
+
+static const struct common_glue_ctx twofish_ctr = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_16way) }
+	},  {
+		.num_blocks = 8,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
+	}, {
+		.num_blocks = 3,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) }
+	} }
+};
+
+static const struct common_glue_ctx twofish_enc_xts = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) }
+	} }
+};
+
+static const struct common_glue_ctx twofish_dec = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
+	}, {
+		.num_blocks = 3,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
+	} }
+};
+
+static const struct common_glue_ctx twofish_dec_cbc = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
+	}, {
+		.num_blocks = 3,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
+	} }
+};
+
+static const struct common_glue_ctx twofish_dec_xts = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) }
+	} }
+};
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes);
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc,
+				       dst, src, nbytes);
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src,
+				       nbytes);
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes);
+}
+
+static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	/* since we reuse the AVX (8-way) functions, start using FPU at 8 parallel blocks */
+	return glue_fpu_begin(TF_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes);
+}
+
+static inline void twofish_fpu_end(bool fpu_enabled)
+{
+	glue_fpu_end(fpu_enabled);
+}
+
+struct crypt_priv {
+	struct twofish_ctx *ctx;
+	bool fpu_enabled;
+};
+
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = TF_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) {
+		twofish_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS;
+		nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= 8 * bsize) {
+		twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * 8;
+		nbytes -= bsize * 8;
+	}
+
+	while (nbytes >= 3 * bsize) {
+		twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * 3;
+		nbytes -= bsize * 3;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		twofish_enc_blk(ctx->ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = TF_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) {
+		twofish_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS;
+		nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= 8 * bsize) {
+		twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * 8;
+		nbytes -= bsize * 8;
+	}
+
+	while (nbytes >= 3 * bsize) {
+		twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * 3;
+		nbytes -= bsize * 3;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		twofish_dec_blk(ctx->ctx, srcdst, srcdst);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[TF_AVX2_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->twofish_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	twofish_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[TF_AVX2_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->twofish_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	twofish_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(twofish_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(twofish_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static struct crypto_alg tf_algs[10] = { {
+	.cra_name		= "__ecb-twofish-avx2",
+	.cra_driver_name	= "__driver-ecb-twofish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.setkey		= twofish_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__cbc-twofish-avx2",
+	.cra_driver_name	= "__driver-cbc-twofish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.setkey		= twofish_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__ctr-twofish-avx2",
+	.cra_driver_name	= "__driver-ctr-twofish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct twofish_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= twofish_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+}, {
+	.cra_name		= "__lrw-twofish-avx2",
+	.cra_driver_name	= "__driver-lrw-twofish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_exit		= lrw_twofish_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE +
+					  TF_BLOCK_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE +
+					  TF_BLOCK_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= lrw_twofish_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__xts-twofish-avx2",
+	.cra_driver_name	= "__driver-xts-twofish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_xts_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE * 2,
+			.max_keysize	= TF_MAX_KEY_SIZE * 2,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= xts_twofish_setkey,
+			.encrypt	= xts_encrypt,
+			.decrypt	= xts_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ecb(twofish)",
+	.cra_driver_name	= "ecb-twofish-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(twofish)",
+	.cra_driver_name	= "cbc-twofish-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= __ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(twofish)",
+	.cra_driver_name	= "ctr-twofish-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt,
+			.geniv		= "chainiv",
+		},
+	},
+}, {
+	.cra_name		= "lrw(twofish)",
+	.cra_driver_name	= "lrw-twofish-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE +
+					  TF_BLOCK_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE +
+					  TF_BLOCK_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "xts(twofish)",
+	.cra_driver_name	= "xts-twofish-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE * 2,
+			.max_keysize	= TF_MAX_KEY_SIZE * 2,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+} };
+
+static int __init init(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx2 || !cpu_has_osxsave) {
+		pr_info("AVX2 instructions are not detected.\n");
+		return -ENODEV;
+	}
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX2 detected but unusable.\n");
+		return -ENODEV;
+	}
+
+	return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs));
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs));
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX2 optimized");
+MODULE_ALIAS("twofish");
+MODULE_ALIAS("twofish-asm");

+ 61 - 40
arch/x86/crypto/twofish_avx_glue.c

@@ -4,6 +4,8 @@
  * Copyright (C) 2012 Johannes Goetzfried
  *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
  *
 + * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 + *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -48,13 +50,26 @@
 /* 8-way parallel cipher functions */
 asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
+EXPORT_SYMBOL_GPL(twofish_ecb_enc_8way);
+
 asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
+EXPORT_SYMBOL_GPL(twofish_ecb_dec_8way);
 
 asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
+EXPORT_SYMBOL_GPL(twofish_cbc_dec_8way);
+
 asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
 				 const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(twofish_ctr_8way);
+
+asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(twofish_xts_enc_8way);
+asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(twofish_xts_dec_8way);
 
 static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 					const u8 *src)
@@ -62,6 +77,20 @@ static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 	__twofish_enc_blk_3way(ctx, dst, src, false);
 }
 
+void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+				  GLUE_FUNC_CAST(twofish_enc_blk));
+}
+EXPORT_SYMBOL_GPL(twofish_xts_enc);
+
+void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+				  GLUE_FUNC_CAST(twofish_dec_blk));
+}
+EXPORT_SYMBOL_GPL(twofish_xts_dec);
+
 
 static const struct common_glue_ctx twofish_enc = {
 	.num_funcs = 3,
@@ -95,6 +124,19 @@ static const struct common_glue_ctx twofish_ctr = {
 	} }
 };
 
+static const struct common_glue_ctx twofish_enc_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = TWOFISH_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) }
+	} }
+};
+
 static const struct common_glue_ctx twofish_dec = {
 	.num_funcs = 3,
 	.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
@@ -127,6 +169,19 @@ static const struct common_glue_ctx twofish_dec_cbc = {
 	} }
 };
 
+static const struct common_glue_ctx twofish_dec_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = TWOFISH_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) }
+	} }
+};
+
 static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
@@ -275,54 +330,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[TWOFISH_PARALLEL_BLOCKS];
-	struct crypt_priv crypt_ctx = {
-		.ctx = &ctx->crypt_ctx,
-		.fpu_enabled = false,
-	};
-	struct xts_crypt_req req = {
-		.tbuf = buf,
-		.tbuflen = sizeof(buf),
 
-		.tweak_ctx = &ctx->tweak_ctx,
-		.tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
-		.crypt_ctx = &crypt_ctx,
-		.crypt_fn = encrypt_callback,
-	};
-	int ret;
-
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-	ret = xts_crypt(desc, dst, src, nbytes, &req);
-	twofish_fpu_end(crypt_ctx.fpu_enabled);
-
-	return ret;
+	return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(twofish_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
 }
 
 static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[TWOFISH_PARALLEL_BLOCKS];
-	struct crypt_priv crypt_ctx = {
-		.ctx = &ctx->crypt_ctx,
-		.fpu_enabled = false,
-	};
-	struct xts_crypt_req req = {
-		.tbuf = buf,
-		.tbuflen = sizeof(buf),
-
-		.tweak_ctx = &ctx->tweak_ctx,
-		.tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
-		.crypt_ctx = &crypt_ctx,
-		.crypt_fn = decrypt_callback,
-	};
-	int ret;
 
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-	ret = xts_crypt(desc, dst, src, nbytes, &req);
-	twofish_fpu_end(crypt_ctx.fpu_enabled);
-
-	return ret;
+	return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(twofish_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
 }
 
 static struct crypto_alg twofish_algs[10] = { {
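
(One detail worth calling out from both twofish glue files: the xts(twofish) entries advertise min_keysize/max_keysize at twice the cipher's sizes because the supplied key is split into an independent data key and tweak key, the crypt_ctx/tweak_ctx pair. A hedged sketch of that setkey contract with hypothetical names, not the kernel's xts_twofish_setkey:

#include <stdio.h>
#include <string.h>

struct xts_ctx { unsigned char crypt_key[32], tweak_key[32]; };

static int xts_setkey(struct xts_ctx *ctx, const unsigned char *key,
		      unsigned int keylen)
{
	if (keylen % 2 || keylen / 2 > 32)	/* must split evenly */
		return -1;
	memcpy(ctx->crypt_key, key, keylen / 2);	/* data half */
	memcpy(ctx->tweak_key, key + keylen / 2, keylen / 2); /* tweak half */
	return 0;
}

int main(void)
{
	unsigned char key[64];	/* 2 x 256-bit for a 256-bit cipher */
	struct xts_ctx ctx;

	memset(key, 0xa5, sizeof(key));
	printf("setkey: %d\n", xts_setkey(&ctx, key, sizeof(key)));
	return 0;
})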

+ 1 - 0
arch/x86/include/asm/cpufeature.h

@@ -293,6 +293,7 @@ extern const char * const x86_power_flags[32];
 #define cpu_has_ssse3		boot_cpu_has(X86_FEATURE_SSSE3)
 #define cpu_has_aes		boot_cpu_has(X86_FEATURE_AES)
 #define cpu_has_avx		boot_cpu_has(X86_FEATURE_AVX)
+#define cpu_has_avx2		boot_cpu_has(X86_FEATURE_AVX2)
 #define cpu_has_ht		boot_cpu_has(X86_FEATURE_HT)
 #define cpu_has_mp		boot_cpu_has(X86_FEATURE_MP)
 #define cpu_has_nx		boot_cpu_has(X86_FEATURE_NX)
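
(The new cpu_has_avx2 macro is only half of the runtime gate: as in the module init functions above, the kernel also checks via xgetbv(XCR_XFEATURE_ENABLED_MASK) that the OS actually saves SSE and YMM state. In userspace, recent GCC/Clang are expected to fold both checks into one builtin; a hedged sketch assuming those toolchains, not the kernel interface:

#include <stdio.h>

int main(void)
{
	/* __builtin_cpu_supports checks the CPUID bit and, for AVX
	 * features, the OS-enabled XCR0 state as well */
	if (__builtin_cpu_supports("avx2"))
		puts("AVX2 usable");
	else
		puts("AVX2 instructions are not detected or not usable");
	return 0;
})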

+ 43 - 0
arch/x86/include/asm/crypto/blowfish.h

@@ -0,0 +1,43 @@
+#ifndef ASM_X86_BLOWFISH_H
+#define ASM_X86_BLOWFISH_H
+
+#include <linux/crypto.h>
+#include <crypto/blowfish.h>
+
+#define BF_PARALLEL_BLOCKS 4
+
+/* regular block cipher functions */
+asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
+				   bool xor);
+asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
+
+/* 4-way parallel cipher functions */
+asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
+					const u8 *src, bool xor);
+asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
+				      const u8 *src);
+
+static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
+{
+	__blowfish_enc_blk(ctx, dst, src, false);
+}
+
+static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	__blowfish_enc_blk(ctx, dst, src, true);
+}
+
+static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
+					 const u8 *src)
+{
+	__blowfish_enc_blk_4way(ctx, dst, src, false);
+}
+
+static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
+				      const u8 *src)
+{
+	__blowfish_enc_blk_4way(ctx, dst, src, true);
+}
+
+#endif
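+
+(The __blowfish_enc_blk*(..., bool xor) prototypes above use one core routine for two behaviours, plain overwrite or encrypt-and-XOR into dst, which lets CBC/CTR-style callers skip a separate XOR pass. A toy C sketch of the pattern, with a placeholder cipher that is not Blowfish:

#include <stdint.h>
#include <stdio.h>

static uint8_t toy_cipher(uint8_t in) { return (uint8_t)(in * 167 + 13); }

static void __enc_blk(uint8_t *dst, const uint8_t *src, int n, int xor)
{
	for (int i = 0; i < n; i++) {
		uint8_t c = toy_cipher(src[i]);
		dst[i] = xor ? (uint8_t)(dst[i] ^ c) : c;
	}
}

/* thin wrappers mirroring blowfish_enc_blk / blowfish_enc_blk_xor */
static void enc_blk(uint8_t *d, const uint8_t *s, int n)     { __enc_blk(d, s, n, 0); }
static void enc_blk_xor(uint8_t *d, const uint8_t *s, int n) { __enc_blk(d, s, n, 1); }

int main(void)
{
	uint8_t ks[4] = {1, 2, 3, 4}, out[4] = {9, 9, 9, 9};
	enc_blk(out, ks, 4);      /* plain: out = E(ks) */
	enc_blk_xor(out, ks, 4);  /* fused: out ^= E(ks), so all zero */
	printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
	return 0;
})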

+ 19 - 0
arch/x86/include/asm/crypto/camellia.h

@@ -48,6 +48,22 @@ asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
 asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
 				      const u8 *src);
 
+/* 16-way parallel cipher functions (avx/aes-ni) */
+asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+
+asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
+				   const u8 *src, le128 *iv);
+
+asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src, le128 *iv);
+asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src, le128 *iv);
+
 static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
 				    const u8 *src)
 {
@@ -79,4 +95,7 @@ extern void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src,
 extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
 				    le128 *iv);
 
+extern void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+extern void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+
 #endif /* ASM_X86_CAMELLIA_H */

+ 24 - 0
arch/x86/include/asm/crypto/glue_helper.h

@@ -14,10 +14,13 @@ typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);
 typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src);
 typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src,
 				       le128 *iv);
+typedef void (*common_glue_xts_func_t)(void *ctx, u128 *dst, const u128 *src,
+				       le128 *iv);
 
 #define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn))
 #define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn))
 #define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn))
+#define GLUE_XTS_FUNC_CAST(fn) ((common_glue_xts_func_t)(fn))
 
 struct common_glue_func_entry {
 	unsigned int num_blocks; /* number of blocks that @fn will process */
@@ -25,6 +28,7 @@ struct common_glue_func_entry {
 		common_glue_func_t ecb;
 		common_glue_cbc_func_t cbc;
 		common_glue_ctr_func_t ctr;
+		common_glue_xts_func_t xts;
 	} fn_u;
 };
 
@@ -96,6 +100,16 @@ static inline void le128_inc(le128 *i)
 	i->b = cpu_to_le64(b);
 }
 
+static inline void le128_gf128mul_x_ble(le128 *dst, const le128 *src)
+{
+	u64 a = le64_to_cpu(src->a);
+	u64 b = le64_to_cpu(src->b);
+	u64 _tt = ((s64)a >> 63) & 0x87;
+
+	dst->a = cpu_to_le64((a << 1) ^ (b >> 63));
+	dst->b = cpu_to_le64((b << 1) ^ _tt);
+}
+
 extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
 				 struct blkcipher_desc *desc,
 				 struct scatterlist *dst,
@@ -118,4 +132,14 @@ extern int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
 				 struct scatterlist *dst,
 				 struct scatterlist *src, unsigned int nbytes);
 
+extern int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
+				 struct blkcipher_desc *desc,
+				 struct scatterlist *dst,
+				 struct scatterlist *src, unsigned int nbytes,
+				 common_glue_func_t tweak_fn, void *tweak_ctx,
+				 void *crypt_ctx);
+
+extern void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src,
+				      le128 *iv, common_glue_func_t fn);
+
 #endif /* _CRYPTO_GLUE_HELPER_H */
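
(le128_gf128mul_x_ble() above is the whole of XTS tweak stepping: multiply the tweak by alpha (x) in GF(2^128), low 64 bits first, reducing by 0x87 when the top bit falls off. The standalone demo below runs the same math with plain integers, dropping the le64_to_cpu byte-order handling, and shows the reduction firing at alpha^128:

#include <inttypes.h>
#include <stdio.h>

struct le128 { uint64_t a, b; };	/* a = high word, b = low word here */

static void gf128mul_x_ble(struct le128 *r, const struct le128 *x)
{
	uint64_t a = x->a, b = x->b;
	uint64_t tt = (uint64_t)((int64_t)a >> 63) & 0x87;

	r->a = (a << 1) ^ (b >> 63);
	r->b = (b << 1) ^ tt;
}

int main(void)
{
	struct le128 t = { .a = 0, .b = 1 };	/* alpha^0 = 1 */

	for (int i = 1; i <= 128; i++) {
		gf128mul_x_ble(&t, &t);
		if (i == 64 || i == 128)
			printf("alpha^%-3d a=%016" PRIx64 " b=%016" PRIx64 "\n",
			       i, t.a, t.b);
	}
	/* prints: alpha^64  a=...0001 b=...0000 (carry into the high word)
	 *         alpha^128 a=...0000 b=...0087 (x^128 = x^7+x^2+x+1)    */
	return 0;
})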

+ 29 - 0
arch/x86/include/asm/crypto/serpent-avx.h

@@ -6,6 +6,16 @@
 
 #define SERPENT_PARALLEL_BLOCKS 8
 
+struct serpent_lrw_ctx {
+	struct lrw_table_ctx lrw_table;
+	struct serpent_ctx serpent_ctx;
+};
+
+struct serpent_xts_ctx {
+	struct serpent_ctx tweak_ctx;
+	struct serpent_ctx crypt_ctx;
+};
+
 asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 					 const u8 *src);
 asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
@@ -16,4 +26,23 @@ asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 				     const u8 *src, le128 *iv);
 
+asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src, le128 *iv);
+asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src, le128 *iv);
+
+extern void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src,
+				le128 *iv);
+
+extern void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+extern void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+
+extern int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen);
+
+extern void lrw_serpent_exit_tfm(struct crypto_tfm *tfm);
+
+extern int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen);
+
 #endif

+ 18 - 0
arch/x86/include/asm/crypto/twofish.h

@@ -28,6 +28,20 @@ asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
 
+/* 8-way parallel cipher functions */
+asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src);
+asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src);
+asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src);
+asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
+				 const u8 *src, le128 *iv);
+asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src, le128 *iv);
+asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src, le128 *iv);
+
 /* helpers from twofish_x86_64-3way module */
 extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src);
 extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src,
@@ -43,4 +57,8 @@ extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm);
 extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
 			      unsigned int keylen);
 
+/* helpers from twofish-avx module */
+extern void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+extern void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+
 #endif /* ASM_X86_TWOFISH_H */

+ 123 - 10
crypto/Kconfig

@@ -198,6 +198,7 @@ config CRYPTO_GCM
 	select CRYPTO_CTR
 	select CRYPTO_AEAD
 	select CRYPTO_GHASH
+	select CRYPTO_NULL
 	help
 	  Support for Galois/Counter Mode (GCM) and Galois Message
 	  Authentication Code (GMAC). Required for IPSec.
@@ -282,6 +283,17 @@ config CRYPTO_XTS
 
 comment "Hash modes"
 
+config CRYPTO_CMAC
+	tristate "CMAC support"
+	select CRYPTO_HASH
+	select CRYPTO_MANAGER
+	help
+	  Cipher-based Message Authentication Code (CMAC) specified by
+	  The National Institute of Standards and Technology (NIST).
+
+	  https://tools.ietf.org/html/rfc4493
+	  http://csrc.nist.gov/publications/nistpubs/800-38B/SP_800-38B.pdf
+
 config CRYPTO_HMAC
 	tristate "HMAC support"
 	select CRYPTO_HASH
@@ -322,19 +334,9 @@ config CRYPTO_CRC32C
 	  by iSCSI for header and data digests and by others.
 	  See Castagnoli93.  Module will be crc32c.
 
-config CRYPTO_CRC32C_X86_64
-	bool
-	depends on X86 && 64BIT
-	select CRYPTO_HASH
-	help
-	  In Intel processor with SSE4.2 supported, the processor will
-	  support CRC32C calculation using hardware accelerated CRC32
-	  instruction optimized with PCLMULQDQ instruction when available.
-
 config CRYPTO_CRC32C_INTEL
 	tristate "CRC32c INTEL hardware acceleration"
 	depends on X86
-	select CRYPTO_CRC32C_X86_64 if 64BIT
 	select CRYPTO_HASH
 	help
 	  In Intel processor with SSE4.2 supported, the processor will
@@ -480,6 +482,28 @@ config CRYPTO_SHA1_SSSE3
 	  using Supplemental SSE3 (SSSE3) instructions or Advanced Vector
 	  Extensions (AVX), when available.
 
+config CRYPTO_SHA256_SSSE3
+	tristate "SHA256 digest algorithm (SSSE3/AVX/AVX2)"
+	depends on X86 && 64BIT
+	select CRYPTO_SHA256
+	select CRYPTO_HASH
+	help
+	  SHA-256 secure hash standard (DFIPS 180-2) implemented
+	  using Supplemental SSE3 (SSSE3) instructions, or Advanced Vector
+	  Extensions version 1 (AVX1), or Advanced Vector Extensions
+	  version 2 (AVX2) instructions, when available.
+
+config CRYPTO_SHA512_SSSE3
+	tristate "SHA512 digest algorithm (SSSE3/AVX/AVX2)"
+	depends on X86 && 64BIT
+	select CRYPTO_SHA512
+	select CRYPTO_HASH
+	help
+	  SHA-512 secure hash standard (DFIPS 180-2) implemented
+	  using Supplemental SSE3 (SSSE3) instructions, or Advanced Vector
+	  Extensions version 1 (AVX1), or Advanced Vector Extensions
+	  version 2 (AVX2) instructions, when available.
+
 config CRYPTO_SHA1_SPARC64
 	tristate "SHA1 digest algorithm (SPARC64)"
 	depends on SPARC64
@@ -654,6 +678,7 @@ config CRYPTO_AES_NI_INTEL
 	select CRYPTO_CRYPTD
 	select CRYPTO_ABLK_HELPER_X86
 	select CRYPTO_ALGAPI
+	select CRYPTO_GLUE_HELPER_X86 if 64BIT
 	select CRYPTO_LRW
 	select CRYPTO_XTS
 	help
@@ -795,6 +820,24 @@ config CRYPTO_BLOWFISH_X86_64
 	  See also:
 	  <http://www.schneier.com/blowfish.html>
 
+config CRYPTO_BLOWFISH_AVX2_X86_64
+	tristate "Blowfish cipher algorithm (x86_64/AVX2)"
+	depends on X86 && 64BIT
+	select CRYPTO_ALGAPI
+	select CRYPTO_CRYPTD
+	select CRYPTO_ABLK_HELPER_X86
+	select CRYPTO_BLOWFISH_COMMON
+	select CRYPTO_BLOWFISH_X86_64
+	help
+	  Blowfish cipher algorithm (x86_64/AVX2), by Bruce Schneier.
+
+	  This is a variable key length cipher which can use keys from 32
+	  bits to 448 bits in length.  It's fast, simple and specifically
+	  designed for use on "large microprocessors".
+
+	  See also:
+	  <http://www.schneier.com/blowfish.html>
+
 config CRYPTO_CAMELLIA
 	tristate "Camellia cipher algorithms"
 	depends on CRYPTO
@@ -851,6 +894,29 @@ config CRYPTO_CAMELLIA_AESNI_AVX_X86_64
 	  See also:
 	  <https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>
 
+config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64
+	tristate "Camellia cipher algorithm (x86_64/AES-NI/AVX2)"
+	depends on X86 && 64BIT
+	depends on CRYPTO
+	select CRYPTO_ALGAPI
+	select CRYPTO_CRYPTD
+	select CRYPTO_ABLK_HELPER_X86
+	select CRYPTO_GLUE_HELPER_X86
+	select CRYPTO_CAMELLIA_X86_64
+	select CRYPTO_CAMELLIA_AESNI_AVX_X86_64
+	select CRYPTO_LRW
+	select CRYPTO_XTS
+	help
+	  Camellia cipher algorithm module (x86_64/AES-NI/AVX2).
+
+	  Camellia is a symmetric key block cipher developed jointly
+	  at NTT and Mitsubishi Electric Corporation.
+
+	  The Camellia specifies three key sizes: 128, 192 and 256 bits.
+
+	  See also:
+	  <https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>
+
 config CRYPTO_CAMELLIA_SPARC64
 	tristate "Camellia cipher algorithm (SPARC64)"
 	depends on SPARC64
@@ -1088,6 +1154,29 @@ config CRYPTO_SERPENT_AVX_X86_64
 	  See also:
 	  <http://www.cl.cam.ac.uk/~rja14/serpent.html>
 
+config CRYPTO_SERPENT_AVX2_X86_64
+	tristate "Serpent cipher algorithm (x86_64/AVX2)"
+	depends on X86 && 64BIT
+	select CRYPTO_ALGAPI
+	select CRYPTO_CRYPTD
+	select CRYPTO_ABLK_HELPER_X86
+	select CRYPTO_GLUE_HELPER_X86
+	select CRYPTO_SERPENT
+	select CRYPTO_SERPENT_AVX_X86_64
+	select CRYPTO_LRW
+	select CRYPTO_XTS
+	help
+	  Serpent cipher algorithm, by Anderson, Biham & Knudsen.
+
+	  Keys are allowed to be from 0 to 256 bits in length, in steps
+	  of 8 bits.
+
+	  This module provides Serpent cipher algorithm that processes 16
+	  blocks parallel using AVX2 instruction set.
+
+	  See also:
+	  <http://www.cl.cam.ac.uk/~rja14/serpent.html>
+
 config CRYPTO_TEA
 	tristate "TEA, XTEA and XETA cipher algorithms"
 	select CRYPTO_ALGAPI
@@ -1207,6 +1296,30 @@ config CRYPTO_TWOFISH_AVX_X86_64
 	  See also:
 	  <http://www.schneier.com/twofish.html>
 
+config CRYPTO_TWOFISH_AVX2_X86_64
+	tristate "Twofish cipher algorithm (x86_64/AVX2)"
+	depends on X86 && 64BIT
+	select CRYPTO_ALGAPI
+	select CRYPTO_CRYPTD
+	select CRYPTO_ABLK_HELPER_X86
+	select CRYPTO_GLUE_HELPER_X86
+	select CRYPTO_TWOFISH_COMMON
+	select CRYPTO_TWOFISH_X86_64
+	select CRYPTO_TWOFISH_X86_64_3WAY
+	select CRYPTO_TWOFISH_AVX_X86_64
+	select CRYPTO_LRW
+	select CRYPTO_XTS
+	help
+	  Twofish cipher algorithm (x86_64/AVX2).
+
+	  Twofish was submitted as an AES (Advanced Encryption Standard)
+	  candidate cipher by researchers at CounterPane Systems.  It is a
+	  16 round block cipher supporting key sizes of 128, 192, and 256
+	  bits.
+
+	  See also:
+	  <http://www.schneier.com/twofish.html>
+
 comment "Compression"
 
 config CRYPTO_DEFLATE
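
(For completeness, the options added in this pull are selected like any other crypto entries; a hypothetical .config fragment for an x86_64 build might read:

# hypothetical fragment; "m" builds each as a module
CONFIG_CRYPTO_CMAC=m
CONFIG_CRYPTO_SHA256_SSSE3=m
CONFIG_CRYPTO_SHA512_SSSE3=m
CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64=m
CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m
CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m
CONFIG_CRYPTO_TWOFISH_AVX2_X86_64=m)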

+ 1 - 0
crypto/Makefile

@@ -32,6 +32,7 @@ cryptomgr-y := algboss.o testmgr.o
 
 obj-$(CONFIG_CRYPTO_MANAGER2) += cryptomgr.o
 obj-$(CONFIG_CRYPTO_USER) += crypto_user.o
+obj-$(CONFIG_CRYPTO_CMAC) += cmac.o
 obj-$(CONFIG_CRYPTO_HMAC) += hmac.o
 obj-$(CONFIG_CRYPTO_VMAC) += vmac.o
 obj-$(CONFIG_CRYPTO_XCBC) += xcbc.o

+ 315 - 0
crypto/cmac.c

@@ -0,0 +1,315 @@
+/*
+ * CMAC: Cipher Block Mode for Authentication
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * Based on work by:
+ *  Copyright © 2013 Tom St Denis <tstdenis@elliptictech.com>
+ * Based on crypto/xcbc.c:
+ *  Copyright © 2006 USAGI/WIDE Project,
+ *   Author: Kazunori Miyazawa <miyazawa@linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+/*
+ * +------------------------
+ * | <parent tfm>
+ * +------------------------
+ * | cmac_tfm_ctx
+ * +------------------------
+ * | consts (block size * 2)
+ * +------------------------
+ */
+struct cmac_tfm_ctx {
+	struct crypto_cipher *child;
+	u8 ctx[];
+};
+
+/*
+ * +------------------------
+ * | <shash desc>
+ * +------------------------
+ * | cmac_desc_ctx
+ * +------------------------
+ * | odds (block size)
+ * +------------------------
+ * | prev (block size)
+ * +------------------------
+ */
+struct cmac_desc_ctx {
+	unsigned int len;
+	u8 ctx[];
+};
+
+static int crypto_cmac_digest_setkey(struct crypto_shash *parent,
+				     const u8 *inkey, unsigned int keylen)
+{
+	unsigned long alignmask = crypto_shash_alignmask(parent);
+	struct cmac_tfm_ctx *ctx = crypto_shash_ctx(parent);
+	unsigned int bs = crypto_shash_blocksize(parent);
+	__be64 *consts = PTR_ALIGN((void *)ctx->ctx, alignmask + 1);
+	u64 _const[2];
+	int i, err = 0;
+	u8 msb_mask, gfmask;
+
+	err = crypto_cipher_setkey(ctx->child, inkey, keylen);
+	if (err)
+		return err;
+
+	/* encrypt the zero block */
+	memset(consts, 0, bs);
+	crypto_cipher_encrypt_one(ctx->child, (u8 *)consts, (u8 *)consts);
+
+	switch (bs) {
+	case 16:
+		gfmask = 0x87;
+		_const[0] = be64_to_cpu(consts[1]);
+		_const[1] = be64_to_cpu(consts[0]);
+
+		/* gf(2^128) multiply zero-ciphertext with u and u^2 */
+		for (i = 0; i < 4; i += 2) {
+			msb_mask = ((s64)_const[1] >> 63) & gfmask;
+			_const[1] = (_const[1] << 1) | (_const[0] >> 63);
+			_const[0] = (_const[0] << 1) ^ msb_mask;
+
+			consts[i + 0] = cpu_to_be64(_const[1]);
+			consts[i + 1] = cpu_to_be64(_const[0]);
+		}
+
+		break;
+	case 8:
+		gfmask = 0x1B;
+		_const[0] = be64_to_cpu(consts[0]);
+
+		/* gf(2^64) multiply zero-ciphertext with u and u^2 */
+		for (i = 0; i < 2; i++) {
+			msb_mask = ((s64)_const[0] >> 63) & gfmask;
+			_const[0] = (_const[0] << 1) ^ msb_mask;
+
+			consts[i] = cpu_to_be64(_const[0]);
+		}
+
+		break;
+	}
+
+	return 0;
+}
+
+static int crypto_cmac_digest_init(struct shash_desc *pdesc)
+{
+	unsigned long alignmask = crypto_shash_alignmask(pdesc->tfm);
+	struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
+	int bs = crypto_shash_blocksize(pdesc->tfm);
+	u8 *prev = PTR_ALIGN((void *)ctx->ctx, alignmask + 1) + bs;
+
+	ctx->len = 0;
+	memset(prev, 0, bs);
+
+	return 0;
+}
+
+static int crypto_cmac_digest_update(struct shash_desc *pdesc, const u8 *p,
+				     unsigned int len)
+{
+	struct crypto_shash *parent = pdesc->tfm;
+	unsigned long alignmask = crypto_shash_alignmask(parent);
+	struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent);
+	struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
+	struct crypto_cipher *tfm = tctx->child;
+	int bs = crypto_shash_blocksize(parent);
+	u8 *odds = PTR_ALIGN((void *)ctx->ctx, alignmask + 1);
+	u8 *prev = odds + bs;
+
+	/* checking the data can fill the block */
+	if ((ctx->len + len) <= bs) {
+		memcpy(odds + ctx->len, p, len);
+		ctx->len += len;
+		return 0;
+	}
+
+	/* filling odds with new data and encrypting it */
+	memcpy(odds + ctx->len, p, bs - ctx->len);
+	len -= bs - ctx->len;
+	p += bs - ctx->len;
+
+	crypto_xor(prev, odds, bs);
+	crypto_cipher_encrypt_one(tfm, prev, prev);
+
+	/* clearing the length */
+	ctx->len = 0;
+
+	/* encrypting the rest of data */
+	while (len > bs) {
+		crypto_xor(prev, p, bs);
+		crypto_cipher_encrypt_one(tfm, prev, prev);
+		p += bs;
+		len -= bs;
+	}
+
+	/* keeping the surplus of blocksize */
+	if (len) {
+		memcpy(odds, p, len);
+		ctx->len = len;
+	}
+
+	return 0;
+}
+
+static int crypto_cmac_digest_final(struct shash_desc *pdesc, u8 *out)
+{
+	struct crypto_shash *parent = pdesc->tfm;
+	unsigned long alignmask = crypto_shash_alignmask(parent);
+	struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent);
+	struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
+	struct crypto_cipher *tfm = tctx->child;
+	int bs = crypto_shash_blocksize(parent);
+	u8 *consts = PTR_ALIGN((void *)tctx->ctx, alignmask + 1);
+	u8 *odds = PTR_ALIGN((void *)ctx->ctx, alignmask + 1);
+	u8 *prev = odds + bs;
+	unsigned int offset = 0;
+
+	if (ctx->len != bs) {
+		unsigned int rlen;
+		u8 *p = odds + ctx->len;
+
+		*p = 0x80;
+		p++;
+
+		rlen = bs - ctx->len - 1;
+		if (rlen)
+			memset(p, 0, rlen);
+
+		offset += bs;
+	}
+
+	crypto_xor(prev, odds, bs);
+	crypto_xor(prev, consts + offset, bs);
+
+	crypto_cipher_encrypt_one(tfm, out, prev);
+
+	return 0;
+}
+
+static int cmac_init_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_cipher *cipher;
+	struct crypto_instance *inst = (void *)tfm->__crt_alg;
+	struct crypto_spawn *spawn = crypto_instance_ctx(inst);
+	struct cmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	cipher = crypto_spawn_cipher(spawn);
+	if (IS_ERR(cipher))
+		return PTR_ERR(cipher);
+
+	ctx->child = cipher;
+
+	return 0;
+};
+
+static void cmac_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct cmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
+	crypto_free_cipher(ctx->child);
+}
+
+static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb)
+{
+	struct shash_instance *inst;
+	struct crypto_alg *alg;
+	unsigned long alignmask;
+	int err;
+
+	err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SHASH);
+	if (err)
+		return err;
+
+	alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_CIPHER,
+				  CRYPTO_ALG_TYPE_MASK);
+	if (IS_ERR(alg))
+		return PTR_ERR(alg);
+
+	switch (alg->cra_blocksize) {
+	case 16:
+	case 8:
+		break;
+	default:
+		goto out_put_alg;
+	}
+
+	inst = shash_alloc_instance("cmac", alg);
+	err = PTR_ERR(inst);
+	if (IS_ERR(inst))
+		goto out_put_alg;
+
+	err = crypto_init_spawn(shash_instance_ctx(inst), alg,
+				shash_crypto_instance(inst),
+				CRYPTO_ALG_TYPE_MASK);
+	if (err)
+		goto out_free_inst;
+
+	alignmask = alg->cra_alignmask | (sizeof(long) - 1);
+	inst->alg.base.cra_alignmask = alignmask;
+	inst->alg.base.cra_priority = alg->cra_priority;
+	inst->alg.base.cra_blocksize = alg->cra_blocksize;
+
+	inst->alg.digestsize = alg->cra_blocksize;
+	inst->alg.descsize =
+		ALIGN(sizeof(struct cmac_desc_ctx), crypto_tfm_ctx_alignment())
+		+ (alignmask & ~(crypto_tfm_ctx_alignment() - 1))
+		+ alg->cra_blocksize * 2;
+
+	inst->alg.base.cra_ctxsize =
+		ALIGN(sizeof(struct cmac_tfm_ctx), alignmask + 1)
+		+ alg->cra_blocksize * 2;
+
+	inst->alg.base.cra_init = cmac_init_tfm;
+	inst->alg.base.cra_exit = cmac_exit_tfm;
+
+	inst->alg.init = crypto_cmac_digest_init;
+	inst->alg.update = crypto_cmac_digest_update;
+	inst->alg.final = crypto_cmac_digest_final;
+	inst->alg.setkey = crypto_cmac_digest_setkey;
+
+	err = shash_register_instance(tmpl, inst);
+	if (err) {
+out_free_inst:
+		shash_free_instance(shash_crypto_instance(inst));
+	}
+
+out_put_alg:
+	crypto_mod_put(alg);
+	return err;
+}
+
+static struct crypto_template crypto_cmac_tmpl = {
+	.name = "cmac",
+	.create = cmac_create,
+	.free = shash_free_instance,
+	.module = THIS_MODULE,
+};
+
+static int __init crypto_cmac_module_init(void)
+{
+	return crypto_register_template(&crypto_cmac_tmpl);
+}
+
+static void __exit crypto_cmac_module_exit(void)
+{
+	crypto_unregister_template(&crypto_cmac_tmpl);
+}
+
+module_init(crypto_cmac_module_init);
+module_exit(crypto_cmac_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("CMAC keyed hash algorithm");

+ 2 - 2
crypto/crypto_user.c

@@ -440,7 +440,7 @@ static const struct nla_policy crypto_policy[CRYPTOCFGA_MAX+1] = {
 
 #undef MSGSIZE
 
-static struct crypto_link {
+static const struct crypto_link {
 	int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **);
 	int (*dump)(struct sk_buff *, struct netlink_callback *);
 	int (*done)(struct netlink_callback *);
@@ -456,7 +456,7 @@ static struct crypto_link {
 static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct nlattr *attrs[CRYPTOCFGA_MAX+1];
-	struct crypto_link *link;
+	const struct crypto_link *link;
 	int type, err;
 
 	type = nlh->nlmsg_type;

+ 97 - 19
crypto/gcm.c

@@ -37,8 +37,14 @@ struct crypto_rfc4106_ctx {
 	u8 nonce[4];
 };
 
+struct crypto_rfc4543_instance_ctx {
+	struct crypto_aead_spawn aead;
+	struct crypto_skcipher_spawn null;
+};
+
 struct crypto_rfc4543_ctx {
 	struct crypto_aead *child;
+	struct crypto_blkcipher *null;
 	u8 nonce[4];
 };
 
@@ -1094,21 +1100,36 @@ static int crypto_rfc4543_setauthsize(struct crypto_aead *parent,
 	return crypto_aead_setauthsize(ctx->child, authsize);
 }
 
+static void crypto_rfc4543_done(struct crypto_async_request *areq, int err)
+{
+	struct aead_request *req = areq->data;
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct crypto_rfc4543_req_ctx *rctx = crypto_rfc4543_reqctx(req);
+
+	if (!err) {
+		scatterwalk_map_and_copy(rctx->auth_tag, req->dst,
+					 req->cryptlen,
+					 crypto_aead_authsize(aead), 1);
+	}
+
+	aead_request_complete(req, err);
+}
+
 static struct aead_request *crypto_rfc4543_crypt(struct aead_request *req,
-						 int enc)
+						 bool enc)
 {
 	struct crypto_aead *aead = crypto_aead_reqtfm(req);
 	struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(aead);
 	struct crypto_rfc4543_req_ctx *rctx = crypto_rfc4543_reqctx(req);
 	struct aead_request *subreq = &rctx->subreq;
-	struct scatterlist *dst = req->dst;
+	struct scatterlist *src = req->src;
 	struct scatterlist *cipher = rctx->cipher;
 	struct scatterlist *payload = rctx->payload;
 	struct scatterlist *assoc = rctx->assoc;
 	unsigned int authsize = crypto_aead_authsize(aead);
 	unsigned int assoclen = req->assoclen;
-	struct page *dstp;
-	u8 *vdst;
+	struct page *srcp;
+	u8 *vsrc;
 	u8 *iv = PTR_ALIGN((u8 *)(rctx + 1) + crypto_aead_reqsize(ctx->child),
 			   crypto_aead_alignmask(ctx->child) + 1);
 
@@ -1119,19 +1140,19 @@ static struct aead_request *crypto_rfc4543_crypt(struct aead_request *req,
 	if (enc)
 		memset(rctx->auth_tag, 0, authsize);
 	else
-		scatterwalk_map_and_copy(rctx->auth_tag, dst,
+		scatterwalk_map_and_copy(rctx->auth_tag, src,
 					 req->cryptlen - authsize,
 					 authsize, 0);
 
 	sg_init_one(cipher, rctx->auth_tag, authsize);
 
 	/* construct the aad */
-	dstp = sg_page(dst);
-	vdst = PageHighMem(dstp) ? NULL : page_address(dstp) + dst->offset;
+	srcp = sg_page(src);
+	vsrc = PageHighMem(srcp) ? NULL : page_address(srcp) + src->offset;
 
 	sg_init_table(payload, 2);
 	sg_set_buf(payload, req->iv, 8);
-	scatterwalk_crypto_chain(payload, dst, vdst == req->iv + 8, 2);
+	scatterwalk_crypto_chain(payload, src, vsrc == req->iv + 8, 2);
 	assoclen += 8 + req->cryptlen - (enc ? 0 : authsize);
 
 	if (req->assoc->length == req->assoclen) {
@@ -1150,14 +1171,27 @@ static struct aead_request *crypto_rfc4543_crypt(struct aead_request *req,
 	scatterwalk_crypto_chain(assoc, payload, 0, 2);
 
 	aead_request_set_tfm(subreq, ctx->child);
-	aead_request_set_callback(subreq, req->base.flags, req->base.complete,
-				  req->base.data);
+	aead_request_set_callback(subreq, req->base.flags, crypto_rfc4543_done,
+				  req);
 	aead_request_set_crypt(subreq, cipher, cipher, enc ? 0 : authsize, iv);
 	aead_request_set_assoc(subreq, assoc, assoclen);
 
 	return subreq;
 }
 
+static int crypto_rfc4543_copy_src_to_dst(struct aead_request *req, bool enc)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(aead);
+	unsigned int authsize = crypto_aead_authsize(aead);
+	unsigned int nbytes = req->cryptlen - (enc ? 0 : authsize);
+	struct blkcipher_desc desc = {
+		.tfm = ctx->null,
+	};
+
+	return crypto_blkcipher_encrypt(&desc, req->dst, req->src, nbytes);
+}
+
 static int crypto_rfc4543_encrypt(struct aead_request *req)
 {
 	struct crypto_aead *aead = crypto_aead_reqtfm(req);
@@ -1165,7 +1199,13 @@ static int crypto_rfc4543_encrypt(struct aead_request *req)
 	struct aead_request *subreq;
 	int err;
 
-	subreq = crypto_rfc4543_crypt(req, 1);
+	if (req->src != req->dst) {
+		err = crypto_rfc4543_copy_src_to_dst(req, true);
+		if (err)
+			return err;
+	}
+
+	subreq = crypto_rfc4543_crypt(req, true);
 	err = crypto_aead_encrypt(subreq);
 	if (err)
 		return err;
@@ -1178,7 +1218,15 @@ static int crypto_rfc4543_encrypt(struct aead_request *req)
 
 static int crypto_rfc4543_decrypt(struct aead_request *req)
 {
-	req = crypto_rfc4543_crypt(req, 0);
+	int err;
+
+	if (req->src != req->dst) {
+		err = crypto_rfc4543_copy_src_to_dst(req, false);
+		if (err)
+			return err;
+	}
+
+	req = crypto_rfc4543_crypt(req, false);
 
 	return crypto_aead_decrypt(req);
 }
@@ -1186,16 +1234,25 @@ static int crypto_rfc4543_decrypt(struct aead_request *req)
 static int crypto_rfc4543_init_tfm(struct crypto_tfm *tfm)
 {
 	struct crypto_instance *inst = (void *)tfm->__crt_alg;
-	struct crypto_aead_spawn *spawn = crypto_instance_ctx(inst);
+	struct crypto_rfc4543_instance_ctx *ictx = crypto_instance_ctx(inst);
+	struct crypto_aead_spawn *spawn = &ictx->aead;
 	struct crypto_rfc4543_ctx *ctx = crypto_tfm_ctx(tfm);
 	struct crypto_aead *aead;
+	struct crypto_blkcipher *null;
 	unsigned long align;
+	int err = 0;
 
 	aead = crypto_spawn_aead(spawn);
 	if (IS_ERR(aead))
 		return PTR_ERR(aead);
 
+	null = crypto_spawn_blkcipher(&ictx->null.base);
+	err = PTR_ERR(null);
+	if (IS_ERR(null))
+		goto err_free_aead;
+
 	ctx->child = aead;
+	ctx->null = null;
 
 	align = crypto_aead_alignmask(aead);
 	align &= ~(crypto_tfm_ctx_alignment() - 1);
@@ -1205,6 +1262,10 @@ static int crypto_rfc4543_init_tfm(struct crypto_tfm *tfm)
 				align + 16;
 
 	return 0;
+
+err_free_aead:
+	crypto_free_aead(aead);
+	return err;
 }
 
 static void crypto_rfc4543_exit_tfm(struct crypto_tfm *tfm)
@@ -1212,6 +1273,7 @@ static void crypto_rfc4543_exit_tfm(struct crypto_tfm *tfm)
 	struct crypto_rfc4543_ctx *ctx = crypto_tfm_ctx(tfm);
 
 	crypto_free_aead(ctx->child);
+	crypto_free_blkcipher(ctx->null);
 }
 
 static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
@@ -1220,6 +1282,7 @@ static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
 	struct crypto_instance *inst;
 	struct crypto_aead_spawn *spawn;
 	struct crypto_alg *alg;
+	struct crypto_rfc4543_instance_ctx *ctx;
 	const char *ccm_name;
 	int err;
 
@@ -1234,11 +1297,12 @@ static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
 	if (IS_ERR(ccm_name))
 		return ERR_CAST(ccm_name);
 
-	inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
+	inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL);
 	if (!inst)
 		return ERR_PTR(-ENOMEM);
 
-	spawn = crypto_instance_ctx(inst);
+	ctx = crypto_instance_ctx(inst);
+	spawn = &ctx->aead;
 	crypto_set_aead_spawn(spawn, inst);
 	err = crypto_grab_aead(spawn, ccm_name, 0,
 			       crypto_requires_sync(algt->type, algt->mask));
@@ -1247,15 +1311,23 @@ static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
 
 
 	alg = crypto_aead_spawn_alg(spawn);
 	alg = crypto_aead_spawn_alg(spawn);
 
 
+	crypto_set_skcipher_spawn(&ctx->null, inst);
+	err = crypto_grab_skcipher(&ctx->null, "ecb(cipher_null)", 0,
+				   CRYPTO_ALG_ASYNC);
+	if (err)
+		goto out_drop_alg;
+
+	crypto_skcipher_spawn_alg(&ctx->null);
+
 	err = -EINVAL;
 	err = -EINVAL;
 
 
 	/* We only support 16-byte blocks. */
 	/* We only support 16-byte blocks. */
 	if (alg->cra_aead.ivsize != 16)
 	if (alg->cra_aead.ivsize != 16)
-		goto out_drop_alg;
+		goto out_drop_ecbnull;
 
 
 	/* Not a stream cipher? */
 	/* Not a stream cipher? */
 	if (alg->cra_blocksize != 1)
 	if (alg->cra_blocksize != 1)
-		goto out_drop_alg;
+		goto out_drop_ecbnull;
 
 
 	err = -ENAMETOOLONG;
 	err = -ENAMETOOLONG;
 	if (snprintf(inst->alg.cra_name, CRYPTO_MAX_ALG_NAME,
 	if (snprintf(inst->alg.cra_name, CRYPTO_MAX_ALG_NAME,
@@ -1263,7 +1335,7 @@ static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
 	    snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME,
 	    snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME,
 		     "rfc4543(%s)", alg->cra_driver_name) >=
 		     "rfc4543(%s)", alg->cra_driver_name) >=
 	    CRYPTO_MAX_ALG_NAME)
 	    CRYPTO_MAX_ALG_NAME)
-		goto out_drop_alg;
+		goto out_drop_ecbnull;
 
 
 	inst->alg.cra_flags = CRYPTO_ALG_TYPE_AEAD;
 	inst->alg.cra_flags = CRYPTO_ALG_TYPE_AEAD;
 	inst->alg.cra_flags |= alg->cra_flags & CRYPTO_ALG_ASYNC;
 	inst->alg.cra_flags |= alg->cra_flags & CRYPTO_ALG_ASYNC;
@@ -1290,6 +1362,8 @@ static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
 out:
 out:
 	return inst;
 	return inst;
 
 
+out_drop_ecbnull:
+	crypto_drop_skcipher(&ctx->null);
 out_drop_alg:
 out_drop_alg:
 	crypto_drop_aead(spawn);
 	crypto_drop_aead(spawn);
 out_free_inst:
 out_free_inst:
@@ -1300,7 +1374,11 @@ out_free_inst:
 
 
 static void crypto_rfc4543_free(struct crypto_instance *inst)
 static void crypto_rfc4543_free(struct crypto_instance *inst)
 {
 {
-	crypto_drop_spawn(crypto_instance_ctx(inst));
+	struct crypto_rfc4543_instance_ctx *ctx = crypto_instance_ctx(inst);
+
+	crypto_drop_aead(&ctx->aead);
+	crypto_drop_skcipher(&ctx->null);
+
 	kfree(inst);
 	kfree(inst);
 }
 }
 
 

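Note on the crypto/gcm.c hunks above: rfc4543 (GMAC) authenticates the plaintext itself, so when a caller hands in different src and dst scatterlists the data must already sit in dst before the underlying GCM transform runs over it. The fix performs that copy through the "ecb(cipher_null)" spawn set up in init_tfm(), which behaves as a scatterlist-aware memcpy. The crypto_rfc4543_copy_src_to_dst() helper itself falls outside this excerpt; a minimal sketch of its plausible shape, reconstructed rather than quoted from the patch:

	static int crypto_rfc4543_copy_src_to_dst(struct aead_request *req, bool enc)
	{
		struct crypto_aead *aead = crypto_aead_reqtfm(req);
		struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(aead);
		unsigned int authsize = crypto_aead_authsize(aead);
		/* On decryption the trailing authentication tag is not copied. */
		unsigned int nbytes = req->cryptlen - (enc ? 0 : authsize);
		struct blkcipher_desc desc = { .tfm = ctx->null };

		/* "Encrypting" with ecb(cipher_null) copies src to dst. */
		return crypto_blkcipher_encrypt(&desc, req->dst, req->src, nbytes);
	}
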
+ 6 - 5
crypto/sha256_generic.c

@@ -246,7 +246,7 @@ static int sha256_init(struct shash_desc *desc)
 	return 0;
 }
 
-static int sha256_update(struct shash_desc *desc, const u8 *data,
+int crypto_sha256_update(struct shash_desc *desc, const u8 *data,
 			  unsigned int len)
 {
 	struct sha256_state *sctx = shash_desc_ctx(desc);
@@ -277,6 +277,7 @@ static int sha256_update(struct shash_desc *desc, const u8 *data,
 
 	return 0;
 }
+EXPORT_SYMBOL(crypto_sha256_update);
 
 static int sha256_final(struct shash_desc *desc, u8 *out)
 {
@@ -293,10 +294,10 @@ static int sha256_final(struct shash_desc *desc, u8 *out)
 	/* Pad out to 56 mod 64. */
 	index = sctx->count & 0x3f;
 	pad_len = (index < 56) ? (56 - index) : ((64+56) - index);
-	sha256_update(desc, padding, pad_len);
+	crypto_sha256_update(desc, padding, pad_len);
 
 	/* Append length (before padding) */
-	sha256_update(desc, (const u8 *)&bits, sizeof(bits));
+	crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits));
 
 	/* Store state in digest */
 	for (i = 0; i < 8; i++)
@@ -339,7 +340,7 @@ static int sha256_import(struct shash_desc *desc, const void *in)
 static struct shash_alg sha256_algs[2] = { {
 	.digestsize	=	SHA256_DIGEST_SIZE,
 	.init		=	sha256_init,
-	.update		=	sha256_update,
+	.update		=	crypto_sha256_update,
 	.final		=	sha256_final,
 	.export		=	sha256_export,
 	.import		=	sha256_import,
@@ -355,7 +356,7 @@ static struct shash_alg sha256_algs[2] = { {
 }, {
 	.digestsize	=	SHA224_DIGEST_SIZE,
 	.init		=	sha224_init,
-	.update		=	sha256_update,
+	.update		=	crypto_sha256_update,
 	.final		=	sha224_final,
 	.descsize	=	sizeof(struct sha256_state),
 	.base		=	{

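The rename-and-export above is what lets architecture-specific SHA-2 glue reuse the generic C implementation for buffering, small inputs, and contexts where the FPU is off limits. A sketch of how a hypothetical x86 glue update function would lean on the export (sha256_ssse3_update and the transform step are illustrative names, not code quoted from the tree):

	static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
				       unsigned int len)
	{
		struct sha256_state *sctx = shash_desc_ctx(desc);

		/* Defer to the exported generic code when the FPU cannot be
		 * used or the input will not complete a 64-byte block. */
		if (!irq_fpu_usable() ||
		    (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE)
			return crypto_sha256_update(desc, data, len);

		kernel_fpu_begin();
		/* ... feed whole blocks to the SSSE3 block transform ... */
		kernel_fpu_end();

		return 0;
	}
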
+ 7 - 6
crypto/sha512_generic.c

@@ -163,8 +163,8 @@ sha384_init(struct shash_desc *desc)
 	return 0;
 }
 
-static int
-sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len)
+int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
+			unsigned int len)
 {
 	struct sha512_state *sctx = shash_desc_ctx(desc);
 
@@ -197,6 +197,7 @@ sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len)
 
 	return 0;
 }
+EXPORT_SYMBOL(crypto_sha512_update);
 
 static int
 sha512_final(struct shash_desc *desc, u8 *hash)
@@ -215,10 +216,10 @@ sha512_final(struct shash_desc *desc, u8 *hash)
 	/* Pad out to 112 mod 128. */
 	index = sctx->count[0] & 0x7f;
 	pad_len = (index < 112) ? (112 - index) : ((128+112) - index);
-	sha512_update(desc, padding, pad_len);
+	crypto_sha512_update(desc, padding, pad_len);
 
 	/* Append length (before padding) */
-	sha512_update(desc, (const u8 *)bits, sizeof(bits));
+	crypto_sha512_update(desc, (const u8 *)bits, sizeof(bits));
 
 	/* Store state in digest */
 	for (i = 0; i < 8; i++)
@@ -245,7 +246,7 @@ static int sha384_final(struct shash_desc *desc, u8 *hash)
 static struct shash_alg sha512_algs[2] = { {
 	.digestsize	=	SHA512_DIGEST_SIZE,
 	.init		=	sha512_init,
-	.update		=	sha512_update,
+	.update		=	crypto_sha512_update,
 	.final		=	sha512_final,
 	.descsize	=	sizeof(struct sha512_state),
 	.base		=	{
@@ -257,7 +258,7 @@ static struct shash_alg sha512_algs[2] = { {
 }, {
 	.digestsize	=	SHA384_DIGEST_SIZE,
 	.init		=	sha384_init,
-	.update		=	sha512_update,
+	.update		=	crypto_sha512_update,
 	.final		=	sha384_final,
 	.descsize	=	sizeof(struct sha512_state),
 	.base		=	{

+ 29 - 1
crypto/tcrypt.c

@@ -1095,7 +1095,6 @@ static int do_test(int m)
 		break;
 
 	case 28:
-
 		ret += tcrypt_test("tgr160");
 		break;
 
@@ -1118,6 +1117,7 @@ static int do_test(int m)
 		ret += tcrypt_test("lrw(camellia)");
 		ret += tcrypt_test("xts(camellia)");
 		break;
+
 	case 33:
 		ret += tcrypt_test("sha224");
 		break;
@@ -1213,6 +1213,7 @@ static int do_test(int m)
 	case 109:
 		ret += tcrypt_test("vmac(aes)");
 		break;
+
 	case 110:
 		ret += tcrypt_test("hmac(crc32)");
 		break;
@@ -1225,6 +1226,18 @@ static int do_test(int m)
 		ret += tcrypt_test("rfc4106(gcm(aes))");
 		break;
 
+	case 152:
+		ret += tcrypt_test("rfc4543(gcm(aes))");
+		break;
+
+	case 153:
+		ret += tcrypt_test("cmac(aes)");
+		break;
+
+	case 154:
+		ret += tcrypt_test("cmac(des3_ede)");
+		break;
+
 	case 200:
 		test_cipher_speed("ecb(aes)", ENCRYPT, sec, NULL, 0,
 				speed_template_16_24_32);
@@ -1755,6 +1768,21 @@ static int do_test(int m)
 				   speed_template_32_64);
 		break;
 
+	case 509:
+		test_acipher_speed("ecb(blowfish)", ENCRYPT, sec, NULL, 0,
+				   speed_template_8_32);
+		test_acipher_speed("ecb(blowfish)", DECRYPT, sec, NULL, 0,
+				   speed_template_8_32);
+		test_acipher_speed("cbc(blowfish)", ENCRYPT, sec, NULL, 0,
+				   speed_template_8_32);
+		test_acipher_speed("cbc(blowfish)", DECRYPT, sec, NULL, 0,
+				   speed_template_8_32);
+		test_acipher_speed("ctr(blowfish)", ENCRYPT, sec, NULL, 0,
+				   speed_template_8_32);
+		test_acipher_speed("ctr(blowfish)", DECRYPT, sec, NULL, 0,
+				   speed_template_8_32);
+		break;
+
 	case 1000:
 		test_available();
 		break;

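Usage note (not part of the diff): tcrypt is driven by its mode= module parameter, so the new cases are run with e.g. "modprobe tcrypt mode=153" for cmac(aes), mode=154 for cmac(des3_ede), mode=152 for rfc4543(gcm(aes)), and "modprobe tcrypt sec=1 mode=509" for the timed async blowfish measurements; the module deliberately fails to stay loaded once the tests complete.
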
+ 93 - 2
crypto/testmgr.c

@@ -1644,19 +1644,31 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "__cbc-serpent-avx",
 		.test = alg_test_null,
+	}, {
+		.alg = "__cbc-serpent-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__cbc-serpent-sse2",
 		.test = alg_test_null,
 	}, {
 		.alg = "__cbc-twofish-avx",
 		.test = alg_test_null,
+	}, {
+		.alg = "__cbc-twofish-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-cbc-aes-aesni",
 		.test = alg_test_null,
 		.fips_allowed = 1,
+	}, {
+		.alg = "__driver-cbc-blowfish-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-cbc-camellia-aesni",
 		.test = alg_test_null,
+	}, {
+		.alg = "__driver-cbc-camellia-aesni-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-cbc-cast5-avx",
 		.test = alg_test_null,
@@ -1666,19 +1678,31 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "__driver-cbc-serpent-avx",
 		.test = alg_test_null,
+	}, {
+		.alg = "__driver-cbc-serpent-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-cbc-serpent-sse2",
 		.test = alg_test_null,
 	}, {
 		.alg = "__driver-cbc-twofish-avx",
 		.test = alg_test_null,
+	}, {
+		.alg = "__driver-cbc-twofish-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-ecb-aes-aesni",
 		.test = alg_test_null,
 		.fips_allowed = 1,
+	}, {
+		.alg = "__driver-ecb-blowfish-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-ecb-camellia-aesni",
 		.test = alg_test_null,
+	}, {
+		.alg = "__driver-ecb-camellia-aesni-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-ecb-cast5-avx",
 		.test = alg_test_null,
@@ -1688,12 +1712,18 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "__driver-ecb-serpent-avx",
 		.test = alg_test_null,
+	}, {
+		.alg = "__driver-ecb-serpent-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-ecb-serpent-sse2",
 		.test = alg_test_null,
 	}, {
 		.alg = "__driver-ecb-twofish-avx",
 		.test = alg_test_null,
+	}, {
+		.alg = "__driver-ecb-twofish-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__ghash-pclmulqdqni",
 		.test = alg_test_null,
@@ -1912,6 +1942,27 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "cmac(aes)",
+		.test = alg_test_hash,
+		.suite = {
+			.hash = {
+				.vecs = aes_cmac128_tv_template,
+				.count = CMAC_AES_TEST_VECTORS
+			}
+		}
+	}, {
+		.alg = "cmac(des3_ede)",
+		.test = alg_test_hash,
+		.suite = {
+			.hash = {
+				.vecs = des3_ede_cmac64_tv_template,
+				.count = CMAC_DES3_EDE_TEST_VECTORS
+			}
+		}
+	}, {
+		.alg = "compress_null",
+		.test = alg_test_null,
 	}, {
 		.alg = "crc32c",
 		.test = alg_test_crc32c,
@@ -1926,16 +1977,31 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.alg = "cryptd(__driver-cbc-aes-aesni)",
 		.test = alg_test_null,
 		.fips_allowed = 1,
+	}, {
+		.alg = "cryptd(__driver-cbc-blowfish-avx2)",
+		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-cbc-camellia-aesni)",
 		.test = alg_test_null,
+	}, {
+		.alg = "cryptd(__driver-cbc-camellia-aesni-avx2)",
+		.test = alg_test_null,
+	}, {
+		.alg = "cryptd(__driver-cbc-serpent-avx2)",
+		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-ecb-aes-aesni)",
 		.test = alg_test_null,
 		.fips_allowed = 1,
+	}, {
+		.alg = "cryptd(__driver-ecb-blowfish-avx2)",
+		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-ecb-camellia-aesni)",
 		.test = alg_test_null,
+	}, {
+		.alg = "cryptd(__driver-ecb-camellia-aesni-avx2)",
+		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-ecb-cast5-avx)",
 		.test = alg_test_null,
@@ -1945,12 +2011,18 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "cryptd(__driver-ecb-serpent-avx)",
 		.test = alg_test_null,
+	}, {
+		.alg = "cryptd(__driver-ecb-serpent-avx2)",
+		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-ecb-serpent-sse2)",
 		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-ecb-twofish-avx)",
 		.test = alg_test_null,
+	}, {
+		.alg = "cryptd(__driver-ecb-twofish-avx2)",
+		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-gcm-aes-aesni)",
 		.test = alg_test_null,
@@ -2126,6 +2198,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "digest_null",
+		.test = alg_test_null,
 	}, {
 		.alg = "ecb(__aes-aesni)",
 		.test = alg_test_null,
@@ -2236,6 +2311,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "ecb(cipher_null)",
+		.test = alg_test_null,
 	}, {
 		.alg = "ecb(des)",
 		.test = alg_test_skcipher,
@@ -2696,8 +2774,6 @@ static const struct alg_test_desc alg_test_descs[] = {
 			}
 		}
 	}, {
-
-
 		.alg = "rfc4309(ccm(aes))",
 		.test = alg_test_aead,
 		.fips_allowed = 1,
@@ -2713,6 +2789,21 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "rfc4543(gcm(aes))",
+		.test = alg_test_aead,
+		.suite = {
+			.aead = {
+				.enc = {
+					.vecs = aes_gcm_rfc4543_enc_tv_template,
+					.count = AES_GCM_4543_ENC_TEST_VECTORS
+				},
+				.dec = {
+					.vecs = aes_gcm_rfc4543_dec_tv_template,
+					.count = AES_GCM_4543_DEC_TEST_VECTORS
+				},
+			}
+		}
 	}, {
 		.alg = "rmd128",
 		.test = alg_test_hash,

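Background for the cmac vectors added to crypto/testmgr.h below: they are the NIST SP 800-38B examples, and the operation at the heart of CMAC is deriving the subkeys K1 and K2 by doubling E_K(0) in GF(2^blocksize). A self-contained illustration of that doubling for the 128-bit AES block (demonstration code, not the kernel's cmac implementation):

	#include <stdint.h>

	/*
	 * Multiply a 16-byte value by x in GF(2^128): shift left one bit and,
	 * if a bit fell off the top, fold in the reduction constant 0x87.
	 * (For the 64-bit des3_ede block size the constant is 0x1b instead.)
	 */
	static void gf128_double(uint8_t out[16], const uint8_t in[16])
	{
		int carry = in[0] & 0x80;
		int i;

		for (i = 0; i < 15; i++)
			out[i] = (uint8_t)((in[i] << 1) | (in[i + 1] >> 7));
		out[15] = (uint8_t)(in[15] << 1);
		if (carry)
			out[15] ^= 0x87;
	}

	/* CMAC uses K1 = gf128_double(AES_K(0)) for a final complete block and
	 * K2 = gf128_double(K1) for a final padded block; the .psize = 0
	 * vectors below exercise exactly that empty-message padding path. */
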
+ 1276 - 38
crypto/testmgr.h

@@ -1639,6 +1639,131 @@ static struct hash_testvec hmac_sha256_tv_template[] = {
 	},
 };
 
+#define CMAC_AES_TEST_VECTORS 6
+
+static struct hash_testvec aes_cmac128_tv_template[] = {
+	{ /* From NIST Special Publication 800-38B, AES-128 */
+		.key		= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+				  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.plaintext	= zeroed_string,
+		.digest		= "\xbb\x1d\x69\x29\xe9\x59\x37\x28"
+				  "\x7f\xa3\x7d\x12\x9b\x75\x67\x46",
+		.psize		= 0,
+		.ksize		= 16,
+	}, {
+		.key		= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+				  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a",
+		.digest		= "\x07\x0a\x16\xb4\x6b\x4d\x41\x44"
+				  "\xf7\x9b\xdd\x9d\xd0\x4a\x28\x7c",
+		.psize		= 16,
+		.ksize		= 16,
+	}, {
+		.key		= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+				  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+				  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+				  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+				  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11",
+		.digest		= "\xdf\xa6\x67\x47\xde\x9a\xe6\x30"
+				  "\x30\xca\x32\x61\x14\x97\xc8\x27",
+		.psize		= 40,
+		.ksize		= 16,
+	}, {
+		.key		= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+				  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+				  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+				  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+				  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+				  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+				  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+				  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.digest		= "\x51\xf0\xbe\xbf\x7e\x3b\x9d\x92"
+				  "\xfc\x49\x74\x17\x79\x36\x3c\xfe",
+		.psize		= 64,
+		.ksize		= 16,
+	}, { /* From NIST Special Publication 800-38B, AES-256 */
+		.key		= "\x60\x3d\xeb\x10\x15\xca\x71\xbe"
+				  "\x2b\x73\xae\xf0\x85\x7d\x77\x81"
+				  "\x1f\x35\x2c\x07\x3b\x61\x08\xd7"
+				  "\x2d\x98\x10\xa3\x09\x14\xdf\xf4",
+		.plaintext	= zeroed_string,
+		.digest		= "\x02\x89\x62\xf6\x1b\x7b\xf8\x9e"
+				  "\xfc\x6b\x55\x1f\x46\x67\xd9\x83",
+		.psize		= 0,
+		.ksize		= 32,
+	}, {
+		.key		= "\x60\x3d\xeb\x10\x15\xca\x71\xbe"
+				  "\x2b\x73\xae\xf0\x85\x7d\x77\x81"
+				  "\x1f\x35\x2c\x07\x3b\x61\x08\xd7"
+				  "\x2d\x98\x10\xa3\x09\x14\xdf\xf4",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+				  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+				  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+				  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+				  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+				  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+				  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.digest		= "\xe1\x99\x21\x90\x54\x9f\x6e\xd5"
+				  "\x69\x6a\x2c\x05\x6c\x31\x54\x10",
+		.psize		= 64,
+		.ksize		= 32,
+	}
+};
+
+#define CMAC_DES3_EDE_TEST_VECTORS 4
+
+static struct hash_testvec des3_ede_cmac64_tv_template[] = {
+/*
+ * From NIST Special Publication 800-38B, Three Key TDEA
+ * Corrected test vectors from:
+ *  http://csrc.nist.gov/publications/nistpubs/800-38B/Updated_CMAC_Examples.pdf
+ */
+	{
+		.key		= "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62"
+				  "\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58"
+				  "\xbc\x31\x3d\x4a\x37\x1c\xa8\xb5",
+		.plaintext	= zeroed_string,
+		.digest		= "\xb7\xa6\x88\xe1\x22\xff\xaf\x95",
+		.psize		= 0,
+		.ksize		= 24,
+	}, {
+		.key		= "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62"
+				  "\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58"
+				  "\xbc\x31\x3d\x4a\x37\x1c\xa8\xb5",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96",
+		.digest		= "\x8e\x8f\x29\x31\x36\x28\x37\x97",
+		.psize		= 8,
+		.ksize		= 24,
+	}, {
+		.key		= "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62"
+				  "\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58"
+				  "\xbc\x31\x3d\x4a\x37\x1c\xa8\xb5",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+				  "\xae\x2d\x8a\x57",
+		.digest		= "\x74\x3d\xdb\xe0\xce\x2d\xc2\xed",
+		.psize		= 20,
+		.ksize		= 24,
+	}, {
+		.key		= "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62"
+				  "\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58"
+				  "\xbc\x31\x3d\x4a\x37\x1c\xa8\xb5",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+				  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+				  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51",
+		.digest		= "\x33\xe6\xb1\x09\x24\x00\xea\xe5",
+		.psize		= 32,
+		.ksize		= 24,
+	}
+};
+
 #define XCBC_AES_TEST_VECTORS 6
 
 static struct hash_testvec aes_xcbc128_tv_template[] = {
@@ -12680,6 +12805,8 @@ static struct cipher_testvec cast6_xts_dec_tv_template[] = {
 #define AES_GCM_DEC_TEST_VECTORS 8
 #define AES_GCM_4106_ENC_TEST_VECTORS 7
 #define AES_GCM_4106_DEC_TEST_VECTORS 7
+#define AES_GCM_4543_ENC_TEST_VECTORS 1
+#define AES_GCM_4543_DEC_TEST_VECTORS 2
 #define AES_CCM_ENC_TEST_VECTORS 7
 #define AES_CCM_DEC_TEST_VECTORS 7
 #define AES_CCM_4309_ENC_TEST_VECTORS 7
@@ -18193,6 +18320,93 @@ static struct aead_testvec aes_gcm_rfc4106_dec_tv_template[] = {
 	}
 };
 
+static struct aead_testvec aes_gcm_rfc4543_enc_tv_template[] = {
+	{ /* From draft-mcgrew-gcm-test-01 */
+		.key	= "\x4c\x80\xcd\xef\xbb\x5d\x10\xda"
+			  "\x90\x6a\xc7\x3c\x36\x13\xa6\x34"
+			  "\x22\x43\x3c\x64",
+		.klen	= 20,
+		.iv	= zeroed_string,
+		.assoc	= "\x00\x00\x43\x21\x00\x00\x00\x07",
+		.alen	= 8,
+		.input	= "\x45\x00\x00\x30\xda\x3a\x00\x00"
+			  "\x80\x01\xdf\x3b\xc0\xa8\x00\x05"
+			  "\xc0\xa8\x00\x01\x08\x00\xc6\xcd"
+			  "\x02\x00\x07\x00\x61\x62\x63\x64"
+			  "\x65\x66\x67\x68\x69\x6a\x6b\x6c"
+			  "\x6d\x6e\x6f\x70\x71\x72\x73\x74"
+			  "\x01\x02\x02\x01",
+		.ilen	= 52,
+		.result	= "\x45\x00\x00\x30\xda\x3a\x00\x00"
+			  "\x80\x01\xdf\x3b\xc0\xa8\x00\x05"
+			  "\xc0\xa8\x00\x01\x08\x00\xc6\xcd"
+			  "\x02\x00\x07\x00\x61\x62\x63\x64"
+			  "\x65\x66\x67\x68\x69\x6a\x6b\x6c"
+			  "\x6d\x6e\x6f\x70\x71\x72\x73\x74"
+			  "\x01\x02\x02\x01\xf2\xa9\xa8\x36"
+			  "\xe1\x55\x10\x6a\xa8\xdc\xd6\x18"
+			  "\xe4\x09\x9a\xaa",
+		.rlen	= 68,
+	}
+};
+
+static struct aead_testvec aes_gcm_rfc4543_dec_tv_template[] = {
+	{ /* From draft-mcgrew-gcm-test-01 */
+		.key	= "\x4c\x80\xcd\xef\xbb\x5d\x10\xda"
+			  "\x90\x6a\xc7\x3c\x36\x13\xa6\x34"
+			  "\x22\x43\x3c\x64",
+		.klen	= 20,
+		.iv	= zeroed_string,
+		.assoc	= "\x00\x00\x43\x21\x00\x00\x00\x07",
+		.alen	= 8,
+		.input	= "\x45\x00\x00\x30\xda\x3a\x00\x00"
+			  "\x80\x01\xdf\x3b\xc0\xa8\x00\x05"
+			  "\xc0\xa8\x00\x01\x08\x00\xc6\xcd"
+			  "\x02\x00\x07\x00\x61\x62\x63\x64"
+			  "\x65\x66\x67\x68\x69\x6a\x6b\x6c"
+			  "\x6d\x6e\x6f\x70\x71\x72\x73\x74"
+			  "\x01\x02\x02\x01\xf2\xa9\xa8\x36"
+			  "\xe1\x55\x10\x6a\xa8\xdc\xd6\x18"
+			  "\xe4\x09\x9a\xaa",
+		.ilen	= 68,
+		.result	= "\x45\x00\x00\x30\xda\x3a\x00\x00"
+			  "\x80\x01\xdf\x3b\xc0\xa8\x00\x05"
+			  "\xc0\xa8\x00\x01\x08\x00\xc6\xcd"
+			  "\x02\x00\x07\x00\x61\x62\x63\x64"
+			  "\x65\x66\x67\x68\x69\x6a\x6b\x6c"
+			  "\x6d\x6e\x6f\x70\x71\x72\x73\x74"
+			  "\x01\x02\x02\x01",
+		.rlen	= 52,
+	}, { /* nearly same as previous, but should fail */
+		.key	= "\x4c\x80\xcd\xef\xbb\x5d\x10\xda"
+			  "\x90\x6a\xc7\x3c\x36\x13\xa6\x34"
+			  "\x22\x43\x3c\x64",
+		.klen	= 20,
+		.iv	= zeroed_string,
+		.assoc	= "\x00\x00\x43\x21\x00\x00\x00\x07",
+		.alen	= 8,
+		.input	= "\x45\x00\x00\x30\xda\x3a\x00\x00"
+			  "\x80\x01\xdf\x3b\xc0\xa8\x00\x05"
+			  "\xc0\xa8\x00\x01\x08\x00\xc6\xcd"
+			  "\x02\x00\x07\x00\x61\x62\x63\x64"
+			  "\x65\x66\x67\x68\x69\x6a\x6b\x6c"
+			  "\x6d\x6e\x6f\x70\x71\x72\x73\x74"
+			  "\x01\x02\x02\x01\xf2\xa9\xa8\x36"
+			  "\xe1\x55\x10\x6a\xa8\xdc\xd6\x18"
+			  "\x00\x00\x00\x00",
+		.ilen	= 68,
+		.novrfy = 1,
+		.result	= "\x45\x00\x00\x30\xda\x3a\x00\x00"
+			  "\x80\x01\xdf\x3b\xc0\xa8\x00\x05"
+			  "\xc0\xa8\x00\x01\x08\x00\xc6\xcd"
+			  "\x02\x00\x07\x00\x61\x62\x63\x64"
+			  "\x65\x66\x67\x68\x69\x6a\x6b\x6c"
+			  "\x6d\x6e\x6f\x70\x71\x72\x73\x74"
+			  "\x01\x02\x02\x01",
+		.rlen	= 52,
+	},
+};
+
 static struct aead_testvec aes_ccm_enc_tv_template[] = {
 static struct aead_testvec aes_ccm_enc_tv_template[] = {
 	{ /* From RFC 3610 */
 		.key	= "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7"
 			  "\x86\x1D\xB4\x28\xBF\x56\xED\x61"
 			  "\x86\x1D\xB4\x28\xBF\x56\xED\x61"
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
-			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7",
-		.ilen	= 496,
+			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D",
+		.ilen	= 1008,
 		.result	= "\xED\xCD\xDB\xB8\x68\xCE\xBD\xEA"
 			  "\x9D\x9D\xCD\x9F\x4F\xFC\x4D\xB7"
 			  "\xA5\xFF\x6F\x43\x0F\xBA\x32\x04"
@@ -20846,11 +21124,75 @@ static struct cipher_testvec camellia_enc_tv_template[] = {
 			  "\x2C\x35\x1B\x38\x85\x7D\xE8\xF3"
 			  "\x87\x4F\xDA\xD8\x5F\xFC\xB6\x44"
 			  "\xD0\xE3\x9B\x8B\xBF\xD6\xB8\xC4"
-			  "\x73\xAE\x1D\x8B\x5B\x74\x8B\xCB",
-		.rlen	= 496,
+			  "\x73\xAE\x1D\x8B\x5B\x74\x8B\xCB"
+			  "\xA4\xAD\xCF\x5D\xD4\x58\xC9\xCD"
+			  "\xF7\x90\x68\xCF\xC9\x11\x52\x3E"
+			  "\xE8\xA1\xA3\x78\x8B\xD0\xAC\x0A"
+			  "\xD4\xC9\xA3\xA5\x55\x30\xC8\x3E"
+			  "\xED\x28\x39\xE9\x63\xED\x41\x70"
+			  "\x51\xE3\xC4\xA0\xFC\xD5\x43\xCB"
+			  "\x4D\x65\xC8\xFD\x3A\x91\x8F\x60"
+			  "\x8A\xA6\x6D\x9D\x3E\x01\x23\x4B"
+			  "\x50\x47\xC9\xDC\x9B\xDE\x37\xC5"
+			  "\xBF\x67\xB1\x6B\x78\x38\xD5\x7E"
+			  "\xB6\xFF\x67\x83\x3B\x6E\xBE\x23"
+			  "\x45\xFA\x1D\x69\x44\xFD\xC6\xB9"
+			  "\xD0\x4A\x92\xD1\xBE\xF6\x4A\xB7"
+			  "\xCA\xA8\xA2\x9E\x13\x87\x57\x92"
+			  "\x64\x7C\x85\x0B\xB3\x29\x37\xD8"
+			  "\xE6\xAA\xAF\xC4\x03\x67\xA3\xBF"
+			  "\x2E\x45\x83\xB6\xD8\x54\x00\x89"
+			  "\xF6\xBC\x3A\x7A\x88\x58\x51\xED"
+			  "\xF4\x4E\x01\xA5\xC3\x2E\xD9\x42"
+			  "\xBD\x6E\x0D\x0B\x21\xB0\x1A\xCC"
+			  "\xA4\xD3\x3F\xDC\x9B\x81\xD8\xF1"
+			  "\xEA\x7A\x6A\xB7\x07\xC9\x6D\x91"
+			  "\x6D\x3A\xF5\x5F\xA6\xFF\x87\x1E"
+			  "\x3F\xDD\xC0\x72\xEA\xAC\x08\x15"
+			  "\x21\xE6\xC6\xB6\x0D\xD8\x51\x86"
+			  "\x2A\x03\x73\xF7\x29\xD4\xC4\xE4"
+			  "\x7F\x95\x10\xF7\xAB\x3F\x92\x23"
+			  "\xD3\xCE\x9C\x2E\x46\x3B\x63\x43"
+			  "\xBB\xC2\x82\x7A\x83\xD5\x55\xE2"
+			  "\xE7\x9B\x2F\x92\xAF\xFD\x81\x56"
+			  "\x79\xFD\x3E\xF9\x46\xE0\x25\xD4"
+			  "\x38\xDE\xBC\x2C\xC4\x7A\x2A\x8F"
+			  "\x94\x4F\xD0\xAD\x9B\x37\x18\xD4"
+			  "\x0E\x4D\x0F\x02\x3A\xDC\x5A\xA2"
+			  "\x39\x25\x55\x20\x5A\xA6\x02\x9F"
+			  "\xE6\x77\x21\x77\xE5\x4B\x7B\x0B"
+			  "\x30\xF8\x5F\x33\x0F\x49\xCD\xFF"
+			  "\xF2\xE4\x35\xF9\xF0\x63\xC3\x7E"
+			  "\xF1\xA6\x73\xB4\xDF\xE7\xBB\x78"
+			  "\xFF\x21\xA9\xF3\xF3\xCF\x5D\xBA"
+			  "\xED\x87\x98\xAC\xFE\x48\x97\x6D"
+			  "\xA6\x7F\x69\x31\xB1\xC4\xFF\x14"
+			  "\xC6\x76\xD4\x10\xDD\xF6\x49\x2C"
+			  "\x9C\xC8\x6D\x76\xC0\x8F\x5F\x55"
+			  "\x2F\x3C\x8A\x30\xAA\xC3\x16\x55"
+			  "\xC6\xFC\x8D\x8B\xB9\xE5\x80\x6C"
+			  "\xC8\x7E\xBD\x65\x58\x36\xD5\xBC"
+			  "\xF0\x33\x52\x29\x70\xF9\x5C\xE9"
+			  "\xAC\x1F\xB5\x73\x56\x66\x54\xAF"
+			  "\x1B\x8F\x7D\xED\xAB\x03\xCE\xE3"
+			  "\xAE\x47\xB6\x69\x86\xE9\x01\x31"
+			  "\x83\x18\x3D\xF4\x74\x7B\xF9\x42"
+			  "\x4C\xFD\x75\x4A\x6D\xF0\x03\xA6"
+			  "\x2B\x20\x63\xDA\x49\x65\x5E\x8B"
+			  "\xC0\x19\xE3\x8D\xD9\xF3\xB0\x34"
+			  "\xD3\x52\xFC\x68\x00\x43\x1B\x37"
+			  "\x31\x93\x51\x1C\x63\x97\x70\xB0"
+			  "\x99\x78\x83\x13\xFD\xCF\x53\x81"
+			  "\x36\x46\xB5\x42\x52\x2F\x32\xEB"
+			  "\x4A\x3D\xF1\x8F\x1C\x54\x2E\xFC"
+			  "\x41\x75\x5A\x8C\x8E\x6F\xE7\x1A"
+			  "\xAE\xEF\x3E\x82\x12\x0B\x74\x72"
+			  "\xF8\xB2\xAA\x7A\xD6\xFF\xFA\x55"
+			  "\x33\x1A\xBB\xD3\xA2\x7E\x97\x66",
+		.rlen	= 1008,
 		.also_non_np = 1,
 		.np	= 2,
-		.tap	= { 496 - 16, 16 },
+		.tap	= { 1008 - 16, 16 },
 	},
 };
 
@@ -20955,8 +21297,72 @@ static struct cipher_testvec camellia_dec_tv_template[] = {
 			  "\x2C\x35\x1B\x38\x85\x7D\xE8\xF3"
 			  "\x87\x4F\xDA\xD8\x5F\xFC\xB6\x44"
 			  "\xD0\xE3\x9B\x8B\xBF\xD6\xB8\xC4"
-			  "\x73\xAE\x1D\x8B\x5B\x74\x8B\xCB",
-		.ilen	= 496,
+			  "\x73\xAE\x1D\x8B\x5B\x74\x8B\xCB"
+			  "\xA4\xAD\xCF\x5D\xD4\x58\xC9\xCD"
+			  "\xF7\x90\x68\xCF\xC9\x11\x52\x3E"
+			  "\xE8\xA1\xA3\x78\x8B\xD0\xAC\x0A"
+			  "\xD4\xC9\xA3\xA5\x55\x30\xC8\x3E"
+			  "\xED\x28\x39\xE9\x63\xED\x41\x70"
+			  "\x51\xE3\xC4\xA0\xFC\xD5\x43\xCB"
+			  "\x4D\x65\xC8\xFD\x3A\x91\x8F\x60"
+			  "\x8A\xA6\x6D\x9D\x3E\x01\x23\x4B"
+			  "\x50\x47\xC9\xDC\x9B\xDE\x37\xC5"
+			  "\xBF\x67\xB1\x6B\x78\x38\xD5\x7E"
+			  "\xB6\xFF\x67\x83\x3B\x6E\xBE\x23"
+			  "\x45\xFA\x1D\x69\x44\xFD\xC6\xB9"
+			  "\xD0\x4A\x92\xD1\xBE\xF6\x4A\xB7"
+			  "\xCA\xA8\xA2\x9E\x13\x87\x57\x92"
+			  "\x64\x7C\x85\x0B\xB3\x29\x37\xD8"
+			  "\xE6\xAA\xAF\xC4\x03\x67\xA3\xBF"
+			  "\x2E\x45\x83\xB6\xD8\x54\x00\x89"
+			  "\xF6\xBC\x3A\x7A\x88\x58\x51\xED"
+			  "\xF4\x4E\x01\xA5\xC3\x2E\xD9\x42"
+			  "\xBD\x6E\x0D\x0B\x21\xB0\x1A\xCC"
+			  "\xA4\xD3\x3F\xDC\x9B\x81\xD8\xF1"
+			  "\xEA\x7A\x6A\xB7\x07\xC9\x6D\x91"
+			  "\x6D\x3A\xF5\x5F\xA6\xFF\x87\x1E"
+			  "\x3F\xDD\xC0\x72\xEA\xAC\x08\x15"
+			  "\x21\xE6\xC6\xB6\x0D\xD8\x51\x86"
+			  "\x2A\x03\x73\xF7\x29\xD4\xC4\xE4"
+			  "\x7F\x95\x10\xF7\xAB\x3F\x92\x23"
+			  "\xD3\xCE\x9C\x2E\x46\x3B\x63\x43"
+			  "\xBB\xC2\x82\x7A\x83\xD5\x55\xE2"
+			  "\xE7\x9B\x2F\x92\xAF\xFD\x81\x56"
+			  "\x79\xFD\x3E\xF9\x46\xE0\x25\xD4"
+			  "\x38\xDE\xBC\x2C\xC4\x7A\x2A\x8F"
+			  "\x94\x4F\xD0\xAD\x9B\x37\x18\xD4"
+			  "\x0E\x4D\x0F\x02\x3A\xDC\x5A\xA2"
+			  "\x39\x25\x55\x20\x5A\xA6\x02\x9F"
+			  "\xE6\x77\x21\x77\xE5\x4B\x7B\x0B"
+			  "\x30\xF8\x5F\x33\x0F\x49\xCD\xFF"
+			  "\xF2\xE4\x35\xF9\xF0\x63\xC3\x7E"
+			  "\xF1\xA6\x73\xB4\xDF\xE7\xBB\x78"
+			  "\xFF\x21\xA9\xF3\xF3\xCF\x5D\xBA"
+			  "\xED\x87\x98\xAC\xFE\x48\x97\x6D"
+			  "\xA6\x7F\x69\x31\xB1\xC4\xFF\x14"
+			  "\xC6\x76\xD4\x10\xDD\xF6\x49\x2C"
+			  "\x9C\xC8\x6D\x76\xC0\x8F\x5F\x55"
+			  "\x2F\x3C\x8A\x30\xAA\xC3\x16\x55"
+			  "\xC6\xFC\x8D\x8B\xB9\xE5\x80\x6C"
+			  "\xC8\x7E\xBD\x65\x58\x36\xD5\xBC"
+			  "\xF0\x33\x52\x29\x70\xF9\x5C\xE9"
+			  "\xAC\x1F\xB5\x73\x56\x66\x54\xAF"
+			  "\x1B\x8F\x7D\xED\xAB\x03\xCE\xE3"
+			  "\xAE\x47\xB6\x69\x86\xE9\x01\x31"
+			  "\x83\x18\x3D\xF4\x74\x7B\xF9\x42"
+			  "\x4C\xFD\x75\x4A\x6D\xF0\x03\xA6"
+			  "\x2B\x20\x63\xDA\x49\x65\x5E\x8B"
+			  "\xC0\x19\xE3\x8D\xD9\xF3\xB0\x34"
+			  "\xD3\x52\xFC\x68\x00\x43\x1B\x37"
+			  "\x31\x93\x51\x1C\x63\x97\x70\xB0"
+			  "\x99\x78\x83\x13\xFD\xCF\x53\x81"
+			  "\x36\x46\xB5\x42\x52\x2F\x32\xEB"
+			  "\x4A\x3D\xF1\x8F\x1C\x54\x2E\xFC"
+			  "\x41\x75\x5A\x8C\x8E\x6F\xE7\x1A"
+			  "\xAE\xEF\x3E\x82\x12\x0B\x74\x72"
+			  "\xF8\xB2\xAA\x7A\xD6\xFF\xFA\x55"
+			  "\x33\x1A\xBB\xD3\xA2\x7E\x97\x66",
+		.ilen	= 1008,
 		.result	= "\x56\xED\x84\x1B\x8F\x26\xBD\x31"
 			  "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3"
 			  "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15"
@@ -21018,11 +21424,75 @@ static struct cipher_testvec camellia_dec_tv_template[] = {
 			  "\x86\x1D\xB4\x28\xBF\x56\xED\x61"
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
-			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7",
-		.rlen	= 496,
+			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D",
+		.rlen	= 1008,
 		.also_non_np = 1,
 		.np	= 2,
-		.tap	= { 496 - 16, 16 },
+		.tap	= { 1008 - 16, 16 },
 	},
 };
 
@@ -21123,8 +21593,72 @@ static struct cipher_testvec camellia_cbc_enc_tv_template[] = {
 			  "\x86\x1D\xB4\x28\xBF\x56\xED\x61"
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
-			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7",
-		.ilen	= 496,
+			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D",
+		.ilen	= 1008,
 		.result	= "\xCD\x3E\x2A\x3B\x3E\x94\xC5\x77"
 			  "\xBA\xBB\x5B\xB1\xDE\x7B\xA4\x40"
 			  "\x88\x39\xE3\xFD\x94\x4B\x25\x58"
@@ -21186,11 +21720,75 @@ static struct cipher_testvec camellia_cbc_enc_tv_template[] = {
 			  "\x2D\x1A\x68\xFE\xEC\x92\x94\xDA"
 			  "\x94\x2A\x6F\xD6\xFE\xE5\x76\x97"
 			  "\xF4\x6E\xEE\xCB\x2B\x95\x4E\x36"
-			  "\x5F\x74\x8C\x86\x5B\x71\xD0\x20",
-		.rlen	= 496,
+			  "\x5F\x74\x8C\x86\x5B\x71\xD0\x20"
+			  "\x78\x1A\x7F\x18\x8C\xD9\xCD\xF5"
+			  "\x21\x41\x56\x72\x13\xE1\x86\x07"
+			  "\x07\x26\xF3\x4F\x7B\xEA\xB5\x18"
+			  "\xFE\x94\x2D\x9F\xE0\x72\x18\x65"
+			  "\xB2\xA5\x63\x48\xB4\x13\x22\xF7"
+			  "\x25\xF1\x80\xA8\x7F\x54\x86\x7B"
+			  "\x39\xAE\x95\x0C\x09\x32\x22\x2D"
+			  "\x4D\x73\x39\x0C\x09\x2C\x7C\x10"
+			  "\xD0\x4B\x53\xF6\x90\xC5\x99\x2F"
+			  "\x15\xE1\x7F\xC6\xC5\x7A\x52\x14"
+			  "\x65\xEE\x93\x54\xD0\x66\x15\x3C"
+			  "\x4C\x68\xFD\x64\x0F\xF9\x10\x39"
+			  "\x46\x7A\xDD\x97\x20\xEE\xC7\xD2"
+			  "\x98\x4A\xB6\xE6\xF5\xA8\x1F\x4F"
+			  "\xDB\xAB\x6D\xD5\x9B\x34\x16\x97"
+			  "\x2F\x64\xE5\x37\xEF\x0E\xA1\xE9"
+			  "\xBE\x31\x31\x96\x8B\x40\x18\x75"
+			  "\x11\x75\x14\x32\xA5\x2D\x1B\x6B"
+			  "\xDB\x59\xEB\xFA\x3D\x8E\x7C\xC4"
+			  "\xDE\x68\xC8\x9F\xC9\x99\xE3\xC6"
+			  "\x71\xB0\x12\x57\x89\x0D\xC0\x2B"
+			  "\x9F\x12\x6A\x04\x67\xF1\x95\x31"
+			  "\x59\xFD\x84\x95\x2C\x9C\x5B\xEC"
+			  "\x09\xB0\x43\x96\x4A\x64\x80\x40"
+			  "\xB9\x72\x19\xDD\x70\x42\xFA\xB1"
+			  "\x4A\x2C\x0C\x0A\x60\x6E\xE3\x7C"
+			  "\x37\x5A\xBE\xA4\x62\xCF\x29\xAB"
+			  "\x7F\x4D\xA6\xB3\xE2\xB6\x64\xC6"
+			  "\x33\x0B\xF3\xD5\x01\x38\x74\xA4"
+			  "\x67\x1E\x75\x68\xC3\xAD\x76\xE9"
+			  "\xE9\xBC\xF0\xEB\xD8\xFD\x31\x8A"
+			  "\x5F\xC9\x18\x94\x4B\x86\x66\xFC"
+			  "\xBD\x0B\x3D\xB3\x9F\xFA\x1F\xD9"
+			  "\x78\xC4\xE3\x24\x1C\x67\xA2\xF8"
+			  "\x43\xBC\x76\x75\xBF\x6C\x05\xB3"
+			  "\x32\xE8\x7C\x80\xDB\xC7\xB6\x61"
+			  "\x1A\x3E\x2B\xA7\x25\xED\x8F\xA0"
+			  "\x00\x4B\xF8\x90\xCA\xD8\xFB\x12"
+			  "\xAC\x1F\x18\xE9\xD2\x5E\xA2\x8E"
+			  "\xE4\x84\x6B\x9D\xEB\x1E\x6B\xA3"
+			  "\x7B\xDC\xCE\x15\x97\x27\xB2\x65"
+			  "\xBC\x0E\x47\xAB\x55\x13\x53\xAB"
+			  "\x0E\x34\x55\x02\x5F\x27\xC5\x89"
+			  "\xDF\xC5\x70\xC4\xDD\x76\x82\xEE"
+			  "\x68\xA6\x09\xB0\xE5\x5E\xF1\x0C"
+			  "\xE3\xF3\x09\x9B\xFE\x65\x4B\xB8"
+			  "\x30\xEC\xD5\x7C\x6A\xEC\x1D\xD2"
+			  "\x93\xB7\xA1\x1A\x02\xD4\xC0\xD6"
+			  "\x8D\x4D\x83\x9A\xED\x29\x4E\x14"
+			  "\x86\xD5\x3C\x1A\xD5\xB9\x0A\x6A"
+			  "\x72\x22\xD5\x92\x38\xF1\xA1\x86"
+			  "\xB2\x41\x51\xCA\x4E\xAB\x8F\xD3"
+			  "\x80\x56\xC3\xD7\x65\xE1\xB3\x86"
+			  "\xCB\xCE\x98\xA1\xD4\x59\x1C\x06"
+			  "\x01\xED\xF8\x29\x91\x19\x5C\x9A"
+			  "\xEE\x28\x1B\x48\xD7\x32\xEF\x9F"
+			  "\x6C\x2B\x66\x4E\x78\xD5\x8B\x72"
+			  "\x80\xE7\x29\xDC\x23\x55\x98\x54"
+			  "\xB1\xFF\x3E\x95\x56\xA8\x78\x78"
+			  "\xEF\xC4\xA5\x11\x2D\x2B\xD8\x93"
+			  "\x30\x6E\x7E\x51\xBB\x42\x5F\x03"
+			  "\x43\x94\x23\x7E\xEE\xF0\xA5\x79"
+			  "\x55\x01\xD4\x58\xB2\xF2\x85\x49"
+			  "\x70\xC5\xB9\x0B\x3B\x7A\x6E\x6C",
+		.rlen	= 1008,
 		.also_non_np = 1,
 		.np	= 2,
-		.tap	= { 496 - 16, 16 },
+		.tap	= { 1008 - 16, 16 },
 	},
 };
 
@@ -21291,8 +21889,72 @@ static struct cipher_testvec camellia_cbc_dec_tv_template[] = {
 			  "\x2D\x1A\x68\xFE\xEC\x92\x94\xDA"
 			  "\x94\x2A\x6F\xD6\xFE\xE5\x76\x97"
 			  "\xF4\x6E\xEE\xCB\x2B\x95\x4E\x36"
-			  "\x5F\x74\x8C\x86\x5B\x71\xD0\x20",
-		.ilen	= 496,
+			  "\x5F\x74\x8C\x86\x5B\x71\xD0\x20"
+			  "\x78\x1A\x7F\x18\x8C\xD9\xCD\xF5"
+			  "\x21\x41\x56\x72\x13\xE1\x86\x07"
+			  "\x07\x26\xF3\x4F\x7B\xEA\xB5\x18"
+			  "\xFE\x94\x2D\x9F\xE0\x72\x18\x65"
+			  "\xB2\xA5\x63\x48\xB4\x13\x22\xF7"
+			  "\x25\xF1\x80\xA8\x7F\x54\x86\x7B"
+			  "\x39\xAE\x95\x0C\x09\x32\x22\x2D"
+			  "\x4D\x73\x39\x0C\x09\x2C\x7C\x10"
+			  "\xD0\x4B\x53\xF6\x90\xC5\x99\x2F"
+			  "\x15\xE1\x7F\xC6\xC5\x7A\x52\x14"
+			  "\x65\xEE\x93\x54\xD0\x66\x15\x3C"
+			  "\x4C\x68\xFD\x64\x0F\xF9\x10\x39"
+			  "\x46\x7A\xDD\x97\x20\xEE\xC7\xD2"
+			  "\x98\x4A\xB6\xE6\xF5\xA8\x1F\x4F"
+			  "\xDB\xAB\x6D\xD5\x9B\x34\x16\x97"
+			  "\x2F\x64\xE5\x37\xEF\x0E\xA1\xE9"
+			  "\xBE\x31\x31\x96\x8B\x40\x18\x75"
+			  "\x11\x75\x14\x32\xA5\x2D\x1B\x6B"
+			  "\xDB\x59\xEB\xFA\x3D\x8E\x7C\xC4"
+			  "\xDE\x68\xC8\x9F\xC9\x99\xE3\xC6"
+			  "\x71\xB0\x12\x57\x89\x0D\xC0\x2B"
+			  "\x9F\x12\x6A\x04\x67\xF1\x95\x31"
+			  "\x59\xFD\x84\x95\x2C\x9C\x5B\xEC"
+			  "\x09\xB0\x43\x96\x4A\x64\x80\x40"
+			  "\xB9\x72\x19\xDD\x70\x42\xFA\xB1"
+			  "\x4A\x2C\x0C\x0A\x60\x6E\xE3\x7C"
+			  "\x37\x5A\xBE\xA4\x62\xCF\x29\xAB"
+			  "\x7F\x4D\xA6\xB3\xE2\xB6\x64\xC6"
+			  "\x33\x0B\xF3\xD5\x01\x38\x74\xA4"
+			  "\x67\x1E\x75\x68\xC3\xAD\x76\xE9"
+			  "\xE9\xBC\xF0\xEB\xD8\xFD\x31\x8A"
+			  "\x5F\xC9\x18\x94\x4B\x86\x66\xFC"
+			  "\xBD\x0B\x3D\xB3\x9F\xFA\x1F\xD9"
+			  "\x78\xC4\xE3\x24\x1C\x67\xA2\xF8"
+			  "\x43\xBC\x76\x75\xBF\x6C\x05\xB3"
+			  "\x32\xE8\x7C\x80\xDB\xC7\xB6\x61"
+			  "\x1A\x3E\x2B\xA7\x25\xED\x8F\xA0"
+			  "\x00\x4B\xF8\x90\xCA\xD8\xFB\x12"
+			  "\xAC\x1F\x18\xE9\xD2\x5E\xA2\x8E"
+			  "\xE4\x84\x6B\x9D\xEB\x1E\x6B\xA3"
+			  "\x7B\xDC\xCE\x15\x97\x27\xB2\x65"
+			  "\xBC\x0E\x47\xAB\x55\x13\x53\xAB"
+			  "\x0E\x34\x55\x02\x5F\x27\xC5\x89"
+			  "\xDF\xC5\x70\xC4\xDD\x76\x82\xEE"
+			  "\x68\xA6\x09\xB0\xE5\x5E\xF1\x0C"
+			  "\xE3\xF3\x09\x9B\xFE\x65\x4B\xB8"
+			  "\x30\xEC\xD5\x7C\x6A\xEC\x1D\xD2"
+			  "\x93\xB7\xA1\x1A\x02\xD4\xC0\xD6"
+			  "\x8D\x4D\x83\x9A\xED\x29\x4E\x14"
+			  "\x86\xD5\x3C\x1A\xD5\xB9\x0A\x6A"
+			  "\x72\x22\xD5\x92\x38\xF1\xA1\x86"
+			  "\xB2\x41\x51\xCA\x4E\xAB\x8F\xD3"
+			  "\x80\x56\xC3\xD7\x65\xE1\xB3\x86"
+			  "\xCB\xCE\x98\xA1\xD4\x59\x1C\x06"
+			  "\x01\xED\xF8\x29\x91\x19\x5C\x9A"
+			  "\xEE\x28\x1B\x48\xD7\x32\xEF\x9F"
+			  "\x6C\x2B\x66\x4E\x78\xD5\x8B\x72"
+			  "\x80\xE7\x29\xDC\x23\x55\x98\x54"
+			  "\xB1\xFF\x3E\x95\x56\xA8\x78\x78"
+			  "\xEF\xC4\xA5\x11\x2D\x2B\xD8\x93"
+			  "\x30\x6E\x7E\x51\xBB\x42\x5F\x03"
+			  "\x43\x94\x23\x7E\xEE\xF0\xA5\x79"
+			  "\x55\x01\xD4\x58\xB2\xF2\x85\x49"
+			  "\x70\xC5\xB9\x0B\x3B\x7A\x6E\x6C",
+		.ilen	= 1008,
 		.result	= "\x56\xED\x84\x1B\x8F\x26\xBD\x31"
 			  "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3"
 			  "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15"
@@ -21354,11 +22016,75 @@ static struct cipher_testvec camellia_cbc_dec_tv_template[] = {
 			  "\x86\x1D\xB4\x28\xBF\x56\xED\x61"
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
-			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7",
-		.rlen	= 496,
+			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D",
+		.rlen	= 1008,
 		.also_non_np = 1,
 		.np	= 2,
-		.tap	= { 496 - 16, 16 },
+		.tap	= { 1008 - 16, 16 },
 	},
 };
 
@@ -21567,8 +22293,72 @@ static struct cipher_testvec camellia_ctr_enc_tv_template[] = {
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
 			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
-			  "\x2B\xC2\x59",
-		.ilen	= 499,
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D"
+			  "\xE4\x7B\x12",
+		.ilen	= 1011,
 		.result	= "\xF3\x06\x3A\x84\xCD\xBA\x8E\x11"
 			  "\xB7\x74\x6F\x5C\x97\xFB\x36\xFE"
 			  "\xDE\x71\x58\xD4\x15\xD1\xC1\xA4"
@@ -21631,11 +22421,75 @@ static struct cipher_testvec camellia_ctr_enc_tv_template[] = {
 			  "\x7E\x42\xEC\xB6\x6F\x4D\x6B\x48"
 			  "\xE6\xA6\x50\x80\x78\x9E\xF1\xB0"
 			  "\x4D\xB2\x0D\x3D\xFC\x40\x25\x4D"
-			  "\x93\x11\x1C",
-		.rlen	= 499,
+			  "\x93\x11\x1C\xE9\xD2\x9F\x6E\x90"
+			  "\xE5\x41\x4A\xE2\x3C\x45\x29\x35"
+			  "\xEC\xD6\x47\x50\xCB\x7B\xA2\x32"
+			  "\xF7\x8B\x62\xF1\xE3\x9A\xFE\xC7"
+			  "\x1D\x8C\x02\x72\x68\x09\xE9\xB6"
+			  "\x4A\x80\xE6\xB1\x56\xDF\x90\xD4"
+			  "\x93\x74\xA4\xCE\x20\x23\xBF\x48"
+			  "\xA5\xDE\x1B\xFA\x40\x69\x31\x98"
+			  "\x62\x6E\xA5\xC7\xBF\x0C\x62\xE5"
+			  "\x6D\xE1\x93\xF1\x83\x10\x1C\xCA"
+			  "\xF6\x5C\x19\xF8\x90\x78\xCB\xE4"
+			  "\x0B\x3A\xB5\xF8\x43\x86\xD3\x3F"
+			  "\xBA\x83\x34\x3C\x42\xCC\x7D\x28"
+			  "\x29\x63\x4F\xD8\x02\x17\xC5\x07"
+			  "\x2C\xA4\xAC\x79\xCB\xC3\xA9\x09"
+			  "\x81\x45\x18\xED\xE4\xCB\x42\x3B"
+			  "\x87\x2D\x23\xDC\xC5\xBA\x45\xBD"
+			  "\x92\xE5\x02\x97\x96\xCE\xAD\xEC"
+			  "\xBA\xD8\x76\xF8\xCA\xC1\x31\xEC"
+			  "\x1E\x4F\x3F\x83\xF8\x33\xE8\x6E"
+			  "\xCC\xF8\x5F\xDD\x65\x50\x99\x69"
+			  "\xAF\x48\xCE\xA5\xBA\xB6\x14\x9F"
+			  "\x05\x93\xB2\xE6\x59\xC8\x28\xFE"
+			  "\x8F\x37\xF9\x64\xB9\xA5\x56\x8F"
+			  "\xF1\x1B\x90\xEF\xAE\xEB\xFC\x09"
+			  "\x11\x7A\xF2\x19\x0A\x0A\x9A\x3C"
+			  "\xE2\x5E\x29\xFA\x31\x9B\xC1\x74"
+			  "\x1E\x10\x3E\x07\xA9\x31\x6D\xF8"
+			  "\x81\xF5\xD5\x8A\x04\x23\x51\xAC"
+			  "\xA2\xE2\x63\xFD\x27\x1F\x79\x5B"
+			  "\x1F\xE8\xDA\x11\x49\x4D\x1C\xBA"
+			  "\x54\xCC\x0F\xBA\x92\x69\xE5\xCB"
+			  "\x41\x1A\x67\xA6\x40\x82\x70\x8C"
+			  "\x19\x79\x08\xA4\x51\x20\x7D\xC9"
+			  "\x12\x27\xAE\x20\x0D\x2C\xA1\x6D"
+			  "\xF4\x55\xD4\xE7\xE6\xD4\x28\x08"
+			  "\x00\x70\x12\x56\x56\x50\xAD\x14"
+			  "\x5C\x3E\xA2\xD1\x36\x3F\x36\x48"
+			  "\xED\xB1\x57\x3E\x5D\x15\xF6\x1E"
+			  "\x53\xE9\xA4\x3E\xED\x7D\xCF\x7D"
+			  "\x29\xAF\xF3\x1E\x51\xA8\x9F\x85"
+			  "\x8B\xF0\xBB\xCE\xCC\x39\xC3\x64"
+			  "\x4B\xF2\xAD\x70\x19\xD4\x44\x8F"
+			  "\x91\x76\xE8\x15\x66\x34\x9F\xF6"
+			  "\x0F\x15\xA4\xA8\x24\xF8\x58\xB1"
+			  "\x38\x46\x47\xC7\x9B\xCA\xE9\x42"
+			  "\x44\xAA\xE6\xB5\x9C\x91\xA4\xD3"
+			  "\x16\xA0\xED\x42\xBE\xB5\x06\x19"
+			  "\xBE\x67\xE8\xBC\x22\x32\xA4\x1E"
+			  "\x93\xEB\xBE\xE9\xE1\x93\xE5\x31"
+			  "\x3A\xA2\x75\xDF\xE3\x6B\xE7\xCC"
+			  "\xB4\x70\x20\xE0\x6D\x82\x7C\xC8"
+			  "\x94\x5C\x5E\x37\x18\xAD\xED\x8B"
+			  "\x44\x86\xCA\x5E\x07\xB7\x70\x8D"
+			  "\x40\x48\x19\x73\x7C\x78\x64\x0B"
+			  "\xDB\x01\xCA\xAE\x63\x19\xE9\xD1"
+			  "\x6B\x2C\x84\x10\x45\x42\x2E\xC3"
+			  "\xDF\x7F\xAA\xE8\x87\x1B\x63\x46"
+			  "\x74\x28\x9D\x05\x30\x20\x62\x41"
+			  "\xC0\x9F\x2C\x36\x2B\x78\xD7\x26"
+			  "\xDF\x58\x51\xED\xFA\xDC\x87\x79"
+			  "\xBF\x8C\xBF\xC4\x0F\xE5\x05\xDA"
+			  "\x45\xE3\x35\x0D\x69\x91\x54\x1C"
+			  "\xE7\x2C\x49\x08\x8B\x72\xFA\x5C"
+			  "\xF1\x6B\xD9",
+		.rlen	= 1011,
 		.also_non_np = 1,
 		.np	= 2,
-		.tap	= { 499 - 16, 16 },
+		.tap	= { 1011 - 16, 16 },
 	}, { /* Generated with Crypto++ */
 		.key	= "\x85\x62\x3F\x1C\xF9\xD6\x1C\xF9"
 			  "\xD6\xB3\x90\x6D\x4A\x90\x6D\x4A"
@@ -21705,8 +22559,72 @@ static struct cipher_testvec camellia_ctr_enc_tv_template[] = {
 			  "\x86\x1D\xB4\x28\xBF\x56\xED\x61"
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
-			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7",
-		.ilen	= 496,
+			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D",
+		.ilen	= 1008,
 		.result	= "\x85\x79\x6C\x8B\x2B\x6D\x14\xF9"
 			  "\xA6\x83\xB6\x80\x5B\x3A\xF3\x7E"
 			  "\x30\x29\xEB\x1F\xDC\x19\x5F\xEB"
@@ -21768,8 +22686,72 @@ static struct cipher_testvec camellia_ctr_enc_tv_template[] = {
 			  "\xB4\x3A\x5F\x19\xCF\x42\x1B\x22"
 			  "\x0B\x2D\x7B\xF1\xC5\x43\xF7\x5E"
 			  "\x12\xA8\x01\x64\x16\x0B\x26\x5A"
-			  "\x0C\x95\x0F\x40\xC5\x5A\x06\x7C",
-		.rlen	= 496,
+			  "\x0C\x95\x0F\x40\xC5\x5A\x06\x7C"
+			  "\xCF\xF5\xD5\xB7\x7A\x34\x23\xB6"
+			  "\xAA\x9E\xA8\x98\xA2\xF8\x3D\xD3"
+			  "\x3F\x23\x69\x63\x56\x96\x45\xD6"
+			  "\x74\x23\x1D\x5C\x63\xCC\xD8\x78"
+			  "\x16\xE2\x9C\xD2\x80\x02\xF2\x28"
+			  "\x69\x2F\xC4\xA8\x15\x15\x24\x3B"
+			  "\xCB\xF0\x14\xE4\x62\xC8\xF3\xD1"
+			  "\x03\x58\x1B\x33\x77\x74\x1F\xB4"
+			  "\x07\x86\xF2\x21\xB7\x41\xAE\xBF"
+			  "\x25\xC2\xFF\x51\xEF\xEA\xCE\xC4"
+			  "\x5F\xD9\xB8\x18\x6A\xF0\x0F\x0D"
+			  "\xF8\x04\xBB\x6D\x62\x33\x87\x26"
+			  "\x4F\x2F\x14\x6E\xDC\xDB\x66\x09"
+			  "\x2A\xEF\x7D\x84\x10\xAC\x82\x5E"
+			  "\xD2\xE4\xAD\x74\x7A\x6D\xCC\x3A"
+			  "\x7B\x62\xD8\xD6\x07\x2D\xF7\xDF"
+			  "\x9B\xB3\x82\xCF\x9C\x1D\x76\x5C"
+			  "\xAC\x7B\xD4\x9B\x45\xA1\x64\x11"
+			  "\x66\xF1\xA7\x0B\xF9\xDD\x00\xDD"
+			  "\xA4\x45\x3D\x3E\x03\xC9\x2E\xCB"
+			  "\xC3\x14\x84\x72\xFD\x41\xDC\xBD"
+			  "\x75\xBE\xA8\xE5\x16\x48\x64\x39"
+			  "\xCA\xF3\xE6\xDC\x25\x24\xF1\x6D"
+			  "\xB2\x8D\xC5\x38\x54\xD3\x5D\x6D"
+			  "\x0B\x29\x10\x15\x0E\x13\x3B\xAC"
+			  "\x7E\xCC\x9E\x3E\x18\x48\xA6\x02"
+			  "\xEF\x03\xB2\x2E\xE3\xD2\x70\x21"
+			  "\xB4\x19\x26\xBE\x3A\x3D\x05\xE0"
+			  "\xF8\x09\xAF\xE4\x31\x26\x92\x2F"
+			  "\x8F\x55\xAC\xED\x0B\xB2\xA5\x34"
+			  "\xBE\x50\xB1\x02\x22\x96\xE3\x40"
+			  "\x7B\x70\x50\x6E\x3B\xD5\xE5\xA0"
+			  "\x8E\xA2\xAD\x14\x60\x5C\x7A\x2B"
+			  "\x3D\x1B\x7F\xC1\xC0\x2C\x56\x36"
+			  "\xD2\x0A\x32\x06\x97\x34\xB9\xF4"
+			  "\x6F\x9F\x7E\x80\xD0\x9D\xF7\x6A"
+			  "\x21\xC1\xA2\x6A\xB1\x96\x5B\x4D"
+			  "\x7A\x15\x6C\xC4\x4E\xB8\xE0\x9E"
+			  "\x6C\x50\xF3\x9C\xC9\xB5\x23\xB7"
+			  "\xF1\xD4\x29\x4A\x23\xC4\xAD\x1E"
+			  "\x2C\x07\xD2\x43\x5F\x57\x93\xCA"
+			  "\x85\xF9\x9F\xAD\x4C\xF1\xE4\xB1"
+			  "\x1A\x8E\x28\xA4\xB6\x52\x77\x7E"
+			  "\x68\xC6\x47\xB9\x76\xCC\x65\x5F"
+			  "\x0B\xF9\x67\x93\xD8\x0E\x9A\x37"
+			  "\x5F\x41\xED\x64\x6C\xAD\x5F\xED"
+			  "\x3F\x8D\xFB\x8E\x1E\xA0\xE4\x1F"
+			  "\xC2\xC7\xED\x18\x43\xE1\x20\x86"
+			  "\x5D\xBC\x30\x70\x22\xA1\xDC\x53"
+			  "\x10\x3A\x8D\x47\x82\xCD\x7F\x59"
+			  "\x03\x2D\x6D\xF5\xE7\x79\xD4\x07"
+			  "\x68\x2A\xA5\x42\x19\x4D\xAF\xF5"
+			  "\xED\x47\x83\xBC\x5F\x62\x84\xDA"
+			  "\xDA\x41\xFF\xB0\x1D\x64\xA3\xC8"
+			  "\xBD\x4E\xE0\xB8\x7F\xEE\x55\x0A"
+			  "\x4E\x61\xB2\x51\xF6\x9C\x95\xF6"
+			  "\x92\xBB\xF6\xC5\xF0\x09\x86\xDE"
+			  "\x37\x9E\x29\xF9\x2A\x18\x73\x0D"
+			  "\xDC\x7E\x6B\x7B\x1B\x43\x8C\xEA"
+			  "\x13\xC8\x1A\x47\x0A\x2D\x6D\x56"
+			  "\xCD\xD2\xE7\x53\x1A\xAB\x1C\x3C"
+			  "\xC5\x9B\x03\x70\x29\x2A\x49\x09"
+			  "\x67\xA1\xEA\xD6\x3A\x5B\xBF\x71"
+			  "\x1D\x48\x64\x6C\xFB\xC0\x9E\x36",
+		.rlen	= 1008,
 	},
 };

@@ -21978,8 +22960,72 @@ static struct cipher_testvec camellia_ctr_dec_tv_template[] = {
 			  "\x7E\x42\xEC\xB6\x6F\x4D\x6B\x48"
 			  "\xE6\xA6\x50\x80\x78\x9E\xF1\xB0"
 			  "\x4D\xB2\x0D\x3D\xFC\x40\x25\x4D"
-			  "\x93\x11\x1C",
-		.ilen	= 499,
+			  "\x93\x11\x1C\xE9\xD2\x9F\x6E\x90"
+			  "\xE5\x41\x4A\xE2\x3C\x45\x29\x35"
+			  "\xEC\xD6\x47\x50\xCB\x7B\xA2\x32"
+			  "\xF7\x8B\x62\xF1\xE3\x9A\xFE\xC7"
+			  "\x1D\x8C\x02\x72\x68\x09\xE9\xB6"
+			  "\x4A\x80\xE6\xB1\x56\xDF\x90\xD4"
+			  "\x93\x74\xA4\xCE\x20\x23\xBF\x48"
+			  "\xA5\xDE\x1B\xFA\x40\x69\x31\x98"
+			  "\x62\x6E\xA5\xC7\xBF\x0C\x62\xE5"
+			  "\x6D\xE1\x93\xF1\x83\x10\x1C\xCA"
+			  "\xF6\x5C\x19\xF8\x90\x78\xCB\xE4"
+			  "\x0B\x3A\xB5\xF8\x43\x86\xD3\x3F"
+			  "\xBA\x83\x34\x3C\x42\xCC\x7D\x28"
+			  "\x29\x63\x4F\xD8\x02\x17\xC5\x07"
+			  "\x2C\xA4\xAC\x79\xCB\xC3\xA9\x09"
+			  "\x81\x45\x18\xED\xE4\xCB\x42\x3B"
+			  "\x87\x2D\x23\xDC\xC5\xBA\x45\xBD"
+			  "\x92\xE5\x02\x97\x96\xCE\xAD\xEC"
+			  "\xBA\xD8\x76\xF8\xCA\xC1\x31\xEC"
+			  "\x1E\x4F\x3F\x83\xF8\x33\xE8\x6E"
+			  "\xCC\xF8\x5F\xDD\x65\x50\x99\x69"
+			  "\xAF\x48\xCE\xA5\xBA\xB6\x14\x9F"
+			  "\x05\x93\xB2\xE6\x59\xC8\x28\xFE"
+			  "\x8F\x37\xF9\x64\xB9\xA5\x56\x8F"
+			  "\xF1\x1B\x90\xEF\xAE\xEB\xFC\x09"
+			  "\x11\x7A\xF2\x19\x0A\x0A\x9A\x3C"
+			  "\xE2\x5E\x29\xFA\x31\x9B\xC1\x74"
+			  "\x1E\x10\x3E\x07\xA9\x31\x6D\xF8"
+			  "\x81\xF5\xD5\x8A\x04\x23\x51\xAC"
+			  "\xA2\xE2\x63\xFD\x27\x1F\x79\x5B"
+			  "\x1F\xE8\xDA\x11\x49\x4D\x1C\xBA"
+			  "\x54\xCC\x0F\xBA\x92\x69\xE5\xCB"
+			  "\x41\x1A\x67\xA6\x40\x82\x70\x8C"
+			  "\x19\x79\x08\xA4\x51\x20\x7D\xC9"
+			  "\x12\x27\xAE\x20\x0D\x2C\xA1\x6D"
+			  "\xF4\x55\xD4\xE7\xE6\xD4\x28\x08"
+			  "\x00\x70\x12\x56\x56\x50\xAD\x14"
+			  "\x5C\x3E\xA2\xD1\x36\x3F\x36\x48"
+			  "\xED\xB1\x57\x3E\x5D\x15\xF6\x1E"
+			  "\x53\xE9\xA4\x3E\xED\x7D\xCF\x7D"
+			  "\x29\xAF\xF3\x1E\x51\xA8\x9F\x85"
+			  "\x8B\xF0\xBB\xCE\xCC\x39\xC3\x64"
+			  "\x4B\xF2\xAD\x70\x19\xD4\x44\x8F"
+			  "\x91\x76\xE8\x15\x66\x34\x9F\xF6"
+			  "\x0F\x15\xA4\xA8\x24\xF8\x58\xB1"
+			  "\x38\x46\x47\xC7\x9B\xCA\xE9\x42"
+			  "\x44\xAA\xE6\xB5\x9C\x91\xA4\xD3"
+			  "\x16\xA0\xED\x42\xBE\xB5\x06\x19"
+			  "\xBE\x67\xE8\xBC\x22\x32\xA4\x1E"
+			  "\x93\xEB\xBE\xE9\xE1\x93\xE5\x31"
+			  "\x3A\xA2\x75\xDF\xE3\x6B\xE7\xCC"
+			  "\xB4\x70\x20\xE0\x6D\x82\x7C\xC8"
+			  "\x94\x5C\x5E\x37\x18\xAD\xED\x8B"
+			  "\x44\x86\xCA\x5E\x07\xB7\x70\x8D"
+			  "\x40\x48\x19\x73\x7C\x78\x64\x0B"
+			  "\xDB\x01\xCA\xAE\x63\x19\xE9\xD1"
+			  "\x6B\x2C\x84\x10\x45\x42\x2E\xC3"
+			  "\xDF\x7F\xAA\xE8\x87\x1B\x63\x46"
+			  "\x74\x28\x9D\x05\x30\x20\x62\x41"
+			  "\xC0\x9F\x2C\x36\x2B\x78\xD7\x26"
+			  "\xDF\x58\x51\xED\xFA\xDC\x87\x79"
+			  "\xBF\x8C\xBF\xC4\x0F\xE5\x05\xDA"
+			  "\x45\xE3\x35\x0D\x69\x91\x54\x1C"
+			  "\xE7\x2C\x49\x08\x8B\x72\xFA\x5C"
+			  "\xF1\x6B\xD9",
+		.ilen	= 1011,
 		.result	= "\x56\xED\x84\x1B\x8F\x26\xBD\x31"
 			  "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3"
 			  "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15"
@@ -22042,11 +23088,75 @@ static struct cipher_testvec camellia_ctr_dec_tv_template[] = {
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
 			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
-			  "\x2B\xC2\x59",
-		.rlen	= 499,
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D"
+			  "\xE4\x7B\x12",
+		.rlen	= 1011,
 		.also_non_np = 1,
 		.np	= 2,
-		.tap	= { 499 - 16, 16 },
+		.tap	= { 1011 - 16, 16 },
 	}, { /* Generated with Crypto++ */
 		.key	= "\x85\x62\x3F\x1C\xF9\xD6\x1C\xF9"
 			  "\xD6\xB3\x90\x6D\x4A\x90\x6D\x4A"
@@ -22116,8 +23226,72 @@ static struct cipher_testvec camellia_ctr_dec_tv_template[] = {
 			  "\xB4\x3A\x5F\x19\xCF\x42\x1B\x22"
 			  "\x0B\x2D\x7B\xF1\xC5\x43\xF7\x5E"
 			  "\x12\xA8\x01\x64\x16\x0B\x26\x5A"
-			  "\x0C\x95\x0F\x40\xC5\x5A\x06\x7C",
-		.ilen	= 496,
+			  "\x0C\x95\x0F\x40\xC5\x5A\x06\x7C"
+			  "\xCF\xF5\xD5\xB7\x7A\x34\x23\xB6"
+			  "\xAA\x9E\xA8\x98\xA2\xF8\x3D\xD3"
+			  "\x3F\x23\x69\x63\x56\x96\x45\xD6"
+			  "\x74\x23\x1D\x5C\x63\xCC\xD8\x78"
+			  "\x16\xE2\x9C\xD2\x80\x02\xF2\x28"
+			  "\x69\x2F\xC4\xA8\x15\x15\x24\x3B"
+			  "\xCB\xF0\x14\xE4\x62\xC8\xF3\xD1"
+			  "\x03\x58\x1B\x33\x77\x74\x1F\xB4"
+			  "\x07\x86\xF2\x21\xB7\x41\xAE\xBF"
+			  "\x25\xC2\xFF\x51\xEF\xEA\xCE\xC4"
+			  "\x5F\xD9\xB8\x18\x6A\xF0\x0F\x0D"
+			  "\xF8\x04\xBB\x6D\x62\x33\x87\x26"
+			  "\x4F\x2F\x14\x6E\xDC\xDB\x66\x09"
+			  "\x2A\xEF\x7D\x84\x10\xAC\x82\x5E"
+			  "\xD2\xE4\xAD\x74\x7A\x6D\xCC\x3A"
+			  "\x7B\x62\xD8\xD6\x07\x2D\xF7\xDF"
+			  "\x9B\xB3\x82\xCF\x9C\x1D\x76\x5C"
+			  "\xAC\x7B\xD4\x9B\x45\xA1\x64\x11"
+			  "\x66\xF1\xA7\x0B\xF9\xDD\x00\xDD"
+			  "\xA4\x45\x3D\x3E\x03\xC9\x2E\xCB"
+			  "\xC3\x14\x84\x72\xFD\x41\xDC\xBD"
+			  "\x75\xBE\xA8\xE5\x16\x48\x64\x39"
+			  "\xCA\xF3\xE6\xDC\x25\x24\xF1\x6D"
+			  "\xB2\x8D\xC5\x38\x54\xD3\x5D\x6D"
+			  "\x0B\x29\x10\x15\x0E\x13\x3B\xAC"
+			  "\x7E\xCC\x9E\x3E\x18\x48\xA6\x02"
+			  "\xEF\x03\xB2\x2E\xE3\xD2\x70\x21"
+			  "\xB4\x19\x26\xBE\x3A\x3D\x05\xE0"
+			  "\xF8\x09\xAF\xE4\x31\x26\x92\x2F"
+			  "\x8F\x55\xAC\xED\x0B\xB2\xA5\x34"
+			  "\xBE\x50\xB1\x02\x22\x96\xE3\x40"
+			  "\x7B\x70\x50\x6E\x3B\xD5\xE5\xA0"
+			  "\x8E\xA2\xAD\x14\x60\x5C\x7A\x2B"
+			  "\x3D\x1B\x7F\xC1\xC0\x2C\x56\x36"
+			  "\xD2\x0A\x32\x06\x97\x34\xB9\xF4"
+			  "\x6F\x9F\x7E\x80\xD0\x9D\xF7\x6A"
+			  "\x21\xC1\xA2\x6A\xB1\x96\x5B\x4D"
+			  "\x7A\x15\x6C\xC4\x4E\xB8\xE0\x9E"
+			  "\x6C\x50\xF3\x9C\xC9\xB5\x23\xB7"
+			  "\xF1\xD4\x29\x4A\x23\xC4\xAD\x1E"
+			  "\x2C\x07\xD2\x43\x5F\x57\x93\xCA"
+			  "\x85\xF9\x9F\xAD\x4C\xF1\xE4\xB1"
+			  "\x1A\x8E\x28\xA4\xB6\x52\x77\x7E"
+			  "\x68\xC6\x47\xB9\x76\xCC\x65\x5F"
+			  "\x0B\xF9\x67\x93\xD8\x0E\x9A\x37"
+			  "\x5F\x41\xED\x64\x6C\xAD\x5F\xED"
+			  "\x3F\x8D\xFB\x8E\x1E\xA0\xE4\x1F"
+			  "\xC2\xC7\xED\x18\x43\xE1\x20\x86"
+			  "\x5D\xBC\x30\x70\x22\xA1\xDC\x53"
+			  "\x10\x3A\x8D\x47\x82\xCD\x7F\x59"
+			  "\x03\x2D\x6D\xF5\xE7\x79\xD4\x07"
+			  "\x68\x2A\xA5\x42\x19\x4D\xAF\xF5"
+			  "\xED\x47\x83\xBC\x5F\x62\x84\xDA"
+			  "\xDA\x41\xFF\xB0\x1D\x64\xA3\xC8"
+			  "\xBD\x4E\xE0\xB8\x7F\xEE\x55\x0A"
+			  "\x4E\x61\xB2\x51\xF6\x9C\x95\xF6"
+			  "\x92\xBB\xF6\xC5\xF0\x09\x86\xDE"
+			  "\x37\x9E\x29\xF9\x2A\x18\x73\x0D"
+			  "\xDC\x7E\x6B\x7B\x1B\x43\x8C\xEA"
+			  "\x13\xC8\x1A\x47\x0A\x2D\x6D\x56"
+			  "\xCD\xD2\xE7\x53\x1A\xAB\x1C\x3C"
+			  "\xC5\x9B\x03\x70\x29\x2A\x49\x09"
+			  "\x67\xA1\xEA\xD6\x3A\x5B\xBF\x71"
+			  "\x1D\x48\x64\x6C\xFB\xC0\x9E\x36",
+		.ilen	= 1008,
 		.result	= "\x56\xED\x84\x1B\x8F\x26\xBD\x31"
 			  "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3"
 			  "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15"
@@ -22179,8 +23353,72 @@ static struct cipher_testvec camellia_ctr_dec_tv_template[] = {
 			  "\x86\x1D\xB4\x28\xBF\x56\xED\x61"
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
-			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7",
-		.rlen	= 496,
+			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D",
+		.rlen	= 1008,
 	},
 };


+ 12 - 0
drivers/char/hw_random/Kconfig

@@ -86,6 +86,18 @@ config HW_RANDOM_BCM63XX

 	  If unsure, say Y.

+config HW_RANDOM_BCM2835
+	tristate "Broadcom BCM2835 Random Number Generator support"
+	depends on HW_RANDOM && ARCH_BCM2835
+	default HW_RANDOM
+	---help---
+	  This driver provides kernel-side support for the Random Number
+	  Generator hardware found on the Broadcom BCM2835 SoCs.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called bcm2835-rng.
+
+	  If unsure, say Y.

 config HW_RANDOM_GEODE
 	tristate "AMD Geode HW Random Number Generator support"

+ 1 - 0
drivers/char/hw_random/Makefile

@@ -26,3 +26,4 @@ obj-$(CONFIG_HW_RANDOM_PPC4XX) += ppc4xx-rng.o
 obj-$(CONFIG_HW_RANDOM_PSERIES) += pseries-rng.o
 obj-$(CONFIG_HW_RANDOM_EXYNOS)	+= exynos-rng.o
 obj-$(CONFIG_HW_RANDOM_TPM) += tpm-rng.o
+obj-$(CONFIG_HW_RANDOM_BCM2835) += bcm2835-rng.o

+ 113 - 0
drivers/char/hw_random/bcm2835-rng.c

@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2010-2012 Broadcom. All rights reserved.
+ * Copyright (c) 2013 Lubomir Rintel
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License ("GPL")
+ * version 2, as published by the Free Software Foundation.
+ */
+
+#include <linux/hw_random.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/printk.h>
+
+#define RNG_CTRL	0x0
+#define RNG_STATUS	0x4
+#define RNG_DATA	0x8
+
+/* enable rng */
+#define RNG_RBGEN	0x1
+
+/* the initial numbers generated are "less random" so will be discarded */
+#define RNG_WARMUP_COUNT 0x40000
+
+static int bcm2835_rng_read(struct hwrng *rng, void *buf, size_t max,
+			       bool wait)
+{
+	void __iomem *rng_base = (void __iomem *)rng->priv;
+
+	while ((__raw_readl(rng_base + RNG_STATUS) >> 24) == 0) {
+		if (!wait)
+			return 0;
+		cpu_relax();
+	}
+
+	*(u32 *)buf = __raw_readl(rng_base + RNG_DATA);
+	return sizeof(u32);
+}
+
+static struct hwrng bcm2835_rng_ops = {
+	.name	= "bcm2835",
+	.read	= bcm2835_rng_read,
+};
+
+static int bcm2835_rng_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct device_node *np = dev->of_node;
+	void __iomem *rng_base;
+	int err;
+
+	/* map peripheral */
+	rng_base = of_iomap(np, 0);
+	if (!rng_base) {
+		dev_err(dev, "failed to remap rng regs");
+		return -ENODEV;
+	}
+	bcm2835_rng_ops.priv = (unsigned long)rng_base;
+
+	/* register driver */
+	err = hwrng_register(&bcm2835_rng_ops);
+	if (err) {
+		dev_err(dev, "hwrng registration failed\n");
+		iounmap(rng_base);
+	} else {
+		dev_info(dev, "hwrng registered\n");
+
+		/* set warm-up count & enable */
+		__raw_writel(RNG_WARMUP_COUNT, rng_base + RNG_STATUS);
+		__raw_writel(RNG_RBGEN, rng_base + RNG_CTRL);
+	}
+	return err;
+}
+
+static int bcm2835_rng_remove(struct platform_device *pdev)
+{
+	void __iomem *rng_base = (void __iomem *)bcm2835_rng_ops.priv;
+
+	/* disable rng hardware */
+	__raw_writel(0, rng_base + RNG_CTRL);
+
+	/* unregister driver */
+	hwrng_unregister(&bcm2835_rng_ops);
+	iounmap(rng_base);
+
+	return 0;
+}
+
+static const struct of_device_id bcm2835_rng_of_match[] = {
+	{ .compatible = "brcm,bcm2835-rng", },
+	{},
+};
+MODULE_DEVICE_TABLE(of, bcm2835_rng_of_match);
+
+static struct platform_driver bcm2835_rng_driver = {
+	.driver = {
+		.name = "bcm2835-rng",
+		.owner = THIS_MODULE,
+		.of_match_table = bcm2835_rng_of_match,
+	},
+	.probe		= bcm2835_rng_probe,
+	.remove		= bcm2835_rng_remove,
+};
+module_platform_driver(bcm2835_rng_driver);
+
+MODULE_AUTHOR("Lubomir Rintel <lkundrak@v3.sk>");
+MODULE_DESCRIPTION("BCM2835 Random Number Generator (RNG) driver");
+MODULE_LICENSE("GPL v2");
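The driver above fills at most one 32-bit word per .read call; the hwrng core is what loops to satisfy larger reads, using the callback signature int (*read)(struct hwrng *, void *, size_t, bool). The following is an illustrative sketch of that consumer loop, not the actual drivers/char/hw_random/core.c code:

/* Illustrative model of how the hwrng core consumes a driver's .read
 * callback such as bcm2835_rng_read(); not the real core.c loop. */
static ssize_t fill_from_hwrng(struct hwrng *rng, void *buf, size_t len)
{
	size_t filled = 0;

	while (filled < len) {
		/* ask for up to len - filled bytes, blocking until ready */
		int n = rng->read(rng, buf + filled, len - filled, true);

		if (n <= 0)
			return filled ? filled : n;	/* no data or error */
		filled += n;
	}
	return filled;
}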

+ 2 - 1
drivers/char/hw_random/exynos-rng.c

@@ -144,6 +144,7 @@ static int exynos_rng_remove(struct platform_device *pdev)
 	return 0;
 }

+#if defined(CONFIG_PM_SLEEP) || defined(CONFIG_PM_RUNTIME)
 static int exynos_rng_runtime_suspend(struct device *dev)
 {
 	struct platform_device *pdev = to_platform_device(dev);
@@ -161,7 +162,7 @@ static int exynos_rng_runtime_resume(struct device *dev)

 	return clk_prepare_enable(exynos_rng->clk);
 }
-
+#endif

 static UNIVERSAL_DEV_PM_OPS(exynos_rng_pm_ops, exynos_rng_runtime_suspend,
 					exynos_rng_runtime_resume, NULL);
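The new preprocessor guard exists because SET_SYSTEM_SLEEP_PM_OPS and SET_RUNTIME_PM_OPS (used inside UNIVERSAL_DEV_PM_OPS) compile away when the corresponding CONFIG options are off, leaving the suspend/resume helpers unreferenced and triggering "defined but not used" warnings. A generic sketch of the pattern, with hypothetical foo_* names:

#include <linux/pm.h>

#if defined(CONFIG_PM_SLEEP) || defined(CONFIG_PM_RUNTIME)
/* Only built when some PM framework can actually reference them,
 * so !PM configurations do not warn about unused functions. */
static int foo_runtime_suspend(struct device *dev)
{
	return 0;	/* gate clocks, cut power, etc. */
}

static int foo_runtime_resume(struct device *dev)
{
	return 0;	/* restore clocks */
}
#endif

static UNIVERSAL_DEV_PM_OPS(foo_pm_ops, foo_runtime_suspend,
			    foo_runtime_resume, NULL);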

+ 4 - 17
drivers/char/hw_random/mxc-rnga.c

@@ -142,7 +142,7 @@ static void mxc_rnga_cleanup(struct hwrng *rng)
 static int __init mxc_rnga_probe(struct platform_device *pdev)
 {
 	int err = -ENODEV;
-	struct resource *res, *mem;
+	struct resource *res;
 	struct mxc_rng *mxc_rng;

 	mxc_rng = devm_kzalloc(&pdev->dev, sizeof(struct mxc_rng),
@@ -172,15 +172,9 @@ static int __init mxc_rnga_probe(struct platform_device *pdev)
 		goto err_region;
 	}

-	mem = request_mem_region(res->start, resource_size(res), pdev->name);
-	if (mem == NULL) {
-		err = -EBUSY;
-		goto err_region;
-	}
-
-	mxc_rng->mem = ioremap(res->start, resource_size(res));
-	if (!mxc_rng->mem) {
-		err = -ENOMEM;
+	mxc_rng->mem = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(mxc_rng->mem)) {
+		err = PTR_ERR(mxc_rng->mem);
 		goto err_ioremap;
 	}

@@ -195,8 +189,6 @@ static int __init mxc_rnga_probe(struct platform_device *pdev)
 	return 0;

 err_ioremap:
-	release_mem_region(res->start, resource_size(res));
-
 err_region:
 	clk_disable_unprepare(mxc_rng->clk);

@@ -206,15 +198,10 @@ out:

 static int __exit mxc_rnga_remove(struct platform_device *pdev)
 {
-	struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	struct mxc_rng *mxc_rng = platform_get_drvdata(pdev);

 	hwrng_unregister(&mxc_rng->rng);

-	iounmap(mxc_rng->mem);
-
-	release_mem_region(res->start, resource_size(res));
-
 	clk_disable_unprepare(mxc_rng->clk);

 	return 0;
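devm_ioremap_resource() both claims the region and maps it, and ties the teardown to device unbind, which is why probe loses its request_mem_region()/ioremap() pair and remove loses iounmap()/release_mem_region(). A minimal probe sketch using the same pattern (hypothetical foo_probe):

#include <linux/err.h>
#include <linux/io.h>
#include <linux/platform_device.h>

static int foo_probe(struct platform_device *pdev)
{
	struct resource *res;
	void __iomem *base;

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	/* claims and maps the region; both are undone automatically when
	 * the device is unbound, so remove() needs no unmap/release */
	base = devm_ioremap_resource(&pdev->dev, res);
	if (IS_ERR(base))
		return PTR_ERR(base);

	/* program registers through 'base' from here on */
	return 0;
}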

+ 136 - 54
drivers/char/hw_random/timeriomem-rng.c

@@ -23,127 +23,209 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/platform_device.h>
+#include <linux/of.h>
 #include <linux/hw_random.h>
 #include <linux/io.h>
+#include <linux/slab.h>
 #include <linux/timeriomem-rng.h>
 #include <linux/jiffies.h>
 #include <linux/sched.h>
 #include <linux/timer.h>
 #include <linux/completion.h>

-static struct timeriomem_rng_data *timeriomem_rng_data;
+struct timeriomem_rng_private_data {
+	void __iomem		*io_base;
+	unsigned int		expires;
+	unsigned int		period;
+	unsigned int		present:1;

-static void timeriomem_rng_trigger(unsigned long);
-static DEFINE_TIMER(timeriomem_rng_timer, timeriomem_rng_trigger, 0, 0);
+	struct timer_list	timer;
+	struct completion	completion;
+
+	struct hwrng		timeriomem_rng_ops;
+};
+
+#define to_rng_priv(rng) \
+		((struct timeriomem_rng_private_data *)rng->priv)

 /*
  * return 1 if we have data, or 0 if we have nothing
  */
 static int timeriomem_rng_data_present(struct hwrng *rng, int wait)
 {
-	if (rng->priv == 0)
-		return 1;
+	struct timeriomem_rng_private_data *priv = to_rng_priv(rng);

-	if (!wait || timeriomem_rng_data->present)
-		return timeriomem_rng_data->present;
+	if (!wait || priv->present)
+		return priv->present;

-	wait_for_completion(&timeriomem_rng_data->completion);
+	wait_for_completion(&priv->completion);

 	return 1;
 }

 static int timeriomem_rng_data_read(struct hwrng *rng, u32 *data)
 {
+	struct timeriomem_rng_private_data *priv = to_rng_priv(rng);
 	unsigned long cur;
 	s32 delay;

-	*data = readl(timeriomem_rng_data->address);
+	*data = readl(priv->io_base);

-	if (rng->priv != 0) {
-		cur = jiffies;
+	cur = jiffies;

-		delay = cur - timeriomem_rng_timer.expires;
-		delay = rng->priv - (delay % rng->priv);

+	delay = cur - priv->expires;
+	delay = priv->period - (delay % priv->period);

-		timeriomem_rng_timer.expires = cur + delay;
-		timeriomem_rng_data->present = 0;
+	priv->expires = cur + delay;
+	priv->present = 0;

-		init_completion(&timeriomem_rng_data->completion);
-		add_timer(&timeriomem_rng_timer);
-	}
+	INIT_COMPLETION(priv->completion);
+	mod_timer(&priv->timer, priv->expires);

 	return 4;
 }

+static void timeriomem_rng_trigger(unsigned long data)
 {
 {
-	complete(&timeriomem_rng_data->completion);
-}
+	struct timeriomem_rng_private_data *priv
+			= (struct timeriomem_rng_private_data *)data;
 
 
-static struct hwrng timeriomem_rng_ops = {
-	.name		= "timeriomem",
-	.data_present	= timeriomem_rng_data_present,
-	.data_read	= timeriomem_rng_data_read,
-	.priv		= 0,
-};
+	priv->present = 1;
+	complete(&priv->completion);
+}
 
 
 static int timeriomem_rng_probe(struct platform_device *pdev)
 static int timeriomem_rng_probe(struct platform_device *pdev)
 {
 {
+	struct timeriomem_rng_data *pdata = pdev->dev.platform_data;
+	struct timeriomem_rng_private_data *priv;
 	struct resource *res;
 	struct resource *res;
+	int err = 0;
+	int period;
 
 
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!pdev->dev.of_node && !pdata) {
+		dev_err(&pdev->dev, "timeriomem_rng_data is missing\n");
+		return -EINVAL;
+	}
 
 
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (!res)
 	if (!res)
+		return -ENXIO;
 
 
-	timeriomem_rng_data = pdev->dev.platform_data;
+	if (res->start % 4 != 0 || resource_size(res) != 4) {
+		dev_err(&pdev->dev,
+			"address must be four bytes wide and aligned\n");
+		return -EINVAL;
+	}
 
 
-	timeriomem_rng_data->address = ioremap(res->start, resource_size(res));
-	if (!timeriomem_rng_data->address)
-		return -EIO;
+	/* Allocate memory for the device structure (and zero it) */
+	priv = kzalloc(sizeof(struct timeriomem_rng_private_data), GFP_KERNEL);
+	if (!priv) {
+		dev_err(&pdev->dev, "failed to allocate device structure.\n");
+		return -ENOMEM;
+	}
+
+	platform_set_drvdata(pdev, priv);
+
+	if (pdev->dev.of_node) {
+		int i;
+
+		if (!of_property_read_u32(pdev->dev.of_node,
+						"period", &i))
+			period = i;
+		else {
+			dev_err(&pdev->dev, "missing period\n");
+			err = -EINVAL;
+			goto out_free;
+		}
+	} else
+		period = pdata->period;
+
+	priv->period = usecs_to_jiffies(period);
+	if (priv->period < 1) {
+		dev_err(&pdev->dev, "period is less than one jiffy\n");
+		err = -EINVAL;
+		goto out_free;
+	}
 
 
-	if (timeriomem_rng_data->period != 0
-		&& usecs_to_jiffies(timeriomem_rng_data->period) > 0) {
-		timeriomem_rng_timer.expires = jiffies;
+	priv->expires	= jiffies;
+	priv->present	= 1;
 
 
-		timeriomem_rng_ops.priv = usecs_to_jiffies(
-						timeriomem_rng_data->period);
+	init_completion(&priv->completion);
+	complete(&priv->completion);
+
+	setup_timer(&priv->timer, timeriomem_rng_trigger, (unsigned long)priv);
+
+	priv->timeriomem_rng_ops.name		= dev_name(&pdev->dev);
+	priv->timeriomem_rng_ops.data_present	= timeriomem_rng_data_present;
+	priv->timeriomem_rng_ops.data_read	= timeriomem_rng_data_read;
+	priv->timeriomem_rng_ops.priv		= (unsigned long)priv;
+
+	if (!request_mem_region(res->start, resource_size(res),
+				dev_name(&pdev->dev))) {
+		dev_err(&pdev->dev, "request_mem_region failed\n");
+		err = -EBUSY;
+		goto out_timer;
 	}
 	}
 
 
-	ret = hwrng_register(&timeriomem_rng_ops);
-	if (ret)
-		goto failed;
+	priv->io_base = ioremap(res->start, resource_size(res));
+	if (priv->io_base == NULL) {
+		dev_err(&pdev->dev, "ioremap failed\n");
+		err = -EIO;
+		goto out_release_io;
+	}
+
+	err = hwrng_register(&priv->timeriomem_rng_ops);
+	if (err) {
+		dev_err(&pdev->dev, "problem registering\n");
+		goto out;
+	}
 
 
 	dev_info(&pdev->dev, "32bits from 0x%p @ %dus\n",
 	dev_info(&pdev->dev, "32bits from 0x%p @ %dus\n",
-			timeriomem_rng_data->period);
+			priv->io_base, period);
 
 
 	return 0;
 	return 0;
 
 
-failed:
-	dev_err(&pdev->dev, "problem registering\n");
-	iounmap(timeriomem_rng_data->address);
-
-	return ret;
+out:
+	iounmap(priv->io_base);
+out_release_io:
+	release_mem_region(res->start, resource_size(res));
+out_timer:
+	del_timer_sync(&priv->timer);
+out_free:
+	platform_set_drvdata(pdev, NULL);
+	kfree(priv);
+	return err;
 }
 }

 static int timeriomem_rng_remove(struct platform_device *pdev)
 {
-	hwrng_unregister(&timeriomem_rng_ops);
+	struct timeriomem_rng_private_data *priv = platform_get_drvdata(pdev);
+	struct resource *res;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 
 
-	iounmap(timeriomem_rng_data->address);
+	hwrng_unregister(&priv->timeriomem_rng_ops);
+
+	del_timer_sync(&priv->timer);
+	iounmap(priv->io_base);
+	release_mem_region(res->start, resource_size(res));
+	platform_set_drvdata(pdev, NULL);
+	kfree(priv);
 
 
 	return 0;
 	return 0;
 }
 }
 
 
+static const struct of_device_id timeriomem_rng_match[] = {
+	{ .compatible = "timeriomem_rng" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, timeriomem_rng_match);
+
 static struct platform_driver timeriomem_rng_driver = {
 static struct platform_driver timeriomem_rng_driver = {
 	.driver = {
 		.name		= "timeriomem_rng",
 		.owner		= THIS_MODULE,
 	},
 	},
 	.probe		= timeriomem_rng_probe,
 	.remove		= timeriomem_rng_remove,
+ 15 - 3
drivers/crypto/Kconfig

@@ -276,6 +276,16 @@ config CRYPTO_DEV_PICOXCELL
 
 
 	  Saying m here will build a module named pipcoxcell_crypto.
 	  Saying m here will build a module named pipcoxcell_crypto.
 
 
+config CRYPTO_DEV_SAHARA
+	tristate "Support for SAHARA crypto accelerator"
+	depends on ARCH_MXC && EXPERIMENTAL && OF
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_AES
+	select CRYPTO_ECB
+	help
+	  This option enables support for the SAHARA HW crypto accelerator
+	  found in some Freescale i.MX chips.
+
 config CRYPTO_DEV_S5P
 config CRYPTO_DEV_S5P
 	tristate "Support for Samsung S5PV210 crypto accelerator"
 	depends on ARCH_S5PV210
 	  will be called atmel-tdes.
 	  will be called atmel-tdes.

 config CRYPTO_DEV_ATMEL_SHA
+	tristate "Support for Atmel SHA hw accelerator"
 	depends on ARCH_AT91
 	depends on ARCH_AT91
 	select CRYPTO_SHA1
 	select CRYPTO_SHA256
 	select CRYPTO_ALGAPI
 	select CRYPTO_ALGAPI
 	help
+	  Some Atmel processors have SHA1/SHA224/SHA256/SHA384/SHA512
+	  hw accelerator.
 	  Select this if you want to use the Atmel module for
 	  Select this if you want to use the Atmel module for
+	  SHA1/SHA224/SHA256/SHA384/SHA512 algorithms.
 
 
 	  To compile this driver as a module, choose M here: the module
 	  To compile this driver as a module, choose M here: the module
 	  will be called atmel-sha.
 	  will be called atmel-sha.

+ 1 - 0
drivers/crypto/Makefile

@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_DEV_PPC4XX) += amcc/
 obj-$(CONFIG_CRYPTO_DEV_OMAP_SHAM) += omap-sham.o
 obj-$(CONFIG_CRYPTO_DEV_OMAP_SHAM) += omap-sham.o
 obj-$(CONFIG_CRYPTO_DEV_OMAP_AES) += omap-aes.o
 obj-$(CONFIG_CRYPTO_DEV_PICOXCELL) += picoxcell_crypto.o
 obj-$(CONFIG_CRYPTO_DEV_S5P) += s5p-sss.o
 obj-$(CONFIG_CRYPTO_DEV_S5P) += s5p-sss.o
 obj-$(CONFIG_CRYPTO_DEV_TEGRA_AES) += tegra-aes.o
 obj-$(CONFIG_CRYPTO_DEV_UX500) += ux500/
+ 353 - 118
drivers/crypto/atmel-aes.c

@@ -38,7 +38,7 @@
 #include <crypto/aes.h>
 #include <crypto/aes.h>
 #include <crypto/hash.h>
 #include <crypto/internal/hash.h>
-#include <linux/platform_data/atmel-aes.h>
+#include <linux/platform_data/crypto-atmel.h>
 #include "atmel-aes-regs.h"

 #define CFB8_BLOCK_SIZE		1
 #define CFB64_BLOCK_SIZE	8
 #define CFB64_BLOCK_SIZE	8

 /* AES flags */
-#define AES_FLAGS_MODE_MASK	0x01ff
+#define AES_FLAGS_MODE_MASK	0x03ff
 #define AES_FLAGS_ENCRYPT	BIT(0)
 #define AES_FLAGS_CBC		BIT(1)
 #define AES_FLAGS_CFB		BIT(2)
 #define AES_FLAGS_CFB16		BIT(4)
 #define AES_FLAGS_CFB16		BIT(4)
 #define AES_FLAGS_CFB32		BIT(5)
 #define AES_FLAGS_CFB64		BIT(6)
-#define AES_FLAGS_OFB		BIT(7)
-#define AES_FLAGS_CTR		BIT(8)
+#define AES_FLAGS_CFB128	BIT(7)
+#define AES_FLAGS_OFB		BIT(8)
+#define AES_FLAGS_CTR		BIT(9)

 #define AES_FLAGS_INIT		BIT(16)
 #define AES_FLAGS_DMA		BIT(17)
 #define AES_FLAGS_BUSY		BIT(18)
+#define AES_FLAGS_FAST		BIT(19)

-#define AES_FLAGS_DUALBUFF	BIT(24)
-
-#define ATMEL_AES_QUEUE_LENGTH	1
-#define ATMEL_AES_CACHE_SIZE	0
+#define ATMEL_AES_QUEUE_LENGTH	50

 #define ATMEL_AES_DMA_THRESHOLD		16


+struct atmel_aes_caps {
+	bool	has_dualbuff;
+	bool	has_cfb64;
+	u32		max_burst_size;
+};
+
 struct atmel_aes_dev;

 struct atmel_aes_ctx {
 
 
 	int		keylen;
 	int		keylen;
 	u32		key[AES_KEYSIZE_256 / sizeof(u32)];
 	u32		key[AES_KEYSIZE_256 / sizeof(u32)];
+
+	u16		block_size;
 };
 };
 
 
 struct atmel_aes_reqctx {
 struct atmel_aes_reqctx {
@@ -112,20 +119,27 @@ struct atmel_aes_dev {
 
 
 	struct scatterlist	*in_sg;
 	struct scatterlist	*in_sg;
 	unsigned int		nb_in_sg;
 	unsigned int		nb_in_sg;
-
+	size_t				in_offset;
 	struct scatterlist	*out_sg;
 	struct scatterlist	*out_sg;
 	unsigned int		nb_out_sg;
 	unsigned int		nb_out_sg;
+	size_t				out_offset;
 
 
 	size_t	bufcnt;
 	size_t	bufcnt;
+	size_t	buflen;
+	size_t	dma_size;
 
 
-	u8	buf_in[ATMEL_AES_DMA_THRESHOLD] __aligned(sizeof(u32));
-	int	dma_in;
+	void	*buf_in;
+	int		dma_in;
+	dma_addr_t	dma_addr_in;
 	struct atmel_aes_dma	dma_lch_in;
 	struct atmel_aes_dma	dma_lch_in;
 
 
-	u8	buf_out[ATMEL_AES_DMA_THRESHOLD] __aligned(sizeof(u32));
-	int	dma_out;
+	void	*buf_out;
+	int		dma_out;
+	dma_addr_t	dma_addr_out;
 	struct atmel_aes_dma	dma_lch_out;
 	struct atmel_aes_dma	dma_lch_out;
 
 
+	struct atmel_aes_caps	caps;
+
 	u32	hw_version;
 	u32	hw_version;
 };
 };
 
 
@@ -165,6 +179,37 @@ static int atmel_aes_sg_length(struct ablkcipher_request *req,
 	return sg_nb;
 	return sg_nb;
 }

+			void *buf, size_t buflen, size_t total, int out)
+{
+	unsigned int count, off = 0;
+
+	while (buflen && total) {
+		count = min((*sg)->length - *offset, total);
+		count = min(count, buflen);
+
+		if (!count)
+			return off;
+
+		scatterwalk_map_and_copy(buf + off, *sg, *offset, count, out);
+
+		off += count;
+		buflen -= count;
+		*offset += count;
+		total -= count;
+
+		if (*offset == (*sg)->length) {
+			*sg = sg_next(*sg);
+			if (*sg)
+				*offset = 0;
+			else
+				total = 0;
+		}
+	}
+
+	return off;
+}
+
 static inline u32 atmel_aes_read(struct atmel_aes_dev *dd, u32 offset)
 {
 	return readl_relaxed(dd->io_base + offset);
@@ -190,14 +235,6 @@ static void atmel_aes_write_n(struct atmel_aes_dev *dd, u32 offset,
 		atmel_aes_write(dd, offset, *value);
 }

-static void atmel_aes_dualbuff_test(struct atmel_aes_dev *dd)
-{
-	atmel_aes_write(dd, AES_MR, AES_MR_DUALBUFF);
-
-	if (atmel_aes_read(dd, AES_MR) & AES_MR_DUALBUFF)
-		dd->flags |= AES_FLAGS_DUALBUFF;
-}
-
 static struct atmel_aes_dev *atmel_aes_find_dev(struct atmel_aes_ctx *ctx)
 {
 	struct atmel_aes_dev *aes_dd = NULL;
@@ -225,7 +262,7 @@ static int atmel_aes_hw_init(struct atmel_aes_dev *dd)

 	if (!(dd->flags & AES_FLAGS_INIT)) {
 		atmel_aes_write(dd, AES_CR, AES_CR_SWRST);
-		atmel_aes_dualbuff_test(dd);
+		atmel_aes_write(dd, AES_MR, 0xE << AES_MR_CKEY_OFFSET);
 		dd->flags |= AES_FLAGS_INIT;
 		dd->err = 0;
 	}
@@ -233,11 +270,19 @@ static int atmel_aes_hw_init(struct atmel_aes_dev *dd)
 	return 0;
 }

+static inline unsigned int atmel_aes_get_version(struct atmel_aes_dev *dd)
+{
+	return atmel_aes_read(dd, AES_HW_VERSION) & 0x00000fff;
+}
+
 static void atmel_aes_hw_version_init(struct atmel_aes_dev *dd)
 {
 	atmel_aes_hw_init(dd);

-	dd->hw_version = atmel_aes_read(dd, AES_HW_VERSION);
+	dd->hw_version = atmel_aes_get_version(dd);
+
+	dev_info(dd->dev,
+			"version: 0x%x\n", dd->hw_version);

 	clk_disable_unprepare(dd->iclk);
 }
@@ -260,50 +305,77 @@ static void atmel_aes_dma_callback(void *data)
 	tasklet_schedule(&dd->done_task);
 }

-static int atmel_aes_crypt_dma(struct atmel_aes_dev *dd)
+static int atmel_aes_crypt_dma(struct atmel_aes_dev *dd,
+		dma_addr_t dma_addr_in, dma_addr_t dma_addr_out, int length)
 {
+	struct scatterlist sg[2];
 	struct dma_async_tx_descriptor	*in_desc, *out_desc;
-	int nb_dma_sg_in, nb_dma_sg_out;

-	dd->nb_in_sg = atmel_aes_sg_length(dd->req, dd->in_sg);
-	if (!dd->nb_in_sg)
-		goto exit_err;
+	dd->dma_size = length;

-	nb_dma_sg_in = dma_map_sg(dd->dev, dd->in_sg, dd->nb_in_sg,
-			DMA_TO_DEVICE);
-	if (!nb_dma_sg_in)
-		goto exit_err;
+	if (!(dd->flags & AES_FLAGS_FAST)) {
+		dma_sync_single_for_device(dd->dev, dma_addr_in, length,
+					   DMA_TO_DEVICE);
+	}

-	in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, dd->in_sg,
-				nb_dma_sg_in, DMA_MEM_TO_DEV,
-				DMA_PREP_INTERRUPT  |  DMA_CTRL_ACK);
+	if (dd->flags & AES_FLAGS_CFB8) {
+		dd->dma_lch_in.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_1_BYTE;
+		dd->dma_lch_out.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_1_BYTE;
+	} else if (dd->flags & AES_FLAGS_CFB16) {
+		dd->dma_lch_in.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_2_BYTES;
+		dd->dma_lch_out.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_2_BYTES;
+	} else {
+		dd->dma_lch_in.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		dd->dma_lch_out.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+	}

-	if (!in_desc)
-		goto unmap_in;
+	if (dd->flags & (AES_FLAGS_CFB8 | AES_FLAGS_CFB16 |
+			AES_FLAGS_CFB32 | AES_FLAGS_CFB64)) {
+		dd->dma_lch_in.dma_conf.src_maxburst = 1;
+		dd->dma_lch_in.dma_conf.dst_maxburst = 1;
+		dd->dma_lch_out.dma_conf.src_maxburst = 1;
+		dd->dma_lch_out.dma_conf.dst_maxburst = 1;
+	} else {
+		dd->dma_lch_in.dma_conf.src_maxburst = dd->caps.max_burst_size;
+		dd->dma_lch_in.dma_conf.dst_maxburst = dd->caps.max_burst_size;
+		dd->dma_lch_out.dma_conf.src_maxburst = dd->caps.max_burst_size;
+		dd->dma_lch_out.dma_conf.dst_maxburst = dd->caps.max_burst_size;
+	}

-	/* callback not needed */
+	dmaengine_slave_config(dd->dma_lch_in.chan, &dd->dma_lch_in.dma_conf);
+	dmaengine_slave_config(dd->dma_lch_out.chan, &dd->dma_lch_out.dma_conf);

-	dd->nb_out_sg = atmel_aes_sg_length(dd->req, dd->out_sg);
-	if (!dd->nb_out_sg)
-		goto unmap_in;
+	dd->flags |= AES_FLAGS_DMA;

-	nb_dma_sg_out = dma_map_sg(dd->dev, dd->out_sg, dd->nb_out_sg,
-			DMA_FROM_DEVICE);
-	if (!nb_dma_sg_out)
-		goto unmap_out;
+	sg_init_table(&sg[0], 1);
+	sg_dma_address(&sg[0]) = dma_addr_in;
+	sg_dma_len(&sg[0]) = length;

-	out_desc = dmaengine_prep_slave_sg(dd->dma_lch_out.chan, dd->out_sg,
-				nb_dma_sg_out, DMA_DEV_TO_MEM,
-				DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+	sg_init_table(&sg[1], 1);
+	sg_dma_address(&sg[1]) = dma_addr_out;
+	sg_dma_len(&sg[1]) = length;
+
+	in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, &sg[0],
+				1, DMA_MEM_TO_DEV,
+				DMA_PREP_INTERRUPT  |  DMA_CTRL_ACK);
+	if (!in_desc)
+		return -EINVAL;

+	out_desc = dmaengine_prep_slave_sg(dd->dma_lch_out.chan, &sg[1],
+				1, DMA_DEV_TO_MEM,
+				DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
 	if (!out_desc)
-		goto unmap_out;
+		return -EINVAL;

 	out_desc->callback = atmel_aes_dma_callback;
 	out_desc->callback_param = dd;

-	dd->total -= dd->req->nbytes;
-
 	dmaengine_submit(out_desc);
 	dma_async_issue_pending(dd->dma_lch_out.chan);

@@ -311,15 +383,6 @@ static int atmel_aes_crypt_dma(struct atmel_aes_dev *dd)
 	dma_async_issue_pending(dd->dma_lch_in.chan);

 	return 0;
-
-unmap_out:
-	dma_unmap_sg(dd->dev, dd->out_sg, dd->nb_out_sg,
-		DMA_FROM_DEVICE);
-unmap_in:
-	dma_unmap_sg(dd->dev, dd->in_sg, dd->nb_in_sg,
-		DMA_TO_DEVICE);
-exit_err:
-	return -EINVAL;
 }

 static int atmel_aes_crypt_cpu_start(struct atmel_aes_dev *dd)
@@ -352,30 +415,66 @@ static int atmel_aes_crypt_cpu_start(struct atmel_aes_dev *dd)

 static int atmel_aes_crypt_dma_start(struct atmel_aes_dev *dd)
 {
-	int err;
+	int err, fast = 0, in, out;
+	size_t count;
+	dma_addr_t addr_in, addr_out;
+
+	if ((!dd->in_offset) && (!dd->out_offset)) {
+		/* check for alignment */
+		in = IS_ALIGNED((u32)dd->in_sg->offset, sizeof(u32)) &&
+			IS_ALIGNED(dd->in_sg->length, dd->ctx->block_size);
+		out = IS_ALIGNED((u32)dd->out_sg->offset, sizeof(u32)) &&
+			IS_ALIGNED(dd->out_sg->length, dd->ctx->block_size);
+		fast = in && out;
+
+		if (sg_dma_len(dd->in_sg) != sg_dma_len(dd->out_sg))
+			fast = 0;
+	}
+
+
+	if (fast)  {
+		count = min(dd->total, sg_dma_len(dd->in_sg));
+		count = min(count, sg_dma_len(dd->out_sg));
+
+		err = dma_map_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
+		if (!err) {
+			dev_err(dd->dev, "dma_map_sg() error\n");
+			return -EINVAL;
+		}
+
+		err = dma_map_sg(dd->dev, dd->out_sg, 1,
+				DMA_FROM_DEVICE);
+		if (!err) {
+			dev_err(dd->dev, "dma_map_sg() error\n");
+			dma_unmap_sg(dd->dev, dd->in_sg, 1,
+				DMA_TO_DEVICE);
+			return -EINVAL;
+		}
+
+		addr_in = sg_dma_address(dd->in_sg);
+		addr_out = sg_dma_address(dd->out_sg);
+
+		dd->flags |= AES_FLAGS_FAST;
 
 
-	if (dd->flags & AES_FLAGS_CFB8) {
-		dd->dma_lch_in.dma_conf.dst_addr_width =
-			DMA_SLAVE_BUSWIDTH_1_BYTE;
-		dd->dma_lch_out.dma_conf.src_addr_width =
-			DMA_SLAVE_BUSWIDTH_1_BYTE;
-	} else if (dd->flags & AES_FLAGS_CFB16) {
-		dd->dma_lch_in.dma_conf.dst_addr_width =
-			DMA_SLAVE_BUSWIDTH_2_BYTES;
-		dd->dma_lch_out.dma_conf.src_addr_width =
-			DMA_SLAVE_BUSWIDTH_2_BYTES;
 	} else {
-		dd->dma_lch_in.dma_conf.dst_addr_width =
-			DMA_SLAVE_BUSWIDTH_4_BYTES;
-		dd->dma_lch_out.dma_conf.src_addr_width =
-			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		/* use cache buffers */
+		count = atmel_aes_sg_copy(&dd->in_sg, &dd->in_offset,
+				dd->buf_in, dd->buflen, dd->total, 0);
+
+		addr_in = dd->dma_addr_in;
+		addr_out = dd->dma_addr_out;
+
+		dd->flags &= ~AES_FLAGS_FAST;
 	}

-	dmaengine_slave_config(dd->dma_lch_in.chan, &dd->dma_lch_in.dma_conf);
-	dmaengine_slave_config(dd->dma_lch_out.chan, &dd->dma_lch_out.dma_conf);
+	dd->total -= count;

-	dd->flags |= AES_FLAGS_DMA;
-	err = atmel_aes_crypt_dma(dd);
+	err = atmel_aes_crypt_dma(dd, addr_in, addr_out, count);
+
+	if (err && (dd->flags & AES_FLAGS_FAST)) {
+		dma_unmap_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
+		dma_unmap_sg(dd->dev, dd->out_sg, 1, DMA_TO_DEVICE);
+	}

 	return err;
 }
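The rewritten atmel_aes_crypt_dma_start() above picks between two strategies: when both scatterlists start word-aligned and carry block-multiple, equal-length data it maps them for DMA directly (AES_FLAGS_FAST); otherwise it bounces through the pre-mapped page-sized buffers. A reduced sketch of just that gate, using a hypothetical helper name:

#include <linux/kernel.h>
#include <linux/scatterlist.h>

/* Reduced model of the fast/slow decision in atmel_aes_crypt_dma_start():
 * direct scatterlist DMA is only attempted when alignment permits. */
static bool can_dma_directly(struct scatterlist *in, struct scatterlist *out,
			     unsigned int block_size)
{
	return IS_ALIGNED(in->offset, sizeof(u32)) &&
	       IS_ALIGNED(out->offset, sizeof(u32)) &&
	       IS_ALIGNED(in->length, block_size) &&
	       IS_ALIGNED(out->length, block_size) &&
	       sg_dma_len(in) == sg_dma_len(out);
}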
@@ -410,6 +509,8 @@ static int atmel_aes_write_ctrl(struct atmel_aes_dev *dd)
 			valmr |= AES_MR_CFBS_32b;
 		else if (dd->flags & AES_FLAGS_CFB64)
 			valmr |= AES_MR_CFBS_64b;
+		else if (dd->flags & AES_FLAGS_CFB128)
+			valmr |= AES_MR_CFBS_128b;
 	} else if (dd->flags & AES_FLAGS_OFB) {
 		valmr |= AES_MR_OPMOD_OFB;
 	} else if (dd->flags & AES_FLAGS_CTR) {
@@ -423,7 +524,7 @@ static int atmel_aes_write_ctrl(struct atmel_aes_dev *dd)

 	if (dd->total > ATMEL_AES_DMA_THRESHOLD) {
 		valmr |= AES_MR_SMOD_IDATAR0;
-		if (dd->flags & AES_FLAGS_DUALBUFF)
+		if (dd->caps.has_dualbuff)
 			valmr |= AES_MR_DUALBUFF;
 	} else {
 		valmr |= AES_MR_SMOD_AUTO;
@@ -477,7 +578,9 @@ static int atmel_aes_handle_queue(struct atmel_aes_dev *dd,
 	/* assign new request to device */
 	dd->req = req;
 	dd->total = req->nbytes;
+	dd->in_offset = 0;
 	dd->in_sg = req->src;
+	dd->out_offset = 0;
 	dd->out_sg = req->dst;

 	rctx = ablkcipher_request_ctx(req);
@@ -506,18 +609,86 @@ static int atmel_aes_handle_queue(struct atmel_aes_dev *dd,
 static int atmel_aes_crypt_dma_stop(struct atmel_aes_dev *dd)
 {
 	int err = -EINVAL;
+	size_t count;

 	if (dd->flags & AES_FLAGS_DMA) {
-		dma_unmap_sg(dd->dev, dd->out_sg,
-			dd->nb_out_sg, DMA_FROM_DEVICE);
-		dma_unmap_sg(dd->dev, dd->in_sg,
-			dd->nb_in_sg, DMA_TO_DEVICE);
 		err = 0;
+		if  (dd->flags & AES_FLAGS_FAST) {
+			dma_unmap_sg(dd->dev, dd->out_sg, 1, DMA_FROM_DEVICE);
+			dma_unmap_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
+		} else {
+			dma_sync_single_for_device(dd->dev, dd->dma_addr_out,
+				dd->dma_size, DMA_FROM_DEVICE);
+
+			/* copy data */
+			count = atmel_aes_sg_copy(&dd->out_sg, &dd->out_offset,
+				dd->buf_out, dd->buflen, dd->dma_size, 1);
+			if (count != dd->dma_size) {
+				err = -EINVAL;
+				pr_err("not all data converted: %u\n", count);
+			}
+		}
 	}
 	}

 	return err;
 }

+static int atmel_aes_buff_init(struct atmel_aes_dev *dd)
+{
+	int err = -ENOMEM;
+
+	dd->buf_in = (void *)__get_free_pages(GFP_KERNEL, 0);
+	dd->buf_out = (void *)__get_free_pages(GFP_KERNEL, 0);
+	dd->buflen = PAGE_SIZE;
+	dd->buflen &= ~(AES_BLOCK_SIZE - 1);
+
+	if (!dd->buf_in || !dd->buf_out) {
+		dev_err(dd->dev, "unable to alloc pages.\n");
+		goto err_alloc;
+	}
+
+	/* MAP here */
+	dd->dma_addr_in = dma_map_single(dd->dev, dd->buf_in,
+					dd->buflen, DMA_TO_DEVICE);
+	if (dma_mapping_error(dd->dev, dd->dma_addr_in)) {
+		dev_err(dd->dev, "dma %d bytes error\n", dd->buflen);
+		err = -EINVAL;
+		goto err_map_in;
+	}
+
+	dd->dma_addr_out = dma_map_single(dd->dev, dd->buf_out,
+					dd->buflen, DMA_FROM_DEVICE);
+	if (dma_mapping_error(dd->dev, dd->dma_addr_out)) {
+		dev_err(dd->dev, "dma %d bytes error\n", dd->buflen);
+		err = -EINVAL;
+		goto err_map_out;
+	}
+
+	return 0;
+
+err_map_out:
+	dma_unmap_single(dd->dev, dd->dma_addr_in, dd->buflen,
+		DMA_TO_DEVICE);
+err_map_in:
+	free_page((unsigned long)dd->buf_out);
+	free_page((unsigned long)dd->buf_in);
+err_alloc:
+	if (err)
+		pr_err("error: %d\n", err);
+	return err;
+}
+
+static void atmel_aes_buff_cleanup(struct atmel_aes_dev *dd)
+{
+	dma_unmap_single(dd->dev, dd->dma_addr_out, dd->buflen,
+			 DMA_FROM_DEVICE);
+	dma_unmap_single(dd->dev, dd->dma_addr_in, dd->buflen,
+		DMA_TO_DEVICE);
+	free_page((unsigned long)dd->buf_out);
+	free_page((unsigned long)dd->buf_in);
+}
+
 static int atmel_aes_crypt(struct ablkcipher_request *req, unsigned long mode)
 static int atmel_aes_crypt(struct ablkcipher_request *req, unsigned long mode)
 {
 	struct atmel_aes_ctx *ctx = crypto_ablkcipher_ctx(
 	struct atmel_aes_reqctx *rctx = ablkcipher_request_ctx(req);
 	struct atmel_aes_reqctx *rctx = ablkcipher_request_ctx(req);
 	struct atmel_aes_dev *dd;

-		pr_err("request size is not exact amount of AES blocks\n");
-		return -EINVAL;
+	if (mode & AES_FLAGS_CFB8) {
+		if (!IS_ALIGNED(req->nbytes, CFB8_BLOCK_SIZE)) {
+			pr_err("request size is not exact amount of CFB8 blocks\n");
+			return -EINVAL;
+		}
+		ctx->block_size = CFB8_BLOCK_SIZE;
+	} else if (mode & AES_FLAGS_CFB16) {
+		if (!IS_ALIGNED(req->nbytes, CFB16_BLOCK_SIZE)) {
+			pr_err("request size is not exact amount of CFB16 blocks\n");
+			return -EINVAL;
+		}
+		ctx->block_size = CFB16_BLOCK_SIZE;
+	} else if (mode & AES_FLAGS_CFB32) {
+		if (!IS_ALIGNED(req->nbytes, CFB32_BLOCK_SIZE)) {
+			pr_err("request size is not exact amount of CFB32 blocks\n");
+			return -EINVAL;
+		}
+		ctx->block_size = CFB32_BLOCK_SIZE;
+	} else {
+		if (!IS_ALIGNED(req->nbytes, AES_BLOCK_SIZE)) {
+			pr_err("request size is not exact amount of AES blocks\n");
+			return -EINVAL;
+		}
+		ctx->block_size = AES_BLOCK_SIZE;
 	}
 
 	dd = atmel_aes_find_dev(ctx);
@@ -551,14 +743,12 @@ static bool atmel_aes_filter(struct dma_chan *chan, void *slave)
 	}
 }
 
-static int atmel_aes_dma_init(struct atmel_aes_dev *dd)
+static int atmel_aes_dma_init(struct atmel_aes_dev *dd,
+	struct crypto_platform_data *pdata)
 {
 	int err = -ENOMEM;
-	struct aes_platform_data	*pdata;
 	dma_cap_mask_t mask_in, mask_out;
 
-	pdata = dd->dev->platform_data;
-
 	if (pdata && pdata->dma_slave->txdata.dma_dev &&
 		pdata->dma_slave->rxdata.dma_dev) {
 
@@ -568,28 +758,38 @@ static int atmel_aes_dma_init(struct atmel_aes_dev *dd)
 
 		dd->dma_lch_in.chan = dma_request_channel(mask_in,
 				atmel_aes_filter, &pdata->dma_slave->rxdata);
+
 		if (!dd->dma_lch_in.chan)
 			goto err_dma_in;
 
 		dd->dma_lch_in.dma_conf.direction = DMA_MEM_TO_DEV;
 		dd->dma_lch_in.dma_conf.dst_addr = dd->phys_base +
 			AES_IDATAR(0);
-		dd->dma_lch_in.dma_conf.src_maxburst = 1;
-		dd->dma_lch_in.dma_conf.dst_maxburst = 1;
+		dd->dma_lch_in.dma_conf.src_maxburst = dd->caps.max_burst_size;
+		dd->dma_lch_in.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		dd->dma_lch_in.dma_conf.dst_maxburst = dd->caps.max_burst_size;
+		dd->dma_lch_in.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
 		dd->dma_lch_in.dma_conf.device_fc = false;
 
 		dma_cap_zero(mask_out);
 		dma_cap_set(DMA_SLAVE, mask_out);
 		dd->dma_lch_out.chan = dma_request_channel(mask_out,
 				atmel_aes_filter, &pdata->dma_slave->txdata);
+
 		if (!dd->dma_lch_out.chan)
 			goto err_dma_out;
 
 		dd->dma_lch_out.dma_conf.direction = DMA_DEV_TO_MEM;
 		dd->dma_lch_out.dma_conf.src_addr = dd->phys_base +
 			AES_ODATAR(0);
-		dd->dma_lch_out.dma_conf.src_maxburst = 1;
-		dd->dma_lch_out.dma_conf.dst_maxburst = 1;
+		dd->dma_lch_out.dma_conf.src_maxburst = dd->caps.max_burst_size;
+		dd->dma_lch_out.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		dd->dma_lch_out.dma_conf.dst_maxburst = dd->caps.max_burst_size;
+		dd->dma_lch_out.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
 		dd->dma_lch_out.dma_conf.device_fc = false;
 
 		return 0;
@@ -665,13 +865,13 @@ static int atmel_aes_ofb_decrypt(struct ablkcipher_request *req)
 static int atmel_aes_cfb_encrypt(struct ablkcipher_request *req)
 {
 	return atmel_aes_crypt(req,
-		AES_FLAGS_ENCRYPT | AES_FLAGS_CFB);
+		AES_FLAGS_ENCRYPT | AES_FLAGS_CFB | AES_FLAGS_CFB128);
 }
 
 static int atmel_aes_cfb_decrypt(struct ablkcipher_request *req)
 {
 	return atmel_aes_crypt(req,
-		AES_FLAGS_CFB);
+		AES_FLAGS_CFB | AES_FLAGS_CFB128);
 }
 
 static int atmel_aes_cfb64_encrypt(struct ablkcipher_request *req)
@@ -753,7 +953,7 @@ static struct crypto_alg aes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_aes_ctx),
-	.cra_alignmask		= 0x0,
+	.cra_alignmask		= 0xf,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_aes_cra_init,
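
The alignmask changes in this and the following hunks track each mode's block size. As an inference from the values rather than something the commit states, cra_alignmask is "required alignment minus one": 0xf requests 16-byte-aligned buffers (one AES block) from the crypto API, and 0x3, 0x1 and 0x7 match the CFB32, CFB16 and CFB64 blocks the DMA engine transfers. A one-line sketch of the test the API performs:

static inline int example_aligned(unsigned long addr, unsigned int alignmask)
{
	return (addr & alignmask) == 0;	/* 0xf: low four bits clear */
}
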
@@ -773,7 +973,7 @@ static struct crypto_alg aes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_aes_ctx),
-	.cra_alignmask		= 0x0,
+	.cra_alignmask		= 0xf,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_aes_cra_init,
@@ -794,7 +994,7 @@ static struct crypto_alg aes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_aes_ctx),
-	.cra_alignmask		= 0x0,
+	.cra_alignmask		= 0xf,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_aes_cra_init,
@@ -815,7 +1015,7 @@ static struct crypto_alg aes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_aes_ctx),
-	.cra_alignmask		= 0x0,
+	.cra_alignmask		= 0xf,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_aes_cra_init,
@@ -836,7 +1036,7 @@ static struct crypto_alg aes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= CFB32_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_aes_ctx),
-	.cra_alignmask		= 0x0,
+	.cra_alignmask		= 0x3,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_aes_cra_init,
@@ -857,7 +1057,7 @@ static struct crypto_alg aes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= CFB16_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_aes_ctx),
-	.cra_alignmask		= 0x0,
+	.cra_alignmask		= 0x1,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_aes_cra_init,
@@ -899,7 +1099,7 @@ static struct crypto_alg aes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_aes_ctx),
-	.cra_alignmask		= 0x0,
+	.cra_alignmask		= 0xf,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_aes_cra_init,
@@ -915,15 +1115,14 @@ static struct crypto_alg aes_algs[] = {
 },
 };
 
-static struct crypto_alg aes_cfb64_alg[] = {
-{
+static struct crypto_alg aes_cfb64_alg = {
 	.cra_name		= "cfb64(aes)",
 	.cra_driver_name	= "atmel-cfb64-aes",
 	.cra_priority		= 100,
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= CFB64_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_aes_ctx),
-	.cra_alignmask		= 0x0,
+	.cra_alignmask		= 0x7,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_aes_cra_init,
@@ -936,7 +1135,6 @@ static struct crypto_alg aes_cfb64_alg[] = {
 		.encrypt	= atmel_aes_cfb64_encrypt,
 		.decrypt	= atmel_aes_cfb64_decrypt,
 	}
-},
 };
 
 static void atmel_aes_queue_task(unsigned long data)
@@ -969,7 +1167,14 @@ static void atmel_aes_done_task(unsigned long data)
 	err = dd->err ? : err;
 
 	if (dd->total && !err) {
-		err = atmel_aes_crypt_dma_start(dd);
+		if (dd->flags & AES_FLAGS_FAST) {
+			dd->in_sg = sg_next(dd->in_sg);
+			dd->out_sg = sg_next(dd->out_sg);
+			if (!dd->in_sg || !dd->out_sg)
+				err = -EINVAL;
+		}
+		if (!err)
+			err = atmel_aes_crypt_dma_start(dd);
 		if (!err)
 			return; /* DMA started. Not finishing. */
 	}
@@ -1003,8 +1208,8 @@ static void atmel_aes_unregister_algs(struct atmel_aes_dev *dd)
 
 	for (i = 0; i < ARRAY_SIZE(aes_algs); i++)
 		crypto_unregister_alg(&aes_algs[i]);
-	if (dd->hw_version >= 0x130)
-		crypto_unregister_alg(&aes_cfb64_alg[0]);
+	if (dd->caps.has_cfb64)
+		crypto_unregister_alg(&aes_cfb64_alg);
 }
 
 static int atmel_aes_register_algs(struct atmel_aes_dev *dd)
@@ -1017,10 +1222,8 @@ static int atmel_aes_register_algs(struct atmel_aes_dev *dd)
 			goto err_aes_algs;
 	}
 
-	atmel_aes_hw_version_init(dd);
-
-	if (dd->hw_version >= 0x130) {
-		err = crypto_register_alg(&aes_cfb64_alg[0]);
+	if (dd->caps.has_cfb64) {
+		err = crypto_register_alg(&aes_cfb64_alg);
 		if (err)
 			goto err_aes_cfb64_alg;
 	}
@@ -1036,10 +1239,32 @@ err_aes_algs:
 	return err;
 }
 
+static void atmel_aes_get_cap(struct atmel_aes_dev *dd)
+{
+	dd->caps.has_dualbuff = 0;
+	dd->caps.has_cfb64 = 0;
+	dd->caps.max_burst_size = 1;
+
+	/* keep only major version number */
+	switch (dd->hw_version & 0xff0) {
+	case 0x130:
+		dd->caps.has_dualbuff = 1;
+		dd->caps.has_cfb64 = 1;
+		dd->caps.max_burst_size = 4;
+		break;
+	case 0x120:
+		break;
+	default:
+		dev_warn(dd->dev,
+				"Unmanaged aes version, set minimum capabilities\n");
+		break;
+	}
+}
+
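
A small sketch, with hypothetical names, of the capability-by-revision pattern that atmel_aes_get_cap() above establishes: the version register is masked to its major number (0x135 & 0xff0 == 0x130) and unknown revisions fall back to the minimum feature set:

static int example_has_cfb64(u32 hw_version)
{
	switch (hw_version & 0xff0) {	/* keep only the major version */
	case 0x130:
		return 1;		/* 1.3x IP gained CFB64 */
	default:
		return 0;		/* unknown IP: assume the minimum */
	}
}
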
 static int atmel_aes_probe(struct platform_device *pdev)
 {
 	struct atmel_aes_dev *aes_dd;
-	struct aes_platform_data	*pdata;
+	struct crypto_platform_data *pdata;
 	struct device *dev = &pdev->dev;
 	struct resource *aes_res;
 	unsigned long aes_phys_size;
@@ -1099,7 +1324,7 @@ static int atmel_aes_probe(struct platform_device *pdev)
 	}
 
 	/* Initializing the clock */
-	aes_dd->iclk = clk_get(&pdev->dev, NULL);
+	aes_dd->iclk = clk_get(&pdev->dev, "aes_clk");
 	if (IS_ERR(aes_dd->iclk)) {
 		dev_err(dev, "clock initialization failed.\n");
 		err = PTR_ERR(aes_dd->iclk);
@@ -1113,7 +1338,15 @@ static int atmel_aes_probe(struct platform_device *pdev)
 		goto aes_io_err;
 	}
 
-	err = atmel_aes_dma_init(aes_dd);
+	atmel_aes_hw_version_init(aes_dd);
+
+	atmel_aes_get_cap(aes_dd);
+
+	err = atmel_aes_buff_init(aes_dd);
+	if (err)
+		goto err_aes_buff;
+
+	err = atmel_aes_dma_init(aes_dd, pdata);
 	if (err)
 		goto err_aes_dma;
 
@@ -1135,6 +1368,8 @@ err_algs:
 	spin_unlock(&atmel_aes.lock);
 	atmel_aes_dma_cleanup(aes_dd);
 err_aes_dma:
+	atmel_aes_buff_cleanup(aes_dd);
+err_aes_buff:
 	iounmap(aes_dd->io_base);
 aes_io_err:
 	clk_put(aes_dd->iclk);

+ 6 - 1
drivers/crypto/atmel-sha-regs.h

@@ -14,10 +14,13 @@
 #define SHA_MR_MODE_MANUAL		0x0
 #define SHA_MR_MODE_AUTO		0x1
 #define SHA_MR_MODE_PDC			0x2
-#define	SHA_MR_DUALBUFF			(1 << 3)
 #define SHA_MR_PROCDLY			(1 << 4)
 #define SHA_MR_ALGO_SHA1		(0 << 8)
 #define SHA_MR_ALGO_SHA256		(1 << 8)
+#define SHA_MR_ALGO_SHA384		(2 << 8)
+#define SHA_MR_ALGO_SHA512		(3 << 8)
+#define SHA_MR_ALGO_SHA224		(4 << 8)
+#define	SHA_MR_DUALBUFF			(1 << 16)
 
 #define SHA_IER				0x10
 #define SHA_IDR				0x14
@@ -33,6 +36,8 @@
 #define SHA_ISR_URAT_MR			(0x2 << 12)
 #define SHA_ISR_URAT_WO			(0x5 << 12)
 
+#define	SHA_HW_VERSION		0xFC
+
 #define SHA_TPR				0x108
 #define SHA_TCR				0x10C
 #define SHA_TNPR			0x118

+ 486 - 100
drivers/crypto/atmel-sha.c

@@ -38,6 +38,7 @@
 #include <crypto/sha.h>
 #include <crypto/hash.h>
 #include <crypto/internal/hash.h>
+#include <linux/platform_data/crypto-atmel.h>
 #include "atmel-sha-regs.h"
 #include "atmel-sha-regs.h"
 
 
 /* SHA flags */
 /* SHA flags */
@@ -52,11 +53,12 @@
 #define SHA_FLAGS_FINUP		BIT(16)
 #define SHA_FLAGS_SG		BIT(17)
 #define SHA_FLAGS_SHA1		BIT(18)
-#define SHA_FLAGS_SHA256	BIT(19)
-#define SHA_FLAGS_ERROR		BIT(20)
-#define SHA_FLAGS_PAD		BIT(21)
-
-#define SHA_FLAGS_DUALBUFF	BIT(24)
+#define SHA_FLAGS_SHA224	BIT(19)
+#define SHA_FLAGS_SHA256	BIT(20)
+#define SHA_FLAGS_SHA384	BIT(21)
+#define SHA_FLAGS_SHA512	BIT(22)
+#define SHA_FLAGS_ERROR		BIT(23)
+#define SHA_FLAGS_PAD		BIT(24)
 
 #define SHA_OP_UPDATE	1
 #define SHA_OP_FINAL	2
@@ -65,6 +67,12 @@
 
 #define ATMEL_SHA_DMA_THRESHOLD		56
 
+struct atmel_sha_caps {
+	bool	has_dma;
+	bool	has_dualbuff;
+	bool	has_sha224;
+	bool	has_sha_384_512;
+};
 
 struct atmel_sha_dev;
 
@@ -73,8 +81,8 @@ struct atmel_sha_reqctx {
 	unsigned long	flags;
 	unsigned long	op;
 
-	u8	digest[SHA256_DIGEST_SIZE] __aligned(sizeof(u32));
-	size_t	digcnt;
+	u8	digest[SHA512_DIGEST_SIZE] __aligned(sizeof(u32));
+	u64	digcnt[2];
 	size_t	bufcnt;
 	size_t	buflen;
 	dma_addr_t	dma_addr;
@@ -84,6 +92,8 @@ struct atmel_sha_reqctx {
 	unsigned int	offset;	/* offset in current sg */
 	unsigned int	total;	/* total request */
 
+	size_t block_size;
+
 	u8	buffer[0] __aligned(sizeof(u32));
 };
 
@@ -97,7 +107,12 @@ struct atmel_sha_ctx {
 
 };
 
-#define ATMEL_SHA_QUEUE_LENGTH	1
+#define ATMEL_SHA_QUEUE_LENGTH	50
+
+struct atmel_sha_dma {
+	struct dma_chan			*chan;
+	struct dma_slave_config dma_conf;
+};
 
 struct atmel_sha_dev {
 	struct list_head	list;
@@ -114,6 +129,12 @@ struct atmel_sha_dev {
 	unsigned long		flags;
 	struct crypto_queue	queue;
 	struct ahash_request	*req;
+
+	struct atmel_sha_dma	dma_lch_in;
+
+	struct atmel_sha_caps	caps;
+
+	u32	hw_version;
 };
 
 struct atmel_sha_drv {
@@ -137,14 +158,6 @@ static inline void atmel_sha_write(struct atmel_sha_dev *dd,
 	writel_relaxed(value, dd->io_base + offset);
 }
 
-static void atmel_sha_dualbuff_test(struct atmel_sha_dev *dd)
-{
-	atmel_sha_write(dd, SHA_MR, SHA_MR_DUALBUFF);
-
-	if (atmel_sha_read(dd, SHA_MR) & SHA_MR_DUALBUFF)
-		dd->flags |= SHA_FLAGS_DUALBUFF;
-}
-
 static size_t atmel_sha_append_sg(struct atmel_sha_reqctx *ctx)
 {
 	size_t count;
@@ -176,31 +189,58 @@ static size_t atmel_sha_append_sg(struct atmel_sha_reqctx *ctx)
 }
 
 /*
- * The purpose of this padding is to ensure that the padded message
- * is a multiple of 512 bits. The bit "1" is appended at the end of
- * the message followed by "padlen-1" zero bits. Then a 64 bits block
- * equals to the message length in bits is appended.
+ * The purpose of this padding is to ensure that the padded message is a
+ * multiple of 512 bits (SHA1/SHA224/SHA256) or 1024 bits (SHA384/SHA512).
+ * The bit "1" is appended at the end of the message followed by
+ * "padlen-1" zero bits. Then a 64 bits block (SHA1/SHA224/SHA256) or
+ * 128 bits block (SHA384/SHA512) equals to the message length in bits
+ * is appended.
  *
- * padlen is calculated as followed:
+ * For SHA1/SHA224/SHA256, padlen is calculated as follows:
  *  - if message length < 56 bytes then padlen = 56 - message length
  *  - else padlen = 64 + 56 - message length
+ *
+ * For SHA384/SHA512, padlen is calculated as follows:
+ *  - if message length < 112 bytes then padlen = 112 - message length
+ *  - else padlen = 128 + 112 - message length
  */
 static void atmel_sha_fill_padding(struct atmel_sha_reqctx *ctx, int length)
 {
 	unsigned int index, padlen;
-	u64 bits;
-	u64 size;
-
-	bits = (ctx->bufcnt + ctx->digcnt + length) << 3;
-	size = cpu_to_be64(bits);
-
-	index = ctx->bufcnt & 0x3f;
-	padlen = (index < 56) ? (56 - index) : ((64+56) - index);
-	*(ctx->buffer + ctx->bufcnt) = 0x80;
-	memset(ctx->buffer + ctx->bufcnt + 1, 0, padlen-1);
-	memcpy(ctx->buffer + ctx->bufcnt + padlen, &size, 8);
-	ctx->bufcnt += padlen + 8;
-	ctx->flags |= SHA_FLAGS_PAD;
+	u64 bits[2];
+	u64 size[2];
+
+	size[0] = ctx->digcnt[0];
+	size[1] = ctx->digcnt[1];
+
+	size[0] += ctx->bufcnt;
+	if (size[0] < ctx->bufcnt)
+		size[1]++;
+
+	size[0] += length;
+	if (size[0] < length)
+		size[1]++;
+
+	bits[1] = cpu_to_be64(size[0] << 3);
+	bits[0] = cpu_to_be64(size[1] << 3 | size[0] >> 61);
+
+	if (ctx->flags & (SHA_FLAGS_SHA384 | SHA_FLAGS_SHA512)) {
+		index = ctx->bufcnt & 0x7f;
+		padlen = (index < 112) ? (112 - index) : ((128+112) - index);
+		*(ctx->buffer + ctx->bufcnt) = 0x80;
+		memset(ctx->buffer + ctx->bufcnt + 1, 0, padlen-1);
+		memcpy(ctx->buffer + ctx->bufcnt + padlen, bits, 16);
+		ctx->bufcnt += padlen + 16;
+		ctx->flags |= SHA_FLAGS_PAD;
+	} else {
+		index = ctx->bufcnt & 0x3f;
+		padlen = (index < 56) ? (56 - index) : ((64+56) - index);
+		*(ctx->buffer + ctx->bufcnt) = 0x80;
+		memset(ctx->buffer + ctx->bufcnt + 1, 0, padlen-1);
+		memcpy(ctx->buffer + ctx->bufcnt + padlen, &bits[1], 8);
+		ctx->bufcnt += padlen + 8;
+		ctx->flags |= SHA_FLAGS_PAD;
+	}
 }
 
 static int atmel_sha_init(struct ahash_request *req)
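
A worked example of the padlen rules in the comment above (an illustrative helper, not code from the commit): for SHA-512 with 120 buffered bytes, index = 120 and padlen = (128 + 112) - 120 = 120; for SHA-256 with index 56, padlen = (64 + 56) - 56 = 64:

static unsigned int example_padlen(unsigned int bufcnt, int sha384_512)
{
	unsigned int index, padlen;

	if (sha384_512) {
		index = bufcnt & 0x7f;	/* 1024-bit blocks */
		padlen = (index < 112) ? (112 - index) : ((128 + 112) - index);
	} else {
		index = bufcnt & 0x3f;	/* 512-bit blocks */
		padlen = (index < 56) ? (56 - index) : ((64 + 56) - index);
	}
	return padlen;
}
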
@@ -231,13 +271,35 @@ static int atmel_sha_init(struct ahash_request *req)
 	dev_dbg(dd->dev, "init: digest size: %d\n",
 	dev_dbg(dd->dev, "init: digest size: %d\n",
 		crypto_ahash_digestsize(tfm));
 		crypto_ahash_digestsize(tfm));
 
 
-	if (crypto_ahash_digestsize(tfm) == SHA1_DIGEST_SIZE)
+	switch (crypto_ahash_digestsize(tfm)) {
+	case SHA1_DIGEST_SIZE:
 		ctx->flags |= SHA_FLAGS_SHA1;
-	else if (crypto_ahash_digestsize(tfm) == SHA256_DIGEST_SIZE)
+		ctx->block_size = SHA1_BLOCK_SIZE;
+		break;
+	case SHA224_DIGEST_SIZE:
+		ctx->flags |= SHA_FLAGS_SHA224;
+		ctx->block_size = SHA224_BLOCK_SIZE;
+		break;
+	case SHA256_DIGEST_SIZE:
 		ctx->flags |= SHA_FLAGS_SHA256;
+		ctx->block_size = SHA256_BLOCK_SIZE;
+		break;
+	case SHA384_DIGEST_SIZE:
+		ctx->flags |= SHA_FLAGS_SHA384;
+		ctx->block_size = SHA384_BLOCK_SIZE;
+		break;
+	case SHA512_DIGEST_SIZE:
+		ctx->flags |= SHA_FLAGS_SHA512;
+		ctx->block_size = SHA512_BLOCK_SIZE;
+		break;
+	default:
+		return -EINVAL;
+	}
 
 	ctx->bufcnt = 0;
-	ctx->digcnt = 0;
+	ctx->digcnt[0] = 0;
+	ctx->digcnt[1] = 0;
 	ctx->buflen = SHA_BUFFER_LEN;
 
 	return 0;
@@ -249,19 +311,28 @@ static void atmel_sha_write_ctrl(struct atmel_sha_dev *dd, int dma)
 	u32 valcr = 0, valmr = SHA_MR_MODE_AUTO;
 
 	if (likely(dma)) {
-		atmel_sha_write(dd, SHA_IER, SHA_INT_TXBUFE);
+		if (!dd->caps.has_dma)
+			atmel_sha_write(dd, SHA_IER, SHA_INT_TXBUFE);
 		valmr = SHA_MR_MODE_PDC;
-		if (dd->flags & SHA_FLAGS_DUALBUFF)
-			valmr = SHA_MR_DUALBUFF;
+		if (dd->caps.has_dualbuff)
+			valmr |= SHA_MR_DUALBUFF;
 	} else {
 		atmel_sha_write(dd, SHA_IER, SHA_INT_DATARDY);
 	}
 
-	if (ctx->flags & SHA_FLAGS_SHA256)
+	if (ctx->flags & SHA_FLAGS_SHA1)
+		valmr |= SHA_MR_ALGO_SHA1;
+	else if (ctx->flags & SHA_FLAGS_SHA224)
+		valmr |= SHA_MR_ALGO_SHA224;
+	else if (ctx->flags & SHA_FLAGS_SHA256)
 		valmr |= SHA_MR_ALGO_SHA256;
+	else if (ctx->flags & SHA_FLAGS_SHA384)
+		valmr |= SHA_MR_ALGO_SHA384;
+	else if (ctx->flags & SHA_FLAGS_SHA512)
+		valmr |= SHA_MR_ALGO_SHA512;
 
 	/* Setting CR_FIRST only for the first iteration */
-	if (!ctx->digcnt)
+	if (!(ctx->digcnt[0] || ctx->digcnt[1]))
 		valcr = SHA_CR_FIRST;
 
 	atmel_sha_write(dd, SHA_CR, valcr);
@@ -275,13 +346,15 @@ static int atmel_sha_xmit_cpu(struct atmel_sha_dev *dd, const u8 *buf,
 	int count, len32;
 	const u32 *buffer = (const u32 *)buf;
 
-	dev_dbg(dd->dev, "xmit_cpu: digcnt: %d, length: %d, final: %d\n",
-						ctx->digcnt, length, final);
+	dev_dbg(dd->dev, "xmit_cpu: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
+		ctx->digcnt[1], ctx->digcnt[0], length, final);
 
 	atmel_sha_write_ctrl(dd, 0);
 
 	/* should be non-zero before next lines to disable clocks later */
-	ctx->digcnt += length;
+	ctx->digcnt[0] += length;
+	if (ctx->digcnt[0] < length)
+		ctx->digcnt[1]++;
 
 	if (final)
 		dd->flags |= SHA_FLAGS_FINAL; /* catch last interrupt */
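
The digcnt[0]/digcnt[1] pair added here is a 128-bit message counter kept in two u64 words; a minimal sketch of the add-with-carry idiom the driver now repeats in the CPU, PDC and DMA paths (an unsigned addition wrapped exactly when the sum is smaller than an operand):

static void example_digcnt_add(u64 digcnt[2], u64 length)
{
	digcnt[0] += length;
	if (digcnt[0] < length)	/* low word wrapped */
		digcnt[1]++;
}
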
@@ -302,8 +375,8 @@ static int atmel_sha_xmit_pdc(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
 	struct atmel_sha_reqctx *ctx = ahash_request_ctx(dd->req);
 	int len32;
 
-	dev_dbg(dd->dev, "xmit_pdc: digcnt: %d, length: %d, final: %d\n",
-						ctx->digcnt, length1, final);
+	dev_dbg(dd->dev, "xmit_pdc: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
+		ctx->digcnt[1], ctx->digcnt[0], length1, final);
 
 	len32 = DIV_ROUND_UP(length1, sizeof(u32));
 	atmel_sha_write(dd, SHA_PTCR, SHA_PTCR_TXTDIS);
@@ -317,7 +390,9 @@ static int atmel_sha_xmit_pdc(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
 	atmel_sha_write_ctrl(dd, 1);
 
 	/* should be non-zero before next lines to disable clocks later */
-	ctx->digcnt += length1;
+	ctx->digcnt[0] += length1;
+	if (ctx->digcnt[0] < length1)
+		ctx->digcnt[1]++;
 
 	if (final)
 		dd->flags |= SHA_FLAGS_FINAL; /* catch last interrupt */
@@ -330,6 +405,86 @@ static int atmel_sha_xmit_pdc(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
 	return -EINPROGRESS;
 }
 
+static void atmel_sha_dma_callback(void *data)
+{
+	struct atmel_sha_dev *dd = data;
+
+	/* dma_lch_in - completed - wait DATRDY */
+	atmel_sha_write(dd, SHA_IER, SHA_INT_DATARDY);
+}
+
+static int atmel_sha_xmit_dma(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
+		size_t length1, dma_addr_t dma_addr2, size_t length2, int final)
+{
+	struct atmel_sha_reqctx *ctx = ahash_request_ctx(dd->req);
+	struct dma_async_tx_descriptor	*in_desc;
+	struct scatterlist sg[2];
+
+	dev_dbg(dd->dev, "xmit_dma: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
+		ctx->digcnt[1], ctx->digcnt[0], length1, final);
+
+	if (ctx->flags & (SHA_FLAGS_SHA1 | SHA_FLAGS_SHA224 |
+			SHA_FLAGS_SHA256)) {
+		dd->dma_lch_in.dma_conf.src_maxburst = 16;
+		dd->dma_lch_in.dma_conf.dst_maxburst = 16;
+	} else {
+		dd->dma_lch_in.dma_conf.src_maxburst = 32;
+		dd->dma_lch_in.dma_conf.dst_maxburst = 32;
+	}
+
+	dmaengine_slave_config(dd->dma_lch_in.chan, &dd->dma_lch_in.dma_conf);
+
+	if (length2) {
+		sg_init_table(sg, 2);
+		sg_dma_address(&sg[0]) = dma_addr1;
+		sg_dma_len(&sg[0]) = length1;
+		sg_dma_address(&sg[1]) = dma_addr2;
+		sg_dma_len(&sg[1]) = length2;
+		in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, sg, 2,
+			DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+	} else {
+		sg_init_table(sg, 1);
+		sg_dma_address(&sg[0]) = dma_addr1;
+		sg_dma_len(&sg[0]) = length1;
+		in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, sg, 1,
+			DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+	}
+	if (!in_desc)
+		return -EINVAL;
+
+	in_desc->callback = atmel_sha_dma_callback;
+	in_desc->callback_param = dd;
+
+	atmel_sha_write_ctrl(dd, 1);
+
+	/* should be non-zero before next lines to disable clocks later */
+	ctx->digcnt[0] += length1;
+	if (ctx->digcnt[0] < length1)
+		ctx->digcnt[1]++;
+
+	if (final)
+		dd->flags |= SHA_FLAGS_FINAL; /* catch last interrupt */
+
+	dd->flags |= SHA_FLAGS_DMA_ACTIVE;
+
+	/* Start DMA transfer */
+	dmaengine_submit(in_desc);
+	dma_async_issue_pending(dd->dma_lch_in.chan);
+
+	return -EINPROGRESS;
+}
+
+static int atmel_sha_xmit_start(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
+		size_t length1, dma_addr_t dma_addr2, size_t length2, int final)
+{
+	if (dd->caps.has_dma)
+		return atmel_sha_xmit_dma(dd, dma_addr1, length1,
+				dma_addr2, length2, final);
+	else
+		return atmel_sha_xmit_pdc(dd, dma_addr1, length1,
+				dma_addr2, length2, final);
+}
+
 static int atmel_sha_update_cpu(struct atmel_sha_dev *dd)
 {
 	struct atmel_sha_reqctx *ctx = ahash_request_ctx(dd->req);
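
The 16/32 maxburst split in atmel_sha_xmit_dma() above matches one hash block expressed in 32-bit bus words -- an inference from the values, not something the commit states: SHA-1/224/256 use 64-byte blocks (16 words), SHA-384/512 use 128-byte blocks (32 words):

static u32 example_maxburst(size_t block_size)
{
	return block_size / 4;	/* 64 / 4 == 16, 128 / 4 == 32 */
}
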
@@ -337,7 +492,6 @@ static int atmel_sha_update_cpu(struct atmel_sha_dev *dd)
 
 	atmel_sha_append_sg(ctx);
 	atmel_sha_fill_padding(ctx, 0);
-
 	bufcnt = ctx->bufcnt;
 	ctx->bufcnt = 0;
 
@@ -349,17 +503,17 @@ static int atmel_sha_xmit_dma_map(struct atmel_sha_dev *dd,
 					size_t length, int final)
 {
 	ctx->dma_addr = dma_map_single(dd->dev, ctx->buffer,
-				ctx->buflen + SHA1_BLOCK_SIZE, DMA_TO_DEVICE);
+				ctx->buflen + ctx->block_size, DMA_TO_DEVICE);
 	if (dma_mapping_error(dd->dev, ctx->dma_addr)) {
 		dev_err(dd->dev, "dma %u bytes error\n", ctx->buflen +
-				SHA1_BLOCK_SIZE);
+				ctx->block_size);
 		return -EINVAL;
 	}
 
 	ctx->flags &= ~SHA_FLAGS_SG;
 
 	/* next call does not fail... so no unmap in the case of error */
-	return atmel_sha_xmit_pdc(dd, ctx->dma_addr, length, 0, 0, final);
+	return atmel_sha_xmit_start(dd, ctx->dma_addr, length, 0, 0, final);
 }
 
 static int atmel_sha_update_dma_slow(struct atmel_sha_dev *dd)
@@ -372,8 +526,8 @@ static int atmel_sha_update_dma_slow(struct atmel_sha_dev *dd)
 
 	final = (ctx->flags & SHA_FLAGS_FINUP) && !ctx->total;
 
-	dev_dbg(dd->dev, "slow: bufcnt: %u, digcnt: %d, final: %d\n",
-					 ctx->bufcnt, ctx->digcnt, final);
+	dev_dbg(dd->dev, "slow: bufcnt: %u, digcnt: 0x%llx 0x%llx, final: %d\n",
+		 ctx->bufcnt, ctx->digcnt[1], ctx->digcnt[0], final);
 
 	if (final)
 		atmel_sha_fill_padding(ctx, 0);
@@ -400,30 +554,25 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
 	if (ctx->bufcnt || ctx->offset)
 		return atmel_sha_update_dma_slow(dd);
 
-	dev_dbg(dd->dev, "fast: digcnt: %d, bufcnt: %u, total: %u\n",
-			ctx->digcnt, ctx->bufcnt, ctx->total);
+	dev_dbg(dd->dev, "fast: digcnt: 0x%llx 0x%llx, bufcnt: %u, total: %u\n",
+		ctx->digcnt[1], ctx->digcnt[0], ctx->bufcnt, ctx->total);
 
 	sg = ctx->sg;
 
 	if (!IS_ALIGNED(sg->offset, sizeof(u32)))
 		return atmel_sha_update_dma_slow(dd);
 
-	if (!sg_is_last(sg) && !IS_ALIGNED(sg->length, SHA1_BLOCK_SIZE))
-		/* size is not SHA1_BLOCK_SIZE aligned */
+	if (!sg_is_last(sg) && !IS_ALIGNED(sg->length, ctx->block_size))
+		/* size is not ctx->block_size aligned */
 		return atmel_sha_update_dma_slow(dd);
 
 	length = min(ctx->total, sg->length);
 
 	if (sg_is_last(sg)) {
 		if (!(ctx->flags & SHA_FLAGS_FINUP)) {
-			/* not last sg must be SHA1_BLOCK_SIZE aligned */
-			tail = length & (SHA1_BLOCK_SIZE - 1);
+			/* not last sg must be ctx->block_size aligned */
+			tail = length & (ctx->block_size - 1);
 			length -= tail;
-			if (length == 0) {
-				/* offset where to start slow */
-				ctx->offset = length;
-				return atmel_sha_update_dma_slow(dd);
-			}
 		}
 	}
 
@@ -434,7 +583,7 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
 
 	/* Add padding */
 	if (final) {
-		tail = length & (SHA1_BLOCK_SIZE - 1);
+		tail = length & (ctx->block_size - 1);
 		length -= tail;
 		ctx->total += tail;
 		ctx->offset = length; /* offset where to start slow */
@@ -445,10 +594,10 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
 		atmel_sha_fill_padding(ctx, length);
 
 		ctx->dma_addr = dma_map_single(dd->dev, ctx->buffer,
-			ctx->buflen + SHA1_BLOCK_SIZE, DMA_TO_DEVICE);
+			ctx->buflen + ctx->block_size, DMA_TO_DEVICE);
 		if (dma_mapping_error(dd->dev, ctx->dma_addr)) {
 			dev_err(dd->dev, "dma %u bytes error\n",
-				ctx->buflen + SHA1_BLOCK_SIZE);
+				ctx->buflen + ctx->block_size);
 			return -EINVAL;
 		}
 
@@ -456,7 +605,7 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
 			ctx->flags &= ~SHA_FLAGS_SG;
 			count = ctx->bufcnt;
 			ctx->bufcnt = 0;
-			return atmel_sha_xmit_pdc(dd, ctx->dma_addr, count, 0,
+			return atmel_sha_xmit_start(dd, ctx->dma_addr, count, 0,
 					0, final);
 		} else {
 			ctx->sg = sg;
@@ -470,7 +619,7 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
 
 			count = ctx->bufcnt;
 			ctx->bufcnt = 0;
-			return atmel_sha_xmit_pdc(dd, sg_dma_address(ctx->sg),
+			return atmel_sha_xmit_start(dd, sg_dma_address(ctx->sg),
 					length, ctx->dma_addr, count, final);
 		}
 	}
@@ -483,7 +632,7 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
 	ctx->flags |= SHA_FLAGS_SG;
 
 	/* next call does not fail... so no unmap in the case of error */
-	return atmel_sha_xmit_pdc(dd, sg_dma_address(ctx->sg), length, 0,
+	return atmel_sha_xmit_start(dd, sg_dma_address(ctx->sg), length, 0,
 								0, final);
 }
 
@@ -498,12 +647,13 @@ static int atmel_sha_update_dma_stop(struct atmel_sha_dev *dd)
 			if (ctx->sg)
 				ctx->offset = 0;
 		}
-		if (ctx->flags & SHA_FLAGS_PAD)
+		if (ctx->flags & SHA_FLAGS_PAD) {
 			dma_unmap_single(dd->dev, ctx->dma_addr,
-				ctx->buflen + SHA1_BLOCK_SIZE, DMA_TO_DEVICE);
+				ctx->buflen + ctx->block_size, DMA_TO_DEVICE);
+		}
 	} else {
 		dma_unmap_single(dd->dev, ctx->dma_addr, ctx->buflen +
-						SHA1_BLOCK_SIZE, DMA_TO_DEVICE);
+						ctx->block_size, DMA_TO_DEVICE);
 	}
 
 	return 0;
@@ -515,8 +665,8 @@ static int atmel_sha_update_req(struct atmel_sha_dev *dd)
 	struct atmel_sha_reqctx *ctx = ahash_request_ctx(req);
 	int err;
 
-	dev_dbg(dd->dev, "update_req: total: %u, digcnt: %d, finup: %d\n",
-		 ctx->total, ctx->digcnt, (ctx->flags & SHA_FLAGS_FINUP) != 0);
+	dev_dbg(dd->dev, "update_req: total: %u, digcnt: 0x%llx 0x%llx\n",
+		ctx->total, ctx->digcnt[1], ctx->digcnt[0]);
 
 	if (ctx->flags & SHA_FLAGS_CPU)
 		err = atmel_sha_update_cpu(dd);
@@ -524,8 +674,8 @@ static int atmel_sha_update_req(struct atmel_sha_dev *dd)
 		err = atmel_sha_update_dma_start(dd);
 
 	/* wait for dma completion before can take more data */
-	dev_dbg(dd->dev, "update: err: %d, digcnt: %d\n",
-			err, ctx->digcnt);
+	dev_dbg(dd->dev, "update: err: %d, digcnt: 0x%llx 0%llx\n",
+			err, ctx->digcnt[1], ctx->digcnt[0]);
 
 	return err;
 }
@@ -562,12 +712,21 @@ static void atmel_sha_copy_hash(struct ahash_request *req)
 	u32 *hash = (u32 *)ctx->digest;
 	int i;
 
-	if (likely(ctx->flags & SHA_FLAGS_SHA1))
+	if (ctx->flags & SHA_FLAGS_SHA1)
 		for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(u32); i++)
 			hash[i] = atmel_sha_read(ctx->dd, SHA_REG_DIGEST(i));
-	else
+	else if (ctx->flags & SHA_FLAGS_SHA224)
+		for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(u32); i++)
+			hash[i] = atmel_sha_read(ctx->dd, SHA_REG_DIGEST(i));
+	else if (ctx->flags & SHA_FLAGS_SHA256)
 		for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(u32); i++)
 			hash[i] = atmel_sha_read(ctx->dd, SHA_REG_DIGEST(i));
+	else if (ctx->flags & SHA_FLAGS_SHA384)
+		for (i = 0; i < SHA384_DIGEST_SIZE / sizeof(u32); i++)
+			hash[i] = atmel_sha_read(ctx->dd, SHA_REG_DIGEST(i));
+	else
+		for (i = 0; i < SHA512_DIGEST_SIZE / sizeof(u32); i++)
+			hash[i] = atmel_sha_read(ctx->dd, SHA_REG_DIGEST(i));
 }
 
 static void atmel_sha_copy_ready_hash(struct ahash_request *req)
@@ -577,10 +736,16 @@ static void atmel_sha_copy_ready_hash(struct ahash_request *req)
 	if (!req->result)
 		return;
 
-	if (likely(ctx->flags & SHA_FLAGS_SHA1))
+	if (ctx->flags & SHA_FLAGS_SHA1)
 		memcpy(req->result, ctx->digest, SHA1_DIGEST_SIZE);
-	else
+	else if (ctx->flags & SHA_FLAGS_SHA224)
+		memcpy(req->result, ctx->digest, SHA224_DIGEST_SIZE);
+	else if (ctx->flags & SHA_FLAGS_SHA256)
 		memcpy(req->result, ctx->digest, SHA256_DIGEST_SIZE);
+	else if (ctx->flags & SHA_FLAGS_SHA384)
+		memcpy(req->result, ctx->digest, SHA384_DIGEST_SIZE);
+	else
+		memcpy(req->result, ctx->digest, SHA512_DIGEST_SIZE);
 }
 
 static int atmel_sha_finish(struct ahash_request *req)
@@ -589,11 +754,11 @@ static int atmel_sha_finish(struct ahash_request *req)
 	struct atmel_sha_dev *dd = ctx->dd;
 	int err = 0;
 
-	if (ctx->digcnt)
+	if (ctx->digcnt[0] || ctx->digcnt[1])
 		atmel_sha_copy_ready_hash(req);
 
-	dev_dbg(dd->dev, "digcnt: %d, bufcnt: %d\n", ctx->digcnt,
-		ctx->bufcnt);
+	dev_dbg(dd->dev, "digcnt: 0x%llx 0x%llx, bufcnt: %d\n", ctx->digcnt[1],
+		ctx->digcnt[0], ctx->bufcnt);
 
 	return err;
 }
@@ -628,9 +793,8 @@ static int atmel_sha_hw_init(struct atmel_sha_dev *dd)
 {
 	clk_prepare_enable(dd->iclk);
 
-	if (SHA_FLAGS_INIT & dd->flags) {
+	if (!(SHA_FLAGS_INIT & dd->flags)) {
 		atmel_sha_write(dd, SHA_CR, SHA_CR_SWRST);
-		atmel_sha_dualbuff_test(dd);
 		dd->flags |= SHA_FLAGS_INIT;
 		dd->err = 0;
 	}
@@ -638,6 +802,23 @@ static int atmel_sha_hw_init(struct atmel_sha_dev *dd)
 	return 0;
 }
 
+static inline unsigned int atmel_sha_get_version(struct atmel_sha_dev *dd)
+{
+	return atmel_sha_read(dd, SHA_HW_VERSION) & 0x00000fff;
+}
+
+static void atmel_sha_hw_version_init(struct atmel_sha_dev *dd)
+{
+	atmel_sha_hw_init(dd);
+
+	dd->hw_version = atmel_sha_get_version(dd);
+
+	dev_info(dd->dev,
+			"version: 0x%x\n", dd->hw_version);
+
+	clk_disable_unprepare(dd->iclk);
+}
+
 static int atmel_sha_handle_queue(struct atmel_sha_dev *dd,
 				  struct ahash_request *req)
 {
@@ -682,10 +863,9 @@ static int atmel_sha_handle_queue(struct atmel_sha_dev *dd,
 
 	if (ctx->op == SHA_OP_UPDATE) {
 		err = atmel_sha_update_req(dd);
-		if (err != -EINPROGRESS && (ctx->flags & SHA_FLAGS_FINUP)) {
+		if (err != -EINPROGRESS && (ctx->flags & SHA_FLAGS_FINUP))
 			/* no final() after finup() */
 			err = atmel_sha_final_req(dd);
-		}
 	} else if (ctx->op == SHA_OP_FINAL) {
 		err = atmel_sha_final_req(dd);
 	}
@@ -808,7 +988,7 @@ static int atmel_sha_cra_init_alg(struct crypto_tfm *tfm, const char *alg_base)
 	}
 	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
 				 sizeof(struct atmel_sha_reqctx) +
-				 SHA_BUFFER_LEN + SHA256_BLOCK_SIZE);
+				 SHA_BUFFER_LEN + SHA512_BLOCK_SIZE);
 
 	return 0;
 }
@@ -826,7 +1006,7 @@ static void atmel_sha_cra_exit(struct crypto_tfm *tfm)
 	tctx->fallback = NULL;
 }
 
-static struct ahash_alg sha_algs[] = {
+static struct ahash_alg sha_1_256_algs[] = {
 {
 	.init		= atmel_sha_init,
 	.update		= atmel_sha_update,
@@ -875,6 +1055,79 @@ static struct ahash_alg sha_algs[] = {
 },
 };
 
+static struct ahash_alg sha_224_alg = {
+	.init		= atmel_sha_init,
+	.update		= atmel_sha_update,
+	.final		= atmel_sha_final,
+	.finup		= atmel_sha_finup,
+	.digest		= atmel_sha_digest,
+	.halg = {
+		.digestsize	= SHA224_DIGEST_SIZE,
+		.base	= {
+			.cra_name		= "sha224",
+			.cra_driver_name	= "atmel-sha224",
+			.cra_priority		= 100,
+			.cra_flags		= CRYPTO_ALG_ASYNC |
+						CRYPTO_ALG_NEED_FALLBACK,
+			.cra_blocksize		= SHA224_BLOCK_SIZE,
+			.cra_ctxsize		= sizeof(struct atmel_sha_ctx),
+			.cra_alignmask		= 0,
+			.cra_module		= THIS_MODULE,
+			.cra_init		= atmel_sha_cra_init,
+			.cra_exit		= atmel_sha_cra_exit,
+		}
+	}
+};
+
+static struct ahash_alg sha_384_512_algs[] = {
+{
+	.init		= atmel_sha_init,
+	.update		= atmel_sha_update,
+	.final		= atmel_sha_final,
+	.finup		= atmel_sha_finup,
+	.digest		= atmel_sha_digest,
+	.halg = {
+		.digestsize	= SHA384_DIGEST_SIZE,
+		.base	= {
+			.cra_name		= "sha384",
+			.cra_driver_name	= "atmel-sha384",
+			.cra_priority		= 100,
+			.cra_flags		= CRYPTO_ALG_ASYNC |
+						CRYPTO_ALG_NEED_FALLBACK,
+			.cra_blocksize		= SHA384_BLOCK_SIZE,
+			.cra_ctxsize		= sizeof(struct atmel_sha_ctx),
+			.cra_alignmask		= 0x3,
+			.cra_module		= THIS_MODULE,
+			.cra_init		= atmel_sha_cra_init,
+			.cra_exit		= atmel_sha_cra_exit,
+		}
+	}
+},
+{
+	.init		= atmel_sha_init,
+	.update		= atmel_sha_update,
+	.final		= atmel_sha_final,
+	.finup		= atmel_sha_finup,
+	.digest		= atmel_sha_digest,
+	.halg = {
+		.digestsize	= SHA512_DIGEST_SIZE,
+		.base	= {
+			.cra_name		= "sha512",
+			.cra_driver_name	= "atmel-sha512",
+			.cra_priority		= 100,
+			.cra_flags		= CRYPTO_ALG_ASYNC |
+						CRYPTO_ALG_NEED_FALLBACK,
+			.cra_blocksize		= SHA512_BLOCK_SIZE,
+			.cra_ctxsize		= sizeof(struct atmel_sha_ctx),
+			.cra_alignmask		= 0x3,
+			.cra_module		= THIS_MODULE,
+			.cra_init		= atmel_sha_cra_init,
+			.cra_exit		= atmel_sha_cra_exit,
+		}
+	}
+},
+};
+
 static void atmel_sha_done_task(unsigned long data)
 {
 	struct atmel_sha_dev *dd = (struct atmel_sha_dev *)data;
@@ -941,32 +1194,142 @@ static void atmel_sha_unregister_algs(struct atmel_sha_dev *dd)
 {
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(sha_algs); i++)
-		crypto_unregister_ahash(&sha_algs[i]);
+	for (i = 0; i < ARRAY_SIZE(sha_1_256_algs); i++)
+		crypto_unregister_ahash(&sha_1_256_algs[i]);
+
+	if (dd->caps.has_sha224)
+		crypto_unregister_ahash(&sha_224_alg);
+
+	if (dd->caps.has_sha_384_512) {
+		for (i = 0; i < ARRAY_SIZE(sha_384_512_algs); i++)
+			crypto_unregister_ahash(&sha_384_512_algs[i]);
+	}
 }
 
 static int atmel_sha_register_algs(struct atmel_sha_dev *dd)
 {
 	int err, i, j;
 
-	for (i = 0; i < ARRAY_SIZE(sha_algs); i++) {
-		err = crypto_register_ahash(&sha_algs[i]);
+	for (i = 0; i < ARRAY_SIZE(sha_1_256_algs); i++) {
+		err = crypto_register_ahash(&sha_1_256_algs[i]);
 		if (err)
-			goto err_sha_algs;
+			goto err_sha_1_256_algs;
+	}
+
+	if (dd->caps.has_sha224) {
+		err = crypto_register_ahash(&sha_224_alg);
+		if (err)
+			goto err_sha_224_algs;
+	}
+
+	if (dd->caps.has_sha_384_512) {
+		for (i = 0; i < ARRAY_SIZE(sha_384_512_algs); i++) {
+			err = crypto_register_ahash(&sha_384_512_algs[i]);
+			if (err)
+				goto err_sha_384_512_algs;
+		}
 	}
 
 	return 0;
 
-err_sha_algs:
+err_sha_384_512_algs:
+	for (j = 0; j < i; j++)
+		crypto_unregister_ahash(&sha_384_512_algs[j]);
+	crypto_unregister_ahash(&sha_224_alg);
+err_sha_224_algs:
+	i = ARRAY_SIZE(sha_1_256_algs);
+err_sha_1_256_algs:
 	for (j = 0; j < i; j++)
-		crypto_unregister_ahash(&sha_algs[j]);
+		crypto_unregister_ahash(&sha_1_256_algs[j]);
 
 	return err;
 }
 
+static bool atmel_sha_filter(struct dma_chan *chan, void *slave)
+{
+	struct at_dma_slave	*sl = slave;
+
+	if (sl && sl->dma_dev == chan->device->dev) {
+		chan->private = sl;
+		return true;
+	} else {
+		return false;
+	}
+}
+
+static int atmel_sha_dma_init(struct atmel_sha_dev *dd,
+				struct crypto_platform_data *pdata)
+{
+	int err = -ENOMEM;
+	dma_cap_mask_t mask_in;
+
+	if (pdata && pdata->dma_slave->rxdata.dma_dev) {
+		/* Try to grab DMA channel */
+		dma_cap_zero(mask_in);
+		dma_cap_set(DMA_SLAVE, mask_in);
+
+		dd->dma_lch_in.chan = dma_request_channel(mask_in,
+				atmel_sha_filter, &pdata->dma_slave->rxdata);
+
+		if (!dd->dma_lch_in.chan)
+			return err;
+
+		dd->dma_lch_in.dma_conf.direction = DMA_MEM_TO_DEV;
+		dd->dma_lch_in.dma_conf.dst_addr = dd->phys_base +
+			SHA_REG_DIN(0);
+		dd->dma_lch_in.dma_conf.src_maxburst = 1;
+		dd->dma_lch_in.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		dd->dma_lch_in.dma_conf.dst_maxburst = 1;
+		dd->dma_lch_in.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		dd->dma_lch_in.dma_conf.device_fc = false;
+
+		return 0;
+	}
+
+	return -ENODEV;
+}
+
+static void atmel_sha_dma_cleanup(struct atmel_sha_dev *dd)
+{
+	dma_release_channel(dd->dma_lch_in.chan);
+}
+
+static void atmel_sha_get_cap(struct atmel_sha_dev *dd)
+{
+
+	dd->caps.has_dma = 0;
+	dd->caps.has_dualbuff = 0;
+	dd->caps.has_sha224 = 0;
+	dd->caps.has_sha_384_512 = 0;
+
+	/* keep only major version number */
+	switch (dd->hw_version & 0xff0) {
+	case 0x410:
+		dd->caps.has_dma = 1;
+		dd->caps.has_dualbuff = 1;
+		dd->caps.has_sha224 = 1;
+		dd->caps.has_sha_384_512 = 1;
+		break;
+	case 0x400:
+		dd->caps.has_dma = 1;
+		dd->caps.has_dualbuff = 1;
+		dd->caps.has_sha224 = 1;
+		break;
+	case 0x320:
+		break;
+	default:
+		dev_warn(dd->dev,
+				"Unmanaged sha version, set minimum capabilities\n");
+		break;
+	}
+}
+
 static int atmel_sha_probe(struct platform_device *pdev)
 {
 	struct atmel_sha_dev *sha_dd;
+	struct crypto_platform_data	*pdata;
 	struct device *dev = &pdev->dev;
 	struct resource *sha_res;
 	unsigned long sha_phys_size;
@@ -1018,7 +1381,7 @@ static int atmel_sha_probe(struct platform_device *pdev)
 	}
 
 	/* Initializing the clock */
-	sha_dd->iclk = clk_get(&pdev->dev, NULL);
+	sha_dd->iclk = clk_get(&pdev->dev, "sha_clk");
 	if (IS_ERR(sha_dd->iclk)) {
 		dev_err(dev, "clock initialization failed.\n");
 		err = PTR_ERR(sha_dd->iclk);
@@ -1032,6 +1395,22 @@ static int atmel_sha_probe(struct platform_device *pdev)
 		goto sha_io_err;
 	}
 
+	atmel_sha_hw_version_init(sha_dd);
+
+	atmel_sha_get_cap(sha_dd);
+
+	if (sha_dd->caps.has_dma) {
+		pdata = pdev->dev.platform_data;
+		if (!pdata) {
+			dev_err(&pdev->dev, "platform data not available\n");
+			err = -ENXIO;
+			goto err_pdata;
+		}
+		err = atmel_sha_dma_init(sha_dd, pdata);
+		if (err)
+			goto err_sha_dma;
+	}
+
 	spin_lock(&atmel_sha.lock);
 	list_add_tail(&sha_dd->list, &atmel_sha.dev_list);
 	spin_unlock(&atmel_sha.lock);
@@ -1048,6 +1427,10 @@ err_algs:
 	spin_lock(&atmel_sha.lock);
 	list_del(&sha_dd->list);
 	spin_unlock(&atmel_sha.lock);
+	if (sha_dd->caps.has_dma)
+		atmel_sha_dma_cleanup(sha_dd);
+err_sha_dma:
+err_pdata:
 	iounmap(sha_dd->io_base);
 sha_io_err:
 	clk_put(sha_dd->iclk);
@@ -1078,6 +1461,9 @@ static int atmel_sha_remove(struct platform_device *pdev)
 
 	tasklet_kill(&sha_dd->done_task);
 
+	if (sha_dd->caps.has_dma)
+		atmel_sha_dma_cleanup(sha_dd);
+
 	iounmap(sha_dd->io_base);
 
 	clk_put(sha_dd->iclk);
@@ -1102,6 +1488,6 @@ static struct platform_driver atmel_sha_driver = {
 
 module_platform_driver(atmel_sha_driver);
 
-MODULE_DESCRIPTION("Atmel SHA1/SHA256 hw acceleration support.");
+MODULE_DESCRIPTION("Atmel SHA (1/256/224/384/512) hw acceleration support.");
 MODULE_LICENSE("GPL v2");
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Nicolas Royer - Eukréa Electromatique");
 MODULE_AUTHOR("Nicolas Royer - Eukréa Electromatique");

+ 2 - 0
drivers/crypto/atmel-tdes-regs.h

@@ -69,6 +69,8 @@
 #define	TDES_XTEARNDR_XTEA_RNDS_MASK	(0x3F << 0)
 #define	TDES_XTEARNDR_XTEA_RNDS_OFFSET	0
 
+#define	TDES_HW_VERSION	0xFC
+
 #define TDES_RPR		0x100
 #define TDES_RCR		0x104
 #define TDES_TPR		0x108

+ 341 - 53
drivers/crypto/atmel-tdes.c

@@ -38,29 +38,35 @@
 #include <crypto/des.h>
 #include <crypto/hash.h>
 #include <crypto/internal/hash.h>
+#include <linux/platform_data/crypto-atmel.h>
 #include "atmel-tdes-regs.h"
 #include "atmel-tdes-regs.h"
 
 
 /* TDES flags  */
 /* TDES flags  */
-#define TDES_FLAGS_MODE_MASK		0x007f
+#define TDES_FLAGS_MODE_MASK		0x00ff
 #define TDES_FLAGS_ENCRYPT	BIT(0)
 #define TDES_FLAGS_CBC		BIT(1)
 #define TDES_FLAGS_CFB		BIT(2)
 #define TDES_FLAGS_CFB8		BIT(3)
 #define TDES_FLAGS_CFB16	BIT(4)
 #define TDES_FLAGS_CFB32	BIT(5)
-#define TDES_FLAGS_OFB		BIT(6)
+#define TDES_FLAGS_CFB64	BIT(6)
+#define TDES_FLAGS_OFB		BIT(7)
 
 #define TDES_FLAGS_INIT		BIT(16)
 #define TDES_FLAGS_FAST		BIT(17)
 #define TDES_FLAGS_BUSY		BIT(18)
+#define TDES_FLAGS_DMA		BIT(19)
 
-#define ATMEL_TDES_QUEUE_LENGTH	1
+#define ATMEL_TDES_QUEUE_LENGTH	50
 
 #define CFB8_BLOCK_SIZE		1
 #define CFB16_BLOCK_SIZE	2
 #define CFB32_BLOCK_SIZE	4
-#define CFB64_BLOCK_SIZE	8
 
+struct atmel_tdes_caps {
+	bool	has_dma;
+	u32		has_cfb_3keys;
+};
 
 struct atmel_tdes_dev;
 
@@ -70,12 +76,19 @@ struct atmel_tdes_ctx {
 	int		keylen;
 	u32		key[3*DES_KEY_SIZE / sizeof(u32)];
 	unsigned long	flags;
+
+	u16		block_size;
 };
 
 struct atmel_tdes_reqctx {
 	unsigned long mode;
 };
 
+struct atmel_tdes_dma {
+	struct dma_chan			*chan;
+	struct dma_slave_config dma_conf;
+};
+
 struct atmel_tdes_dev {
 	struct list_head	list;
 	unsigned long		phys_base;
@@ -99,8 +112,10 @@ struct atmel_tdes_dev {
 	size_t				total;
 
 	struct scatterlist	*in_sg;
+	unsigned int		nb_in_sg;
 	size_t				in_offset;
 	struct scatterlist	*out_sg;
+	unsigned int		nb_out_sg;
 	size_t				out_offset;
 
 	size_t	buflen;
@@ -109,10 +124,16 @@ struct atmel_tdes_dev {
 	void	*buf_in;
 	int		dma_in;
 	dma_addr_t	dma_addr_in;
+	struct atmel_tdes_dma	dma_lch_in;
 
 	void	*buf_out;
 	int		dma_out;
 	dma_addr_t	dma_addr_out;
+	struct atmel_tdes_dma	dma_lch_out;
+
+	struct atmel_tdes_caps	caps;
+
+	u32	hw_version;
 };
 
 struct atmel_tdes_drv {
@@ -207,6 +228,31 @@ static int atmel_tdes_hw_init(struct atmel_tdes_dev *dd)
 	return 0;
 }
 
+static inline unsigned int atmel_tdes_get_version(struct atmel_tdes_dev *dd)
+{
+	return atmel_tdes_read(dd, TDES_HW_VERSION) & 0x00000fff;
+}
+
+static void atmel_tdes_hw_version_init(struct atmel_tdes_dev *dd)
+{
+	atmel_tdes_hw_init(dd);
+
+	dd->hw_version = atmel_tdes_get_version(dd);
+
+	dev_info(dd->dev,
+			"version: 0x%x\n", dd->hw_version);
+
+	clk_disable_unprepare(dd->iclk);
+}
+
+static void atmel_tdes_dma_callback(void *data)
+{
+	struct atmel_tdes_dev *dd = data;
+
+	/* dma_lch_out - completed */
+	tasklet_schedule(&dd->done_task);
+}
+
 static int atmel_tdes_write_ctrl(struct atmel_tdes_dev *dd)
 {
 	int err;
@@ -217,7 +263,9 @@ static int atmel_tdes_write_ctrl(struct atmel_tdes_dev *dd)
 	if (err)
 		return err;
 
-	atmel_tdes_write(dd, TDES_PTCR, TDES_PTCR_TXTDIS|TDES_PTCR_RXTDIS);
+	if (!dd->caps.has_dma)
+		atmel_tdes_write(dd, TDES_PTCR,
+			TDES_PTCR_TXTDIS | TDES_PTCR_RXTDIS);
 
 	/* MR register must be set before IV registers */
 	if (dd->ctx->keylen > (DES_KEY_SIZE << 1)) {
@@ -241,6 +289,8 @@ static int atmel_tdes_write_ctrl(struct atmel_tdes_dev *dd)
 			valmr |= TDES_MR_CFBS_16b;
 		else if (dd->flags & TDES_FLAGS_CFB32)
 			valmr |= TDES_MR_CFBS_32b;
+		else if (dd->flags & TDES_FLAGS_CFB64)
+			valmr |= TDES_MR_CFBS_64b;
 	} else if (dd->flags & TDES_FLAGS_OFB) {
 		valmr |= TDES_MR_OPMOD_OFB;
 	}
@@ -262,7 +312,7 @@ static int atmel_tdes_write_ctrl(struct atmel_tdes_dev *dd)
 	return 0;
 }
 
-static int atmel_tdes_crypt_dma_stop(struct atmel_tdes_dev *dd)
+static int atmel_tdes_crypt_pdc_stop(struct atmel_tdes_dev *dd)
 {
 	int err = 0;
 	size_t count;
@@ -288,7 +338,7 @@ static int atmel_tdes_crypt_dma_stop(struct atmel_tdes_dev *dd)
 	return err;
 }
 
-static int atmel_tdes_dma_init(struct atmel_tdes_dev *dd)
+static int atmel_tdes_buff_init(struct atmel_tdes_dev *dd)
 {
 	int err = -ENOMEM;
 
@@ -333,7 +383,7 @@ err_alloc:
 	return err;
 }
 
-static void atmel_tdes_dma_cleanup(struct atmel_tdes_dev *dd)
+static void atmel_tdes_buff_cleanup(struct atmel_tdes_dev *dd)
 {
 	dma_unmap_single(dd->dev, dd->dma_addr_out, dd->buflen,
 			 DMA_FROM_DEVICE);
@@ -343,7 +393,7 @@ static void atmel_tdes_dma_cleanup(struct atmel_tdes_dev *dd)
 	free_page((unsigned long)dd->buf_in);
 }
 
-static int atmel_tdes_crypt_dma(struct crypto_tfm *tfm, dma_addr_t dma_addr_in,
+static int atmel_tdes_crypt_pdc(struct crypto_tfm *tfm, dma_addr_t dma_addr_in,
 			       dma_addr_t dma_addr_out, int length)
 {
 	struct atmel_tdes_ctx *ctx = crypto_tfm_ctx(tfm);
@@ -379,7 +429,76 @@ static int atmel_tdes_crypt_dma(struct crypto_tfm *tfm, dma_addr_t dma_addr_in,
 	return 0;
 }
 
-static int atmel_tdes_crypt_dma_start(struct atmel_tdes_dev *dd)
+static int atmel_tdes_crypt_dma(struct crypto_tfm *tfm, dma_addr_t dma_addr_in,
+			       dma_addr_t dma_addr_out, int length)
+{
+	struct atmel_tdes_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct atmel_tdes_dev *dd = ctx->dd;
+	struct scatterlist sg[2];
+	struct dma_async_tx_descriptor	*in_desc, *out_desc;
+
+	dd->dma_size = length;
+
+	if (!(dd->flags & TDES_FLAGS_FAST)) {
+		dma_sync_single_for_device(dd->dev, dma_addr_in, length,
+					   DMA_TO_DEVICE);
+	}
+
+	if (dd->flags & TDES_FLAGS_CFB8) {
+		dd->dma_lch_in.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_1_BYTE;
+		dd->dma_lch_out.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_1_BYTE;
+	} else if (dd->flags & TDES_FLAGS_CFB16) {
+		dd->dma_lch_in.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_2_BYTES;
+		dd->dma_lch_out.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_2_BYTES;
+	} else {
+		dd->dma_lch_in.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		dd->dma_lch_out.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+	}
+
+	dmaengine_slave_config(dd->dma_lch_in.chan, &dd->dma_lch_in.dma_conf);
+	dmaengine_slave_config(dd->dma_lch_out.chan, &dd->dma_lch_out.dma_conf);
+
+	dd->flags |= TDES_FLAGS_DMA;
+
+	sg_init_table(&sg[0], 1);
+	sg_dma_address(&sg[0]) = dma_addr_in;
+	sg_dma_len(&sg[0]) = length;
+
+	sg_init_table(&sg[1], 1);
+	sg_dma_address(&sg[1]) = dma_addr_out;
+	sg_dma_len(&sg[1]) = length;
+
+	in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, &sg[0],
+				1, DMA_MEM_TO_DEV,
+				DMA_PREP_INTERRUPT  |  DMA_CTRL_ACK);
+	if (!in_desc)
+		return -EINVAL;
+
+	out_desc = dmaengine_prep_slave_sg(dd->dma_lch_out.chan, &sg[1],
+				1, DMA_DEV_TO_MEM,
+				DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+	if (!out_desc)
+		return -EINVAL;
+
+	out_desc->callback = atmel_tdes_dma_callback;
+	out_desc->callback_param = dd;
+
+	dmaengine_submit(out_desc);
+	dma_async_issue_pending(dd->dma_lch_out.chan);
+
+	dmaengine_submit(in_desc);
+	dma_async_issue_pending(dd->dma_lch_in.chan);
+
+	return 0;
+}
+
+static int atmel_tdes_crypt_start(struct atmel_tdes_dev *dd)
 {
 	struct crypto_tfm *tfm = crypto_ablkcipher_tfm(
 					crypto_ablkcipher_reqtfm(dd->req));
@@ -387,23 +506,23 @@ static int atmel_tdes_crypt_dma_start(struct atmel_tdes_dev *dd)
 	size_t count;
 	dma_addr_t addr_in, addr_out;
 
-	if (sg_is_last(dd->in_sg) && sg_is_last(dd->out_sg)) {
+	if ((!dd->in_offset) && (!dd->out_offset)) {
 		/* check for alignment */
-		in = IS_ALIGNED((u32)dd->in_sg->offset, sizeof(u32));
-		out = IS_ALIGNED((u32)dd->out_sg->offset, sizeof(u32));
-
+		in = IS_ALIGNED((u32)dd->in_sg->offset, sizeof(u32)) &&
+			IS_ALIGNED(dd->in_sg->length, dd->ctx->block_size);
+		out = IS_ALIGNED((u32)dd->out_sg->offset, sizeof(u32)) &&
+			IS_ALIGNED(dd->out_sg->length, dd->ctx->block_size);
 		fast = in && out;
+
+		if (sg_dma_len(dd->in_sg) != sg_dma_len(dd->out_sg))
+			fast = 0;
 	}
 
+
 	if (fast)  {
 		count = min(dd->total, sg_dma_len(dd->in_sg));
 		count = min(count, sg_dma_len(dd->out_sg));
 
-		if (count != dd->total) {
-			pr_err("request length != buffer length\n");
-			return -EINVAL;
-		}
-
 		err = dma_map_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
 		if (!err) {
 			dev_err(dd->dev, "dma_map_sg() error\n");
@@ -433,13 +552,16 @@ static int atmel_tdes_crypt_dma_start(struct atmel_tdes_dev *dd)
 		addr_out = dd->dma_addr_out;
 
 		dd->flags &= ~TDES_FLAGS_FAST;
-
 	}
 
 	dd->total -= count;
 
-	err = atmel_tdes_crypt_dma(tfm, addr_in, addr_out, count);
-	if (err) {
+	if (dd->caps.has_dma)
+		err = atmel_tdes_crypt_dma(tfm, addr_in, addr_out, count);
+	else
+		err = atmel_tdes_crypt_pdc(tfm, addr_in, addr_out, count);
+
+	if (err && (dd->flags & TDES_FLAGS_FAST)) {
 		dma_unmap_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
 		dma_unmap_sg(dd->dev, dd->out_sg, 1, DMA_FROM_DEVICE);
 	}
@@ -447,7 +569,6 @@ static int atmel_tdes_crypt_dma_start(struct atmel_tdes_dev *dd)
 	return err;
 }
 
-
 static void atmel_tdes_finish_req(struct atmel_tdes_dev *dd, int err)
 {
 	struct ablkcipher_request *req = dd->req;
@@ -506,7 +627,7 @@ static int atmel_tdes_handle_queue(struct atmel_tdes_dev *dd,
 
 	err = atmel_tdes_write_ctrl(dd);
 	if (!err)
-		err = atmel_tdes_crypt_dma_start(dd);
+		err = atmel_tdes_crypt_start(dd);
 	if (err) {
 		/* des_task will not finish it, so do it here */
 		atmel_tdes_finish_req(dd, err);
@@ -516,41 +637,145 @@ static int atmel_tdes_handle_queue(struct atmel_tdes_dev *dd,
 	return ret;
 }
 
+static int atmel_tdes_crypt_dma_stop(struct atmel_tdes_dev *dd)
+{
+	int err = -EINVAL;
+	size_t count;
+
+	if (dd->flags & TDES_FLAGS_DMA) {
+		err = 0;
+		if  (dd->flags & TDES_FLAGS_FAST) {
+			dma_unmap_sg(dd->dev, dd->out_sg, 1, DMA_FROM_DEVICE);
+			dma_unmap_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
+		} else {
+			dma_sync_single_for_device(dd->dev, dd->dma_addr_out,
+				dd->dma_size, DMA_FROM_DEVICE);
+
+			/* copy data */
+			count = atmel_tdes_sg_copy(&dd->out_sg, &dd->out_offset,
+				dd->buf_out, dd->buflen, dd->dma_size, 1);
+			if (count != dd->dma_size) {
+				err = -EINVAL;
+				pr_err("not all data converted: %u\n", count);
+			}
+		}
+	}
+	return err;
+}
 
 static int atmel_tdes_crypt(struct ablkcipher_request *req, unsigned long mode)
 {
 	struct atmel_tdes_ctx *ctx = crypto_ablkcipher_ctx(
 			crypto_ablkcipher_reqtfm(req));
 	struct atmel_tdes_reqctx *rctx = ablkcipher_request_ctx(req);
-	struct atmel_tdes_dev *dd;
 
 	if (mode & TDES_FLAGS_CFB8) {
 		if (!IS_ALIGNED(req->nbytes, CFB8_BLOCK_SIZE)) {
 			pr_err("request size is not exact amount of CFB8 blocks\n");
 			return -EINVAL;
 		}
+		ctx->block_size = CFB8_BLOCK_SIZE;
 	} else if (mode & TDES_FLAGS_CFB16) {
 		if (!IS_ALIGNED(req->nbytes, CFB16_BLOCK_SIZE)) {
 			pr_err("request size is not exact amount of CFB16 blocks\n");
 			return -EINVAL;
 		}
+		ctx->block_size = CFB16_BLOCK_SIZE;
 	} else if (mode & TDES_FLAGS_CFB32) {
 		if (!IS_ALIGNED(req->nbytes, CFB32_BLOCK_SIZE)) {
 			pr_err("request size is not exact amount of CFB32 blocks\n");
 			return -EINVAL;
 		}
-	} else if (!IS_ALIGNED(req->nbytes, DES_BLOCK_SIZE)) {
-		pr_err("request size is not exact amount of DES blocks\n");
-		return -EINVAL;
+		ctx->block_size = CFB32_BLOCK_SIZE;
+	} else {
+		if (!IS_ALIGNED(req->nbytes, DES_BLOCK_SIZE)) {
+			pr_err("request size is not exact amount of DES blocks\n");
+			return -EINVAL;
+		}
+		ctx->block_size = DES_BLOCK_SIZE;
 	}
 
-	dd = atmel_tdes_find_dev(ctx);
-	if (!dd)
+	rctx->mode = mode;
+
+	return atmel_tdes_handle_queue(ctx->dd, req);
+}
+
+static bool atmel_tdes_filter(struct dma_chan *chan, void *slave)
+{
+	struct at_dma_slave	*sl = slave;
+
+	if (sl && sl->dma_dev == chan->device->dev) {
+		chan->private = sl;
+		return true;
+	} else {
+		return false;
+	}
+}
+
+static int atmel_tdes_dma_init(struct atmel_tdes_dev *dd,
+			struct crypto_platform_data *pdata)
+{
+	int err = -ENOMEM;
+	dma_cap_mask_t mask_in, mask_out;
+
+	if (pdata && pdata->dma_slave->txdata.dma_dev &&
+		pdata->dma_slave->rxdata.dma_dev) {
+
+		/* Try to grab 2 DMA channels */
+		dma_cap_zero(mask_in);
+		dma_cap_set(DMA_SLAVE, mask_in);
+
+		dd->dma_lch_in.chan = dma_request_channel(mask_in,
+				atmel_tdes_filter, &pdata->dma_slave->rxdata);
+
+		if (!dd->dma_lch_in.chan)
+			goto err_dma_in;
+
+		dd->dma_lch_in.dma_conf.direction = DMA_MEM_TO_DEV;
+		dd->dma_lch_in.dma_conf.dst_addr = dd->phys_base +
+			TDES_IDATA1R;
+		dd->dma_lch_in.dma_conf.src_maxburst = 1;
+		dd->dma_lch_in.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		dd->dma_lch_in.dma_conf.dst_maxburst = 1;
+		dd->dma_lch_in.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		dd->dma_lch_in.dma_conf.device_fc = false;
+
+		dma_cap_zero(mask_out);
+		dma_cap_set(DMA_SLAVE, mask_out);
+		dd->dma_lch_out.chan = dma_request_channel(mask_out,
+				atmel_tdes_filter, &pdata->dma_slave->txdata);
+
+		if (!dd->dma_lch_out.chan)
+			goto err_dma_out;
+
+		dd->dma_lch_out.dma_conf.direction = DMA_DEV_TO_MEM;
+		dd->dma_lch_out.dma_conf.src_addr = dd->phys_base +
+			TDES_ODATA1R;
+		dd->dma_lch_out.dma_conf.src_maxburst = 1;
+		dd->dma_lch_out.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		dd->dma_lch_out.dma_conf.dst_maxburst = 1;
+		dd->dma_lch_out.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		dd->dma_lch_out.dma_conf.device_fc = false;
+
+		return 0;
+	} else {
 		return -ENODEV;
+	}
 
-	rctx->mode = mode;
+err_dma_out:
+	dma_release_channel(dd->dma_lch_in.chan);
+err_dma_in:
+	return err;
+}
 
 
-	return atmel_tdes_handle_queue(dd, req);
+static void atmel_tdes_dma_cleanup(struct atmel_tdes_dev *dd)
+{
+	dma_release_channel(dd->dma_lch_in.chan);
+	dma_release_channel(dd->dma_lch_out.chan);
 }
 
 static int atmel_des_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
@@ -590,7 +815,8 @@ static int atmel_tdes_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
 	/*
 	 * HW bug in cfb 3-keys mode.
 	 */
-	if (strstr(alg_name, "cfb") && (keylen != 2*DES_KEY_SIZE)) {
+	if (!ctx->dd->caps.has_cfb_3keys && strstr(alg_name, "cfb")
+			&& (keylen != 2*DES_KEY_SIZE)) {
 		crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
 		return -EINVAL;
 	} else if ((keylen != 2*DES_KEY_SIZE) && (keylen != 3*DES_KEY_SIZE)) {
@@ -678,8 +904,15 @@ static int atmel_tdes_ofb_decrypt(struct ablkcipher_request *req)
 
 static int atmel_tdes_cra_init(struct crypto_tfm *tfm)
 {
+	struct atmel_tdes_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct atmel_tdes_dev *dd;
+
 	tfm->crt_ablkcipher.reqsize = sizeof(struct atmel_tdes_reqctx);
 
+	dd = atmel_tdes_find_dev(ctx);
+	if (!dd)
+		return -ENODEV;
+
 	return 0;
 }
 
@@ -695,7 +928,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= DES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x7,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -715,7 +948,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= DES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x7,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -736,7 +969,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= DES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x7,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -778,7 +1011,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= CFB16_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x1,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -799,7 +1032,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= CFB32_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x3,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -820,7 +1053,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= DES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x7,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -841,7 +1074,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= DES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x7,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -861,7 +1094,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= DES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x7,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -882,7 +1115,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= DES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x7,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -924,7 +1157,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= CFB16_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x1,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -945,7 +1178,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= CFB32_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x3,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -966,7 +1199,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= DES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x7,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -994,14 +1227,24 @@ static void atmel_tdes_done_task(unsigned long data)
 	struct atmel_tdes_dev *dd = (struct atmel_tdes_dev *) data;
 	int err;
 
-	err = atmel_tdes_crypt_dma_stop(dd);
+	if (!(dd->flags & TDES_FLAGS_DMA))
+		err = atmel_tdes_crypt_pdc_stop(dd);
+	else
+		err = atmel_tdes_crypt_dma_stop(dd);
 
 	err = dd->err ? : err;
 
 	if (dd->total && !err) {
-		err = atmel_tdes_crypt_dma_start(dd);
+		if (dd->flags & TDES_FLAGS_FAST) {
+			dd->in_sg = sg_next(dd->in_sg);
+			dd->out_sg = sg_next(dd->out_sg);
+			if (!dd->in_sg || !dd->out_sg)
+				err = -EINVAL;
+		}
 		if (!err)
-			return;
+			err = atmel_tdes_crypt_start(dd);
+		if (!err)
+			return; /* DMA started. Not finishing. */
 	}
 
 	atmel_tdes_finish_req(dd, err);
@@ -1053,9 +1296,31 @@ err_tdes_algs:
 	return err;
 }
 
+static void atmel_tdes_get_cap(struct atmel_tdes_dev *dd)
+{
+
+	dd->caps.has_dma = 0;
+	dd->caps.has_cfb_3keys = 0;
+
+	/* keep only major version number */
+	switch (dd->hw_version & 0xf00) {
+	case 0x700:
+		dd->caps.has_dma = 1;
+		dd->caps.has_cfb_3keys = 1;
+		break;
+	case 0x600:
+		break;
+	default:
+		dev_warn(dd->dev,
+				"Unmanaged tdes version, set minimum capabilities\n");
+		break;
+	}
+}
+
 static int atmel_tdes_probe(struct platform_device *pdev)
 {
 	struct atmel_tdes_dev *tdes_dd;
+	struct crypto_platform_data	*pdata;
 	struct device *dev = &pdev->dev;
 	struct resource *tdes_res;
 	unsigned long tdes_phys_size;
@@ -1109,7 +1374,7 @@ static int atmel_tdes_probe(struct platform_device *pdev)
 	}
 
 	/* Initializing the clock */
-	tdes_dd->iclk = clk_get(&pdev->dev, NULL);
+	tdes_dd->iclk = clk_get(&pdev->dev, "tdes_clk");
 	if (IS_ERR(tdes_dd->iclk)) {
 		dev_err(dev, "clock initialization failed.\n");
 		err = PTR_ERR(tdes_dd->iclk);
@@ -1123,9 +1388,25 @@ static int atmel_tdes_probe(struct platform_device *pdev)
 		goto tdes_io_err;
 	}
 
-	err = atmel_tdes_dma_init(tdes_dd);
+	atmel_tdes_hw_version_init(tdes_dd);
+
+	atmel_tdes_get_cap(tdes_dd);
+
+	err = atmel_tdes_buff_init(tdes_dd);
 	if (err)
-		goto err_tdes_dma;
+		goto err_tdes_buff;
+
+	if (tdes_dd->caps.has_dma) {
+		pdata = pdev->dev.platform_data;
+		if (!pdata) {
+			dev_err(&pdev->dev, "platform data not available\n");
+			err = -ENXIO;
+			goto err_pdata;
+		}
+		err = atmel_tdes_dma_init(tdes_dd, pdata);
+		if (err)
+			goto err_tdes_dma;
+	}
 
 	spin_lock(&atmel_tdes.lock);
 	list_add_tail(&tdes_dd->list, &atmel_tdes.dev_list);
@@ -1143,8 +1424,12 @@ err_algs:
 	spin_lock(&atmel_tdes.lock);
 	list_del(&tdes_dd->list);
 	spin_unlock(&atmel_tdes.lock);
-	atmel_tdes_dma_cleanup(tdes_dd);
+	if (tdes_dd->caps.has_dma)
+		atmel_tdes_dma_cleanup(tdes_dd);
 err_tdes_dma:
+err_pdata:
+	atmel_tdes_buff_cleanup(tdes_dd);
+err_tdes_buff:
 	iounmap(tdes_dd->io_base);
 tdes_io_err:
 	clk_put(tdes_dd->iclk);
@@ -1178,7 +1463,10 @@ static int atmel_tdes_remove(struct platform_device *pdev)
 	tasklet_kill(&tdes_dd->done_task);
 	tasklet_kill(&tdes_dd->queue_task);
 
-	atmel_tdes_dma_cleanup(tdes_dd);
+	if (tdes_dd->caps.has_dma)
+		atmel_tdes_dma_cleanup(tdes_dd);
+
+	atmel_tdes_buff_cleanup(tdes_dd);
 
 	iounmap(tdes_dd->io_base);
 

+ 3 - 3
drivers/crypto/bfin_crc.c

@@ -151,7 +151,7 @@ static int bfin_crypto_crc_init(struct ahash_request *req)
 	struct bfin_crypto_crc_reqctx *ctx = ahash_request_ctx(req);
 	struct bfin_crypto_crc *crc;
 
-	dev_dbg(crc->dev, "crc_init\n");
+	dev_dbg(ctx->crc->dev, "crc_init\n");
 	spin_lock_bh(&crc_list.lock);
 	list_for_each_entry(crc, &crc_list.dev_list, list) {
 		crc_ctx->crc = crc;
@@ -160,7 +160,7 @@ static int bfin_crypto_crc_init(struct ahash_request *req)
 	spin_unlock_bh(&crc_list.lock);
 
 	if (sg_count(req->src) > CRC_MAX_DMA_DESC) {
-		dev_dbg(crc->dev, "init: requested sg list is too big > %d\n",
+		dev_dbg(ctx->crc->dev, "init: requested sg list is too big > %d\n",
 			CRC_MAX_DMA_DESC);
 		return -EINVAL;
 	}
@@ -175,7 +175,7 @@ static int bfin_crypto_crc_init(struct ahash_request *req)
 	/* init crc results */
 	put_unaligned_le32(crc_ctx->key, req->result);
 
-	dev_dbg(crc->dev, "init: digest size: %d\n",
+	dev_dbg(ctx->crc->dev, "init: digest size: %d\n",
 		crypto_ahash_digestsize(tfm));
 
 	return bfin_crypto_crc_init_hw(crc, crc_ctx->key);

+ 1 - 1
drivers/crypto/caam/Kconfig

@@ -78,7 +78,7 @@ config CRYPTO_DEV_FSL_CAAM_AHASH_API
 	tristate "Register hash algorithm implementations with Crypto API"
 	depends on CRYPTO_DEV_FSL_CAAM
 	default y
-	select CRYPTO_AHASH
+	select CRYPTO_HASH
 	help
 	  Selecting this will offload ahash for users of the
 	  scatterlist crypto API to the SEC4 via job ring.

+ 6 - 0
drivers/crypto/caam/caamalg.c

@@ -1693,6 +1693,7 @@ static struct caam_alg_template driver_algs[] = {
 		.name = "authenc(hmac(sha224),cbc(aes))",
 		.driver_name = "authenc-hmac-sha224-cbc-aes-caam",
 		.blocksize = AES_BLOCK_SIZE,
+		.type = CRYPTO_ALG_TYPE_AEAD,
 		.template_aead = {
 			.setkey = aead_setkey,
 			.setauthsize = aead_setauthsize,
@@ -1732,6 +1733,7 @@ static struct caam_alg_template driver_algs[] = {
 		.name = "authenc(hmac(sha384),cbc(aes))",
 		.driver_name = "authenc-hmac-sha384-cbc-aes-caam",
 		.blocksize = AES_BLOCK_SIZE,
+		.type = CRYPTO_ALG_TYPE_AEAD,
 		.template_aead = {
 			.setkey = aead_setkey,
 			.setauthsize = aead_setauthsize,
@@ -1810,6 +1812,7 @@ static struct caam_alg_template driver_algs[] = {
 		.name = "authenc(hmac(sha224),cbc(des3_ede))",
 		.driver_name = "authenc-hmac-sha224-cbc-des3_ede-caam",
 		.blocksize = DES3_EDE_BLOCK_SIZE,
+		.type = CRYPTO_ALG_TYPE_AEAD,
 		.template_aead = {
 			.setkey = aead_setkey,
 			.setauthsize = aead_setauthsize,
@@ -1849,6 +1852,7 @@ static struct caam_alg_template driver_algs[] = {
 		.name = "authenc(hmac(sha384),cbc(des3_ede))",
 		.driver_name = "authenc-hmac-sha384-cbc-des3_ede-caam",
 		.blocksize = DES3_EDE_BLOCK_SIZE,
+		.type = CRYPTO_ALG_TYPE_AEAD,
 		.template_aead = {
 			.setkey = aead_setkey,
 			.setauthsize = aead_setauthsize,
@@ -1926,6 +1930,7 @@ static struct caam_alg_template driver_algs[] = {
 		.name = "authenc(hmac(sha224),cbc(des))",
 		.driver_name = "authenc-hmac-sha224-cbc-des-caam",
 		.blocksize = DES_BLOCK_SIZE,
+		.type = CRYPTO_ALG_TYPE_AEAD,
 		.template_aead = {
 			.setkey = aead_setkey,
 			.setauthsize = aead_setauthsize,
@@ -1965,6 +1970,7 @@ static struct caam_alg_template driver_algs[] = {
 		.name = "authenc(hmac(sha384),cbc(des))",
 		.driver_name = "authenc-hmac-sha384-cbc-des-caam",
 		.blocksize = DES_BLOCK_SIZE,
+		.type = CRYPTO_ALG_TYPE_AEAD,
 		.template_aead = {
 			.setkey = aead_setkey,
 			.setauthsize = aead_setauthsize,

+ 2 - 2
drivers/crypto/caam/caamhash.c

@@ -411,7 +411,7 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash)
 	return 0;
 }
 
-static u32 gen_split_hash_key(struct caam_hash_ctx *ctx, const u8 *key_in,
+static int gen_split_hash_key(struct caam_hash_ctx *ctx, const u8 *key_in,
 			      u32 keylen)
 {
 	return gen_split_key(ctx->jrdev, ctx->key, ctx->split_key_len,
@@ -420,7 +420,7 @@ static u32 gen_split_hash_key(struct caam_hash_ctx *ctx, const u8 *key_in,
 }
 
 /* Digest hash size if it is too large */
-static u32 hash_digest_key(struct caam_hash_ctx *ctx, const u8 *key_in,
+static int hash_digest_key(struct caam_hash_ctx *ctx, const u8 *key_in,
 			   u32 *keylen, u8 *key_out, u32 digestsize)
 {
 	struct device *jrdev = ctx->jrdev;

+ 3 - 0
drivers/crypto/caam/ctrl.c

@@ -304,6 +304,9 @@ static int caam_probe(struct platform_device *pdev)
 			caam_remove(pdev);
 			return ret;
 		}
+
+		/* Enable RDB bit so that RNG works faster */
+		setbits32(&topregs->ctrl.scfgr, SCFGR_RDBENABLE);
 	}
 
 	/* NOTE: RTIC detection ought to go here, around Si time */

+ 5 - 5
drivers/crypto/caam/error.c

@@ -36,7 +36,7 @@ static void report_jump_idx(u32 status, char *outstr)
 
 static void report_ccb_status(u32 status, char *outstr)
 {
-	char *cha_id_list[] = {
+	static const char * const cha_id_list[] = {
 		"",
 		"AES",
 		"DES",
@@ -51,7 +51,7 @@ static void report_ccb_status(u32 status, char *outstr)
 		"ZUCE",
 		"ZUCA",
 	};
-	char *err_id_list[] = {
+	static const char * const err_id_list[] = {
 		"No error.",
 		"Mode error.",
 		"Data size error.",
@@ -69,7 +69,7 @@ static void report_ccb_status(u32 status, char *outstr)
 		"Invalid CHA combination was selected",
 		"Invalid CHA selected.",
 	};
-	char *rng_err_id_list[] = {
+	static const char * const rng_err_id_list[] = {
 		"",
 		"",
 		"",
@@ -117,7 +117,7 @@ static void report_jump_status(u32 status, char *outstr)
 
 static void report_deco_status(u32 status, char *outstr)
 {
-	const struct {
+	static const struct {
 		u8 value;
 		char *error_text;
 	} desc_error_list[] = {
@@ -245,7 +245,7 @@ static void report_cond_code_status(u32 status, char *outstr)
 
 char *caam_jr_strstatus(char *outstr, u32 status)
 {
-	struct stat_src {
+	static const struct stat_src {
 		void (*report_ssed)(u32 status, char *outstr);
 		char *error;
 	} status_src[] = {

+ 1 - 0
drivers/crypto/caam/intern.h

@@ -41,6 +41,7 @@ struct caam_jrentry_info {
 /* Private sub-storage for a single JobR */
 struct caam_drv_private_jr {
 	struct device *parentdev;	/* points back to controller dev */
+	struct platform_device *jr_pdev;/* points to platform device for JR */
 	int ridx;
 	struct caam_job_ring __iomem *rregs;	/* JobR's register space */
 	struct tasklet_struct irqtask;

+ 4 - 0
drivers/crypto/caam/jr.c

@@ -407,6 +407,7 @@ int caam_jr_shutdown(struct device *dev)
 	dma_free_coherent(dev, sizeof(struct jr_outentry) * JOBR_DEPTH,
 			  jrp->outring, outbusaddr);
 	kfree(jrp->entinfo);
+	of_device_unregister(jrp->jr_pdev);
 
 	return ret;
 }
@@ -454,6 +455,8 @@ int caam_jr_probe(struct platform_device *pdev, struct device_node *np,
 		kfree(jrpriv);
 		return -EINVAL;
 	}
+
+	jrpriv->jr_pdev = jr_pdev;
 	jrdev = &jr_pdev->dev;
 	dev_set_drvdata(jrdev, jrpriv);
 	ctrlpriv->jrdev[ring] = jrdev;
@@ -472,6 +475,7 @@ int caam_jr_probe(struct platform_device *pdev, struct device_node *np,
 	/* Now do the platform independent part */
 	error = caam_jr_init(jrdev); /* now turn on hardware */
 	if (error) {
+		of_device_unregister(jr_pdev);
 		kfree(jrpriv);
 		return error;
 	}

+ 1 - 1
drivers/crypto/caam/key_gen.c

@@ -44,7 +44,7 @@ Split key generation-----------------------------------------------
 [06] 0x64260028    fifostr: class2 mdsplit-jdk len=40
 			@0xffe04000
 */
-u32 gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len,
+int gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len,
 		  int split_key_pad_len, const u8 *key_in, u32 keylen,
 		  u32 alg_op)
 {

+ 1 - 1
drivers/crypto/caam/key_gen.h

@@ -12,6 +12,6 @@ struct split_key_result {
 
 void split_key_done(struct device *dev, u32 *desc, u32 err, void *context);
 
-u32 gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len,
+int gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len,
 		    int split_key_pad_len, const u8 *key_in, u32 keylen,
 		    u32 alg_op);

+ 3 - 1
drivers/crypto/caam/regs.h

@@ -252,7 +252,8 @@ struct caam_ctrl {
 	/* Read/Writable					        */
 	u32 rsvd1;
 	u32 mcr;		/* MCFG      Master Config Register  */
-	u32 rsvd2[2];
+	u32 rsvd2;
+	u32 scfgr;		/* SCFGR, Security Config Register */
 
 	/* Bus Access Configuration Section			010-11f */
 	/* Read/Writable                                                */
@@ -299,6 +300,7 @@ struct caam_ctrl {
 #define MCFGR_WDFAIL		0x20000000 /* DECO watchdog force-fail */
 #define MCFGR_DMA_RESET		0x10000000
 #define MCFGR_LONG_PTR		0x00010000 /* Use >32-bit desc addressing */
+#define SCFGR_RDBENABLE		0x00000400
 
 /* AXI read cache control */
 #define MCFGR_ARCACHE_SHIFT	12

+ 2 - 13
drivers/crypto/omap-aes.c

@@ -636,7 +636,7 @@ static void omap_aes_finish_req(struct omap_aes_dev *dd, int err)
 
 	pr_debug("err: %d\n", err);
 
-	pm_runtime_put_sync(dd->dev);
+	pm_runtime_put(dd->dev);
 	dd->flags &= ~FLAGS_BUSY;
 
 	req->base.complete(&req->base, err);
@@ -1248,18 +1248,7 @@ static struct platform_driver omap_aes_driver = {
 	},
 };
 
-static int __init omap_aes_mod_init(void)
-{
-	return  platform_driver_register(&omap_aes_driver);
-}
-
-static void __exit omap_aes_mod_exit(void)
-{
-	platform_driver_unregister(&omap_aes_driver);
-}
-
-module_init(omap_aes_mod_init);
-module_exit(omap_aes_mod_exit);
+module_platform_driver(omap_aes_driver);
 
 MODULE_DESCRIPTION("OMAP AES hw acceleration support.");
 MODULE_LICENSE("GPL v2");

+ 2 - 13
drivers/crypto/omap-sham.c

@@ -923,7 +923,7 @@ static void omap_sham_finish_req(struct ahash_request *req, int err)
 	dd->flags &= ~(BIT(FLAGS_BUSY) | BIT(FLAGS_FINAL) | BIT(FLAGS_CPU) |
 			BIT(FLAGS_DMA_READY) | BIT(FLAGS_OUTPUT_READY));
 
-	pm_runtime_put_sync(dd->dev);
+	pm_runtime_put(dd->dev);
 
 	if (req->base.complete)
 		req->base.complete(&req->base, err);
@@ -1813,18 +1813,7 @@ static struct platform_driver omap_sham_driver = {
 	},
 };
 
-static int __init omap_sham_mod_init(void)
-{
-	return platform_driver_register(&omap_sham_driver);
-}
-
-static void __exit omap_sham_mod_exit(void)
-{
-	platform_driver_unregister(&omap_sham_driver);
-}
-
-module_init(omap_sham_mod_init);
-module_exit(omap_sham_mod_exit);
+module_platform_driver(omap_sham_driver);
 
 MODULE_DESCRIPTION("OMAP SHA1/MD5 hw acceleration support.");
 MODULE_LICENSE("GPL v2");

+ 1 - 3
drivers/crypto/picoxcell_crypto.c

@@ -1688,8 +1688,6 @@ static const struct of_device_id spacc_of_id_table[] = {
 	{ .compatible = "picochip,spacc-l2" },
 	{}
 };
-#else /* CONFIG_OF */
-#define spacc_of_id_table NULL
 #endif /* CONFIG_OF */
 
 static bool spacc_is_compatible(struct platform_device *pdev,
@@ -1874,7 +1872,7 @@ static struct platform_driver spacc_driver = {
 #ifdef CONFIG_PM
 		.pm	= &spacc_pm_ops,
 #endif /* CONFIG_PM */
-		.of_match_table	= spacc_of_id_table,
+		.of_match_table	= of_match_ptr(spacc_of_id_table),
 	},
 	.id_table	= spacc_id_table,
 };

+ 1070 - 0
drivers/crypto/sahara.c

@@ -0,0 +1,1070 @@
+/*
+ * Cryptographic API.
+ *
+ * Support for SAHARA cryptographic accelerator.
+ *
+ * Copyright (c) 2013 Vista Silicon S.L.
+ * Author: Javier Martin <javier.martin@vista-silicon.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Based on omap-aes.c and tegra-aes.c
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/aes.h>
+
+#include <linux/clk.h>
+#include <linux/crypto.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+
+#define SAHARA_NAME "sahara"
+#define SAHARA_VERSION_3	3
+#define SAHARA_TIMEOUT_MS	1000
+#define SAHARA_MAX_HW_DESC	2
+#define SAHARA_MAX_HW_LINK	20
+
+#define FLAGS_MODE_MASK		0x000f
+#define FLAGS_ENCRYPT		BIT(0)
+#define FLAGS_CBC		BIT(1)
+#define FLAGS_NEW_KEY		BIT(3)
+#define FLAGS_BUSY		4
+
+#define SAHARA_HDR_BASE			0x00800000
+#define SAHARA_HDR_SKHA_ALG_AES	0
+#define SAHARA_HDR_SKHA_OP_ENC		(1 << 2)
+#define SAHARA_HDR_SKHA_MODE_ECB	(0 << 3)
+#define SAHARA_HDR_SKHA_MODE_CBC	(1 << 3)
+#define SAHARA_HDR_FORM_DATA		(5 << 16)
+#define SAHARA_HDR_FORM_KEY		(8 << 16)
+#define SAHARA_HDR_LLO			(1 << 24)
+#define SAHARA_HDR_CHA_SKHA		(1 << 28)
+#define SAHARA_HDR_CHA_MDHA		(2 << 28)
+#define SAHARA_HDR_PARITY_BIT		(1 << 31)
+
+/* SAHARA can only process one request at a time */
+#define SAHARA_QUEUE_LENGTH	1
+
+#define SAHARA_REG_VERSION	0x00
+#define SAHARA_REG_DAR		0x04
+#define SAHARA_REG_CONTROL	0x08
+#define		SAHARA_CONTROL_SET_THROTTLE(x)	(((x) & 0xff) << 24)
+#define		SAHARA_CONTROL_SET_MAXBURST(x)	(((x) & 0xff) << 16)
+#define		SAHARA_CONTROL_RNG_AUTORSD	(1 << 7)
+#define		SAHARA_CONTROL_ENABLE_INT	(1 << 4)
+#define SAHARA_REG_CMD		0x0C
+#define		SAHARA_CMD_RESET		(1 << 0)
+#define		SAHARA_CMD_CLEAR_INT		(1 << 8)
+#define		SAHARA_CMD_CLEAR_ERR		(1 << 9)
+#define		SAHARA_CMD_SINGLE_STEP		(1 << 10)
+#define		SAHARA_CMD_MODE_BATCH		(1 << 16)
+#define		SAHARA_CMD_MODE_DEBUG		(1 << 18)
+#define	SAHARA_REG_STATUS	0x10
+#define		SAHARA_STATUS_GET_STATE(x)	((x) & 0x7)
+#define			SAHARA_STATE_IDLE	0
+#define			SAHARA_STATE_BUSY	1
+#define			SAHARA_STATE_ERR	2
+#define			SAHARA_STATE_FAULT	3
+#define			SAHARA_STATE_COMPLETE	4
+#define			SAHARA_STATE_COMP_FLAG	(1 << 2)
+#define		SAHARA_STATUS_DAR_FULL		(1 << 3)
+#define		SAHARA_STATUS_ERROR		(1 << 4)
+#define		SAHARA_STATUS_SECURE		(1 << 5)
+#define		SAHARA_STATUS_FAIL		(1 << 6)
+#define		SAHARA_STATUS_INIT		(1 << 7)
+#define		SAHARA_STATUS_RNG_RESEED	(1 << 8)
+#define		SAHARA_STATUS_ACTIVE_RNG	(1 << 9)
+#define		SAHARA_STATUS_ACTIVE_MDHA	(1 << 10)
+#define		SAHARA_STATUS_ACTIVE_SKHA	(1 << 11)
+#define		SAHARA_STATUS_MODE_BATCH	(1 << 16)
+#define		SAHARA_STATUS_MODE_DEDICATED	(1 << 17)
+#define		SAHARA_STATUS_MODE_DEBUG	(1 << 18)
+#define		SAHARA_STATUS_GET_ISTATE(x)	(((x) >> 24) & 0xff)
+#define SAHARA_REG_ERRSTATUS	0x14
+#define		SAHARA_ERRSTATUS_GET_SOURCE(x)	((x) & 0xf)
+#define			SAHARA_ERRSOURCE_CHA	14
+#define			SAHARA_ERRSOURCE_DMA	15
+#define		SAHARA_ERRSTATUS_DMA_DIR	(1 << 8)
+#define		SAHARA_ERRSTATUS_GET_DMASZ(x)	(((x) >> 9) & 0x3)
+#define		SAHARA_ERRSTATUS_GET_DMASRC(x) (((x) >> 13) & 0x7)
+#define		SAHARA_ERRSTATUS_GET_CHASRC(x)	(((x) >> 16) & 0xfff)
+#define		SAHARA_ERRSTATUS_GET_CHAERR(x)	(((x) >> 28) & 0x3)
+#define SAHARA_REG_FADDR	0x18
+#define SAHARA_REG_CDAR		0x1C
+#define SAHARA_REG_IDAR		0x20
+
+struct sahara_hw_desc {
+	u32		hdr;
+	u32		len1;
+	dma_addr_t	p1;
+	u32		len2;
+	dma_addr_t	p2;
+	dma_addr_t	next;
+};
+
+struct sahara_hw_link {
+	u32		len;
+	dma_addr_t	p;
+	dma_addr_t	next;
+};
+
+struct sahara_ctx {
+	struct sahara_dev *dev;
+	unsigned long flags;
+	int keylen;
+	u8 key[AES_KEYSIZE_128];
+	struct crypto_ablkcipher *fallback;
+};
+
+struct sahara_aes_reqctx {
+	unsigned long mode;
+};
+
+struct sahara_dev {
+	struct device		*device;
+	void __iomem		*regs_base;
+	struct clk		*clk_ipg;
+	struct clk		*clk_ahb;
+
+	struct sahara_ctx	*ctx;
+	spinlock_t		lock;
+	struct crypto_queue	queue;
+	unsigned long		flags;
+
+	struct tasklet_struct	done_task;
+	struct tasklet_struct	queue_task;
+
+	struct sahara_hw_desc	*hw_desc[SAHARA_MAX_HW_DESC];
+	dma_addr_t		hw_phys_desc[SAHARA_MAX_HW_DESC];
+
+	u8			*key_base;
+	dma_addr_t		key_phys_base;
+
+	u8			*iv_base;
+	dma_addr_t		iv_phys_base;
+
+	struct sahara_hw_link	*hw_link[SAHARA_MAX_HW_LINK];
+	dma_addr_t		hw_phys_link[SAHARA_MAX_HW_LINK];
+
+	struct ablkcipher_request *req;
+	size_t			total;
+	struct scatterlist	*in_sg;
+	unsigned int		nb_in_sg;
+	struct scatterlist	*out_sg;
+	unsigned int		nb_out_sg;
+
+	u32			error;
+	struct timer_list	watchdog;
+};
+
+static struct sahara_dev *dev_ptr;
+
+static inline void sahara_write(struct sahara_dev *dev, u32 data, u32 reg)
+{
+	writel(data, dev->regs_base + reg);
+}
+
+static inline unsigned int sahara_read(struct sahara_dev *dev, u32 reg)
+{
+	return readl(dev->regs_base + reg);
+}
+
+static u32 sahara_aes_key_hdr(struct sahara_dev *dev)
+{
+	u32 hdr = SAHARA_HDR_BASE | SAHARA_HDR_SKHA_ALG_AES |
+			SAHARA_HDR_FORM_KEY | SAHARA_HDR_LLO |
+			SAHARA_HDR_CHA_SKHA | SAHARA_HDR_PARITY_BIT;
+
+	if (dev->flags & FLAGS_CBC) {
+		hdr |= SAHARA_HDR_SKHA_MODE_CBC;
+		hdr ^= SAHARA_HDR_PARITY_BIT;
+	}
+
+	if (dev->flags & FLAGS_ENCRYPT) {
+		hdr |= SAHARA_HDR_SKHA_OP_ENC;
+		hdr ^= SAHARA_HDR_PARITY_BIT;
+	}
+
+	return hdr;
+}
+
+static u32 sahara_aes_data_link_hdr(struct sahara_dev *dev)
+{
+	return SAHARA_HDR_BASE | SAHARA_HDR_FORM_DATA |
+			SAHARA_HDR_CHA_SKHA | SAHARA_HDR_PARITY_BIT;
+}
+
+static int sahara_sg_length(struct scatterlist *sg,
+			    unsigned int total)
+{
+	int sg_nb;
+	unsigned int len;
+	struct scatterlist *sg_list;
+
+	sg_nb = 0;
+	sg_list = sg;
+
+	while (total) {
+		len = min(sg_list->length, total);
+
+		sg_nb++;
+		total -= len;
+
+		sg_list = sg_next(sg_list);
+		if (!sg_list)
+			total = 0;
+	}
+
+	return sg_nb;
+}
+
+static char *sahara_err_src[16] = {
+	"No error",
+	"Header error",
+	"Descriptor length error",
+	"Descriptor length or pointer error",
+	"Link length error",
+	"Link pointer error",
+	"Input buffer error",
+	"Output buffer error",
+	"Output buffer starvation",
+	"Internal state fault",
+	"General descriptor problem",
+	"Reserved",
+	"Descriptor address error",
+	"Link address error",
+	"CHA error",
+	"DMA error"
+};
+
+static char *sahara_err_dmasize[4] = {
+	"Byte transfer",
+	"Half-word transfer",
+	"Word transfer",
+	"Reserved"
+};
+
+static char *sahara_err_dmasrc[8] = {
+	"No error",
+	"AHB bus error",
+	"Internal IP bus error",
+	"Parity error",
+	"DMA crosses 256 byte boundary",
+	"DMA is busy",
+	"Reserved",
+	"DMA HW error"
+};
+
+static char *sahara_cha_errsrc[12] = {
+	"Input buffer non-empty",
+	"Illegal address",
+	"Illegal mode",
+	"Illegal data size",
+	"Illegal key size",
+	"Write during processing",
+	"CTX read during processing",
+	"HW error",
+	"Input buffer disabled/underflow",
+	"Output buffer disabled/overflow",
+	"DES key parity error",
+	"Reserved"
+};
+
+static char *sahara_cha_err[4] = { "No error", "SKHA", "MDHA", "RNG" };
+
+static void sahara_decode_error(struct sahara_dev *dev, unsigned int error)
+{
+	u8 source = SAHARA_ERRSTATUS_GET_SOURCE(error);
+	u16 chasrc = ffs(SAHARA_ERRSTATUS_GET_CHASRC(error));
+
+	dev_err(dev->device, "%s: Error Register = 0x%08x\n", __func__, error);
+
+	dev_err(dev->device, "	- %s.\n", sahara_err_src[source]);
+
+	if (source == SAHARA_ERRSOURCE_DMA) {
+		if (error & SAHARA_ERRSTATUS_DMA_DIR)
+			dev_err(dev->device, "		* DMA read.\n");
+		else
+			dev_err(dev->device, "		* DMA write.\n");
+
+		dev_err(dev->device, "		* %s.\n",
+		       sahara_err_dmasize[SAHARA_ERRSTATUS_GET_DMASZ(error)]);
+		dev_err(dev->device, "		* %s.\n",
+		       sahara_err_dmasrc[SAHARA_ERRSTATUS_GET_DMASRC(error)]);
+	} else if (source == SAHARA_ERRSOURCE_CHA) {
+		dev_err(dev->device, "		* %s.\n",
+			sahara_cha_errsrc[chasrc]);
+		dev_err(dev->device, "		* %s.\n",
+		       sahara_cha_err[SAHARA_ERRSTATUS_GET_CHAERR(error)]);
+	}
+	dev_err(dev->device, "\n");
+}
+
+static char *sahara_state[4] = { "Idle", "Busy", "Error", "HW Fault" };
+
+static void sahara_decode_status(struct sahara_dev *dev, unsigned int status)
+{
+	u8 state;
+
+	if (!IS_ENABLED(DEBUG))
+		return;
+
+	state = SAHARA_STATUS_GET_STATE(status);
+
+	dev_dbg(dev->device, "%s: Status Register = 0x%08x\n",
+		__func__, status);
+
+	dev_dbg(dev->device, "	- State = %d:\n", state);
+	if (state & SAHARA_STATE_COMP_FLAG)
+		dev_dbg(dev->device, "		* Descriptor completed. IRQ pending.\n");
+
+	dev_dbg(dev->device, "		* %s.\n",
+	       sahara_state[state & ~SAHARA_STATE_COMP_FLAG]);
+
+	if (status & SAHARA_STATUS_DAR_FULL)
+		dev_dbg(dev->device, "	- DAR Full.\n");
+	if (status & SAHARA_STATUS_ERROR)
+		dev_dbg(dev->device, "	- Error.\n");
+	if (status & SAHARA_STATUS_SECURE)
+		dev_dbg(dev->device, "	- Secure.\n");
+	if (status & SAHARA_STATUS_FAIL)
+		dev_dbg(dev->device, "	- Fail.\n");
+	if (status & SAHARA_STATUS_RNG_RESEED)
+		dev_dbg(dev->device, "	- RNG Reseed Request.\n");
+	if (status & SAHARA_STATUS_ACTIVE_RNG)
+		dev_dbg(dev->device, "	- RNG Active.\n");
+	if (status & SAHARA_STATUS_ACTIVE_MDHA)
+		dev_dbg(dev->device, "	- MDHA Active.\n");
+	if (status & SAHARA_STATUS_ACTIVE_SKHA)
+		dev_dbg(dev->device, "	- SKHA Active.\n");
+
+	if (status & SAHARA_STATUS_MODE_BATCH)
+		dev_dbg(dev->device, "	- Batch Mode.\n");
+	else if (status & SAHARA_STATUS_MODE_DEDICATED)
+		dev_dbg(dev->device, "	- Dedicated Mode.\n");
+	else if (status & SAHARA_STATUS_MODE_DEBUG)
+		dev_dbg(dev->device, "	- Debug Mode.\n");
+
+	dev_dbg(dev->device, "	- Internal state = 0x%02x\n",
+	       SAHARA_STATUS_GET_ISTATE(status));
+
+	dev_dbg(dev->device, "Current DAR: 0x%08x\n",
+		sahara_read(dev, SAHARA_REG_CDAR));
+	dev_dbg(dev->device, "Initial DAR: 0x%08x\n\n",
+		sahara_read(dev, SAHARA_REG_IDAR));
+}
+
+static void sahara_dump_descriptors(struct sahara_dev *dev)
+{
+	int i;
+
+	if (!IS_ENABLED(DEBUG))
+		return;
+
+	for (i = 0; i < SAHARA_MAX_HW_DESC; i++) {
+		dev_dbg(dev->device, "Descriptor (%d) (0x%08x):\n",
+			i, dev->hw_phys_desc[i]);
+		dev_dbg(dev->device, "\thdr = 0x%08x\n", dev->hw_desc[i]->hdr);
+		dev_dbg(dev->device, "\tlen1 = %u\n", dev->hw_desc[i]->len1);
+		dev_dbg(dev->device, "\tp1 = 0x%08x\n", dev->hw_desc[i]->p1);
+		dev_dbg(dev->device, "\tlen2 = %u\n", dev->hw_desc[i]->len2);
+		dev_dbg(dev->device, "\tp2 = 0x%08x\n", dev->hw_desc[i]->p2);
+		dev_dbg(dev->device, "\tnext = 0x%08x\n",
+			dev->hw_desc[i]->next);
+	}
+	dev_dbg(dev->device, "\n");
+}
+
+static void sahara_dump_links(struct sahara_dev *dev)
+{
+	int i;
+
+	if (!IS_ENABLED(DEBUG))
+		return;
+
+	for (i = 0; i < SAHARA_MAX_HW_LINK; i++) {
+		dev_dbg(dev->device, "Link (%d) (0x%08x):\n",
+			i, dev->hw_phys_link[i]);
+		dev_dbg(dev->device, "\tlen = %u\n", dev->hw_link[i]->len);
+		dev_dbg(dev->device, "\tp = 0x%08x\n", dev->hw_link[i]->p);
+		dev_dbg(dev->device, "\tnext = 0x%08x\n",
+			dev->hw_link[i]->next);
+	}
+	dev_dbg(dev->device, "\n");
+}
+
+static void sahara_aes_done_task(unsigned long data)
+{
+	struct sahara_dev *dev = (struct sahara_dev *)data;
+
+	dma_unmap_sg(dev->device, dev->out_sg, dev->nb_out_sg,
+		DMA_TO_DEVICE);
+	dma_unmap_sg(dev->device, dev->in_sg, dev->nb_in_sg,
+		DMA_FROM_DEVICE);
+
+	spin_lock(&dev->lock);
+	clear_bit(FLAGS_BUSY, &dev->flags);
+	spin_unlock(&dev->lock);
+
+	dev->req->base.complete(&dev->req->base, dev->error);
+}
+
+void sahara_watchdog(unsigned long data)
+{
+	struct sahara_dev *dev = (struct sahara_dev *)data;
+	unsigned int err = sahara_read(dev, SAHARA_REG_ERRSTATUS);
+	unsigned int stat = sahara_read(dev, SAHARA_REG_STATUS);
+
+	sahara_decode_status(dev, stat);
+	sahara_decode_error(dev, err);
+	dev->error = -ETIMEDOUT;
+	sahara_aes_done_task(data);
+}
+
+static int sahara_hw_descriptor_create(struct sahara_dev *dev)
+{
+	struct sahara_ctx *ctx = dev->ctx;
+	struct scatterlist *sg;
+	int ret;
+	int i, j;
+
+	/* Copy new key if necessary */
+	if (ctx->flags & FLAGS_NEW_KEY) {
+		memcpy(dev->key_base, ctx->key, ctx->keylen);
+		ctx->flags &= ~FLAGS_NEW_KEY;
+
+		if (dev->flags & FLAGS_CBC) {
+			dev->hw_desc[0]->len1 = AES_BLOCK_SIZE;
+			dev->hw_desc[0]->p1 = dev->iv_phys_base;
+		} else {
+			dev->hw_desc[0]->len1 = 0;
+			dev->hw_desc[0]->p1 = 0;
+		}
+		dev->hw_desc[0]->len2 = ctx->keylen;
+		dev->hw_desc[0]->p2 = dev->key_phys_base;
+		dev->hw_desc[0]->next = dev->hw_phys_desc[1];
+	}
+	dev->hw_desc[0]->hdr = sahara_aes_key_hdr(dev);
+
+	dev->nb_in_sg = sahara_sg_length(dev->in_sg, dev->total);
+	dev->nb_out_sg = sahara_sg_length(dev->out_sg, dev->total);
+	if ((dev->nb_in_sg + dev->nb_out_sg) > SAHARA_MAX_HW_LINK) {
+		dev_err(dev->device, "not enough hw links (%d)\n",
+			dev->nb_in_sg + dev->nb_out_sg);
+		return -EINVAL;
+	}
+
+	ret = dma_map_sg(dev->device, dev->in_sg, dev->nb_in_sg,
+			 DMA_TO_DEVICE);
+	if (ret != dev->nb_in_sg) {
+		dev_err(dev->device, "couldn't map in sg\n");
+		goto unmap_in;
+	}
+	ret = dma_map_sg(dev->device, dev->out_sg, dev->nb_out_sg,
+			 DMA_FROM_DEVICE);
+	if (ret != dev->nb_out_sg) {
+		dev_err(dev->device, "couldn't map out sg\n");
+		goto unmap_out;
+	}
+
+	/* Create input links */
+	dev->hw_desc[1]->p1 = dev->hw_phys_link[0];
+	sg = dev->in_sg;
+	for (i = 0; i < dev->nb_in_sg; i++) {
+		dev->hw_link[i]->len = sg->length;
+		dev->hw_link[i]->p = sg->dma_address;
+		if (i == (dev->nb_in_sg - 1)) {
+			dev->hw_link[i]->next = 0;
+		} else {
+			dev->hw_link[i]->next = dev->hw_phys_link[i + 1];
+			sg = sg_next(sg);
+		}
+	}
+
+	/* Create output links */
+	dev->hw_desc[1]->p2 = dev->hw_phys_link[i];
+	sg = dev->out_sg;
+	for (j = i; j < dev->nb_out_sg + i; j++) {
+		dev->hw_link[j]->len = sg->length;
+		dev->hw_link[j]->p = sg->dma_address;
+		if (j == (dev->nb_out_sg + i - 1)) {
+			dev->hw_link[j]->next = 0;
+		} else {
+			dev->hw_link[j]->next = dev->hw_phys_link[j + 1];
+			sg = sg_next(sg);
+		}
+	}
+
+	/* Fill remaining fields of hw_desc[1] */
+	dev->hw_desc[1]->hdr = sahara_aes_data_link_hdr(dev);
+	dev->hw_desc[1]->len1 = dev->total;
+	dev->hw_desc[1]->len2 = dev->total;
+	dev->hw_desc[1]->next = 0;
+
+	sahara_dump_descriptors(dev);
+	sahara_dump_links(dev);
+
+	/* Start processing descriptor chain. */
+	mod_timer(&dev->watchdog,
+		  jiffies + msecs_to_jiffies(SAHARA_TIMEOUT_MS));
+	sahara_write(dev, dev->hw_phys_desc[0], SAHARA_REG_DAR);
+
+	return 0;
+
+unmap_out:
+	dma_unmap_sg(dev->device, dev->out_sg, dev->nb_out_sg,
+		DMA_TO_DEVICE);
+unmap_in:
+	dma_unmap_sg(dev->device, dev->in_sg, dev->nb_in_sg,
+		DMA_FROM_DEVICE);
+
+	return -EINVAL;
+}
+
+static void sahara_aes_queue_task(unsigned long data)
+{
+	struct sahara_dev *dev = (struct sahara_dev *)data;
+	struct crypto_async_request *async_req, *backlog;
+	struct sahara_ctx *ctx;
+	struct sahara_aes_reqctx *rctx;
+	struct ablkcipher_request *req;
+	int ret;
+
+	spin_lock(&dev->lock);
+	backlog = crypto_get_backlog(&dev->queue);
+	async_req = crypto_dequeue_request(&dev->queue);
+	if (!async_req)
+		clear_bit(FLAGS_BUSY, &dev->flags);
+	spin_unlock(&dev->lock);
+
+	if (!async_req)
+		return;
+
+	if (backlog)
+		backlog->complete(backlog, -EINPROGRESS);
+
+	req = ablkcipher_request_cast(async_req);
+
+	/* Request is ready to be dispatched by the device */
+	dev_dbg(dev->device,
+		"dispatch request (nbytes=%d, src=%p, dst=%p)\n",
+		req->nbytes, req->src, req->dst);
+
+	/* assign new request to device */
+	dev->req = req;
+	dev->total = req->nbytes;
+	dev->in_sg = req->src;
+	dev->out_sg = req->dst;
+
+	rctx = ablkcipher_request_ctx(req);
+	ctx = crypto_ablkcipher_ctx(crypto_ablkcipher_reqtfm(req));
+	rctx->mode &= FLAGS_MODE_MASK;
+	dev->flags = (dev->flags & ~FLAGS_MODE_MASK) | rctx->mode;
+
+	if ((dev->flags & FLAGS_CBC) && req->info)
+		memcpy(dev->iv_base, req->info, AES_KEYSIZE_128);
+
+	/* assign new context to device */
+	ctx->dev = dev;
+	dev->ctx = ctx;
+
+	ret = sahara_hw_descriptor_create(dev);
+	if (ret < 0) {
+		spin_lock(&dev->lock);
+		clear_bit(FLAGS_BUSY, &dev->flags);
+		spin_unlock(&dev->lock);
+		dev->req->base.complete(&dev->req->base, ret);
+	}
+}
+
+static int sahara_aes_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
+			     unsigned int keylen)
+{
+	struct sahara_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	int ret;
+
+	ctx->keylen = keylen;
+
+	/* SAHARA only supports 128bit keys */
+	if (keylen == AES_KEYSIZE_128) {
+		memcpy(ctx->key, key, keylen);
+		ctx->flags |= FLAGS_NEW_KEY;
+		return 0;
+	}
+
+	if (keylen != AES_KEYSIZE_128 &&
+	    keylen != AES_KEYSIZE_192 && keylen != AES_KEYSIZE_256)
+		return -EINVAL;
+
+	/*
+	 * The requested key size is not supported by HW, do a fallback.
+	 */
+	ctx->fallback->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK;
+	ctx->fallback->base.crt_flags |=
+		(tfm->base.crt_flags & CRYPTO_TFM_REQ_MASK);
+
+	ret = crypto_ablkcipher_setkey(ctx->fallback, key, keylen);
+	if (ret) {
+		struct crypto_tfm *tfm_aux = crypto_ablkcipher_tfm(tfm);
+
+		tfm_aux->crt_flags &= ~CRYPTO_TFM_RES_MASK;
+		tfm_aux->crt_flags |=
+			(ctx->fallback->base.crt_flags & CRYPTO_TFM_RES_MASK);
+	}
+	return ret;
+}
+
+static int sahara_aes_crypt(struct ablkcipher_request *req, unsigned long mode)
+{
+	struct sahara_ctx *ctx = crypto_ablkcipher_ctx(
+		crypto_ablkcipher_reqtfm(req));
+	struct sahara_aes_reqctx *rctx = ablkcipher_request_ctx(req);
+	struct sahara_dev *dev = dev_ptr;
+	int err = 0;
+	int busy;
+
+	dev_dbg(dev->device, "nbytes: %d, enc: %d, cbc: %d\n",
+		req->nbytes, !!(mode & FLAGS_ENCRYPT), !!(mode & FLAGS_CBC));
+
+	if (!IS_ALIGNED(req->nbytes, AES_BLOCK_SIZE)) {
+		dev_err(dev->device,
+			"request size is not exact amount of AES blocks\n");
+		return -EINVAL;
+	}
+
+	ctx->dev = dev;
+
+	rctx->mode = mode;
+	spin_lock_bh(&dev->lock);
+	err = ablkcipher_enqueue_request(&dev->queue, req);
+	busy = test_and_set_bit(FLAGS_BUSY, &dev->flags);
+	spin_unlock_bh(&dev->lock);
+
+	if (!busy)
+		tasklet_schedule(&dev->queue_task);
+
+	return err;
+}
+
+static int sahara_aes_ecb_encrypt(struct ablkcipher_request *req)
+{
+	struct crypto_tfm *tfm =
+		crypto_ablkcipher_tfm(crypto_ablkcipher_reqtfm(req));
+	struct sahara_ctx *ctx = crypto_ablkcipher_ctx(
+		crypto_ablkcipher_reqtfm(req));
+	int err;
+
+	if (unlikely(ctx->keylen != AES_KEYSIZE_128)) {
+		ablkcipher_request_set_tfm(req, ctx->fallback);
+		err = crypto_ablkcipher_encrypt(req);
+		ablkcipher_request_set_tfm(req, __crypto_ablkcipher_cast(tfm));
+		return err;
+	}
+
+	return sahara_aes_crypt(req, FLAGS_ENCRYPT);
+}
+
+static int sahara_aes_ecb_decrypt(struct ablkcipher_request *req)
+{
+	struct crypto_tfm *tfm =
+		crypto_ablkcipher_tfm(crypto_ablkcipher_reqtfm(req));
+	struct sahara_ctx *ctx = crypto_ablkcipher_ctx(
+		crypto_ablkcipher_reqtfm(req));
+	int err;
+
+	if (unlikely(ctx->keylen != AES_KEYSIZE_128)) {
+		ablkcipher_request_set_tfm(req, ctx->fallback);
+		err = crypto_ablkcipher_decrypt(req);
+		ablkcipher_request_set_tfm(req, __crypto_ablkcipher_cast(tfm));
+		return err;
+	}
+
+	return sahara_aes_crypt(req, 0);
+}
+
+static int sahara_aes_cbc_encrypt(struct ablkcipher_request *req)
+{
+	struct crypto_tfm *tfm =
+		crypto_ablkcipher_tfm(crypto_ablkcipher_reqtfm(req));
+	struct sahara_ctx *ctx = crypto_ablkcipher_ctx(
+		crypto_ablkcipher_reqtfm(req));
+	int err;
+
+	if (unlikely(ctx->keylen != AES_KEYSIZE_128)) {
+		ablkcipher_request_set_tfm(req, ctx->fallback);
+		err = crypto_ablkcipher_encrypt(req);
+		ablkcipher_request_set_tfm(req, __crypto_ablkcipher_cast(tfm));
+		return err;
+	}
+
+	return sahara_aes_crypt(req, FLAGS_ENCRYPT | FLAGS_CBC);
+}
+
+static int sahara_aes_cbc_decrypt(struct ablkcipher_request *req)
+{
+	struct crypto_tfm *tfm =
+		crypto_ablkcipher_tfm(crypto_ablkcipher_reqtfm(req));
+	struct sahara_ctx *ctx = crypto_ablkcipher_ctx(
+		crypto_ablkcipher_reqtfm(req));
+	int err;
+
+	if (unlikely(ctx->keylen != AES_KEYSIZE_128)) {
+		ablkcipher_request_set_tfm(req, ctx->fallback);
+		err = crypto_ablkcipher_decrypt(req);
+		ablkcipher_request_set_tfm(req, __crypto_ablkcipher_cast(tfm));
+		return err;
+	}
+
+	return sahara_aes_crypt(req, FLAGS_CBC);
+}
+
+static int sahara_aes_cra_init(struct crypto_tfm *tfm)
+{
+	const char *name = tfm->__crt_alg->cra_name;
+	struct sahara_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->fallback = crypto_alloc_ablkcipher(name, 0,
+				CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK);
+	if (IS_ERR(ctx->fallback)) {
+		pr_err("Error allocating fallback algo %s\n", name);
+		return PTR_ERR(ctx->fallback);
+	}
+
+	tfm->crt_ablkcipher.reqsize = sizeof(struct sahara_aes_reqctx);
+
+	return 0;
+}
+
+static void sahara_aes_cra_exit(struct crypto_tfm *tfm)
+{
+	struct sahara_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	if (ctx->fallback)
+		crypto_free_ablkcipher(ctx->fallback);
+	ctx->fallback = NULL;
+}
+
+static struct crypto_alg aes_algs[] = {
+{
+	.cra_name		= "ecb(aes)",
+	.cra_driver_name	= "sahara-ecb-aes",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER |
+			CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct sahara_ctx),
+	.cra_alignmask		= 0x0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= sahara_aes_cra_init,
+	.cra_exit		= sahara_aes_cra_exit,
+	.cra_u.ablkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.setkey		= sahara_aes_setkey,
+		.encrypt	= sahara_aes_ecb_encrypt,
+		.decrypt	= sahara_aes_ecb_decrypt,
+	}
+}, {
+	.cra_name		= "cbc(aes)",
+	.cra_driver_name	= "sahara-cbc-aes",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER |
+			CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct sahara_ctx),
+	.cra_alignmask		= 0x0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= sahara_aes_cra_init,
+	.cra_exit		= sahara_aes_cra_exit,
+	.cra_u.ablkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= sahara_aes_setkey,
+		.encrypt	= sahara_aes_cbc_encrypt,
+		.decrypt	= sahara_aes_cbc_decrypt,
+	}
+}
+};
+
+static irqreturn_t sahara_irq_handler(int irq, void *data)
+{
+	struct sahara_dev *dev = (struct sahara_dev *)data;
+	unsigned int stat = sahara_read(dev, SAHARA_REG_STATUS);
+	unsigned int err = sahara_read(dev, SAHARA_REG_ERRSTATUS);
+
+	del_timer(&dev->watchdog);
+
+	sahara_write(dev, SAHARA_CMD_CLEAR_INT | SAHARA_CMD_CLEAR_ERR,
+		     SAHARA_REG_CMD);
+
+	sahara_decode_status(dev, stat);
+
+	if (SAHARA_STATUS_GET_STATE(stat) == SAHARA_STATE_BUSY) {
+		return IRQ_NONE;
+	} else if (SAHARA_STATUS_GET_STATE(stat) == SAHARA_STATE_COMPLETE) {
+		dev->error = 0;
+	} else {
+		sahara_decode_error(dev, err);
+		dev->error = -EINVAL;
+	}
+
+	tasklet_schedule(&dev->done_task);
+
+	return IRQ_HANDLED;
+}
+
+static int sahara_register_algs(struct sahara_dev *dev)
+{
+	int err, i, j;
+
+	for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
+		INIT_LIST_HEAD(&aes_algs[i].cra_list);
+		err = crypto_register_alg(&aes_algs[i]);
+		if (err)
+			goto err_aes_algs;
+	}
+
+	return 0;
+
+err_aes_algs:
+	for (j = 0; j < i; j++)
+		crypto_unregister_alg(&aes_algs[j]);
+
+	return err;
+}
+
+static void sahara_unregister_algs(struct sahara_dev *dev)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(aes_algs); i++)
+		crypto_unregister_alg(&aes_algs[i]);
+}
+
+static struct platform_device_id sahara_platform_ids[] = {
+	{ .name = "sahara-imx27" },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(platform, sahara_platform_ids);
+
+static const struct of_device_id sahara_dt_ids[] = {
+	{ .compatible = "fsl,imx27-sahara" },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, sahara_dt_ids);
+
+static int sahara_probe(struct platform_device *pdev)
+{
+	struct sahara_dev *dev;
+	struct resource *res;
+	u32 version;
+	int irq;
+	int err;
+	int i;
+
+	dev = devm_kzalloc(&pdev->dev, sizeof(struct sahara_dev), GFP_KERNEL);
+	if (dev == NULL) {
+		dev_err(&pdev->dev, "unable to alloc data struct.\n");
+		return -ENOMEM;
+	}
+
+	dev->device = &pdev->dev;
+	platform_set_drvdata(pdev, dev);
+
+	/* Get the base address */
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(&pdev->dev, "failed to get memory region resource\n");
+		return -ENODEV;
+	}
+
+	if (devm_request_mem_region(&pdev->dev, res->start,
+			resource_size(res), SAHARA_NAME) == NULL) {
+		dev_err(&pdev->dev, "failed to request memory region\n");
+		return -ENOENT;
+	}
+	dev->regs_base = devm_ioremap(&pdev->dev, res->start,
+				      resource_size(res));
+	if (!dev->regs_base) {
+		dev_err(&pdev->dev, "failed to ioremap address region\n");
+		return -ENOENT;
+	}
+
+	/* Get the IRQ */
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0) {
+		dev_err(&pdev->dev, "failed to get irq resource\n");
+		return irq;
+	}
+
+	if (devm_request_irq(&pdev->dev, irq, sahara_irq_handler,
+		0, SAHARA_NAME, dev) < 0) {
+		dev_err(&pdev->dev, "failed to request irq\n");
+		return -ENOENT;
+	}
+
+	/* clocks */
+	dev->clk_ipg = devm_clk_get(&pdev->dev, "ipg");
+	if (IS_ERR(dev->clk_ipg)) {
+		dev_err(&pdev->dev, "Could not get ipg clock\n");
+		return PTR_ERR(dev->clk_ipg);
+	}
+
+	dev->clk_ahb = devm_clk_get(&pdev->dev, "ahb");
+	if (IS_ERR(dev->clk_ahb)) {
+		dev_err(&pdev->dev, "Could not get ahb clock\n");
+		return PTR_ERR(dev->clk_ahb);
+	}
+
+	/* Allocate HW descriptors */
+	dev->hw_desc[0] = dma_alloc_coherent(&pdev->dev,
+			SAHARA_MAX_HW_DESC * sizeof(struct sahara_hw_desc),
+			&dev->hw_phys_desc[0], GFP_KERNEL);
+	if (!dev->hw_desc[0]) {
+		dev_err(&pdev->dev, "Could not allocate hw descriptors\n");
+		return -ENOMEM;
+	}
+	dev->hw_desc[1] = dev->hw_desc[0] + 1;
+	dev->hw_phys_desc[1] = dev->hw_phys_desc[0] +
+				sizeof(struct sahara_hw_desc);
+
+	/* Allocate space for iv and key */
+	dev->key_base = dma_alloc_coherent(&pdev->dev, 2 * AES_KEYSIZE_128,
+				&dev->key_phys_base, GFP_KERNEL);
+	if (!dev->key_base) {
+		dev_err(&pdev->dev, "Could not allocate memory for key\n");
+		err = -ENOMEM;
+		goto err_key;
+	}
+	dev->iv_base = dev->key_base + AES_KEYSIZE_128;
+	dev->iv_phys_base = dev->key_phys_base + AES_KEYSIZE_128;
+
+	/* Allocate space for HW links */
+	dev->hw_link[0] = dma_alloc_coherent(&pdev->dev,
+			SAHARA_MAX_HW_LINK * sizeof(struct sahara_hw_link),
+			&dev->hw_phys_link[0], GFP_KERNEL);
+	if (!dev->hw_link[0]) {
+		dev_err(&pdev->dev, "Could not allocate hw links\n");
+		err = -ENOMEM;
+		goto err_link;
+	}
+	for (i = 1; i < SAHARA_MAX_HW_LINK; i++) {
+		dev->hw_phys_link[i] = dev->hw_phys_link[i - 1] +
+					sizeof(struct sahara_hw_link);
+		dev->hw_link[i] = dev->hw_link[i - 1] + 1;
+	}
+
+	crypto_init_queue(&dev->queue, SAHARA_QUEUE_LENGTH);
+
+	dev_ptr = dev;
+
+	tasklet_init(&dev->queue_task, sahara_aes_queue_task,
+		     (unsigned long)dev);
+	tasklet_init(&dev->done_task, sahara_aes_done_task,
+		     (unsigned long)dev);
+
+	init_timer(&dev->watchdog);
+	dev->watchdog.function = &sahara_watchdog;
+	dev->watchdog.data = (unsigned long)dev;
+
+	clk_prepare_enable(dev->clk_ipg);
+	clk_prepare_enable(dev->clk_ahb);
+
+	version = sahara_read(dev, SAHARA_REG_VERSION);
+	if (version != SAHARA_VERSION_3) {
+		dev_err(&pdev->dev, "SAHARA version %d not supported\n",
+			version);
+		err = -ENODEV;
+		goto err_algs;
+	}
+
+	sahara_write(dev, SAHARA_CMD_RESET | SAHARA_CMD_MODE_BATCH,
+		     SAHARA_REG_CMD);
+	sahara_write(dev, SAHARA_CONTROL_SET_THROTTLE(0) |
+			SAHARA_CONTROL_SET_MAXBURST(8) |
+			SAHARA_CONTROL_RNG_AUTORSD |
+			SAHARA_CONTROL_ENABLE_INT,
+			SAHARA_REG_CONTROL);
+
+	err = sahara_register_algs(dev);
+	if (err)
+		goto err_algs;
+
+	dev_info(&pdev->dev, "SAHARA version %d initialized\n", version);
+
+	return 0;
+
+err_algs:
+	dma_free_coherent(&pdev->dev,
+			  SAHARA_MAX_HW_LINK * sizeof(struct sahara_hw_link),
+			  dev->hw_link[0], dev->hw_phys_link[0]);
+	clk_disable_unprepare(dev->clk_ipg);
+	clk_disable_unprepare(dev->clk_ahb);
+	dev_ptr = NULL;
+err_link:
+	dma_free_coherent(&pdev->dev,
+			  2 * AES_KEYSIZE_128,
+			  dev->key_base, dev->key_phys_base);
+err_key:
+	dma_free_coherent(&pdev->dev,
+			  SAHARA_MAX_HW_DESC * sizeof(struct sahara_hw_desc),
+			  dev->hw_desc[0], dev->hw_phys_desc[0]);
+
+	return err;
+}
+
+static int sahara_remove(struct platform_device *pdev)
+{
+	struct sahara_dev *dev = platform_get_drvdata(pdev);
+
+	/* Stop new requests and in-flight tasklets before freeing their buffers */
+	sahara_unregister_algs(dev);
+
+	tasklet_kill(&dev->done_task);
+	tasklet_kill(&dev->queue_task);
+
+	dma_free_coherent(&pdev->dev,
+			  SAHARA_MAX_HW_LINK * sizeof(struct sahara_hw_link),
+			  dev->hw_link[0], dev->hw_phys_link[0]);
+	dma_free_coherent(&pdev->dev,
+			  2 * AES_KEYSIZE_128,
+			  dev->key_base, dev->key_phys_base);
+	dma_free_coherent(&pdev->dev,
+			  SAHARA_MAX_HW_DESC * sizeof(struct sahara_hw_desc),
+			  dev->hw_desc[0], dev->hw_phys_desc[0]);
+
+	clk_disable_unprepare(dev->clk_ipg);
+	clk_disable_unprepare(dev->clk_ahb);
+
+	dev_ptr = NULL;
+
+	return 0;
+}
+
+static struct platform_driver sahara_driver = {
+	.probe		= sahara_probe,
+	.remove		= sahara_remove,
+	.driver		= {
+		.name	= SAHARA_NAME,
+		.owner	= THIS_MODULE,
+		.of_match_table = of_match_ptr(sahara_dt_ids),
+	},
+	.id_table = sahara_platform_ids,
+};
+
+module_platform_driver(sahara_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Javier Martin <javier.martin@vista-silicon.com>");
+MODULE_DESCRIPTION("SAHARA2 HW crypto accelerator");
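
For reference, a minimal sketch of how a kernel caller could exercise the
"cbc(aes)" ablkcipher registered above. The demo_* names are illustrative
and not part of the driver; a real caller would also propagate the status
delivered to the callback instead of discarding it.

	#include <crypto/aes.h>
	#include <linux/completion.h>
	#include <linux/crypto.h>
	#include <linux/scatterlist.h>

	static void demo_cb(struct crypto_async_request *areq, int err)
	{
		if (err != -EINPROGRESS)
			complete(areq->data);
	}

	static int demo_cbc_aes_encrypt(const u8 *key, u8 *iv,
					struct scatterlist *src,
					struct scatterlist *dst,
					unsigned int nbytes)
	{
		DECLARE_COMPLETION_ONSTACK(done);
		struct crypto_ablkcipher *tfm;
		struct ablkcipher_request *req;
		int ret;

		/* Resolves to "sahara-cbc-aes" (priority 300) when the driver is loaded */
		tfm = crypto_alloc_ablkcipher("cbc(aes)", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		ret = crypto_ablkcipher_setkey(tfm, key, AES_KEYSIZE_128);
		if (ret)
			goto out;

		req = ablkcipher_request_alloc(tfm, GFP_KERNEL);
		if (!req) {
			ret = -ENOMEM;
			goto out;
		}

		ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
						demo_cb, &done);
		/* nbytes must be a whole number of AES blocks; the driver enforces this */
		ablkcipher_request_set_crypt(req, src, dst, nbytes, iv);

		ret = crypto_ablkcipher_encrypt(req);
		if (ret == -EINPROGRESS || ret == -EBUSY) {
			wait_for_completion(&done);
			ret = 0;
		}

		ablkcipher_request_free(req);
	out:
		crypto_free_ablkcipher(tfm);
		return ret;
	}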

+ 3 - 3
drivers/crypto/ux500/hash/hash_core.c

@@ -938,6 +938,7 @@ static int hash_dma_final(struct ahash_request *req)
 	if (!ctx->device->dma.nents) {
 		dev_err(device_data->dev, "[%s] "
 				"ctx->device->dma.nents = 0", __func__);
+		ret = ctx->device->dma.nents;
 		goto out;
 	}
 
@@ -945,6 +946,7 @@ static int hash_dma_final(struct ahash_request *req)
 	if (bytes_written != req->nbytes) {
 		dev_err(device_data->dev, "[%s] "
 				"hash_dma_write() failed!", __func__);
+		ret = bytes_written;
 		goto out;
 	}
 
@@ -1367,14 +1369,12 @@ static int hash_setkey(struct crypto_ahash *tfm,
 	/**
 	 * Freed in final.
 	 */
-	ctx->key = kmalloc(keylen, GFP_KERNEL);
+	ctx->key = kmemdup(key, keylen, GFP_KERNEL);
 	if (!ctx->key) {
 		pr_err(DEV_DBG_NAME " [%s] Failed to allocate ctx->key "
 		       "for %d\n", __func__, alg);
 		return -ENOMEM;
 	}
-
-	memcpy(ctx->key, key, keylen);
 	ctx->keylen = keylen;
 
 	return ret;
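
The kmemdup() conversion above is behavior-preserving; side by side, under
the same error handling:

	/* before: allocate, then copy in a separate step */
	ctx->key = kmalloc(keylen, GFP_KERNEL);
	if (!ctx->key)
		return -ENOMEM;
	memcpy(ctx->key, key, keylen);

	/* after: one call with identical allocation and copy semantics */
	ctx->key = kmemdup(key, keylen, GFP_KERNEL);
	if (!ctx->key)
		return -ENOMEM;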

+ 5 - 0
include/crypto/sha.h

@@ -87,4 +87,9 @@ struct shash_desc;
 extern int crypto_sha1_update(struct shash_desc *desc, const u8 *data,
 			      unsigned int len);
 
+extern int crypto_sha256_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int len);
+
+extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int len);
 #endif
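
These symbols are exported so the SIMD SHA glue added elsewhere in this
merge can fall back to the generic C implementation when the FPU is not
usable. A sketch of that pattern; sha256_simd_update() is an illustrative
name, not the actual glue function:

	#include <asm/i387.h>		/* irq_fpu_usable(), kernel_fpu_begin/end() */
	#include <crypto/sha.h>

	static int sha256_simd_update(struct shash_desc *desc, const u8 *data,
				      unsigned int len)
	{
		/* SIMD registers are off limits here; defer to the generic code */
		if (!irq_fpu_usable())
			return crypto_sha256_update(desc, data, len);

		kernel_fpu_begin();
		/* ... feed 64-byte blocks to the SSSE3/AVX transform ... */
		kernel_fpu_end();

		return 0;
	}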

+ 0 - 22
include/linux/platform_data/atmel-aes.h

@@ -1,22 +0,0 @@
-#ifndef __LINUX_ATMEL_AES_H
-#define __LINUX_ATMEL_AES_H
-
-#include <linux/platform_data/dma-atmel.h>
-
-/**
- * struct aes_dma_data - DMA data for AES
- */
-struct aes_dma_data {
-	struct at_dma_slave	txdata;
-	struct at_dma_slave	rxdata;
-};
-
-/**
- * struct aes_platform_data - board-specific AES configuration
- * @dma_slave: DMA slave interface to use in data transfers.
- */
-struct aes_platform_data {
-	struct aes_dma_data	*dma_slave;
-};
-
-#endif /* __LINUX_ATMEL_AES_H */

+ 22 - 0
include/linux/platform_data/crypto-atmel.h

@@ -0,0 +1,22 @@
+#ifndef __LINUX_CRYPTO_ATMEL_H
+#define __LINUX_CRYPTO_ATMEL_H
+
+#include <linux/platform_data/dma-atmel.h>
+
+/**
+ * struct crypto_dma_data - DMA data for AES/TDES/SHA
+ */
+struct crypto_dma_data {
+	struct at_dma_slave	txdata;
+	struct at_dma_slave	rxdata;
+};
+
+/**
+ * struct crypto_platform_data - board-specific AES/TDES/SHA configuration
+ * @dma_slave: DMA slave interface to use in data transfers.
+ */
+struct crypto_platform_data {
+	struct crypto_dma_data	*dma_slave;
+};
+
+#endif /* __LINUX_CRYPTO_ATMEL_H */
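
A board file would hand this to the Atmel crypto drivers roughly as below;
the board_* names are placeholders, and the at_dma_slave members are filled
from the board's DMA controller configuration:

	#include <linux/platform_data/crypto-atmel.h>

	/* txdata/rxdata come from the board's DMA controller setup */
	static struct crypto_dma_data board_crypto_dma;

	static struct crypto_platform_data board_crypto_pdata = {
		.dma_slave = &board_crypto_dma,
	};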

+ 0 - 5
include/linux/timeriomem-rng.h

@@ -8,12 +8,7 @@
  * published by the Free Software Foundation.
  */
 
-#include <linux/completion.h>
-
 struct timeriomem_rng_data {
-	struct completion	completion;
-	unsigned int		present:1;
-
 	void __iomem		*address;
 
 	/* measures in usecs */

+ 13 - 0
net/xfrm/xfrm_algo.c

@@ -311,6 +311,19 @@ static struct xfrm_algo_desc aalg_list[] = {
 		.sadb_alg_maxbits = 128
 	}
 },
+{
+	/* rfc4494 */
+	.name = "cmac(aes)",
+
+	.uinfo = {
+		.auth = {
+			.icv_truncbits = 96,
+			.icv_fullbits = 128,
+		}
+	},
+
+	.pfkey_supported = 0,
+},
 };
 };
 
 static struct xfrm_algo_desc ealg_list[] = {
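
With this entry, SAs created over netlink can select AES-CMAC-96, while
.pfkey_supported = 0 keeps it invisible to PF_KEY. A sketch of the
in-kernel name lookup (aes_cmac_96_available() is illustrative;
xfrm_aalg_get_byname() is the existing resolver in net/xfrm/xfrm_algo.c):

	#include <net/xfrm.h>

	static bool aes_cmac_96_available(void)
	{
		char name[] = "cmac(aes)";
		/* probe = 1 may load the cmac module on demand */
		struct xfrm_algo_desc *desc = xfrm_aalg_get_byname(name, 1);

		return desc && desc->uinfo.auth.icv_truncbits == 96;
	}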