
Merge tag 'for-3.16' of git://git.linaro.org/people/ard.biesheuvel/linux-arm into upstream

FPSIMD register bank context switching and crypto algorithms
optimisations for arm64 from Ard Biesheuvel.

* tag 'for-3.16' of git://git.linaro.org/people/ard.biesheuvel/linux-arm:
  arm64/crypto: AES-ECB/CBC/CTR/XTS using ARMv8 NEON and Crypto Extensions
  arm64: pull in <asm/simd.h> from asm-generic
  arm64/crypto: AES in CCM mode using ARMv8 Crypto Extensions
  arm64/crypto: AES using ARMv8 Crypto Extensions
  arm64/crypto: GHASH secure hash using ARMv8 Crypto Extensions
  arm64/crypto: SHA-224/SHA-256 using ARMv8 Crypto Extensions
  arm64/crypto: SHA-1 using ARMv8 Crypto Extensions
  arm64: add support for kernel mode NEON in interrupt context
  arm64: defer reloading a task's FPSIMD state to userland resume
  arm64: add abstractions for FPSIMD state manipulation
  asm-generic: allow generic unaligned access if the arch supports it

Conflicts:
	arch/arm64/include/asm/thread_info.h
Catalin Marinas
commit cf5c95db57

+ 3 - 0
arch/arm64/Kconfig

@@ -343,5 +343,8 @@ source "arch/arm64/Kconfig.debug"
 source "security/Kconfig"
 
 source "crypto/Kconfig"
+if CRYPTO
+source "arch/arm64/crypto/Kconfig"
+endif
 
 source "lib/Kconfig"

+ 1 - 0
arch/arm64/Makefile

@@ -45,6 +45,7 @@ export	TEXT_OFFSET GZFLAGS
 core-y		+= arch/arm64/kernel/ arch/arm64/mm/
 core-$(CONFIG_KVM) += arch/arm64/kvm/
 core-$(CONFIG_XEN) += arch/arm64/xen/
+core-$(CONFIG_CRYPTO) += arch/arm64/crypto/
 libs-y		:= arch/arm64/lib/ $(libs-y)
 libs-y		+= $(LIBGCC)
 

+ 53 - 0
arch/arm64/crypto/Kconfig

@@ -0,0 +1,53 @@
+
+menuconfig ARM64_CRYPTO
+	bool "ARM64 Accelerated Cryptographic Algorithms"
+	depends on ARM64
+	help
+	  Say Y here to choose from a selection of cryptographic algorithms
+	  implemented using ARM64 specific CPU features or instructions.
+
+if ARM64_CRYPTO
+
+config CRYPTO_SHA1_ARM64_CE
+	tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
+	depends on ARM64 && KERNEL_MODE_NEON
+	select CRYPTO_HASH
+
+config CRYPTO_SHA2_ARM64_CE
+	tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
+	depends on ARM64 && KERNEL_MODE_NEON
+	select CRYPTO_HASH
+
+config CRYPTO_GHASH_ARM64_CE
+	tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions"
+	depends on ARM64 && KERNEL_MODE_NEON
+	select CRYPTO_HASH
+
+config CRYPTO_AES_ARM64_CE
+	tristate "AES core cipher using ARMv8 Crypto Extensions"
+	depends on ARM64 && KERNEL_MODE_NEON
+	select CRYPTO_ALGAPI
+	select CRYPTO_AES
+
+config CRYPTO_AES_ARM64_CE_CCM
+	tristate "AES in CCM mode using ARMv8 Crypto Extensions"
+	depends on ARM64 && KERNEL_MODE_NEON
+	select CRYPTO_ALGAPI
+	select CRYPTO_AES
+	select CRYPTO_AEAD
+
+config CRYPTO_AES_ARM64_CE_BLK
+	tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
+	depends on ARM64 && KERNEL_MODE_NEON
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_AES
+	select CRYPTO_ABLK_HELPER
+
+config CRYPTO_AES_ARM64_NEON_BLK
+	tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
+	depends on ARM64 && KERNEL_MODE_NEON
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_AES
+	select CRYPTO_ABLK_HELPER
+
+endif

+ 38 - 0
arch/arm64/crypto/Makefile

@@ -0,0 +1,38 @@
+#
+# linux/arch/arm64/crypto/Makefile
+#
+# Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+#
+
+obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o
+sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o
+
+obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o
+sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o
+
+obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
+ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
+
+obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
+CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
+
+obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
+aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
+
+obj-$(CONFIG_CRYPTO_AES_ARM64_CE_BLK) += aes-ce-blk.o
+aes-ce-blk-y := aes-glue-ce.o aes-ce.o
+
+obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o
+aes-neon-blk-y := aes-glue-neon.o aes-neon.o
+
+AFLAGS_aes-ce.o		:= -DINTERLEAVE=2 -DINTERLEAVE_INLINE
+AFLAGS_aes-neon.o	:= -DINTERLEAVE=4
+
+CFLAGS_aes-glue-ce.o	:= -DUSE_V8_CRYPTO_EXTENSIONS
+
+$(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
+	$(call if_changed_dep,cc_o_c)

+ 222 - 0
arch/arm64/crypto/aes-ce-ccm-core.S

@@ -0,0 +1,222 @@
+/*
+ * aes-ce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions
+ *
+ * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+	.text
+	.arch	armv8-a+crypto
+
+	/*
+	 * void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
+	 *			     u32 *macp, u8 const rk[], u32 rounds);
+	 */
+ENTRY(ce_aes_ccm_auth_data)
+	ldr	w8, [x3]			/* leftover from prev round? */
+	ld1	{v0.2d}, [x0]			/* load mac */
+	cbz	w8, 1f
+	sub	w8, w8, #16
+	eor	v1.16b, v1.16b, v1.16b
+0:	ldrb	w7, [x1], #1			/* get 1 byte of input */
+	subs	w2, w2, #1
+	add	w8, w8, #1
+	ins	v1.b[0], w7
+	ext	v1.16b, v1.16b, v1.16b, #1	/* rotate in the input bytes */
+	beq	8f				/* out of input? */
+	cbnz	w8, 0b
+	eor	v0.16b, v0.16b, v1.16b
+1:	ld1	{v3.2d}, [x4]			/* load first round key */
+	prfm	pldl1strm, [x1]
+	cmp	w5, #12				/* which key size? */
+	add	x6, x4, #16
+	sub	w7, w5, #2			/* modified # of rounds */
+	bmi	2f
+	bne	5f
+	mov	v5.16b, v3.16b
+	b	4f
+2:	mov	v4.16b, v3.16b
+	ld1	{v5.2d}, [x6], #16		/* load 2nd round key */
+3:	aese	v0.16b, v4.16b
+	aesmc	v0.16b, v0.16b
+4:	ld1	{v3.2d}, [x6], #16		/* load next round key */
+	aese	v0.16b, v5.16b
+	aesmc	v0.16b, v0.16b
+5:	ld1	{v4.2d}, [x6], #16		/* load next round key */
+	subs	w7, w7, #3
+	aese	v0.16b, v3.16b
+	aesmc	v0.16b, v0.16b
+	ld1	{v5.2d}, [x6], #16		/* load next round key */
+	bpl	3b
+	aese	v0.16b, v4.16b
+	subs	w2, w2, #16			/* last data? */
+	eor	v0.16b, v0.16b, v5.16b		/* final round */
+	bmi	6f
+	ld1	{v1.16b}, [x1], #16		/* load next input block */
+	eor	v0.16b, v0.16b, v1.16b		/* xor with mac */
+	bne	1b
+6:	st1	{v0.2d}, [x0]			/* store mac */
+	beq	10f
+	adds	w2, w2, #16
+	beq	10f
+	mov	w8, w2
+7:	ldrb	w7, [x1], #1
+	umov	w6, v0.b[0]
+	eor	w6, w6, w7
+	strb	w6, [x0], #1
+	subs	w2, w2, #1
+	beq	10f
+	ext	v0.16b, v0.16b, v0.16b, #1	/* rotate out the mac bytes */
+	b	7b
+8:	mov	w7, w8
+	add	w8, w8, #16
+9:	ext	v1.16b, v1.16b, v1.16b, #1
+	adds	w7, w7, #1
+	bne	9b
+	eor	v0.16b, v0.16b, v1.16b
+	st1	{v0.2d}, [x0]
+10:	str	w8, [x3]
+	ret
+ENDPROC(ce_aes_ccm_auth_data)
+
+	/*
+	 * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[],
+	 * 			 u32 rounds);
+	 */
+ENTRY(ce_aes_ccm_final)
+	ld1	{v3.2d}, [x2], #16		/* load first round key */
+	ld1	{v0.2d}, [x0]			/* load mac */
+	cmp	w3, #12				/* which key size? */
+	sub	w3, w3, #2			/* modified # of rounds */
+	ld1	{v1.2d}, [x1]			/* load 1st ctriv */
+	bmi	0f
+	bne	3f
+	mov	v5.16b, v3.16b
+	b	2f
+0:	mov	v4.16b, v3.16b
+1:	ld1	{v5.2d}, [x2], #16		/* load next round key */
+	aese	v0.16b, v4.16b
+	aese	v1.16b, v4.16b
+	aesmc	v0.16b, v0.16b
+	aesmc	v1.16b, v1.16b
+2:	ld1	{v3.2d}, [x2], #16		/* load next round key */
+	aese	v0.16b, v5.16b
+	aese	v1.16b, v5.16b
+	aesmc	v0.16b, v0.16b
+	aesmc	v1.16b, v1.16b
+3:	ld1	{v4.2d}, [x2], #16		/* load next round key */
+	subs	w3, w3, #3
+	aese	v0.16b, v3.16b
+	aese	v1.16b, v3.16b
+	aesmc	v0.16b, v0.16b
+	aesmc	v1.16b, v1.16b
+	bpl	1b
+	aese	v0.16b, v4.16b
+	aese	v1.16b, v4.16b
+	/* final round key cancels out */
+	eor	v0.16b, v0.16b, v1.16b		/* en-/decrypt the mac */
+	st1	{v0.2d}, [x0]			/* store result */
+	ret
+ENDPROC(ce_aes_ccm_final)
+
+	.macro	aes_ccm_do_crypt,enc
+	ldr	x8, [x6, #8]			/* load lower ctr */
+	ld1	{v0.2d}, [x5]			/* load mac */
+	rev	x8, x8				/* keep swabbed ctr in reg */
+0:	/* outer loop */
+	ld1	{v1.1d}, [x6]			/* load upper ctr */
+	prfm	pldl1strm, [x1]
+	add	x8, x8, #1
+	rev	x9, x8
+	cmp	w4, #12				/* which key size? */
+	sub	w7, w4, #2			/* get modified # of rounds */
+	ins	v1.d[1], x9			/* no carry in lower ctr */
+	ld1	{v3.2d}, [x3]			/* load first round key */
+	add	x10, x3, #16
+	bmi	1f
+	bne	4f
+	mov	v5.16b, v3.16b
+	b	3f
+1:	mov	v4.16b, v3.16b
+	ld1	{v5.2d}, [x10], #16		/* load 2nd round key */
+2:	/* inner loop: 3 rounds, 2x interleaved */
+	aese	v0.16b, v4.16b
+	aese	v1.16b, v4.16b
+	aesmc	v0.16b, v0.16b
+	aesmc	v1.16b, v1.16b
+3:	ld1	{v3.2d}, [x10], #16		/* load next round key */
+	aese	v0.16b, v5.16b
+	aese	v1.16b, v5.16b
+	aesmc	v0.16b, v0.16b
+	aesmc	v1.16b, v1.16b
+4:	ld1	{v4.2d}, [x10], #16		/* load next round key */
+	subs	w7, w7, #3
+	aese	v0.16b, v3.16b
+	aese	v1.16b, v3.16b
+	aesmc	v0.16b, v0.16b
+	aesmc	v1.16b, v1.16b
+	ld1	{v5.2d}, [x10], #16		/* load next round key */
+	bpl	2b
+	aese	v0.16b, v4.16b
+	aese	v1.16b, v4.16b
+	subs	w2, w2, #16
+	bmi	6f				/* partial block? */
+	ld1	{v2.16b}, [x1], #16		/* load next input block */
+	.if	\enc == 1
+	eor	v2.16b, v2.16b, v5.16b		/* final round enc+mac */
+	eor	v1.16b, v1.16b, v2.16b		/* xor with crypted ctr */
+	.else
+	eor	v2.16b, v2.16b, v1.16b		/* xor with crypted ctr */
+	eor	v1.16b, v2.16b, v5.16b		/* final round enc */
+	.endif
+	eor	v0.16b, v0.16b, v2.16b		/* xor mac with pt ^ rk[last] */
+	st1	{v1.16b}, [x0], #16		/* write output block */
+	bne	0b
+	rev	x8, x8
+	st1	{v0.2d}, [x5]			/* store mac */
+	str	x8, [x6, #8]			/* store lsb end of ctr (BE) */
+5:	ret
+
+6:	eor	v0.16b, v0.16b, v5.16b		/* final round mac */
+	eor	v1.16b, v1.16b, v5.16b		/* final round enc */
+	st1	{v0.2d}, [x5]			/* store mac */
+	add	w2, w2, #16			/* process partial tail block */
+7:	ldrb	w9, [x1], #1			/* get 1 byte of input */
+	umov	w6, v1.b[0]			/* get top crypted ctr byte */
+	umov	w7, v0.b[0]			/* get top mac byte */
+	.if	\enc == 1
+	eor	w7, w7, w9
+	eor	w9, w9, w6
+	.else
+	eor	w9, w9, w6
+	eor	w7, w7, w9
+	.endif
+	strb	w9, [x0], #1			/* store out byte */
+	strb	w7, [x5], #1			/* store mac byte */
+	subs	w2, w2, #1
+	beq	5b
+	ext	v0.16b, v0.16b, v0.16b, #1	/* shift out mac byte */
+	ext	v1.16b, v1.16b, v1.16b, #1	/* shift out ctr byte */
+	b	7b
+	.endm
+
+	/*
+	 * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
+	 * 			   u8 const rk[], u32 rounds, u8 mac[],
+	 * 			   u8 ctr[]);
+	 * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes,
+	 * 			   u8 const rk[], u32 rounds, u8 mac[],
+	 * 			   u8 ctr[]);
+	 */
+ENTRY(ce_aes_ccm_encrypt)
+	aes_ccm_do_crypt	1
+ENDPROC(ce_aes_ccm_encrypt)
+
+ENTRY(ce_aes_ccm_decrypt)
+	aes_ccm_do_crypt	0
+ENDPROC(ce_aes_ccm_decrypt)
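
The counter blocks that ce_aes_ccm_encrypt() and ce_aes_ccm_decrypt() step through follow the RFC 3610 A_i layout; the code above keeps the low 64 bits byte-swapped in x8 so they can be incremented cheaply without touching the nonce half. A minimal standalone sketch of that layout, with ccm_format_ctr_block() as a hypothetical helper that is not part of this patch:

#include <stdint.h>
#include <string.h>

/*
 * Build CCM counter block A_i (RFC 3610): a flags byte holding L - 1, the
 * nonce in the next 15 - L bytes, and the block counter i in the last L
 * bytes, big endian.
 */
static void ccm_format_ctr_block(uint8_t a[16], const uint8_t *nonce,
				 unsigned int l, uint64_t i)
{
	a[0] = l - 1;				/* flags: only the L - 1 field */
	memcpy(&a[1], nonce, 15 - l);		/* nonce */
	for (unsigned int n = 0; n < l; n++)	/* counter, big endian */
		a[15 - n] = (uint8_t)(i >> (8 * n));
}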

+ 297 - 0
arch/arm64/crypto/aes-ce-ccm-glue.c

@@ -0,0 +1,297 @@
+/*
+ * aes-ce-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions
+ *
+ * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <asm/unaligned.h>
+#include <crypto/aes.h>
+#include <crypto/algapi.h>
+#include <crypto/scatterwalk.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+static int num_rounds(struct crypto_aes_ctx *ctx)
+{
+	/*
+	 * # of rounds specified by AES:
+	 * 128 bit key		10 rounds
+	 * 192 bit key		12 rounds
+	 * 256 bit key		14 rounds
+	 * => n byte key	=> 6 + (n/4) rounds
+	 */
+	return 6 + ctx->key_length / 4;
+}
+
+asmlinkage void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
+				     u32 *macp, u32 const rk[], u32 rounds);
+
+asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes,
+				   u32 const rk[], u32 rounds, u8 mac[],
+				   u8 ctr[]);
+
+asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes,
+				   u32 const rk[], u32 rounds, u8 mac[],
+				   u8 ctr[]);
+
+asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[],
+				 u32 rounds);
+
+static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key,
+		      unsigned int key_len)
+{
+	struct crypto_aes_ctx *ctx = crypto_aead_ctx(tfm);
+	int ret;
+
+	ret = crypto_aes_expand_key(ctx, in_key, key_len);
+	if (!ret)
+		return 0;
+
+	tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+	return -EINVAL;
+}
+
+static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
+{
+	if ((authsize & 1) || authsize < 4)
+		return -EINVAL;
+	return 0;
+}
+
+static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	__be32 *n = (__be32 *)&maciv[AES_BLOCK_SIZE - 8];
+	u32 l = req->iv[0] + 1;
+
+	/* verify that CCM dimension 'L' is set correctly in the IV */
+	if (l < 2 || l > 8)
+		return -EINVAL;
+
+	/* verify that msglen can in fact be represented in L bytes */
+	if (l < 4 && msglen >> (8 * l))
+		return -EOVERFLOW;
+
+	/*
+	 * Even if the CCM spec allows L values of up to 8, the Linux cryptoapi
+	 * uses a u32 type to represent msglen so the top 4 bytes are always 0.
+	 */
+	n[0] = 0;
+	n[1] = cpu_to_be32(msglen);
+
+	memcpy(maciv, req->iv, AES_BLOCK_SIZE - l);
+
+	/*
+	 * Meaning of byte 0 according to CCM spec (RFC 3610/NIST 800-38C)
+	 * - bits 0..2	: max # of bytes required to represent msglen, minus 1
+	 *                (already set by caller)
+	 * - bits 3..5	: size of auth tag (1 => 4 bytes, 2 => 6 bytes, etc)
+	 * - bit 6	: indicates presence of authenticate-only data
+	 */
+	maciv[0] |= (crypto_aead_authsize(aead) - 2) << 2;
+	if (req->assoclen)
+		maciv[0] |= 0x40;
+
+	memset(&req->iv[AES_BLOCK_SIZE - l], 0, l);
+	return 0;
+}
+
+static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
+	struct __packed { __be16 l; __be32 h; u16 len; } ltag;
+	struct scatter_walk walk;
+	u32 len = req->assoclen;
+	u32 macp = 0;
+
+	/* prepend the AAD with a length tag */
+	if (len < 0xff00) {
+		ltag.l = cpu_to_be16(len);
+		ltag.len = 2;
+	} else  {
+		ltag.l = cpu_to_be16(0xfffe);
+		put_unaligned_be32(len, &ltag.h);
+		ltag.len = 6;
+	}
+
+	ce_aes_ccm_auth_data(mac, (u8 *)&ltag, ltag.len, &macp, ctx->key_enc,
+			     num_rounds(ctx));
+	scatterwalk_start(&walk, req->assoc);
+
+	do {
+		u32 n = scatterwalk_clamp(&walk, len);
+		u8 *p;
+
+		if (!n) {
+			scatterwalk_start(&walk, sg_next(walk.sg));
+			n = scatterwalk_clamp(&walk, len);
+		}
+		p = scatterwalk_map(&walk);
+		ce_aes_ccm_auth_data(mac, p, n, &macp, ctx->key_enc,
+				     num_rounds(ctx));
+		len -= n;
+
+		scatterwalk_unmap(p);
+		scatterwalk_advance(&walk, n);
+		scatterwalk_done(&walk, 0, len);
+	} while (len);
+}
+
+static int ccm_encrypt(struct aead_request *req)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
+	struct blkcipher_desc desc = { .info = req->iv };
+	struct blkcipher_walk walk;
+	u8 __aligned(8) mac[AES_BLOCK_SIZE];
+	u8 buf[AES_BLOCK_SIZE];
+	u32 len = req->cryptlen;
+	int err;
+
+	err = ccm_init_mac(req, mac, len);
+	if (err)
+		return err;
+
+	kernel_neon_begin_partial(6);
+
+	if (req->assoclen)
+		ccm_calculate_auth_mac(req, mac);
+
+	/* preserve the original iv for the final round */
+	memcpy(buf, req->iv, AES_BLOCK_SIZE);
+
+	blkcipher_walk_init(&walk, req->dst, req->src, len);
+	err = blkcipher_aead_walk_virt_block(&desc, &walk, aead,
+					     AES_BLOCK_SIZE);
+
+	while (walk.nbytes) {
+		u32 tail = walk.nbytes % AES_BLOCK_SIZE;
+
+		if (walk.nbytes == len)
+			tail = 0;
+
+		ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				   walk.nbytes - tail, ctx->key_enc,
+				   num_rounds(ctx), mac, walk.iv);
+
+		len -= walk.nbytes - tail;
+		err = blkcipher_walk_done(&desc, &walk, tail);
+	}
+	if (!err)
+		ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
+
+	kernel_neon_end();
+
+	if (err)
+		return err;
+
+	/* copy authtag to end of dst */
+	scatterwalk_map_and_copy(mac, req->dst, req->cryptlen,
+				 crypto_aead_authsize(aead), 1);
+
+	return 0;
+}
+
+static int ccm_decrypt(struct aead_request *req)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
+	unsigned int authsize = crypto_aead_authsize(aead);
+	struct blkcipher_desc desc = { .info = req->iv };
+	struct blkcipher_walk walk;
+	u8 __aligned(8) mac[AES_BLOCK_SIZE];
+	u8 buf[AES_BLOCK_SIZE];
+	u32 len = req->cryptlen - authsize;
+	int err;
+
+	err = ccm_init_mac(req, mac, len);
+	if (err)
+		return err;
+
+	kernel_neon_begin_partial(6);
+
+	if (req->assoclen)
+		ccm_calculate_auth_mac(req, mac);
+
+	/* preserve the original iv for the final round */
+	memcpy(buf, req->iv, AES_BLOCK_SIZE);
+
+	blkcipher_walk_init(&walk, req->dst, req->src, len);
+	err = blkcipher_aead_walk_virt_block(&desc, &walk, aead,
+					     AES_BLOCK_SIZE);
+
+	while (walk.nbytes) {
+		u32 tail = walk.nbytes % AES_BLOCK_SIZE;
+
+		if (walk.nbytes == len)
+			tail = 0;
+
+		ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				   walk.nbytes - tail, ctx->key_enc,
+				   num_rounds(ctx), mac, walk.iv);
+
+		len -= walk.nbytes - tail;
+		err = blkcipher_walk_done(&desc, &walk, tail);
+	}
+	if (!err)
+		ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx));
+
+	kernel_neon_end();
+
+	if (err)
+		return err;
+
+	/* compare calculated auth tag with the stored one */
+	scatterwalk_map_and_copy(buf, req->src, req->cryptlen - authsize,
+				 authsize, 0);
+
+	if (memcmp(mac, buf, authsize))
+		return -EBADMSG;
+	return 0;
+}
+
+static struct crypto_alg ccm_aes_alg = {
+	.cra_name		= "ccm(aes)",
+	.cra_driver_name	= "ccm-aes-ce",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_AEAD,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_aead_type,
+	.cra_module		= THIS_MODULE,
+	.cra_aead = {
+		.ivsize		= AES_BLOCK_SIZE,
+		.maxauthsize	= AES_BLOCK_SIZE,
+		.setkey		= ccm_setkey,
+		.setauthsize	= ccm_setauthsize,
+		.encrypt	= ccm_encrypt,
+		.decrypt	= ccm_decrypt,
+	}
+};
+
+static int __init aes_mod_init(void)
+{
+	if (!(elf_hwcap & HWCAP_AES))
+		return -ENODEV;
+	return crypto_register_alg(&ccm_aes_alg);
+}
+
+static void __exit aes_mod_exit(void)
+{
+	crypto_unregister_alg(&ccm_aes_alg);
+}
+
+module_init(aes_mod_init);
+module_exit(aes_mod_exit);
+
+MODULE_DESCRIPTION("Synchronous AES in CCM mode using ARMv8 Crypto Extensions");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("ccm(aes)");
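
The B_0 block that ccm_init_mac() derives from req->iv is easier to picture in isolation. A minimal sketch of the same layout, assuming the caller has already stored L - 1 in iv[0] and checked that msglen fits in L bytes, as ccm_init_mac() does; ccm_format_b0() is a hypothetical helper, not the function used above:

#include <stdint.h>
#include <string.h>

static void ccm_format_b0(uint8_t b0[16], const uint8_t iv[16],
			  unsigned int authsize, int have_aad,
			  uint32_t msglen)
{
	unsigned int l = iv[0] + 1;		/* size of the length field, 2..8 */

	memcpy(b0, iv, 16 - l);			/* flags byte + nonce */
	b0[0] |= (authsize - 2) << 2;		/* bits 3..5: (tag size - 2) / 2 */
	if (have_aad)
		b0[0] |= 0x40;			/* bit 6: auth-only data present */

	memset(&b0[16 - l], 0, l);		/* message length, big endian */
	for (unsigned int n = 0; n < l && n < 4; n++)
		b0[15 - n] = (uint8_t)(msglen >> (8 * n));
}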

+ 155 - 0
arch/arm64/crypto/aes-ce-cipher.c

@@ -0,0 +1,155 @@
+/*
+ * aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <crypto/aes.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+struct aes_block {
+	u8 b[AES_BLOCK_SIZE];
+};
+
+static int num_rounds(struct crypto_aes_ctx *ctx)
+{
+	/*
+	 * # of rounds specified by AES:
+	 * 128 bit key		10 rounds
+	 * 192 bit key		12 rounds
+	 * 256 bit key		14 rounds
+	 * => n byte key	=> 6 + (n/4) rounds
+	 */
+	return 6 + ctx->key_length / 4;
+}
+
+static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
+{
+	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct aes_block *out = (struct aes_block *)dst;
+	struct aes_block const *in = (struct aes_block *)src;
+	void *dummy0;
+	int dummy1;
+
+	kernel_neon_begin_partial(4);
+
+	__asm__("	ld1	{v0.16b}, %[in]			;"
+		"	ld1	{v1.2d}, [%[key]], #16		;"
+		"	cmp	%w[rounds], #10			;"
+		"	bmi	0f				;"
+		"	bne	3f				;"
+		"	mov	v3.16b, v1.16b			;"
+		"	b	2f				;"
+		"0:	mov	v2.16b, v1.16b			;"
+		"	ld1	{v3.2d}, [%[key]], #16		;"
+		"1:	aese	v0.16b, v2.16b			;"
+		"	aesmc	v0.16b, v0.16b			;"
+		"2:	ld1	{v1.2d}, [%[key]], #16		;"
+		"	aese	v0.16b, v3.16b			;"
+		"	aesmc	v0.16b, v0.16b			;"
+		"3:	ld1	{v2.2d}, [%[key]], #16		;"
+		"	subs	%w[rounds], %w[rounds], #3	;"
+		"	aese	v0.16b, v1.16b			;"
+		"	aesmc	v0.16b, v0.16b			;"
+		"	ld1	{v3.2d}, [%[key]], #16		;"
+		"	bpl	1b				;"
+		"	aese	v0.16b, v2.16b			;"
+		"	eor	v0.16b, v0.16b, v3.16b		;"
+		"	st1	{v0.16b}, %[out]		;"
+
+	:	[out]		"=Q"(*out),
+		[key]		"=r"(dummy0),
+		[rounds]	"=r"(dummy1)
+	:	[in]		"Q"(*in),
+				"1"(ctx->key_enc),
+				"2"(num_rounds(ctx) - 2)
+	:	"cc");
+
+	kernel_neon_end();
+}
+
+static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
+{
+	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct aes_block *out = (struct aes_block *)dst;
+	struct aes_block const *in = (struct aes_block *)src;
+	void *dummy0;
+	int dummy1;
+
+	kernel_neon_begin_partial(4);
+
+	__asm__("	ld1	{v0.16b}, %[in]			;"
+		"	ld1	{v1.2d}, [%[key]], #16		;"
+		"	cmp	%w[rounds], #10			;"
+		"	bmi	0f				;"
+		"	bne	3f				;"
+		"	mov	v3.16b, v1.16b			;"
+		"	b	2f				;"
+		"0:	mov	v2.16b, v1.16b			;"
+		"	ld1	{v3.2d}, [%[key]], #16		;"
+		"1:	aesd	v0.16b, v2.16b			;"
+		"	aesimc	v0.16b, v0.16b			;"
+		"2:	ld1	{v1.2d}, [%[key]], #16		;"
+		"	aesd	v0.16b, v3.16b			;"
+		"	aesimc	v0.16b, v0.16b			;"
+		"3:	ld1	{v2.2d}, [%[key]], #16		;"
+		"	subs	%w[rounds], %w[rounds], #3	;"
+		"	aesd	v0.16b, v1.16b			;"
+		"	aesimc	v0.16b, v0.16b			;"
+		"	ld1	{v3.2d}, [%[key]], #16		;"
+		"	bpl	1b				;"
+		"	aesd	v0.16b, v2.16b			;"
+		"	eor	v0.16b, v0.16b, v3.16b		;"
+		"	st1	{v0.16b}, %[out]		;"
+
+	:	[out]		"=Q"(*out),
+		[key]		"=r"(dummy0),
+		[rounds]	"=r"(dummy1)
+	:	[in]		"Q"(*in),
+				"1"(ctx->key_dec),
+				"2"(num_rounds(ctx) - 2)
+	:	"cc");
+
+	kernel_neon_end();
+}
+
+static struct crypto_alg aes_alg = {
+	.cra_name		= "aes",
+	.cra_driver_name	= "aes-ce",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
+	.cra_module		= THIS_MODULE,
+	.cra_cipher = {
+		.cia_min_keysize	= AES_MIN_KEY_SIZE,
+		.cia_max_keysize	= AES_MAX_KEY_SIZE,
+		.cia_setkey		= crypto_aes_set_key,
+		.cia_encrypt		= aes_cipher_encrypt,
+		.cia_decrypt		= aes_cipher_decrypt
+	}
+};
+
+static int __init aes_mod_init(void)
+{
+	return crypto_register_alg(&aes_alg);
+}
+
+static void __exit aes_mod_exit(void)
+{
+	crypto_unregister_alg(&aes_alg);
+}
+
+module_cpu_feature_match(AES, aes_mod_init);
+module_exit(aes_mod_exit);
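
The num_rounds() formula maps the three AES key sizes onto the round counts fixed by FIPS 197. A tiny self-contained check of the arithmetic (illustrative only):

#include <assert.h>

/* rounds = 6 + key_bytes / 4, as described in the num_rounds() comment */
static int aes_rounds(unsigned int key_bytes)
{
	return 6 + key_bytes / 4;
}

int main(void)
{
	assert(aes_rounds(16) == 10);	/* AES-128 */
	assert(aes_rounds(24) == 12);	/* AES-192 */
	assert(aes_rounds(32) == 14);	/* AES-256 */
	return 0;
}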

+ 133 - 0
arch/arm64/crypto/aes-ce.S

@@ -0,0 +1,133 @@
+/*
+ * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with
+ *                                    Crypto Extensions
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+#define AES_ENTRY(func)		ENTRY(ce_ ## func)
+#define AES_ENDPROC(func)	ENDPROC(ce_ ## func)
+
+	.arch		armv8-a+crypto
+
+	/* preload all round keys */
+	.macro		load_round_keys, rounds, rk
+	cmp		\rounds, #12
+	blo		2222f		/* 128 bits */
+	beq		1111f		/* 192 bits */
+	ld1		{v17.16b-v18.16b}, [\rk], #32
+1111:	ld1		{v19.16b-v20.16b}, [\rk], #32
+2222:	ld1		{v21.16b-v24.16b}, [\rk], #64
+	ld1		{v25.16b-v28.16b}, [\rk], #64
+	ld1		{v29.16b-v31.16b}, [\rk]
+	.endm
+
+	/* prepare for encryption with key in rk[] */
+	.macro		enc_prepare, rounds, rk, ignore
+	load_round_keys	\rounds, \rk
+	.endm
+
+	/* prepare for encryption (again) but with new key in rk[] */
+	.macro		enc_switch_key, rounds, rk, ignore
+	load_round_keys	\rounds, \rk
+	.endm
+
+	/* prepare for decryption with key in rk[] */
+	.macro		dec_prepare, rounds, rk, ignore
+	load_round_keys	\rounds, \rk
+	.endm
+
+	.macro		do_enc_Nx, de, mc, k, i0, i1, i2, i3
+	aes\de		\i0\().16b, \k\().16b
+	.ifnb		\i1
+	aes\de		\i1\().16b, \k\().16b
+	.ifnb		\i3
+	aes\de		\i2\().16b, \k\().16b
+	aes\de		\i3\().16b, \k\().16b
+	.endif
+	.endif
+	aes\mc		\i0\().16b, \i0\().16b
+	.ifnb		\i1
+	aes\mc		\i1\().16b, \i1\().16b
+	.ifnb		\i3
+	aes\mc		\i2\().16b, \i2\().16b
+	aes\mc		\i3\().16b, \i3\().16b
+	.endif
+	.endif
+	.endm
+
+	/* up to 4 interleaved encryption rounds with the same round key */
+	.macro		round_Nx, enc, k, i0, i1, i2, i3
+	.ifc		\enc, e
+	do_enc_Nx	e, mc, \k, \i0, \i1, \i2, \i3
+	.else
+	do_enc_Nx	d, imc, \k, \i0, \i1, \i2, \i3
+	.endif
+	.endm
+
+	/* up to 4 interleaved final rounds */
+	.macro		fin_round_Nx, de, k, k2, i0, i1, i2, i3
+	aes\de		\i0\().16b, \k\().16b
+	.ifnb		\i1
+	aes\de		\i1\().16b, \k\().16b
+	.ifnb		\i3
+	aes\de		\i2\().16b, \k\().16b
+	aes\de		\i3\().16b, \k\().16b
+	.endif
+	.endif
+	eor		\i0\().16b, \i0\().16b, \k2\().16b
+	.ifnb		\i1
+	eor		\i1\().16b, \i1\().16b, \k2\().16b
+	.ifnb		\i3
+	eor		\i2\().16b, \i2\().16b, \k2\().16b
+	eor		\i3\().16b, \i3\().16b, \k2\().16b
+	.endif
+	.endif
+	.endm
+
+	/* up to 4 interleaved blocks */
+	.macro		do_block_Nx, enc, rounds, i0, i1, i2, i3
+	cmp		\rounds, #12
+	blo		2222f		/* 128 bits */
+	beq		1111f		/* 192 bits */
+	round_Nx	\enc, v17, \i0, \i1, \i2, \i3
+	round_Nx	\enc, v18, \i0, \i1, \i2, \i3
+1111:	round_Nx	\enc, v19, \i0, \i1, \i2, \i3
+	round_Nx	\enc, v20, \i0, \i1, \i2, \i3
+2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
+	round_Nx	\enc, \key, \i0, \i1, \i2, \i3
+	.endr
+	fin_round_Nx	\enc, v30, v31, \i0, \i1, \i2, \i3
+	.endm
+
+	.macro		encrypt_block, in, rounds, t0, t1, t2
+	do_block_Nx	e, \rounds, \in
+	.endm
+
+	.macro		encrypt_block2x, i0, i1, rounds, t0, t1, t2
+	do_block_Nx	e, \rounds, \i0, \i1
+	.endm
+
+	.macro		encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
+	do_block_Nx	e, \rounds, \i0, \i1, \i2, \i3
+	.endm
+
+	.macro		decrypt_block, in, rounds, t0, t1, t2
+	do_block_Nx	d, \rounds, \in
+	.endm
+
+	.macro		decrypt_block2x, i0, i1, rounds, t0, t1, t2
+	do_block_Nx	d, \rounds, \i0, \i1
+	.endm
+
+	.macro		decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
+	do_block_Nx	d, \rounds, \i0, \i1, \i2, \i3
+	.endm
+
+#include "aes-modes.S"

+ 446 - 0
arch/arm64/crypto/aes-glue.c

@@ -0,0 +1,446 @@
+/*
+ * linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <asm/hwcap.h>
+#include <crypto/aes.h>
+#include <crypto/ablk_helper.h>
+#include <crypto/algapi.h>
+#include <linux/module.h>
+#include <linux/cpufeature.h>
+
+#ifdef USE_V8_CRYPTO_EXTENSIONS
+#define MODE			"ce"
+#define PRIO			300
+#define aes_ecb_encrypt		ce_aes_ecb_encrypt
+#define aes_ecb_decrypt		ce_aes_ecb_decrypt
+#define aes_cbc_encrypt		ce_aes_cbc_encrypt
+#define aes_cbc_decrypt		ce_aes_cbc_decrypt
+#define aes_ctr_encrypt		ce_aes_ctr_encrypt
+#define aes_xts_encrypt		ce_aes_xts_encrypt
+#define aes_xts_decrypt		ce_aes_xts_decrypt
+MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
+#else
+#define MODE			"neon"
+#define PRIO			200
+#define aes_ecb_encrypt		neon_aes_ecb_encrypt
+#define aes_ecb_decrypt		neon_aes_ecb_decrypt
+#define aes_cbc_encrypt		neon_aes_cbc_encrypt
+#define aes_cbc_decrypt		neon_aes_cbc_decrypt
+#define aes_ctr_encrypt		neon_aes_ctr_encrypt
+#define aes_xts_encrypt		neon_aes_xts_encrypt
+#define aes_xts_decrypt		neon_aes_xts_decrypt
+MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON");
+MODULE_ALIAS("ecb(aes)");
+MODULE_ALIAS("cbc(aes)");
+MODULE_ALIAS("ctr(aes)");
+MODULE_ALIAS("xts(aes)");
+#endif
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+/* defined in aes-modes.S */
+asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				int rounds, int blocks, int first);
+asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
+				int rounds, int blocks, int first);
+
+asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				int rounds, int blocks, u8 iv[], int first);
+asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
+				int rounds, int blocks, u8 iv[], int first);
+
+asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				int rounds, int blocks, u8 ctr[], int first);
+
+asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[],
+				int rounds, int blocks, u8 const rk2[], u8 iv[],
+				int first);
+asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[],
+				int rounds, int blocks, u8 const rk2[], u8 iv[],
+				int first);
+
+struct crypto_aes_xts_ctx {
+	struct crypto_aes_ctx key1;
+	struct crypto_aes_ctx __aligned(8) key2;
+};
+
+static int xts_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+		       unsigned int key_len)
+{
+	struct crypto_aes_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+	int ret;
+
+	ret = crypto_aes_expand_key(&ctx->key1, in_key, key_len / 2);
+	if (!ret)
+		ret = crypto_aes_expand_key(&ctx->key2, &in_key[key_len / 2],
+					    key_len / 2);
+	if (!ret)
+		return 0;
+
+	tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+	return -EINVAL;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	int err, first, rounds = 6 + ctx->key_length / 4;
+	struct blkcipher_walk walk;
+	unsigned int blocks;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	kernel_neon_begin();
+	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+		aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				(u8 *)ctx->key_enc, rounds, blocks, first);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	kernel_neon_end();
+	return err;
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	int err, first, rounds = 6 + ctx->key_length / 4;
+	struct blkcipher_walk walk;
+	unsigned int blocks;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	kernel_neon_begin();
+	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+		aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				(u8 *)ctx->key_dec, rounds, blocks, first);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	kernel_neon_end();
+	return err;
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	int err, first, rounds = 6 + ctx->key_length / 4;
+	struct blkcipher_walk walk;
+	unsigned int blocks;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	kernel_neon_begin();
+	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+		aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				(u8 *)ctx->key_enc, rounds, blocks, walk.iv,
+				first);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	kernel_neon_end();
+	return err;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	int err, first, rounds = 6 + ctx->key_length / 4;
+	struct blkcipher_walk walk;
+	unsigned int blocks;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	kernel_neon_begin();
+	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+		aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				(u8 *)ctx->key_dec, rounds, blocks, walk.iv,
+				first);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	kernel_neon_end();
+	return err;
+}
+
+static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	int err, first, rounds = 6 + ctx->key_length / 4;
+	struct blkcipher_walk walk;
+	int blocks;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
+
+	first = 1;
+	kernel_neon_begin();
+	while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
+		aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				(u8 *)ctx->key_enc, rounds, blocks, walk.iv,
+				first);
+		first = 0;
+		nbytes -= blocks * AES_BLOCK_SIZE;
+		if (nbytes && nbytes == walk.nbytes % AES_BLOCK_SIZE)
+			break;
+		err = blkcipher_walk_done(desc, &walk,
+					  walk.nbytes % AES_BLOCK_SIZE);
+	}
+	if (nbytes) {
+		u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
+		u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
+		u8 __aligned(8) tail[AES_BLOCK_SIZE];
+
+		/*
+		 * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need
+		 * to tell aes_ctr_encrypt() to only read half a block.
+		 */
+		blocks = (nbytes <= 8) ? -1 : 1;
+
+		aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds,
+				blocks, walk.iv, first);
+		memcpy(tdst, tail, nbytes);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	int err, first, rounds = 6 + ctx->key1.key_length / 4;
+	struct blkcipher_walk walk;
+	unsigned int blocks;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	kernel_neon_begin();
+	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+		aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				(u8 *)ctx->key1.key_enc, rounds, blocks,
+				(u8 *)ctx->key2.key_enc, walk.iv, first);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	int err, first, rounds = 6 + ctx->key1.key_length / 4;
+	struct blkcipher_walk walk;
+	unsigned int blocks;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	kernel_neon_begin();
+	for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+		aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				(u8 *)ctx->key1.key_dec, rounds, blocks,
+				(u8 *)ctx->key2.key_enc, walk.iv, first);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static struct crypto_alg aes_algs[] = { {
+	.cra_name		= "__ecb-aes-" MODE,
+	.cra_driver_name	= "__driver-ecb-aes-" MODE,
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_blkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= crypto_aes_set_key,
+		.encrypt	= ecb_encrypt,
+		.decrypt	= ecb_decrypt,
+	},
+}, {
+	.cra_name		= "__cbc-aes-" MODE,
+	.cra_driver_name	= "__driver-cbc-aes-" MODE,
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_blkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= crypto_aes_set_key,
+		.encrypt	= cbc_encrypt,
+		.decrypt	= cbc_decrypt,
+	},
+}, {
+	.cra_name		= "__ctr-aes-" MODE,
+	.cra_driver_name	= "__driver-ctr-aes-" MODE,
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_blkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= crypto_aes_set_key,
+		.encrypt	= ctr_encrypt,
+		.decrypt	= ctr_encrypt,
+	},
+}, {
+	.cra_name		= "__xts-aes-" MODE,
+	.cra_driver_name	= "__driver-xts-aes-" MODE,
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_aes_xts_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_blkcipher = {
+		.min_keysize	= 2 * AES_MIN_KEY_SIZE,
+		.max_keysize	= 2 * AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= xts_set_key,
+		.encrypt	= xts_encrypt,
+		.decrypt	= xts_decrypt,
+	},
+}, {
+	.cra_name		= "ecb(aes)",
+	.cra_driver_name	= "ecb-aes-" MODE,
+	.cra_priority		= PRIO,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_ablkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= ablk_set_key,
+		.encrypt	= ablk_encrypt,
+		.decrypt	= ablk_decrypt,
+	}
+}, {
+	.cra_name		= "cbc(aes)",
+	.cra_driver_name	= "cbc-aes-" MODE,
+	.cra_priority		= PRIO,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_ablkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= ablk_set_key,
+		.encrypt	= ablk_encrypt,
+		.decrypt	= ablk_decrypt,
+	}
+}, {
+	.cra_name		= "ctr(aes)",
+	.cra_driver_name	= "ctr-aes-" MODE,
+	.cra_priority		= PRIO,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_ablkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= ablk_set_key,
+		.encrypt	= ablk_encrypt,
+		.decrypt	= ablk_decrypt,
+	}
+}, {
+	.cra_name		= "xts(aes)",
+	.cra_driver_name	= "xts-aes-" MODE,
+	.cra_priority		= PRIO,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_ablkcipher = {
+		.min_keysize	= 2 * AES_MIN_KEY_SIZE,
+		.max_keysize	= 2 * AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= ablk_set_key,
+		.encrypt	= ablk_encrypt,
+		.decrypt	= ablk_decrypt,
+	}
+} };
+
+static int __init aes_init(void)
+{
+	return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs));
+}
+
+static void __exit aes_exit(void)
+{
+	crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs));
+}
+
+#ifdef USE_V8_CRYPTO_EXTENSIONS
+module_cpu_feature_match(AES, aes_init);
+#else
+module_init(aes_init);
+#endif
+module_exit(aes_exit);
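
ctr_encrypt() treats walk.iv as a 16-byte big-endian counter and relies on aes_ctr_encrypt() to propagate the carry from the low 64 bits into the high ones when they wrap (the .Lctrinc path in aes-modes.S). A self-contained sketch of that big-endian increment, using a hypothetical helper rather than the kernel interfaces above:

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Increment a 128-bit big-endian counter, rippling the carry upwards. */
static void ctr_inc_be128(uint8_t ctr[16])
{
	for (int i = 15; i >= 0; i--)
		if (++ctr[i] != 0)	/* stop as soon as there is no carry */
			break;
}

int main(void)
{
	uint8_t ctr[16] = { 0 }, expect[16] = { 0 };

	memset(&ctr[8], 0xff, 8);	/* low 64 bits about to overflow */
	ctr[7] = 0x01;
	expect[7] = 0x02;		/* the carry must reach the high half */

	ctr_inc_be128(ctr);
	assert(memcmp(ctr, expect, 16) == 0);
	return 0;
}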

+ 532 - 0
arch/arm64/crypto/aes-modes.S

@@ -0,0 +1,532 @@
+/*
+ * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* included by aes-ce.S and aes-neon.S */
+
+	.text
+	.align		4
+
+/*
+ * There are several ways to instantiate this code:
+ * - no interleave, all inline
+ * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
+ * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
+ * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
+ * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
+ *
+ * Macros imported by this code:
+ * - enc_prepare	- setup NEON registers for encryption
+ * - dec_prepare	- setup NEON registers for decryption
+ * - enc_switch_key	- change to new key after having prepared for encryption
+ * - encrypt_block	- encrypt a single block
+ * - decrypt_block	- decrypt a single block
+ * - encrypt_block2x	- encrypt 2 blocks in parallel (if INTERLEAVE == 2)
+ * - decrypt_block2x	- decrypt 2 blocks in parallel (if INTERLEAVE == 2)
+ * - encrypt_block4x	- encrypt 4 blocks in parallel (if INTERLEAVE == 4)
+ * - decrypt_block4x	- decrypt 4 blocks in parallel (if INTERLEAVE == 4)
+ */
+
+#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
+#define FRAME_PUSH	stp x29, x30, [sp,#-16]! ; mov x29, sp
+#define FRAME_POP	ldp x29, x30, [sp],#16
+
+#if INTERLEAVE == 2
+
+aes_encrypt_block2x:
+	encrypt_block2x	v0, v1, w3, x2, x6, w7
+	ret
+ENDPROC(aes_encrypt_block2x)
+
+aes_decrypt_block2x:
+	decrypt_block2x	v0, v1, w3, x2, x6, w7
+	ret
+ENDPROC(aes_decrypt_block2x)
+
+#elif INTERLEAVE == 4
+
+aes_encrypt_block4x:
+	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
+	ret
+ENDPROC(aes_encrypt_block4x)
+
+aes_decrypt_block4x:
+	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
+	ret
+ENDPROC(aes_decrypt_block4x)
+
+#else
+#error INTERLEAVE should equal 2 or 4
+#endif
+
+	.macro		do_encrypt_block2x
+	bl		aes_encrypt_block2x
+	.endm
+
+	.macro		do_decrypt_block2x
+	bl		aes_decrypt_block2x
+	.endm
+
+	.macro		do_encrypt_block4x
+	bl		aes_encrypt_block4x
+	.endm
+
+	.macro		do_decrypt_block4x
+	bl		aes_decrypt_block4x
+	.endm
+
+#else
+#define FRAME_PUSH
+#define FRAME_POP
+
+	.macro		do_encrypt_block2x
+	encrypt_block2x	v0, v1, w3, x2, x6, w7
+	.endm
+
+	.macro		do_decrypt_block2x
+	decrypt_block2x	v0, v1, w3, x2, x6, w7
+	.endm
+
+	.macro		do_encrypt_block4x
+	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
+	.endm
+
+	.macro		do_decrypt_block4x
+	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
+	.endm
+
+#endif
+
+	/*
+	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		   int blocks, int first)
+	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		   int blocks, int first)
+	 */
+
+AES_ENTRY(aes_ecb_encrypt)
+	FRAME_PUSH
+	cbz		w5, .LecbencloopNx
+
+	enc_prepare	w3, x2, x5
+
+.LecbencloopNx:
+#if INTERLEAVE >= 2
+	subs		w4, w4, #INTERLEAVE
+	bmi		.Lecbenc1x
+#if INTERLEAVE == 2
+	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
+	do_encrypt_block2x
+	st1		{v0.16b-v1.16b}, [x0], #32
+#else
+	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
+	do_encrypt_block4x
+	st1		{v0.16b-v3.16b}, [x0], #64
+#endif
+	b		.LecbencloopNx
+.Lecbenc1x:
+	adds		w4, w4, #INTERLEAVE
+	beq		.Lecbencout
+#endif
+.Lecbencloop:
+	ld1		{v0.16b}, [x1], #16		/* get next pt block */
+	encrypt_block	v0, w3, x2, x5, w6
+	st1		{v0.16b}, [x0], #16
+	subs		w4, w4, #1
+	bne		.Lecbencloop
+.Lecbencout:
+	FRAME_POP
+	ret
+AES_ENDPROC(aes_ecb_encrypt)
+
+
+AES_ENTRY(aes_ecb_decrypt)
+	FRAME_PUSH
+	cbz		w5, .LecbdecloopNx
+
+	dec_prepare	w3, x2, x5
+
+.LecbdecloopNx:
+#if INTERLEAVE >= 2
+	subs		w4, w4, #INTERLEAVE
+	bmi		.Lecbdec1x
+#if INTERLEAVE == 2
+	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
+	do_decrypt_block2x
+	st1		{v0.16b-v1.16b}, [x0], #32
+#else
+	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
+	do_decrypt_block4x
+	st1		{v0.16b-v3.16b}, [x0], #64
+#endif
+	b		.LecbdecloopNx
+.Lecbdec1x:
+	adds		w4, w4, #INTERLEAVE
+	beq		.Lecbdecout
+#endif
+.Lecbdecloop:
+	ld1		{v0.16b}, [x1], #16		/* get next ct block */
+	decrypt_block	v0, w3, x2, x5, w6
+	st1		{v0.16b}, [x0], #16
+	subs		w4, w4, #1
+	bne		.Lecbdecloop
+.Lecbdecout:
+	FRAME_POP
+	ret
+AES_ENDPROC(aes_ecb_decrypt)
+
+
+	/*
+	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		   int blocks, u8 iv[], int first)
+	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		   int blocks, u8 iv[], int first)
+	 */
+
+AES_ENTRY(aes_cbc_encrypt)
+	cbz		w6, .Lcbcencloop
+
+	ld1		{v0.16b}, [x5]			/* get iv */
+	enc_prepare	w3, x2, x5
+
+.Lcbcencloop:
+	ld1		{v1.16b}, [x1], #16		/* get next pt block */
+	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with iv */
+	encrypt_block	v0, w3, x2, x5, w6
+	st1		{v0.16b}, [x0], #16
+	subs		w4, w4, #1
+	bne		.Lcbcencloop
+	ret
+AES_ENDPROC(aes_cbc_encrypt)
+
+
+AES_ENTRY(aes_cbc_decrypt)
+	FRAME_PUSH
+	cbz		w6, .LcbcdecloopNx
+
+	ld1		{v7.16b}, [x5]			/* get iv */
+	dec_prepare	w3, x2, x5
+
+.LcbcdecloopNx:
+#if INTERLEAVE >= 2
+	subs		w4, w4, #INTERLEAVE
+	bmi		.Lcbcdec1x
+#if INTERLEAVE == 2
+	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
+	mov		v2.16b, v0.16b
+	mov		v3.16b, v1.16b
+	do_decrypt_block2x
+	eor		v0.16b, v0.16b, v7.16b
+	eor		v1.16b, v1.16b, v2.16b
+	mov		v7.16b, v3.16b
+	st1		{v0.16b-v1.16b}, [x0], #32
+#else
+	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
+	mov		v4.16b, v0.16b
+	mov		v5.16b, v1.16b
+	mov		v6.16b, v2.16b
+	do_decrypt_block4x
+	sub		x1, x1, #16
+	eor		v0.16b, v0.16b, v7.16b
+	eor		v1.16b, v1.16b, v4.16b
+	ld1		{v7.16b}, [x1], #16		/* reload 1 ct block */
+	eor		v2.16b, v2.16b, v5.16b
+	eor		v3.16b, v3.16b, v6.16b
+	st1		{v0.16b-v3.16b}, [x0], #64
+#endif
+	b		.LcbcdecloopNx
+.Lcbcdec1x:
+	adds		w4, w4, #INTERLEAVE
+	beq		.Lcbcdecout
+#endif
+.Lcbcdecloop:
+	ld1		{v1.16b}, [x1], #16		/* get next ct block */
+	mov		v0.16b, v1.16b			/* ...and copy to v0 */
+	decrypt_block	v0, w3, x2, x5, w6
+	eor		v0.16b, v0.16b, v7.16b		/* xor with iv => pt */
+	mov		v7.16b, v1.16b			/* ct is next iv */
+	st1		{v0.16b}, [x0], #16
+	subs		w4, w4, #1
+	bne		.Lcbcdecloop
+.Lcbcdecout:
+	FRAME_POP
+	ret
+AES_ENDPROC(aes_cbc_decrypt)
+
+
+	/*
+	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		   int blocks, u8 ctr[], int first)
+	 */
+
+AES_ENTRY(aes_ctr_encrypt)
+	FRAME_PUSH
+	cbnz		w6, .Lctrfirst		/* 1st time around? */
+	umov		x5, v4.d[1]		/* keep swabbed ctr in reg */
+	rev		x5, x5
+#if INTERLEAVE >= 2
+	cmn		w5, w4			/* 32 bit overflow? */
+	bcs		.Lctrinc
+	add		x5, x5, #1		/* increment BE ctr */
+	b		.LctrincNx
+#else
+	b		.Lctrinc
+#endif
+.Lctrfirst:
+	enc_prepare	w3, x2, x6
+	ld1		{v4.16b}, [x5]
+	umov		x5, v4.d[1]		/* keep swabbed ctr in reg */
+	rev		x5, x5
+#if INTERLEAVE >= 2
+	cmn		w5, w4			/* 32 bit overflow? */
+	bcs		.Lctrloop
+.LctrloopNx:
+	subs		w4, w4, #INTERLEAVE
+	bmi		.Lctr1x
+#if INTERLEAVE == 2
+	mov		v0.8b, v4.8b
+	mov		v1.8b, v4.8b
+	rev		x7, x5
+	add		x5, x5, #1
+	ins		v0.d[1], x7
+	rev		x7, x5
+	add		x5, x5, #1
+	ins		v1.d[1], x7
+	ld1		{v2.16b-v3.16b}, [x1], #32	/* get 2 input blocks */
+	do_encrypt_block2x
+	eor		v0.16b, v0.16b, v2.16b
+	eor		v1.16b, v1.16b, v3.16b
+	st1		{v0.16b-v1.16b}, [x0], #32
+#else
+	ldr		q8, =0x30000000200000001	/* addends 1,2,3[,0] */
+	dup		v7.4s, w5
+	mov		v0.16b, v4.16b
+	add		v7.4s, v7.4s, v8.4s
+	mov		v1.16b, v4.16b
+	rev32		v8.16b, v7.16b
+	mov		v2.16b, v4.16b
+	mov		v3.16b, v4.16b
+	mov		v1.s[3], v8.s[0]
+	mov		v2.s[3], v8.s[1]
+	mov		v3.s[3], v8.s[2]
+	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
+	do_encrypt_block4x
+	eor		v0.16b, v5.16b, v0.16b
+	ld1		{v5.16b}, [x1], #16		/* get 1 input block  */
+	eor		v1.16b, v6.16b, v1.16b
+	eor		v2.16b, v7.16b, v2.16b
+	eor		v3.16b, v5.16b, v3.16b
+	st1		{v0.16b-v3.16b}, [x0], #64
+	add		x5, x5, #INTERLEAVE
+#endif
+	cbz		w4, .LctroutNx
+.LctrincNx:
+	rev		x7, x5
+	ins		v4.d[1], x7
+	b		.LctrloopNx
+.LctroutNx:
+	sub		x5, x5, #1
+	rev		x7, x5
+	ins		v4.d[1], x7
+	b		.Lctrout
+.Lctr1x:
+	adds		w4, w4, #INTERLEAVE
+	beq		.Lctrout
+#endif
+.Lctrloop:
+	mov		v0.16b, v4.16b
+	encrypt_block	v0, w3, x2, x6, w7
+	subs		w4, w4, #1
+	bmi		.Lctrhalfblock		/* blocks < 0 means 1/2 block */
+	ld1		{v3.16b}, [x1], #16
+	eor		v3.16b, v0.16b, v3.16b
+	st1		{v3.16b}, [x0], #16
+	beq		.Lctrout
+.Lctrinc:
+	adds		x5, x5, #1		/* increment BE ctr */
+	rev		x7, x5
+	ins		v4.d[1], x7
+	bcc		.Lctrloop		/* no overflow? */
+	umov		x7, v4.d[0]		/* load upper word of ctr  */
+	rev		x7, x7			/* ... to handle the carry */
+	add		x7, x7, #1
+	rev		x7, x7
+	ins		v4.d[0], x7
+	b		.Lctrloop
+.Lctrhalfblock:
+	ld1		{v3.8b}, [x1]
+	eor		v3.8b, v0.8b, v3.8b
+	st1		{v3.8b}, [x0]
+.Lctrout:
+	FRAME_POP
+	ret
+AES_ENDPROC(aes_ctr_encrypt)
+	.ltorg
+
+
+	/*
+	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
+	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
+	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
+	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
+	 */
+
+	.macro		next_tweak, out, in, const, tmp
+	sshr		\tmp\().2d,  \in\().2d,   #63
+	and		\tmp\().16b, \tmp\().16b, \const\().16b
+	add		\out\().2d,  \in\().2d,   \in\().2d
+	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
+	eor		\out\().16b, \out\().16b, \tmp\().16b
+	.endm
+
+.Lxts_mul_x:
+	.word		1, 0, 0x87, 0
+
+AES_ENTRY(aes_xts_encrypt)
+	FRAME_PUSH
+	cbz		w7, .LxtsencloopNx
+
+	ld1		{v4.16b}, [x6]
+	enc_prepare	w3, x5, x6
+	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
+	enc_switch_key	w3, x2, x6
+	ldr		q7, .Lxts_mul_x
+	b		.LxtsencNx
+
+.LxtsencloopNx:
+	ldr		q7, .Lxts_mul_x
+	next_tweak	v4, v4, v7, v8
+.LxtsencNx:
+#if INTERLEAVE >= 2
+	subs		w4, w4, #INTERLEAVE
+	bmi		.Lxtsenc1x
+#if INTERLEAVE == 2
+	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
+	next_tweak	v5, v4, v7, v8
+	eor		v0.16b, v0.16b, v4.16b
+	eor		v1.16b, v1.16b, v5.16b
+	do_encrypt_block2x
+	eor		v0.16b, v0.16b, v4.16b
+	eor		v1.16b, v1.16b, v5.16b
+	st1		{v0.16b-v1.16b}, [x0], #32
+	cbz		w4, .LxtsencoutNx
+	next_tweak	v4, v5, v7, v8
+	b		.LxtsencNx
+.LxtsencoutNx:
+	mov		v4.16b, v5.16b
+	b		.Lxtsencout
+#else
+	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
+	next_tweak	v5, v4, v7, v8
+	eor		v0.16b, v0.16b, v4.16b
+	next_tweak	v6, v5, v7, v8
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v2.16b, v2.16b, v6.16b
+	next_tweak	v7, v6, v7, v8
+	eor		v3.16b, v3.16b, v7.16b
+	do_encrypt_block4x
+	eor		v3.16b, v3.16b, v7.16b
+	eor		v0.16b, v0.16b, v4.16b
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v2.16b, v2.16b, v6.16b
+	st1		{v0.16b-v3.16b}, [x0], #64
+	mov		v4.16b, v7.16b
+	cbz		w4, .Lxtsencout
+	b		.LxtsencloopNx
+#endif
+.Lxtsenc1x:
+	adds		w4, w4, #INTERLEAVE
+	beq		.Lxtsencout
+#endif
+.Lxtsencloop:
+	ld1		{v1.16b}, [x1], #16
+	eor		v0.16b, v1.16b, v4.16b
+	encrypt_block	v0, w3, x2, x6, w7
+	eor		v0.16b, v0.16b, v4.16b
+	st1		{v0.16b}, [x0], #16
+	subs		w4, w4, #1
+	beq		.Lxtsencout
+	next_tweak	v4, v4, v7, v8
+	b		.Lxtsencloop
+.Lxtsencout:
+	FRAME_POP
+	ret
+AES_ENDPROC(aes_xts_encrypt)
+
+
+AES_ENTRY(aes_xts_decrypt)
+	FRAME_PUSH
+	cbz		w7, .LxtsdecloopNx
+
+	ld1		{v4.16b}, [x6]
+	enc_prepare	w3, x5, x6
+	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
+	dec_prepare	w3, x2, x6
+	ldr		q7, .Lxts_mul_x
+	b		.LxtsdecNx
+
+.LxtsdecloopNx:
+	ldr		q7, .Lxts_mul_x
+	next_tweak	v4, v4, v7, v8
+.LxtsdecNx:
+#if INTERLEAVE >= 2
+	subs		w4, w4, #INTERLEAVE
+	bmi		.Lxtsdec1x
+#if INTERLEAVE == 2
+	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
+	next_tweak	v5, v4, v7, v8
+	eor		v0.16b, v0.16b, v4.16b
+	eor		v1.16b, v1.16b, v5.16b
+	do_decrypt_block2x
+	eor		v0.16b, v0.16b, v4.16b
+	eor		v1.16b, v1.16b, v5.16b
+	st1		{v0.16b-v1.16b}, [x0], #32
+	cbz		w4, .LxtsdecoutNx
+	next_tweak	v4, v5, v7, v8
+	b		.LxtsdecNx
+.LxtsdecoutNx:
+	mov		v4.16b, v5.16b
+	b		.Lxtsdecout
+#else
+	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
+	next_tweak	v5, v4, v7, v8
+	eor		v0.16b, v0.16b, v4.16b
+	next_tweak	v6, v5, v7, v8
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v2.16b, v2.16b, v6.16b
+	next_tweak	v7, v6, v7, v8
+	eor		v3.16b, v3.16b, v7.16b
+	do_decrypt_block4x
+	eor		v3.16b, v3.16b, v7.16b
+	eor		v0.16b, v0.16b, v4.16b
+	eor		v1.16b, v1.16b, v5.16b
+	eor		v2.16b, v2.16b, v6.16b
+	st1		{v0.16b-v3.16b}, [x0], #64
+	mov		v4.16b, v7.16b
+	cbz		w4, .Lxtsdecout
+	b		.LxtsdecloopNx
+#endif
+.Lxtsdec1x:
+	adds		w4, w4, #INTERLEAVE
+	beq		.Lxtsdecout
+#endif
+.Lxtsdecloop:
+	ld1		{v1.16b}, [x1], #16
+	eor		v0.16b, v1.16b, v4.16b
+	decrypt_block	v0, w3, x2, x6, w7
+	eor		v0.16b, v0.16b, v4.16b
+	st1		{v0.16b}, [x0], #16
+	subs		w4, w4, #1
+	beq		.Lxtsdecout
+	next_tweak	v4, v4, v7, v8
+	b		.Lxtsdecloop
+.Lxtsdecout:
+	FRAME_POP
+	ret
+AES_ENDPROC(aes_xts_decrypt)
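
The next_tweak macro multiplies the XTS tweak by x in GF(2^128): the 128-bit value is shifted left by one bit and, when bit 127 falls out, reduced with the 0x87 constant kept in .Lxts_mul_x. A reference sketch of the same doubling on a plain little-endian byte array; xts_next_tweak() is a hypothetical helper, not part of the patch:

#include <assert.h>
#include <stdint.h>

/* Double the tweak in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1. */
static void xts_next_tweak(uint8_t t[16])
{
	unsigned int carry = 0;

	for (int i = 0; i < 16; i++) {
		unsigned int b = t[i];

		t[i] = (uint8_t)((b << 1) | carry);
		carry = b >> 7;
	}
	if (carry)
		t[0] ^= 0x87;		/* fold the carry back in */
}

int main(void)
{
	uint8_t t[16] = { 0 };

	t[15] = 0x80;			/* bit 127 set: doubling must reduce */
	xts_next_tweak(t);
	assert(t[0] == 0x87 && t[15] == 0x00);
	return 0;
}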

+ 382 - 0
arch/arm64/crypto/aes-neon.S

@@ -0,0 +1,382 @@
+/*
+ * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
+ *
+ * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+#define AES_ENTRY(func)		ENTRY(neon_ ## func)
+#define AES_ENDPROC(func)	ENDPROC(neon_ ## func)
+
+	/* multiply by polynomial 'x' in GF(2^8) */
+	.macro		mul_by_x, out, in, temp, const
+	sshr		\temp, \in, #7
+	add		\out, \in, \in
+	and		\temp, \temp, \const
+	eor		\out, \out, \temp
+	.endm
+
+	/* preload the entire Sbox */
+	.macro		prepare, sbox, shiftrows, temp
+	adr		\temp, \sbox
+	movi		v12.16b, #0x40
+	ldr		q13, \shiftrows
+	movi		v14.16b, #0x1b
+	ld1		{v16.16b-v19.16b}, [\temp], #64
+	ld1		{v20.16b-v23.16b}, [\temp], #64
+	ld1		{v24.16b-v27.16b}, [\temp], #64
+	ld1		{v28.16b-v31.16b}, [\temp]
+	.endm
+
+	/* do preload for encryption */
+	.macro		enc_prepare, ignore0, ignore1, temp
+	prepare		.LForward_Sbox, .LForward_ShiftRows, \temp
+	.endm
+
+	.macro		enc_switch_key, ignore0, ignore1, temp
+	/* do nothing */
+	.endm
+
+	/* do preload for decryption */
+	.macro		dec_prepare, ignore0, ignore1, temp
+	prepare		.LReverse_Sbox, .LReverse_ShiftRows, \temp
+	.endm
+
+	/* apply SubBytes transformation using the preloaded Sbox */
+	.macro		sub_bytes, in
+	sub		v9.16b, \in\().16b, v12.16b
+	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
+	sub		v10.16b, v9.16b, v12.16b
+	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
+	sub		v11.16b, v10.16b, v12.16b
+	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
+	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
+	.endm
+
+	/* apply MixColumns transformation */
+	.macro		mix_columns, in
+	mul_by_x	v10.16b, \in\().16b, v9.16b, v14.16b
+	rev32		v8.8h, \in\().8h
+	eor		\in\().16b, v10.16b, \in\().16b
+	shl		v9.4s, v8.4s, #24
+	shl		v11.4s, \in\().4s, #24
+	sri		v9.4s, v8.4s, #8
+	sri		v11.4s, \in\().4s, #8
+	eor		v9.16b, v9.16b, v8.16b
+	eor		v10.16b, v10.16b, v9.16b
+	eor		\in\().16b, v10.16b, v11.16b
+	.endm
+
+	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
+	.macro		inv_mix_columns, in
+	mul_by_x	v11.16b, \in\().16b, v10.16b, v14.16b
+	mul_by_x	v11.16b, v11.16b, v10.16b, v14.16b
+	eor		\in\().16b, \in\().16b, v11.16b
+	rev32		v11.8h, v11.8h
+	eor		\in\().16b, \in\().16b, v11.16b
+	mix_columns	\in
+	.endm
+
+	.macro		do_block, enc, in, rounds, rk, rkp, i
+	ld1		{v15.16b}, [\rk]
+	add		\rkp, \rk, #16
+	mov		\i, \rounds
+1111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
+	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
+	sub_bytes	\in
+	ld1		{v15.16b}, [\rkp], #16
+	subs		\i, \i, #1
+	beq		2222f
+	.if		\enc == 1
+	mix_columns	\in
+	.else
+	inv_mix_columns	\in
+	.endif
+	b		1111b
+2222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
+	.endm
+
+	.macro		encrypt_block, in, rounds, rk, rkp, i
+	do_block	1, \in, \rounds, \rk, \rkp, \i
+	.endm
+
+	.macro		decrypt_block, in, rounds, rk, rkp, i
+	do_block	0, \in, \rounds, \rk, \rkp, \i
+	.endm
+
+	/*
+	 * Interleaved versions: functionally equivalent to the
+	 * ones above, but applied to 2 or 4 AES states in parallel.
+	 */
+
+	.macro		sub_bytes_2x, in0, in1
+	sub		v8.16b, \in0\().16b, v12.16b
+	sub		v9.16b, \in1\().16b, v12.16b
+	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
+	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
+	sub		v10.16b, v8.16b, v12.16b
+	sub		v11.16b, v9.16b, v12.16b
+	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
+	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
+	sub		v8.16b, v10.16b, v12.16b
+	sub		v9.16b, v11.16b, v12.16b
+	tbx		\in0\().16b, {v24.16b-v27.16b}, v10.16b
+	tbx		\in1\().16b, {v24.16b-v27.16b}, v11.16b
+	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
+	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
+	.endm
+
+	.macro		sub_bytes_4x, in0, in1, in2, in3
+	sub		v8.16b, \in0\().16b, v12.16b
+	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
+	sub		v9.16b, \in1\().16b, v12.16b
+	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
+	sub		v10.16b, \in2\().16b, v12.16b
+	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
+	sub		v11.16b, \in3\().16b, v12.16b
+	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
+	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
+	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
+	sub		v8.16b, v8.16b, v12.16b
+	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
+	sub		v9.16b, v9.16b, v12.16b
+	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
+	sub		v10.16b, v10.16b, v12.16b
+	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
+	sub		v11.16b, v11.16b, v12.16b
+	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
+	sub		v8.16b, v8.16b, v12.16b
+	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
+	sub		v9.16b, v9.16b, v12.16b
+	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
+	sub		v10.16b, v10.16b, v12.16b
+	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
+	sub		v11.16b, v11.16b, v12.16b
+	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
+	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
+	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
+	.endm
+
+	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
+	sshr		\tmp0\().16b, \in0\().16b,  #7
+	add		\out0\().16b, \in0\().16b,  \in0\().16b
+	sshr		\tmp1\().16b, \in1\().16b,  #7
+	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
+	add		\out1\().16b, \in1\().16b,  \in1\().16b
+	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
+	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
+	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
+	.endm
+
+	.macro		mix_columns_2x, in0, in1
+	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
+	rev32		v10.8h, \in0\().8h
+	rev32		v11.8h, \in1\().8h
+	eor		\in0\().16b, v8.16b, \in0\().16b
+	eor		\in1\().16b, v9.16b, \in1\().16b
+	shl		v12.4s, v10.4s, #24
+	shl		v13.4s, v11.4s, #24
+	eor		v8.16b, v8.16b, v10.16b
+	sri		v12.4s, v10.4s, #8
+	shl		v10.4s, \in0\().4s, #24
+	eor		v9.16b, v9.16b, v11.16b
+	sri		v13.4s, v11.4s, #8
+	shl		v11.4s, \in1\().4s, #24
+	sri		v10.4s, \in0\().4s, #8
+	eor		\in0\().16b, v8.16b, v12.16b
+	sri		v11.4s, \in1\().4s, #8
+	eor		\in1\().16b, v9.16b, v13.16b
+	eor		\in0\().16b, v10.16b, \in0\().16b
+	eor		\in1\().16b, v11.16b, \in1\().16b
+	.endm
+
+	.macro		inv_mix_cols_2x, in0, in1
+	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
+	mul_by_x_2x	v8, v9, v8, v9, v10, v11, v14
+	eor		\in0\().16b, \in0\().16b, v8.16b
+	eor		\in1\().16b, \in1\().16b, v9.16b
+	rev32		v8.8h, v8.8h
+	rev32		v9.8h, v9.8h
+	eor		\in0\().16b, \in0\().16b, v8.16b
+	eor		\in1\().16b, \in1\().16b, v9.16b
+	mix_columns_2x	\in0, \in1
+	.endm
+
+	.macro		inv_mix_cols_4x, in0, in1, in2, in3
+	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
+	mul_by_x_2x	v10, v11, \in2, \in3, v12, v13, v14
+	mul_by_x_2x	v8, v9, v8, v9, v12, v13, v14
+	mul_by_x_2x	v10, v11, v10, v11, v12, v13, v14
+	eor		\in0\().16b, \in0\().16b, v8.16b
+	eor		\in1\().16b, \in1\().16b, v9.16b
+	eor		\in2\().16b, \in2\().16b, v10.16b
+	eor		\in3\().16b, \in3\().16b, v11.16b
+	rev32		v8.8h, v8.8h
+	rev32		v9.8h, v9.8h
+	rev32		v10.8h, v10.8h
+	rev32		v11.8h, v11.8h
+	eor		\in0\().16b, \in0\().16b, v8.16b
+	eor		\in1\().16b, \in1\().16b, v9.16b
+	eor		\in2\().16b, \in2\().16b, v10.16b
+	eor		\in3\().16b, \in3\().16b, v11.16b
+	mix_columns_2x	\in0, \in1
+	mix_columns_2x	\in2, \in3
+	.endm
+
+	.macro		do_block_2x, enc, in0, in1, rounds, rk, rkp, i
+	ld1		{v15.16b}, [\rk]
+	add		\rkp, \rk, #16
+	mov		\i, \rounds
+1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
+	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
+	sub_bytes_2x	\in0, \in1
+	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
+	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
+	ld1		{v15.16b}, [\rkp], #16
+	subs		\i, \i, #1
+	beq		2222f
+	.if		\enc == 1
+	mix_columns_2x	\in0, \in1
+	ldr		q13, .LForward_ShiftRows
+	.else
+	inv_mix_cols_2x	\in0, \in1
+	ldr		q13, .LReverse_ShiftRows
+	.endif
+	movi		v12.16b, #0x40
+	b		1111b
+2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
+	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
+	.endm
+
+	.macro		do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
+	ld1		{v15.16b}, [\rk]
+	add		\rkp, \rk, #16
+	mov		\i, \rounds
+1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
+	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
+	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
+	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
+	sub_bytes_4x	\in0, \in1, \in2, \in3
+	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
+	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
+	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
+	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
+	ld1		{v15.16b}, [\rkp], #16
+	subs		\i, \i, #1
+	beq		2222f
+	.if		\enc == 1
+	mix_columns_2x	\in0, \in1
+	mix_columns_2x	\in2, \in3
+	ldr		q13, .LForward_ShiftRows
+	.else
+	inv_mix_cols_4x	\in0, \in1, \in2, \in3
+	ldr		q13, .LReverse_ShiftRows
+	.endif
+	movi		v12.16b, #0x40
+	b		1111b
+2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
+	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
+	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
+	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
+	.endm
+
+	.macro		encrypt_block2x, in0, in1, rounds, rk, rkp, i
+	do_block_2x	1, \in0, \in1, \rounds, \rk, \rkp, \i
+	.endm
+
+	.macro		decrypt_block2x, in0, in1, rounds, rk, rkp, i
+	do_block_2x	0, \in0, \in1, \rounds, \rk, \rkp, \i
+	.endm
+
+	.macro		encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
+	do_block_4x	1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
+	.endm
+
+	.macro		decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
+	do_block_4x	0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
+	.endm
+
+#include "aes-modes.S"
+
+	.text
+	.align		4
+.LForward_ShiftRows:
+	.byte		0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3
+	.byte		0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb
+
+.LReverse_ShiftRows:
+	.byte		0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb
+	.byte		0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3
+
+.LForward_Sbox:
+	.byte		0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+	.byte		0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+	.byte		0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+	.byte		0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+	.byte		0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+	.byte		0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+	.byte		0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+	.byte		0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+	.byte		0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+	.byte		0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+	.byte		0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+	.byte		0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+	.byte		0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+	.byte		0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+	.byte		0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+	.byte		0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+	.byte		0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+	.byte		0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+	.byte		0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+	.byte		0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+	.byte		0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+	.byte		0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+	.byte		0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+	.byte		0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+	.byte		0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+	.byte		0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+	.byte		0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+	.byte		0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+	.byte		0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+	.byte		0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+	.byte		0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+	.byte		0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+
+.LReverse_Sbox:
+	.byte		0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+	.byte		0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+	.byte		0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+	.byte		0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+	.byte		0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+	.byte		0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+	.byte		0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+	.byte		0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+	.byte		0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+	.byte		0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+	.byte		0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+	.byte		0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+	.byte		0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+	.byte		0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+	.byte		0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+	.byte		0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+	.byte		0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+	.byte		0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+	.byte		0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+	.byte		0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+	.byte		0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+	.byte		0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+	.byte		0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+	.byte		0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+	.byte		0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+	.byte		0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+	.byte		0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+	.byte		0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+	.byte		0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+	.byte		0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+	.byte		0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+	.byte		0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d

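As a quick illustration of the GF(2^8) doubling performed by the mul_by_x macro above (sign-extend the top bit, double, and conditionally fold in the reduction constant 0x1b), here is a hypothetical stand-alone C sketch. It is not part of the patch; the function name mul_by_x merely mirrors the macro, and the reference loop in main() cross-checks it against the textbook definition.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint8_t mul_by_x(uint8_t in)
{
	/* 0xff if the top bit of 'in' is set, 0x00 otherwise (like sshr #7) */
	uint8_t mask = (uint8_t)((int8_t)in >> 7);

	/* (in << 1), conditionally xor-ing the AES reduction constant 0x1b */
	return (uint8_t)((in << 1) ^ (mask & 0x1b));
}

int main(void)
{
	/* cross-check against the naive definition for every byte value */
	for (unsigned int i = 0; i < 256; i++) {
		uint8_t ref = (uint8_t)((i << 1) ^ ((i & 0x80) ? 0x1b : 0));

		assert(mul_by_x((uint8_t)i) == ref);
	}
	printf("mul_by_x matches the reference for all 256 inputs\n");
	return 0;
}
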
+ 95 - 0
arch/arm64/crypto/ghash-ce-core.S

@@ -0,0 +1,95 @@
+/*
+ * Accelerated GHASH implementation with ARMv8 PMULL instructions.
+ *
+ * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * Based on arch/x86/crypto/ghash-clmulni-intel_asm.S
+ *
+ * Copyright (c) 2009 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *           Vinodh Gopal
+ *           Erdinc Ozturk
+ *           Deniz Karakoyunlu
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	DATA	.req	v0
+	SHASH	.req	v1
+	IN1	.req	v2
+	T1	.req	v2
+	T2	.req	v3
+	T3	.req	v4
+	VZR	.req	v5
+
+	.text
+	.arch		armv8-a+crypto
+
+	/*
+	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
+	 *			   struct ghash_key const *k, const char *head)
+	 */
+ENTRY(pmull_ghash_update)
+	ld1		{DATA.16b}, [x1]
+	ld1		{SHASH.16b}, [x3]
+	eor		VZR.16b, VZR.16b, VZR.16b
+
+	/* do the head block first, if supplied */
+	cbz		x4, 0f
+	ld1		{IN1.2d}, [x4]
+	b		1f
+
+0:	ld1		{IN1.2d}, [x2], #16
+	sub		w0, w0, #1
+1:	ext		IN1.16b, IN1.16b, IN1.16b, #8
+CPU_LE(	rev64		IN1.16b, IN1.16b	)
+	eor		DATA.16b, DATA.16b, IN1.16b
+
+	/* multiply DATA by SHASH in GF(2^128) */
+	ext		T2.16b, DATA.16b, DATA.16b, #8
+	ext		T3.16b, SHASH.16b, SHASH.16b, #8
+	eor		T2.16b, T2.16b, DATA.16b
+	eor		T3.16b, T3.16b, SHASH.16b
+
+	pmull2		T1.1q, SHASH.2d, DATA.2d	// a1 * b1
+	pmull		DATA.1q, SHASH.1d, DATA.1d	// a0 * b0
+	pmull		T2.1q, T2.1d, T3.1d		// (a1 + a0)(b1 + b0)
+	eor		T2.16b, T2.16b, T1.16b		// (a0 * b1) + (a1 * b0)
+	eor		T2.16b, T2.16b, DATA.16b
+
+	ext		T3.16b, VZR.16b, T2.16b, #8
+	ext		T2.16b, T2.16b, VZR.16b, #8
+	eor		DATA.16b, DATA.16b, T3.16b
+	eor		T1.16b, T1.16b, T2.16b	// <T1:DATA> is result of
+						// carry-less multiplication
+
+	/* first phase of the reduction */
+	shl		T3.2d, DATA.2d, #1
+	eor		T3.16b, T3.16b, DATA.16b
+	shl		T3.2d, T3.2d, #5
+	eor		T3.16b, T3.16b, DATA.16b
+	shl		T3.2d, T3.2d, #57
+	ext		T2.16b, VZR.16b, T3.16b, #8
+	ext		T3.16b, T3.16b, VZR.16b, #8
+	eor		DATA.16b, DATA.16b, T2.16b
+	eor		T1.16b, T1.16b, T3.16b
+
+	/* second phase of the reduction */
+	ushr		T2.2d, DATA.2d, #5
+	eor		T2.16b, T2.16b, DATA.16b
+	ushr		T2.2d, T2.2d, #1
+	eor		T2.16b, T2.16b, DATA.16b
+	ushr		T2.2d, T2.2d, #1
+	eor		T1.16b, T1.16b, T2.16b
+	eor		DATA.16b, DATA.16b, T1.16b
+
+	cbnz		w0, 0b
+
+	st1		{DATA.16b}, [x1]
+	ret
+ENDPROC(pmull_ghash_update)

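The pmull/pmull2 sequence above is a Karatsuba-style carry-less multiply: two PMULLs give the a0*b0 and a1*b1 partial products, and a third PMULL of (a1+a0)(b1+b0) yields the missing middle term after two XORs. The hypothetical C sketch below (helper name clmul16 and the sample operands are made up for illustration) demonstrates the same identity on 16-bit polynomials.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* naive carry-less multiplication of two 16-bit polynomials over GF(2) */
static uint32_t clmul16(uint16_t a, uint16_t b)
{
	uint32_t acc = 0;

	for (int i = 0; i < 16; i++)
		if (b & (1u << i))
			acc ^= (uint32_t)a << i;
	return acc;
}

int main(void)
{
	uint16_t a = 0xc2d3, b = 0x87b1;
	uint8_t a0 = a, a1 = a >> 8, b0 = b, b1 = b >> 8;

	uint32_t lo  = clmul16(a0, b0);			/* pmull:  a0 * b0 */
	uint32_t hi  = clmul16(a1, b1);			/* pmull2: a1 * b1 */
	uint32_t mid = clmul16(a1 ^ a0, b1 ^ b0);	/* (a1 + a0)(b1 + b0) */

	mid ^= lo ^ hi;					/* (a0 * b1) + (a1 * b0) */

	uint32_t karatsuba = (hi << 16) ^ (mid << 8) ^ lo;

	assert(karatsuba == clmul16(a, b));
	printf("Karatsuba carry-less product: 0x%08x\n", karatsuba);
	return 0;
}
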
+ 155 - 0
arch/arm64/crypto/ghash-ce-glue.c

@@ -0,0 +1,155 @@
+/*
+ * Accelerated GHASH implementation with ARMv8 PMULL instructions.
+ *
+ * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <asm/unaligned.h>
+#include <crypto/internal/hash.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+#define GHASH_BLOCK_SIZE	16
+#define GHASH_DIGEST_SIZE	16
+
+struct ghash_key {
+	u64 a;
+	u64 b;
+};
+
+struct ghash_desc_ctx {
+	u64 digest[GHASH_DIGEST_SIZE/sizeof(u64)];
+	u8 buf[GHASH_BLOCK_SIZE];
+	u32 count;
+};
+
+asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src,
+				   struct ghash_key const *k, const char *head);
+
+static int ghash_init(struct shash_desc *desc)
+{
+	struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	*ctx = (struct ghash_desc_ctx){};
+	return 0;
+}
+
+static int ghash_update(struct shash_desc *desc, const u8 *src,
+			unsigned int len)
+{
+	struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
+	unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
+
+	ctx->count += len;
+
+	if ((partial + len) >= GHASH_BLOCK_SIZE) {
+		struct ghash_key *key = crypto_shash_ctx(desc->tfm);
+		int blocks;
+
+		if (partial) {
+			int p = GHASH_BLOCK_SIZE - partial;
+
+			memcpy(ctx->buf + partial, src, p);
+			src += p;
+			len -= p;
+		}
+
+		blocks = len / GHASH_BLOCK_SIZE;
+		len %= GHASH_BLOCK_SIZE;
+
+		kernel_neon_begin_partial(6);
+		pmull_ghash_update(blocks, ctx->digest, src, key,
+				   partial ? ctx->buf : NULL);
+		kernel_neon_end();
+		src += blocks * GHASH_BLOCK_SIZE;
+	}
+	if (len)
+		memcpy(ctx->buf + partial, src, len);
+	return 0;
+}
+
+static int ghash_final(struct shash_desc *desc, u8 *dst)
+{
+	struct ghash_desc_ctx *ctx = shash_desc_ctx(desc);
+	unsigned int partial = ctx->count % GHASH_BLOCK_SIZE;
+
+	if (partial) {
+		struct ghash_key *key = crypto_shash_ctx(desc->tfm);
+
+		memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
+
+		kernel_neon_begin_partial(6);
+		pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL);
+		kernel_neon_end();
+	}
+	put_unaligned_be64(ctx->digest[1], dst);
+	put_unaligned_be64(ctx->digest[0], dst + 8);
+
+	*ctx = (struct ghash_desc_ctx){};
+	return 0;
+}
+
+static int ghash_setkey(struct crypto_shash *tfm,
+			const u8 *inkey, unsigned int keylen)
+{
+	struct ghash_key *key = crypto_shash_ctx(tfm);
+	u64 a, b;
+
+	if (keylen != GHASH_BLOCK_SIZE) {
+		crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+
+	/* perform multiplication by 'x' in GF(2^128) */
+	b = get_unaligned_be64(inkey);
+	a = get_unaligned_be64(inkey + 8);
+
+	key->a = (a << 1) | (b >> 63);
+	key->b = (b << 1) | (a >> 63);
+
+	if (b >> 63)
+		key->b ^= 0xc200000000000000UL;
+
+	return 0;
+}
+
+static struct shash_alg ghash_alg = {
+	.digestsize	= GHASH_DIGEST_SIZE,
+	.init		= ghash_init,
+	.update		= ghash_update,
+	.final		= ghash_final,
+	.setkey		= ghash_setkey,
+	.descsize	= sizeof(struct ghash_desc_ctx),
+	.base		= {
+		.cra_name		= "ghash",
+		.cra_driver_name	= "ghash-ce",
+		.cra_priority		= 200,
+		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize		= GHASH_BLOCK_SIZE,
+		.cra_ctxsize		= sizeof(struct ghash_key),
+		.cra_module		= THIS_MODULE,
+	},
+};
+
+static int __init ghash_ce_mod_init(void)
+{
+	return crypto_register_shash(&ghash_alg);
+}
+
+static void __exit ghash_ce_mod_exit(void)
+{
+	crypto_unregister_shash(&ghash_alg);
+}
+
+module_cpu_feature_match(PMULL, ghash_ce_mod_init);
+module_exit(ghash_ce_mod_exit);

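The setkey path above stores the hash key H pre-multiplied by 'x' in GF(2^128): the 128-bit value is rotated left by one bit and, when the bit that wrapped around was set, the reduction constant 0xc2 << 56 is xor-ed into the high half. The following user-space sketch (illustration only; the helper name ghash_derive_key and the sample key are invented here) restates that derivation outside the kernel so it can be poked at with concrete values.

#include <stdint.h>
#include <stdio.h>

struct ghash_key { uint64_t a; uint64_t b; };

static void ghash_derive_key(struct ghash_key *key, uint64_t b, uint64_t a)
{
	/* b holds bits 127..64 of H, a holds bits 63..0 (big-endian loads) */
	key->a = (a << 1) | (b >> 63);
	key->b = (b << 1) | (a >> 63);

	if (b >> 63)
		key->b ^= 0xc200000000000000ULL;	/* GHASH reduction constant */
}

int main(void)
{
	struct ghash_key key;

	/* arbitrary example key with the top bit set, to exercise the xor */
	ghash_derive_key(&key, 0x8000000000000001ULL, 0x0000000000000002ULL);
	printf("key.a = %016llx, key.b = %016llx\n",
	       (unsigned long long)key.a, (unsigned long long)key.b);
	return 0;
}
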
+ 153 - 0
arch/arm64/crypto/sha1-ce-core.S

@@ -0,0 +1,153 @@
+/*
+ * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+	.arch		armv8-a+crypto
+
+	k0		.req	v0
+	k1		.req	v1
+	k2		.req	v2
+	k3		.req	v3
+
+	t0		.req	v4
+	t1		.req	v5
+
+	dga		.req	q6
+	dgav		.req	v6
+	dgb		.req	s7
+	dgbv		.req	v7
+
+	dg0q		.req	q12
+	dg0s		.req	s12
+	dg0v		.req	v12
+	dg1s		.req	s13
+	dg1v		.req	v13
+	dg2s		.req	s14
+
+	.macro		add_only, op, ev, rc, s0, dg1
+	.ifc		\ev, ev
+	add		t1.4s, v\s0\().4s, \rc\().4s
+	sha1h		dg2s, dg0s
+	.ifnb		\dg1
+	sha1\op		dg0q, \dg1, t0.4s
+	.else
+	sha1\op		dg0q, dg1s, t0.4s
+	.endif
+	.else
+	.ifnb		\s0
+	add		t0.4s, v\s0\().4s, \rc\().4s
+	.endif
+	sha1h		dg1s, dg0s
+	sha1\op		dg0q, dg2s, t1.4s
+	.endif
+	.endm
+
+	.macro		add_update, op, ev, rc, s0, s1, s2, s3, dg1
+	sha1su0		v\s0\().4s, v\s1\().4s, v\s2\().4s
+	add_only	\op, \ev, \rc, \s1, \dg1
+	sha1su1		v\s0\().4s, v\s3\().4s
+	.endm
+
+	/*
+	 * The SHA1 round constants
+	 */
+	.align		4
+.Lsha1_rcon:
+	.word		0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6
+
+	/*
+	 * void sha1_ce_transform(int blocks, u8 const *src, u32 *state,
+	 * 			  u8 *head, long bytes)
+	 */
+ENTRY(sha1_ce_transform)
+	/* load round constants */
+	adr		x6, .Lsha1_rcon
+	ld1r		{k0.4s}, [x6], #4
+	ld1r		{k1.4s}, [x6], #4
+	ld1r		{k2.4s}, [x6], #4
+	ld1r		{k3.4s}, [x6]
+
+	/* load state */
+	ldr		dga, [x2]
+	ldr		dgb, [x2, #16]
+
+	/* load partial state (if supplied) */
+	cbz		x3, 0f
+	ld1		{v8.4s-v11.4s}, [x3]
+	b		1f
+
+	/* load input */
+0:	ld1		{v8.4s-v11.4s}, [x1], #64
+	sub		w0, w0, #1
+
+1:
+CPU_LE(	rev32		v8.16b, v8.16b		)
+CPU_LE(	rev32		v9.16b, v9.16b		)
+CPU_LE(	rev32		v10.16b, v10.16b	)
+CPU_LE(	rev32		v11.16b, v11.16b	)
+
+2:	add		t0.4s, v8.4s, k0.4s
+	mov		dg0v.16b, dgav.16b
+
+	add_update	c, ev, k0,  8,  9, 10, 11, dgb
+	add_update	c, od, k0,  9, 10, 11,  8
+	add_update	c, ev, k0, 10, 11,  8,  9
+	add_update	c, od, k0, 11,  8,  9, 10
+	add_update	c, ev, k1,  8,  9, 10, 11
+
+	add_update	p, od, k1,  9, 10, 11,  8
+	add_update	p, ev, k1, 10, 11,  8,  9
+	add_update	p, od, k1, 11,  8,  9, 10
+	add_update	p, ev, k1,  8,  9, 10, 11
+	add_update	p, od, k2,  9, 10, 11,  8
+
+	add_update	m, ev, k2, 10, 11,  8,  9
+	add_update	m, od, k2, 11,  8,  9, 10
+	add_update	m, ev, k2,  8,  9, 10, 11
+	add_update	m, od, k2,  9, 10, 11,  8
+	add_update	m, ev, k3, 10, 11,  8,  9
+
+	add_update	p, od, k3, 11,  8,  9, 10
+	add_only	p, ev, k3,  9
+	add_only	p, od, k3, 10
+	add_only	p, ev, k3, 11
+	add_only	p, od
+
+	/* update state */
+	add		dgbv.2s, dgbv.2s, dg1v.2s
+	add		dgav.4s, dgav.4s, dg0v.4s
+
+	cbnz		w0, 0b
+
+	/*
+	 * Final block: add padding and total bit count.
+	 * Skip if we have no total byte count in x4. In that case, the input
+	 * size was not a round multiple of the block size, and the padding is
+	 * handled by the C code.
+	 */
+	cbz		x4, 3f
+	movi		v9.2d, #0
+	mov		x8, #0x80000000
+	movi		v10.2d, #0
+	ror		x7, x4, #29		// ror(lsl(x4, 3), 32)
+	fmov		d8, x8
+	mov		x4, #0
+	mov		v11.d[0], xzr
+	mov		v11.d[1], x7
+	b		2b
+
+	/* store new state */
+3:	str		dga, [x2]
+	str		dgb, [x2, #16]
+	ret
+ENDPROC(sha1_ce_transform)

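The final-block handling above encodes the bit count with a single rotate: the comment claims ror(x4, #29) equals ror(lsl(x4, 3), 32), i.e. the byte count times eight with its 32-bit halves swapped, which holds as long as the byte count fits in 61 bits. A small stand-alone check of that identity (illustration only; the ror64 helper is written here for the test):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t ror64(uint64_t x, unsigned int n)
{
	return (x >> n) | (x << (64 - n));
}

int main(void)
{
	uint64_t byte_counts[] = { 0, 1, 64, 4096, 0x123456789abULL };
	unsigned int i;

	for (i = 0; i < sizeof(byte_counts) / sizeof(byte_counts[0]); i++) {
		uint64_t c = byte_counts[i];

		/* bit count (c * 8) with the 32-bit words swapped */
		assert(ror64(c, 29) == ror64(c << 3, 32));
	}
	printf("ror(count, 29) == ror(count * 8, 32) for the sample counts\n");
	return 0;
}
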
+ 174 - 0
arch/arm64/crypto/sha1-ce-glue.c

@@ -0,0 +1,174 @@
+/*
+ * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <asm/unaligned.h>
+#include <crypto/internal/hash.h>
+#include <crypto/sha.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+asmlinkage void sha1_ce_transform(int blocks, u8 const *src, u32 *state,
+				  u8 *head, long bytes);
+
+static int sha1_init(struct shash_desc *desc)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+
+	*sctx = (struct sha1_state){
+		.state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
+	};
+	return 0;
+}
+
+static int sha1_update(struct shash_desc *desc, const u8 *data,
+		       unsigned int len)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
+
+	sctx->count += len;
+
+	if ((partial + len) >= SHA1_BLOCK_SIZE) {
+		int blocks;
+
+		if (partial) {
+			int p = SHA1_BLOCK_SIZE - partial;
+
+			memcpy(sctx->buffer + partial, data, p);
+			data += p;
+			len -= p;
+		}
+
+		blocks = len / SHA1_BLOCK_SIZE;
+		len %= SHA1_BLOCK_SIZE;
+
+		kernel_neon_begin_partial(16);
+		sha1_ce_transform(blocks, data, sctx->state,
+				  partial ? sctx->buffer : NULL, 0);
+		kernel_neon_end();
+
+		data += blocks * SHA1_BLOCK_SIZE;
+		partial = 0;
+	}
+	if (len)
+		memcpy(sctx->buffer + partial, data, len);
+	return 0;
+}
+
+static int sha1_final(struct shash_desc *desc, u8 *out)
+{
+	static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
+
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+	__be64 bits = cpu_to_be64(sctx->count << 3);
+	__be32 *dst = (__be32 *)out;
+	int i;
+
+	u32 padlen = SHA1_BLOCK_SIZE
+		     - ((sctx->count + sizeof(bits)) % SHA1_BLOCK_SIZE);
+
+	sha1_update(desc, padding, padlen);
+	sha1_update(desc, (const u8 *)&bits, sizeof(bits));
+
+	for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++)
+		put_unaligned_be32(sctx->state[i], dst++);
+
+	*sctx = (struct sha1_state){};
+	return 0;
+}
+
+static int sha1_finup(struct shash_desc *desc, const u8 *data,
+		      unsigned int len, u8 *out)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+	__be32 *dst = (__be32 *)out;
+	int blocks;
+	int i;
+
+	if (sctx->count || !len || (len % SHA1_BLOCK_SIZE)) {
+		sha1_update(desc, data, len);
+		return sha1_final(desc, out);
+	}
+
+	/*
+	 * Use a fast path if the input is a multiple of 64 bytes. In
+	 * this case, there is no need to copy data around, and we can
+	 * perform the entire digest calculation in a single invocation
+	 * of sha1_ce_transform()
+	 */
+	blocks = len / SHA1_BLOCK_SIZE;
+
+	kernel_neon_begin_partial(16);
+	sha1_ce_transform(blocks, data, sctx->state, NULL, len);
+	kernel_neon_end();
+
+	for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++)
+		put_unaligned_be32(sctx->state[i], dst++);
+
+	*sctx = (struct sha1_state){};
+	return 0;
+}
+
+static int sha1_export(struct shash_desc *desc, void *out)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+	struct sha1_state *dst = out;
+
+	*dst = *sctx;
+	return 0;
+}
+
+static int sha1_import(struct shash_desc *desc, const void *in)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+	struct sha1_state const *src = in;
+
+	*sctx = *src;
+	return 0;
+}
+
+static struct shash_alg alg = {
+	.init			= sha1_init,
+	.update			= sha1_update,
+	.final			= sha1_final,
+	.finup			= sha1_finup,
+	.export			= sha1_export,
+	.import			= sha1_import,
+	.descsize		= sizeof(struct sha1_state),
+	.digestsize		= SHA1_DIGEST_SIZE,
+	.statesize		= sizeof(struct sha1_state),
+	.base			= {
+		.cra_name		= "sha1",
+		.cra_driver_name	= "sha1-ce",
+		.cra_priority		= 200,
+		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize		= SHA1_BLOCK_SIZE,
+		.cra_module		= THIS_MODULE,
+	}
+};
+
+static int __init sha1_ce_mod_init(void)
+{
+	return crypto_register_shash(&alg);
+}
+
+static void __exit sha1_ce_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_cpu_feature_match(SHA1, sha1_ce_mod_init);
+module_exit(sha1_ce_mod_fini);

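The padding length computed in sha1_final() above has to leave room for the leading 0x80 byte and make the message plus the 64-bit length field fill whole blocks. A quick user-space check of that formula (illustration only, not kernel code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SHA1_BLOCK_SIZE	64

int main(void)
{
	uint64_t count;

	for (count = 0; count < 4 * SHA1_BLOCK_SIZE; count++) {
		uint32_t padlen = SHA1_BLOCK_SIZE
				  - ((count + sizeof(uint64_t)) % SHA1_BLOCK_SIZE);

		/* message + padding + length field fills whole blocks */
		assert((count + padlen + sizeof(uint64_t)) % SHA1_BLOCK_SIZE == 0);
		/* there is always at least one padding byte for the 0x80 marker */
		assert(padlen >= 1 && padlen <= SHA1_BLOCK_SIZE);
	}
	printf("padding formula verified for counts 0..%d\n",
	       4 * SHA1_BLOCK_SIZE - 1);
	return 0;
}
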
+ 156 - 0
arch/arm64/crypto/sha2-ce-core.S

@@ -0,0 +1,156 @@
+/*
+ * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+	.arch		armv8-a+crypto
+
+	dga		.req	q20
+	dgav		.req	v20
+	dgb		.req	q21
+	dgbv		.req	v21
+
+	t0		.req	v22
+	t1		.req	v23
+
+	dg0q		.req	q24
+	dg0v		.req	v24
+	dg1q		.req	q25
+	dg1v		.req	v25
+	dg2q		.req	q26
+	dg2v		.req	v26
+
+	.macro		add_only, ev, rc, s0
+	mov		dg2v.16b, dg0v.16b
+	.ifeq		\ev
+	add		t1.4s, v\s0\().4s, \rc\().4s
+	sha256h		dg0q, dg1q, t0.4s
+	sha256h2	dg1q, dg2q, t0.4s
+	.else
+	.ifnb		\s0
+	add		t0.4s, v\s0\().4s, \rc\().4s
+	.endif
+	sha256h		dg0q, dg1q, t1.4s
+	sha256h2	dg1q, dg2q, t1.4s
+	.endif
+	.endm
+
+	.macro		add_update, ev, rc, s0, s1, s2, s3
+	sha256su0	v\s0\().4s, v\s1\().4s
+	add_only	\ev, \rc, \s1
+	sha256su1	v\s0\().4s, v\s2\().4s, v\s3\().4s
+	.endm
+
+	/*
+	 * The SHA-256 round constants
+	 */
+	.align		4
+.Lsha2_rcon:
+	.word		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+	.word		0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+	.word		0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+	.word		0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+	.word		0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+	.word		0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+	.word		0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+	.word		0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+	.word		0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+	.word		0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+	.word		0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+	.word		0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+	.word		0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+	.word		0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+	.word		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+	/*
+	 * void sha2_ce_transform(int blocks, u8 const *src, u32 *state,
+	 *                        u8 *head, long bytes)
+	 */
+ENTRY(sha2_ce_transform)
+	/* load round constants */
+	adr		x8, .Lsha2_rcon
+	ld1		{ v0.4s- v3.4s}, [x8], #64
+	ld1		{ v4.4s- v7.4s}, [x8], #64
+	ld1		{ v8.4s-v11.4s}, [x8], #64
+	ld1		{v12.4s-v15.4s}, [x8]
+
+	/* load state */
+	ldp		dga, dgb, [x2]
+
+	/* load partial input (if supplied) */
+	cbz		x3, 0f
+	ld1		{v16.4s-v19.4s}, [x3]
+	b		1f
+
+	/* load input */
+0:	ld1		{v16.4s-v19.4s}, [x1], #64
+	sub		w0, w0, #1
+
+1:
+CPU_LE(	rev32		v16.16b, v16.16b	)
+CPU_LE(	rev32		v17.16b, v17.16b	)
+CPU_LE(	rev32		v18.16b, v18.16b	)
+CPU_LE(	rev32		v19.16b, v19.16b	)
+
+2:	add		t0.4s, v16.4s, v0.4s
+	mov		dg0v.16b, dgav.16b
+	mov		dg1v.16b, dgbv.16b
+
+	add_update	0,  v1, 16, 17, 18, 19
+	add_update	1,  v2, 17, 18, 19, 16
+	add_update	0,  v3, 18, 19, 16, 17
+	add_update	1,  v4, 19, 16, 17, 18
+
+	add_update	0,  v5, 16, 17, 18, 19
+	add_update	1,  v6, 17, 18, 19, 16
+	add_update	0,  v7, 18, 19, 16, 17
+	add_update	1,  v8, 19, 16, 17, 18
+
+	add_update	0,  v9, 16, 17, 18, 19
+	add_update	1, v10, 17, 18, 19, 16
+	add_update	0, v11, 18, 19, 16, 17
+	add_update	1, v12, 19, 16, 17, 18
+
+	add_only	0, v13, 17
+	add_only	1, v14, 18
+	add_only	0, v15, 19
+	add_only	1
+
+	/* update state */
+	add		dgav.4s, dgav.4s, dg0v.4s
+	add		dgbv.4s, dgbv.4s, dg1v.4s
+
+	/* handled all input blocks? */
+	cbnz		w0, 0b
+
+	/*
+	 * Final block: add padding and total bit count.
+	 * Skip if we have no total byte count in x4. In that case, the input
+	 * size was not a round multiple of the block size, and the padding is
+	 * handled by the C code.
+	 */
+	cbz		x4, 3f
+	movi		v17.2d, #0
+	mov		x8, #0x80000000
+	movi		v18.2d, #0
+	ror		x7, x4, #29		// ror(lsl(x4, 3), 32)
+	fmov		d16, x8
+	mov		x4, #0
+	mov		v19.d[0], xzr
+	mov		v19.d[1], x7
+	b		2b
+
+	/* store new state */
+3:	stp		dga, dgb, [x2]
+	ret
+ENDPROC(sha2_ce_transform)

+ 255 - 0
arch/arm64/crypto/sha2-ce-glue.c

@@ -0,0 +1,255 @@
+/*
+ * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <asm/unaligned.h>
+#include <crypto/internal/hash.h>
+#include <crypto/sha.h>
+#include <linux/cpufeature.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+asmlinkage int sha2_ce_transform(int blocks, u8 const *src, u32 *state,
+				 u8 *head, long bytes);
+
+static int sha224_init(struct shash_desc *desc)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	*sctx = (struct sha256_state){
+		.state = {
+			SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3,
+			SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7,
+		}
+	};
+	return 0;
+}
+
+static int sha256_init(struct shash_desc *desc)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	*sctx = (struct sha256_state){
+		.state = {
+			SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3,
+			SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7,
+		}
+	};
+	return 0;
+}
+
+static int sha2_update(struct shash_desc *desc, const u8 *data,
+		       unsigned int len)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+
+	sctx->count += len;
+
+	if ((partial + len) >= SHA256_BLOCK_SIZE) {
+		int blocks;
+
+		if (partial) {
+			int p = SHA256_BLOCK_SIZE - partial;
+
+			memcpy(sctx->buf + partial, data, p);
+			data += p;
+			len -= p;
+		}
+
+		blocks = len / SHA256_BLOCK_SIZE;
+		len %= SHA256_BLOCK_SIZE;
+
+		kernel_neon_begin_partial(28);
+		sha2_ce_transform(blocks, data, sctx->state,
+				  partial ? sctx->buf : NULL, 0);
+		kernel_neon_end();
+
+		data += blocks * SHA256_BLOCK_SIZE;
+		partial = 0;
+	}
+	if (len)
+		memcpy(sctx->buf + partial, data, len);
+	return 0;
+}
+
+static void sha2_final(struct shash_desc *desc)
+{
+	static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	__be64 bits = cpu_to_be64(sctx->count << 3);
+	u32 padlen = SHA256_BLOCK_SIZE
+		     - ((sctx->count + sizeof(bits)) % SHA256_BLOCK_SIZE);
+
+	sha2_update(desc, padding, padlen);
+	sha2_update(desc, (const u8 *)&bits, sizeof(bits));
+}
+
+static int sha224_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	__be32 *dst = (__be32 *)out;
+	int i;
+
+	sha2_final(desc);
+
+	for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++)
+		put_unaligned_be32(sctx->state[i], dst++);
+
+	*sctx = (struct sha256_state){};
+	return 0;
+}
+
+static int sha256_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	__be32 *dst = (__be32 *)out;
+	int i;
+
+	sha2_final(desc);
+
+	for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++)
+		put_unaligned_be32(sctx->state[i], dst++);
+
+	*sctx = (struct sha256_state){};
+	return 0;
+}
+
+static void sha2_finup(struct shash_desc *desc, const u8 *data,
+		       unsigned int len)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	int blocks;
+
+	if (sctx->count || !len || (len % SHA256_BLOCK_SIZE)) {
+		sha2_update(desc, data, len);
+		sha2_final(desc);
+		return;
+	}
+
+	/*
+	 * Use a fast path if the input is a multiple of 64 bytes. In
+	 * this case, there is no need to copy data around, and we can
+	 * perform the entire digest calculation in a single invocation
+	 * of sha2_ce_transform()
+	 */
+	blocks = len / SHA256_BLOCK_SIZE;
+
+	kernel_neon_begin_partial(28);
+	sha2_ce_transform(blocks, data, sctx->state, NULL, len);
+	kernel_neon_end();
+	data += blocks * SHA256_BLOCK_SIZE;
+}
+
+static int sha224_finup(struct shash_desc *desc, const u8 *data,
+			unsigned int len, u8 *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	__be32 *dst = (__be32 *)out;
+	int i;
+
+	sha2_finup(desc, data, len);
+
+	for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++)
+		put_unaligned_be32(sctx->state[i], dst++);
+
+	*sctx = (struct sha256_state){};
+	return 0;
+}
+
+static int sha256_finup(struct shash_desc *desc, const u8 *data,
+			unsigned int len, u8 *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	__be32 *dst = (__be32 *)out;
+	int i;
+
+	sha2_finup(desc, data, len);
+
+	for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++)
+		put_unaligned_be32(sctx->state[i], dst++);
+
+	*sctx = (struct sha256_state){};
+	return 0;
+}
+
+static int sha2_export(struct shash_desc *desc, void *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	struct sha256_state *dst = out;
+
+	*dst = *sctx;
+	return 0;
+}
+
+static int sha2_import(struct shash_desc *desc, const void *in)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	struct sha256_state const *src = in;
+
+	*sctx = *src;
+	return 0;
+}
+
+static struct shash_alg algs[] = { {
+	.init			= sha224_init,
+	.update			= sha2_update,
+	.final			= sha224_final,
+	.finup			= sha224_finup,
+	.export			= sha2_export,
+	.import			= sha2_import,
+	.descsize		= sizeof(struct sha256_state),
+	.digestsize		= SHA224_DIGEST_SIZE,
+	.statesize		= sizeof(struct sha256_state),
+	.base			= {
+		.cra_name		= "sha224",
+		.cra_driver_name	= "sha224-ce",
+		.cra_priority		= 200,
+		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize		= SHA256_BLOCK_SIZE,
+		.cra_module		= THIS_MODULE,
+	}
+}, {
+	.init			= sha256_init,
+	.update			= sha2_update,
+	.final			= sha256_final,
+	.finup			= sha256_finup,
+	.export			= sha2_export,
+	.import			= sha2_import,
+	.descsize		= sizeof(struct sha256_state),
+	.digestsize		= SHA256_DIGEST_SIZE,
+	.statesize		= sizeof(struct sha256_state),
+	.base			= {
+		.cra_name		= "sha256",
+		.cra_driver_name	= "sha256-ce",
+		.cra_priority		= 200,
+		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize		= SHA256_BLOCK_SIZE,
+		.cra_module		= THIS_MODULE,
+	}
+} };
+
+static int __init sha2_ce_mod_init(void)
+{
+	return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit sha2_ce_mod_fini(void)
+{
+	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+}
+
+module_cpu_feature_match(SHA2, sha2_ce_mod_init);
+module_exit(sha2_ce_mod_fini);

+ 1 - 0
arch/arm64/include/asm/Kbuild

@@ -40,6 +40,7 @@ generic-y += segment.h
 generic-y += sembuf.h
 generic-y += serial.h
 generic-y += shmbuf.h
+generic-y += simd.h
 generic-y += sizes.h
 generic-y += socket.h
 generic-y += sockios.h

+ 23 - 0
arch/arm64/include/asm/fpsimd.h

@@ -37,8 +37,21 @@ struct fpsimd_state {
 			u32 fpcr;
 		};
 	};
+	/* the id of the last cpu to have restored this state */
+	unsigned int cpu;
 };
 
+/*
+ * Struct for stacking the bottom 'n' FP/SIMD registers.
+ */
+struct fpsimd_partial_state {
+	u32		fpsr;
+	u32		fpcr;
+	u32		num_regs;
+	__uint128_t	vregs[32];
+};
+
+
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /* Masks for extracting the FPSR and FPCR from the FPSCR */
 #define VFP_FPSCR_STAT_MASK	0xf800009f
@@ -58,6 +71,16 @@ extern void fpsimd_load_state(struct fpsimd_state *state);
 extern void fpsimd_thread_switch(struct task_struct *next);
 extern void fpsimd_flush_thread(void);
 
+extern void fpsimd_preserve_current_state(void);
+extern void fpsimd_restore_current_state(void);
+extern void fpsimd_update_current_state(struct fpsimd_state *state);
+
+extern void fpsimd_flush_task_state(struct task_struct *target);
+
+extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state,
+				      u32 num_regs);
+extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state);
+
 #endif
 
 #endif

+ 35 - 0
arch/arm64/include/asm/fpsimdmacros.h

@@ -62,3 +62,38 @@
 	ldr	w\tmpnr, [\state, #16 * 2 + 4]
 	msr	fpcr, x\tmpnr
 .endm
+
+.altmacro
+.macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2
+	mrs	x\tmpnr1, fpsr
+	str	w\numnr, [\state, #8]
+	mrs	x\tmpnr2, fpcr
+	stp	w\tmpnr1, w\tmpnr2, [\state]
+	adr	x\tmpnr1, 0f
+	add	\state, \state, x\numnr, lsl #4
+	sub	x\tmpnr1, x\tmpnr1, x\numnr, lsl #1
+	br	x\tmpnr1
+	.irp	qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
+	.irp	qb, %(qa + 1)
+	stp	q\qa, q\qb, [\state, # -16 * \qa - 16]
+	.endr
+	.endr
+0:
+.endm
+
+.macro fpsimd_restore_partial state, tmpnr1, tmpnr2
+	ldp	w\tmpnr1, w\tmpnr2, [\state]
+	msr	fpsr, x\tmpnr1
+	msr	fpcr, x\tmpnr2
+	adr	x\tmpnr1, 0f
+	ldr	w\tmpnr2, [\state, #8]
+	add	\state, \state, x\tmpnr2, lsl #4
+	sub	x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1
+	br	x\tmpnr1
+	.irp	qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
+	.irp	qb, %(qa + 1)
+	ldp	q\qa, q\qb, [\state, # -16 * \qa - 16]
+	.endr
+	.endr
+0:
+.endm

+ 5 - 1
arch/arm64/include/asm/neon.h

@@ -8,7 +8,11 @@
  * published by the Free Software Foundation.
  */
 
+#include <linux/types.h>
+
 #define cpu_has_neon()		(1)
 
-void kernel_neon_begin(void);
+#define kernel_neon_begin()	kernel_neon_begin_partial(32)
+
+void kernel_neon_begin_partial(u32 num_regs);
 void kernel_neon_end(void);

+ 3 - 1
arch/arm64/include/asm/thread_info.h

@@ -103,6 +103,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_SIGPENDING		0
 #define TIF_NEED_RESCHED	1
 #define TIF_NOTIFY_RESUME	2	/* callback before returning to user */
+#define TIF_FOREIGN_FPSTATE	3	/* CPU's FP state is not current's */
 #define TIF_SYSCALL_TRACE	8
 #define TIF_SYSCALL_AUDIT	9
 #define TIF_SYSCALL_TRACEPOINT	10
@@ -118,6 +119,7 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
+#define _TIF_FOREIGN_FPSTATE	(1 << TIF_FOREIGN_FPSTATE)
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
@@ -125,7 +127,7 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_32BIT		(1 << TIF_32BIT)
 
 #define _TIF_WORK_MASK		(_TIF_NEED_RESCHED | _TIF_SIGPENDING | \
-				 _TIF_NOTIFY_RESUME)
+				 _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
 
 #define _TIF_SYSCALL_WORK	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
 				 _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP)

+ 24 - 0
arch/arm64/kernel/entry-fpsimd.S

@@ -41,3 +41,27 @@ ENTRY(fpsimd_load_state)
 	fpsimd_restore x0, 8
 	ret
 ENDPROC(fpsimd_load_state)
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+
+/*
+ * Save the bottom n FP registers.
+ *
+ * x0 - pointer to struct fpsimd_partial_state
+ */
+ENTRY(fpsimd_save_partial_state)
+	fpsimd_save_partial x0, 1, 8, 9
+	ret
+ENDPROC(fpsimd_save_partial_state)
+
+/*
+ * Load the bottom n FP registers.
+ *
+ * x0 - pointer to struct fpsimd_partial_state
+ */
+ENTRY(fpsimd_load_partial_state)
+	fpsimd_restore_partial x0, 8, 9
+	ret
+ENDPROC(fpsimd_load_partial_state)
+
+#endif

+ 1 - 1
arch/arm64/kernel/entry.S

@@ -562,7 +562,7 @@ fast_work_pending:
 	str	x0, [sp, #S_X0]			// returned x0
 work_pending:
 	tbnz	x1, #TIF_NEED_RESCHED, work_resched
-	/* TIF_SIGPENDING or TIF_NOTIFY_RESUME case */
+	/* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */
 	ldr	x2, [sp, #S_PSTATE]
 	mov	x0, sp				// 'regs'
 	tst	x2, #PSR_MODE_MASK		// user mode regs?

+ 167 - 19
arch/arm64/kernel/fpsimd.c

@@ -34,6 +34,60 @@
 #define FPEXC_IXF	(1 << 4)
 #define FPEXC_IDF	(1 << 7)
 
+/*
+ * In order to reduce the number of times the FPSIMD state is needlessly saved
+ * and restored, we need to keep track of two things:
+ * (a) for each task, we need to remember which CPU was the last one to have
+ *     the task's FPSIMD state loaded into its FPSIMD registers;
+ * (b) for each CPU, we need to remember which task's userland FPSIMD state has
+ *     been loaded into its FPSIMD registers most recently, or whether it has
+ *     been used to perform kernel mode NEON in the meantime.
+ *
+ * For (a), we add a 'cpu' field to struct fpsimd_state, which gets updated to
+ * the id of the current CPU every time the state is loaded onto a CPU. For (b),
+ * we add the per-cpu variable 'fpsimd_last_state' (below), which contains the
+ * address of the userland FPSIMD state of the task most recently loaded onto
+ * the CPU, or NULL if kernel mode NEON has been performed after that.
+ *
+ * With this in place, we no longer have to restore the next FPSIMD state right
+ * when switching between tasks. Instead, we can defer this check to userland
+ * resume, at which time we verify whether the CPU's fpsimd_last_state and the
+ * task's fpsimd_state.cpu are still mutually in sync. If this is the case, we
+ * can omit the FPSIMD restore.
+ *
+ * As an optimization, we use the thread_info flag TIF_FOREIGN_FPSTATE to
+ * indicate whether or not the userland FPSIMD state of the current task is
+ * present in the registers. The flag is set unless the FPSIMD registers of this
+ * CPU currently contain the most recent userland FPSIMD state of the current
+ * task.
+ *
+ * For a certain task, the sequence may look something like this:
+ * - the task gets scheduled in; if both the task's fpsimd_state.cpu field
+ *   contains the id of the current CPU, and the CPU's fpsimd_last_state per-cpu
+ *   variable points to the task's fpsimd_state, the TIF_FOREIGN_FPSTATE flag is
+ *   cleared, otherwise it is set;
+ *
+ * - the task returns to userland; if TIF_FOREIGN_FPSTATE is set, the task's
+ *   userland FPSIMD state is copied from memory to the registers, the task's
+ *   fpsimd_state.cpu field is set to the id of the current CPU, the current
+ *   CPU's fpsimd_last_state pointer is set to this task's fpsimd_state and the
+ *   TIF_FOREIGN_FPSTATE flag is cleared;
+ *
+ * - the task executes an ordinary syscall; upon return to userland, the
+ *   TIF_FOREIGN_FPSTATE flag will still be cleared, so no FPSIMD state is
+ *   restored;
+ *
+ * - the task executes a syscall which executes some NEON instructions; this is
+ *   preceded by a call to kernel_neon_begin(), which copies the task's FPSIMD
+ *   register contents to memory, clears the fpsimd_last_state per-cpu variable
+ *   and sets the TIF_FOREIGN_FPSTATE flag;
+ *
+ * - the task gets preempted after kernel_neon_end() is called; as we have not
+ *   returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so
+ *   whatever is in the FPSIMD registers is not saved to memory, but discarded.
+ */
+static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state);
+
 /*
  * Trapped FP/ASIMD access.
  */
@@ -72,43 +126,137 @@ void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs)
 
 void fpsimd_thread_switch(struct task_struct *next)
 {
-	/* check if not kernel threads */
-	if (current->mm)
+	/*
+	 * Save the current FPSIMD state to memory, but only if whatever is in
+	 * the registers is in fact the most recent userland FPSIMD state of
+	 * 'current'.
+	 */
+	if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE))
 		fpsimd_save_state(&current->thread.fpsimd_state);
-	if (next->mm)
-		fpsimd_load_state(&next->thread.fpsimd_state);
+
+	if (next->mm) {
+		/*
+		 * If we are switching to a task whose most recent userland
+		 * FPSIMD state is already in the registers of *this* cpu,
+		 * we can skip loading the state from memory. Otherwise, set
+		 * the TIF_FOREIGN_FPSTATE flag so the state will be loaded
+		 * upon the next return to userland.
+		 */
+		struct fpsimd_state *st = &next->thread.fpsimd_state;
+
+		if (__this_cpu_read(fpsimd_last_state) == st
+		    && st->cpu == smp_processor_id())
+			clear_ti_thread_flag(task_thread_info(next),
+					     TIF_FOREIGN_FPSTATE);
+		else
+			set_ti_thread_flag(task_thread_info(next),
+					   TIF_FOREIGN_FPSTATE);
+	}
 }
 
 void fpsimd_flush_thread(void)
 {
-	preempt_disable();
 	memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state));
-	fpsimd_load_state(&current->thread.fpsimd_state);
+	set_thread_flag(TIF_FOREIGN_FPSTATE);
+}
+
+/*
+ * Save the userland FPSIMD state of 'current' to memory, but only if the state
+ * currently held in the registers does in fact belong to 'current'
+ */
+void fpsimd_preserve_current_state(void)
+{
+	preempt_disable();
+	if (!test_thread_flag(TIF_FOREIGN_FPSTATE))
+		fpsimd_save_state(&current->thread.fpsimd_state);
+	preempt_enable();
+}
+
+/*
+ * Load the userland FPSIMD state of 'current' from memory, but only if the
+ * FPSIMD state already held in the registers is /not/ the most recent FPSIMD
+ * state of 'current'
+ */
+void fpsimd_restore_current_state(void)
+{
+	preempt_disable();
+	if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) {
+		struct fpsimd_state *st = &current->thread.fpsimd_state;
+
+		fpsimd_load_state(st);
+		this_cpu_write(fpsimd_last_state, st);
+		st->cpu = smp_processor_id();
+	}
+	preempt_enable();
+}
+
+/*
+ * Load an updated userland FPSIMD state for 'current' from memory and set the
+ * flag that indicates that the FPSIMD register contents are the most recent
+ * FPSIMD state of 'current'
+ */
+void fpsimd_update_current_state(struct fpsimd_state *state)
+{
+	preempt_disable();
+	fpsimd_load_state(state);
+	if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) {
+		struct fpsimd_state *st = &current->thread.fpsimd_state;
+
+		this_cpu_write(fpsimd_last_state, st);
+		st->cpu = smp_processor_id();
+	}
 	preempt_enable();
 }
 
+/*
+ * Invalidate live CPU copies of task t's FPSIMD state
+ */
+void fpsimd_flush_task_state(struct task_struct *t)
+{
+	t->thread.fpsimd_state.cpu = NR_CPUS;
+}
+
 #ifdef CONFIG_KERNEL_MODE_NEON
 
+static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate);
+static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate);
+
 /*
  * Kernel-side NEON support functions
  */
-void kernel_neon_begin(void)
+void kernel_neon_begin_partial(u32 num_regs)
 {
-	/* Avoid using the NEON in interrupt context */
-	BUG_ON(in_interrupt());
-	preempt_disable();
+	if (in_interrupt()) {
+		struct fpsimd_partial_state *s = this_cpu_ptr(
+			in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate);
 
-	if (current->mm)
-		fpsimd_save_state(&current->thread.fpsimd_state);
+		BUG_ON(num_regs > 32);
+		fpsimd_save_partial_state(s, roundup(num_regs, 2));
+	} else {
+		/*
+		 * Save the userland FPSIMD state if we have one and if we
+		 * haven't done so already. Clear fpsimd_last_state to indicate
+		 * that there is no longer userland FPSIMD state in the
+		 * registers.
+		 */
+		preempt_disable();
+		if (current->mm &&
+		    !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE))
+			fpsimd_save_state(&current->thread.fpsimd_state);
+		this_cpu_write(fpsimd_last_state, NULL);
+	}
 }
-EXPORT_SYMBOL(kernel_neon_begin);
+EXPORT_SYMBOL(kernel_neon_begin_partial);
 
 void kernel_neon_end(void)
 {
-	if (current->mm)
-		fpsimd_load_state(&current->thread.fpsimd_state);
-
-	preempt_enable();
+	if (in_interrupt()) {
+		struct fpsimd_partial_state *s = this_cpu_ptr(
+			in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate);
+		fpsimd_load_partial_state(s);
+	} else {
+		preempt_enable();
+	}
 }
 EXPORT_SYMBOL(kernel_neon_end);
 
@@ -120,12 +268,12 @@ static int fpsimd_cpu_pm_notifier(struct notifier_block *self,
 {
 	switch (cmd) {
 	case CPU_PM_ENTER:
-		if (current->mm)
+		if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE))
 			fpsimd_save_state(&current->thread.fpsimd_state);
 		break;
 	case CPU_PM_EXIT:
 		if (current->mm)
-			fpsimd_load_state(&current->thread.fpsimd_state);
+			set_thread_flag(TIF_FOREIGN_FPSTATE);
 		break;
 	case CPU_PM_ENTER_FAILED:
 	default:

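The bookkeeping described in the comment block added to fpsimd.c above boils down to two pieces of state: a per-CPU pointer to the last userland FPSIMD state loaded, and a per-task record of which CPU last held it. The tiny user-space model below (illustration only; the names thread_switch_in, return_to_userland and the two-CPU setup are invented here, and no real registers are touched) shows when the reload on return to userland is skipped.

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 2

struct fpsimd_state { int cpu; };
struct task { struct fpsimd_state st; bool foreign_fpstate; };

static struct fpsimd_state *fpsimd_last_state[NR_CPUS];

static void thread_switch_in(struct task *next, int cpu)
{
	/* registers still hold next's state?  then no reload is needed */
	next->foreign_fpstate = !(fpsimd_last_state[cpu] == &next->st &&
				  next->st.cpu == cpu);
}

static void return_to_userland(struct task *t, int cpu)
{
	if (t->foreign_fpstate) {
		/* ... the kernel would load t->st into the registers here ... */
		fpsimd_last_state[cpu] = &t->st;
		t->st.cpu = cpu;
		t->foreign_fpstate = false;
		printf("cpu%d: reloaded FPSIMD state\n", cpu);
	} else {
		printf("cpu%d: reload skipped\n", cpu);
	}
}

int main(void)
{
	struct task a = { .st.cpu = NR_CPUS }, b = { .st.cpu = NR_CPUS };

	thread_switch_in(&a, 0); return_to_userland(&a, 0);	/* reload  */
	thread_switch_in(&b, 0); return_to_userland(&b, 0);	/* reload  */
	thread_switch_in(&a, 0); return_to_userland(&a, 0);	/* reload  */
	thread_switch_in(&a, 0); return_to_userland(&a, 0);	/* skipped */
	return 0;
}
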
+ 1 - 1
arch/arm64/kernel/process.c

@@ -206,7 +206,7 @@ void release_thread(struct task_struct *dead_task)
 
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
-	fpsimd_save_state(&current->thread.fpsimd_state);
+	fpsimd_preserve_current_state();
 	*dst = *src;
 	return 0;
 }

+ 2 - 0
arch/arm64/kernel/ptrace.c

@@ -518,6 +518,7 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset,
 		return ret;
 
 	target->thread.fpsimd_state.user_fpsimd = newstate;
+	fpsimd_flush_task_state(target);
 	return ret;
 }
 
@@ -765,6 +766,7 @@ static int compat_vfp_set(struct task_struct *target,
 		uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK;
 	}
 
+	fpsimd_flush_task_state(target);
 	return ret;
 }
 

+ 7 - 6
arch/arm64/kernel/signal.c

@@ -51,7 +51,7 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
 	int err;
 
 	/* dump the hardware registers to the fpsimd_state structure */
-	fpsimd_save_state(fpsimd);
+	fpsimd_preserve_current_state();
 
 	/* copy the FP and status/control registers */
 	err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs));
@@ -86,11 +86,8 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx)
 	__get_user_error(fpsimd.fpcr, &ctx->fpcr, err);
 
 	/* load the hardware registers from the fpsimd_state structure */
-	if (!err) {
-		preempt_disable();
-		fpsimd_load_state(&fpsimd);
-		preempt_enable();
-	}
+	if (!err)
+		fpsimd_update_current_state(&fpsimd);
 
 	return err ? -EFAULT : 0;
 }
@@ -433,4 +430,8 @@ asmlinkage void do_notify_resume(struct pt_regs *regs,
 		clear_thread_flag(TIF_NOTIFY_RESUME);
 		tracehook_notify_resume(regs);
 	}
+
+	if (thread_flags & _TIF_FOREIGN_FPSTATE)
+		fpsimd_restore_current_state();
+
 }

+ 3 - 6
arch/arm64/kernel/signal32.c

@@ -222,7 +222,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
 	 * Note that this also saves V16-31, which aren't visible
 	 * in AArch32.
 	 */
-	fpsimd_save_state(fpsimd);
+	fpsimd_preserve_current_state();
 
 	/* Place structure header on the stack */
 	__put_user_error(magic, &frame->magic, err);
@@ -285,11 +285,8 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame)
 	 * We don't need to touch the exception register, so
 	 * reload the hardware state.
 	 */
-	if (!err) {
-		preempt_disable();
-		fpsimd_load_state(&fpsimd);
-		preempt_enable();
-	}
+	if (!err)
+		fpsimd_update_current_state(&fpsimd);
 
 	return err ? -EFAULT : 0;
 }

+ 13 - 8
include/asm-generic/unaligned.h

@@ -4,22 +4,27 @@
 /*
  * This is the most generic implementation of unaligned accesses
  * and should work almost anywhere.
- *
- * If an architecture can handle unaligned accesses in hardware,
- * it may want to use the linux/unaligned/access_ok.h implementation
- * instead.
  */
 #include <asm/byteorder.h>
 
+/* Set by the arch if it can handle unaligned accesses in hardware. */
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+# include <linux/unaligned/access_ok.h>
+#endif
+
 #if defined(__LITTLE_ENDIAN)
-# include <linux/unaligned/le_struct.h>
-# include <linux/unaligned/be_byteshift.h>
+# ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+#  include <linux/unaligned/le_struct.h>
+#  include <linux/unaligned/be_byteshift.h>
+# endif
 # include <linux/unaligned/generic.h>
 # define get_unaligned	__get_unaligned_le
 # define put_unaligned	__put_unaligned_le
 #elif defined(__BIG_ENDIAN)
-# include <linux/unaligned/be_struct.h>
-# include <linux/unaligned/le_byteshift.h>
+# ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+#  include <linux/unaligned/be_struct.h>
+#  include <linux/unaligned/le_byteshift.h>
+# endif
 # include <linux/unaligned/generic.h>
 # define get_unaligned	__get_unaligned_be
 # define put_unaligned	__put_unaligned_be
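
The header change above selects between two strategies: the always-safe byte-by-byte assembly of an unaligned value, and a single potentially-unaligned load on architectures that set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS. The following user-space sketch (illustration only; the function names are invented, and memcpy stands in for the direct load so the C stays well-defined) contrasts the two approaches for a little-endian 32-bit read.

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* generic, always-safe variant: little-endian byte shifts */
static uint32_t get_unaligned_le32_byteshift(const void *p)
{
	const uint8_t *b = p;

	return (uint32_t)b[0] | (uint32_t)b[1] << 8 |
	       (uint32_t)b[2] << 16 | (uint32_t)b[3] << 24;
}

/* "efficient unaligned access" variant: let the CPU do one unaligned load */
static uint32_t get_unaligned_le32_direct(const void *p)
{
	uint32_t v;

	memcpy(&v, p, sizeof(v));
	return v;		/* assumes a little-endian host */
}

int main(void)
{
	uint8_t buf[8] = { 0xff, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00 };

	/* read from the deliberately misaligned offset 1 */
	assert(get_unaligned_le32_byteshift(buf + 1) == 0x12345678);
	printf("byteshift: %08x, direct: %08x\n",
	       get_unaligned_le32_byteshift(buf + 1),
	       get_unaligned_le32_direct(buf + 1));
	return 0;
}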