
Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto update from Herbert Xu:
 "API:
   - Try to catch hash output overrun in testmgr
   - Introduce walksize attribute for batched walking
   - Make crypto_xor() and crypto_inc() alignment agnostic (a sketch of
     the idea follows the quoted summary below)

  Algorithms:
   - Add time-invariant AES algorithm
   - Add standalone CBCMAC algorithm

  Drivers:
   - Add NEON accelerated chacha20 on ARM/ARM64
   - Expose AES-CTR as synchronous skcipher on ARM64
   - Add scalar AES implementation on ARM64
   - Improve scalar AES implementation on ARM
   - Improve NEON AES implementation on ARM/ARM64
   - Merge CRC32 and PMULL instruction based drivers on ARM64
   - Add NEON accelerated CBCMAC/CMAC/XCBC AES on ARM64
   - Add IPsec AUTHENC implementation in atmel
   - Add Support for Octeon-tx CPT Engine
   - Add Broadcom SPU driver
   - Add MediaTek driver"
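
Of the API items above, the crypto_xor() change is the easiest to illustrate. The following is a minimal userspace sketch of the alignment-agnostic idea, not the kernel's implementation (the function name is illustrative): XOR word-sized chunks through memcpy, which keeps the accesses well-defined on any pointer alignment and which compilers lower to plain loads and stores where the CPU allows, then finish the tail bytewise.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Alignment-agnostic XOR: dst ^= src, for len bytes.  Sketch only. */
static void xor_any_alignment(uint8_t *dst, const uint8_t *src, size_t len)
{
	while (len >= sizeof(unsigned long)) {
		unsigned long a, b;

		/* memcpy avoids undefined behaviour on unaligned access */
		memcpy(&a, dst, sizeof(a));
		memcpy(&b, src, sizeof(b));
		a ^= b;
		memcpy(dst, &a, sizeof(a));
		dst += sizeof(a);
		src += sizeof(a);
		len -= sizeof(a);
	}
	while (len--)			/* bytewise tail */
		*dst++ ^= *src++;
}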

* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (142 commits)
  crypto: xts - Add ECB dependency
  crypto: cavium - switch to pci_alloc_irq_vectors
  crypto: cavium - switch to pci_alloc_irq_vectors
  crypto: cavium - remove dead MSI-X related define
  crypto: brcm - Avoid double free in ahash_finup()
  crypto: cavium - fix Kconfig dependencies
  crypto: cavium - cpt_bind_vq_to_grp could return an error code
  crypto: doc - fix typo
  hwrng: omap - update Kconfig help description
  crypto: ccm - drop unnecessary minimum 32-bit alignment
  crypto: ccm - honour alignmask of subordinate MAC cipher
  crypto: caam - fix state buffer DMA (un)mapping
  crypto: caam - abstract ahash request double buffering
  crypto: caam - fix error path for ctx_dma mapping failure
  crypto: caam - fix DMA API leaks for multiple setkey() calls
  crypto: caam - don't dma_map key for hash algorithms
  crypto: caam - use dma_map_sg() return code
  crypto: caam - replace sg_count() with sg_nents_for_len()
  crypto: caam - check sg_count() return value
  crypto: caam - fix HW S/G in ablkcipher_giv_edesc_alloc()
  ..
Linus Torvalds, 8 years ago
parent
commit
5bcbe22ca4
100 changed files with 6421 additions and 8388 deletions
  1. Documentation/crypto/api-digest.rst (+1, -1)
  2. Documentation/crypto/api-skcipher.rst (+1, -1)
  3. Documentation/devicetree/bindings/crypto/brcm,spu-crypto.txt (+22, -0)
  4. Documentation/devicetree/bindings/crypto/mediatek-crypto.txt (+27, -0)
  5. MAINTAINERS (+7, -0)
  6. arch/arm/crypto/Kconfig (+8, -19)
  7. arch/arm/crypto/Makefile (+5, -6)
  8. arch/arm/crypto/aes-armv4.S (+0, -1089)
  9. arch/arm/crypto/aes-ce-core.S (+41, -43)
  10. arch/arm/crypto/aes-ce-glue.c (+6, -9)
  11. arch/arm/crypto/aes-cipher-core.S (+179, -0)
  12. arch/arm/crypto/aes-cipher-glue.c (+74, -0)
  13. arch/arm/crypto/aes-neonbs-core.S (+1023, -0)
  14. arch/arm/crypto/aes-neonbs-glue.c (+406, -0)
  15. arch/arm/crypto/aes_glue.c (+0, -98)
  16. arch/arm/crypto/aes_glue.h (+0, -19)
  17. arch/arm/crypto/aesbs-core.S_shipped (+0, -2548)
  18. arch/arm/crypto/aesbs-glue.c (+0, -367)
  19. arch/arm/crypto/bsaes-armv7.pl (+0, -2471)
  20. arch/arm/crypto/chacha20-neon-core.S (+523, -0)
  21. arch/arm/crypto/chacha20-neon-glue.c (+127, -0)
  22. arch/arm64/configs/defconfig (+0, -1)
  23. arch/arm64/crypto/Kconfig (+18, -6)
  24. arch/arm64/crypto/Makefile (+9, -4)
  25. arch/arm64/crypto/aes-ce-ccm-glue.c (+0, -1)
  26. arch/arm64/crypto/aes-cipher-core.S (+110, -0)
  27. arch/arm64/crypto/aes-cipher-glue.c (+69, -0)
  28. arch/arm64/crypto/aes-glue.c (+269, -12)
  29. arch/arm64/crypto/aes-modes.S (+31, -6)
  30. arch/arm64/crypto/aes-neon.S (+100, -135)
  31. arch/arm64/crypto/aes-neonbs-core.S (+972, -0)
  32. arch/arm64/crypto/aes-neonbs-glue.c (+439, -0)
  33. arch/arm64/crypto/chacha20-neon-core.S (+450, -0)
  34. arch/arm64/crypto/chacha20-neon-glue.c (+126, -0)
  35. arch/arm64/crypto/crc32-arm64.c (+0, -290)
  36. arch/arm64/crypto/crc32-ce-glue.c (+39, -10)
  37. arch/x86/crypto/aesni-intel_asm.S (+29, -8)
  38. arch/x86/crypto/aesni-intel_avx-x86_64.S (+24, -8)
  39. arch/x86/crypto/aesni-intel_glue.c (+8, -4)
  40. arch/x86/crypto/camellia-aesni-avx-asm_64.S (+4, -1)
  41. arch/x86/crypto/camellia-aesni-avx2-asm_64.S (+9, -3)
  42. arch/x86/crypto/cast5-avx-x86_64-asm_64.S (+12, -2)
  43. arch/x86/crypto/cast6-avx-x86_64-asm_64.S (+10, -2)
  44. arch/x86/crypto/chacha20-avx2-x86_64.S (+7, -2)
  45. arch/x86/crypto/chacha20-ssse3-x86_64.S (+5, -2)
  46. arch/x86/crypto/chacha20_glue.c (+33, -37)
  47. arch/x86/crypto/crc32c-pcl-intel-asm_64.S (+1, -1)
  48. arch/x86/crypto/crct10dif-pcl-asm_64.S (+11, -3)
  49. arch/x86/crypto/des3_ede-asm_64.S (+1, -1)
  50. arch/x86/crypto/ghash-clmulni-intel_asm.S (+1, -2)
  51. arch/x86/crypto/poly1305-avx2-x86_64.S (+4, -2)
  52. arch/x86/crypto/poly1305-sse2-x86_64.S (+4, -2)
  53. arch/x86/crypto/serpent-avx-x86_64-asm_64.S (+3, -2)
  54. arch/x86/crypto/serpent-avx2-asm_64.S (+7, -2)
  55. arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S (+4, -2)
  56. arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S (+1, -2)
  57. arch/x86/crypto/sha1-mb/sha1_x8_avx2.S (+13, -2)
  58. arch/x86/crypto/sha1_ni_asm.S (+5, -3)
  59. arch/x86/crypto/sha256-avx-asm.S (+8, -1)
  60. arch/x86/crypto/sha256-avx2-asm.S (+8, -1)
  61. arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S (+4, -2)
  62. arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S (+1, -2)
  63. arch/x86/crypto/sha256-mb/sha256_x8_avx2.S (+6, -1)
  64. arch/x86/crypto/sha256-ssse3-asm.S (+7, -1)
  65. arch/x86/crypto/sha256_ni_asm.S (+3, -1)
  66. arch/x86/crypto/sha512-avx-asm.S (+6, -3)
  67. arch/x86/crypto/sha512-avx2-asm.S (+8, -2)
  68. arch/x86/crypto/sha512-mb/sha512_mb.c (+42, -22)
  69. arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S (+8, -2)
  70. arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S (+3, -1)
  71. arch/x86/crypto/sha512-mb/sha512_x4_avx2.S (+3, -1)
  72. arch/x86/crypto/sha512-ssse3-asm.S (+6, -3)
  73. arch/x86/crypto/twofish-avx-x86_64-asm_64.S (+4, -2)
  74. crypto/Kconfig (+19, -0)
  75. crypto/Makefile (+3, -0)
  76. crypto/ablkcipher.c (+3, -2)
  77. crypto/acompress.c (+2, -1)
  78. crypto/aead.c (+2, -1)
  79. crypto/aes_generic.c (+32, -32)
  80. crypto/aes_ti.c (+375, -0)
  81. crypto/ahash.c (+2, -1)
  82. crypto/akcipher.c (+2, -1)
  83. crypto/algapi.c (+50, -18)
  84. crypto/algif_hash.c (+1, -1)
  85. crypto/blkcipher.c (+4, -3)
  86. crypto/cbc.c (+0, -3)
  87. crypto/ccm.c (+247, -139)
  88. crypto/chacha20_generic.c (+30, -43)
  89. crypto/cmac.c (+1, -2)
  90. crypto/ctr.c (+1, -1)
  91. crypto/cts.c (+3, -5)
  92. crypto/kpp.c (+2, -1)
  93. crypto/pcbc.c (+2, -4)
  94. crypto/rng.c (+2, -1)
  95. crypto/scompress.c (+2, -1)
  96. crypto/seqiv.c (+0, -2)
  97. crypto/shash.c (+5, -4)
  98. crypto/skcipher.c (+14, -9)
  99. crypto/tcrypt.c (+6, -0)
  100. crypto/testmgr.c (+216, -839)

+ 1 - 1
Documentation/crypto/api-digest.rst

@@ -14,7 +14,7 @@ Asynchronous Message Digest API
    :doc: Asynchronous Message Digest API
 
 .. kernel-doc:: include/crypto/hash.h
-   :functions: crypto_alloc_ahash crypto_free_ahash crypto_ahash_init crypto_ahash_digestsize crypto_ahash_reqtfm crypto_ahash_reqsize crypto_ahash_setkey crypto_ahash_finup crypto_ahash_final crypto_ahash_digest crypto_ahash_export crypto_ahash_import
+   :functions: crypto_alloc_ahash crypto_free_ahash crypto_ahash_init crypto_ahash_digestsize crypto_ahash_reqtfm crypto_ahash_reqsize crypto_ahash_statesize crypto_ahash_setkey crypto_ahash_finup crypto_ahash_final crypto_ahash_digest crypto_ahash_export crypto_ahash_import
 
 Asynchronous Hash Request Handle
 --------------------------------

+ 1 - 1
Documentation/crypto/api-skcipher.rst

@@ -59,4 +59,4 @@ Synchronous Block Cipher API - Deprecated
    :doc: Synchronous Block Cipher API
 
 .. kernel-doc:: include/linux/crypto.h
-   :functions: crypto_alloc_blkcipher rypto_free_blkcipher crypto_has_blkcipher crypto_blkcipher_name crypto_blkcipher_ivsize crypto_blkcipher_blocksize crypto_blkcipher_setkey crypto_blkcipher_encrypt crypto_blkcipher_encrypt_iv crypto_blkcipher_decrypt crypto_blkcipher_decrypt_iv crypto_blkcipher_set_iv crypto_blkcipher_get_iv
+   :functions: crypto_alloc_blkcipher crypto_free_blkcipher crypto_has_blkcipher crypto_blkcipher_name crypto_blkcipher_ivsize crypto_blkcipher_blocksize crypto_blkcipher_setkey crypto_blkcipher_encrypt crypto_blkcipher_encrypt_iv crypto_blkcipher_decrypt crypto_blkcipher_decrypt_iv crypto_blkcipher_set_iv crypto_blkcipher_get_iv

+ 22 - 0
Documentation/devicetree/bindings/crypto/brcm,spu-crypto.txt

@@ -0,0 +1,22 @@
+The Broadcom Secure Processing Unit (SPU) hardware supports symmetric
+cryptographic offload for Broadcom SoCs. A SoC may have multiple SPU hardware
+blocks.
+
+Required properties:
+- compatible: Should be one of the following:
+  brcm,spum-crypto - for devices with SPU-M hardware
+  brcm,spu2-crypto - for devices with SPU2 hardware
+  brcm,spu2-v2-crypto - for devices with enhanced SPU2 hardware features like SHA3
+  and Rabin Fingerprint support
+  brcm,spum-nsp-crypto - for the Northstar Plus variant of the SPU-M hardware
+
+- reg: Should contain SPU registers location and length.
+- mboxes: The mailbox channel to be used to communicate with the SPU.
+  Mailbox channels correspond to DMA rings on the device.
+
+Example:
+	crypto@612d0000 {
+		compatible = "brcm,spum-crypto";
+		reg = <0 0x612d0000 0 0x900>;
+		mboxes = <&pdc0 0>;
+	};

+ 27 - 0
Documentation/devicetree/bindings/crypto/mediatek-crypto.txt

@@ -0,0 +1,27 @@
+MediaTek cryptographic accelerators
+
+Required properties:
+- compatible: Should be "mediatek,eip97-crypto"
+- reg: Address and length of the register set for the device
+- interrupts: Should contain the five crypto engines interrupts in numeric
+	order. These are global system and four descriptor rings.
+- clocks: the clock used by the core
+- clock-names: the names of the clock listed in the clocks property. These are
+	"ethif", "cryp"
+- power-domains: Must contain a reference to the PM domain.
+
+
+Example:
+	crypto: crypto@1b240000 {
+		compatible = "mediatek,eip97-crypto";
+		reg = <0 0x1b240000 0 0x20000>;
+		interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 83 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 84 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 91 IRQ_TYPE_LEVEL_LOW>,
+			     <GIC_SPI 97 IRQ_TYPE_LEVEL_LOW>;
+		clocks = <&topckgen CLK_TOP_ETHIF_SEL>,
+			 <&ethsys CLK_ETHSYS_CRYPTO>;
+		clock-names = "ethif","cryp";
+		power-domains = <&scpsys MT2701_POWER_DOMAIN_ETH>;
+	};

+ 7 - 0
MAINTAINERS

@@ -3031,6 +3031,13 @@ W:     http://www.cavium.com
 S:     Supported
 F:     drivers/net/ethernet/cavium/liquidio/
 
+CAVIUM OCTEON-TX CRYPTO DRIVER
+M:	George Cherian <george.cherian@cavium.com>
+L:	linux-crypto@vger.kernel.org
+W:	http://www.cavium.com
+S:	Supported
+F:	drivers/crypto/cavium/cpt/
+
 CC2520 IEEE-802.15.4 RADIO DRIVER
 M:	Varka Bhadram <varkabhadram@gmail.com>
 L:	linux-wpan@vger.kernel.org

+ 8 - 19
arch/arm/crypto/Kconfig

@@ -62,35 +62,18 @@ config CRYPTO_SHA512_ARM
 	  using optimized ARM assembler and NEON, when available.
 
 config CRYPTO_AES_ARM
-	tristate "AES cipher algorithms (ARM-asm)"
-	depends on ARM
+	tristate "Scalar AES cipher for ARM"
 	select CRYPTO_ALGAPI
 	select CRYPTO_AES
 	help
 	  Use optimized AES assembler routines for ARM platforms.
 
-	  AES cipher algorithms (FIPS-197). AES uses the Rijndael
-	  algorithm.
-
-	  Rijndael appears to be consistently a very good performer in
-	  both hardware and software across a wide range of computing
-	  environments regardless of its use in feedback or non-feedback
-	  modes. Its key setup time is excellent, and its key agility is
-	  good. Rijndael's very low memory requirements make it very well
-	  suited for restricted-space environments, in which it also
-	  demonstrates excellent performance. Rijndael's operations are
-	  among the easiest to defend against power and timing attacks.
-
-	  The AES specifies three key sizes: 128, 192 and 256 bits
-
-	  See <http://csrc.nist.gov/encryption/aes/> for more information.
-
 config CRYPTO_AES_ARM_BS
 	tristate "Bit sliced AES using NEON instructions"
 	depends on KERNEL_MODE_NEON
-	select CRYPTO_AES_ARM
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_SIMD
+	select CRYPTO_AES_ARM
 	help
 	  Use a faster and more secure NEON based implementation of AES in CBC,
 	  CTR and XTS modes
@@ -130,4 +113,10 @@ config CRYPTO_CRC32_ARM_CE
 	depends on KERNEL_MODE_NEON && CRC32
 	select CRYPTO_HASH
 
+config CRYPTO_CHACHA20_NEON
+	tristate "NEON accelerated ChaCha20 symmetric cipher"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_CHACHA20
+
 endif

+ 5 - 6
arch/arm/crypto/Makefile

@@ -8,6 +8,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
 
 ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -26,8 +27,8 @@ $(warning $(ce-obj-y) $(ce-obj-m))
 endif
 endif
 
-aes-arm-y	:= aes-armv4.o aes_glue.o
-aes-arm-bs-y	:= aesbs-core.o aesbs-glue.o
+aes-arm-y	:= aes-cipher-core.o aes-cipher-glue.o
+aes-arm-bs-y	:= aes-neonbs-core.o aes-neonbs-glue.o
 sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
 sha1-arm-neon-y	:= sha1-armv7-neon.o sha1_neon_glue.o
 sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
@@ -40,17 +41,15 @@ aes-arm-ce-y	:= aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y	:= ghash-ce-core.o ghash-ce-glue.o
 crct10dif-arm-ce-y	:= crct10dif-ce-core.o crct10dif-ce-glue.o
 crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
+chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
 
 quiet_cmd_perl = PERL    $@
       cmd_perl = $(PERL) $(<) > $(@)
 
-$(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
-	$(call cmd,perl)
-
 $(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
 	$(call cmd,perl)
 
 $(src)/sha512-core.S_shipped: $(src)/sha512-armv4.pl
 	$(call cmd,perl)
 
-.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S $(obj)/sha512-core.S
+.PRECIOUS: $(obj)/sha256-core.S $(obj)/sha512-core.S

+ 0 - 1089
arch/arm/crypto/aes-armv4.S

@@ -1,1089 +0,0 @@
-#define __ARM_ARCH__ __LINUX_ARM_ARCH__
-@ ====================================================================
-@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-@ project. The module is, however, dual licensed under OpenSSL and
-@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see http://www.openssl.org/~appro/cryptogams/.
-@ ====================================================================
-
-@ AES for ARMv4
-
-@ January 2007.
-@
-@ Code uses single 1K S-box and is >2 times faster than code generated
-@ by gcc-3.4.1. This is thanks to unique feature of ARMv4 ISA, which
-@ allows to merge logical or arithmetic operation with shift or rotate
-@ in one instruction and emit combined result every cycle. The module
-@ is endian-neutral. The performance is ~42 cycles/byte for 128-bit
-@ key [on single-issue Xscale PXA250 core].
-
-@ May 2007.
-@
-@ AES_set_[en|de]crypt_key is added.
-
-@ July 2010.
-@
-@ Rescheduling for dual-issue pipeline resulted in 12% improvement on
-@ Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
-
-@ February 2011.
-@
-@ Profiler-assisted and platform-specific optimization resulted in 16%
-@ improvement on Cortex A8 core and ~21.5 cycles per byte.
-
-@ A little glue here to select the correct code below for the ARM CPU
-@ that is being targetted.
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-.text
-
-.type	AES_Te,%object
-.align	5
-AES_Te:
-.word	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
-.word	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
-.word	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
-.word	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
-.word	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
-.word	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
-.word	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
-.word	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
-.word	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
-.word	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
-.word	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
-.word	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
-.word	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
-.word	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
-.word	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
-.word	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
-.word	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
-.word	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
-.word	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
-.word	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
-.word	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
-.word	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
-.word	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
-.word	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
-.word	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
-.word	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
-.word	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
-.word	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
-.word	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
-.word	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
-.word	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
-.word	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
-.word	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
-.word	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
-.word	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
-.word	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
-.word	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
-.word	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
-.word	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
-.word	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
-.word	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
-.word	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
-.word	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
-.word	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
-.word	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
-.word	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
-.word	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
-.word	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
-.word	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
-.word	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
-.word	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
-.word	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
-.word	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
-.word	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
-.word	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
-.word	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
-.word	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
-.word	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
-.word	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
-.word	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
-.word	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
-.word	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
-.word	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
-.word	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
-@ Te4[256]
-.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
-.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
-.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
-.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
-.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
-.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
-.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
-.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
-.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
-.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
-.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
-.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
-.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
-.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
-.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
-.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
-.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
-.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
-.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
-.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
-.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
-.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
-.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
-.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
-.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
-.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
-.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
-.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
-.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
-.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
-.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
-.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
-@ rcon[]
-.word	0x01000000, 0x02000000, 0x04000000, 0x08000000
-.word	0x10000000, 0x20000000, 0x40000000, 0x80000000
-.word	0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
-.size	AES_Te,.-AES_Te
-
-@ void AES_encrypt(const unsigned char *in, unsigned char *out,
-@ 		 const AES_KEY *key) {
-.align	5
-ENTRY(AES_encrypt)
-	adr	r3,AES_encrypt
-	stmdb   sp!,{r1,r4-r12,lr}
-	mov	r12,r0		@ inp
-	mov	r11,r2
-	sub	r10,r3,#AES_encrypt-AES_Te	@ Te
-#if __ARM_ARCH__<7
-	ldrb	r0,[r12,#3]	@ load input data in endian-neutral
-	ldrb	r4,[r12,#2]	@ manner...
-	ldrb	r5,[r12,#1]
-	ldrb	r6,[r12,#0]
-	orr	r0,r0,r4,lsl#8
-	ldrb	r1,[r12,#7]
-	orr	r0,r0,r5,lsl#16
-	ldrb	r4,[r12,#6]
-	orr	r0,r0,r6,lsl#24
-	ldrb	r5,[r12,#5]
-	ldrb	r6,[r12,#4]
-	orr	r1,r1,r4,lsl#8
-	ldrb	r2,[r12,#11]
-	orr	r1,r1,r5,lsl#16
-	ldrb	r4,[r12,#10]
-	orr	r1,r1,r6,lsl#24
-	ldrb	r5,[r12,#9]
-	ldrb	r6,[r12,#8]
-	orr	r2,r2,r4,lsl#8
-	ldrb	r3,[r12,#15]
-	orr	r2,r2,r5,lsl#16
-	ldrb	r4,[r12,#14]
-	orr	r2,r2,r6,lsl#24
-	ldrb	r5,[r12,#13]
-	ldrb	r6,[r12,#12]
-	orr	r3,r3,r4,lsl#8
-	orr	r3,r3,r5,lsl#16
-	orr	r3,r3,r6,lsl#24
-#else
-	ldr	r0,[r12,#0]
-	ldr	r1,[r12,#4]
-	ldr	r2,[r12,#8]
-	ldr	r3,[r12,#12]
-#ifdef __ARMEL__
-	rev	r0,r0
-	rev	r1,r1
-	rev	r2,r2
-	rev	r3,r3
-#endif
-#endif
-	bl	_armv4_AES_encrypt
-
-	ldr	r12,[sp],#4		@ pop out
-#if __ARM_ARCH__>=7
-#ifdef __ARMEL__
-	rev	r0,r0
-	rev	r1,r1
-	rev	r2,r2
-	rev	r3,r3
-#endif
-	str	r0,[r12,#0]
-	str	r1,[r12,#4]
-	str	r2,[r12,#8]
-	str	r3,[r12,#12]
-#else
-	mov	r4,r0,lsr#24		@ write output in endian-neutral
-	mov	r5,r0,lsr#16		@ manner...
-	mov	r6,r0,lsr#8
-	strb	r4,[r12,#0]
-	strb	r5,[r12,#1]
-	mov	r4,r1,lsr#24
-	strb	r6,[r12,#2]
-	mov	r5,r1,lsr#16
-	strb	r0,[r12,#3]
-	mov	r6,r1,lsr#8
-	strb	r4,[r12,#4]
-	strb	r5,[r12,#5]
-	mov	r4,r2,lsr#24
-	strb	r6,[r12,#6]
-	mov	r5,r2,lsr#16
-	strb	r1,[r12,#7]
-	mov	r6,r2,lsr#8
-	strb	r4,[r12,#8]
-	strb	r5,[r12,#9]
-	mov	r4,r3,lsr#24
-	strb	r6,[r12,#10]
-	mov	r5,r3,lsr#16
-	strb	r2,[r12,#11]
-	mov	r6,r3,lsr#8
-	strb	r4,[r12,#12]
-	strb	r5,[r12,#13]
-	strb	r6,[r12,#14]
-	strb	r3,[r12,#15]
-#endif
-	ldmia	sp!,{r4-r12,pc}
-ENDPROC(AES_encrypt)
-
-.type   _armv4_AES_encrypt,%function
-.align	2
-_armv4_AES_encrypt:
-	str	lr,[sp,#-4]!		@ push lr
-	ldmia	r11!,{r4-r7}
-	eor	r0,r0,r4
-	ldr	r12,[r11,#240-16]
-	eor	r1,r1,r5
-	eor	r2,r2,r6
-	eor	r3,r3,r7
-	sub	r12,r12,#1
-	mov	lr,#255
-
-	and	r7,lr,r0
-	and	r8,lr,r0,lsr#8
-	and	r9,lr,r0,lsr#16
-	mov	r0,r0,lsr#24
-.Lenc_loop:
-	ldr	r4,[r10,r7,lsl#2]	@ Te3[s0>>0]
-	and	r7,lr,r1,lsr#16	@ i0
-	ldr	r5,[r10,r8,lsl#2]	@ Te2[s0>>8]
-	and	r8,lr,r1
-	ldr	r6,[r10,r9,lsl#2]	@ Te1[s0>>16]
-	and	r9,lr,r1,lsr#8
-	ldr	r0,[r10,r0,lsl#2]	@ Te0[s0>>24]
-	mov	r1,r1,lsr#24
-
-	ldr	r7,[r10,r7,lsl#2]	@ Te1[s1>>16]
-	ldr	r8,[r10,r8,lsl#2]	@ Te3[s1>>0]
-	ldr	r9,[r10,r9,lsl#2]	@ Te2[s1>>8]
-	eor	r0,r0,r7,ror#8
-	ldr	r1,[r10,r1,lsl#2]	@ Te0[s1>>24]
-	and	r7,lr,r2,lsr#8	@ i0
-	eor	r5,r5,r8,ror#8
-	and	r8,lr,r2,lsr#16	@ i1
-	eor	r6,r6,r9,ror#8
-	and	r9,lr,r2
-	ldr	r7,[r10,r7,lsl#2]	@ Te2[s2>>8]
-	eor	r1,r1,r4,ror#24
-	ldr	r8,[r10,r8,lsl#2]	@ Te1[s2>>16]
-	mov	r2,r2,lsr#24
-
-	ldr	r9,[r10,r9,lsl#2]	@ Te3[s2>>0]
-	eor	r0,r0,r7,ror#16
-	ldr	r2,[r10,r2,lsl#2]	@ Te0[s2>>24]
-	and	r7,lr,r3		@ i0
-	eor	r1,r1,r8,ror#8
-	and	r8,lr,r3,lsr#8	@ i1
-	eor	r6,r6,r9,ror#16
-	and	r9,lr,r3,lsr#16	@ i2
-	ldr	r7,[r10,r7,lsl#2]	@ Te3[s3>>0]
-	eor	r2,r2,r5,ror#16
-	ldr	r8,[r10,r8,lsl#2]	@ Te2[s3>>8]
-	mov	r3,r3,lsr#24
-
-	ldr	r9,[r10,r9,lsl#2]	@ Te1[s3>>16]
-	eor	r0,r0,r7,ror#24
-	ldr	r7,[r11],#16
-	eor	r1,r1,r8,ror#16
-	ldr	r3,[r10,r3,lsl#2]	@ Te0[s3>>24]
-	eor	r2,r2,r9,ror#8
-	ldr	r4,[r11,#-12]
-	eor	r3,r3,r6,ror#8
-
-	ldr	r5,[r11,#-8]
-	eor	r0,r0,r7
-	ldr	r6,[r11,#-4]
-	and	r7,lr,r0
-	eor	r1,r1,r4
-	and	r8,lr,r0,lsr#8
-	eor	r2,r2,r5
-	and	r9,lr,r0,lsr#16
-	eor	r3,r3,r6
-	mov	r0,r0,lsr#24
-
-	subs	r12,r12,#1
-	bne	.Lenc_loop
-
-	add	r10,r10,#2
-
-	ldrb	r4,[r10,r7,lsl#2]	@ Te4[s0>>0]
-	and	r7,lr,r1,lsr#16	@ i0
-	ldrb	r5,[r10,r8,lsl#2]	@ Te4[s0>>8]
-	and	r8,lr,r1
-	ldrb	r6,[r10,r9,lsl#2]	@ Te4[s0>>16]
-	and	r9,lr,r1,lsr#8
-	ldrb	r0,[r10,r0,lsl#2]	@ Te4[s0>>24]
-	mov	r1,r1,lsr#24
-
-	ldrb	r7,[r10,r7,lsl#2]	@ Te4[s1>>16]
-	ldrb	r8,[r10,r8,lsl#2]	@ Te4[s1>>0]
-	ldrb	r9,[r10,r9,lsl#2]	@ Te4[s1>>8]
-	eor	r0,r7,r0,lsl#8
-	ldrb	r1,[r10,r1,lsl#2]	@ Te4[s1>>24]
-	and	r7,lr,r2,lsr#8	@ i0
-	eor	r5,r8,r5,lsl#8
-	and	r8,lr,r2,lsr#16	@ i1
-	eor	r6,r9,r6,lsl#8
-	and	r9,lr,r2
-	ldrb	r7,[r10,r7,lsl#2]	@ Te4[s2>>8]
-	eor	r1,r4,r1,lsl#24
-	ldrb	r8,[r10,r8,lsl#2]	@ Te4[s2>>16]
-	mov	r2,r2,lsr#24
-
-	ldrb	r9,[r10,r9,lsl#2]	@ Te4[s2>>0]
-	eor	r0,r7,r0,lsl#8
-	ldrb	r2,[r10,r2,lsl#2]	@ Te4[s2>>24]
-	and	r7,lr,r3		@ i0
-	eor	r1,r1,r8,lsl#16
-	and	r8,lr,r3,lsr#8	@ i1
-	eor	r6,r9,r6,lsl#8
-	and	r9,lr,r3,lsr#16	@ i2
-	ldrb	r7,[r10,r7,lsl#2]	@ Te4[s3>>0]
-	eor	r2,r5,r2,lsl#24
-	ldrb	r8,[r10,r8,lsl#2]	@ Te4[s3>>8]
-	mov	r3,r3,lsr#24
-
-	ldrb	r9,[r10,r9,lsl#2]	@ Te4[s3>>16]
-	eor	r0,r7,r0,lsl#8
-	ldr	r7,[r11,#0]
-	ldrb	r3,[r10,r3,lsl#2]	@ Te4[s3>>24]
-	eor	r1,r1,r8,lsl#8
-	ldr	r4,[r11,#4]
-	eor	r2,r2,r9,lsl#16
-	ldr	r5,[r11,#8]
-	eor	r3,r6,r3,lsl#24
-	ldr	r6,[r11,#12]
-
-	eor	r0,r0,r7
-	eor	r1,r1,r4
-	eor	r2,r2,r5
-	eor	r3,r3,r6
-
-	sub	r10,r10,#2
-	ldr	pc,[sp],#4		@ pop and return
-.size	_armv4_AES_encrypt,.-_armv4_AES_encrypt
-
-.align	5
-ENTRY(private_AES_set_encrypt_key)
-_armv4_AES_set_encrypt_key:
-	adr	r3,_armv4_AES_set_encrypt_key
-	teq	r0,#0
-	moveq	r0,#-1
-	beq	.Labrt
-	teq	r2,#0
-	moveq	r0,#-1
-	beq	.Labrt
-
-	teq	r1,#128
-	beq	.Lok
-	teq	r1,#192
-	beq	.Lok
-	teq	r1,#256
-	movne	r0,#-1
-	bne	.Labrt
-
-.Lok:	stmdb   sp!,{r4-r12,lr}
-	sub	r10,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024	@ Te4
-
-	mov	r12,r0		@ inp
-	mov	lr,r1			@ bits
-	mov	r11,r2			@ key
-
-#if __ARM_ARCH__<7
-	ldrb	r0,[r12,#3]	@ load input data in endian-neutral
-	ldrb	r4,[r12,#2]	@ manner...
-	ldrb	r5,[r12,#1]
-	ldrb	r6,[r12,#0]
-	orr	r0,r0,r4,lsl#8
-	ldrb	r1,[r12,#7]
-	orr	r0,r0,r5,lsl#16
-	ldrb	r4,[r12,#6]
-	orr	r0,r0,r6,lsl#24
-	ldrb	r5,[r12,#5]
-	ldrb	r6,[r12,#4]
-	orr	r1,r1,r4,lsl#8
-	ldrb	r2,[r12,#11]
-	orr	r1,r1,r5,lsl#16
-	ldrb	r4,[r12,#10]
-	orr	r1,r1,r6,lsl#24
-	ldrb	r5,[r12,#9]
-	ldrb	r6,[r12,#8]
-	orr	r2,r2,r4,lsl#8
-	ldrb	r3,[r12,#15]
-	orr	r2,r2,r5,lsl#16
-	ldrb	r4,[r12,#14]
-	orr	r2,r2,r6,lsl#24
-	ldrb	r5,[r12,#13]
-	ldrb	r6,[r12,#12]
-	orr	r3,r3,r4,lsl#8
-	str	r0,[r11],#16
-	orr	r3,r3,r5,lsl#16
-	str	r1,[r11,#-12]
-	orr	r3,r3,r6,lsl#24
-	str	r2,[r11,#-8]
-	str	r3,[r11,#-4]
-#else
-	ldr	r0,[r12,#0]
-	ldr	r1,[r12,#4]
-	ldr	r2,[r12,#8]
-	ldr	r3,[r12,#12]
-#ifdef __ARMEL__
-	rev	r0,r0
-	rev	r1,r1
-	rev	r2,r2
-	rev	r3,r3
-#endif
-	str	r0,[r11],#16
-	str	r1,[r11,#-12]
-	str	r2,[r11,#-8]
-	str	r3,[r11,#-4]
-#endif
-
-	teq	lr,#128
-	bne	.Lnot128
-	mov	r12,#10
-	str	r12,[r11,#240-16]
-	add	r6,r10,#256			@ rcon
-	mov	lr,#255
-
-.L128_loop:
-	and	r5,lr,r3,lsr#24
-	and	r7,lr,r3,lsr#16
-	ldrb	r5,[r10,r5]
-	and	r8,lr,r3,lsr#8
-	ldrb	r7,[r10,r7]
-	and	r9,lr,r3
-	ldrb	r8,[r10,r8]
-	orr	r5,r5,r7,lsl#24
-	ldrb	r9,[r10,r9]
-	orr	r5,r5,r8,lsl#16
-	ldr	r4,[r6],#4			@ rcon[i++]
-	orr	r5,r5,r9,lsl#8
-	eor	r5,r5,r4
-	eor	r0,r0,r5			@ rk[4]=rk[0]^...
-	eor	r1,r1,r0			@ rk[5]=rk[1]^rk[4]
-	str	r0,[r11],#16
-	eor	r2,r2,r1			@ rk[6]=rk[2]^rk[5]
-	str	r1,[r11,#-12]
-	eor	r3,r3,r2			@ rk[7]=rk[3]^rk[6]
-	str	r2,[r11,#-8]
-	subs	r12,r12,#1
-	str	r3,[r11,#-4]
-	bne	.L128_loop
-	sub	r2,r11,#176
-	b	.Ldone
-
-.Lnot128:
-#if __ARM_ARCH__<7
-	ldrb	r8,[r12,#19]
-	ldrb	r4,[r12,#18]
-	ldrb	r5,[r12,#17]
-	ldrb	r6,[r12,#16]
-	orr	r8,r8,r4,lsl#8
-	ldrb	r9,[r12,#23]
-	orr	r8,r8,r5,lsl#16
-	ldrb	r4,[r12,#22]
-	orr	r8,r8,r6,lsl#24
-	ldrb	r5,[r12,#21]
-	ldrb	r6,[r12,#20]
-	orr	r9,r9,r4,lsl#8
-	orr	r9,r9,r5,lsl#16
-	str	r8,[r11],#8
-	orr	r9,r9,r6,lsl#24
-	str	r9,[r11,#-4]
-#else
-	ldr	r8,[r12,#16]
-	ldr	r9,[r12,#20]
-#ifdef __ARMEL__
-	rev	r8,r8
-	rev	r9,r9
-#endif
-	str	r8,[r11],#8
-	str	r9,[r11,#-4]
-#endif
-
-	teq	lr,#192
-	bne	.Lnot192
-	mov	r12,#12
-	str	r12,[r11,#240-24]
-	add	r6,r10,#256			@ rcon
-	mov	lr,#255
-	mov	r12,#8
-
-.L192_loop:
-	and	r5,lr,r9,lsr#24
-	and	r7,lr,r9,lsr#16
-	ldrb	r5,[r10,r5]
-	and	r8,lr,r9,lsr#8
-	ldrb	r7,[r10,r7]
-	and	r9,lr,r9
-	ldrb	r8,[r10,r8]
-	orr	r5,r5,r7,lsl#24
-	ldrb	r9,[r10,r9]
-	orr	r5,r5,r8,lsl#16
-	ldr	r4,[r6],#4			@ rcon[i++]
-	orr	r5,r5,r9,lsl#8
-	eor	r9,r5,r4
-	eor	r0,r0,r9			@ rk[6]=rk[0]^...
-	eor	r1,r1,r0			@ rk[7]=rk[1]^rk[6]
-	str	r0,[r11],#24
-	eor	r2,r2,r1			@ rk[8]=rk[2]^rk[7]
-	str	r1,[r11,#-20]
-	eor	r3,r3,r2			@ rk[9]=rk[3]^rk[8]
-	str	r2,[r11,#-16]
-	subs	r12,r12,#1
-	str	r3,[r11,#-12]
-	subeq	r2,r11,#216
-	beq	.Ldone
-
-	ldr	r7,[r11,#-32]
-	ldr	r8,[r11,#-28]
-	eor	r7,r7,r3			@ rk[10]=rk[4]^rk[9]
-	eor	r9,r8,r7			@ rk[11]=rk[5]^rk[10]
-	str	r7,[r11,#-8]
-	str	r9,[r11,#-4]
-	b	.L192_loop
-
-.Lnot192:
-#if __ARM_ARCH__<7
-	ldrb	r8,[r12,#27]
-	ldrb	r4,[r12,#26]
-	ldrb	r5,[r12,#25]
-	ldrb	r6,[r12,#24]
-	orr	r8,r8,r4,lsl#8
-	ldrb	r9,[r12,#31]
-	orr	r8,r8,r5,lsl#16
-	ldrb	r4,[r12,#30]
-	orr	r8,r8,r6,lsl#24
-	ldrb	r5,[r12,#29]
-	ldrb	r6,[r12,#28]
-	orr	r9,r9,r4,lsl#8
-	orr	r9,r9,r5,lsl#16
-	str	r8,[r11],#8
-	orr	r9,r9,r6,lsl#24
-	str	r9,[r11,#-4]
-#else
-	ldr	r8,[r12,#24]
-	ldr	r9,[r12,#28]
-#ifdef __ARMEL__
-	rev	r8,r8
-	rev	r9,r9
-#endif
-	str	r8,[r11],#8
-	str	r9,[r11,#-4]
-#endif
-
-	mov	r12,#14
-	str	r12,[r11,#240-32]
-	add	r6,r10,#256			@ rcon
-	mov	lr,#255
-	mov	r12,#7
-
-.L256_loop:
-	and	r5,lr,r9,lsr#24
-	and	r7,lr,r9,lsr#16
-	ldrb	r5,[r10,r5]
-	and	r8,lr,r9,lsr#8
-	ldrb	r7,[r10,r7]
-	and	r9,lr,r9
-	ldrb	r8,[r10,r8]
-	orr	r5,r5,r7,lsl#24
-	ldrb	r9,[r10,r9]
-	orr	r5,r5,r8,lsl#16
-	ldr	r4,[r6],#4			@ rcon[i++]
-	orr	r5,r5,r9,lsl#8
-	eor	r9,r5,r4
-	eor	r0,r0,r9			@ rk[8]=rk[0]^...
-	eor	r1,r1,r0			@ rk[9]=rk[1]^rk[8]
-	str	r0,[r11],#32
-	eor	r2,r2,r1			@ rk[10]=rk[2]^rk[9]
-	str	r1,[r11,#-28]
-	eor	r3,r3,r2			@ rk[11]=rk[3]^rk[10]
-	str	r2,[r11,#-24]
-	subs	r12,r12,#1
-	str	r3,[r11,#-20]
-	subeq	r2,r11,#256
-	beq	.Ldone
-
-	and	r5,lr,r3
-	and	r7,lr,r3,lsr#8
-	ldrb	r5,[r10,r5]
-	and	r8,lr,r3,lsr#16
-	ldrb	r7,[r10,r7]
-	and	r9,lr,r3,lsr#24
-	ldrb	r8,[r10,r8]
-	orr	r5,r5,r7,lsl#8
-	ldrb	r9,[r10,r9]
-	orr	r5,r5,r8,lsl#16
-	ldr	r4,[r11,#-48]
-	orr	r5,r5,r9,lsl#24
-
-	ldr	r7,[r11,#-44]
-	ldr	r8,[r11,#-40]
-	eor	r4,r4,r5			@ rk[12]=rk[4]^...
-	ldr	r9,[r11,#-36]
-	eor	r7,r7,r4			@ rk[13]=rk[5]^rk[12]
-	str	r4,[r11,#-16]
-	eor	r8,r8,r7			@ rk[14]=rk[6]^rk[13]
-	str	r7,[r11,#-12]
-	eor	r9,r9,r8			@ rk[15]=rk[7]^rk[14]
-	str	r8,[r11,#-8]
-	str	r9,[r11,#-4]
-	b	.L256_loop
-
-.Ldone:	mov	r0,#0
-	ldmia   sp!,{r4-r12,lr}
-.Labrt:	ret	lr
-ENDPROC(private_AES_set_encrypt_key)
-
-.align	5
-ENTRY(private_AES_set_decrypt_key)
-	str	lr,[sp,#-4]!            @ push lr
-#if 0
-	@ kernel does both of these in setkey so optimise this bit out by
-	@ expecting the key to already have the enc_key work done (see aes_glue.c)
-	bl	_armv4_AES_set_encrypt_key
-#else
-	mov	r0,#0
-#endif
-	teq	r0,#0
-	ldrne	lr,[sp],#4              @ pop lr
-	bne	.Labrt
-
-	stmdb   sp!,{r4-r12}
-
-	ldr	r12,[r2,#240]	@ AES_set_encrypt_key preserves r2,
-	mov	r11,r2			@ which is AES_KEY *key
-	mov	r7,r2
-	add	r8,r2,r12,lsl#4
-
-.Linv:	ldr	r0,[r7]
-	ldr	r1,[r7,#4]
-	ldr	r2,[r7,#8]
-	ldr	r3,[r7,#12]
-	ldr	r4,[r8]
-	ldr	r5,[r8,#4]
-	ldr	r6,[r8,#8]
-	ldr	r9,[r8,#12]
-	str	r0,[r8],#-16
-	str	r1,[r8,#16+4]
-	str	r2,[r8,#16+8]
-	str	r3,[r8,#16+12]
-	str	r4,[r7],#16
-	str	r5,[r7,#-12]
-	str	r6,[r7,#-8]
-	str	r9,[r7,#-4]
-	teq	r7,r8
-	bne	.Linv
-	ldr	r0,[r11,#16]!		@ prefetch tp1
-	mov	r7,#0x80
-	mov	r8,#0x1b
-	orr	r7,r7,#0x8000
-	orr	r8,r8,#0x1b00
-	orr	r7,r7,r7,lsl#16
-	orr	r8,r8,r8,lsl#16
-	sub	r12,r12,#1
-	mvn	r9,r7
-	mov	r12,r12,lsl#2	@ (rounds-1)*4
-
-.Lmix:	and	r4,r0,r7
-	and	r1,r0,r9
-	sub	r4,r4,r4,lsr#7
-	and	r4,r4,r8
-	eor	r1,r4,r1,lsl#1	@ tp2
-
-	and	r4,r1,r7
-	and	r2,r1,r9
-	sub	r4,r4,r4,lsr#7
-	and	r4,r4,r8
-	eor	r2,r4,r2,lsl#1	@ tp4
-
-	and	r4,r2,r7
-	and	r3,r2,r9
-	sub	r4,r4,r4,lsr#7
-	and	r4,r4,r8
-	eor	r3,r4,r3,lsl#1	@ tp8
-
-	eor	r4,r1,r2
-	eor	r5,r0,r3		@ tp9
-	eor	r4,r4,r3		@ tpe
-	eor	r4,r4,r1,ror#24
-	eor	r4,r4,r5,ror#24	@ ^= ROTATE(tpb=tp9^tp2,8)
-	eor	r4,r4,r2,ror#16
-	eor	r4,r4,r5,ror#16	@ ^= ROTATE(tpd=tp9^tp4,16)
-	eor	r4,r4,r5,ror#8	@ ^= ROTATE(tp9,24)
-
-	ldr	r0,[r11,#4]		@ prefetch tp1
-	str	r4,[r11],#4
-	subs	r12,r12,#1
-	bne	.Lmix
-
-	mov	r0,#0
-	ldmia	sp!,{r4-r12,pc}
-ENDPROC(private_AES_set_decrypt_key)
-
-.type	AES_Td,%object
-.align	5
-AES_Td:
-.word	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
-.word	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
-.word	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
-.word	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
-.word	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
-.word	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
-.word	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
-.word	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
-.word	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
-.word	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
-.word	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
-.word	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
-.word	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
-.word	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
-.word	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
-.word	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
-.word	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
-.word	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
-.word	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
-.word	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
-.word	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
-.word	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
-.word	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
-.word	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
-.word	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
-.word	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
-.word	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
-.word	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
-.word	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
-.word	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
-.word	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
-.word	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
-.word	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
-.word	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
-.word	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
-.word	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
-.word	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
-.word	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
-.word	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
-.word	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
-.word	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
-.word	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
-.word	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
-.word	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
-.word	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
-.word	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
-.word	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
-.word	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
-.word	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
-.word	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
-.word	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
-.word	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
-.word	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
-.word	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
-.word	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
-.word	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
-.word	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
-.word	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
-.word	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
-.word	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
-.word	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
-.word	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
-.word	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
-.word	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
-@ Td4[256]
-.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
-.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
-.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
-.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
-.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
-.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
-.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
-.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
-.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
-.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
-.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
-.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
-.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
-.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
-.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
-.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
-.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
-.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
-.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
-.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
-.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
-.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
-.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
-.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
-.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
-.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
-.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
-.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
-.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
-.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
-.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
-.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
-.size	AES_Td,.-AES_Td
-
-@ void AES_decrypt(const unsigned char *in, unsigned char *out,
-@ 		 const AES_KEY *key) {
-.align	5
-ENTRY(AES_decrypt)
-	adr	r3,AES_decrypt
-	stmdb   sp!,{r1,r4-r12,lr}
-	mov	r12,r0		@ inp
-	mov	r11,r2
-	sub	r10,r3,#AES_decrypt-AES_Td		@ Td
-#if __ARM_ARCH__<7
-	ldrb	r0,[r12,#3]	@ load input data in endian-neutral
-	ldrb	r4,[r12,#2]	@ manner...
-	ldrb	r5,[r12,#1]
-	ldrb	r6,[r12,#0]
-	orr	r0,r0,r4,lsl#8
-	ldrb	r1,[r12,#7]
-	orr	r0,r0,r5,lsl#16
-	ldrb	r4,[r12,#6]
-	orr	r0,r0,r6,lsl#24
-	ldrb	r5,[r12,#5]
-	ldrb	r6,[r12,#4]
-	orr	r1,r1,r4,lsl#8
-	ldrb	r2,[r12,#11]
-	orr	r1,r1,r5,lsl#16
-	ldrb	r4,[r12,#10]
-	orr	r1,r1,r6,lsl#24
-	ldrb	r5,[r12,#9]
-	ldrb	r6,[r12,#8]
-	orr	r2,r2,r4,lsl#8
-	ldrb	r3,[r12,#15]
-	orr	r2,r2,r5,lsl#16
-	ldrb	r4,[r12,#14]
-	orr	r2,r2,r6,lsl#24
-	ldrb	r5,[r12,#13]
-	ldrb	r6,[r12,#12]
-	orr	r3,r3,r4,lsl#8
-	orr	r3,r3,r5,lsl#16
-	orr	r3,r3,r6,lsl#24
-#else
-	ldr	r0,[r12,#0]
-	ldr	r1,[r12,#4]
-	ldr	r2,[r12,#8]
-	ldr	r3,[r12,#12]
-#ifdef __ARMEL__
-	rev	r0,r0
-	rev	r1,r1
-	rev	r2,r2
-	rev	r3,r3
-#endif
-#endif
-	bl	_armv4_AES_decrypt
-
-	ldr	r12,[sp],#4		@ pop out
-#if __ARM_ARCH__>=7
-#ifdef __ARMEL__
-	rev	r0,r0
-	rev	r1,r1
-	rev	r2,r2
-	rev	r3,r3
-#endif
-	str	r0,[r12,#0]
-	str	r1,[r12,#4]
-	str	r2,[r12,#8]
-	str	r3,[r12,#12]
-#else
-	mov	r4,r0,lsr#24		@ write output in endian-neutral
-	mov	r5,r0,lsr#16		@ manner...
-	mov	r6,r0,lsr#8
-	strb	r4,[r12,#0]
-	strb	r5,[r12,#1]
-	mov	r4,r1,lsr#24
-	strb	r6,[r12,#2]
-	mov	r5,r1,lsr#16
-	strb	r0,[r12,#3]
-	mov	r6,r1,lsr#8
-	strb	r4,[r12,#4]
-	strb	r5,[r12,#5]
-	mov	r4,r2,lsr#24
-	strb	r6,[r12,#6]
-	mov	r5,r2,lsr#16
-	strb	r1,[r12,#7]
-	mov	r6,r2,lsr#8
-	strb	r4,[r12,#8]
-	strb	r5,[r12,#9]
-	mov	r4,r3,lsr#24
-	strb	r6,[r12,#10]
-	mov	r5,r3,lsr#16
-	strb	r2,[r12,#11]
-	mov	r6,r3,lsr#8
-	strb	r4,[r12,#12]
-	strb	r5,[r12,#13]
-	strb	r6,[r12,#14]
-	strb	r3,[r12,#15]
-#endif
-	ldmia	sp!,{r4-r12,pc}
-ENDPROC(AES_decrypt)
-
-.type   _armv4_AES_decrypt,%function
-.align	2
-_armv4_AES_decrypt:
-	str	lr,[sp,#-4]!		@ push lr
-	ldmia	r11!,{r4-r7}
-	eor	r0,r0,r4
-	ldr	r12,[r11,#240-16]
-	eor	r1,r1,r5
-	eor	r2,r2,r6
-	eor	r3,r3,r7
-	sub	r12,r12,#1
-	mov	lr,#255
-
-	and	r7,lr,r0,lsr#16
-	and	r8,lr,r0,lsr#8
-	and	r9,lr,r0
-	mov	r0,r0,lsr#24
-.Ldec_loop:
-	ldr	r4,[r10,r7,lsl#2]	@ Td1[s0>>16]
-	and	r7,lr,r1		@ i0
-	ldr	r5,[r10,r8,lsl#2]	@ Td2[s0>>8]
-	and	r8,lr,r1,lsr#16
-	ldr	r6,[r10,r9,lsl#2]	@ Td3[s0>>0]
-	and	r9,lr,r1,lsr#8
-	ldr	r0,[r10,r0,lsl#2]	@ Td0[s0>>24]
-	mov	r1,r1,lsr#24
-
-	ldr	r7,[r10,r7,lsl#2]	@ Td3[s1>>0]
-	ldr	r8,[r10,r8,lsl#2]	@ Td1[s1>>16]
-	ldr	r9,[r10,r9,lsl#2]	@ Td2[s1>>8]
-	eor	r0,r0,r7,ror#24
-	ldr	r1,[r10,r1,lsl#2]	@ Td0[s1>>24]
-	and	r7,lr,r2,lsr#8	@ i0
-	eor	r5,r8,r5,ror#8
-	and	r8,lr,r2		@ i1
-	eor	r6,r9,r6,ror#8
-	and	r9,lr,r2,lsr#16
-	ldr	r7,[r10,r7,lsl#2]	@ Td2[s2>>8]
-	eor	r1,r1,r4,ror#8
-	ldr	r8,[r10,r8,lsl#2]	@ Td3[s2>>0]
-	mov	r2,r2,lsr#24
-
-	ldr	r9,[r10,r9,lsl#2]	@ Td1[s2>>16]
-	eor	r0,r0,r7,ror#16
-	ldr	r2,[r10,r2,lsl#2]	@ Td0[s2>>24]
-	and	r7,lr,r3,lsr#16	@ i0
-	eor	r1,r1,r8,ror#24
-	and	r8,lr,r3,lsr#8	@ i1
-	eor	r6,r9,r6,ror#8
-	and	r9,lr,r3		@ i2
-	ldr	r7,[r10,r7,lsl#2]	@ Td1[s3>>16]
-	eor	r2,r2,r5,ror#8
-	ldr	r8,[r10,r8,lsl#2]	@ Td2[s3>>8]
-	mov	r3,r3,lsr#24
-
-	ldr	r9,[r10,r9,lsl#2]	@ Td3[s3>>0]
-	eor	r0,r0,r7,ror#8
-	ldr	r7,[r11],#16
-	eor	r1,r1,r8,ror#16
-	ldr	r3,[r10,r3,lsl#2]	@ Td0[s3>>24]
-	eor	r2,r2,r9,ror#24
-
-	ldr	r4,[r11,#-12]
-	eor	r0,r0,r7
-	ldr	r5,[r11,#-8]
-	eor	r3,r3,r6,ror#8
-	ldr	r6,[r11,#-4]
-	and	r7,lr,r0,lsr#16
-	eor	r1,r1,r4
-	and	r8,lr,r0,lsr#8
-	eor	r2,r2,r5
-	and	r9,lr,r0
-	eor	r3,r3,r6
-	mov	r0,r0,lsr#24
-
-	subs	r12,r12,#1
-	bne	.Ldec_loop
-
-	add	r10,r10,#1024
-
-	ldr	r5,[r10,#0]		@ prefetch Td4
-	ldr	r6,[r10,#32]
-	ldr	r4,[r10,#64]
-	ldr	r5,[r10,#96]
-	ldr	r6,[r10,#128]
-	ldr	r4,[r10,#160]
-	ldr	r5,[r10,#192]
-	ldr	r6,[r10,#224]
-
-	ldrb	r0,[r10,r0]		@ Td4[s0>>24]
-	ldrb	r4,[r10,r7]		@ Td4[s0>>16]
-	and	r7,lr,r1		@ i0
-	ldrb	r5,[r10,r8]		@ Td4[s0>>8]
-	and	r8,lr,r1,lsr#16
-	ldrb	r6,[r10,r9]		@ Td4[s0>>0]
-	and	r9,lr,r1,lsr#8
-
-	ldrb	r7,[r10,r7]		@ Td4[s1>>0]
- ARM(	ldrb	r1,[r10,r1,lsr#24]  )	@ Td4[s1>>24]
- THUMB(	add	r1,r10,r1,lsr#24    ) 	@ Td4[s1>>24]
- THUMB(	ldrb	r1,[r1]		    )
-	ldrb	r8,[r10,r8]		@ Td4[s1>>16]
-	eor	r0,r7,r0,lsl#24
-	ldrb	r9,[r10,r9]		@ Td4[s1>>8]
-	eor	r1,r4,r1,lsl#8
-	and	r7,lr,r2,lsr#8	@ i0
-	eor	r5,r5,r8,lsl#8
-	and	r8,lr,r2		@ i1
-	ldrb	r7,[r10,r7]		@ Td4[s2>>8]
-	eor	r6,r6,r9,lsl#8
-	ldrb	r8,[r10,r8]		@ Td4[s2>>0]
-	and	r9,lr,r2,lsr#16
-
- ARM(	ldrb	r2,[r10,r2,lsr#24]  )	@ Td4[s2>>24]
- THUMB(	add	r2,r10,r2,lsr#24    )	@ Td4[s2>>24]
- THUMB(	ldrb	r2,[r2]		    )
-	eor	r0,r0,r7,lsl#8
-	ldrb	r9,[r10,r9]		@ Td4[s2>>16]
-	eor	r1,r8,r1,lsl#16
-	and	r7,lr,r3,lsr#16	@ i0
-	eor	r2,r5,r2,lsl#16
-	and	r8,lr,r3,lsr#8	@ i1
-	ldrb	r7,[r10,r7]		@ Td4[s3>>16]
-	eor	r6,r6,r9,lsl#16
-	ldrb	r8,[r10,r8]		@ Td4[s3>>8]
-	and	r9,lr,r3		@ i2
-
-	ldrb	r9,[r10,r9]		@ Td4[s3>>0]
- ARM(	ldrb	r3,[r10,r3,lsr#24]  )	@ Td4[s3>>24]
- THUMB(	add	r3,r10,r3,lsr#24    )	@ Td4[s3>>24]
- THUMB(	ldrb	r3,[r3]		    )
-	eor	r0,r0,r7,lsl#16
-	ldr	r7,[r11,#0]
-	eor	r1,r1,r8,lsl#8
-	ldr	r4,[r11,#4]
-	eor	r2,r9,r2,lsl#8
-	ldr	r5,[r11,#8]
-	eor	r3,r6,r3,lsl#24
-	ldr	r6,[r11,#12]
-
-	eor	r0,r0,r7
-	eor	r1,r1,r4
-	eor	r2,r2,r5
-	eor	r3,r3,r6
-
-	sub	r10,r10,#1024
-	ldr	pc,[sp],#4		@ pop and return
-.size	_armv4_AES_decrypt,.-_armv4_AES_decrypt
-.asciz	"AES for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
-.align	2

+ 41 - 43
arch/arm/crypto/aes-ce-core.S

@@ -169,19 +169,19 @@ ENTRY(ce_aes_ecb_encrypt)
 .Lecbencloop3x:
 	subs		r4, r4, #3
 	bmi		.Lecbenc1x
-	vld1.8		{q0-q1}, [r1, :64]!
-	vld1.8		{q2}, [r1, :64]!
+	vld1.8		{q0-q1}, [r1]!
+	vld1.8		{q2}, [r1]!
 	bl		aes_encrypt_3x
-	vst1.8		{q0-q1}, [r0, :64]!
-	vst1.8		{q2}, [r0, :64]!
+	vst1.8		{q0-q1}, [r0]!
+	vst1.8		{q2}, [r0]!
 	b		.Lecbencloop3x
 .Lecbenc1x:
 	adds		r4, r4, #3
 	beq		.Lecbencout
 .Lecbencloop:
-	vld1.8		{q0}, [r1, :64]!
+	vld1.8		{q0}, [r1]!
 	bl		aes_encrypt
-	vst1.8		{q0}, [r0, :64]!
+	vst1.8		{q0}, [r0]!
 	subs		r4, r4, #1
 	bne		.Lecbencloop
 .Lecbencout:
@@ -195,19 +195,19 @@ ENTRY(ce_aes_ecb_decrypt)
 .Lecbdecloop3x:
 	subs		r4, r4, #3
 	bmi		.Lecbdec1x
-	vld1.8		{q0-q1}, [r1, :64]!
-	vld1.8		{q2}, [r1, :64]!
+	vld1.8		{q0-q1}, [r1]!
+	vld1.8		{q2}, [r1]!
 	bl		aes_decrypt_3x
-	vst1.8		{q0-q1}, [r0, :64]!
-	vst1.8		{q2}, [r0, :64]!
+	vst1.8		{q0-q1}, [r0]!
+	vst1.8		{q2}, [r0]!
 	b		.Lecbdecloop3x
 .Lecbdec1x:
 	adds		r4, r4, #3
 	beq		.Lecbdecout
 .Lecbdecloop:
-	vld1.8		{q0}, [r1, :64]!
+	vld1.8		{q0}, [r1]!
 	bl		aes_decrypt
-	vst1.8		{q0}, [r0, :64]!
+	vst1.8		{q0}, [r0]!
 	subs		r4, r4, #1
 	bne		.Lecbdecloop
 .Lecbdecout:
@@ -226,10 +226,10 @@ ENTRY(ce_aes_cbc_encrypt)
 	vld1.8		{q0}, [r5]
 	prepare_key	r2, r3
 .Lcbcencloop:
-	vld1.8		{q1}, [r1, :64]!	@ get next pt block
+	vld1.8		{q1}, [r1]!		@ get next pt block
 	veor		q0, q0, q1		@ ..and xor with iv
 	bl		aes_encrypt
-	vst1.8		{q0}, [r0, :64]!
+	vst1.8		{q0}, [r0]!
 	subs		r4, r4, #1
 	bne		.Lcbcencloop
 	vst1.8		{q0}, [r5]
@@ -244,8 +244,8 @@ ENTRY(ce_aes_cbc_decrypt)
 .Lcbcdecloop3x:
 	subs		r4, r4, #3
 	bmi		.Lcbcdec1x
-	vld1.8		{q0-q1}, [r1, :64]!
-	vld1.8		{q2}, [r1, :64]!
+	vld1.8		{q0-q1}, [r1]!
+	vld1.8		{q2}, [r1]!
 	vmov		q3, q0
 	vmov		q4, q1
 	vmov		q5, q2
@@ -254,19 +254,19 @@ ENTRY(ce_aes_cbc_decrypt)
 	veor		q1, q1, q3
 	veor		q2, q2, q4
 	vmov		q6, q5
-	vst1.8		{q0-q1}, [r0, :64]!
-	vst1.8		{q2}, [r0, :64]!
+	vst1.8		{q0-q1}, [r0]!
+	vst1.8		{q2}, [r0]!
 	b		.Lcbcdecloop3x
 .Lcbcdec1x:
 	adds		r4, r4, #3
 	beq		.Lcbcdecout
 	vmov		q15, q14		@ preserve last round key
 .Lcbcdecloop:
-	vld1.8		{q0}, [r1, :64]!	@ get next ct block
+	vld1.8		{q0}, [r1]!		@ get next ct block
 	veor		q14, q15, q6		@ combine prev ct with last key
 	vmov		q6, q0
 	bl		aes_decrypt
-	vst1.8		{q0}, [r0, :64]!
+	vst1.8		{q0}, [r0]!
 	subs		r4, r4, #1
 	bne		.Lcbcdecloop
 .Lcbcdecout:
@@ -300,15 +300,15 @@ ENTRY(ce_aes_ctr_encrypt)
 	rev		ip, r6
 	add		r6, r6, #1
 	vmov		s11, ip
-	vld1.8		{q3-q4}, [r1, :64]!
-	vld1.8		{q5}, [r1, :64]!
+	vld1.8		{q3-q4}, [r1]!
+	vld1.8		{q5}, [r1]!
 	bl		aes_encrypt_3x
 	veor		q0, q0, q3
 	veor		q1, q1, q4
 	veor		q2, q2, q5
 	rev		ip, r6
-	vst1.8		{q0-q1}, [r0, :64]!
-	vst1.8		{q2}, [r0, :64]!
+	vst1.8		{q0-q1}, [r0]!
+	vst1.8		{q2}, [r0]!
 	vmov		s27, ip
 	b		.Lctrloop3x
 .Lctr1x:
@@ -318,10 +318,10 @@ ENTRY(ce_aes_ctr_encrypt)
 	vmov		q0, q6
 	bl		aes_encrypt
 	subs		r4, r4, #1
-	bmi		.Lctrhalfblock		@ blocks < 0 means 1/2 block
-	vld1.8		{q3}, [r1, :64]!
+	bmi		.Lctrtailblock		@ blocks < 0 means tail block
+	vld1.8		{q3}, [r1]!
 	veor		q3, q0, q3
-	vst1.8		{q3}, [r0, :64]!
+	vst1.8		{q3}, [r0]!
 
 	adds		r6, r6, #1		@ increment BE ctr
 	rev		ip, r6
@@ -333,10 +333,8 @@ ENTRY(ce_aes_ctr_encrypt)
 	vst1.8		{q6}, [r5]
 	pop		{r4-r6, pc}
 
-.Lctrhalfblock:
-	vld1.8		{d1}, [r1, :64]
-	veor		d0, d0, d1
-	vst1.8		{d0}, [r0, :64]
+.Lctrtailblock:
+	vst1.8		{q0}, [r0, :64]		@ return just the key stream
 	pop		{r4-r6, pc}
 
 .Lctrcarry:
@@ -405,8 +403,8 @@ ENTRY(ce_aes_xts_encrypt)
 .Lxtsenc3x:
 	subs		r4, r4, #3
 	bmi		.Lxtsenc1x
-	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 pt blocks
-	vld1.8		{q2}, [r1, :64]!
+	vld1.8		{q0-q1}, [r1]!		@ get 3 pt blocks
+	vld1.8		{q2}, [r1]!
 	next_tweak	q4, q3, q7, q6
 	veor		q0, q0, q3
 	next_tweak	q5, q4, q7, q6
@@ -416,8 +414,8 @@ ENTRY(ce_aes_xts_encrypt)
 	veor		q0, q0, q3
 	veor		q1, q1, q4
 	veor		q2, q2, q5
-	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 ct blocks
-	vst1.8		{q2}, [r0, :64]!
+	vst1.8		{q0-q1}, [r0]!		@ write 3 ct blocks
+	vst1.8		{q2}, [r0]!
 	vmov		q3, q5
 	teq		r4, #0
 	beq		.Lxtsencout
@@ -426,11 +424,11 @@ ENTRY(ce_aes_xts_encrypt)
 	adds		r4, r4, #3
 	beq		.Lxtsencout
 .Lxtsencloop:
-	vld1.8		{q0}, [r1, :64]!
+	vld1.8		{q0}, [r1]!
 	veor		q0, q0, q3
 	bl		aes_encrypt
 	veor		q0, q0, q3
-	vst1.8		{q0}, [r0, :64]!
+	vst1.8		{q0}, [r0]!
 	subs		r4, r4, #1
 	beq		.Lxtsencout
 	next_tweak	q3, q3, q7, q6
@@ -456,8 +454,8 @@ ENTRY(ce_aes_xts_decrypt)
 .Lxtsdec3x:
 	subs		r4, r4, #3
 	bmi		.Lxtsdec1x
-	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 ct blocks
-	vld1.8		{q2}, [r1, :64]!
+	vld1.8		{q0-q1}, [r1]!		@ get 3 ct blocks
+	vld1.8		{q2}, [r1]!
 	next_tweak	q4, q3, q7, q6
 	veor		q0, q0, q3
 	next_tweak	q5, q4, q7, q6
@@ -467,8 +465,8 @@ ENTRY(ce_aes_xts_decrypt)
 	veor		q0, q0, q3
 	veor		q1, q1, q4
 	veor		q2, q2, q5
-	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 pt blocks
-	vst1.8		{q2}, [r0, :64]!
+	vst1.8		{q0-q1}, [r0]!		@ write 3 pt blocks
+	vst1.8		{q2}, [r0]!
 	vmov		q3, q5
 	teq		r4, #0
 	beq		.Lxtsdecout
@@ -477,12 +475,12 @@ ENTRY(ce_aes_xts_decrypt)
 	adds		r4, r4, #3
 	beq		.Lxtsdecout
 .Lxtsdecloop:
-	vld1.8		{q0}, [r1, :64]!
+	vld1.8		{q0}, [r1]!
 	veor		q0, q0, q3
 	add		ip, r2, #32		@ 3rd round key
 	bl		aes_decrypt
 	veor		q0, q0, q3
-	vst1.8		{q0}, [r0, :64]!
+	vst1.8		{q0}, [r0]!
 	subs		r4, r4, #1
 	beq		.Lxtsdecout
 	next_tweak	q3, q3, q7, q6

+ 6 - 9
arch/arm/crypto/aes-ce-glue.c

@@ -278,14 +278,15 @@ static int ctr_encrypt(struct skcipher_request *req)
 		u8 *tsrc = walk.src.virt.addr;
 
 		/*
-		 * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need
-		 * to tell aes_ctr_encrypt() to only read half a block.
+		 * Tell aes_ctr_encrypt() to process a tail block.
 		 */
-		blocks = (nbytes <= 8) ? -1 : 1;
+		blocks = -1;
 
-		ce_aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc,
+		ce_aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc,
 				   num_rounds(ctx), blocks, walk.iv);
-		memcpy(tdst, tail, nbytes);
+		if (tdst != tsrc)
+			memcpy(tdst, tsrc, nbytes);
+		crypto_xor(tdst, tail, nbytes);
 		err = skcipher_walk_done(&walk, 0);
 	}
 	kernel_neon_end();
@@ -345,7 +346,6 @@ static struct skcipher_alg aes_algs[] = { {
 		.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.cra_blocksize		= AES_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
-		.cra_alignmask		= 7,
 		.cra_module		= THIS_MODULE,
 	},
 	.min_keysize	= AES_MIN_KEY_SIZE,
@@ -361,7 +361,6 @@ static struct skcipher_alg aes_algs[] = { {
 		.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.cra_blocksize		= AES_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
-		.cra_alignmask		= 7,
 		.cra_module		= THIS_MODULE,
 	},
 	.min_keysize	= AES_MIN_KEY_SIZE,
@@ -378,7 +377,6 @@ static struct skcipher_alg aes_algs[] = { {
 		.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.cra_blocksize		= 1,
 		.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
-		.cra_alignmask		= 7,
 		.cra_module		= THIS_MODULE,
 	},
 	.min_keysize	= AES_MIN_KEY_SIZE,
@@ -396,7 +394,6 @@ static struct skcipher_alg aes_algs[] = { {
 		.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.cra_blocksize		= AES_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct crypto_aes_xts_ctx),
-		.cra_alignmask		= 7,
 		.cra_module		= THIS_MODULE,
 	},
 	.min_keysize	= 2 * AES_MIN_KEY_SIZE,
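
The ctr_encrypt() hunk above changes how the final partial block is handled: instead of asking the assembler routine to read half a block, it now asks it to emit one block of pure keystream and XORs just the tail bytes in C, which also made the 8-byte alignmask removals below it possible. The following is a hedged, standalone C sketch of that pattern; block_encrypt() and ctr_encrypt_tail() are hypothetical stand-ins, not the driver's symbols.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

#define BLOCK_SIZE 16

/* Stand-in for the real block cipher (the driver calls its CE/NEON
 * routine here).  Placeholder body so the sketch compiles; not AES. */
static void block_encrypt(uint8_t block[BLOCK_SIZE])
{
	for (int i = 0; i < BLOCK_SIZE; i++)
		block[i] ^= 0xAA;	/* placeholder "encryption" */
}

/* CTR tail handling in the style of the hunk above: turn the counter
 * block into keystream, copy the sub-block tail if src != dst, then
 * XOR the keystream in (the role crypto_xor() plays in the driver). */
static void ctr_encrypt_tail(uint8_t *dst, const uint8_t *src,
			     const uint8_t ctr[BLOCK_SIZE], size_t nbytes)
{
	uint8_t keystream[BLOCK_SIZE];

	memcpy(keystream, ctr, BLOCK_SIZE);
	block_encrypt(keystream);		/* keystream = E_k(ctr) */

	if (dst != src)
		memcpy(dst, src, nbytes);	/* nbytes < BLOCK_SIZE */
	for (size_t i = 0; i < nbytes; i++)
		dst[i] ^= keystream[i];
}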

+ 179 - 0
arch/arm/crypto/aes-cipher-core.S

@@ -0,0 +1,179 @@
+/*
+ * Scalar AES core transform
+ *
+ * Copyright (C) 2017 Linaro Ltd.
+ * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+	.text
+	.align		5
+
+	rk		.req	r0
+	rounds		.req	r1
+	in		.req	r2
+	out		.req	r3
+	ttab		.req	ip
+
+	t0		.req	lr
+	t1		.req	r2
+	t2		.req	r3
+
+	.macro		__select, out, in, idx
+	.if		__LINUX_ARM_ARCH__ < 7
+	and		\out, \in, #0xff << (8 * \idx)
+	.else
+	ubfx		\out, \in, #(8 * \idx), #8
+	.endif
+	.endm
+
+	.macro		__load, out, in, idx
+	.if		__LINUX_ARM_ARCH__ < 7 && \idx > 0
+	ldr		\out, [ttab, \in, lsr #(8 * \idx) - 2]
+	.else
+	ldr		\out, [ttab, \in, lsl #2]
+	.endif
+	.endm
+
+	.macro		__hround, out0, out1, in0, in1, in2, in3, t3, t4, enc
+	__select	\out0, \in0, 0
+	__select	t0, \in1, 1
+	__load		\out0, \out0, 0
+	__load		t0, t0, 1
+
+	.if		\enc
+	__select	\out1, \in1, 0
+	__select	t1, \in2, 1
+	.else
+	__select	\out1, \in3, 0
+	__select	t1, \in0, 1
+	.endif
+	__load		\out1, \out1, 0
+	__select	t2, \in2, 2
+	__load		t1, t1, 1
+	__load		t2, t2, 2
+
+	eor		\out0, \out0, t0, ror #24
+
+	__select	t0, \in3, 3
+	.if		\enc
+	__select	\t3, \in3, 2
+	__select	\t4, \in0, 3
+	.else
+	__select	\t3, \in1, 2
+	__select	\t4, \in2, 3
+	.endif
+	__load		\t3, \t3, 2
+	__load		t0, t0, 3
+	__load		\t4, \t4, 3
+
+	eor		\out1, \out1, t1, ror #24
+	eor		\out0, \out0, t2, ror #16
+	ldm		rk!, {t1, t2}
+	eor		\out1, \out1, \t3, ror #16
+	eor		\out0, \out0, t0, ror #8
+	eor		\out1, \out1, \t4, ror #8
+	eor		\out0, \out0, t1
+	eor		\out1, \out1, t2
+	.endm
+
+	.macro		fround, out0, out1, out2, out3, in0, in1, in2, in3
+	__hround	\out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1
+	__hround	\out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1
+	.endm
+
+	.macro		iround, out0, out1, out2, out3, in0, in1, in2, in3
+	__hround	\out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0
+	__hround	\out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0
+	.endm
+
+	.macro		__rev, out, in
+	.if		__LINUX_ARM_ARCH__ < 6
+	lsl		t0, \in, #24
+	and		t1, \in, #0xff00
+	and		t2, \in, #0xff0000
+	orr		\out, t0, \in, lsr #24
+	orr		\out, \out, t1, lsl #8
+	orr		\out, \out, t2, lsr #8
+	.else
+	rev		\out, \in
+	.endif
+	.endm
+
+	.macro		__adrl, out, sym, c
+	.if		__LINUX_ARM_ARCH__ < 7
+	ldr\c		\out, =\sym
+	.else
+	movw\c		\out, #:lower16:\sym
+	movt\c		\out, #:upper16:\sym
+	.endif
+	.endm
+
+	.macro		do_crypt, round, ttab, ltab
+	push		{r3-r11, lr}
+
+	ldr		r4, [in]
+	ldr		r5, [in, #4]
+	ldr		r6, [in, #8]
+	ldr		r7, [in, #12]
+
+	ldm		rk!, {r8-r11}
+
+#ifdef CONFIG_CPU_BIG_ENDIAN
+	__rev		r4, r4
+	__rev		r5, r5
+	__rev		r6, r6
+	__rev		r7, r7
+#endif
+
+	eor		r4, r4, r8
+	eor		r5, r5, r9
+	eor		r6, r6, r10
+	eor		r7, r7, r11
+
+	__adrl		ttab, \ttab
+
+	tst		rounds, #2
+	bne		1f
+
+0:	\round		r8, r9, r10, r11, r4, r5, r6, r7
+	\round		r4, r5, r6, r7, r8, r9, r10, r11
+
+1:	subs		rounds, rounds, #4
+	\round		r8, r9, r10, r11, r4, r5, r6, r7
+	__adrl		ttab, \ltab, ls
+	\round		r4, r5, r6, r7, r8, r9, r10, r11
+	bhi		0b
+
+#ifdef CONFIG_CPU_BIG_ENDIAN
+	__rev		r4, r4
+	__rev		r5, r5
+	__rev		r6, r6
+	__rev		r7, r7
+#endif
+
+	ldr		out, [sp]
+
+	str		r4, [out]
+	str		r5, [out, #4]
+	str		r6, [out, #8]
+	str		r7, [out, #12]
+
+	pop		{r3-r11, pc}
+
+	.align		3
+	.ltorg
+	.endm
+
+ENTRY(__aes_arm_encrypt)
+	do_crypt	fround, crypto_ft_tab, crypto_fl_tab
+ENDPROC(__aes_arm_encrypt)
+
+ENTRY(__aes_arm_decrypt)
+	do_crypt	iround, crypto_it_tab, crypto_il_tab
+ENDPROC(__aes_arm_decrypt)

+ 74 - 0
arch/arm/crypto/aes-cipher-glue.c

@@ -0,0 +1,74 @@
+/*
+ * Scalar AES core transform
+ *
+ * Copyright (C) 2017 Linaro Ltd.
+ * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <crypto/aes.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+asmlinkage void __aes_arm_encrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
+EXPORT_SYMBOL(__aes_arm_encrypt);
+
+asmlinkage void __aes_arm_decrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
+EXPORT_SYMBOL(__aes_arm_decrypt);
+
+static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	int rounds = 6 + ctx->key_length / 4;
+
+	__aes_arm_encrypt(ctx->key_enc, rounds, in, out);
+}
+
+static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	int rounds = 6 + ctx->key_length / 4;
+
+	__aes_arm_decrypt(ctx->key_dec, rounds, in, out);
+}
+
+static struct crypto_alg aes_alg = {
+	.cra_name			= "aes",
+	.cra_driver_name		= "aes-arm",
+	.cra_priority			= 200,
+	.cra_flags			= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize			= AES_BLOCK_SIZE,
+	.cra_ctxsize			= sizeof(struct crypto_aes_ctx),
+	.cra_module			= THIS_MODULE,
+
+	.cra_cipher.cia_min_keysize	= AES_MIN_KEY_SIZE,
+	.cra_cipher.cia_max_keysize	= AES_MAX_KEY_SIZE,
+	.cra_cipher.cia_setkey		= crypto_aes_set_key,
+	.cra_cipher.cia_encrypt		= aes_encrypt,
+	.cra_cipher.cia_decrypt		= aes_decrypt,
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	.cra_alignmask			= 3,
+#endif
+};
+
+static int __init aes_init(void)
+{
+	return crypto_register_alg(&aes_alg);
+}
+
+static void __exit aes_fini(void)
+{
+	crypto_unregister_alg(&aes_alg);
+}
+
+module_init(aes_init);
+module_exit(aes_fini);
+
+MODULE_DESCRIPTION("Scalar AES cipher for ARM");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("aes");
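
For context, the "aes" cipher registered above is consumed through the kernel's single-block cipher API; the crypto core picks the highest-priority implementation (here "aes-arm" at priority 200, unless a faster driver is present). A hedged sketch of a caller — the demo module and its names are hypothetical, and error handling is trimmed:

	#include <crypto/aes.h>
	#include <linux/crypto.h>
	#include <linux/err.h>
	#include <linux/module.h>

	/* Hypothetical smoke test: run one 16-byte block through
	 * whichever "aes" cipher implementation the core selects. */
	static int __init aes_demo_init(void)
	{
		struct crypto_cipher *tfm;
		u8 key[AES_KEYSIZE_128] = { 0 };
		u8 in[AES_BLOCK_SIZE] = { 0 }, out[AES_BLOCK_SIZE];
		int err;

		tfm = crypto_alloc_cipher("aes", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		err = crypto_cipher_setkey(tfm, key, sizeof(key));
		if (!err)
			crypto_cipher_encrypt_one(tfm, out, in);

		crypto_free_cipher(tfm);
		return err;
	}
	module_init(aes_demo_init);
	MODULE_LICENSE("GPL");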

+ 1023 - 0
arch/arm/crypto/aes-neonbs-core.S

@@ -0,0 +1,1023 @@
+/*
+ * Bit sliced AES using NEON instructions
+ *
+ * Copyright (C) 2017 Linaro Ltd.
+ * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * The algorithm implemented here is described in detail by the paper
+ * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
+ * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
+ *
+ * This implementation is based primarily on the OpenSSL implementation
+ * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+	.fpu		neon
+
+	rounds		.req	ip
+	bskey		.req	r4
+
+	q0l		.req	d0
+	q0h		.req	d1
+	q1l		.req	d2
+	q1h		.req	d3
+	q2l		.req	d4
+	q2h		.req	d5
+	q3l		.req	d6
+	q3h		.req	d7
+	q4l		.req	d8
+	q4h		.req	d9
+	q5l		.req	d10
+	q5h		.req	d11
+	q6l		.req	d12
+	q6h		.req	d13
+	q7l		.req	d14
+	q7h		.req	d15
+	q8l		.req	d16
+	q8h		.req	d17
+	q9l		.req	d18
+	q9h		.req	d19
+	q10l		.req	d20
+	q10h		.req	d21
+	q11l		.req	d22
+	q11h		.req	d23
+	q12l		.req	d24
+	q12h		.req	d25
+	q13l		.req	d26
+	q13h		.req	d27
+	q14l		.req	d28
+	q14h		.req	d29
+	q15l		.req	d30
+	q15h		.req	d31
+
+	.macro		__tbl, out, tbl, in, tmp
+	.ifc		\out, \tbl
+	.ifb		\tmp
+	.error		"__tbl needs temp register if out == tbl"
+	.endif
+	vmov		\tmp, \out
+	.endif
+	vtbl.8		\out\()l, {\tbl}, \in\()l
+	.ifc		\out, \tbl
+	vtbl.8		\out\()h, {\tmp}, \in\()h
+	.else
+	vtbl.8		\out\()h, {\tbl}, \in\()h
+	.endif
+	.endm
+
+	.macro		__ldr, out, sym
+	vldr		\out\()l, \sym
+	vldr		\out\()h, \sym + 8
+	.endm
+
+	.macro		__adr, reg, lbl
+	adr		\reg, \lbl
+THUMB(	orr		\reg, \reg, #1		)
+	.endm
+
+	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
+	veor		\b2, \b2, \b1
+	veor		\b5, \b5, \b6
+	veor		\b3, \b3, \b0
+	veor		\b6, \b6, \b2
+	veor		\b5, \b5, \b0
+	veor		\b6, \b6, \b3
+	veor		\b3, \b3, \b7
+	veor		\b7, \b7, \b5
+	veor		\b3, \b3, \b4
+	veor		\b4, \b4, \b5
+	veor		\b2, \b2, \b7
+	veor		\b3, \b3, \b1
+	veor		\b1, \b1, \b5
+	.endm
+
+	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
+	veor		\b0, \b0, \b6
+	veor		\b1, \b1, \b4
+	veor		\b4, \b4, \b6
+	veor		\b2, \b2, \b0
+	veor		\b6, \b6, \b1
+	veor		\b1, \b1, \b5
+	veor		\b5, \b5, \b3
+	veor		\b3, \b3, \b7
+	veor		\b7, \b7, \b5
+	veor		\b2, \b2, \b5
+	veor		\b4, \b4, \b7
+	.endm
+
+	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
+	veor		\b1, \b1, \b7
+	veor		\b4, \b4, \b7
+	veor		\b7, \b7, \b5
+	veor		\b1, \b1, \b3
+	veor		\b2, \b2, \b5
+	veor		\b3, \b3, \b7
+	veor		\b6, \b6, \b1
+	veor		\b2, \b2, \b0
+	veor		\b5, \b5, \b3
+	veor		\b4, \b4, \b6
+	veor		\b0, \b0, \b6
+	veor		\b1, \b1, \b4
+	.endm
+
+	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
+	veor		\b1, \b1, \b5
+	veor		\b2, \b2, \b7
+	veor		\b3, \b3, \b1
+	veor		\b4, \b4, \b5
+	veor		\b7, \b7, \b5
+	veor		\b3, \b3, \b4
+	veor 		\b5, \b5, \b0
+	veor		\b3, \b3, \b7
+	veor		\b6, \b6, \b2
+	veor		\b2, \b2, \b1
+	veor		\b6, \b6, \b3
+	veor		\b3, \b3, \b0
+	veor		\b5, \b5, \b6
+	.endm
+
+	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
+	veor 		\t0, \y0, \y1
+	vand		\t0, \t0, \x0
+	veor		\x0, \x0, \x1
+	vand		\t1, \x1, \y0
+	vand		\x0, \x0, \y1
+	veor		\x1, \t1, \t0
+	veor		\x0, \x0, \t1
+	.endm
+
+	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
+	veor		\t0, \y0, \y1
+	veor 		\t1, \y2, \y3
+	vand		\t0, \t0, \x0
+	vand		\t1, \t1, \x2
+	veor		\x0, \x0, \x1
+	veor		\x2, \x2, \x3
+	vand		\x1, \x1, \y0
+	vand		\x3, \x3, \y2
+	vand		\x0, \x0, \y1
+	vand		\x2, \x2, \y3
+	veor		\x1, \x1, \x0
+	veor		\x2, \x2, \x3
+	veor		\x0, \x0, \t0
+	veor		\x3, \x3, \t1
+	.endm
+
+	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
+				    y0, y1, y2, y3, t0, t1, t2, t3
+	veor		\t0, \x0, \x2
+	veor		\t1, \x1, \x3
+	mul_gf4  	\x0, \x1, \y0, \y1, \t2, \t3
+	veor		\y0, \y0, \y2
+	veor		\y1, \y1, \y3
+	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
+	veor		\x0, \x0, \t0
+	veor		\x2, \x2, \t0
+	veor		\x1, \x1, \t1
+	veor		\x3, \x3, \t1
+	veor		\t0, \x4, \x6
+	veor		\t1, \x5, \x7
+	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
+	veor		\y0, \y0, \y2
+	veor		\y1, \y1, \y3
+	mul_gf4  	\x4, \x5, \y0, \y1, \t2, \t3
+	veor		\x4, \x4, \t0
+	veor		\x6, \x6, \t0
+	veor		\x5, \x5, \t1
+	veor		\x7, \x7, \t1
+	.endm
+
+	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
+				   t0, t1, t2, t3, s0, s1, s2, s3
+	veor		\t3, \x4, \x6
+	veor		\t0, \x5, \x7
+	veor		\t1, \x1, \x3
+	veor		\s1, \x7, \x6
+	veor		\s0, \x0, \x2
+	veor		\s3, \t3, \t0
+	vorr		\t2, \t0, \t1
+	vand		\s2, \t3, \s0
+	vorr		\t3, \t3, \s0
+	veor		\s0, \s0, \t1
+	vand		\t0, \t0, \t1
+	veor		\t1, \x3, \x2
+	vand		\s3, \s3, \s0
+	vand		\s1, \s1, \t1
+	veor		\t1, \x4, \x5
+	veor		\s0, \x1, \x0
+	veor		\t3, \t3, \s1
+	veor		\t2, \t2, \s1
+	vand		\s1, \t1, \s0
+	vorr		\t1, \t1, \s0
+	veor		\t3, \t3, \s3
+	veor		\t0, \t0, \s1
+	veor		\t2, \t2, \s2
+	veor		\t1, \t1, \s3
+	veor		\t0, \t0, \s2
+	vand		\s0, \x7, \x3
+	veor		\t1, \t1, \s2
+	vand		\s1, \x6, \x2
+	vand		\s2, \x5, \x1
+	vorr		\s3, \x4, \x0
+	veor		\t3, \t3, \s0
+	veor		\t1, \t1, \s2
+	veor		\s0, \t0, \s3
+	veor		\t2, \t2, \s1
+	vand		\s2, \t3, \t1
+	veor		\s1, \t2, \s2
+	veor		\s3, \s0, \s2
+	vbsl		\s1, \t1, \s0
+	vmvn		\t0, \s0
+	vbsl		\s0, \s1, \s3
+	vbsl		\t0, \s1, \s3
+	vbsl		\s3, \t3, \t2
+	veor		\t3, \t3, \t2
+	vand		\s2, \s0, \s3
+	veor		\t1, \t1, \t0
+	veor		\s2, \s2, \t3
+	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
+			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+	.endm
+
+	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
+			      t0, t1, t2, t3, s0, s1, s2, s3
+	in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
+	inv_gf256	\b6, \b5, \b0, \b3, \b7, \b1, \b4, \b2, \
+			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+	out_bs_ch	\b7, \b1, \b4, \b2, \b6, \b5, \b0, \b3
+	.endm
+
+	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
+				  t0, t1, t2, t3, s0, s1, s2, s3
+	inv_in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
+	inv_gf256	\b5, \b1, \b2, \b6, \b3, \b7, \b0, \b4, \
+			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+	inv_out_bs_ch	\b3, \b7, \b0, \b4, \b5, \b1, \b2, \b6
+	.endm
+
+	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
+				    t0, t1, t2, t3, mask
+	vld1.8		{\t0-\t1}, [bskey, :256]!
+	veor		\t0, \t0, \x0
+	vld1.8		{\t2-\t3}, [bskey, :256]!
+	veor		\t1, \t1, \x1
+	__tbl		\x0, \t0, \mask
+	veor		\t2, \t2, \x2
+	__tbl		\x1, \t1, \mask
+	vld1.8		{\t0-\t1}, [bskey, :256]!
+	veor		\t3, \t3, \x3
+	__tbl		\x2, \t2, \mask
+	__tbl		\x3, \t3, \mask
+	vld1.8		{\t2-\t3}, [bskey, :256]!
+	veor		\t0, \t0, \x4
+	veor		\t1, \t1, \x5
+	__tbl		\x4, \t0, \mask
+	veor		\t2, \t2, \x6
+	__tbl		\x5, \t1, \mask
+	veor		\t3, \t3, \x7
+	__tbl		\x6, \t2, \mask
+	__tbl		\x7, \t3, \mask
+	.endm
+
+	.macro		inv_shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
+					t0, t1, t2, t3, mask
+	__tbl		\x0, \x0, \mask, \t0
+	__tbl		\x1, \x1, \mask, \t1
+	__tbl		\x2, \x2, \mask, \t2
+	__tbl		\x3, \x3, \mask, \t3
+	__tbl		\x4, \x4, \mask, \t0
+	__tbl		\x5, \x5, \mask, \t1
+	__tbl		\x6, \x6, \mask, \t2
+	__tbl		\x7, \x7, \mask, \t3
+	.endm
+
+	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
+				  t0, t1, t2, t3, t4, t5, t6, t7, inv
+	vext.8		\t0, \x0, \x0, #12
+	vext.8		\t1, \x1, \x1, #12
+	veor		\x0, \x0, \t0
+	vext.8		\t2, \x2, \x2, #12
+	veor		\x1, \x1, \t1
+	vext.8		\t3, \x3, \x3, #12
+	veor		\x2, \x2, \t2
+	vext.8		\t4, \x4, \x4, #12
+	veor		\x3, \x3, \t3
+	vext.8		\t5, \x5, \x5, #12
+	veor		\x4, \x4, \t4
+	vext.8		\t6, \x6, \x6, #12
+	veor		\x5, \x5, \t5
+	vext.8		\t7, \x7, \x7, #12
+	veor		\x6, \x6, \t6
+	veor		\t1, \t1, \x0
+	veor		\x7, \x7, \t7
+	vext.8		\x0, \x0, \x0, #8
+	veor		\t2, \t2, \x1
+	veor		\t0, \t0, \x7
+	veor		\t1, \t1, \x7
+	vext.8		\x1, \x1, \x1, #8
+	veor		\t5, \t5, \x4
+	veor		\x0, \x0, \t0
+	veor		\t6, \t6, \x5
+	veor		\x1, \x1, \t1
+	vext.8		\t0, \x4, \x4, #8
+	veor		\t4, \t4, \x3
+	vext.8		\t1, \x5, \x5, #8
+	veor		\t7, \t7, \x6
+	vext.8		\x4, \x3, \x3, #8
+	veor		\t3, \t3, \x2
+	vext.8		\x5, \x7, \x7, #8
+	veor		\t4, \t4, \x7
+	vext.8		\x3, \x6, \x6, #8
+	veor		\t3, \t3, \x7
+	vext.8		\x6, \x2, \x2, #8
+	veor		\x7, \t1, \t5
+	.ifb		\inv
+	veor		\x2, \t0, \t4
+	veor		\x4, \x4, \t3
+	veor		\x5, \x5, \t7
+	veor		\x3, \x3, \t6
+	veor		\x6, \x6, \t2
+	.else
+	veor		\t3, \t3, \x4
+	veor		\x5, \x5, \t7
+	veor		\x2, \x3, \t6
+	veor		\x3, \t0, \t4
+	veor		\x4, \x6, \t2
+	vmov		\x6, \t3
+	.endif
+	.endm
+
+	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
+				      t0, t1, t2, t3, t4, t5, t6, t7
+	vld1.8		{\t0-\t1}, [bskey, :256]!
+	veor		\x0, \x0, \t0
+	vld1.8		{\t2-\t3}, [bskey, :256]!
+	veor		\x1, \x1, \t1
+	vld1.8		{\t4-\t5}, [bskey, :256]!
+	veor		\x2, \x2, \t2
+	vld1.8		{\t6-\t7}, [bskey, :256]
+	sub		bskey, bskey, #224
+	veor		\x3, \x3, \t3
+	veor		\x4, \x4, \t4
+	veor		\x5, \x5, \t5
+	veor		\x6, \x6, \t6
+	veor		\x7, \x7, \t7
+	vext.8		\t0, \x0, \x0, #8
+	vext.8		\t6, \x6, \x6, #8
+	vext.8		\t7, \x7, \x7, #8
+	veor		\t0, \t0, \x0
+	vext.8		\t1, \x1, \x1, #8
+	veor		\t6, \t6, \x6
+	vext.8		\t2, \x2, \x2, #8
+	veor		\t7, \t7, \x7
+	vext.8		\t3, \x3, \x3, #8
+	veor		\t1, \t1, \x1
+	vext.8		\t4, \x4, \x4, #8
+	veor		\t2, \t2, \x2
+	vext.8		\t5, \x5, \x5, #8
+	veor		\t3, \t3, \x3
+	veor		\t4, \t4, \x4
+	veor		\t5, \t5, \x5
+	veor		\x0, \x0, \t6
+	veor		\x1, \x1, \t6
+	veor		\x2, \x2, \t0
+	veor		\x4, \x4, \t2
+	veor		\x3, \x3, \t1
+	veor		\x1, \x1, \t7
+	veor		\x2, \x2, \t7
+	veor		\x4, \x4, \t6
+	veor		\x5, \x5, \t3
+	veor		\x3, \x3, \t6
+	veor		\x6, \x6, \t4
+	veor		\x4, \x4, \t7
+	veor		\x5, \x5, \t7
+	veor		\x7, \x7, \t5
+	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
+			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
+	.endm
+
+	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
+	vshr.u64	\t0, \b0, #\n
+	vshr.u64	\t1, \b1, #\n
+	veor		\t0, \t0, \a0
+	veor		\t1, \t1, \a1
+	vand		\t0, \t0, \mask
+	vand		\t1, \t1, \mask
+	veor		\a0, \a0, \t0
+	vshl.s64	\t0, \t0, #\n
+	veor		\a1, \a1, \t1
+	vshl.s64	\t1, \t1, #\n
+	veor		\b0, \b0, \t0
+	veor		\b1, \b1, \t1
+	.endm
+
+	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
+	vmov.i8		\t0, #0x55
+	vmov.i8		\t1, #0x33
+	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
+	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
+	vmov.i8		\t0, #0x0f
+	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
+	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
+	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
+	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
+	.endm
+
+	.align		4
+M0:	.quad		0x02060a0e03070b0f, 0x0004080c0105090d
+
+	/*
+	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
+	 */
+ENTRY(aesbs_convert_key)
+	vld1.32		{q7}, [r1]!		// load round 0 key
+	vld1.32		{q15}, [r1]!		// load round 1 key
+
+	vmov.i8		q8,  #0x01		// bit masks
+	vmov.i8		q9,  #0x02
+	vmov.i8		q10, #0x04
+	vmov.i8		q11, #0x08
+	vmov.i8		q12, #0x10
+	vmov.i8		q13, #0x20
+	__ldr		q14, M0
+
+	sub		r2, r2, #1
+	vst1.8		{q7}, [r0, :128]!	// save round 0 key
+
+.Lkey_loop:
+	__tbl		q7, q15, q14
+	vmov.i8		q6, #0x40
+	vmov.i8		q15, #0x80
+
+	vtst.8		q0, q7, q8
+	vtst.8		q1, q7, q9
+	vtst.8		q2, q7, q10
+	vtst.8		q3, q7, q11
+	vtst.8		q4, q7, q12
+	vtst.8		q5, q7, q13
+	vtst.8		q6, q7, q6
+	vtst.8		q7, q7, q15
+	vld1.32		{q15}, [r1]!		// load next round key
+	vmvn		q0, q0
+	vmvn		q1, q1
+	vmvn		q5, q5
+	vmvn		q6, q6
+
+	subs		r2, r2, #1
+	vst1.8		{q0-q1}, [r0, :256]!
+	vst1.8		{q2-q3}, [r0, :256]!
+	vst1.8		{q4-q5}, [r0, :256]!
+	vst1.8		{q6-q7}, [r0, :256]!
+	bne		.Lkey_loop
+
+	vmov.i8		q7, #0x63		// compose .L63
+	veor		q15, q15, q7
+	vst1.8		{q15}, [r0, :128]
+	bx		lr
+ENDPROC(aesbs_convert_key)
+
+	.align		4
+M0SR:	.quad		0x0a0e02060f03070b, 0x0004080c05090d01
+
+aesbs_encrypt8:
+	vld1.8		{q9}, [bskey, :128]!	// round 0 key
+	__ldr		q8, M0SR
+
+	veor		q10, q0, q9		// xor with round0 key
+	veor		q11, q1, q9
+	__tbl		q0, q10, q8
+	veor		q12, q2, q9
+	__tbl		q1, q11, q8
+	veor		q13, q3, q9
+	__tbl		q2, q12, q8
+	veor		q14, q4, q9
+	__tbl		q3, q13, q8
+	veor		q15, q5, q9
+	__tbl		q4, q14, q8
+	veor		q10, q6, q9
+	__tbl		q5, q15, q8
+	veor		q11, q7, q9
+	__tbl		q6, q10, q8
+	__tbl		q7, q11, q8
+
+	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11
+
+	sub		rounds, rounds, #1
+	b		.Lenc_sbox
+
+	.align		5
+SR:	.quad		0x0504070600030201, 0x0f0e0d0c0a09080b
+SRM0:	.quad		0x0304090e00050a0f, 0x01060b0c0207080d
+
+.Lenc_last:
+	__ldr		q12, SRM0
+.Lenc_loop:
+	shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
+.Lenc_sbox:
+	sbox		q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
+								q13, q14, q15
+	subs		rounds, rounds, #1
+	bcc		.Lenc_done
+
+	mix_cols	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11, q12, \
+								q13, q14, q15
+
+	beq		.Lenc_last
+	__ldr		q12, SR
+	b		.Lenc_loop
+
+.Lenc_done:
+	vld1.8		{q12}, [bskey, :128]	// last round key
+
+	bitslice	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11
+
+	veor		q0, q0, q12
+	veor		q1, q1, q12
+	veor		q4, q4, q12
+	veor		q6, q6, q12
+	veor		q3, q3, q12
+	veor		q7, q7, q12
+	veor		q2, q2, q12
+	veor		q5, q5, q12
+	bx		lr
+ENDPROC(aesbs_encrypt8)
+
+	.align		4
+M0ISR:	.quad		0x0a0e0206070b0f03, 0x0004080c0d010509
+
+aesbs_decrypt8:
+	add		bskey, bskey, rounds, lsl #7
+	sub		bskey, bskey, #112
+	vld1.8		{q9}, [bskey, :128]	// round 0 key
+	sub		bskey, bskey, #128
+	__ldr		q8, M0ISR
+
+	veor		q10, q0, q9		// xor with round0 key
+	veor		q11, q1, q9
+	__tbl		q0, q10, q8
+	veor		q12, q2, q9
+	__tbl		q1, q11, q8
+	veor		q13, q3, q9
+	__tbl		q2, q12, q8
+	veor		q14, q4, q9
+	__tbl		q3, q13, q8
+	veor		q15, q5, q9
+	__tbl		q4, q14, q8
+	veor		q10, q6, q9
+	__tbl		q5, q15, q8
+	veor		q11, q7, q9
+	__tbl		q6, q10, q8
+	__tbl		q7, q11, q8
+
+	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11
+
+	sub		rounds, rounds, #1
+	b		.Ldec_sbox
+
+	.align		5
+ISR:	.quad		0x0504070602010003, 0x0f0e0d0c080b0a09
+ISRM0:	.quad		0x01040b0e0205080f, 0x0306090c00070a0d
+
+.Ldec_last:
+	__ldr		q12, ISRM0
+.Ldec_loop:
+	inv_shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
+.Ldec_sbox:
+	inv_sbox	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
+								q13, q14, q15
+	subs		rounds, rounds, #1
+	bcc		.Ldec_done
+
+	inv_mix_cols	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11, q12, \
+								q13, q14, q15
+
+	beq		.Ldec_last
+	__ldr		q12, ISR
+	b		.Ldec_loop
+
+.Ldec_done:
+	add		bskey, bskey, #112
+	vld1.8		{q12}, [bskey, :128]	// last round key
+
+	bitslice	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11
+
+	veor		q0, q0, q12
+	veor		q1, q1, q12
+	veor		q6, q6, q12
+	veor		q4, q4, q12
+	veor		q2, q2, q12
+	veor		q7, q7, q12
+	veor		q3, q3, q12
+	veor		q5, q5, q12
+	bx		lr
+ENDPROC(aesbs_decrypt8)
+
+	/*
+	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks)
+	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks)
+	 */
+	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
+	push		{r4-r6, lr}
+	ldr		r5, [sp, #16]		// number of blocks
+
+99:	__adr		ip, 0f
+	and		lr, r5, #7
+	cmp		r5, #8
+	sub		ip, ip, lr, lsl #2
+	bxlt		ip			// computed goto if blocks < 8
+
+	vld1.8		{q0}, [r1]!
+	vld1.8		{q1}, [r1]!
+	vld1.8		{q2}, [r1]!
+	vld1.8		{q3}, [r1]!
+	vld1.8		{q4}, [r1]!
+	vld1.8		{q5}, [r1]!
+	vld1.8		{q6}, [r1]!
+	vld1.8		{q7}, [r1]!
+
+0:	mov		bskey, r2
+	mov		rounds, r3
+	bl		\do8
+
+	__adr		ip, 1f
+	and		lr, r5, #7
+	cmp		r5, #8
+	sub		ip, ip, lr, lsl #2
+	bxlt		ip			// computed goto if blocks < 8
+
+	vst1.8		{\o0}, [r0]!
+	vst1.8		{\o1}, [r0]!
+	vst1.8		{\o2}, [r0]!
+	vst1.8		{\o3}, [r0]!
+	vst1.8		{\o4}, [r0]!
+	vst1.8		{\o5}, [r0]!
+	vst1.8		{\o6}, [r0]!
+	vst1.8		{\o7}, [r0]!
+
+1:	subs		r5, r5, #8
+	bgt		99b
+
+	pop		{r4-r6, pc}
+	.endm
+
+	.align		4
+ENTRY(aesbs_ecb_encrypt)
+	__ecb_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
+ENDPROC(aesbs_ecb_encrypt)
+
+	.align		4
+ENTRY(aesbs_ecb_decrypt)
+	__ecb_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
+ENDPROC(aesbs_ecb_decrypt)
+
+	/*
+	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
+	 *		     int rounds, int blocks, u8 iv[])
+	 */
+	.align		4
+ENTRY(aesbs_cbc_decrypt)
+	mov		ip, sp
+	push		{r4-r6, lr}
+	ldm		ip, {r5-r6}		// load args 4-5
+
+99:	__adr		ip, 0f
+	and		lr, r5, #7
+	cmp		r5, #8
+	sub		ip, ip, lr, lsl #2
+	mov		lr, r1
+	bxlt		ip			// computed goto if blocks < 8
+
+	vld1.8		{q0}, [lr]!
+	vld1.8		{q1}, [lr]!
+	vld1.8		{q2}, [lr]!
+	vld1.8		{q3}, [lr]!
+	vld1.8		{q4}, [lr]!
+	vld1.8		{q5}, [lr]!
+	vld1.8		{q6}, [lr]!
+	vld1.8		{q7}, [lr]
+
+0:	mov		bskey, r2
+	mov		rounds, r3
+	bl		aesbs_decrypt8
+
+	vld1.8		{q8}, [r6]
+	vmov		q9, q8
+	vmov		q10, q8
+	vmov		q11, q8
+	vmov		q12, q8
+	vmov		q13, q8
+	vmov		q14, q8
+	vmov		q15, q8
+
+	__adr		ip, 1f
+	and		lr, r5, #7
+	cmp		r5, #8
+	sub		ip, ip, lr, lsl #2
+	bxlt		ip			// computed goto if blocks < 8
+
+	vld1.8		{q9}, [r1]!
+	vld1.8		{q10}, [r1]!
+	vld1.8		{q11}, [r1]!
+	vld1.8		{q12}, [r1]!
+	vld1.8		{q13}, [r1]!
+	vld1.8		{q14}, [r1]!
+	vld1.8		{q15}, [r1]!
+	W(nop)
+
+1:	__adr		ip, 2f
+	sub		ip, ip, lr, lsl #3
+	bxlt		ip			// computed goto if blocks < 8
+
+	veor		q0, q0, q8
+	vst1.8		{q0}, [r0]!
+	veor		q1, q1, q9
+	vst1.8		{q1}, [r0]!
+	veor		q6, q6, q10
+	vst1.8		{q6}, [r0]!
+	veor		q4, q4, q11
+	vst1.8		{q4}, [r0]!
+	veor		q2, q2, q12
+	vst1.8		{q2}, [r0]!
+	veor		q7, q7, q13
+	vst1.8		{q7}, [r0]!
+	veor		q3, q3, q14
+	vst1.8		{q3}, [r0]!
+	veor		q5, q5, q15
+	vld1.8		{q8}, [r1]!		// load next round's iv
+2:	vst1.8		{q5}, [r0]!
+
+	subs		r5, r5, #8
+	vst1.8		{q8}, [r6]		// store next round's iv
+	bgt		99b
+
+	pop		{r4-r6, pc}
+ENDPROC(aesbs_cbc_decrypt)
+
+	.macro		next_ctr, q
+	vmov.32		\q\()h[1], r10
+	adds		r10, r10, #1
+	vmov.32		\q\()h[0], r9
+	adcs		r9, r9, #0
+	vmov.32		\q\()l[1], r8
+	adcs		r8, r8, #0
+	vmov.32		\q\()l[0], r7
+	adc		r7, r7, #0
+	vrev32.8	\q, \q
+	.endm
+
+	/*
+	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+	 *		     int rounds, int blocks, u8 ctr[], u8 final[])
+	 */
+ENTRY(aesbs_ctr_encrypt)
+	mov		ip, sp
+	push		{r4-r10, lr}
+
+	ldm		ip, {r5-r7}		// load args 4-6
+	teq		r7, #0
+	addne		r5, r5, #1		// one extra block if final != 0
+
+	vld1.8		{q0}, [r6]		// load counter
+	vrev32.8	q1, q0
+	vmov		r9, r10, d3
+	vmov		r7, r8, d2
+
+	adds		r10, r10, #1
+	adcs		r9, r9, #0
+	adcs		r8, r8, #0
+	adc		r7, r7, #0
+
+99:	vmov		q1, q0
+	vmov		q2, q0
+	vmov		q3, q0
+	vmov		q4, q0
+	vmov		q5, q0
+	vmov		q6, q0
+	vmov		q7, q0
+
+	__adr		ip, 0f
+	sub		lr, r5, #1
+	and		lr, lr, #7
+	cmp		r5, #8
+	sub		ip, ip, lr, lsl #5
+	sub		ip, ip, lr, lsl #2
+	bxlt		ip			// computed goto if blocks < 8
+
+	next_ctr	q1
+	next_ctr	q2
+	next_ctr	q3
+	next_ctr	q4
+	next_ctr	q5
+	next_ctr	q6
+	next_ctr	q7
+
+0:	mov		bskey, r2
+	mov		rounds, r3
+	bl		aesbs_encrypt8
+
+	__adr		ip, 1f
+	and		lr, r5, #7
+	cmp		r5, #8
+	movgt		r4, #0
+	ldrle		r4, [sp, #40]		// load final in the last round
+	sub		ip, ip, lr, lsl #2
+	bxlt		ip			// computed goto if blocks < 8
+
+	vld1.8		{q8}, [r1]!
+	vld1.8		{q9}, [r1]!
+	vld1.8		{q10}, [r1]!
+	vld1.8		{q11}, [r1]!
+	vld1.8		{q12}, [r1]!
+	vld1.8		{q13}, [r1]!
+	vld1.8		{q14}, [r1]!
+	teq		r4, #0			// skip last block if 'final'
+1:	bne		2f
+	vld1.8		{q15}, [r1]!
+
+2:	__adr		ip, 3f
+	cmp		r5, #8
+	sub		ip, ip, lr, lsl #3
+	bxlt		ip			// computed goto if blocks < 8
+
+	veor		q0, q0, q8
+	vst1.8		{q0}, [r0]!
+	veor		q1, q1, q9
+	vst1.8		{q1}, [r0]!
+	veor		q4, q4, q10
+	vst1.8		{q4}, [r0]!
+	veor		q6, q6, q11
+	vst1.8		{q6}, [r0]!
+	veor		q3, q3, q12
+	vst1.8		{q3}, [r0]!
+	veor		q7, q7, q13
+	vst1.8		{q7}, [r0]!
+	veor		q2, q2, q14
+	vst1.8		{q2}, [r0]!
+	teq		r4, #0			// skip last block if 'final'
+	W(bne)		5f
+3:	veor		q5, q5, q15
+	vst1.8		{q5}, [r0]!
+
+4:	next_ctr	q0
+
+	subs		r5, r5, #8
+	bgt		99b
+
+	vst1.8		{q0}, [r6]
+	pop		{r4-r10, pc}
+
+5:	vst1.8		{q5}, [r4]
+	b		4b
+ENDPROC(aesbs_ctr_encrypt)
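
A note on next_ctr above: the counter is kept in r7-r10 after a vrev32.8, so the 128-bit increment can run as a plain adds/adcs carry chain on general registers; the macro then reassembles the vector and reverses back to big-endian byte order. A scalar sketch of the same increment (illustrative only):

	#include <stdint.h>

	/* Increment a 128-bit big-endian counter, mirroring the
	 * adds/adcs carry chain in next_ctr. */
	static void ctr128_inc(uint8_t ctr[16])
	{
		int i;

		for (i = 15; i >= 0; i--)
			if (++ctr[i] != 0)	/* stop when a byte doesn't wrap */
				break;
	}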
+
+	.macro		next_tweak, out, in, const, tmp
+	vshr.s64	\tmp, \in, #63
+	vand		\tmp, \tmp, \const
+	vadd.u64	\out, \in, \in
+	vext.8		\tmp, \tmp, \tmp, #8
+	veor		\out, \out, \tmp
+	.endm
+
+	.align		4
+.Lxts_mul_x:
+	.quad		1, 0x87
+
+	/*
+	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks, u8 iv[])
+	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks, u8 iv[])
+	 */
+__xts_prepare8:
+	vld1.8		{q14}, [r7]		// load iv
+	__ldr		q15, .Lxts_mul_x	// load tweak mask
+	vmov		q12, q14
+
+	__adr		ip, 0f
+	and		r4, r6, #7
+	cmp		r6, #8
+	sub		ip, ip, r4, lsl #5
+	mov		r4, sp
+	bxlt		ip			// computed goto if blocks < 8
+
+	vld1.8		{q0}, [r1]!
+	next_tweak	q12, q14, q15, q13
+	veor		q0, q0, q14
+	vst1.8		{q14}, [r4, :128]!
+
+	vld1.8		{q1}, [r1]!
+	next_tweak	q14, q12, q15, q13
+	veor		q1, q1, q12
+	vst1.8		{q12}, [r4, :128]!
+
+	vld1.8		{q2}, [r1]!
+	next_tweak	q12, q14, q15, q13
+	veor		q2, q2, q14
+	vst1.8		{q14}, [r4, :128]!
+
+	vld1.8		{q3}, [r1]!
+	next_tweak	q14, q12, q15, q13
+	veor		q3, q3, q12
+	vst1.8		{q12}, [r4, :128]!
+
+	vld1.8		{q4}, [r1]!
+	next_tweak	q12, q14, q15, q13
+	veor		q4, q4, q14
+	vst1.8		{q14}, [r4, :128]!
+
+	vld1.8		{q5}, [r1]!
+	next_tweak	q14, q12, q15, q13
+	veor		q5, q5, q12
+	vst1.8		{q12}, [r4, :128]!
+
+	vld1.8		{q6}, [r1]!
+	next_tweak	q12, q14, q15, q13
+	veor		q6, q6, q14
+	vst1.8		{q14}, [r4, :128]!
+
+	vld1.8		{q7}, [r1]!
+	next_tweak	q14, q12, q15, q13
+	veor		q7, q7, q12
+	vst1.8		{q12}, [r4, :128]
+
+0:	vst1.8		{q14}, [r7]		// store next iv
+	bx		lr
+ENDPROC(__xts_prepare8)
+
+	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
+	push		{r4-r8, lr}
+	mov		r5, sp			// preserve sp
+	ldrd		r6, r7, [sp, #24]	// get blocks and iv args
+	sub		ip, sp, #128		// make room for 8x tweak
+	bic		ip, ip, #0xf		// align sp to 16 bytes
+	mov		sp, ip
+
+99:	bl		__xts_prepare8
+
+	mov		bskey, r2
+	mov		rounds, r3
+	bl		\do8
+
+	__adr		ip, 0f
+	and		lr, r6, #7
+	cmp		r6, #8
+	sub		ip, ip, lr, lsl #2
+	mov		r4, sp
+	bxlt		ip			// computed goto if blocks < 8
+
+	vld1.8		{q8}, [r4, :128]!
+	vld1.8		{q9}, [r4, :128]!
+	vld1.8		{q10}, [r4, :128]!
+	vld1.8		{q11}, [r4, :128]!
+	vld1.8		{q12}, [r4, :128]!
+	vld1.8		{q13}, [r4, :128]!
+	vld1.8		{q14}, [r4, :128]!
+	vld1.8		{q15}, [r4, :128]
+
+0:	__adr		ip, 1f
+	sub		ip, ip, lr, lsl #3
+	bxlt		ip			// computed goto if blocks < 8
+
+	veor		\o0, \o0, q8
+	vst1.8		{\o0}, [r0]!
+	veor		\o1, \o1, q9
+	vst1.8		{\o1}, [r0]!
+	veor		\o2, \o2, q10
+	vst1.8		{\o2}, [r0]!
+	veor		\o3, \o3, q11
+	vst1.8		{\o3}, [r0]!
+	veor		\o4, \o4, q12
+	vst1.8		{\o4}, [r0]!
+	veor		\o5, \o5, q13
+	vst1.8		{\o5}, [r0]!
+	veor		\o6, \o6, q14
+	vst1.8		{\o6}, [r0]!
+	veor		\o7, \o7, q15
+	vst1.8		{\o7}, [r0]!
+
+1:	subs		r6, r6, #8
+	bgt		99b
+
+	mov		sp, r5
+	pop		{r4-r8, pc}
+	.endm
+
+ENTRY(aesbs_xts_encrypt)
+	__xts_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
+ENDPROC(aesbs_xts_encrypt)
+
+ENTRY(aesbs_xts_decrypt)
+	__xts_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
+ENDPROC(aesbs_xts_decrypt)
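
On next_tweak and .Lxts_mul_x above: the macro multiplies the 128-bit XTS tweak by x in GF(2^128), reducing with 0x87 (from x^128 = x^7 + x^2 + x + 1) whenever the top bit carries out. The vshr.s64 #63 builds an all-ones mask per 64-bit lane, and the vext.8 #8 swaps the lanes so each half's carry is folded into the other half. A scalar C sketch of one doubling step, assuming little-endian 64-bit halves:

	#include <stdint.h>

	/* Multiply an XTS tweak (t[0] = low half, t[1] = high half)
	 * by x in GF(2^128), as next_tweak does per block. */
	static void xts_next_tweak(uint64_t t[2])
	{
		/* all-ones if the MSB of the high half is set */
		uint64_t carry = (uint64_t)((int64_t)t[1] >> 63);

		t[1] = (t[1] << 1) | (t[0] >> 63);
		t[0] = (t[0] << 1) ^ (carry & 0x87);
	}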

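Similarly, the swapmove_2x/bitslice pair near the top of this file is the standard Kaesper-Schwabe transposition: three swapmove passes at distances 1, 2 and 4, with masks 0x55…, 0x33… and 0x0f…, turn eight byte-oriented AES states into eight bit planes for the S-box circuit. A scalar sketch of one swapmove (illustrative only):

	#include <stdint.h>

	/* Swap the bits of a and b selected by mask, at distance n.
	 * This is the delta-swap underlying swapmove_2x. */
	static void swapmove(uint64_t *a, uint64_t *b, int n, uint64_t mask)
	{
		uint64_t t = ((*b >> n) ^ *a) & mask;

		*a ^= t;
		*b ^= t << n;
	}
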
+ 406 - 0
arch/arm/crypto/aes-neonbs-glue.c

@@ -0,0 +1,406 @@
+/*
+ * Bit sliced AES using NEON instructions
+ *
+ * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <crypto/aes.h>
+#include <crypto/cbc.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/xts.h>
+#include <linux/module.h>
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+MODULE_ALIAS_CRYPTO("ecb(aes)");
+MODULE_ALIAS_CRYPTO("cbc(aes)");
+MODULE_ALIAS_CRYPTO("ctr(aes)");
+MODULE_ALIAS_CRYPTO("xts(aes)");
+
+asmlinkage void aesbs_convert_key(u8 out[], u32 const rk[], int rounds);
+
+asmlinkage void aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks);
+asmlinkage void aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks);
+
+asmlinkage void aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[]);
+
+asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 ctr[], u8 final[]);
+
+asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[]);
+asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[]);
+
+asmlinkage void __aes_arm_encrypt(const u32 rk[], int rounds, const u8 in[],
+				  u8 out[]);
+
+struct aesbs_ctx {
+	int	rounds;
+	u8	rk[13 * (8 * AES_BLOCK_SIZE) + 32] __aligned(AES_BLOCK_SIZE);
+};
+
+struct aesbs_cbc_ctx {
+	struct aesbs_ctx	key;
+	u32			enc[AES_MAX_KEYLENGTH_U32];
+};
+
+struct aesbs_xts_ctx {
+	struct aesbs_ctx	key;
+	u32			twkey[AES_MAX_KEYLENGTH_U32];
+};
+
+static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+			unsigned int key_len)
+{
+	struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct crypto_aes_ctx rk;
+	int err;
+
+	err = crypto_aes_expand_key(&rk, in_key, key_len);
+	if (err)
+		return err;
+
+	ctx->rounds = 6 + key_len / 4;
+
+	kernel_neon_begin();
+	aesbs_convert_key(ctx->rk, rk.key_enc, ctx->rounds);
+	kernel_neon_end();
+
+	return 0;
+}
+
+static int __ecb_crypt(struct skcipher_request *req,
+		       void (*fn)(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks))
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	kernel_neon_begin();
+	while (walk.nbytes >= AES_BLOCK_SIZE) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+		if (walk.nbytes < walk.total)
+			blocks = round_down(blocks,
+					    walk.stride / AES_BLOCK_SIZE);
+
+		fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->rk,
+		   ctx->rounds, blocks);
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes - blocks * AES_BLOCK_SIZE);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+	return __ecb_crypt(req, aesbs_ecb_encrypt);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+	return __ecb_crypt(req, aesbs_ecb_decrypt);
+}
+
+static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+			    unsigned int key_len)
+{
+	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct crypto_aes_ctx rk;
+	int err;
+
+	err = crypto_aes_expand_key(&rk, in_key, key_len);
+	if (err)
+		return err;
+
+	ctx->key.rounds = 6 + key_len / 4;
+
+	memcpy(ctx->enc, rk.key_enc, sizeof(ctx->enc));
+
+	kernel_neon_begin();
+	aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds);
+	kernel_neon_end();
+
+	return 0;
+}
+
+static void cbc_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
+{
+	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+	__aes_arm_encrypt(ctx->enc, ctx->key.rounds, src, dst);
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+	return crypto_cbc_encrypt_walk(req, cbc_encrypt_one);
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	kernel_neon_begin();
+	while (walk.nbytes >= AES_BLOCK_SIZE) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+		if (walk.nbytes < walk.total)
+			blocks = round_down(blocks,
+					    walk.stride / AES_BLOCK_SIZE);
+
+		aesbs_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				  ctx->key.rk, ctx->key.rounds, blocks,
+				  walk.iv);
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes - blocks * AES_BLOCK_SIZE);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static int ctr_encrypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	u8 buf[AES_BLOCK_SIZE];
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	kernel_neon_begin();
+	while (walk.nbytes > 0) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+		u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL;
+
+		if (walk.nbytes < walk.total) {
+			blocks = round_down(blocks,
+					    walk.stride / AES_BLOCK_SIZE);
+			final = NULL;
+		}
+
+		aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				  ctx->rk, ctx->rounds, blocks, walk.iv, final);
+
+		if (final) {
+			u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
+			u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
+
+			if (dst != src)
+				memcpy(dst, src, walk.total % AES_BLOCK_SIZE);
+			crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE);
+
+			err = skcipher_walk_done(&walk, 0);
+			break;
+		}
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes - blocks * AES_BLOCK_SIZE);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+			    unsigned int key_len)
+{
+	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct crypto_aes_ctx rk;
+	int err;
+
+	err = xts_verify_key(tfm, in_key, key_len);
+	if (err)
+		return err;
+
+	key_len /= 2;
+	err = crypto_aes_expand_key(&rk, in_key + key_len, key_len);
+	if (err)
+		return err;
+
+	memcpy(ctx->twkey, rk.key_enc, sizeof(ctx->twkey));
+
+	return aesbs_setkey(tfm, in_key, key_len);
+}
+
+static int __xts_crypt(struct skcipher_request *req,
+		       void (*fn)(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[]))
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	__aes_arm_encrypt(ctx->twkey, ctx->key.rounds, walk.iv, walk.iv);
+
+	kernel_neon_begin();
+	while (walk.nbytes >= AES_BLOCK_SIZE) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+		if (walk.nbytes < walk.total)
+			blocks = round_down(blocks,
+					    walk.stride / AES_BLOCK_SIZE);
+
+		fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->key.rk,
+		   ctx->key.rounds, blocks, walk.iv);
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes - blocks * AES_BLOCK_SIZE);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+	return __xts_crypt(req, aesbs_xts_encrypt);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+	return __xts_crypt(req, aesbs_xts_decrypt);
+}
+
+static struct skcipher_alg aes_algs[] = { {
+	.base.cra_name		= "__ecb(aes)",
+	.base.cra_driver_name	= "__ecb-aes-neonbs",
+	.base.cra_priority	= 250,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct aesbs_ctx),
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+
+	.min_keysize		= AES_MIN_KEY_SIZE,
+	.max_keysize		= AES_MAX_KEY_SIZE,
+	.walksize		= 8 * AES_BLOCK_SIZE,
+	.setkey			= aesbs_setkey,
+	.encrypt		= ecb_encrypt,
+	.decrypt		= ecb_decrypt,
+}, {
+	.base.cra_name		= "__cbc(aes)",
+	.base.cra_driver_name	= "__cbc-aes-neonbs",
+	.base.cra_priority	= 250,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct aesbs_cbc_ctx),
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+
+	.min_keysize		= AES_MIN_KEY_SIZE,
+	.max_keysize		= AES_MAX_KEY_SIZE,
+	.walksize		= 8 * AES_BLOCK_SIZE,
+	.ivsize			= AES_BLOCK_SIZE,
+	.setkey			= aesbs_cbc_setkey,
+	.encrypt		= cbc_encrypt,
+	.decrypt		= cbc_decrypt,
+}, {
+	.base.cra_name		= "__ctr(aes)",
+	.base.cra_driver_name	= "__ctr-aes-neonbs",
+	.base.cra_priority	= 250,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct aesbs_ctx),
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+
+	.min_keysize		= AES_MIN_KEY_SIZE,
+	.max_keysize		= AES_MAX_KEY_SIZE,
+	.chunksize		= AES_BLOCK_SIZE,
+	.walksize		= 8 * AES_BLOCK_SIZE,
+	.ivsize			= AES_BLOCK_SIZE,
+	.setkey			= aesbs_setkey,
+	.encrypt		= ctr_encrypt,
+	.decrypt		= ctr_encrypt,
+}, {
+	.base.cra_name		= "__xts(aes)",
+	.base.cra_driver_name	= "__xts-aes-neonbs",
+	.base.cra_priority	= 250,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct aesbs_xts_ctx),
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+
+	.min_keysize		= 2 * AES_MIN_KEY_SIZE,
+	.max_keysize		= 2 * AES_MAX_KEY_SIZE,
+	.walksize		= 8 * AES_BLOCK_SIZE,
+	.ivsize			= AES_BLOCK_SIZE,
+	.setkey			= aesbs_xts_setkey,
+	.encrypt		= xts_encrypt,
+	.decrypt		= xts_decrypt,
+} };
+
+static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
+
+static void aes_exit(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++)
+		if (aes_simd_algs[i])
+			simd_skcipher_free(aes_simd_algs[i]);
+
+	crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
+}
+
+static int __init aes_init(void)
+{
+	struct simd_skcipher_alg *simd;
+	const char *basename;
+	const char *algname;
+	const char *drvname;
+	int err;
+	int i;
+
+	if (!(elf_hwcap & HWCAP_NEON))
+		return -ENODEV;
+
+	err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
+	if (err)
+		return err;
+
+	for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
+		if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
+			continue;
+
+		algname = aes_algs[i].base.cra_name + 2;
+		drvname = aes_algs[i].base.cra_driver_name + 2;
+		basename = aes_algs[i].base.cra_driver_name;
+		simd = simd_skcipher_create_compat(algname, drvname, basename);
+		err = PTR_ERR(simd);
+		if (IS_ERR(simd))
+			goto unregister_simds;
+
+		aes_simd_algs[i] = simd;
+	}
+	return 0;
+
+unregister_simds:
+	aes_exit();
+	return err;
+}
+
+module_init(aes_init);
+module_exit(aes_exit);
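
For completeness: the __-prefixed algorithms above are CRYPTO_ALG_INTERNAL and only reachable through the simd wrappers created in aes_init(), under the plain names ("ecb(aes)", "cbc(aes)", "ctr(aes)", "xts(aes)"). A hedged sketch of a caller — demo_ctr is hypothetical, and it leans on the generic crypto_wait_req() completion helper from the core API (added separately from this series) rather than anything introduced by this patch:

	#include <crypto/skcipher.h>
	#include <linux/crypto.h>
	#include <linux/err.h>
	#include <linux/scatterlist.h>

	/* Hypothetical: encrypt len bytes in place with ctr(aes),
	 * waiting synchronously for the (possibly async) simd alg. */
	static int demo_ctr(const u8 *key, unsigned int keylen,
			    u8 *buf, unsigned int len, u8 iv[16])
	{
		struct crypto_skcipher *tfm;
		struct skcipher_request *req;
		struct scatterlist sg;
		DECLARE_CRYPTO_WAIT(wait);
		int err;

		tfm = crypto_alloc_skcipher("ctr(aes)", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		err = crypto_skcipher_setkey(tfm, key, keylen);
		if (err)
			goto out_tfm;

		req = skcipher_request_alloc(tfm, GFP_KERNEL);
		if (!req) {
			err = -ENOMEM;
			goto out_tfm;
		}

		sg_init_one(&sg, buf, len);
		skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
					      crypto_req_done, &wait);
		skcipher_request_set_crypt(req, &sg, &sg, len, iv);
		err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);

		skcipher_request_free(req);
	out_tfm:
		crypto_free_skcipher(tfm);
		return err;
	}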

+ 0 - 98
arch/arm/crypto/aes_glue.c

@@ -1,98 +0,0 @@
-/*
- * Glue Code for the asm optimized version of the AES Cipher Algorithm
- */
-
-#include <linux/module.h>
-#include <linux/crypto.h>
-#include <crypto/aes.h>
-
-#include "aes_glue.h"
-
-EXPORT_SYMBOL(AES_encrypt);
-EXPORT_SYMBOL(AES_decrypt);
-EXPORT_SYMBOL(private_AES_set_encrypt_key);
-EXPORT_SYMBOL(private_AES_set_decrypt_key);
-
-static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
-	struct AES_CTX *ctx = crypto_tfm_ctx(tfm);
-	AES_encrypt(src, dst, &ctx->enc_key);
-}
-
-static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
-	struct AES_CTX *ctx = crypto_tfm_ctx(tfm);
-	AES_decrypt(src, dst, &ctx->dec_key);
-}
-
-static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
-		unsigned int key_len)
-{
-	struct AES_CTX *ctx = crypto_tfm_ctx(tfm);
-
-	switch (key_len) {
-	case AES_KEYSIZE_128:
-		key_len = 128;
-		break;
-	case AES_KEYSIZE_192:
-		key_len = 192;
-		break;
-	case AES_KEYSIZE_256:
-		key_len = 256;
-		break;
-	default:
-		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
-		return -EINVAL;
-	}
-
-	if (private_AES_set_encrypt_key(in_key, key_len, &ctx->enc_key) == -1) {
-		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
-		return -EINVAL;
-	}
-	/* private_AES_set_decrypt_key expects an encryption key as input */
-	ctx->dec_key = ctx->enc_key;
-	if (private_AES_set_decrypt_key(in_key, key_len, &ctx->dec_key) == -1) {
-		tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static struct crypto_alg aes_alg = {
-	.cra_name		= "aes",
-	.cra_driver_name	= "aes-asm",
-	.cra_priority		= 200,
-	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
-	.cra_blocksize		= AES_BLOCK_SIZE,
-	.cra_ctxsize		= sizeof(struct AES_CTX),
-	.cra_module		= THIS_MODULE,
-	.cra_list		= LIST_HEAD_INIT(aes_alg.cra_list),
-	.cra_u	= {
-		.cipher	= {
-			.cia_min_keysize	= AES_MIN_KEY_SIZE,
-			.cia_max_keysize	= AES_MAX_KEY_SIZE,
-			.cia_setkey		= aes_set_key,
-			.cia_encrypt		= aes_encrypt,
-			.cia_decrypt		= aes_decrypt
-		}
-	}
-};
-
-static int __init aes_init(void)
-{
-	return crypto_register_alg(&aes_alg);
-}
-
-static void __exit aes_fini(void)
-{
-	crypto_unregister_alg(&aes_alg);
-}
-
-module_init(aes_init);
-module_exit(aes_fini);
-
-MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm (ASM)");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_CRYPTO("aes");
-MODULE_ALIAS_CRYPTO("aes-asm");
-MODULE_AUTHOR("David McCullough <ucdevel@gmail.com>");

+ 0 - 19
arch/arm/crypto/aes_glue.h

@@ -1,19 +0,0 @@
-
-#define AES_MAXNR 14
-
-struct AES_KEY {
-	unsigned int rd_key[4 * (AES_MAXNR + 1)];
-	int rounds;
-};
-
-struct AES_CTX {
-	struct AES_KEY enc_key;
-	struct AES_KEY dec_key;
-};
-
-asmlinkage void AES_encrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
-asmlinkage void AES_decrypt(const u8 *in, u8 *out, struct AES_KEY *ctx);
-asmlinkage int private_AES_set_decrypt_key(const unsigned char *userKey,
-					   const int bits, struct AES_KEY *key);
-asmlinkage int private_AES_set_encrypt_key(const unsigned char *userKey,
-					   const int bits, struct AES_KEY *key);

+ 0 - 2548
arch/arm/crypto/aesbs-core.S_shipped

@@ -1,2548 +0,0 @@
-
-@ ====================================================================
-@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-@ project. The module is, however, dual licensed under OpenSSL and
-@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see http://www.openssl.org/~appro/cryptogams/.
-@
-@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
-@ <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
-@ granted.
-@ ====================================================================
-
-@ Bit-sliced AES for ARM NEON
-@
-@ February 2012.
-@
-@ This implementation is direct adaptation of bsaes-x86_64 module for
-@ ARM NEON. Except that this module is endian-neutral [in sense that
-@ it can be compiled for either endianness] by courtesy of vld1.8's
-@ neutrality. Initial version doesn't implement interface to OpenSSL,
-@ only low-level primitives and unsupported entry points, just enough
-@ to collect performance results, which for Cortex-A8 core are:
-@
-@ encrypt	19.5 cycles per byte processed with 128-bit key
-@ decrypt	22.1 cycles per byte processed with 128-bit key
-@ key conv.	440  cycles per 128-bit key/0.18 of 8x block
-@
-@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
-@ which is [much] worse than anticipated (for further details see
-@ http://www.openssl.org/~appro/Snapdragon-S4.html).
-@
-@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
-@ manages in 20.0 cycles].
-@
-@ When comparing to x86_64 results keep in mind that NEON unit is
-@ [mostly] single-issue and thus can't [fully] benefit from
-@ instruction-level parallelism. And when comparing to aes-armv4
-@ results keep in mind key schedule conversion overhead (see
-@ bsaes-x86_64.pl for further details)...
-@
-@						<appro@openssl.org>
-
-@ April-August 2013
-@
-@ Add CBC, CTR and XTS subroutines, adapt for kernel use.
-@
-@					<ard.biesheuvel@linaro.org>
-
-#ifndef __KERNEL__
-# include "arm_arch.h"
-
-# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
-# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
-# define VFP_ABI_FRAME	0x40
-#else
-# define VFP_ABI_PUSH
-# define VFP_ABI_POP
-# define VFP_ABI_FRAME	0
-# define BSAES_ASM_EXTENDED_KEY
-# define XTS_CHAIN_TWEAK
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-#endif
-
-#ifdef __thumb__
-# define adrl adr
-#endif
-
-#if __ARM_MAX_ARCH__>=7
-.arch	armv7-a
-.fpu	neon
-
-.text
-.syntax	unified 	@ ARMv7-capable assembler is expected to handle this
-#ifdef __thumb2__
-.thumb
-#else
-.code   32
-#endif
-
-.type	_bsaes_decrypt8,%function
-.align	4
-_bsaes_decrypt8:
-	adr	r6,_bsaes_decrypt8
-	vldmia	r4!, {q9}		@ round 0 key
-	add	r6,r6,#.LM0ISR-_bsaes_decrypt8
-
-	vldmia	r6!, {q8}		@ .LM0ISR
-	veor	q10, q0, q9	@ xor with round0 key
-	veor	q11, q1, q9
-	 vtbl.8	d0, {q10}, d16
-	 vtbl.8	d1, {q10}, d17
-	veor	q12, q2, q9
-	 vtbl.8	d2, {q11}, d16
-	 vtbl.8	d3, {q11}, d17
-	veor	q13, q3, q9
-	 vtbl.8	d4, {q12}, d16
-	 vtbl.8	d5, {q12}, d17
-	veor	q14, q4, q9
-	 vtbl.8	d6, {q13}, d16
-	 vtbl.8	d7, {q13}, d17
-	veor	q15, q5, q9
-	 vtbl.8	d8, {q14}, d16
-	 vtbl.8	d9, {q14}, d17
-	veor	q10, q6, q9
-	 vtbl.8	d10, {q15}, d16
-	 vtbl.8	d11, {q15}, d17
-	veor	q11, q7, q9
-	 vtbl.8	d12, {q10}, d16
-	 vtbl.8	d13, {q10}, d17
-	 vtbl.8	d14, {q11}, d16
-	 vtbl.8	d15, {q11}, d17
-	vmov.i8	q8,#0x55			@ compose .LBS0
-	vmov.i8	q9,#0x33			@ compose .LBS1
-	vshr.u64	q10, q6, #1
-	 vshr.u64	q11, q4, #1
-	veor		q10, q10, q7
-	 veor		q11, q11, q5
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q7, q7, q10
-	vshl.u64	q10, q10, #1
-	 veor		q5, q5, q11
-	 vshl.u64	q11, q11, #1
-	veor		q6, q6, q10
-	 veor		q4, q4, q11
-	vshr.u64	q10, q2, #1
-	 vshr.u64	q11, q0, #1
-	veor		q10, q10, q3
-	 veor		q11, q11, q1
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q3, q3, q10
-	vshl.u64	q10, q10, #1
-	 veor		q1, q1, q11
-	 vshl.u64	q11, q11, #1
-	veor		q2, q2, q10
-	 veor		q0, q0, q11
-	vmov.i8	q8,#0x0f			@ compose .LBS2
-	vshr.u64	q10, q5, #2
-	 vshr.u64	q11, q4, #2
-	veor		q10, q10, q7
-	 veor		q11, q11, q6
-	vand		q10, q10, q9
-	 vand		q11, q11, q9
-	veor		q7, q7, q10
-	vshl.u64	q10, q10, #2
-	 veor		q6, q6, q11
-	 vshl.u64	q11, q11, #2
-	veor		q5, q5, q10
-	 veor		q4, q4, q11
-	vshr.u64	q10, q1, #2
-	 vshr.u64	q11, q0, #2
-	veor		q10, q10, q3
-	 veor		q11, q11, q2
-	vand		q10, q10, q9
-	 vand		q11, q11, q9
-	veor		q3, q3, q10
-	vshl.u64	q10, q10, #2
-	 veor		q2, q2, q11
-	 vshl.u64	q11, q11, #2
-	veor		q1, q1, q10
-	 veor		q0, q0, q11
-	vshr.u64	q10, q3, #4
-	 vshr.u64	q11, q2, #4
-	veor		q10, q10, q7
-	 veor		q11, q11, q6
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q7, q7, q10
-	vshl.u64	q10, q10, #4
-	 veor		q6, q6, q11
-	 vshl.u64	q11, q11, #4
-	veor		q3, q3, q10
-	 veor		q2, q2, q11
-	vshr.u64	q10, q1, #4
-	 vshr.u64	q11, q0, #4
-	veor		q10, q10, q5
-	 veor		q11, q11, q4
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q5, q5, q10
-	vshl.u64	q10, q10, #4
-	 veor		q4, q4, q11
-	 vshl.u64	q11, q11, #4
-	veor		q1, q1, q10
-	 veor		q0, q0, q11
-	sub	r5,r5,#1
-	b	.Ldec_sbox
-.align	4
-.Ldec_loop:
-	vldmia	r4!, {q8-q11}
-	veor	q8, q8, q0
-	veor	q9, q9, q1
-	vtbl.8	d0, {q8}, d24
-	vtbl.8	d1, {q8}, d25
-	vldmia	r4!, {q8}
-	veor	q10, q10, q2
-	vtbl.8	d2, {q9}, d24
-	vtbl.8	d3, {q9}, d25
-	vldmia	r4!, {q9}
-	veor	q11, q11, q3
-	vtbl.8	d4, {q10}, d24
-	vtbl.8	d5, {q10}, d25
-	vldmia	r4!, {q10}
-	vtbl.8	d6, {q11}, d24
-	vtbl.8	d7, {q11}, d25
-	vldmia	r4!, {q11}
-	veor	q8, q8, q4
-	veor	q9, q9, q5
-	vtbl.8	d8, {q8}, d24
-	vtbl.8	d9, {q8}, d25
-	veor	q10, q10, q6
-	vtbl.8	d10, {q9}, d24
-	vtbl.8	d11, {q9}, d25
-	veor	q11, q11, q7
-	vtbl.8	d12, {q10}, d24
-	vtbl.8	d13, {q10}, d25
-	vtbl.8	d14, {q11}, d24
-	vtbl.8	d15, {q11}, d25
-.Ldec_sbox:
-	 veor	q1, q1, q4
-	veor	q3, q3, q4
-
-	veor	q4, q4, q7
-	 veor	q1, q1, q6
-	veor	q2, q2, q7
-	veor	q6, q6, q4
-
-	veor	q0, q0, q1
-	veor	q2, q2, q5
-	 veor	q7, q7, q6
-	veor	q3, q3, q0
-	veor	q5, q5, q0
-	veor	q1, q1, q3
-	veor	q11, q3, q0
-	veor	q10, q7, q4
-	veor	q9, q1, q6
-	veor	q13, q4, q0
-	 vmov	q8, q10
-	veor	q12, q5, q2
-
-	vorr	q10, q10, q9
-	veor	q15, q11, q8
-	vand	q14, q11, q12
-	vorr	q11, q11, q12
-	veor	q12, q12, q9
-	vand	q8, q8, q9
-	veor	q9, q6, q2
-	vand	q15, q15, q12
-	vand	q13, q13, q9
-	veor	q9, q3, q7
-	veor	q12, q1, q5
-	veor	q11, q11, q13
-	veor	q10, q10, q13
-	vand	q13, q9, q12
-	vorr	q9, q9, q12
-	veor	q11, q11, q15
-	veor	q8, q8, q13
-	veor	q10, q10, q14
-	veor	q9, q9, q15
-	veor	q8, q8, q14
-	vand	q12, q4, q6
-	veor	q9, q9, q14
-	vand	q13, q0, q2
-	vand	q14, q7, q1
-	vorr	q15, q3, q5
-	veor	q11, q11, q12
-	veor	q9, q9, q14
-	veor	q8, q8, q15
-	veor	q10, q10, q13
-
-	@ Inv_GF16 	0, 	1, 	2, 	3, s0, s1, s2, s3
-
-	@ new smaller inversion
-
-	vand	q14, q11, q9
-	vmov	q12, q8
-
-	veor	q13, q10, q14
-	veor	q15, q8, q14
-	veor	q14, q8, q14	@ q14=q15
-
-	vbsl	q13, q9, q8
-	vbsl	q15, q11, q10
-	veor	q11, q11, q10
-
-	vbsl	q12, q13, q14
-	vbsl	q8, q14, q13
-
-	vand	q14, q12, q15
-	veor	q9, q9, q8
-
-	veor	q14, q14, q11
-	veor	q12, q5, q2
-	veor	q8, q1, q6
-	veor 	q10, q15, q14
-	vand	q10, q10, q5
-	veor	q5, q5, q1
-	vand	q11, q1, q15
-	vand	q5, q5, q14
-	veor	q1, q11, q10
-	veor	q5, q5, q11
-	veor	q15, q15, q13
-	veor	q14, q14, q9
-	veor	q11, q15, q14
-	 veor 	q10, q13, q9
-	vand	q11, q11, q12
-	 vand	q10, q10, q2
-	veor	q12, q12, q8
-	 veor	q2, q2, q6
-	vand	q8, q8, q15
-	 vand	q6, q6, q13
-	vand	q12, q12, q14
-	 vand	q2, q2, q9
-	veor	q8, q8, q12
-	 veor	q2, q2, q6
-	veor	q12, q12, q11
-	 veor	q6, q6, q10
-	veor	q5, q5, q12
-	veor	q2, q2, q12
-	veor	q1, q1, q8
-	veor	q6, q6, q8
-
-	veor	q12, q3, q0
-	veor	q8, q7, q4
-	veor	q11, q15, q14
-	 veor 	q10, q13, q9
-	vand	q11, q11, q12
-	 vand	q10, q10, q0
-	veor	q12, q12, q8
-	 veor	q0, q0, q4
-	vand	q8, q8, q15
-	 vand	q4, q4, q13
-	vand	q12, q12, q14
-	 vand	q0, q0, q9
-	veor	q8, q8, q12
-	 veor	q0, q0, q4
-	veor	q12, q12, q11
-	 veor	q4, q4, q10
-	veor	q15, q15, q13
-	veor	q14, q14, q9
-	veor 	q10, q15, q14
-	vand	q10, q10, q3
-	veor	q3, q3, q7
-	vand	q11, q7, q15
-	vand	q3, q3, q14
-	veor	q7, q11, q10
-	veor	q3, q3, q11
-	veor	q3, q3, q12
-	veor	q0, q0, q12
-	veor	q7, q7, q8
-	veor	q4, q4, q8
-	veor	q1, q1, q7
-	veor	q6, q6, q5
-
-	veor	q4, q4, q1
-	veor	q2, q2, q7
-	veor	q5, q5, q7
-	veor	q4, q4, q2
-	 veor 	q7, q7, q0
-	veor	q4, q4, q5
-	 veor	q3, q3, q6
-	 veor	q6, q6, q1
-	veor	q3, q3, q4
-
-	veor	q4, q4, q0
-	veor	q7, q7, q3
-	subs	r5,r5,#1
-	bcc	.Ldec_done
-	@ multiplication by 0x05-0x00-0x04-0x00
-	vext.8	q8, q0, q0, #8
-	vext.8	q14, q3, q3, #8
-	vext.8	q15, q5, q5, #8
-	veor	q8, q8, q0
-	vext.8	q9, q1, q1, #8
-	veor	q14, q14, q3
-	vext.8	q10, q6, q6, #8
-	veor	q15, q15, q5
-	vext.8	q11, q4, q4, #8
-	veor	q9, q9, q1
-	vext.8	q12, q2, q2, #8
-	veor	q10, q10, q6
-	vext.8	q13, q7, q7, #8
-	veor	q11, q11, q4
-	veor	q12, q12, q2
-	veor	q13, q13, q7
-
-	 veor	q0, q0, q14
-	 veor	q1, q1, q14
-	 veor	q6, q6, q8
-	 veor	q2, q2, q10
-	 veor	q4, q4, q9
-	 veor	q1, q1, q15
-	 veor	q6, q6, q15
-	 veor	q2, q2, q14
-	 veor	q7, q7, q11
-	 veor	q4, q4, q14
-	 veor	q3, q3, q12
-	 veor	q2, q2, q15
-	 veor	q7, q7, q15
-	 veor	q5, q5, q13
-	vext.8	q8, q0, q0, #12	@ x0 <<< 32
-	vext.8	q9, q1, q1, #12
-	 veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
-	vext.8	q10, q6, q6, #12
-	 veor	q1, q1, q9
-	vext.8	q11, q4, q4, #12
-	 veor	q6, q6, q10
-	vext.8	q12, q2, q2, #12
-	 veor	q4, q4, q11
-	vext.8	q13, q7, q7, #12
-	 veor	q2, q2, q12
-	vext.8	q14, q3, q3, #12
-	 veor	q7, q7, q13
-	vext.8	q15, q5, q5, #12
-	 veor	q3, q3, q14
-
-	veor	q9, q9, q0
-	 veor	q5, q5, q15
-	 vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64)
-	veor	q10, q10, q1
-	veor	q8, q8, q5
-	veor	q9, q9, q5
-	 vext.8	q1, q1, q1, #8
-	veor	q13, q13, q2
-	 veor	q0, q0, q8
-	veor	q14, q14, q7
-	 veor	q1, q1, q9
-	 vext.8	q8, q2, q2, #8
-	veor	q12, q12, q4
-	 vext.8	q9, q7, q7, #8
-	veor	q15, q15, q3
-	 vext.8	q2, q4, q4, #8
-	veor	q11, q11, q6
-	 vext.8	q7, q5, q5, #8
-	veor	q12, q12, q5
-	 vext.8	q4, q3, q3, #8
-	veor	q11, q11, q5
-	 vext.8	q3, q6, q6, #8
-	veor	q5, q9, q13
-	veor	q11, q11, q2
-	veor	q7, q7, q15
-	veor	q6, q4, q14
-	veor	q4, q8, q12
-	veor	q2, q3, q10
-	vmov	q3, q11
-	 @ vmov	q5, q9
-	vldmia	r6, {q12}		@ .LISR
-	ite	eq				@ Thumb2 thing, sanity check in ARM
-	addeq	r6,r6,#0x10
-	bne	.Ldec_loop
-	vldmia	r6, {q12}		@ .LISRM0
-	b	.Ldec_loop
-.align	4
-.Ldec_done:
-	vmov.i8	q8,#0x55			@ compose .LBS0
-	vmov.i8	q9,#0x33			@ compose .LBS1
-	vshr.u64	q10, q3, #1
-	 vshr.u64	q11, q2, #1
-	veor		q10, q10, q5
-	 veor		q11, q11, q7
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q5, q5, q10
-	vshl.u64	q10, q10, #1
-	 veor		q7, q7, q11
-	 vshl.u64	q11, q11, #1
-	veor		q3, q3, q10
-	 veor		q2, q2, q11
-	vshr.u64	q10, q6, #1
-	 vshr.u64	q11, q0, #1
-	veor		q10, q10, q4
-	 veor		q11, q11, q1
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q4, q4, q10
-	vshl.u64	q10, q10, #1
-	 veor		q1, q1, q11
-	 vshl.u64	q11, q11, #1
-	veor		q6, q6, q10
-	 veor		q0, q0, q11
-	vmov.i8	q8,#0x0f			@ compose .LBS2
-	vshr.u64	q10, q7, #2
-	 vshr.u64	q11, q2, #2
-	veor		q10, q10, q5
-	 veor		q11, q11, q3
-	vand		q10, q10, q9
-	 vand		q11, q11, q9
-	veor		q5, q5, q10
-	vshl.u64	q10, q10, #2
-	 veor		q3, q3, q11
-	 vshl.u64	q11, q11, #2
-	veor		q7, q7, q10
-	 veor		q2, q2, q11
-	vshr.u64	q10, q1, #2
-	 vshr.u64	q11, q0, #2
-	veor		q10, q10, q4
-	 veor		q11, q11, q6
-	vand		q10, q10, q9
-	 vand		q11, q11, q9
-	veor		q4, q4, q10
-	vshl.u64	q10, q10, #2
-	 veor		q6, q6, q11
-	 vshl.u64	q11, q11, #2
-	veor		q1, q1, q10
-	 veor		q0, q0, q11
-	vshr.u64	q10, q4, #4
-	 vshr.u64	q11, q6, #4
-	veor		q10, q10, q5
-	 veor		q11, q11, q3
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q5, q5, q10
-	vshl.u64	q10, q10, #4
-	 veor		q3, q3, q11
-	 vshl.u64	q11, q11, #4
-	veor		q4, q4, q10
-	 veor		q6, q6, q11
-	vshr.u64	q10, q1, #4
-	 vshr.u64	q11, q0, #4
-	veor		q10, q10, q7
-	 veor		q11, q11, q2
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q7, q7, q10
-	vshl.u64	q10, q10, #4
-	 veor		q2, q2, q11
-	 vshl.u64	q11, q11, #4
-	veor		q1, q1, q10
-	 veor		q0, q0, q11
-	vldmia	r4, {q8}			@ last round key
-	veor	q6, q6, q8
-	veor	q4, q4, q8
-	veor	q2, q2, q8
-	veor	q7, q7, q8
-	veor	q3, q3, q8
-	veor	q5, q5, q8
-	veor	q0, q0, q8
-	veor	q1, q1, q8
-	bx	lr
-.size	_bsaes_decrypt8,.-_bsaes_decrypt8
-
-.type	_bsaes_const,%object
-.align	6
-_bsaes_const:
-.LM0ISR:	@ InvShiftRows constants
-	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
-.LISR:
-	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
-.LISRM0:
-	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
-.LM0SR:		@ ShiftRows constants
-	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
-.LSR:
-	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
-.LSRM0:
-	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
-.LM0:
-	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
-.LREVM0SR:
-	.quad	0x090d01050c000408, 0x03070b0f060a0e02
-.asciz	"Bit-sliced AES for NEON, CRYPTOGAMS by <appro@openssl.org>"
-.align	6
-.size	_bsaes_const,.-_bsaes_const
-
-.type	_bsaes_encrypt8,%function
-.align	4
-_bsaes_encrypt8:
-	adr	r6,_bsaes_encrypt8
-	vldmia	r4!, {q9}		@ round 0 key
-	sub	r6,r6,#_bsaes_encrypt8-.LM0SR
-
-	vldmia	r6!, {q8}		@ .LM0SR
-_bsaes_encrypt8_alt:
-	veor	q10, q0, q9	@ xor with round0 key
-	veor	q11, q1, q9
-	 vtbl.8	d0, {q10}, d16
-	 vtbl.8	d1, {q10}, d17
-	veor	q12, q2, q9
-	 vtbl.8	d2, {q11}, d16
-	 vtbl.8	d3, {q11}, d17
-	veor	q13, q3, q9
-	 vtbl.8	d4, {q12}, d16
-	 vtbl.8	d5, {q12}, d17
-	veor	q14, q4, q9
-	 vtbl.8	d6, {q13}, d16
-	 vtbl.8	d7, {q13}, d17
-	veor	q15, q5, q9
-	 vtbl.8	d8, {q14}, d16
-	 vtbl.8	d9, {q14}, d17
-	veor	q10, q6, q9
-	 vtbl.8	d10, {q15}, d16
-	 vtbl.8	d11, {q15}, d17
-	veor	q11, q7, q9
-	 vtbl.8	d12, {q10}, d16
-	 vtbl.8	d13, {q10}, d17
-	 vtbl.8	d14, {q11}, d16
-	 vtbl.8	d15, {q11}, d17
-_bsaes_encrypt8_bitslice:
-	vmov.i8	q8,#0x55			@ compose .LBS0
-	vmov.i8	q9,#0x33			@ compose .LBS1
-	vshr.u64	q10, q6, #1
-	 vshr.u64	q11, q4, #1
-	veor		q10, q10, q7
-	 veor		q11, q11, q5
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q7, q7, q10
-	vshl.u64	q10, q10, #1
-	 veor		q5, q5, q11
-	 vshl.u64	q11, q11, #1
-	veor		q6, q6, q10
-	 veor		q4, q4, q11
-	vshr.u64	q10, q2, #1
-	 vshr.u64	q11, q0, #1
-	veor		q10, q10, q3
-	 veor		q11, q11, q1
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q3, q3, q10
-	vshl.u64	q10, q10, #1
-	 veor		q1, q1, q11
-	 vshl.u64	q11, q11, #1
-	veor		q2, q2, q10
-	 veor		q0, q0, q11
-	vmov.i8	q8,#0x0f			@ compose .LBS2
-	vshr.u64	q10, q5, #2
-	 vshr.u64	q11, q4, #2
-	veor		q10, q10, q7
-	 veor		q11, q11, q6
-	vand		q10, q10, q9
-	 vand		q11, q11, q9
-	veor		q7, q7, q10
-	vshl.u64	q10, q10, #2
-	 veor		q6, q6, q11
-	 vshl.u64	q11, q11, #2
-	veor		q5, q5, q10
-	 veor		q4, q4, q11
-	vshr.u64	q10, q1, #2
-	 vshr.u64	q11, q0, #2
-	veor		q10, q10, q3
-	 veor		q11, q11, q2
-	vand		q10, q10, q9
-	 vand		q11, q11, q9
-	veor		q3, q3, q10
-	vshl.u64	q10, q10, #2
-	 veor		q2, q2, q11
-	 vshl.u64	q11, q11, #2
-	veor		q1, q1, q10
-	 veor		q0, q0, q11
-	vshr.u64	q10, q3, #4
-	 vshr.u64	q11, q2, #4
-	veor		q10, q10, q7
-	 veor		q11, q11, q6
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q7, q7, q10
-	vshl.u64	q10, q10, #4
-	 veor		q6, q6, q11
-	 vshl.u64	q11, q11, #4
-	veor		q3, q3, q10
-	 veor		q2, q2, q11
-	vshr.u64	q10, q1, #4
-	 vshr.u64	q11, q0, #4
-	veor		q10, q10, q5
-	 veor		q11, q11, q4
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q5, q5, q10
-	vshl.u64	q10, q10, #4
-	 veor		q4, q4, q11
-	 vshl.u64	q11, q11, #4
-	veor		q1, q1, q10
-	 veor		q0, q0, q11
-	sub	r5,r5,#1
-	b	.Lenc_sbox
-.align	4
-.Lenc_loop:
-	vldmia	r4!, {q8-q11}
-	veor	q8, q8, q0
-	veor	q9, q9, q1
-	vtbl.8	d0, {q8}, d24
-	vtbl.8	d1, {q8}, d25
-	vldmia	r4!, {q8}
-	veor	q10, q10, q2
-	vtbl.8	d2, {q9}, d24
-	vtbl.8	d3, {q9}, d25
-	vldmia	r4!, {q9}
-	veor	q11, q11, q3
-	vtbl.8	d4, {q10}, d24
-	vtbl.8	d5, {q10}, d25
-	vldmia	r4!, {q10}
-	vtbl.8	d6, {q11}, d24
-	vtbl.8	d7, {q11}, d25
-	vldmia	r4!, {q11}
-	veor	q8, q8, q4
-	veor	q9, q9, q5
-	vtbl.8	d8, {q8}, d24
-	vtbl.8	d9, {q8}, d25
-	veor	q10, q10, q6
-	vtbl.8	d10, {q9}, d24
-	vtbl.8	d11, {q9}, d25
-	veor	q11, q11, q7
-	vtbl.8	d12, {q10}, d24
-	vtbl.8	d13, {q10}, d25
-	vtbl.8	d14, {q11}, d24
-	vtbl.8	d15, {q11}, d25
-.Lenc_sbox:
-	veor	q2, q2, q1
-	veor	q5, q5, q6
-	veor	q3, q3, q0
-	veor	q6, q6, q2
-	veor	q5, q5, q0
-
-	veor	q6, q6, q3
-	veor	q3, q3, q7
-	veor	q7, q7, q5
-	veor	q3, q3, q4
-	veor	q4, q4, q5
-
-	veor	q2, q2, q7
-	veor	q3, q3, q1
-	veor	q1, q1, q5
-	veor	q11, q7, q4
-	veor	q10, q1, q2
-	veor	q9, q5, q3
-	veor	q13, q2, q4
-	 vmov	q8, q10
-	veor	q12, q6, q0
-
-	vorr	q10, q10, q9
-	veor	q15, q11, q8
-	vand	q14, q11, q12
-	vorr	q11, q11, q12
-	veor	q12, q12, q9
-	vand	q8, q8, q9
-	veor	q9, q3, q0
-	vand	q15, q15, q12
-	vand	q13, q13, q9
-	veor	q9, q7, q1
-	veor	q12, q5, q6
-	veor	q11, q11, q13
-	veor	q10, q10, q13
-	vand	q13, q9, q12
-	vorr	q9, q9, q12
-	veor	q11, q11, q15
-	veor	q8, q8, q13
-	veor	q10, q10, q14
-	veor	q9, q9, q15
-	veor	q8, q8, q14
-	vand	q12, q2, q3
-	veor	q9, q9, q14
-	vand	q13, q4, q0
-	vand	q14, q1, q5
-	vorr	q15, q7, q6
-	veor	q11, q11, q12
-	veor	q9, q9, q14
-	veor	q8, q8, q15
-	veor	q10, q10, q13
-
-	@ Inv_GF16 	0, 	1, 	2, 	3, s0, s1, s2, s3
-
-	@ new smaller inversion
-
-	vand	q14, q11, q9
-	vmov	q12, q8
-
-	veor	q13, q10, q14
-	veor	q15, q8, q14
-	veor	q14, q8, q14	@ q14=q15
-
-	vbsl	q13, q9, q8
-	vbsl	q15, q11, q10
-	veor	q11, q11, q10
-
-	vbsl	q12, q13, q14
-	vbsl	q8, q14, q13
-
-	vand	q14, q12, q15
-	veor	q9, q9, q8
-
-	veor	q14, q14, q11
-	veor	q12, q6, q0
-	veor	q8, q5, q3
-	veor 	q10, q15, q14
-	vand	q10, q10, q6
-	veor	q6, q6, q5
-	vand	q11, q5, q15
-	vand	q6, q6, q14
-	veor	q5, q11, q10
-	veor	q6, q6, q11
-	veor	q15, q15, q13
-	veor	q14, q14, q9
-	veor	q11, q15, q14
-	 veor 	q10, q13, q9
-	vand	q11, q11, q12
-	 vand	q10, q10, q0
-	veor	q12, q12, q8
-	 veor	q0, q0, q3
-	vand	q8, q8, q15
-	 vand	q3, q3, q13
-	vand	q12, q12, q14
-	 vand	q0, q0, q9
-	veor	q8, q8, q12
-	 veor	q0, q0, q3
-	veor	q12, q12, q11
-	 veor	q3, q3, q10
-	veor	q6, q6, q12
-	veor	q0, q0, q12
-	veor	q5, q5, q8
-	veor	q3, q3, q8
-
-	veor	q12, q7, q4
-	veor	q8, q1, q2
-	veor	q11, q15, q14
-	 veor 	q10, q13, q9
-	vand	q11, q11, q12
-	 vand	q10, q10, q4
-	veor	q12, q12, q8
-	 veor	q4, q4, q2
-	vand	q8, q8, q15
-	 vand	q2, q2, q13
-	vand	q12, q12, q14
-	 vand	q4, q4, q9
-	veor	q8, q8, q12
-	 veor	q4, q4, q2
-	veor	q12, q12, q11
-	 veor	q2, q2, q10
-	veor	q15, q15, q13
-	veor	q14, q14, q9
-	veor 	q10, q15, q14
-	vand	q10, q10, q7
-	veor	q7, q7, q1
-	vand	q11, q1, q15
-	vand	q7, q7, q14
-	veor	q1, q11, q10
-	veor	q7, q7, q11
-	veor	q7, q7, q12
-	veor	q4, q4, q12
-	veor	q1, q1, q8
-	veor	q2, q2, q8
-	veor	q7, q7, q0
-	veor	q1, q1, q6
-	veor	q6, q6, q0
-	veor	q4, q4, q7
-	veor	q0, q0, q1
-
-	veor	q1, q1, q5
-	veor	q5, q5, q2
-	veor	q2, q2, q3
-	veor	q3, q3, q5
-	veor	q4, q4, q5
-
-	veor	q6, q6, q3
-	subs	r5,r5,#1
-	bcc	.Lenc_done
-	vext.8	q8, q0, q0, #12	@ x0 <<< 32
-	vext.8	q9, q1, q1, #12
-	 veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
-	vext.8	q10, q4, q4, #12
-	 veor	q1, q1, q9
-	vext.8	q11, q6, q6, #12
-	 veor	q4, q4, q10
-	vext.8	q12, q3, q3, #12
-	 veor	q6, q6, q11
-	vext.8	q13, q7, q7, #12
-	 veor	q3, q3, q12
-	vext.8	q14, q2, q2, #12
-	 veor	q7, q7, q13
-	vext.8	q15, q5, q5, #12
-	 veor	q2, q2, q14
-
-	veor	q9, q9, q0
-	 veor	q5, q5, q15
-	 vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64)
-	veor	q10, q10, q1
-	veor	q8, q8, q5
-	veor	q9, q9, q5
-	 vext.8	q1, q1, q1, #8
-	veor	q13, q13, q3
-	 veor	q0, q0, q8
-	veor	q14, q14, q7
-	 veor	q1, q1, q9
-	 vext.8	q8, q3, q3, #8
-	veor	q12, q12, q6
-	 vext.8	q9, q7, q7, #8
-	veor	q15, q15, q2
-	 vext.8	q3, q6, q6, #8
-	veor	q11, q11, q4
-	 vext.8	q7, q5, q5, #8
-	veor	q12, q12, q5
-	 vext.8	q6, q2, q2, #8
-	veor	q11, q11, q5
-	 vext.8	q2, q4, q4, #8
-	veor	q5, q9, q13
-	veor	q4, q8, q12
-	veor	q3, q3, q11
-	veor	q7, q7, q15
-	veor	q6, q6, q14
-	 @ vmov	q4, q8
-	veor	q2, q2, q10
-	 @ vmov	q5, q9
-	vldmia	r6, {q12}		@ .LSR
-	ite	eq				@ Thumb2 thing, sanity check in ARM
-	addeq	r6,r6,#0x10
-	bne	.Lenc_loop
-	vldmia	r6, {q12}		@ .LSRM0
-	b	.Lenc_loop
-.align	4
-.Lenc_done:
-	vmov.i8	q8,#0x55			@ compose .LBS0
-	vmov.i8	q9,#0x33			@ compose .LBS1
-	vshr.u64	q10, q2, #1
-	 vshr.u64	q11, q3, #1
-	veor		q10, q10, q5
-	 veor		q11, q11, q7
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q5, q5, q10
-	vshl.u64	q10, q10, #1
-	 veor		q7, q7, q11
-	 vshl.u64	q11, q11, #1
-	veor		q2, q2, q10
-	 veor		q3, q3, q11
-	vshr.u64	q10, q4, #1
-	 vshr.u64	q11, q0, #1
-	veor		q10, q10, q6
-	 veor		q11, q11, q1
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q6, q6, q10
-	vshl.u64	q10, q10, #1
-	 veor		q1, q1, q11
-	 vshl.u64	q11, q11, #1
-	veor		q4, q4, q10
-	 veor		q0, q0, q11
-	vmov.i8	q8,#0x0f			@ compose .LBS2
-	vshr.u64	q10, q7, #2
-	 vshr.u64	q11, q3, #2
-	veor		q10, q10, q5
-	 veor		q11, q11, q2
-	vand		q10, q10, q9
-	 vand		q11, q11, q9
-	veor		q5, q5, q10
-	vshl.u64	q10, q10, #2
-	 veor		q2, q2, q11
-	 vshl.u64	q11, q11, #2
-	veor		q7, q7, q10
-	 veor		q3, q3, q11
-	vshr.u64	q10, q1, #2
-	 vshr.u64	q11, q0, #2
-	veor		q10, q10, q6
-	 veor		q11, q11, q4
-	vand		q10, q10, q9
-	 vand		q11, q11, q9
-	veor		q6, q6, q10
-	vshl.u64	q10, q10, #2
-	 veor		q4, q4, q11
-	 vshl.u64	q11, q11, #2
-	veor		q1, q1, q10
-	 veor		q0, q0, q11
-	vshr.u64	q10, q6, #4
-	 vshr.u64	q11, q4, #4
-	veor		q10, q10, q5
-	 veor		q11, q11, q2
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q5, q5, q10
-	vshl.u64	q10, q10, #4
-	 veor		q2, q2, q11
-	 vshl.u64	q11, q11, #4
-	veor		q6, q6, q10
-	 veor		q4, q4, q11
-	vshr.u64	q10, q1, #4
-	 vshr.u64	q11, q0, #4
-	veor		q10, q10, q7
-	 veor		q11, q11, q3
-	vand		q10, q10, q8
-	 vand		q11, q11, q8
-	veor		q7, q7, q10
-	vshl.u64	q10, q10, #4
-	 veor		q3, q3, q11
-	 vshl.u64	q11, q11, #4
-	veor		q1, q1, q10
-	 veor		q0, q0, q11
-	vldmia	r4, {q8}			@ last round key
-	veor	q4, q4, q8
-	veor	q6, q6, q8
-	veor	q3, q3, q8
-	veor	q7, q7, q8
-	veor	q2, q2, q8
-	veor	q5, q5, q8
-	veor	q0, q0, q8
-	veor	q1, q1, q8
-	bx	lr
-.size	_bsaes_encrypt8,.-_bsaes_encrypt8
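For orientation: _bsaes_encrypt8 encrypts eight 16-byte blocks in parallel. The mask-and-shift sequences at entry and at .Lenc_done (constants 0x55, 0x33, 0x0f) transpose the 128 bytes into and back out of eight bit-plane registers q0-q7, the S-box is evaluated once per round as the pure Boolean network at .Lenc_sbox, and the final vldmia/veor chain applies the last round key to all eight planes.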
-.type	_bsaes_key_convert,%function
-.align	4
-_bsaes_key_convert:
-	adr	r6,_bsaes_key_convert
-	vld1.8	{q7},  [r4]!		@ load round 0 key
-	sub	r6,r6,#_bsaes_key_convert-.LM0
-	vld1.8	{q15}, [r4]!		@ load round 1 key
-
-	vmov.i8	q8,  #0x01			@ bit masks
-	vmov.i8	q9,  #0x02
-	vmov.i8	q10, #0x04
-	vmov.i8	q11, #0x08
-	vmov.i8	q12, #0x10
-	vmov.i8	q13, #0x20
-	vldmia	r6, {q14}		@ .LM0
-
-#ifdef __ARMEL__
-	vrev32.8	q7,  q7
-	vrev32.8	q15, q15
-#endif
-	sub	r5,r5,#1
-	vstmia	r12!, {q7}		@ save round 0 key
-	b	.Lkey_loop
-
-.align	4
-.Lkey_loop:
-	vtbl.8	d14,{q15},d28
-	vtbl.8	d15,{q15},d29
-	vmov.i8	q6,  #0x40
-	vmov.i8	q15, #0x80
-
-	vtst.8	q0, q7, q8
-	vtst.8	q1, q7, q9
-	vtst.8	q2, q7, q10
-	vtst.8	q3, q7, q11
-	vtst.8	q4, q7, q12
-	vtst.8	q5, q7, q13
-	vtst.8	q6, q7, q6
-	vtst.8	q7, q7, q15
-	vld1.8	{q15}, [r4]!		@ load next round key
-	vmvn	q0, q0		@ "pnot"
-	vmvn	q1, q1
-	vmvn	q5, q5
-	vmvn	q6, q6
-#ifdef __ARMEL__
-	vrev32.8	q15, q15
-#endif
-	subs	r5,r5,#1
-	vstmia	r12!,{q0-q7}		@ write bit-sliced round key
-	bne	.Lkey_loop
-
-	vmov.i8	q7,#0x63			@ compose .L63
-	@ don't save last round key
-	bx	lr
-.size	_bsaes_key_convert,.-_bsaes_key_convert
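_bsaes_key_convert expands the regular AES key schedule into the layout the code above consumes: round 0 is stored unmodified, and each subsequent round key is first permuted through .LM0 with vtbl.8, then broadcast into eight bit-planes with vtst against the single-bit masks 0x01..0x80. Planes 0, 1, 5 and 6 are inverted ("pnot"), which appears intended to let the matching inverters be dropped from the per-round S-box network.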
-.extern AES_cbc_encrypt
-.extern AES_decrypt
-
-.global	bsaes_cbc_encrypt
-.type	bsaes_cbc_encrypt,%function
-.align	5
-bsaes_cbc_encrypt:
-#ifndef	__KERNEL__
-	cmp	r2, #128
-#ifndef	__thumb__
-	blo	AES_cbc_encrypt
-#else
-	bhs	1f
-	b	AES_cbc_encrypt
-1:
-#endif
-#endif
-
-	@ it is up to the caller to make sure we are called with enc == 0
-
-	mov	ip, sp
-	stmdb	sp!, {r4-r10, lr}
-	VFP_ABI_PUSH
-	ldr	r8, [ip]			@ IV is 1st arg on the stack
-	mov	r2, r2, lsr#4		@ len in 16 byte blocks
-	sub	sp, #0x10			@ scratch space to carry over the IV
-	mov	r9, sp				@ save sp
-
-	ldr	r10, [r3, #240]		@ get # of rounds
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	@ allocate the key schedule on the stack
-	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
-	add	r12, #96			@ size of bit-sliced key schedule
-
-	@ populate the key schedule
-	mov	r4, r3			@ pass key
-	mov	r5, r10			@ pass # of rounds
-	mov	sp, r12				@ sp is sp
-	bl	_bsaes_key_convert
-	vldmia	sp, {q6}
-	vstmia	r12,  {q15}		@ save last round key
-	veor	q7, q7, q6	@ fix up round 0 key
-	vstmia	sp, {q7}
-#else
-	ldr	r12, [r3, #244]
-	eors	r12, #1
-	beq	0f
-
-	@ populate the key schedule
-	str	r12, [r3, #244]
-	mov	r4, r3			@ pass key
-	mov	r5, r10			@ pass # of rounds
-	add	r12, r3, #248			@ pass key schedule
-	bl	_bsaes_key_convert
-	add	r4, r3, #248
-	vldmia	r4, {q6}
-	vstmia	r12, {q15}			@ save last round key
-	veor	q7, q7, q6	@ fix up round 0 key
-	vstmia	r4, {q7}
-
-.align	2
-0:
-#endif
-
-	vld1.8	{q15}, [r8]		@ load IV
-	b	.Lcbc_dec_loop
-
-.align	4
-.Lcbc_dec_loop:
-	subs	r2, r2, #0x8
-	bmi	.Lcbc_dec_loop_finish
-
-	vld1.8	{q0-q1}, [r0]!	@ load input
-	vld1.8	{q2-q3}, [r0]!
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	mov	r4, sp			@ pass the key
-#else
-	add	r4, r3, #248
-#endif
-	vld1.8	{q4-q5}, [r0]!
-	mov	r5, r10
-	vld1.8	{q6-q7}, [r0]
-	sub	r0, r0, #0x60
-	vstmia	r9, {q15}			@ put aside IV
-
-	bl	_bsaes_decrypt8
-
-	vldmia	r9, {q14}			@ reload IV
-	vld1.8	{q8-q9}, [r0]!	@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q10-q11}, [r0]!
-	veor	q1, q1, q8
-	veor	q6, q6, q9
-	vld1.8	{q12-q13}, [r0]!
-	veor	q4, q4, q10
-	veor	q2, q2, q11
-	vld1.8	{q14-q15}, [r0]!
-	veor	q7, q7, q12
-	vst1.8	{q0-q1}, [r1]!	@ write output
-	veor	q3, q3, q13
-	vst1.8	{q6}, [r1]!
-	veor	q5, q5, q14
-	vst1.8	{q4}, [r1]!
-	vst1.8	{q2}, [r1]!
-	vst1.8	{q7}, [r1]!
-	vst1.8	{q3}, [r1]!
-	vst1.8	{q5}, [r1]!
-
-	b	.Lcbc_dec_loop
-
-.Lcbc_dec_loop_finish:
-	adds	r2, r2, #8
-	beq	.Lcbc_dec_done
-
-	vld1.8	{q0}, [r0]!		@ load input
-	cmp	r2, #2
-	blo	.Lcbc_dec_one
-	vld1.8	{q1}, [r0]!
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	mov	r4, sp			@ pass the key
-#else
-	add	r4, r3, #248
-#endif
-	mov	r5, r10
-	vstmia	r9, {q15}			@ put aside IV
-	beq	.Lcbc_dec_two
-	vld1.8	{q2}, [r0]!
-	cmp	r2, #4
-	blo	.Lcbc_dec_three
-	vld1.8	{q3}, [r0]!
-	beq	.Lcbc_dec_four
-	vld1.8	{q4}, [r0]!
-	cmp	r2, #6
-	blo	.Lcbc_dec_five
-	vld1.8	{q5}, [r0]!
-	beq	.Lcbc_dec_six
-	vld1.8	{q6}, [r0]!
-	sub	r0, r0, #0x70
-
-	bl	_bsaes_decrypt8
-
-	vldmia	r9, {q14}			@ reload IV
-	vld1.8	{q8-q9}, [r0]!	@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q10-q11}, [r0]!
-	veor	q1, q1, q8
-	veor	q6, q6, q9
-	vld1.8	{q12-q13}, [r0]!
-	veor	q4, q4, q10
-	veor	q2, q2, q11
-	vld1.8	{q15}, [r0]!
-	veor	q7, q7, q12
-	vst1.8	{q0-q1}, [r1]!	@ write output
-	veor	q3, q3, q13
-	vst1.8	{q6}, [r1]!
-	vst1.8	{q4}, [r1]!
-	vst1.8	{q2}, [r1]!
-	vst1.8	{q7}, [r1]!
-	vst1.8	{q3}, [r1]!
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_six:
-	sub	r0, r0, #0x60
-	bl	_bsaes_decrypt8
-	vldmia	r9,{q14}			@ reload IV
-	vld1.8	{q8-q9}, [r0]!	@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q10-q11}, [r0]!
-	veor	q1, q1, q8
-	veor	q6, q6, q9
-	vld1.8	{q12}, [r0]!
-	veor	q4, q4, q10
-	veor	q2, q2, q11
-	vld1.8	{q15}, [r0]!
-	veor	q7, q7, q12
-	vst1.8	{q0-q1}, [r1]!	@ write output
-	vst1.8	{q6}, [r1]!
-	vst1.8	{q4}, [r1]!
-	vst1.8	{q2}, [r1]!
-	vst1.8	{q7}, [r1]!
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_five:
-	sub	r0, r0, #0x50
-	bl	_bsaes_decrypt8
-	vldmia	r9, {q14}			@ reload IV
-	vld1.8	{q8-q9}, [r0]!	@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q10-q11}, [r0]!
-	veor	q1, q1, q8
-	veor	q6, q6, q9
-	vld1.8	{q15}, [r0]!
-	veor	q4, q4, q10
-	vst1.8	{q0-q1}, [r1]!	@ write output
-	veor	q2, q2, q11
-	vst1.8	{q6}, [r1]!
-	vst1.8	{q4}, [r1]!
-	vst1.8	{q2}, [r1]!
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_four:
-	sub	r0, r0, #0x40
-	bl	_bsaes_decrypt8
-	vldmia	r9, {q14}			@ reload IV
-	vld1.8	{q8-q9}, [r0]!	@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q10}, [r0]!
-	veor	q1, q1, q8
-	veor	q6, q6, q9
-	vld1.8	{q15}, [r0]!
-	veor	q4, q4, q10
-	vst1.8	{q0-q1}, [r1]!	@ write output
-	vst1.8	{q6}, [r1]!
-	vst1.8	{q4}, [r1]!
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_three:
-	sub	r0, r0, #0x30
-	bl	_bsaes_decrypt8
-	vldmia	r9, {q14}			@ reload IV
-	vld1.8	{q8-q9}, [r0]!	@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q15}, [r0]!
-	veor	q1, q1, q8
-	veor	q6, q6, q9
-	vst1.8	{q0-q1}, [r1]!	@ write output
-	vst1.8	{q6}, [r1]!
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_two:
-	sub	r0, r0, #0x20
-	bl	_bsaes_decrypt8
-	vldmia	r9, {q14}			@ reload IV
-	vld1.8	{q8}, [r0]!		@ reload input
-	veor	q0, q0, q14	@ ^= IV
-	vld1.8	{q15}, [r0]!		@ reload input
-	veor	q1, q1, q8
-	vst1.8	{q0-q1}, [r1]!	@ write output
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_one:
-	sub	r0, r0, #0x10
-	mov	r10, r1			@ save original out pointer
-	mov	r1, r9			@ use the iv scratch space as out buffer
-	mov	r2, r3
-	vmov	q4,q15		@ just in case ensure that IV
-	vmov	q5,q0			@ and input are preserved
-	bl	AES_decrypt
-	vld1.8	{q0}, [r9,:64]		@ load result
-	veor	q0, q0, q4	@ ^= IV
-	vmov	q15, q5		@ q5 holds input
-	vst1.8	{q0}, [r10]		@ write output
-
-.Lcbc_dec_done:
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-.Lcbc_dec_bzero:				@ wipe key schedule [if any]
-	vstmia		sp!, {q0-q1}
-	cmp		sp, r9
-	bne		.Lcbc_dec_bzero
-#endif
-
-	mov	sp, r9
-	add	sp, #0x10			@ add sp,r9,#0x10 is no good for thumb
-	vst1.8	{q15}, [r8]		@ return IV
-	VFP_ABI_POP
-	ldmia	sp!, {r4-r10, pc}
-.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
-.extern	AES_encrypt
-.global	bsaes_ctr32_encrypt_blocks
-.type	bsaes_ctr32_encrypt_blocks,%function
-.align	5
-bsaes_ctr32_encrypt_blocks:
-	cmp	r2, #8			@ use plain AES for
-	blo	.Lctr_enc_short			@ small sizes
-
-	mov	ip, sp
-	stmdb	sp!, {r4-r10, lr}
-	VFP_ABI_PUSH
-	ldr	r8, [ip]			@ ctr is 1st arg on the stack
-	sub	sp, sp, #0x10			@ scratch space to carry over the ctr
-	mov	r9, sp				@ save sp
-
-	ldr	r10, [r3, #240]		@ get # of rounds
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	@ allocate the key schedule on the stack
-	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
-	add	r12, #96			@ size of bit-sliced key schedule
-
-	@ populate the key schedule
-	mov	r4, r3			@ pass key
-	mov	r5, r10			@ pass # of rounds
-	mov	sp, r12				@ sp is sp
-	bl	_bsaes_key_convert
-	veor	q7,q7,q15	@ fix up last round key
-	vstmia	r12, {q7}			@ save last round key
-
-	vld1.8	{q0}, [r8]		@ load counter
-	add	r8, r6, #.LREVM0SR-.LM0	@ borrow r8
-	vldmia	sp, {q4}		@ load round0 key
-#else
-	ldr	r12, [r3, #244]
-	eors	r12, #1
-	beq	0f
-
-	@ populate the key schedule
-	str	r12, [r3, #244]
-	mov	r4, r3			@ pass key
-	mov	r5, r10			@ pass # of rounds
-	add	r12, r3, #248			@ pass key schedule
-	bl	_bsaes_key_convert
-	veor	q7,q7,q15	@ fix up last round key
-	vstmia	r12, {q7}			@ save last round key
-
-.align	2
-0:	add	r12, r3, #248
-	vld1.8	{q0}, [r8]		@ load counter
-	adrl	r8, .LREVM0SR			@ borrow r8
-	vldmia	r12, {q4}			@ load round0 key
-	sub	sp, #0x10			@ place for adjusted round0 key
-#endif
-
-	vmov.i32	q8,#1		@ compose 1<<96
-	veor		q9,q9,q9
-	vrev32.8	q0,q0
-	vext.8		q8,q9,q8,#4
-	vrev32.8	q4,q4
-	vadd.u32	q9,q8,q8	@ compose 2<<96
-	vstmia	sp, {q4}		@ save adjusted round0 key
-	b	.Lctr_enc_loop
-
-.align	4
-.Lctr_enc_loop:
-	vadd.u32	q10, q8, q9	@ compose 3<<96
-	vadd.u32	q1, q0, q8	@ +1
-	vadd.u32	q2, q0, q9	@ +2
-	vadd.u32	q3, q0, q10	@ +3
-	vadd.u32	q4, q1, q10
-	vadd.u32	q5, q2, q10
-	vadd.u32	q6, q3, q10
-	vadd.u32	q7, q4, q10
-	vadd.u32	q10, q5, q10	@ next counter
-
-	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
-	@ to flip byte order in 32-bit counter
-
-	vldmia		sp, {q9}		@ load round0 key
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x10		@ pass next round key
-#else
-	add		r4, r3, #264
-#endif
-	vldmia		r8, {q8}			@ .LREVM0SR
-	mov		r5, r10			@ pass rounds
-	vstmia		r9, {q10}			@ save next counter
-	sub		r6, r8, #.LREVM0SR-.LSR	@ pass constants
-
-	bl		_bsaes_encrypt8_alt
-
-	subs		r2, r2, #8
-	blo		.Lctr_enc_loop_done
-
-	vld1.8		{q8-q9}, [r0]!	@ load input
-	vld1.8		{q10-q11}, [r0]!
-	veor		q0, q8
-	veor		q1, q9
-	vld1.8		{q12-q13}, [r0]!
-	veor		q4, q10
-	veor		q6, q11
-	vld1.8		{q14-q15}, [r0]!
-	veor		q3, q12
-	vst1.8		{q0-q1}, [r1]!	@ write output
-	veor		q7, q13
-	veor		q2, q14
-	vst1.8		{q4}, [r1]!
-	veor		q5, q15
-	vst1.8		{q6}, [r1]!
-	vmov.i32	q8, #1			@ compose 1<<96
-	vst1.8		{q3}, [r1]!
-	veor		q9, q9, q9
-	vst1.8		{q7}, [r1]!
-	vext.8		q8, q9, q8, #4
-	vst1.8		{q2}, [r1]!
-	vadd.u32	q9,q8,q8		@ compose 2<<96
-	vst1.8		{q5}, [r1]!
-	vldmia		r9, {q0}			@ load counter
-
-	bne		.Lctr_enc_loop
-	b		.Lctr_enc_done
-
-.align	4
-.Lctr_enc_loop_done:
-	add		r2, r2, #8
-	vld1.8		{q8}, [r0]!	@ load input
-	veor		q0, q8
-	vst1.8		{q0}, [r1]!	@ write output
-	cmp		r2, #2
-	blo		.Lctr_enc_done
-	vld1.8		{q9}, [r0]!
-	veor		q1, q9
-	vst1.8		{q1}, [r1]!
-	beq		.Lctr_enc_done
-	vld1.8		{q10}, [r0]!
-	veor		q4, q10
-	vst1.8		{q4}, [r1]!
-	cmp		r2, #4
-	blo		.Lctr_enc_done
-	vld1.8		{q11}, [r0]!
-	veor		q6, q11
-	vst1.8		{q6}, [r1]!
-	beq		.Lctr_enc_done
-	vld1.8		{q12}, [r0]!
-	veor		q3, q12
-	vst1.8		{q3}, [r1]!
-	cmp		r2, #6
-	blo		.Lctr_enc_done
-	vld1.8		{q13}, [r0]!
-	veor		q7, q13
-	vst1.8		{q7}, [r1]!
-	beq		.Lctr_enc_done
-	vld1.8		{q14}, [r0]
-	veor		q2, q14
-	vst1.8		{q2}, [r1]!
-
-.Lctr_enc_done:
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-#ifndef	BSAES_ASM_EXTENDED_KEY
-.Lctr_enc_bzero:			@ wipe key schedule [if any]
-	vstmia		sp!, {q0-q1}
-	cmp		sp, r9
-	bne		.Lctr_enc_bzero
-#else
-	vstmia		sp, {q0-q1}
-#endif
-
-	mov	sp, r9
-	add	sp, #0x10		@ add sp,r9,#0x10 is no good for thumb
-	VFP_ABI_POP
-	ldmia	sp!, {r4-r10, pc}	@ return
-
-.align	4
-.Lctr_enc_short:
-	ldr	ip, [sp]		@ ctr pointer is passed on stack
-	stmdb	sp!, {r4-r8, lr}
-
-	mov	r4, r0		@ copy arguments
-	mov	r5, r1
-	mov	r6, r2
-	mov	r7, r3
-	ldr	r8, [ip, #12]		@ load counter LSW
-	vld1.8	{q1}, [ip]		@ load whole counter value
-#ifdef __ARMEL__
-	rev	r8, r8
-#endif
-	sub	sp, sp, #0x10
-	vst1.8	{q1}, [sp,:64]	@ copy counter value
-	sub	sp, sp, #0x10
-
-.Lctr_enc_short_loop:
-	add	r0, sp, #0x10		@ input counter value
-	mov	r1, sp			@ output on the stack
-	mov	r2, r7			@ key
-
-	bl	AES_encrypt
-
-	vld1.8	{q0}, [r4]!	@ load input
-	vld1.8	{q1}, [sp,:64]	@ load encrypted counter
-	add	r8, r8, #1
-#ifdef __ARMEL__
-	rev	r0, r8
-	str	r0, [sp, #0x1c]		@ next counter value
-#else
-	str	r8, [sp, #0x1c]		@ next counter value
-#endif
-	veor	q0,q0,q1
-	vst1.8	{q0}, [r5]!	@ store output
-	subs	r6, r6, #1
-	bne	.Lctr_enc_short_loop
-
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-	vstmia		sp!, {q0-q1}
-
-	ldmia	sp!, {r4-r8, pc}
-.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
-.globl	bsaes_xts_encrypt
-.type	bsaes_xts_encrypt,%function
-.align	4
-bsaes_xts_encrypt:
-	mov	ip, sp
-	stmdb	sp!, {r4-r10, lr}		@ 0x20
-	VFP_ABI_PUSH
-	mov	r6, sp				@ future r3
-
-	mov	r7, r0
-	mov	r8, r1
-	mov	r9, r2
-	mov	r10, r3
-
-	sub	r0, sp, #0x10			@ 0x10
-	bic	r0, #0xf			@ align at 16 bytes
-	mov	sp, r0
-
-#ifdef	XTS_CHAIN_TWEAK
-	ldr	r0, [ip]			@ pointer to input tweak
-#else
-	@ generate initial tweak
-	ldr	r0, [ip, #4]			@ iv[]
-	mov	r1, sp
-	ldr	r2, [ip, #0]			@ key2
-	bl	AES_encrypt
-	mov	r0,sp				@ pointer to initial tweak
-#endif
-
-	ldr	r1, [r10, #240]		@ get # of rounds
-	mov	r3, r6
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	@ allocate the key schedule on the stack
-	sub	r12, sp, r1, lsl#7		@ 128 bytes per inner round key
-	@ add	r12, #96			@ size of bit-sliced key schedule
-	sub	r12, #48			@ place for tweak[9]
-
-	@ populate the key schedule
-	mov	r4, r10			@ pass key
-	mov	r5, r1			@ pass # of rounds
-	mov	sp, r12
-	add	r12, #0x90			@ pass key schedule
-	bl	_bsaes_key_convert
-	veor	q7, q7, q15	@ fix up last round key
-	vstmia	r12, {q7}			@ save last round key
-#else
-	ldr	r12, [r10, #244]
-	eors	r12, #1
-	beq	0f
-
-	str	r12, [r10, #244]
-	mov	r4, r10			@ pass key
-	mov	r5, r1			@ pass # of rounds
-	add	r12, r10, #248			@ pass key schedule
-	bl	_bsaes_key_convert
-	veor	q7, q7, q15	@ fix up last round key
-	vstmia	r12, {q7}
-
-.align	2
-0:	sub	sp, #0x90			@ place for tweak[9]
-#endif
-
-	vld1.8	{q8}, [r0]			@ initial tweak
-	adr	r2, .Lxts_magic
-
-	subs	r9, #0x80
-	blo	.Lxts_enc_short
-	b	.Lxts_enc_loop
-
-.align	4
-.Lxts_enc_loop:
-	vldmia		r2, {q5}	@ load XTS magic
-	vshr.s64	q6, q8, #63
-	mov		r0, sp
-	vand		q6, q6, q5
-	vadd.u64	q9, q8, q8
-	vst1.64		{q8}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q9, #63
-	veor		q9, q9, q6
-	vand		q7, q7, q5
-	vadd.u64	q10, q9, q9
-	vst1.64		{q9}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q10, #63
-	veor		q10, q10, q7
-	vand		q6, q6, q5
-	vld1.8		{q0}, [r7]!
-	vadd.u64	q11, q10, q10
-	vst1.64		{q10}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q11, #63
-	veor		q11, q11, q6
-	vand		q7, q7, q5
-	vld1.8		{q1}, [r7]!
-	veor		q0, q0, q8
-	vadd.u64	q12, q11, q11
-	vst1.64		{q11}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q12, #63
-	veor		q12, q12, q7
-	vand		q6, q6, q5
-	vld1.8		{q2}, [r7]!
-	veor		q1, q1, q9
-	vadd.u64	q13, q12, q12
-	vst1.64		{q12}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q13, #63
-	veor		q13, q13, q6
-	vand		q7, q7, q5
-	vld1.8		{q3}, [r7]!
-	veor		q2, q2, q10
-	vadd.u64	q14, q13, q13
-	vst1.64		{q13}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q14, #63
-	veor		q14, q14, q7
-	vand		q6, q6, q5
-	vld1.8		{q4}, [r7]!
-	veor		q3, q3, q11
-	vadd.u64	q15, q14, q14
-	vst1.64		{q14}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q15, #63
-	veor		q15, q15, q6
-	vand		q7, q7, q5
-	vld1.8		{q5}, [r7]!
-	veor		q4, q4, q12
-	vadd.u64	q8, q15, q15
-	vst1.64		{q15}, [r0,:128]!
-	vswp		d15,d14
-	veor		q8, q8, q7
-	vst1.64		{q8}, [r0,:128]		@ next round tweak
-
-	vld1.8		{q6-q7}, [r7]!
-	veor		q5, q5, q13
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q6, q6, q14
-	mov		r5, r1			@ pass rounds
-	veor		q7, q7, q15
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10-q11}, [r0,:128]!
-	veor		q0, q0, q8
-	vld1.64		{q12-q13}, [r0,:128]!
-	veor		q1, q1, q9
-	veor		q8, q4, q10
-	vst1.8		{q0-q1}, [r8]!
-	veor		q9, q6, q11
-	vld1.64		{q14-q15}, [r0,:128]!
-	veor		q10, q3, q12
-	vst1.8		{q8-q9}, [r8]!
-	veor		q11, q7, q13
-	veor		q12, q2, q14
-	vst1.8		{q10-q11}, [r8]!
-	veor		q13, q5, q15
-	vst1.8		{q12-q13}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-
-	subs		r9, #0x80
-	bpl		.Lxts_enc_loop
-
-.Lxts_enc_short:
-	adds		r9, #0x70
-	bmi		.Lxts_enc_done
-
-	vldmia		r2, {q5}	@ load XTS magic
-	vshr.s64	q7, q8, #63
-	mov		r0, sp
-	vand		q7, q7, q5
-	vadd.u64	q9, q8, q8
-	vst1.64		{q8}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q9, #63
-	veor		q9, q9, q7
-	vand		q6, q6, q5
-	vadd.u64	q10, q9, q9
-	vst1.64		{q9}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q10, #63
-	veor		q10, q10, q6
-	vand		q7, q7, q5
-	vld1.8		{q0}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_enc_1
-	vadd.u64	q11, q10, q10
-	vst1.64		{q10}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q11, #63
-	veor		q11, q11, q7
-	vand		q6, q6, q5
-	vld1.8		{q1}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_enc_2
-	veor		q0, q0, q8
-	vadd.u64	q12, q11, q11
-	vst1.64		{q11}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q12, #63
-	veor		q12, q12, q6
-	vand		q7, q7, q5
-	vld1.8		{q2}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_enc_3
-	veor		q1, q1, q9
-	vadd.u64	q13, q12, q12
-	vst1.64		{q12}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q13, #63
-	veor		q13, q13, q7
-	vand		q6, q6, q5
-	vld1.8		{q3}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_enc_4
-	veor		q2, q2, q10
-	vadd.u64	q14, q13, q13
-	vst1.64		{q13}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q14, #63
-	veor		q14, q14, q6
-	vand		q7, q7, q5
-	vld1.8		{q4}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_enc_5
-	veor		q3, q3, q11
-	vadd.u64	q15, q14, q14
-	vst1.64		{q14}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q15, #63
-	veor		q15, q15, q7
-	vand		q6, q6, q5
-	vld1.8		{q5}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_enc_6
-	veor		q4, q4, q12
-	sub		r9, #0x10
-	vst1.64		{q15}, [r0,:128]		@ next round tweak
-
-	vld1.8		{q6}, [r7]!
-	veor		q5, q5, q13
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q6, q6, q14
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10-q11}, [r0,:128]!
-	veor		q0, q0, q8
-	vld1.64		{q12-q13}, [r0,:128]!
-	veor		q1, q1, q9
-	veor		q8, q4, q10
-	vst1.8		{q0-q1}, [r8]!
-	veor		q9, q6, q11
-	vld1.64		{q14}, [r0,:128]!
-	veor		q10, q3, q12
-	vst1.8		{q8-q9}, [r8]!
-	veor		q11, q7, q13
-	veor		q12, q2, q14
-	vst1.8		{q10-q11}, [r8]!
-	vst1.8		{q12}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-.align	4
-.Lxts_enc_6:
-	vst1.64		{q14}, [r0,:128]		@ next round tweak
-
-	veor		q4, q4, q12
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q5, q5, q13
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10-q11}, [r0,:128]!
-	veor		q0, q0, q8
-	vld1.64		{q12-q13}, [r0,:128]!
-	veor		q1, q1, q9
-	veor		q8, q4, q10
-	vst1.8		{q0-q1}, [r8]!
-	veor		q9, q6, q11
-	veor		q10, q3, q12
-	vst1.8		{q8-q9}, [r8]!
-	veor		q11, q7, q13
-	vst1.8		{q10-q11}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-
-@ put this in range for both ARM and Thumb mode adr instructions
-.align	5
-.Lxts_magic:
-	.quad	1, 0x87
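The pair of quadwords at .Lxts_magic ({1, 0x87}) drives the tweak schedule: each block's tweak is the previous one doubled in GF(2^128), with the reduction constant 0x87 folded into the low byte whenever the top bit falls off. The vshr.s64/vand/vadd.u64/vswp/veor groups in the tweak-generation loops compute exactly this, vswp routing each 64-bit half's carry to the other half. A minimal C sketch of the same update (illustrative only, not from the patch; tweak kept as two little-endian 64-bit halves):

	#include <stdint.h>

	/* Double a 128-bit XTS tweak in GF(2^128), reduction
	 * polynomial x^128 + x^7 + x^2 + x + 1 (hence 0x87). */
	static void xts_tweak_double(uint64_t t[2])
	{
		/* all-ones if the top bit is about to shift out, masked to 0x87 */
		uint64_t carry = (uint64_t)((int64_t)t[1] >> 63) & 0x87;

		t[1] = (t[1] << 1) | (t[0] >> 63);	/* 128-bit left shift */
		t[0] = (t[0] << 1) ^ carry;		/* fold in the reduction */
	}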
-
-.align	5
-.Lxts_enc_5:
-	vst1.64		{q13}, [r0,:128]		@ next round tweak
-
-	veor		q3, q3, q11
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q4, q4, q12
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10-q11}, [r0,:128]!
-	veor		q0, q0, q8
-	vld1.64		{q12}, [r0,:128]!
-	veor		q1, q1, q9
-	veor		q8, q4, q10
-	vst1.8		{q0-q1}, [r8]!
-	veor		q9, q6, q11
-	veor		q10, q3, q12
-	vst1.8		{q8-q9}, [r8]!
-	vst1.8		{q10}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-.align	4
-.Lxts_enc_4:
-	vst1.64		{q12}, [r0,:128]		@ next round tweak
-
-	veor		q2, q2, q10
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q3, q3, q11
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10-q11}, [r0,:128]!
-	veor		q0, q0, q8
-	veor		q1, q1, q9
-	veor		q8, q4, q10
-	vst1.8		{q0-q1}, [r8]!
-	veor		q9, q6, q11
-	vst1.8		{q8-q9}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-.align	4
-.Lxts_enc_3:
-	vst1.64		{q11}, [r0,:128]		@ next round tweak
-
-	veor		q1, q1, q9
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q2, q2, q10
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10}, [r0,:128]!
-	veor		q0, q0, q8
-	veor		q1, q1, q9
-	veor		q8, q4, q10
-	vst1.8		{q0-q1}, [r8]!
-	vst1.8		{q8}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-.align	4
-.Lxts_enc_2:
-	vst1.64		{q10}, [r0,:128]		@ next round tweak
-
-	veor		q0, q0, q8
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q1, q1, q9
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	veor		q0, q0, q8
-	veor		q1, q1, q9
-	vst1.8		{q0-q1}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-.align	4
-.Lxts_enc_1:
-	mov		r0, sp
-	veor		q0, q8
-	mov		r1, sp
-	vst1.8		{q0}, [sp,:128]
-	mov		r2, r10
-	mov		r4, r3				@ preserve fp
-
-	bl		AES_encrypt
-
-	vld1.8		{q0}, [sp,:128]
-	veor		q0, q0, q8
-	vst1.8		{q0}, [r8]!
-	mov		r3, r4
-
-	vmov		q8, q9		@ next round tweak
-
-.Lxts_enc_done:
-#ifndef	XTS_CHAIN_TWEAK
-	adds		r9, #0x10
-	beq		.Lxts_enc_ret
-	sub		r6, r8, #0x10
-
-.Lxts_enc_steal:
-	ldrb		r0, [r7], #1
-	ldrb		r1, [r8, #-0x10]
-	strb		r0, [r8, #-0x10]
-	strb		r1, [r8], #1
-
-	subs		r9, #1
-	bhi		.Lxts_enc_steal
-
-	vld1.8		{q0}, [r6]
-	mov		r0, sp
-	veor		q0, q0, q8
-	mov		r1, sp
-	vst1.8		{q0}, [sp,:128]
-	mov		r2, r10
-	mov		r4, r3			@ preserve fp
-
-	bl		AES_encrypt
-
-	vld1.8		{q0}, [sp,:128]
-	veor		q0, q0, q8
-	vst1.8		{q0}, [r6]
-	mov		r3, r4
-#endif
-
-.Lxts_enc_ret:
-	bic		r0, r3, #0xf
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-#ifdef	XTS_CHAIN_TWEAK
-	ldr		r1, [r3, #0x20+VFP_ABI_FRAME]	@ chain tweak
-#endif
-.Lxts_enc_bzero:				@ wipe key schedule [if any]
-	vstmia		sp!, {q0-q1}
-	cmp		sp, r0
-	bne		.Lxts_enc_bzero
-
-	mov		sp, r3
-#ifdef	XTS_CHAIN_TWEAK
-	vst1.8		{q8}, [r1]
-#endif
-	VFP_ABI_POP
-	ldmia		sp!, {r4-r10, pc}	@ return
-
-.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
-
-.globl	bsaes_xts_decrypt
-.type	bsaes_xts_decrypt,%function
-.align	4
-bsaes_xts_decrypt:
-	mov	ip, sp
-	stmdb	sp!, {r4-r10, lr}		@ 0x20
-	VFP_ABI_PUSH
-	mov	r6, sp				@ future r3
-
-	mov	r7, r0
-	mov	r8, r1
-	mov	r9, r2
-	mov	r10, r3
-
-	sub	r0, sp, #0x10			@ 0x10
-	bic	r0, #0xf			@ align at 16 bytes
-	mov	sp, r0
-
-#ifdef	XTS_CHAIN_TWEAK
-	ldr	r0, [ip]			@ pointer to input tweak
-#else
-	@ generate initial tweak
-	ldr	r0, [ip, #4]			@ iv[]
-	mov	r1, sp
-	ldr	r2, [ip, #0]			@ key2
-	bl	AES_encrypt
-	mov	r0, sp				@ pointer to initial tweak
-#endif
-
-	ldr	r1, [r10, #240]		@ get # of rounds
-	mov	r3, r6
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	@ allocate the key schedule on the stack
-	sub	r12, sp, r1, lsl#7		@ 128 bytes per inner round key
-	@ add	r12, #96			@ size of bit-sliced key schedule
-	sub	r12, #48			@ place for tweak[9]
-
-	@ populate the key schedule
-	mov	r4, r10			@ pass key
-	mov	r5, r1			@ pass # of rounds
-	mov	sp, r12
-	add	r12, #0x90			@ pass key schedule
-	bl	_bsaes_key_convert
-	add	r4, sp, #0x90
-	vldmia	r4, {q6}
-	vstmia	r12,  {q15}		@ save last round key
-	veor	q7, q7, q6	@ fix up round 0 key
-	vstmia	r4, {q7}
-#else
-	ldr	r12, [r10, #244]
-	eors	r12, #1
-	beq	0f
-
-	str	r12, [r10, #244]
-	mov	r4, r10			@ pass key
-	mov	r5, r1			@ pass # of rounds
-	add	r12, r10, #248			@ pass key schedule
-	bl	_bsaes_key_convert
-	add	r4, r10, #248
-	vldmia	r4, {q6}
-	vstmia	r12,  {q15}		@ save last round key
-	veor	q7, q7, q6	@ fix up round 0 key
-	vstmia	r4, {q7}
-
-.align	2
-0:	sub	sp, #0x90			@ place for tweak[9]
-#endif
-	vld1.8	{q8}, [r0]			@ initial tweak
-	adr	r2, .Lxts_magic
-
-#ifndef	XTS_CHAIN_TWEAK
-	tst	r9, #0xf			@ if not multiple of 16
-	it	ne				@ Thumb2 thing, sanity check in ARM
-	subne	r9, #0x10			@ subtract another 16 bytes
-#endif
-	subs	r9, #0x80
-
-	blo	.Lxts_dec_short
-	b	.Lxts_dec_loop
-
-.align	4
-.Lxts_dec_loop:
-	vldmia		r2, {q5}	@ load XTS magic
-	vshr.s64	q6, q8, #63
-	mov		r0, sp
-	vand		q6, q6, q5
-	vadd.u64	q9, q8, q8
-	vst1.64		{q8}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q9, #63
-	veor		q9, q9, q6
-	vand		q7, q7, q5
-	vadd.u64	q10, q9, q9
-	vst1.64		{q9}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q10, #63
-	veor		q10, q10, q7
-	vand		q6, q6, q5
-	vld1.8		{q0}, [r7]!
-	vadd.u64	q11, q10, q10
-	vst1.64		{q10}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q11, #63
-	veor		q11, q11, q6
-	vand		q7, q7, q5
-	vld1.8		{q1}, [r7]!
-	veor		q0, q0, q8
-	vadd.u64	q12, q11, q11
-	vst1.64		{q11}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q12, #63
-	veor		q12, q12, q7
-	vand		q6, q6, q5
-	vld1.8		{q2}, [r7]!
-	veor		q1, q1, q9
-	vadd.u64	q13, q12, q12
-	vst1.64		{q12}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q13, #63
-	veor		q13, q13, q6
-	vand		q7, q7, q5
-	vld1.8		{q3}, [r7]!
-	veor		q2, q2, q10
-	vadd.u64	q14, q13, q13
-	vst1.64		{q13}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q14, #63
-	veor		q14, q14, q7
-	vand		q6, q6, q5
-	vld1.8		{q4}, [r7]!
-	veor		q3, q3, q11
-	vadd.u64	q15, q14, q14
-	vst1.64		{q14}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q15, #63
-	veor		q15, q15, q6
-	vand		q7, q7, q5
-	vld1.8		{q5}, [r7]!
-	veor		q4, q4, q12
-	vadd.u64	q8, q15, q15
-	vst1.64		{q15}, [r0,:128]!
-	vswp		d15,d14
-	veor		q8, q8, q7
-	vst1.64		{q8}, [r0,:128]		@ next round tweak
-
-	vld1.8		{q6-q7}, [r7]!
-	veor		q5, q5, q13
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q6, q6, q14
-	mov		r5, r1			@ pass rounds
-	veor		q7, q7, q15
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10-q11}, [r0,:128]!
-	veor		q0, q0, q8
-	vld1.64		{q12-q13}, [r0,:128]!
-	veor		q1, q1, q9
-	veor		q8, q6, q10
-	vst1.8		{q0-q1}, [r8]!
-	veor		q9, q4, q11
-	vld1.64		{q14-q15}, [r0,:128]!
-	veor		q10, q2, q12
-	vst1.8		{q8-q9}, [r8]!
-	veor		q11, q7, q13
-	veor		q12, q3, q14
-	vst1.8		{q10-q11}, [r8]!
-	veor		q13, q5, q15
-	vst1.8		{q12-q13}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-
-	subs		r9, #0x80
-	bpl		.Lxts_dec_loop
-
-.Lxts_dec_short:
-	adds		r9, #0x70
-	bmi		.Lxts_dec_done
-
-	vldmia		r2, {q5}	@ load XTS magic
-	vshr.s64	q7, q8, #63
-	mov		r0, sp
-	vand		q7, q7, q5
-	vadd.u64	q9, q8, q8
-	vst1.64		{q8}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q9, #63
-	veor		q9, q9, q7
-	vand		q6, q6, q5
-	vadd.u64	q10, q9, q9
-	vst1.64		{q9}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q10, #63
-	veor		q10, q10, q6
-	vand		q7, q7, q5
-	vld1.8		{q0}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_dec_1
-	vadd.u64	q11, q10, q10
-	vst1.64		{q10}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q11, #63
-	veor		q11, q11, q7
-	vand		q6, q6, q5
-	vld1.8		{q1}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_dec_2
-	veor		q0, q0, q8
-	vadd.u64	q12, q11, q11
-	vst1.64		{q11}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q12, #63
-	veor		q12, q12, q6
-	vand		q7, q7, q5
-	vld1.8		{q2}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_dec_3
-	veor		q1, q1, q9
-	vadd.u64	q13, q12, q12
-	vst1.64		{q12}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q13, #63
-	veor		q13, q13, q7
-	vand		q6, q6, q5
-	vld1.8		{q3}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_dec_4
-	veor		q2, q2, q10
-	vadd.u64	q14, q13, q13
-	vst1.64		{q13}, [r0,:128]!
-	vswp		d13,d12
-	vshr.s64	q7, q14, #63
-	veor		q14, q14, q6
-	vand		q7, q7, q5
-	vld1.8		{q4}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_dec_5
-	veor		q3, q3, q11
-	vadd.u64	q15, q14, q14
-	vst1.64		{q14}, [r0,:128]!
-	vswp		d15,d14
-	vshr.s64	q6, q15, #63
-	veor		q15, q15, q7
-	vand		q6, q6, q5
-	vld1.8		{q5}, [r7]!
-	subs		r9, #0x10
-	bmi		.Lxts_dec_6
-	veor		q4, q4, q12
-	sub		r9, #0x10
-	vst1.64		{q15}, [r0,:128]		@ next round tweak
-
-	vld1.8		{q6}, [r7]!
-	veor		q5, q5, q13
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q6, q6, q14
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10-q11}, [r0,:128]!
-	veor		q0, q0, q8
-	vld1.64		{q12-q13}, [r0,:128]!
-	veor		q1, q1, q9
-	veor		q8, q6, q10
-	vst1.8		{q0-q1}, [r8]!
-	veor		q9, q4, q11
-	vld1.64		{q14}, [r0,:128]!
-	veor		q10, q2, q12
-	vst1.8		{q8-q9}, [r8]!
-	veor		q11, q7, q13
-	veor		q12, q3, q14
-	vst1.8		{q10-q11}, [r8]!
-	vst1.8		{q12}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_6:
-	vst1.64		{q14}, [r0,:128]		@ next round tweak
-
-	veor		q4, q4, q12
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q5, q5, q13
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10-q11}, [r0,:128]!
-	veor		q0, q0, q8
-	vld1.64		{q12-q13}, [r0,:128]!
-	veor		q1, q1, q9
-	veor		q8, q6, q10
-	vst1.8		{q0-q1}, [r8]!
-	veor		q9, q4, q11
-	veor		q10, q2, q12
-	vst1.8		{q8-q9}, [r8]!
-	veor		q11, q7, q13
-	vst1.8		{q10-q11}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_5:
-	vst1.64		{q13}, [r0,:128]		@ next round tweak
-
-	veor		q3, q3, q11
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q4, q4, q12
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10-q11}, [r0,:128]!
-	veor		q0, q0, q8
-	vld1.64		{q12}, [r0,:128]!
-	veor		q1, q1, q9
-	veor		q8, q6, q10
-	vst1.8		{q0-q1}, [r8]!
-	veor		q9, q4, q11
-	veor		q10, q2, q12
-	vst1.8		{q8-q9}, [r8]!
-	vst1.8		{q10}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_4:
-	vst1.64		{q12}, [r0,:128]		@ next round tweak
-
-	veor		q2, q2, q10
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q3, q3, q11
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10-q11}, [r0,:128]!
-	veor		q0, q0, q8
-	veor		q1, q1, q9
-	veor		q8, q6, q10
-	vst1.8		{q0-q1}, [r8]!
-	veor		q9, q4, q11
-	vst1.8		{q8-q9}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_3:
-	vst1.64		{q11}, [r0,:128]		@ next round tweak
-
-	veor		q1, q1, q9
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q2, q2, q10
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	vld1.64		{q10}, [r0,:128]!
-	veor		q0, q0, q8
-	veor		q1, q1, q9
-	veor		q8, q6, q10
-	vst1.8		{q0-q1}, [r8]!
-	vst1.8		{q8}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_2:
-	vst1.64		{q10}, [r0,:128]		@ next round tweak
-
-	veor		q0, q0, q8
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, r10, #248			@ pass key schedule
-#endif
-	veor		q1, q1, q9
-	mov		r5, r1			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{q8-q9}, [r0,:128]!
-	veor		q0, q0, q8
-	veor		q1, q1, q9
-	vst1.8		{q0-q1}, [r8]!
-
-	vld1.64		{q8}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_1:
-	mov		r0, sp
-	veor		q0, q8
-	mov		r1, sp
-	vst1.8		{q0}, [sp,:128]
-	mov		r2, r10
-	mov		r4, r3				@ preserve fp
-	mov		r5, r2			@ preserve magic
-
-	bl		AES_decrypt
-
-	vld1.8		{q0}, [sp,:128]
-	veor		q0, q0, q8
-	vst1.8		{q0}, [r8]!
-	mov		r3, r4
-	mov		r2, r5
-
-	vmov		q8, q9		@ next round tweak
-
-.Lxts_dec_done:
-#ifndef	XTS_CHAIN_TWEAK
-	adds		r9, #0x10
-	beq		.Lxts_dec_ret
-
-	@ calculate one round of extra tweak for the stolen ciphertext
-	vldmia		r2, {q5}
-	vshr.s64	q6, q8, #63
-	vand		q6, q6, q5
-	vadd.u64	q9, q8, q8
-	vswp		d13,d12
-	veor		q9, q9, q6
-
-	@ perform the final decryption with the last tweak value
-	vld1.8		{q0}, [r7]!
-	mov		r0, sp
-	veor		q0, q0, q9
-	mov		r1, sp
-	vst1.8		{q0}, [sp,:128]
-	mov		r2, r10
-	mov		r4, r3			@ preserve fp
-
-	bl		AES_decrypt
-
-	vld1.8		{q0}, [sp,:128]
-	veor		q0, q0, q9
-	vst1.8		{q0}, [r8]
-
-	mov		r6, r8
-.Lxts_dec_steal:
-	ldrb		r1, [r8]
-	ldrb		r0, [r7], #1
-	strb		r1, [r8, #0x10]
-	strb		r0, [r8], #1
-
-	subs		r9, #1
-	bhi		.Lxts_dec_steal
-
-	vld1.8		{q0}, [r6]
-	mov		r0, sp
-	veor		q0, q8
-	mov		r1, sp
-	vst1.8		{q0}, [sp,:128]
-	mov		r2, r10
-
-	bl		AES_decrypt
-
-	vld1.8		{q0}, [sp,:128]
-	veor		q0, q0, q8
-	vst1.8		{q0}, [r6]
-	mov		r3, r4
-#endif
-
-.Lxts_dec_ret:
-	bic		r0, r3, #0xf
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-#ifdef	XTS_CHAIN_TWEAK
-	ldr		r1, [r3, #0x20+VFP_ABI_FRAME]	@ chain tweak
-#endif
-.Lxts_dec_bzero:				@ wipe key schedule [if any]
-	vstmia		sp!, {q0-q1}
-	cmp		sp, r0
-	bne		.Lxts_dec_bzero
-
-	mov		sp, r3
-#ifdef	XTS_CHAIN_TWEAK
-	vst1.8		{q8}, [r1]
-#endif
-	VFP_ABI_POP
-	ldmia		sp!, {r4-r10, pc}	@ return
-
-.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
-#endif

+ 0 - 367
arch/arm/crypto/aesbs-glue.c

@@ -1,367 +0,0 @@
-/*
- * linux/arch/arm/crypto/aesbs-glue.c - glue code for NEON bit sliced AES
- *
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <asm/neon.h>
-#include <crypto/aes.h>
-#include <crypto/cbc.h>
-#include <crypto/internal/simd.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/module.h>
-#include <crypto/xts.h>
-
-#include "aes_glue.h"
-
-#define BIT_SLICED_KEY_MAXSIZE	(128 * (AES_MAXNR - 1) + 2 * AES_BLOCK_SIZE)
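Assuming AES_MAXNR is 14 (the AES-256 round count in the OpenSSL-derived aes_glue.h), this works out to 128 * 13 + 32 = 1696 bytes: 128 bytes for each bit-sliced inner round key plus two unsliced 16-byte blocks for the first and last round keys.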
-
-struct BS_KEY {
-	struct AES_KEY	rk;
-	int		converted;
-	u8 __aligned(8)	bs[BIT_SLICED_KEY_MAXSIZE];
-} __aligned(8);
-
-asmlinkage void bsaes_enc_key_convert(u8 out[], struct AES_KEY const *in);
-asmlinkage void bsaes_dec_key_convert(u8 out[], struct AES_KEY const *in);
-
-asmlinkage void bsaes_cbc_encrypt(u8 const in[], u8 out[], u32 bytes,
-				  struct BS_KEY *key, u8 iv[]);
-
-asmlinkage void bsaes_ctr32_encrypt_blocks(u8 const in[], u8 out[], u32 blocks,
-					   struct BS_KEY *key, u8 const iv[]);
-
-asmlinkage void bsaes_xts_encrypt(u8 const in[], u8 out[], u32 bytes,
-				  struct BS_KEY *key, u8 tweak[]);
-
-asmlinkage void bsaes_xts_decrypt(u8 const in[], u8 out[], u32 bytes,
-				  struct BS_KEY *key, u8 tweak[]);
-
-struct aesbs_cbc_ctx {
-	struct AES_KEY	enc;
-	struct BS_KEY	dec;
-};
-
-struct aesbs_ctr_ctx {
-	struct BS_KEY	enc;
-};
-
-struct aesbs_xts_ctx {
-	struct BS_KEY	enc;
-	struct BS_KEY	dec;
-	struct AES_KEY	twkey;
-};
-
-static int aesbs_cbc_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
-			     unsigned int key_len)
-{
-	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
-	int bits = key_len * 8;
-
-	if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc)) {
-		crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
-		return -EINVAL;
-	}
-	ctx->dec.rk = ctx->enc;
-	private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
-	ctx->dec.converted = 0;
-	return 0;
-}
-
-static int aesbs_ctr_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
-			     unsigned int key_len)
-{
-	struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
-	int bits = key_len * 8;
-
-	if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
-		crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
-		return -EINVAL;
-	}
-	ctx->enc.converted = 0;
-	return 0;
-}
-
-static int aesbs_xts_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
-			     unsigned int key_len)
-{
-	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-	int bits = key_len * 4;
-	int err;
-
-	err = xts_verify_key(tfm, in_key, key_len);
-	if (err)
-		return err;
-
-	if (private_AES_set_encrypt_key(in_key, bits, &ctx->enc.rk)) {
-		crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
-		return -EINVAL;
-	}
-	ctx->dec.rk = ctx->enc.rk;
-	private_AES_set_decrypt_key(in_key, bits, &ctx->dec.rk);
-	private_AES_set_encrypt_key(in_key + key_len / 2, bits, &ctx->twkey);
-	ctx->enc.converted = ctx->dec.converted = 0;
-	return 0;
-}
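Note `bits = key_len * 4` rather than `* 8`: for XTS, key_len covers the data key and the tweak key back to back, so each individual key is key_len / 2 bytes.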
-
-static inline void aesbs_encrypt_one(struct crypto_skcipher *tfm,
-				     const u8 *src, u8 *dst)
-{
-	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
-
-	AES_encrypt(src, dst, &ctx->enc);
-}
-
-static int aesbs_cbc_encrypt(struct skcipher_request *req)
-{
-	return crypto_cbc_encrypt_walk(req, aesbs_encrypt_one);
-}
-
-static inline void aesbs_decrypt_one(struct crypto_skcipher *tfm,
-				     const u8 *src, u8 *dst)
-{
-	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
-
-	AES_decrypt(src, dst, &ctx->dec.rk);
-}
-
-static int aesbs_cbc_decrypt(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct skcipher_walk walk;
-	unsigned int nbytes;
-	int err;
-
-	for (err = skcipher_walk_virt(&walk, req, false);
-	     (nbytes = walk.nbytes); err = skcipher_walk_done(&walk, nbytes)) {
-		u32 blocks = nbytes / AES_BLOCK_SIZE;
-		u8 *dst = walk.dst.virt.addr;
-		u8 *src = walk.src.virt.addr;
-		u8 *iv = walk.iv;
-
-		if (blocks >= 8) {
-			kernel_neon_begin();
-			bsaes_cbc_encrypt(src, dst, nbytes, &ctx->dec, iv);
-			kernel_neon_end();
-			nbytes %= AES_BLOCK_SIZE;
-			continue;
-		}
-
-		nbytes = crypto_cbc_decrypt_blocks(&walk, tfm,
-						   aesbs_decrypt_one);
-	}
-	return err;
-}
-
-static void inc_be128_ctr(__be32 ctr[], u32 addend)
-{
-	int i;
-
-	for (i = 3; i >= 0; i--, addend = 1) {
-		u32 n = be32_to_cpu(ctr[i]) + addend;
-
-		ctr[i] = cpu_to_be32(n);
-		if (n >= addend)
-			break;
-	}
-}
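The `n >= addend` comparison is the 32-bit overflow test: for example, with ctr[3] == 0xffffffff and addend == 2, n wraps to 1, which is less than 2, so a carry of 1 propagates into ctr[2]; as soon as a word absorbs its addend without wrapping, the loop stops.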
-
-static int aesbs_ctr_encrypt(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct skcipher_walk walk;
-	u32 blocks;
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, false);
-
-	while ((blocks = walk.nbytes / AES_BLOCK_SIZE)) {
-		u32 tail = walk.nbytes % AES_BLOCK_SIZE;
-		__be32 *ctr = (__be32 *)walk.iv;
-		u32 headroom = UINT_MAX - be32_to_cpu(ctr[3]);
-
-		/* avoid 32 bit counter overflow in the NEON code */
-		if (unlikely(headroom < blocks)) {
-			blocks = headroom + 1;
-			tail = walk.nbytes - blocks * AES_BLOCK_SIZE;
-		}
-		kernel_neon_begin();
-		bsaes_ctr32_encrypt_blocks(walk.src.virt.addr,
-					   walk.dst.virt.addr, blocks,
-					   &ctx->enc, walk.iv);
-		kernel_neon_end();
-		inc_be128_ctr(ctr, blocks);
-
-		err = skcipher_walk_done(&walk, tail);
-	}
-	if (walk.nbytes) {
-		u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
-		u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
-		u8 ks[AES_BLOCK_SIZE];
-
-		AES_encrypt(walk.iv, ks, &ctx->enc.rk);
-		if (tdst != tsrc)
-			memcpy(tdst, tsrc, walk.nbytes);
-		crypto_xor(tdst, ks, walk.nbytes);
-		err = skcipher_walk_done(&walk, 0);
-	}
-	return err;
-}
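The headroom clamp compensates for bsaes_ctr32_encrypt_blocks() incrementing, as its name suggests, only the low 32 bits of the counter: when fewer than `blocks` counter values remain before that word wraps, the walk is cut exactly at the wrap point, inc_be128_ctr() then carries into the upper words, and the leftover bytes are handled on the next pass.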
-
-static int aesbs_xts_encrypt(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct skcipher_walk walk;
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, false);
-
-	/* generate the initial tweak */
-	AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
-
-	while (walk.nbytes) {
-		kernel_neon_begin();
-		bsaes_xts_encrypt(walk.src.virt.addr, walk.dst.virt.addr,
-				  walk.nbytes, &ctx->enc, walk.iv);
-		kernel_neon_end();
-		err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
-	}
-	return err;
-}
-
-static int aesbs_xts_decrypt(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-	struct skcipher_walk walk;
-	int err;
-
-	err = skcipher_walk_virt(&walk, req, false);
-
-	/* generate the initial tweak */
-	AES_encrypt(walk.iv, walk.iv, &ctx->twkey);
-
-	while (walk.nbytes) {
-		kernel_neon_begin();
-		bsaes_xts_decrypt(walk.src.virt.addr, walk.dst.virt.addr,
-				  walk.nbytes, &ctx->dec, walk.iv);
-		kernel_neon_end();
-		err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
-	}
-	return err;
-}
-
-static struct skcipher_alg aesbs_algs[] = { {
-	.base = {
-		.cra_name		= "__cbc(aes)",
-		.cra_driver_name	= "__cbc-aes-neonbs",
-		.cra_priority		= 300,
-		.cra_flags		= CRYPTO_ALG_INTERNAL,
-		.cra_blocksize		= AES_BLOCK_SIZE,
-		.cra_ctxsize		= sizeof(struct aesbs_cbc_ctx),
-		.cra_alignmask		= 7,
-		.cra_module		= THIS_MODULE,
-	},
-	.min_keysize	= AES_MIN_KEY_SIZE,
-	.max_keysize	= AES_MAX_KEY_SIZE,
-	.ivsize		= AES_BLOCK_SIZE,
-	.setkey		= aesbs_cbc_set_key,
-	.encrypt	= aesbs_cbc_encrypt,
-	.decrypt	= aesbs_cbc_decrypt,
-}, {
-	.base = {
-		.cra_name		= "__ctr(aes)",
-		.cra_driver_name	= "__ctr-aes-neonbs",
-		.cra_priority		= 300,
-		.cra_flags		= CRYPTO_ALG_INTERNAL,
-		.cra_blocksize		= 1,
-		.cra_ctxsize		= sizeof(struct aesbs_ctr_ctx),
-		.cra_alignmask		= 7,
-		.cra_module		= THIS_MODULE,
-	},
-	.min_keysize	= AES_MIN_KEY_SIZE,
-	.max_keysize	= AES_MAX_KEY_SIZE,
-	.ivsize		= AES_BLOCK_SIZE,
-	.chunksize	= AES_BLOCK_SIZE,
-	.setkey		= aesbs_ctr_set_key,
-	.encrypt	= aesbs_ctr_encrypt,
-	.decrypt	= aesbs_ctr_encrypt,
-}, {
-	.base = {
-		.cra_name		= "__xts(aes)",
-		.cra_driver_name	= "__xts-aes-neonbs",
-		.cra_priority		= 300,
-		.cra_flags		= CRYPTO_ALG_INTERNAL,
-		.cra_blocksize		= AES_BLOCK_SIZE,
-		.cra_ctxsize		= sizeof(struct aesbs_xts_ctx),
-		.cra_alignmask		= 7,
-		.cra_module		= THIS_MODULE,
-	},
-	.min_keysize	= 2 * AES_MIN_KEY_SIZE,
-	.max_keysize	= 2 * AES_MAX_KEY_SIZE,
-	.ivsize		= AES_BLOCK_SIZE,
-	.setkey		= aesbs_xts_set_key,
-	.encrypt	= aesbs_xts_encrypt,
-	.decrypt	= aesbs_xts_decrypt,
-} };
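All three algorithms are registered as internal-only (the "__" name prefix plus CRYPTO_ALG_INTERNAL); aesbs_mod_init() below then exposes them through simd_skcipher_create_compat() wrappers, which fall back to an asynchronous helper whenever a request arrives in a context where the NEON unit cannot be used.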
-
-struct simd_skcipher_alg *aesbs_simd_algs[ARRAY_SIZE(aesbs_algs)];
-
-static void aesbs_mod_exit(void)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(aesbs_simd_algs) && aesbs_simd_algs[i]; i++)
-		simd_skcipher_free(aesbs_simd_algs[i]);
-
-	crypto_unregister_skciphers(aesbs_algs, ARRAY_SIZE(aesbs_algs));
-}
-
-static int __init aesbs_mod_init(void)
-{
-	struct simd_skcipher_alg *simd;
-	const char *basename;
-	const char *algname;
-	const char *drvname;
-	int err;
-	int i;
-
-	if (!cpu_has_neon())
-		return -ENODEV;
-
-	err = crypto_register_skciphers(aesbs_algs, ARRAY_SIZE(aesbs_algs));
-	if (err)
-		return err;
-
-	for (i = 0; i < ARRAY_SIZE(aesbs_algs); i++) {
-		algname = aesbs_algs[i].base.cra_name + 2;
-		drvname = aesbs_algs[i].base.cra_driver_name + 2;
-		basename = aesbs_algs[i].base.cra_driver_name;
-		simd = simd_skcipher_create_compat(algname, drvname, basename);
-		err = PTR_ERR(simd);
-		if (IS_ERR(simd))
-			goto unregister_simds;
-
-		aesbs_simd_algs[i] = simd;
-	}
-
-	return 0;
-
-unregister_simds:
-	aesbs_mod_exit();
-	return err;
-}
-
-module_init(aesbs_mod_init);
-module_exit(aesbs_mod_exit);
-
-MODULE_DESCRIPTION("Bit sliced AES in CBC/CTR/XTS modes using NEON");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL");

+ 0 - 2471
arch/arm/crypto/bsaes-armv7.pl

@@ -1,2471 +0,0 @@
-#!/usr/bin/env perl
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-#
-# Specific modes and adaptation for Linux kernel by Ard Biesheuvel
-# <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
-# granted.
-# ====================================================================
-
-# Bit-sliced AES for ARM NEON
-#
-# February 2012.
-#
-# This implementation is a direct adaptation of the bsaes-x86_64 module
-# for ARM NEON, except that this module is endian-neutral [in the sense
-# that it can be compiled for either endianness] by courtesy of vld1.8's
-# neutrality. The initial version doesn't implement an interface to
-# OpenSSL, only low-level primitives and unsupported entry points, just
-# enough to collect performance results, which for the Cortex-A8 core are:
-#
-# encrypt	19.5 cycles per byte processed with 128-bit key
-# decrypt	22.1 cycles per byte processed with 128-bit key
-# key conv.	440  cycles per 128-bit key/0.18 of 8x block
-#
-# Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts one in 19.7,
-# which is [much] worse than anticipated (for further details see
-# http://www.openssl.org/~appro/Snapdragon-S4.html).
-#
-# Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
-# manages in 20.0 cycles].
-#
-# When comparing to x86_64 results keep in mind that NEON unit is
-# [mostly] single-issue and thus can't [fully] benefit from
-# instruction-level parallelism. And when comparing to aes-armv4
-# results keep in mind key schedule conversion overhead (see
-# bsaes-x86_64.pl for further details)...
-#
-#						<appro@openssl.org>
-
-# April-August 2013
-#
-# Add CBC, CTR and XTS subroutines, adapt for kernel use.
-#
-#					<ard.biesheuvel@linaro.org>
-
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
-my @XMM=map("q$_",(0..15));
-
-{
-my ($key,$rounds,$const)=("r4","r5","r6");
-
-sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
-sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
-
-sub Sbox {
-# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
-my @b=@_[0..7];
-my @t=@_[8..11];
-my @s=@_[12..15];
-	&InBasisChange	(@b);
-	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
-	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
-}
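The network emitted here computes the ordinary AES S-box across eight bit-planes at once: a change of basis, inversion in GF(2^8), and a change of basis back; the affine constant 0x63 is folded into the key schedule instead (the .L63 value composed at the end of _bsaes_key_convert). For reference, the same function on a single byte, as a naive table-free C sketch (illustrative only, not kernel code):

	#include <stdint.h>

	static uint8_t gf256_mul(uint8_t a, uint8_t b)
	{
		uint8_t r = 0;

		while (b) {
			if (b & 1)
				r ^= a;
			/* reduce modulo the AES polynomial x^8 + x^4 + x^3 + x + 1 */
			a = (a << 1) ^ ((a & 0x80) ? 0x1b : 0);
			b >>= 1;
		}
		return r;
	}

	static uint8_t aes_sbox(uint8_t x)
	{
		uint8_t inv = 1, s;
		int e;

		/* x^254 == x^-1 in GF(2^8), and it conveniently maps 0 to 0 */
		for (e = 254; e; e >>= 1) {
			if (e & 1)
				inv = gf256_mul(inv, x);
			x = gf256_mul(x, x);
		}
		/* affine layer: the inverse XORed with four of its rotations, plus 0x63 */
		s = inv;
		s ^= (uint8_t)(inv << 1) | (inv >> 7);
		s ^= (uint8_t)(inv << 2) | (inv >> 6);
		s ^= (uint8_t)(inv << 3) | (inv >> 5);
		s ^= (uint8_t)(inv << 4) | (inv >> 4);
		return s ^ 0x63;
	}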
-
-sub InBasisChange {
-# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 
-my @b=@_[0..7];
-$code.=<<___;
-	veor	@b[2], @b[2], @b[1]
-	veor	@b[5], @b[5], @b[6]
-	veor	@b[3], @b[3], @b[0]
-	veor	@b[6], @b[6], @b[2]
-	veor	@b[5], @b[5], @b[0]
-
-	veor	@b[6], @b[6], @b[3]
-	veor	@b[3], @b[3], @b[7]
-	veor	@b[7], @b[7], @b[5]
-	veor	@b[3], @b[3], @b[4]
-	veor	@b[4], @b[4], @b[5]
-
-	veor	@b[2], @b[2], @b[7]
-	veor	@b[3], @b[3], @b[1]
-	veor	@b[1], @b[1], @b[5]
-___
-}
-
-sub OutBasisChange {
-# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
-my @b=@_[0..7];
-$code.=<<___;
-	veor	@b[0], @b[0], @b[6]
-	veor	@b[1], @b[1], @b[4]
-	veor	@b[4], @b[4], @b[6]
-	veor	@b[2], @b[2], @b[0]
-	veor	@b[6], @b[6], @b[1]
-
-	veor	@b[1], @b[1], @b[5]
-	veor	@b[5], @b[5], @b[3]
-	veor	@b[3], @b[3], @b[7]
-	veor	@b[7], @b[7], @b[5]
-	veor	@b[2], @b[2], @b[5]
-
-	veor	@b[4], @b[4], @b[7]
-___
-}
-
-sub InvSbox {
-# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
-my @b=@_[0..7];
-my @t=@_[8..11];
-my @s=@_[12..15];
-	&InvInBasisChange	(@b);
-	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
-	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
-}
-
-sub InvInBasisChange {		# OutBasisChange in reverse (with twist)
-my @b=@_[5,1,2,6,3,7,0,4];
-$code.=<<___
-	 veor	@b[1], @b[1], @b[7]
-	veor	@b[4], @b[4], @b[7]
-
-	veor	@b[7], @b[7], @b[5]
-	 veor	@b[1], @b[1], @b[3]
-	veor	@b[2], @b[2], @b[5]
-	veor	@b[3], @b[3], @b[7]
-
-	veor	@b[6], @b[6], @b[1]
-	veor	@b[2], @b[2], @b[0]
-	 veor	@b[5], @b[5], @b[3]
-	veor	@b[4], @b[4], @b[6]
-	veor	@b[0], @b[0], @b[6]
-	veor	@b[1], @b[1], @b[4]
-___
-}
-
-sub InvOutBasisChange {		# InBasisChange in reverse
-my @b=@_[2,5,7,3,6,1,0,4];
-$code.=<<___;
-	veor	@b[1], @b[1], @b[5]
-	veor	@b[2], @b[2], @b[7]
-
-	veor	@b[3], @b[3], @b[1]
-	veor	@b[4], @b[4], @b[5]
-	veor	@b[7], @b[7], @b[5]
-	veor	@b[3], @b[3], @b[4]
-	 veor 	@b[5], @b[5], @b[0]
-	veor	@b[3], @b[3], @b[7]
-	 veor	@b[6], @b[6], @b[2]
-	 veor	@b[2], @b[2], @b[1]
-	veor	@b[6], @b[6], @b[3]
-
-	veor	@b[3], @b[3], @b[0]
-	veor	@b[5], @b[5], @b[6]
-___
-}
-
-sub Mul_GF4 {
-#;*************************************************************
-#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
-#;*************************************************************
-my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
-$code.=<<___;
-	veor 	$t0, $y0, $y1
-	vand	$t0, $t0, $x0
-	veor	$x0, $x0, $x1
-	vand	$t1, $x1, $y0
-	vand	$x0, $x0, $y1
-	veor	$x1, $t1, $t0
-	veor	$x0, $x0, $t1
-___
-}
-
-sub Mul_GF4_N {				# not used, see next subroutine
-# multiply and scale by N
-my ($x0,$x1,$y0,$y1,$t0)=@_;
-$code.=<<___;
-	veor	$t0, $y0, $y1
-	vand	$t0, $t0, $x0
-	veor	$x0, $x0, $x1
-	vand	$x1, $x1, $y0
-	vand	$x0, $x0, $y1
-	veor	$x1, $x1, $x0
-	veor	$x0, $x0, $t0
-___
-}
-
-sub Mul_GF4_N_GF4 {
-# interleaved Mul_GF4_N and Mul_GF4
-my ($x0,$x1,$y0,$y1,$t0,
-    $x2,$x3,$y2,$y3,$t1)=@_;
-$code.=<<___;
-	veor	$t0, $y0, $y1
-	 veor 	$t1, $y2, $y3
-	vand	$t0, $t0, $x0
-	 vand	$t1, $t1, $x2
-	veor	$x0, $x0, $x1
-	 veor	$x2, $x2, $x3
-	vand	$x1, $x1, $y0
-	 vand	$x3, $x3, $y2
-	vand	$x0, $x0, $y1
-	 vand	$x2, $x2, $y3
-	veor	$x1, $x1, $x0
-	 veor	$x2, $x2, $x3
-	veor	$x0, $x0, $t0
-	 veor	$x3, $x3, $t1
-___
-}
-sub Mul_GF16_2 {
-my @x=@_[0..7];
-my @y=@_[8..11];
-my @t=@_[12..15];
-$code.=<<___;
-	veor	@t[0], @x[0], @x[2]
-	veor	@t[1], @x[1], @x[3]
-___
-	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2..3]);
-$code.=<<___;
-	veor	@y[0], @y[0], @y[2]
-	veor	@y[1], @y[1], @y[3]
-___
-	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
-			 @x[2], @x[3], @y[2], @y[3], @t[2]);
-$code.=<<___;
-	veor	@x[0], @x[0], @t[0]
-	veor	@x[2], @x[2], @t[0]
-	veor	@x[1], @x[1], @t[1]
-	veor	@x[3], @x[3], @t[1]
-
-	veor	@t[0], @x[4], @x[6]
-	veor	@t[1], @x[5], @x[7]
-___
-	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
-			 @x[6], @x[7], @y[2], @y[3], @t[2]);
-$code.=<<___;
-	veor	@y[0], @y[0], @y[2]
-	veor	@y[1], @y[1], @y[3]
-___
-	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[2..3]);
-$code.=<<___;
-	veor	@x[4], @x[4], @t[0]
-	veor	@x[6], @x[6], @t[0]
-	veor	@x[5], @x[5], @t[1]
-	veor	@x[7], @x[7], @t[1]
-___
-}
-sub Inv_GF256 {
-#;********************************************************************
-#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
-#;********************************************************************
-my @x=@_[0..7];
-my @t=@_[8..11];
-my @s=@_[12..15];
-# direct optimizations from hardware
-$code.=<<___;
-	veor	@t[3], @x[4], @x[6]
-	veor	@t[2], @x[5], @x[7]
-	veor	@t[1], @x[1], @x[3]
-	veor	@s[1], @x[7], @x[6]
-	 vmov	@t[0], @t[2]
-	veor	@s[0], @x[0], @x[2]
-
-	vorr	@t[2], @t[2], @t[1]
-	veor	@s[3], @t[3], @t[0]
-	vand	@s[2], @t[3], @s[0]
-	vorr	@t[3], @t[3], @s[0]
-	veor	@s[0], @s[0], @t[1]
-	vand	@t[0], @t[0], @t[1]
-	veor	@t[1], @x[3], @x[2]
-	vand	@s[3], @s[3], @s[0]
-	vand	@s[1], @s[1], @t[1]
-	veor	@t[1], @x[4], @x[5]
-	veor	@s[0], @x[1], @x[0]
-	veor	@t[3], @t[3], @s[1]
-	veor	@t[2], @t[2], @s[1]
-	vand	@s[1], @t[1], @s[0]
-	vorr	@t[1], @t[1], @s[0]
-	veor	@t[3], @t[3], @s[3]
-	veor	@t[0], @t[0], @s[1]
-	veor	@t[2], @t[2], @s[2]
-	veor	@t[1], @t[1], @s[3]
-	veor	@t[0], @t[0], @s[2]
-	vand	@s[0], @x[7], @x[3]
-	veor	@t[1], @t[1], @s[2]
-	vand	@s[1], @x[6], @x[2]
-	vand	@s[2], @x[5], @x[1]
-	vorr	@s[3], @x[4], @x[0]
-	veor	@t[3], @t[3], @s[0]
-	veor	@t[1], @t[1], @s[2]
-	veor	@t[0], @t[0], @s[3]
-	veor	@t[2], @t[2], @s[1]
-
-	@ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
-
-	@ new smaller inversion
-
-	vand	@s[2], @t[3], @t[1]
-	vmov	@s[0], @t[0]
-
-	veor	@s[1], @t[2], @s[2]
-	veor	@s[3], @t[0], @s[2]
-	veor	@s[2], @t[0], @s[2]	@ @s[2]=@s[3]
-
-	vbsl	@s[1], @t[1], @t[0]
-	vbsl	@s[3], @t[3], @t[2]
-	veor	@t[3], @t[3], @t[2]
-
-	vbsl	@s[0], @s[1], @s[2]
-	vbsl	@t[0], @s[2], @s[1]
-
-	vand	@s[2], @s[0], @s[3]
-	veor	@t[1], @t[1], @t[0]
-
-	veor	@s[2], @s[2], @t[3]
-___
-# output in s3, s2, s1, t1
-
-# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
-
-# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
-	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
-
-### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
-}
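Inv_GF256 follows the usual tower-field decomposition: GF(2^8) is treated as an extension of GF(2^4) (itself built over GF(2^2)), so the 256-element inversion collapses into GF(2^4) arithmetic assembled from the Mul_GF4/Mul_GF16_2 helpers plus the small "new smaller inversion" block, all of it expressed as AND/OR/XOR/vbsl so the computation stays branch- and table-free across the eight bit-planes.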
-
-# AES linear components
-
-sub ShiftRows {
-my @x=@_[0..7];
-my @t=@_[8..11];
-my $mask=pop;
-$code.=<<___;
-	vldmia	$key!, {@t[0]-@t[3]}
-	veor	@t[0], @t[0], @x[0]
-	veor	@t[1], @t[1], @x[1]
-	vtbl.8	`&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
-	vtbl.8	`&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
-	vldmia	$key!, {@t[0]}
-	veor	@t[2], @t[2], @x[2]
-	vtbl.8	`&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
-	vtbl.8	`&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
-	vldmia	$key!, {@t[1]}
-	veor	@t[3], @t[3], @x[3]
-	vtbl.8	`&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
-	vtbl.8	`&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
-	vldmia	$key!, {@t[2]}
-	vtbl.8	`&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
-	vtbl.8	`&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
-	vldmia	$key!, {@t[3]}
-	veor	@t[0], @t[0], @x[4]
-	veor	@t[1], @t[1], @x[5]
-	vtbl.8	`&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
-	vtbl.8	`&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
-	veor	@t[2], @t[2], @x[6]
-	vtbl.8	`&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
-	vtbl.8	`&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
-	veor	@t[3], @t[3], @x[7]
-	vtbl.8	`&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
-	vtbl.8	`&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
-	vtbl.8	`&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
-	vtbl.8	`&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
-___
-}
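ShiftRows here doubles as AddRoundKey: each state register is XORed with the next round key as it streams in via vldmia $key!, and the combined value is then permuted with vtbl.8 through the $mask argument (.LSR, or .LSRM0 on the first iteration, in the shipped code).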
-
-sub MixColumns {
-# modified to emit output in order suitable for feeding back to aesenc[last]
-my @x=@_[0..7];
-my @t=@_[8..15];
-my $inv=@_[16];	# optional
-$code.=<<___;
-	vext.8	@t[0], @x[0], @x[0], #12	@ x0 <<< 32
-	vext.8	@t[1], @x[1], @x[1], #12
-	 veor	@x[0], @x[0], @t[0]		@ x0 ^ (x0 <<< 32)
-	vext.8	@t[2], @x[2], @x[2], #12
-	 veor	@x[1], @x[1], @t[1]
-	vext.8	@t[3], @x[3], @x[3], #12
-	 veor	@x[2], @x[2], @t[2]
-	vext.8	@t[4], @x[4], @x[4], #12
-	 veor	@x[3], @x[3], @t[3]
-	vext.8	@t[5], @x[5], @x[5], #12
-	 veor	@x[4], @x[4], @t[4]
-	vext.8	@t[6], @x[6], @x[6], #12
-	 veor	@x[5], @x[5], @t[5]
-	vext.8	@t[7], @x[7], @x[7], #12
-	 veor	@x[6], @x[6], @t[6]
-
-	veor	@t[1], @t[1], @x[0]
-	 veor	@x[7], @x[7], @t[7]
-	 vext.8	@x[0], @x[0], @x[0], #8		@ (x0 ^ (x0 <<< 32)) <<< 64
-	veor	@t[2], @t[2], @x[1]
-	veor	@t[0], @t[0], @x[7]
-	veor	@t[1], @t[1], @x[7]
-	 vext.8	@x[1], @x[1], @x[1], #8
-	veor	@t[5], @t[5], @x[4]
-	 veor	@x[0], @x[0], @t[0]
-	veor	@t[6], @t[6], @x[5]
-	 veor	@x[1], @x[1], @t[1]
-	 vext.8	@t[0], @x[4], @x[4], #8
-	veor	@t[4], @t[4], @x[3]
-	 vext.8	@t[1], @x[5], @x[5], #8
-	veor	@t[7], @t[7], @x[6]
-	 vext.8	@x[4], @x[3], @x[3], #8
-	veor	@t[3], @t[3], @x[2]
-	 vext.8	@x[5], @x[7], @x[7], #8
-	veor	@t[4], @t[4], @x[7]
-	 vext.8	@x[3], @x[6], @x[6], #8
-	veor	@t[3], @t[3], @x[7]
-	 vext.8	@x[6], @x[2], @x[2], #8
-	veor	@x[7], @t[1], @t[5]
-___
-$code.=<<___ if (!$inv);
-	veor	@x[2], @t[0], @t[4]
-	veor	@x[4], @x[4], @t[3]
-	veor	@x[5], @x[5], @t[7]
-	veor	@x[3], @x[3], @t[6]
-	 @ vmov	@x[2], @t[0]
-	veor	@x[6], @x[6], @t[2]
-	 @ vmov	@x[7], @t[1]
-___
-$code.=<<___ if ($inv);
-	veor	@t[3], @t[3], @x[4]
-	veor	@x[5], @x[5], @t[7]
-	veor	@x[2], @x[3], @t[6]
-	veor	@x[3], @t[0], @t[4]
-	veor	@x[4], @x[6], @t[2]
-	vmov	@x[6], @t[3]
-	 @ vmov	@x[7], @t[1]
-___
-}
-
-sub InvMixColumns_orig {
-my @x=@_[0..7];
-my @t=@_[8..15];
-
-$code.=<<___;
-	@ multiplication by 0x0e
-	vext.8	@t[7], @x[7], @x[7], #12
-	vmov	@t[2], @x[2]
-	veor	@x[2], @x[2], @x[5]		@ 2 5
-	veor	@x[7], @x[7], @x[5]		@ 7 5
-	vext.8	@t[0], @x[0], @x[0], #12
-	vmov	@t[5], @x[5]
-	veor	@x[5], @x[5], @x[0]		@ 5 0		[1]
-	veor	@x[0], @x[0], @x[1]		@ 0 1
-	vext.8	@t[1], @x[1], @x[1], #12
-	veor	@x[1], @x[1], @x[2]		@ 1 25
-	veor	@x[0], @x[0], @x[6]		@ 01 6		[2]
-	vext.8	@t[3], @x[3], @x[3], #12
-	veor	@x[1], @x[1], @x[3]		@ 125 3		[4]
-	veor	@x[2], @x[2], @x[0]		@ 25 016	[3]
-	veor	@x[3], @x[3], @x[7]		@ 3 75
-	veor	@x[7], @x[7], @x[6]		@ 75 6		[0]
-	vext.8	@t[6], @x[6], @x[6], #12
-	vmov	@t[4], @x[4]
-	veor	@x[6], @x[6], @x[4]		@ 6 4
-	veor	@x[4], @x[4], @x[3]		@ 4 375		[6]
-	veor	@x[3], @x[3], @x[7]		@ 375 756=36
-	veor	@x[6], @x[6], @t[5]		@ 64 5		[7]
-	veor	@x[3], @x[3], @t[2]		@ 36 2
-	vext.8	@t[5], @t[5], @t[5], #12
-	veor	@x[3], @x[3], @t[4]		@ 362 4		[5]
-___
-					my @y = @x[7,5,0,2,1,3,4,6];
-$code.=<<___;
-	@ multiplication by 0x0b
-	veor	@y[1], @y[1], @y[0]
-	veor	@y[0], @y[0], @t[0]
-	vext.8	@t[2], @t[2], @t[2], #12
-	veor	@y[1], @y[1], @t[1]
-	veor	@y[0], @y[0], @t[5]
-	vext.8	@t[4], @t[4], @t[4], #12
-	veor	@y[1], @y[1], @t[6]
-	veor	@y[0], @y[0], @t[7]
-	veor	@t[7], @t[7], @t[6]		@ clobber t[7]
-
-	veor	@y[3], @y[3], @t[0]
-	 veor	@y[1], @y[1], @y[0]
-	vext.8	@t[0], @t[0], @t[0], #12
-	veor	@y[2], @y[2], @t[1]
-	veor	@y[4], @y[4], @t[1]
-	vext.8	@t[1], @t[1], @t[1], #12
-	veor	@y[2], @y[2], @t[2]
-	veor	@y[3], @y[3], @t[2]
-	veor	@y[5], @y[5], @t[2]
-	veor	@y[2], @y[2], @t[7]
-	vext.8	@t[2], @t[2], @t[2], #12
-	veor	@y[3], @y[3], @t[3]
-	veor	@y[6], @y[6], @t[3]
-	veor	@y[4], @y[4], @t[3]
-	veor	@y[7], @y[7], @t[4]
-	vext.8	@t[3], @t[3], @t[3], #12
-	veor	@y[5], @y[5], @t[4]
-	veor	@y[7], @y[7], @t[7]
-	veor	@t[7], @t[7], @t[5]		@ clobber t[7] even more
-	veor	@y[3], @y[3], @t[5]
-	veor	@y[4], @y[4], @t[4]
-
-	veor	@y[5], @y[5], @t[7]
-	vext.8	@t[4], @t[4], @t[4], #12
-	veor	@y[6], @y[6], @t[7]
-	veor	@y[4], @y[4], @t[7]
-
-	veor	@t[7], @t[7], @t[5]
-	vext.8	@t[5], @t[5], @t[5], #12
-
-	@ multiplication by 0x0d
-	veor	@y[4], @y[4], @y[7]
-	 veor	@t[7], @t[7], @t[6]		@ restore t[7]
-	veor	@y[7], @y[7], @t[4]
-	vext.8	@t[6], @t[6], @t[6], #12
-	veor	@y[2], @y[2], @t[0]
-	veor	@y[7], @y[7], @t[5]
-	vext.8	@t[7], @t[7], @t[7], #12
-	veor	@y[2], @y[2], @t[2]
-
-	veor	@y[3], @y[3], @y[1]
-	veor	@y[1], @y[1], @t[1]
-	veor	@y[0], @y[0], @t[0]
-	veor	@y[3], @y[3], @t[0]
-	veor	@y[1], @y[1], @t[5]
-	veor	@y[0], @y[0], @t[5]
-	vext.8	@t[0], @t[0], @t[0], #12
-	veor	@y[1], @y[1], @t[7]
-	veor	@y[0], @y[0], @t[6]
-	veor	@y[3], @y[3], @y[1]
-	veor	@y[4], @y[4], @t[1]
-	vext.8	@t[1], @t[1], @t[1], #12
-
-	veor	@y[7], @y[7], @t[7]
-	veor	@y[4], @y[4], @t[2]
-	veor	@y[5], @y[5], @t[2]
-	veor	@y[2], @y[2], @t[6]
-	veor	@t[6], @t[6], @t[3]		@ clobber t[6]
-	vext.8	@t[2], @t[2], @t[2], #12
-	veor	@y[4], @y[4], @y[7]
-	veor	@y[3], @y[3], @t[6]
-
-	veor	@y[6], @y[6], @t[6]
-	veor	@y[5], @y[5], @t[5]
-	vext.8	@t[5], @t[5], @t[5], #12
-	veor	@y[6], @y[6], @t[4]
-	vext.8	@t[4], @t[4], @t[4], #12
-	veor	@y[5], @y[5], @t[6]
-	veor	@y[6], @y[6], @t[7]
-	vext.8	@t[7], @t[7], @t[7], #12
-	veor	@t[6], @t[6], @t[3]		@ restore t[6]
-	vext.8	@t[3], @t[3], @t[3], #12
-
-	@ multiplication by 0x09
-	veor	@y[4], @y[4], @y[1]
-	veor	@t[1], @t[1], @y[1]		@ t[1]=y[1]
-	veor	@t[0], @t[0], @t[5]		@ clobber t[0]
-	vext.8	@t[6], @t[6], @t[6], #12
-	veor	@t[1], @t[1], @t[5]
-	veor	@y[3], @y[3], @t[0]
-	veor	@t[0], @t[0], @y[0]		@ t[0]=y[0]
-	veor	@t[1], @t[1], @t[6]
-	veor	@t[6], @t[6], @t[7]		@ clobber t[6]
-	veor	@y[4], @y[4], @t[1]
-	veor	@y[7], @y[7], @t[4]
-	veor	@y[6], @y[6], @t[3]
-	veor	@y[5], @y[5], @t[2]
-	veor	@t[4], @t[4], @y[4]		@ t[4]=y[4]
-	veor	@t[3], @t[3], @y[3]		@ t[3]=y[3]
-	veor	@t[5], @t[5], @y[5]		@ t[5]=y[5]
-	veor	@t[2], @t[2], @y[2]		@ t[2]=y[2]
-	veor	@t[3], @t[3], @t[7]
-	veor	@XMM[5], @t[5], @t[6]
-	veor	@XMM[6], @t[6], @y[6]		@ t[6]=y[6]
-	veor	@XMM[2], @t[2], @t[6]
-	veor	@XMM[7], @t[7], @y[7]		@ t[7]=y[7]
-
-	vmov	@XMM[0], @t[0]
-	vmov	@XMM[1], @t[1]
-	@ vmov	@XMM[2], @t[2]
-	vmov	@XMM[3], @t[3]
-	vmov	@XMM[4], @t[4]
-	@ vmov	@XMM[5], @t[5]
-	@ vmov	@XMM[6], @t[6]
-	@ vmov	@XMM[7], @t[7]
-___
-}
-
-sub InvMixColumns {
-my @x=@_[0..7];
-my @t=@_[8..15];
-
-# Thanks to Jussi Kivilinna for providing pointer to
-#
-# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
-# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
-# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
-# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
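The factorisation above is easy to verify outside the kernel. Below is a minimal stand-alone C check (gmul() is a local helper written for this sketch, not a kernel API): it multiplies the first rows of the two circulant factors over GF(2^8) with the AES reduction polynomial 0x11b and compares the result against the 0e/0b/0d/09 row of the inverse MixColumns matrix.

    #include <stdio.h>
    #include <stdint.h>

    /* GF(2^8) multiply, reducing by x^8 + x^4 + x^3 + x + 1 */
    static uint8_t gmul(uint8_t a, uint8_t b)
    {
    	uint8_t p = 0;

    	while (b) {
    		if (b & 1)
    			p ^= a;
    		a = (a << 1) ^ ((a & 0x80) ? 0x1b : 0);
    		b >>= 1;
    	}
    	return p;
    }

    int main(void)
    {
    	static const uint8_t mc[4]  = { 0x02, 0x03, 0x01, 0x01 };
    	static const uint8_t pre[4] = { 0x05, 0x00, 0x04, 0x00 };
    	static const uint8_t inv[4] = { 0x0e, 0x0b, 0x0d, 0x09 };
    	int j, k;

    	/* all three matrices are circulant, so one row suffices */
    	for (j = 0; j < 4; j++) {
    		uint8_t acc = 0;

    		for (k = 0; k < 4; k++)
    			acc ^= gmul(mc[k], pre[(j - k + 4) & 3]);
    		if (acc != inv[j])
    			return 1;
    	}
    	puts("factorisation holds");
    	return 0;
    }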
-
-$code.=<<___;
-	@ multiplication by 0x05-0x00-0x04-0x00
-	vext.8	@t[0], @x[0], @x[0], #8
-	vext.8	@t[6], @x[6], @x[6], #8
-	vext.8	@t[7], @x[7], @x[7], #8
-	veor	@t[0], @t[0], @x[0]
-	vext.8	@t[1], @x[1], @x[1], #8
-	veor	@t[6], @t[6], @x[6]
-	vext.8	@t[2], @x[2], @x[2], #8
-	veor	@t[7], @t[7], @x[7]
-	vext.8	@t[3], @x[3], @x[3], #8
-	veor	@t[1], @t[1], @x[1]
-	vext.8	@t[4], @x[4], @x[4], #8
-	veor	@t[2], @t[2], @x[2]
-	vext.8	@t[5], @x[5], @x[5], #8
-	veor	@t[3], @t[3], @x[3]
-	veor	@t[4], @t[4], @x[4]
-	veor	@t[5], @t[5], @x[5]
-
-	 veor	@x[0], @x[0], @t[6]
-	 veor	@x[1], @x[1], @t[6]
-	 veor	@x[2], @x[2], @t[0]
-	 veor	@x[4], @x[4], @t[2]
-	 veor	@x[3], @x[3], @t[1]
-	 veor	@x[1], @x[1], @t[7]
-	 veor	@x[2], @x[2], @t[7]
-	 veor	@x[4], @x[4], @t[6]
-	 veor	@x[5], @x[5], @t[3]
-	 veor	@x[3], @x[3], @t[6]
-	 veor	@x[6], @x[6], @t[4]
-	 veor	@x[4], @x[4], @t[7]
-	 veor	@x[5], @x[5], @t[7]
-	 veor	@x[7], @x[7], @t[5]
-___
-	&MixColumns	(@x,@t,1);	# flipped 2<->3 and 4<->6
-}
-
-sub swapmove {
-my ($a,$b,$n,$mask,$t)=@_;
-$code.=<<___;
-	vshr.u64	$t, $b, #$n
-	veor		$t, $t, $a
-	vand		$t, $t, $mask
-	veor		$a, $a, $t
-	vshl.u64	$t, $t, #$n
-	veor		$b, $b, $t
-___
-}
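For reference, swapmove is the classic delta-swap: it exchanges the bits of one register selected by a mask with the bits n positions above them in another register, and three passes with the masks 0x55, 0x33 and 0x0f (shifts 1, 2, 4) transpose an 8x8 bit matrix, which is what the bitslice() helper below uses it for. A scalar C model of the vshr/veor/vand/veor/vshl/veor sequence, illustrative only:

    #include <stdint.h>

    /*
     * Exchange the bits of *a selected by mask with the bits of *b
     * sitting n positions above them.
     */
    static void swapmove(uint64_t *a, uint64_t *b, int n, uint64_t mask)
    {
    	uint64_t t = ((*b >> n) ^ *a) & mask;

    	*a ^= t;
    	*b ^= t << n;
    }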
-sub swapmove2x {
-my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
-$code.=<<___;
-	vshr.u64	$t0, $b0, #$n
-	 vshr.u64	$t1, $b1, #$n
-	veor		$t0, $t0, $a0
-	 veor		$t1, $t1, $a1
-	vand		$t0, $t0, $mask
-	 vand		$t1, $t1, $mask
-	veor		$a0, $a0, $t0
-	vshl.u64	$t0, $t0, #$n
-	 veor		$a1, $a1, $t1
-	 vshl.u64	$t1, $t1, #$n
-	veor		$b0, $b0, $t0
-	 veor		$b1, $b1, $t1
-___
-}
-
-sub bitslice {
-my @x=reverse(@_[0..7]);
-my ($t0,$t1,$t2,$t3)=@_[8..11];
-$code.=<<___;
-	vmov.i8	$t0,#0x55			@ compose .LBS0
-	vmov.i8	$t1,#0x33			@ compose .LBS1
-___
-	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
-	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
-$code.=<<___;
-	vmov.i8	$t0,#0x0f			@ compose .LBS2
-___
-	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
-	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
-
-	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
-	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
-}
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-
-# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
-# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
-# define VFP_ABI_FRAME	0x40
-#else
-# define VFP_ABI_PUSH
-# define VFP_ABI_POP
-# define VFP_ABI_FRAME	0
-# define BSAES_ASM_EXTENDED_KEY
-# define XTS_CHAIN_TWEAK
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-#endif
-
-#ifdef __thumb__
-# define adrl adr
-#endif
-
-#if __ARM_MAX_ARCH__>=7
-.arch	armv7-a
-.fpu	neon
-
-.text
-.syntax	unified 	@ ARMv7-capable assembler is expected to handle this
-#ifdef __thumb2__
-.thumb
-#else
-.code   32
-#endif
-
-.type	_bsaes_decrypt8,%function
-.align	4
-_bsaes_decrypt8:
-	adr	$const,_bsaes_decrypt8
-	vldmia	$key!, {@XMM[9]}		@ round 0 key
-	add	$const,$const,#.LM0ISR-_bsaes_decrypt8
-
-	vldmia	$const!, {@XMM[8]}		@ .LM0ISR
-	veor	@XMM[10], @XMM[0], @XMM[9]	@ xor with round0 key
-	veor	@XMM[11], @XMM[1], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-	veor	@XMM[12], @XMM[2], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-	veor	@XMM[13], @XMM[3], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
-	veor	@XMM[14], @XMM[4], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
-	veor	@XMM[15], @XMM[5], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
-	veor	@XMM[10], @XMM[6], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
-	veor	@XMM[11], @XMM[7], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-	 vtbl.8	`&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-___
-	&bitslice	(@XMM[0..7, 8..11]);
-$code.=<<___;
-	sub	$rounds,$rounds,#1
-	b	.Ldec_sbox
-.align	4
-.Ldec_loop:
-___
-	&ShiftRows	(@XMM[0..7, 8..12]);
-$code.=".Ldec_sbox:\n";
-	&InvSbox	(@XMM[0..7, 8..15]);
-$code.=<<___;
-	subs	$rounds,$rounds,#1
-	bcc	.Ldec_done
-___
-	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
-$code.=<<___;
-	vldmia	$const, {@XMM[12]}		@ .LISR
-	ite	eq				@ Thumb2 thing, sanity check in ARM
-	addeq	$const,$const,#0x10
-	bne	.Ldec_loop
-	vldmia	$const, {@XMM[12]}		@ .LISRM0
-	b	.Ldec_loop
-.align	4
-.Ldec_done:
-___
-	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
-$code.=<<___;
-	vldmia	$key, {@XMM[8]}			@ last round key
-	veor	@XMM[6], @XMM[6], @XMM[8]
-	veor	@XMM[4], @XMM[4], @XMM[8]
-	veor	@XMM[2], @XMM[2], @XMM[8]
-	veor	@XMM[7], @XMM[7], @XMM[8]
-	veor	@XMM[3], @XMM[3], @XMM[8]
-	veor	@XMM[5], @XMM[5], @XMM[8]
-	veor	@XMM[0], @XMM[0], @XMM[8]
-	veor	@XMM[1], @XMM[1], @XMM[8]
-	bx	lr
-.size	_bsaes_decrypt8,.-_bsaes_decrypt8
-
-.type	_bsaes_const,%object
-.align	6
-_bsaes_const:
-.LM0ISR:	@ InvShiftRows constants
-	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
-.LISR:
-	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
-.LISRM0:
-	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
-.LM0SR:		@ ShiftRows constants
-	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
-.LSR:
-	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
-.LSRM0:
-	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
-.LM0:
-	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
-.LREVM0SR:
-	.quad	0x090d01050c000408, 0x03070b0f060a0e02
-.asciz	"Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
-.align	6
-.size	_bsaes_const,.-_bsaes_const
-
-.type	_bsaes_encrypt8,%function
-.align	4
-_bsaes_encrypt8:
-	adr	$const,_bsaes_encrypt8
-	vldmia	$key!, {@XMM[9]}		@ round 0 key
-	sub	$const,$const,#_bsaes_encrypt8-.LM0SR
-
-	vldmia	$const!, {@XMM[8]}		@ .LM0SR
-_bsaes_encrypt8_alt:
-	veor	@XMM[10], @XMM[0], @XMM[9]	@ xor with round0 key
-	veor	@XMM[11], @XMM[1], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-	veor	@XMM[12], @XMM[2], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-	veor	@XMM[13], @XMM[3], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
-	veor	@XMM[14], @XMM[4], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
-	veor	@XMM[15], @XMM[5], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
-	veor	@XMM[10], @XMM[6], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
-	veor	@XMM[11], @XMM[7], @XMM[9]
-	 vtbl.8	`&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
-	 vtbl.8	`&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
-	 vtbl.8	`&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
-_bsaes_encrypt8_bitslice:
-___
-	&bitslice	(@XMM[0..7, 8..11]);
-$code.=<<___;
-	sub	$rounds,$rounds,#1
-	b	.Lenc_sbox
-.align	4
-.Lenc_loop:
-___
-	&ShiftRows	(@XMM[0..7, 8..12]);
-$code.=".Lenc_sbox:\n";
-	&Sbox		(@XMM[0..7, 8..15]);
-$code.=<<___;
-	subs	$rounds,$rounds,#1
-	bcc	.Lenc_done
-___
-	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
-$code.=<<___;
-	vldmia	$const, {@XMM[12]}		@ .LSR
-	ite	eq				@ Thumb2 thing, sanity check in ARM
-	addeq	$const,$const,#0x10
-	bne	.Lenc_loop
-	vldmia	$const, {@XMM[12]}		@ .LSRM0
-	b	.Lenc_loop
-.align	4
-.Lenc_done:
-___
-	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
-	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
-$code.=<<___;
-	vldmia	$key, {@XMM[8]}			@ last round key
-	veor	@XMM[4], @XMM[4], @XMM[8]
-	veor	@XMM[6], @XMM[6], @XMM[8]
-	veor	@XMM[3], @XMM[3], @XMM[8]
-	veor	@XMM[7], @XMM[7], @XMM[8]
-	veor	@XMM[2], @XMM[2], @XMM[8]
-	veor	@XMM[5], @XMM[5], @XMM[8]
-	veor	@XMM[0], @XMM[0], @XMM[8]
-	veor	@XMM[1], @XMM[1], @XMM[8]
-	bx	lr
-.size	_bsaes_encrypt8,.-_bsaes_encrypt8
-___
-}
-{
-my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
-
-sub bitslice_key {
-my @x=reverse(@_[0..7]);
-my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
-
-	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
-$code.=<<___;
-	@ &swapmove(@x[2,3],1,$t0,$t2,$t3);
-	vmov	@x[2], @x[0]
-	vmov	@x[3], @x[1]
-___
-	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
-
-	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
-$code.=<<___;
-	@ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
-	vmov	@x[4], @x[0]
-	vmov	@x[6], @x[2]
-	vmov	@x[5], @x[1]
-	vmov	@x[7], @x[3]
-___
-	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
-	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
-}
-
-$code.=<<___;
-.type	_bsaes_key_convert,%function
-.align	4
-_bsaes_key_convert:
-	adr	$const,_bsaes_key_convert
-	vld1.8	{@XMM[7]},  [$inp]!		@ load round 0 key
-	sub	$const,$const,#_bsaes_key_convert-.LM0
-	vld1.8	{@XMM[15]}, [$inp]!		@ load round 1 key
-
-	vmov.i8	@XMM[8],  #0x01			@ bit masks
-	vmov.i8	@XMM[9],  #0x02
-	vmov.i8	@XMM[10], #0x04
-	vmov.i8	@XMM[11], #0x08
-	vmov.i8	@XMM[12], #0x10
-	vmov.i8	@XMM[13], #0x20
-	vldmia	$const, {@XMM[14]}		@ .LM0
-
-#ifdef __ARMEL__
-	vrev32.8	@XMM[7],  @XMM[7]
-	vrev32.8	@XMM[15], @XMM[15]
-#endif
-	sub	$rounds,$rounds,#1
-	vstmia	$out!, {@XMM[7]}		@ save round 0 key
-	b	.Lkey_loop
-
-.align	4
-.Lkey_loop:
-	vtbl.8	`&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
-	vtbl.8	`&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
-	vmov.i8	@XMM[6],  #0x40
-	vmov.i8	@XMM[15], #0x80
-
-	vtst.8	@XMM[0], @XMM[7], @XMM[8]
-	vtst.8	@XMM[1], @XMM[7], @XMM[9]
-	vtst.8	@XMM[2], @XMM[7], @XMM[10]
-	vtst.8	@XMM[3], @XMM[7], @XMM[11]
-	vtst.8	@XMM[4], @XMM[7], @XMM[12]
-	vtst.8	@XMM[5], @XMM[7], @XMM[13]
-	vtst.8	@XMM[6], @XMM[7], @XMM[6]
-	vtst.8	@XMM[7], @XMM[7], @XMM[15]
-	vld1.8	{@XMM[15]}, [$inp]!		@ load next round key
-	vmvn	@XMM[0], @XMM[0]		@ "pnot"
-	vmvn	@XMM[1], @XMM[1]
-	vmvn	@XMM[5], @XMM[5]
-	vmvn	@XMM[6], @XMM[6]
-#ifdef __ARMEL__
-	vrev32.8	@XMM[15], @XMM[15]
-#endif
-	subs	$rounds,$rounds,#1
-	vstmia	$out!,{@XMM[0]-@XMM[7]}		@ write bit-sliced round key
-	bne	.Lkey_loop
-
-	vmov.i8	@XMM[7],#0x63			@ compose .L63
-	@ don't save last round key
-	bx	lr
-.size	_bsaes_key_convert,.-_bsaes_key_convert
-___
-}
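What _bsaes_key_convert does to each round key is easiest to see in scalar form: every key byte is expanded into eight 0x00/0xff mask bytes (the vtst instructions against the 0x01..0x80 bit masks), and the slices selected by 0x63 are then inverted (the vmvn instructions on slices 0, 1, 5 and 6), which appears to fold the S-box's 0x63 affine constant into the key schedule, matching the .L63 fix-up the callers apply. A rough per-byte model, not kernel code:

    #include <stdint.h>

    /* expand one round-key byte into eight bit-slice mask bytes */
    static void bitslice_key_byte(uint8_t key, uint8_t slice[8])
    {
    	int bit;

    	for (bit = 0; bit < 8; bit++) {
    		uint8_t m = (key & (1u << bit)) ? 0xff : 0x00;

    		/* invert slices 0, 1, 5, 6: 0x63 = 0b01100011 */
    		if (0x63 & (1u << bit))
    			m = ~m;
    		slice[bit] = m;
    	}
    }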
-
-if (0) {		# the following four functions are an unsupported interface
-			# used for benchmarking...
-$code.=<<___;
-.globl	bsaes_enc_key_convert
-.type	bsaes_enc_key_convert,%function
-.align	4
-bsaes_enc_key_convert:
-	stmdb	sp!,{r4-r6,lr}
-	vstmdb	sp!,{d8-d15}		@ ABI specification says so
-
-	ldr	r5,[$inp,#240]			@ pass rounds
-	mov	r4,$inp				@ pass key
-	mov	r12,$out			@ pass key schedule
-	bl	_bsaes_key_convert
-	veor	@XMM[7],@XMM[7],@XMM[15]	@ fix up last round key
-	vstmia	r12, {@XMM[7]}			@ save last round key
-
-	vldmia	sp!,{d8-d15}
-	ldmia	sp!,{r4-r6,pc}
-.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert
-
-.globl	bsaes_encrypt_128
-.type	bsaes_encrypt_128,%function
-.align	4
-bsaes_encrypt_128:
-	stmdb	sp!,{r4-r6,lr}
-	vstmdb	sp!,{d8-d15}		@ ABI specification says so
-.Lenc128_loop:
-	vld1.8	{@XMM[0]-@XMM[1]}, [$inp]!	@ load input
-	vld1.8	{@XMM[2]-@XMM[3]}, [$inp]!
-	mov	r4,$key				@ pass the key
-	vld1.8	{@XMM[4]-@XMM[5]}, [$inp]!
-	mov	r5,#10				@ pass rounds
-	vld1.8	{@XMM[6]-@XMM[7]}, [$inp]!
-
-	bl	_bsaes_encrypt8
-
-	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
-	vst1.8	{@XMM[4]}, [$out]!
-	vst1.8	{@XMM[6]}, [$out]!
-	vst1.8	{@XMM[3]}, [$out]!
-	vst1.8	{@XMM[7]}, [$out]!
-	vst1.8	{@XMM[2]}, [$out]!
-	subs	$len,$len,#0x80
-	vst1.8	{@XMM[5]}, [$out]!
-	bhi	.Lenc128_loop
-
-	vldmia	sp!,{d8-d15}
-	ldmia	sp!,{r4-r6,pc}
-.size	bsaes_encrypt_128,.-bsaes_encrypt_128
-
-.globl	bsaes_dec_key_convert
-.type	bsaes_dec_key_convert,%function
-.align	4
-bsaes_dec_key_convert:
-	stmdb	sp!,{r4-r6,lr}
-	vstmdb	sp!,{d8-d15}		@ ABI specification says so
-
-	ldr	r5,[$inp,#240]			@ pass rounds
-	mov	r4,$inp				@ pass key
-	mov	r12,$out			@ pass key schedule
-	bl	_bsaes_key_convert
-	vldmia	$out, {@XMM[6]}
-	vstmia	r12,  {@XMM[15]}		@ save last round key
-	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
-	vstmia	$out, {@XMM[7]}
-
-	vldmia	sp!,{d8-d15}
-	ldmia	sp!,{r4-r6,pc}
-.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert
-
-.globl	bsaes_decrypt_128
-.type	bsaes_decrypt_128,%function
-.align	4
-bsaes_decrypt_128:
-	stmdb	sp!,{r4-r6,lr}
-	vstmdb	sp!,{d8-d15}		@ ABI specification says so
-.Ldec128_loop:
-	vld1.8	{@XMM[0]-@XMM[1]}, [$inp]!	@ load input
-	vld1.8	{@XMM[2]-@XMM[3]}, [$inp]!
-	mov	r4,$key				@ pass the key
-	vld1.8	{@XMM[4]-@XMM[5]}, [$inp]!
-	mov	r5,#10				@ pass rounds
-	vld1.8	{@XMM[6]-@XMM[7]}, [$inp]!
-
-	bl	_bsaes_decrypt8
-
-	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
-	vst1.8	{@XMM[6]}, [$out]!
-	vst1.8	{@XMM[4]}, [$out]!
-	vst1.8	{@XMM[2]}, [$out]!
-	vst1.8	{@XMM[7]}, [$out]!
-	vst1.8	{@XMM[3]}, [$out]!
-	subs	$len,$len,#0x80
-	vst1.8	{@XMM[5]}, [$out]!
-	bhi	.Ldec128_loop
-
-	vldmia	sp!,{d8-d15}
-	ldmia	sp!,{r4-r6,pc}
-.size	bsaes_decrypt_128,.-bsaes_decrypt_128
-___
-}
-{
-my ($inp,$out,$len,$key, $ivp,$fp,$rounds)=map("r$_",(0..3,8..10));
-my ($keysched)=("sp");
-
-$code.=<<___;
-.extern AES_cbc_encrypt
-.extern AES_decrypt
-
-.global	bsaes_cbc_encrypt
-.type	bsaes_cbc_encrypt,%function
-.align	5
-bsaes_cbc_encrypt:
-#ifndef	__KERNEL__
-	cmp	$len, #128
-#ifndef	__thumb__
-	blo	AES_cbc_encrypt
-#else
-	bhs	1f
-	b	AES_cbc_encrypt
-1:
-#endif
-#endif
-
-	@ it is up to the caller to make sure we are called with enc == 0
-
-	mov	ip, sp
-	stmdb	sp!, {r4-r10, lr}
-	VFP_ABI_PUSH
-	ldr	$ivp, [ip]			@ IV is 1st arg on the stack
-	mov	$len, $len, lsr#4		@ len in 16 byte blocks
-	sub	sp, #0x10			@ scratch space to carry over the IV
-	mov	$fp, sp				@ save sp
-
-	ldr	$rounds, [$key, #240]		@ get # of rounds
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	@ allocate the key schedule on the stack
-	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
-	add	r12, #`128-32`			@ size of bit-sliced key schedule
-
-	@ populate the key schedule
-	mov	r4, $key			@ pass key
-	mov	r5, $rounds			@ pass # of rounds
-	mov	sp, r12				@ sp is $keysched
-	bl	_bsaes_key_convert
-	vldmia	$keysched, {@XMM[6]}
-	vstmia	r12,  {@XMM[15]}		@ save last round key
-	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
-	vstmia	$keysched, {@XMM[7]}
-#else
-	ldr	r12, [$key, #244]
-	eors	r12, #1
-	beq	0f
-
-	@ populate the key schedule
-	str	r12, [$key, #244]
-	mov	r4, $key			@ pass key
-	mov	r5, $rounds			@ pass # of rounds
-	add	r12, $key, #248			@ pass key schedule
-	bl	_bsaes_key_convert
-	add	r4, $key, #248
-	vldmia	r4, {@XMM[6]}
-	vstmia	r12, {@XMM[15]}			@ save last round key
-	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
-	vstmia	r4, {@XMM[7]}
-
-.align	2
-0:
-#endif
-
-	vld1.8	{@XMM[15]}, [$ivp]		@ load IV
-	b	.Lcbc_dec_loop
-
-.align	4
-.Lcbc_dec_loop:
-	subs	$len, $len, #0x8
-	bmi	.Lcbc_dec_loop_finish
-
-	vld1.8	{@XMM[0]-@XMM[1]}, [$inp]!	@ load input
-	vld1.8	{@XMM[2]-@XMM[3]}, [$inp]!
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	mov	r4, $keysched			@ pass the key
-#else
-	add	r4, $key, #248
-#endif
-	vld1.8	{@XMM[4]-@XMM[5]}, [$inp]!
-	mov	r5, $rounds
-	vld1.8	{@XMM[6]-@XMM[7]}, [$inp]
-	sub	$inp, $inp, #0x60
-	vstmia	$fp, {@XMM[15]}			@ put aside IV
-
-	bl	_bsaes_decrypt8
-
-	vldmia	$fp, {@XMM[14]}			@ reload IV
-	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
-	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
-	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
-	veor	@XMM[1], @XMM[1], @XMM[8]
-	veor	@XMM[6], @XMM[6], @XMM[9]
-	vld1.8	{@XMM[12]-@XMM[13]}, [$inp]!
-	veor	@XMM[4], @XMM[4], @XMM[10]
-	veor	@XMM[2], @XMM[2], @XMM[11]
-	vld1.8	{@XMM[14]-@XMM[15]}, [$inp]!
-	veor	@XMM[7], @XMM[7], @XMM[12]
-	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
-	veor	@XMM[3], @XMM[3], @XMM[13]
-	vst1.8	{@XMM[6]}, [$out]!
-	veor	@XMM[5], @XMM[5], @XMM[14]
-	vst1.8	{@XMM[4]}, [$out]!
-	vst1.8	{@XMM[2]}, [$out]!
-	vst1.8	{@XMM[7]}, [$out]!
-	vst1.8	{@XMM[3]}, [$out]!
-	vst1.8	{@XMM[5]}, [$out]!
-
-	b	.Lcbc_dec_loop
-
-.Lcbc_dec_loop_finish:
-	adds	$len, $len, #8
-	beq	.Lcbc_dec_done
-
-	vld1.8	{@XMM[0]}, [$inp]!		@ load input
-	cmp	$len, #2
-	blo	.Lcbc_dec_one
-	vld1.8	{@XMM[1]}, [$inp]!
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	mov	r4, $keysched			@ pass the key
-#else
-	add	r4, $key, #248
-#endif
-	mov	r5, $rounds
-	vstmia	$fp, {@XMM[15]}			@ put aside IV
-	beq	.Lcbc_dec_two
-	vld1.8	{@XMM[2]}, [$inp]!
-	cmp	$len, #4
-	blo	.Lcbc_dec_three
-	vld1.8	{@XMM[3]}, [$inp]!
-	beq	.Lcbc_dec_four
-	vld1.8	{@XMM[4]}, [$inp]!
-	cmp	$len, #6
-	blo	.Lcbc_dec_five
-	vld1.8	{@XMM[5]}, [$inp]!
-	beq	.Lcbc_dec_six
-	vld1.8	{@XMM[6]}, [$inp]!
-	sub	$inp, $inp, #0x70
-
-	bl	_bsaes_decrypt8
-
-	vldmia	$fp, {@XMM[14]}			@ reload IV
-	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
-	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
-	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
-	veor	@XMM[1], @XMM[1], @XMM[8]
-	veor	@XMM[6], @XMM[6], @XMM[9]
-	vld1.8	{@XMM[12]-@XMM[13]}, [$inp]!
-	veor	@XMM[4], @XMM[4], @XMM[10]
-	veor	@XMM[2], @XMM[2], @XMM[11]
-	vld1.8	{@XMM[15]}, [$inp]!
-	veor	@XMM[7], @XMM[7], @XMM[12]
-	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
-	veor	@XMM[3], @XMM[3], @XMM[13]
-	vst1.8	{@XMM[6]}, [$out]!
-	vst1.8	{@XMM[4]}, [$out]!
-	vst1.8	{@XMM[2]}, [$out]!
-	vst1.8	{@XMM[7]}, [$out]!
-	vst1.8	{@XMM[3]}, [$out]!
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_six:
-	sub	$inp, $inp, #0x60
-	bl	_bsaes_decrypt8
-	vldmia	$fp,{@XMM[14]}			@ reload IV
-	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
-	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
-	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
-	veor	@XMM[1], @XMM[1], @XMM[8]
-	veor	@XMM[6], @XMM[6], @XMM[9]
-	vld1.8	{@XMM[12]}, [$inp]!
-	veor	@XMM[4], @XMM[4], @XMM[10]
-	veor	@XMM[2], @XMM[2], @XMM[11]
-	vld1.8	{@XMM[15]}, [$inp]!
-	veor	@XMM[7], @XMM[7], @XMM[12]
-	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
-	vst1.8	{@XMM[6]}, [$out]!
-	vst1.8	{@XMM[4]}, [$out]!
-	vst1.8	{@XMM[2]}, [$out]!
-	vst1.8	{@XMM[7]}, [$out]!
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_five:
-	sub	$inp, $inp, #0x50
-	bl	_bsaes_decrypt8
-	vldmia	$fp, {@XMM[14]}			@ reload IV
-	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
-	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
-	vld1.8	{@XMM[10]-@XMM[11]}, [$inp]!
-	veor	@XMM[1], @XMM[1], @XMM[8]
-	veor	@XMM[6], @XMM[6], @XMM[9]
-	vld1.8	{@XMM[15]}, [$inp]!
-	veor	@XMM[4], @XMM[4], @XMM[10]
-	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
-	veor	@XMM[2], @XMM[2], @XMM[11]
-	vst1.8	{@XMM[6]}, [$out]!
-	vst1.8	{@XMM[4]}, [$out]!
-	vst1.8	{@XMM[2]}, [$out]!
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_four:
-	sub	$inp, $inp, #0x40
-	bl	_bsaes_decrypt8
-	vldmia	$fp, {@XMM[14]}			@ reload IV
-	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
-	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
-	vld1.8	{@XMM[10]}, [$inp]!
-	veor	@XMM[1], @XMM[1], @XMM[8]
-	veor	@XMM[6], @XMM[6], @XMM[9]
-	vld1.8	{@XMM[15]}, [$inp]!
-	veor	@XMM[4], @XMM[4], @XMM[10]
-	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
-	vst1.8	{@XMM[6]}, [$out]!
-	vst1.8	{@XMM[4]}, [$out]!
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_three:
-	sub	$inp, $inp, #0x30
-	bl	_bsaes_decrypt8
-	vldmia	$fp, {@XMM[14]}			@ reload IV
-	vld1.8	{@XMM[8]-@XMM[9]}, [$inp]!	@ reload input
-	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
-	vld1.8	{@XMM[15]}, [$inp]!
-	veor	@XMM[1], @XMM[1], @XMM[8]
-	veor	@XMM[6], @XMM[6], @XMM[9]
-	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
-	vst1.8	{@XMM[6]}, [$out]!
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_two:
-	sub	$inp, $inp, #0x20
-	bl	_bsaes_decrypt8
-	vldmia	$fp, {@XMM[14]}			@ reload IV
-	vld1.8	{@XMM[8]}, [$inp]!		@ reload input
-	veor	@XMM[0], @XMM[0], @XMM[14]	@ ^= IV
-	vld1.8	{@XMM[15]}, [$inp]!		@ reload input
-	veor	@XMM[1], @XMM[1], @XMM[8]
-	vst1.8	{@XMM[0]-@XMM[1]}, [$out]!	@ write output
-	b	.Lcbc_dec_done
-.align	4
-.Lcbc_dec_one:
-	sub	$inp, $inp, #0x10
-	mov	$rounds, $out			@ save original out pointer
-	mov	$out, $fp			@ use the iv scratch space as out buffer
-	mov	r2, $key
-	vmov	@XMM[4],@XMM[15]		@ just in case ensure that IV
-	vmov	@XMM[5],@XMM[0]			@ and input are preserved
-	bl	AES_decrypt
-	vld1.8	{@XMM[0]}, [$fp,:64]		@ load result
-	veor	@XMM[0], @XMM[0], @XMM[4]	@ ^= IV
-	vmov	@XMM[15], @XMM[5]		@ @XMM[5] holds input
-	vst1.8	{@XMM[0]}, [$rounds]		@ write output
-
-.Lcbc_dec_done:
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-.Lcbc_dec_bzero:				@ wipe key schedule [if any]
-	vstmia		$keysched!, {q0-q1}
-	cmp		$keysched, $fp
-	bne		.Lcbc_dec_bzero
-#endif
-
-	mov	sp, $fp
-	add	sp, #0x10			@ add sp,$fp,#0x10 is no good for thumb
-	vst1.8	{@XMM[15]}, [$ivp]		@ return IV
-	VFP_ABI_POP
-	ldmia	sp!, {r4-r10, pc}
-.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
-___
-}
-{
-my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
-my $const = "r6";	# shared with _bsaes_encrypt8_alt
-my $keysched = "sp";
-
-$code.=<<___;
-.extern	AES_encrypt
-.global	bsaes_ctr32_encrypt_blocks
-.type	bsaes_ctr32_encrypt_blocks,%function
-.align	5
-bsaes_ctr32_encrypt_blocks:
-	cmp	$len, #8			@ use plain AES for
-	blo	.Lctr_enc_short			@ small sizes
-
-	mov	ip, sp
-	stmdb	sp!, {r4-r10, lr}
-	VFP_ABI_PUSH
-	ldr	$ctr, [ip]			@ ctr is 1st arg on the stack
-	sub	sp, sp, #0x10			@ scratch space to carry over the ctr
-	mov	$fp, sp				@ save sp
-
-	ldr	$rounds, [$key, #240]		@ get # of rounds
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	@ allocate the key schedule on the stack
-	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
-	add	r12, #`128-32`			@ size of bit-sliced key schedule
-
-	@ populate the key schedule
-	mov	r4, $key			@ pass key
-	mov	r5, $rounds			@ pass # of rounds
-	mov	sp, r12				@ sp is $keysched
-	bl	_bsaes_key_convert
-	veor	@XMM[7],@XMM[7],@XMM[15]	@ fix up last round key
-	vstmia	r12, {@XMM[7]}			@ save last round key
-
-	vld1.8	{@XMM[0]}, [$ctr]		@ load counter
-	add	$ctr, $const, #.LREVM0SR-.LM0	@ borrow $ctr
-	vldmia	$keysched, {@XMM[4]}		@ load round0 key
-#else
-	ldr	r12, [$key, #244]
-	eors	r12, #1
-	beq	0f
-
-	@ populate the key schedule
-	str	r12, [$key, #244]
-	mov	r4, $key			@ pass key
-	mov	r5, $rounds			@ pass # of rounds
-	add	r12, $key, #248			@ pass key schedule
-	bl	_bsaes_key_convert
-	veor	@XMM[7],@XMM[7],@XMM[15]	@ fix up last round key
-	vstmia	r12, {@XMM[7]}			@ save last round key
-
-.align	2
-0:	add	r12, $key, #248
-	vld1.8	{@XMM[0]}, [$ctr]		@ load counter
-	adrl	$ctr, .LREVM0SR			@ borrow $ctr
-	vldmia	r12, {@XMM[4]}			@ load round0 key
-	sub	sp, #0x10			@ place for adjusted round0 key
-#endif
-
-	vmov.i32	@XMM[8],#1		@ compose 1<<96
-	veor		@XMM[9],@XMM[9],@XMM[9]
-	vrev32.8	@XMM[0],@XMM[0]
-	vext.8		@XMM[8],@XMM[9],@XMM[8],#4
-	vrev32.8	@XMM[4],@XMM[4]
-	vadd.u32	@XMM[9],@XMM[8],@XMM[8]	@ compose 2<<96
-	vstmia	$keysched, {@XMM[4]}		@ save adjusted round0 key
-	b	.Lctr_enc_loop
-
-.align	4
-.Lctr_enc_loop:
-	vadd.u32	@XMM[10], @XMM[8], @XMM[9]	@ compose 3<<96
-	vadd.u32	@XMM[1], @XMM[0], @XMM[8]	@ +1
-	vadd.u32	@XMM[2], @XMM[0], @XMM[9]	@ +2
-	vadd.u32	@XMM[3], @XMM[0], @XMM[10]	@ +3
-	vadd.u32	@XMM[4], @XMM[1], @XMM[10]
-	vadd.u32	@XMM[5], @XMM[2], @XMM[10]
-	vadd.u32	@XMM[6], @XMM[3], @XMM[10]
-	vadd.u32	@XMM[7], @XMM[4], @XMM[10]
-	vadd.u32	@XMM[10], @XMM[5], @XMM[10]	@ next counter
-
-	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
-	@ to flip byte order in 32-bit counter
-
-	vldmia		$keysched, {@XMM[9]}		@ load round0 key
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, $keysched, #0x10		@ pass next round key
-#else
-	add		r4, $key, #`248+16`
-#endif
-	vldmia		$ctr, {@XMM[8]}			@ .LREVM0SR
-	mov		r5, $rounds			@ pass rounds
-	vstmia		$fp, {@XMM[10]}			@ save next counter
-	sub		$const, $ctr, #.LREVM0SR-.LSR	@ pass constants
-
-	bl		_bsaes_encrypt8_alt
-
-	subs		$len, $len, #8
-	blo		.Lctr_enc_loop_done
-
-	vld1.8		{@XMM[8]-@XMM[9]}, [$inp]!	@ load input
-	vld1.8		{@XMM[10]-@XMM[11]}, [$inp]!
-	veor		@XMM[0], @XMM[8]
-	veor		@XMM[1], @XMM[9]
-	vld1.8		{@XMM[12]-@XMM[13]}, [$inp]!
-	veor		@XMM[4], @XMM[10]
-	veor		@XMM[6], @XMM[11]
-	vld1.8		{@XMM[14]-@XMM[15]}, [$inp]!
-	veor		@XMM[3], @XMM[12]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!	@ write output
-	veor		@XMM[7], @XMM[13]
-	veor		@XMM[2], @XMM[14]
-	vst1.8		{@XMM[4]}, [$out]!
-	veor		@XMM[5], @XMM[15]
-	vst1.8		{@XMM[6]}, [$out]!
-	vmov.i32	@XMM[8], #1			@ compose 1<<96
-	vst1.8		{@XMM[3]}, [$out]!
-	veor		@XMM[9], @XMM[9], @XMM[9]
-	vst1.8		{@XMM[7]}, [$out]!
-	vext.8		@XMM[8], @XMM[9], @XMM[8], #4
-	vst1.8		{@XMM[2]}, [$out]!
-	vadd.u32	@XMM[9],@XMM[8],@XMM[8]		@ compose 2<<96
-	vst1.8		{@XMM[5]}, [$out]!
-	vldmia		$fp, {@XMM[0]}			@ load counter
-
-	bne		.Lctr_enc_loop
-	b		.Lctr_enc_done
-
-.align	4
-.Lctr_enc_loop_done:
-	add		$len, $len, #8
-	vld1.8		{@XMM[8]}, [$inp]!	@ load input
-	veor		@XMM[0], @XMM[8]
-	vst1.8		{@XMM[0]}, [$out]!	@ write output
-	cmp		$len, #2
-	blo		.Lctr_enc_done
-	vld1.8		{@XMM[9]}, [$inp]!
-	veor		@XMM[1], @XMM[9]
-	vst1.8		{@XMM[1]}, [$out]!
-	beq		.Lctr_enc_done
-	vld1.8		{@XMM[10]}, [$inp]!
-	veor		@XMM[4], @XMM[10]
-	vst1.8		{@XMM[4]}, [$out]!
-	cmp		$len, #4
-	blo		.Lctr_enc_done
-	vld1.8		{@XMM[11]}, [$inp]!
-	veor		@XMM[6], @XMM[11]
-	vst1.8		{@XMM[6]}, [$out]!
-	beq		.Lctr_enc_done
-	vld1.8		{@XMM[12]}, [$inp]!
-	veor		@XMM[3], @XMM[12]
-	vst1.8		{@XMM[3]}, [$out]!
-	cmp		$len, #6
-	blo		.Lctr_enc_done
-	vld1.8		{@XMM[13]}, [$inp]!
-	veor		@XMM[7], @XMM[13]
-	vst1.8		{@XMM[7]}, [$out]!
-	beq		.Lctr_enc_done
-	vld1.8		{@XMM[14]}, [$inp]
-	veor		@XMM[2], @XMM[14]
-	vst1.8		{@XMM[2]}, [$out]!
-
-.Lctr_enc_done:
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-#ifndef	BSAES_ASM_EXTENDED_KEY
-.Lctr_enc_bzero:			@ wipe key schedule [if any]
-	vstmia		$keysched!, {q0-q1}
-	cmp		$keysched, $fp
-	bne		.Lctr_enc_bzero
-#else
-	vstmia		$keysched, {q0-q1}
-#endif
-
-	mov	sp, $fp
-	add	sp, #0x10		@ add sp,$fp,#0x10 is no good for thumb
-	VFP_ABI_POP
-	ldmia	sp!, {r4-r10, pc}	@ return
-
-.align	4
-.Lctr_enc_short:
-	ldr	ip, [sp]		@ ctr pointer is passed on stack
-	stmdb	sp!, {r4-r8, lr}
-
-	mov	r4, $inp		@ copy arguments
-	mov	r5, $out
-	mov	r6, $len
-	mov	r7, $key
-	ldr	r8, [ip, #12]		@ load counter LSW
-	vld1.8	{@XMM[1]}, [ip]		@ load whole counter value
-#ifdef __ARMEL__
-	rev	r8, r8
-#endif
-	sub	sp, sp, #0x10
-	vst1.8	{@XMM[1]}, [sp,:64]	@ copy counter value
-	sub	sp, sp, #0x10
-
-.Lctr_enc_short_loop:
-	add	r0, sp, #0x10		@ input counter value
-	mov	r1, sp			@ output on the stack
-	mov	r2, r7			@ key
-
-	bl	AES_encrypt
-
-	vld1.8	{@XMM[0]}, [r4]!	@ load input
-	vld1.8	{@XMM[1]}, [sp,:64]	@ load encrypted counter
-	add	r8, r8, #1
-#ifdef __ARMEL__
-	rev	r0, r8
-	str	r0, [sp, #0x1c]		@ next counter value
-#else
-	str	r8, [sp, #0x1c]		@ next counter value
-#endif
-	veor	@XMM[0],@XMM[0],@XMM[1]
-	vst1.8	{@XMM[0]}, [r5]!	@ store output
-	subs	r6, r6, #1
-	bne	.Lctr_enc_short_loop
-
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-	vstmia		sp!, {q0-q1}
-
-	ldmia	sp!, {r4-r8, pc}
-.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
-___
-}
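The 1<<96 / 2<<96 composition above comes down to this: the counter block is kept byte-reversed (vrev32.8) so that eight consecutive counter values can be produced with plain vadd.u32, and the byte order is flipped back through the borrowed _bsaes_encrypt8 prologue. In scalar terms the counter handling is the standard big-endian 32-bit increment used by ctr(aes); a sketch, not kernel code:

    #include <stdint.h>

    /* bump the big-endian 32-bit block counter in an AES-CTR IV by n */
    static void ctr32_add(uint8_t iv[16], uint32_t n)
    {
    	uint32_t c = (uint32_t)iv[12] << 24 | (uint32_t)iv[13] << 16 |
    		     (uint32_t)iv[14] << 8  |  iv[15];

    	c += n;		/* wraps mod 2^32, as the mode requires */
    	iv[12] = c >> 24;
    	iv[13] = c >> 16;
    	iv[14] = c >> 8;
    	iv[15] = c;
    }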
-{
-######################################################################
-# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
-#	const AES_KEY *key1, const AES_KEY *key2,
-#	const unsigned char iv[16]);
-#
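Throughout both XTS routines the per-block tweak is advanced by multiplying it by x in GF(2^128); that is what the vshr.s64/vand/vadd.u64/vswp/veor sequence against the .Lxts_magic constants { 1, 0x87 } computes. The scalar equivalent, shown only for orientation:

    #include <stdint.h>

    /* multiply a 128-bit XTS tweak by x, reducing by x^128+x^7+x^2+x+1 */
    static void xts_mul_x(uint64_t t[2])	/* t[0] low, t[1] high */
    {
    	uint64_t carry = t[1] >> 63;

    	t[1] = (t[1] << 1) | (t[0] >> 63);
    	t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
    }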
-my ($inp,$out,$len,$key,$rounds,$magic,$fp)=(map("r$_",(7..10,1..3)));
-my $const="r6";		# returned by _bsaes_key_convert
-my $twmask=@XMM[5];
-my @T=@XMM[6..7];
-
-$code.=<<___;
-.globl	bsaes_xts_encrypt
-.type	bsaes_xts_encrypt,%function
-.align	4
-bsaes_xts_encrypt:
-	mov	ip, sp
-	stmdb	sp!, {r4-r10, lr}		@ 0x20
-	VFP_ABI_PUSH
-	mov	r6, sp				@ future $fp
-
-	mov	$inp, r0
-	mov	$out, r1
-	mov	$len, r2
-	mov	$key, r3
-
-	sub	r0, sp, #0x10			@ 0x10
-	bic	r0, #0xf			@ align at 16 bytes
-	mov	sp, r0
-
-#ifdef	XTS_CHAIN_TWEAK
-	ldr	r0, [ip]			@ pointer to input tweak
-#else
-	@ generate initial tweak
-	ldr	r0, [ip, #4]			@ iv[]
-	mov	r1, sp
-	ldr	r2, [ip, #0]			@ key2
-	bl	AES_encrypt
-	mov	r0,sp				@ pointer to initial tweak
-#endif
-
-	ldr	$rounds, [$key, #240]		@ get # of rounds
-	mov	$fp, r6
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	@ allocate the key schedule on the stack
-	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
-	@ add	r12, #`128-32`			@ size of bit-sliced key schedule
-	sub	r12, #`32+16`			@ place for tweak[9]
-
-	@ populate the key schedule
-	mov	r4, $key			@ pass key
-	mov	r5, $rounds			@ pass # of rounds
-	mov	sp, r12
-	add	r12, #0x90			@ pass key schedule
-	bl	_bsaes_key_convert
-	veor	@XMM[7], @XMM[7], @XMM[15]	@ fix up last round key
-	vstmia	r12, {@XMM[7]}			@ save last round key
-#else
-	ldr	r12, [$key, #244]
-	eors	r12, #1
-	beq	0f
-
-	str	r12, [$key, #244]
-	mov	r4, $key			@ pass key
-	mov	r5, $rounds			@ pass # of rounds
-	add	r12, $key, #248			@ pass key schedule
-	bl	_bsaes_key_convert
-	veor	@XMM[7], @XMM[7], @XMM[15]	@ fix up last round key
-	vstmia	r12, {@XMM[7]}
-
-.align	2
-0:	sub	sp, #0x90			@ place for tweak[9]
-#endif
-
-	vld1.8	{@XMM[8]}, [r0]			@ initial tweak
-	adr	$magic, .Lxts_magic
-
-	subs	$len, #0x80
-	blo	.Lxts_enc_short
-	b	.Lxts_enc_loop
-
-.align	4
-.Lxts_enc_loop:
-	vldmia		$magic, {$twmask}	@ load XTS magic
-	vshr.s64	@T[0], @XMM[8], #63
-	mov		r0, sp
-	vand		@T[0], @T[0], $twmask
-___
-for($i=9;$i<16;$i++) {
-$code.=<<___;
-	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
-	vst1.64		{@XMM[$i-1]}, [r0,:128]!
-	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
-	vshr.s64	@T[1], @XMM[$i], #63
-	veor		@XMM[$i], @XMM[$i], @T[0]
-	vand		@T[1], @T[1], $twmask
-___
-	@T=reverse(@T);
-
-$code.=<<___ if ($i>=10);
-	vld1.8		{@XMM[$i-10]}, [$inp]!
-___
-$code.=<<___ if ($i>=11);
-	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
-___
-}
-$code.=<<___;
-	vadd.u64	@XMM[8], @XMM[15], @XMM[15]
-	vst1.64		{@XMM[15]}, [r0,:128]!
-	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
-	veor		@XMM[8], @XMM[8], @T[0]
-	vst1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-
-	vld1.8		{@XMM[6]-@XMM[7]}, [$inp]!
-	veor		@XMM[5], @XMM[5], @XMM[13]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[6], @XMM[6], @XMM[14]
-	mov		r5, $rounds			@ pass rounds
-	veor		@XMM[7], @XMM[7], @XMM[15]
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
-	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[4], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	veor		@XMM[9], @XMM[6], @XMM[11]
-	vld1.64		{@XMM[14]-@XMM[15]}, [r0,:128]!
-	veor		@XMM[10], @XMM[3], @XMM[12]
-	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
-	veor		@XMM[11], @XMM[7], @XMM[13]
-	veor		@XMM[12], @XMM[2], @XMM[14]
-	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
-	veor		@XMM[13], @XMM[5], @XMM[15]
-	vst1.8		{@XMM[12]-@XMM[13]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-
-	subs		$len, #0x80
-	bpl		.Lxts_enc_loop
-
-.Lxts_enc_short:
-	adds		$len, #0x70
-	bmi		.Lxts_enc_done
-
-	vldmia		$magic, {$twmask}	@ load XTS magic
-	vshr.s64	@T[0], @XMM[8], #63
-	mov		r0, sp
-	vand		@T[0], @T[0], $twmask
-___
-for($i=9;$i<16;$i++) {
-$code.=<<___;
-	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
-	vst1.64		{@XMM[$i-1]}, [r0,:128]!
-	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
-	vshr.s64	@T[1], @XMM[$i], #63
-	veor		@XMM[$i], @XMM[$i], @T[0]
-	vand		@T[1], @T[1], $twmask
-___
-	@T=reverse(@T);
-
-$code.=<<___ if ($i>=10);
-	vld1.8		{@XMM[$i-10]}, [$inp]!
-	subs		$len, #0x10
-	bmi		.Lxts_enc_`$i-9`
-___
-$code.=<<___ if ($i>=11);
-	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
-___
-}
-$code.=<<___;
-	sub		$len, #0x10
-	vst1.64		{@XMM[15]}, [r0,:128]		@ next round tweak
-
-	vld1.8		{@XMM[6]}, [$inp]!
-	veor		@XMM[5], @XMM[5], @XMM[13]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[6], @XMM[6], @XMM[14]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
-	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[4], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	veor		@XMM[9], @XMM[6], @XMM[11]
-	vld1.64		{@XMM[14]}, [r0,:128]!
-	veor		@XMM[10], @XMM[3], @XMM[12]
-	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
-	veor		@XMM[11], @XMM[7], @XMM[13]
-	veor		@XMM[12], @XMM[2], @XMM[14]
-	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
-	vst1.8		{@XMM[12]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-.align	4
-.Lxts_enc_6:
-	vst1.64		{@XMM[14]}, [r0,:128]		@ next round tweak
-
-	veor		@XMM[4], @XMM[4], @XMM[12]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[5], @XMM[5], @XMM[13]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
-	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[4], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	veor		@XMM[9], @XMM[6], @XMM[11]
-	veor		@XMM[10], @XMM[3], @XMM[12]
-	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
-	veor		@XMM[11], @XMM[7], @XMM[13]
-	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-
-@ put this in range for both ARM and Thumb mode adr instructions
-.align	5
-.Lxts_magic:
-	.quad	1, 0x87
-
-.align	5
-.Lxts_enc_5:
-	vst1.64		{@XMM[13]}, [r0,:128]		@ next round tweak
-
-	veor		@XMM[3], @XMM[3], @XMM[11]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[4], @XMM[4], @XMM[12]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
-	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	vld1.64		{@XMM[12]}, [r0,:128]!
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[4], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	veor		@XMM[9], @XMM[6], @XMM[11]
-	veor		@XMM[10], @XMM[3], @XMM[12]
-	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
-	vst1.8		{@XMM[10]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-.align	4
-.Lxts_enc_4:
-	vst1.64		{@XMM[12]}, [r0,:128]		@ next round tweak
-
-	veor		@XMM[2], @XMM[2], @XMM[10]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[3], @XMM[3], @XMM[11]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
-	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[4], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	veor		@XMM[9], @XMM[6], @XMM[11]
-	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-.align	4
-.Lxts_enc_3:
-	vst1.64		{@XMM[11]}, [r0,:128]		@ next round tweak
-
-	veor		@XMM[1], @XMM[1], @XMM[9]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[2], @XMM[2], @XMM[10]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
-	vld1.64		{@XMM[10]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[4], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	vst1.8		{@XMM[8]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-.align	4
-.Lxts_enc_2:
-	vst1.64		{@XMM[10]}, [r0,:128]		@ next round tweak
-
-	veor		@XMM[0], @XMM[0], @XMM[8]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[1], @XMM[1], @XMM[9]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_encrypt8
-
-	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_enc_done
-.align	4
-.Lxts_enc_1:
-	mov		r0, sp
-	veor		@XMM[0], @XMM[8]
-	mov		r1, sp
-	vst1.8		{@XMM[0]}, [sp,:128]
-	mov		r2, $key
-	mov		r4, $fp				@ preserve fp
-
-	bl		AES_encrypt
-
-	vld1.8		{@XMM[0]}, [sp,:128]
-	veor		@XMM[0], @XMM[0], @XMM[8]
-	vst1.8		{@XMM[0]}, [$out]!
-	mov		$fp, r4
-
-	vmov		@XMM[8], @XMM[9]		@ next round tweak
-
-.Lxts_enc_done:
-#ifndef	XTS_CHAIN_TWEAK
-	adds		$len, #0x10
-	beq		.Lxts_enc_ret
-	sub		r6, $out, #0x10
-
-.Lxts_enc_steal:
-	ldrb		r0, [$inp], #1
-	ldrb		r1, [$out, #-0x10]
-	strb		r0, [$out, #-0x10]
-	strb		r1, [$out], #1
-
-	subs		$len, #1
-	bhi		.Lxts_enc_steal
-
-	vld1.8		{@XMM[0]}, [r6]
-	mov		r0, sp
-	veor		@XMM[0], @XMM[0], @XMM[8]
-	mov		r1, sp
-	vst1.8		{@XMM[0]}, [sp,:128]
-	mov		r2, $key
-	mov		r4, $fp			@ preserve fp
-
-	bl		AES_encrypt
-
-	vld1.8		{@XMM[0]}, [sp,:128]
-	veor		@XMM[0], @XMM[0], @XMM[8]
-	vst1.8		{@XMM[0]}, [r6]
-	mov		$fp, r4
-#endif
-
-.Lxts_enc_ret:
-	bic		r0, $fp, #0xf
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-#ifdef	XTS_CHAIN_TWEAK
-	ldr		r1, [$fp, #0x20+VFP_ABI_FRAME]	@ chain tweak
-#endif
-.Lxts_enc_bzero:				@ wipe key schedule [if any]
-	vstmia		sp!, {q0-q1}
-	cmp		sp, r0
-	bne		.Lxts_enc_bzero
-
-	mov		sp, $fp
-#ifdef	XTS_CHAIN_TWEAK
-	vst1.8		{@XMM[8]}, [r1]
-#endif
-	VFP_ABI_POP
-	ldmia		sp!, {r4-r10, pc}	@ return
-
-.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
-
-.globl	bsaes_xts_decrypt
-.type	bsaes_xts_decrypt,%function
-.align	4
-bsaes_xts_decrypt:
-	mov	ip, sp
-	stmdb	sp!, {r4-r10, lr}		@ 0x20
-	VFP_ABI_PUSH
-	mov	r6, sp				@ future $fp
-
-	mov	$inp, r0
-	mov	$out, r1
-	mov	$len, r2
-	mov	$key, r3
-
-	sub	r0, sp, #0x10			@ 0x10
-	bic	r0, #0xf			@ align at 16 bytes
-	mov	sp, r0
-
-#ifdef	XTS_CHAIN_TWEAK
-	ldr	r0, [ip]			@ pointer to input tweak
-#else
-	@ generate initial tweak
-	ldr	r0, [ip, #4]			@ iv[]
-	mov	r1, sp
-	ldr	r2, [ip, #0]			@ key2
-	bl	AES_encrypt
-	mov	r0, sp				@ pointer to initial tweak
-#endif
-
-	ldr	$rounds, [$key, #240]		@ get # of rounds
-	mov	$fp, r6
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	@ allocate the key schedule on the stack
-	sub	r12, sp, $rounds, lsl#7		@ 128 bytes per inner round key
-	@ add	r12, #`128-32`			@ size of bit-sliced key schedule
-	sub	r12, #`32+16`			@ place for tweak[9]
-
-	@ populate the key schedule
-	mov	r4, $key			@ pass key
-	mov	r5, $rounds			@ pass # of rounds
-	mov	sp, r12
-	add	r12, #0x90			@ pass key schedule
-	bl	_bsaes_key_convert
-	add	r4, sp, #0x90
-	vldmia	r4, {@XMM[6]}
-	vstmia	r12,  {@XMM[15]}		@ save last round key
-	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
-	vstmia	r4, {@XMM[7]}
-#else
-	ldr	r12, [$key, #244]
-	eors	r12, #1
-	beq	0f
-
-	str	r12, [$key, #244]
-	mov	r4, $key			@ pass key
-	mov	r5, $rounds			@ pass # of rounds
-	add	r12, $key, #248			@ pass key schedule
-	bl	_bsaes_key_convert
-	add	r4, $key, #248
-	vldmia	r4, {@XMM[6]}
-	vstmia	r12,  {@XMM[15]}		@ save last round key
-	veor	@XMM[7], @XMM[7], @XMM[6]	@ fix up round 0 key
-	vstmia	r4, {@XMM[7]}
-
-.align	2
-0:	sub	sp, #0x90			@ place for tweak[9]
-#endif
-	vld1.8	{@XMM[8]}, [r0]			@ initial tweak
-	adr	$magic, .Lxts_magic
-
-#ifndef	XTS_CHAIN_TWEAK
-	tst	$len, #0xf			@ if not multiple of 16
-	it	ne				@ Thumb2 thing, sanity check in ARM
-	subne	$len, #0x10			@ subtract another 16 bytes
-#endif
-	subs	$len, #0x80
-
-	blo	.Lxts_dec_short
-	b	.Lxts_dec_loop
-
-.align	4
-.Lxts_dec_loop:
-	vldmia		$magic, {$twmask}	@ load XTS magic
-	vshr.s64	@T[0], @XMM[8], #63
-	mov		r0, sp
-	vand		@T[0], @T[0], $twmask
-___
-for($i=9;$i<16;$i++) {
-$code.=<<___;
-	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
-	vst1.64		{@XMM[$i-1]}, [r0,:128]!
-	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
-	vshr.s64	@T[1], @XMM[$i], #63
-	veor		@XMM[$i], @XMM[$i], @T[0]
-	vand		@T[1], @T[1], $twmask
-___
-	@T=reverse(@T);
-
-$code.=<<___ if ($i>=10);
-	vld1.8		{@XMM[$i-10]}, [$inp]!
-___
-$code.=<<___ if ($i>=11);
-	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
-___
-}
-$code.=<<___;
-	vadd.u64	@XMM[8], @XMM[15], @XMM[15]
-	vst1.64		{@XMM[15]}, [r0,:128]!
-	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
-	veor		@XMM[8], @XMM[8], @T[0]
-	vst1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-
-	vld1.8		{@XMM[6]-@XMM[7]}, [$inp]!
-	veor		@XMM[5], @XMM[5], @XMM[13]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[6], @XMM[6], @XMM[14]
-	mov		r5, $rounds			@ pass rounds
-	veor		@XMM[7], @XMM[7], @XMM[15]
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
-	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[6], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	veor		@XMM[9], @XMM[4], @XMM[11]
-	vld1.64		{@XMM[14]-@XMM[15]}, [r0,:128]!
-	veor		@XMM[10], @XMM[2], @XMM[12]
-	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
-	veor		@XMM[11], @XMM[7], @XMM[13]
-	veor		@XMM[12], @XMM[3], @XMM[14]
-	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
-	veor		@XMM[13], @XMM[5], @XMM[15]
-	vst1.8		{@XMM[12]-@XMM[13]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-
-	subs		$len, #0x80
-	bpl		.Lxts_dec_loop
-
-.Lxts_dec_short:
-	adds		$len, #0x70
-	bmi		.Lxts_dec_done
-
-	vldmia		$magic, {$twmask}	@ load XTS magic
-	vshr.s64	@T[0], @XMM[8], #63
-	mov		r0, sp
-	vand		@T[0], @T[0], $twmask
-___
-for($i=9;$i<16;$i++) {
-$code.=<<___;
-	vadd.u64	@XMM[$i], @XMM[$i-1], @XMM[$i-1]
-	vst1.64		{@XMM[$i-1]}, [r0,:128]!
-	vswp		`&Dhi("@T[0]")`,`&Dlo("@T[0]")`
-	vshr.s64	@T[1], @XMM[$i], #63
-	veor		@XMM[$i], @XMM[$i], @T[0]
-	vand		@T[1], @T[1], $twmask
-___
-	@T=reverse(@T);
-
-$code.=<<___ if ($i>=10);
-	vld1.8		{@XMM[$i-10]}, [$inp]!
-	subs		$len, #0x10
-	bmi		.Lxts_dec_`$i-9`
-___
-$code.=<<___ if ($i>=11);
-	veor		@XMM[$i-11], @XMM[$i-11], @XMM[$i-3]
-___
-}
-$code.=<<___;
-	sub		$len, #0x10
-	vst1.64		{@XMM[15]}, [r0,:128]		@ next round tweak
-
-	vld1.8		{@XMM[6]}, [$inp]!
-	veor		@XMM[5], @XMM[5], @XMM[13]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[6], @XMM[6], @XMM[14]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
-	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[6], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	veor		@XMM[9], @XMM[4], @XMM[11]
-	vld1.64		{@XMM[14]}, [r0,:128]!
-	veor		@XMM[10], @XMM[2], @XMM[12]
-	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
-	veor		@XMM[11], @XMM[7], @XMM[13]
-	veor		@XMM[12], @XMM[3], @XMM[14]
-	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
-	vst1.8		{@XMM[12]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_6:
-	vst1.64		{@XMM[14]}, [r0,:128]		@ next round tweak
-
-	veor		@XMM[4], @XMM[4], @XMM[12]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[5], @XMM[5], @XMM[13]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
-	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	vld1.64		{@XMM[12]-@XMM[13]}, [r0,:128]!
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[6], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	veor		@XMM[9], @XMM[4], @XMM[11]
-	veor		@XMM[10], @XMM[2], @XMM[12]
-	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
-	veor		@XMM[11], @XMM[7], @XMM[13]
-	vst1.8		{@XMM[10]-@XMM[11]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_5:
-	vst1.64		{@XMM[13]}, [r0,:128]		@ next round tweak
-
-	veor		@XMM[3], @XMM[3], @XMM[11]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[4], @XMM[4], @XMM[12]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
-	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	vld1.64		{@XMM[12]}, [r0,:128]!
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[6], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	veor		@XMM[9], @XMM[4], @XMM[11]
-	veor		@XMM[10], @XMM[2], @XMM[12]
-	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
-	vst1.8		{@XMM[10]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_4:
-	vst1.64		{@XMM[12]}, [r0,:128]		@ next round tweak
-
-	veor		@XMM[2], @XMM[2], @XMM[10]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[3], @XMM[3], @XMM[11]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{@XMM[ 8]-@XMM[ 9]}, [r0,:128]!
-	vld1.64		{@XMM[10]-@XMM[11]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[6], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	veor		@XMM[9], @XMM[4], @XMM[11]
-	vst1.8		{@XMM[8]-@XMM[9]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_3:
-	vst1.64		{@XMM[11]}, [r0,:128]		@ next round tweak
-
-	veor		@XMM[1], @XMM[1], @XMM[9]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[2], @XMM[2], @XMM[10]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
-	vld1.64		{@XMM[10]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	veor		@XMM[8], @XMM[6], @XMM[10]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-	vst1.8		{@XMM[8]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_2:
-	vst1.64		{@XMM[10]}, [r0,:128]		@ next round tweak
-
-	veor		@XMM[0], @XMM[0], @XMM[8]
-#ifndef	BSAES_ASM_EXTENDED_KEY
-	add		r4, sp, #0x90			@ pass key schedule
-#else
-	add		r4, $key, #248			@ pass key schedule
-#endif
-	veor		@XMM[1], @XMM[1], @XMM[9]
-	mov		r5, $rounds			@ pass rounds
-	mov		r0, sp
-
-	bl		_bsaes_decrypt8
-
-	vld1.64		{@XMM[8]-@XMM[9]}, [r0,:128]!
-	veor		@XMM[0], @XMM[0], @XMM[ 8]
-	veor		@XMM[1], @XMM[1], @XMM[ 9]
-	vst1.8		{@XMM[0]-@XMM[1]}, [$out]!
-
-	vld1.64		{@XMM[8]}, [r0,:128]		@ next round tweak
-	b		.Lxts_dec_done
-.align	4
-.Lxts_dec_1:
-	mov		r0, sp
-	veor		@XMM[0], @XMM[8]
-	mov		r1, sp
-	vst1.8		{@XMM[0]}, [sp,:128]
-	mov		r2, $key
-	mov		r4, $fp				@ preserve fp
-	mov		r5, $magic			@ preserve magic
-
-	bl		AES_decrypt
-
-	vld1.8		{@XMM[0]}, [sp,:128]
-	veor		@XMM[0], @XMM[0], @XMM[8]
-	vst1.8		{@XMM[0]}, [$out]!
-	mov		$fp, r4
-	mov		$magic, r5
-
-	vmov		@XMM[8], @XMM[9]		@ next round tweak
-
-.Lxts_dec_done:
-#ifndef	XTS_CHAIN_TWEAK
-	adds		$len, #0x10
-	beq		.Lxts_dec_ret
-
-	@ calculate one round of extra tweak for the stolen ciphertext
-	vldmia		$magic, {$twmask}
-	vshr.s64	@XMM[6], @XMM[8], #63
-	vand		@XMM[6], @XMM[6], $twmask
-	vadd.u64	@XMM[9], @XMM[8], @XMM[8]
-	vswp		`&Dhi("@XMM[6]")`,`&Dlo("@XMM[6]")`
-	veor		@XMM[9], @XMM[9], @XMM[6]
-
-	@ perform the final decryption with the last tweak value
-	vld1.8		{@XMM[0]}, [$inp]!
-	mov		r0, sp
-	veor		@XMM[0], @XMM[0], @XMM[9]
-	mov		r1, sp
-	vst1.8		{@XMM[0]}, [sp,:128]
-	mov		r2, $key
-	mov		r4, $fp			@ preserve fp
-
-	bl		AES_decrypt
-
-	vld1.8		{@XMM[0]}, [sp,:128]
-	veor		@XMM[0], @XMM[0], @XMM[9]
-	vst1.8		{@XMM[0]}, [$out]
-
-	mov		r6, $out
-.Lxts_dec_steal:
-	ldrb		r1, [$out]
-	ldrb		r0, [$inp], #1
-	strb		r1, [$out, #0x10]
-	strb		r0, [$out], #1
-
-	subs		$len, #1
-	bhi		.Lxts_dec_steal
-
-	vld1.8		{@XMM[0]}, [r6]
-	mov		r0, sp
-	veor		@XMM[0], @XMM[8]
-	mov		r1, sp
-	vst1.8		{@XMM[0]}, [sp,:128]
-	mov		r2, $key
-
-	bl		AES_decrypt
-
-	vld1.8		{@XMM[0]}, [sp,:128]
-	veor		@XMM[0], @XMM[0], @XMM[8]
-	vst1.8		{@XMM[0]}, [r6]
-	mov		$fp, r4
-#endif
-
-.Lxts_dec_ret:
-	bic		r0, $fp, #0xf
-	vmov.i32	q0, #0
-	vmov.i32	q1, #0
-#ifdef	XTS_CHAIN_TWEAK
-	ldr		r1, [$fp, #0x20+VFP_ABI_FRAME]	@ chain tweak
-#endif
-.Lxts_dec_bzero:				@ wipe key schedule [if any]
-	vstmia		sp!, {q0-q1}
-	cmp		sp, r0
-	bne		.Lxts_dec_bzero
-
-	mov		sp, $fp
-#ifdef	XTS_CHAIN_TWEAK
-	vst1.8		{@XMM[8]}, [r1]
-#endif
-	VFP_ABI_POP
-	ldmia		sp!, {r4-r10, pc}	@ return
-
-.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
-___
-}
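Both .Lxts_enc_steal and .Lxts_dec_steal implement XTS ciphertext stealing for inputs that are not a multiple of 16 bytes: the ldrb/strb loop swaps the trailing input bytes with the head of the last full ciphertext block, which is then passed through the cipher once more under the final tweak. A rough scalar outline of the byte loop, where out points at the partial output block and out[-16] at the last full ciphertext block; illustrative only:

    #include <stdint.h>

    /* swap len trailing input bytes with the head of the previous block */
    static void xts_steal_bytes(const uint8_t *in, uint8_t *out, int len)
    {
    	int i;

    	for (i = 0; i < len; i++) {
    		uint8_t c = out[i - 16];	/* stolen ciphertext byte */

    		out[i - 16] = in[i];		/* build final partial block */
    		out[i] = c;
    	}
    	/* out[-16..-1] is then run through the cipher again with the
    	 * last tweak */
    }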
-$code.=<<___;
-#endif
-___
-
-$code =~ s/\`([^\`]*)\`/eval($1)/gem;
-
-open SELF,$0;
-while(<SELF>) {
-	next if (/^#!/);
-        last if (!s/^#/@/ and !/^$/);
-        print;
-}
-close SELF;
-
-print $code;
-
-close STDOUT;

+ 523 - 0
arch/arm/crypto/chacha20-neon-core.S

@@ -0,0 +1,523 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+	.text
+	.fpu		neon
+	.align		5
+
+ENTRY(chacha20_block_xor_neon)
+	// r0: Input state matrix, s
+	// r1: 1 data block output, o
+	// r2: 1 data block input, i
+
+	//
+	// This function encrypts one ChaCha20 block by loading the state matrix
+	// in four NEON registers. It performs the matrix operations on four
+	// words in parallel, but requires shuffling to rearrange the words
+	// after each
+	// round.
+	//
+
+	// x0..3 = s0..3
+	add		ip, r0, #0x20
+	vld1.32		{q0-q1}, [r0]
+	vld1.32		{q2-q3}, [ip]
+
+	vmov		q8, q0
+	vmov		q9, q1
+	vmov		q10, q2
+	vmov		q11, q3
+
+	mov		r3, #10
+
+.Ldoubleround:
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vadd.i32	q0, q0, q1
+	veor		q4, q3, q0
+	vshl.u32	q3, q4, #16
+	vsri.u32	q3, q4, #16
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #12
+	vsri.u32	q1, q4, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vadd.i32	q0, q0, q1
+	veor		q4, q3, q0
+	vshl.u32	q3, q4, #8
+	vsri.u32	q3, q4, #24
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #7
+	vsri.u32	q1, q4, #25
+
+	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vext.8		q1, q1, q1, #4
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vext.8		q2, q2, q2, #8
+	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vext.8		q3, q3, q3, #12
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vadd.i32	q0, q0, q1
+	veor		q4, q3, q0
+	vshl.u32	q3, q4, #16
+	vsri.u32	q3, q4, #16
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #12
+	vsri.u32	q1, q4, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vadd.i32	q0, q0, q1
+	veor		q4, q3, q0
+	vshl.u32	q3, q4, #8
+	vsri.u32	q3, q4, #24
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #7
+	vsri.u32	q1, q4, #25
+
+	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vext.8		q1, q1, q1, #12
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vext.8		q2, q2, q2, #8
+	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vext.8		q3, q3, q3, #4
+
+	subs		r3, r3, #1
+	bne		.Ldoubleround
+
+	add		ip, r2, #0x20
+	vld1.8		{q4-q5}, [r2]
+	vld1.8		{q6-q7}, [ip]
+
+	// o0 = i0 ^ (x0 + s0)
+	vadd.i32	q0, q0, q8
+	veor		q0, q0, q4
+
+	// o1 = i1 ^ (x1 + s1)
+	vadd.i32	q1, q1, q9
+	veor		q1, q1, q5
+
+	// o2 = i2 ^ (x2 + s2)
+	vadd.i32	q2, q2, q10
+	veor		q2, q2, q6
+
+	// o3 = i3 ^ (x3 + s3)
+	vadd.i32	q3, q3, q11
+	veor		q3, q3, q7
+
+	add		ip, r1, #0x20
+	vst1.8		{q0-q1}, [r1]
+	vst1.8		{q2-q3}, [ip]
+
+	bx		lr
+ENDPROC(chacha20_block_xor_neon)
+
+	.align		5
+ENTRY(chacha20_4block_xor_neon)
+	push		{r4-r6, lr}
+	mov		ip, sp			// preserve the stack pointer
+	sub		r3, sp, #0x20		// allocate a 32 byte buffer
+	bic		r3, r3, #0x1f		// aligned to 32 bytes
+	mov		sp, r3
+
+	// r0: Input state matrix, s
+	// r1: 4 data blocks output, o
+	// r2: 4 data blocks input, i
+
+	//
+	// This function encrypts four consecutive ChaCha20 blocks by loading
+	// the state matrix in NEON registers four times. The algorithm performs
+	// each operation on the corresponding word of each state matrix, hence
+	// requires no word shuffling. For the final XOR step we transpose the
+	// matrix by interleaving 32-bit and then 64-bit words, which allows us
+	// to do the XOR in NEON registers.
+	//
+
+	// x0..15[0-3] = s0..3[0..3]
+	add		r3, r0, #0x20
+	vld1.32		{q0-q1}, [r0]
+	vld1.32		{q2-q3}, [r3]
+
+	adr		r3, CTRINC
+	vdup.32		q15, d7[1]
+	vdup.32		q14, d7[0]
+	vld1.32		{q11}, [r3, :128]
+	vdup.32		q13, d6[1]
+	vdup.32		q12, d6[0]
+	vadd.i32	q12, q12, q11		// x12 += counter values 0-3
+	vdup.32		q11, d5[1]
+	vdup.32		q10, d5[0]
+	vdup.32		q9, d4[1]
+	vdup.32		q8, d4[0]
+	vdup.32		q7, d3[1]
+	vdup.32		q6, d3[0]
+	vdup.32		q5, d2[1]
+	vdup.32		q4, d2[0]
+	vdup.32		q3, d1[1]
+	vdup.32		q2, d1[0]
+	vdup.32		q1, d0[1]
+	vdup.32		q0, d0[0]
+
+	mov		r3, #10
+
+.Ldoubleround4:
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	vadd.i32	q0, q0, q4
+	vadd.i32	q1, q1, q5
+	vadd.i32	q2, q2, q6
+	vadd.i32	q3, q3, q7
+
+	veor		q12, q12, q0
+	veor		q13, q13, q1
+	veor		q14, q14, q2
+	veor		q15, q15, q3
+
+	vrev32.16	q12, q12
+	vrev32.16	q13, q13
+	vrev32.16	q14, q14
+	vrev32.16	q15, q15
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	vadd.i32	q8, q8, q12
+	vadd.i32	q9, q9, q13
+	vadd.i32	q10, q10, q14
+	vadd.i32	q11, q11, q15
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q4, q8
+	veor		q9, q5, q9
+	vshl.u32	q4, q8, #12
+	vshl.u32	q5, q9, #12
+	vsri.u32	q4, q8, #20
+	vsri.u32	q5, q9, #20
+
+	veor		q8, q6, q10
+	veor		q9, q7, q11
+	vshl.u32	q6, q8, #12
+	vshl.u32	q7, q9, #12
+	vsri.u32	q6, q8, #20
+	vsri.u32	q7, q9, #20
+
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	vadd.i32	q0, q0, q4
+	vadd.i32	q1, q1, q5
+	vadd.i32	q2, q2, q6
+	vadd.i32	q3, q3, q7
+
+	veor		q8, q12, q0
+	veor		q9, q13, q1
+	vshl.u32	q12, q8, #8
+	vshl.u32	q13, q9, #8
+	vsri.u32	q12, q8, #24
+	vsri.u32	q13, q9, #24
+
+	veor		q8, q14, q2
+	veor		q9, q15, q3
+	vshl.u32	q14, q8, #8
+	vshl.u32	q15, q9, #8
+	vsri.u32	q14, q8, #24
+	vsri.u32	q15, q9, #24
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	vadd.i32	q8, q8, q12
+	vadd.i32	q9, q9, q13
+	vadd.i32	q10, q10, q14
+	vadd.i32	q11, q11, q15
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q4, q8
+	veor		q9, q5, q9
+	vshl.u32	q4, q8, #7
+	vshl.u32	q5, q9, #7
+	vsri.u32	q4, q8, #25
+	vsri.u32	q5, q9, #25
+
+	veor		q8, q6, q10
+	veor		q9, q7, q11
+	vshl.u32	q6, q8, #7
+	vshl.u32	q7, q9, #7
+	vsri.u32	q6, q8, #25
+	vsri.u32	q7, q9, #25
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	vadd.i32	q0, q0, q5
+	vadd.i32	q1, q1, q6
+	vadd.i32	q2, q2, q7
+	vadd.i32	q3, q3, q4
+
+	veor		q15, q15, q0
+	veor		q12, q12, q1
+	veor		q13, q13, q2
+	veor		q14, q14, q3
+
+	vrev32.16	q15, q15
+	vrev32.16	q12, q12
+	vrev32.16	q13, q13
+	vrev32.16	q14, q14
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	vadd.i32	q10, q10, q15
+	vadd.i32	q11, q11, q12
+	vadd.i32	q8, q8, q13
+	vadd.i32	q9, q9, q14
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q7, q8
+	veor		q9, q4, q9
+	vshl.u32	q7, q8, #12
+	vshl.u32	q4, q9, #12
+	vsri.u32	q7, q8, #20
+	vsri.u32	q4, q9, #20
+
+	veor		q8, q5, q10
+	veor		q9, q6, q11
+	vshl.u32	q5, q8, #12
+	vshl.u32	q6, q9, #12
+	vsri.u32	q5, q8, #20
+	vsri.u32	q6, q9, #20
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	vadd.i32	q0, q0, q5
+	vadd.i32	q1, q1, q6
+	vadd.i32	q2, q2, q7
+	vadd.i32	q3, q3, q4
+
+	veor		q8, q15, q0
+	veor		q9, q12, q1
+	vshl.u32	q15, q8, #8
+	vshl.u32	q12, q9, #8
+	vsri.u32	q15, q8, #24
+	vsri.u32	q12, q9, #24
+
+	veor		q8, q13, q2
+	veor		q9, q14, q3
+	vshl.u32	q13, q8, #8
+	vshl.u32	q14, q9, #8
+	vsri.u32	q13, q8, #24
+	vsri.u32	q14, q9, #24
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	vadd.i32	q10, q10, q15
+	vadd.i32	q11, q11, q12
+	vadd.i32	q8, q8, q13
+	vadd.i32	q9, q9, q14
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q7, q8
+	veor		q9, q4, q9
+	vshl.u32	q7, q8, #7
+	vshl.u32	q4, q9, #7
+	vsri.u32	q7, q8, #25
+	vsri.u32	q4, q9, #25
+
+	veor		q8, q5, q10
+	veor		q9, q6, q11
+	vshl.u32	q5, q8, #7
+	vshl.u32	q6, q9, #7
+	vsri.u32	q5, q8, #25
+	vsri.u32	q6, q9, #25
+
+	subs		r3, r3, #1
+	beq		0f
+
+	vld1.32		{q8-q9}, [sp, :256]
+	b		.Ldoubleround4
+
+	// x0[0-3] += s0[0]
+	// x1[0-3] += s0[1]
+	// x2[0-3] += s0[2]
+	// x3[0-3] += s0[3]
+0:	ldmia		r0!, {r3-r6}
+	vdup.32		q8, r3
+	vdup.32		q9, r4
+	vadd.i32	q0, q0, q8
+	vadd.i32	q1, q1, q9
+	vdup.32		q8, r5
+	vdup.32		q9, r6
+	vadd.i32	q2, q2, q8
+	vadd.i32	q3, q3, q9
+
+	// x4[0-3] += s1[0]
+	// x5[0-3] += s1[1]
+	// x6[0-3] += s1[2]
+	// x7[0-3] += s1[3]
+	ldmia		r0!, {r3-r6}
+	vdup.32		q8, r3
+	vdup.32		q9, r4
+	vadd.i32	q4, q4, q8
+	vadd.i32	q5, q5, q9
+	vdup.32		q8, r5
+	vdup.32		q9, r6
+	vadd.i32	q6, q6, q8
+	vadd.i32	q7, q7, q9
+
+	// interleave 32-bit words in state n, n+1
+	vzip.32		q0, q1
+	vzip.32		q2, q3
+	vzip.32		q4, q5
+	vzip.32		q6, q7
+
+	// interleave 64-bit words in state n, n+2
+	vswp		d1, d4
+	vswp		d3, d6
+	vswp		d9, d12
+	vswp		d11, d14
+
+	// xor with corresponding input, write to output
+	vld1.8		{q8-q9}, [r2]!
+	veor		q8, q8, q0
+	veor		q9, q9, q4
+	vst1.8		{q8-q9}, [r1]!
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x8[0-3] += s2[0]
+	// x9[0-3] += s2[1]
+	// x10[0-3] += s2[2]
+	// x11[0-3] += s2[3]
+	ldmia		r0!, {r3-r6}
+	vdup.32		q0, r3
+	vdup.32		q4, r4
+	vadd.i32	q8, q8, q0
+	vadd.i32	q9, q9, q4
+	vdup.32		q0, r5
+	vdup.32		q4, r6
+	vadd.i32	q10, q10, q0
+	vadd.i32	q11, q11, q4
+
+	// x12[0-3] += s3[0]
+	// x13[0-3] += s3[1]
+	// x14[0-3] += s3[2]
+	// x15[0-3] += s3[3]
+	ldmia		r0!, {r3-r6}
+	vdup.32		q0, r3
+	vdup.32		q4, r4
+	adr		r3, CTRINC
+	vadd.i32	q12, q12, q0
+	vld1.32		{q0}, [r3, :128]
+	vadd.i32	q13, q13, q4
+	vadd.i32	q12, q12, q0		// x12 += counter values 0-3
+
+	vdup.32		q0, r5
+	vdup.32		q4, r6
+	vadd.i32	q14, q14, q0
+	vadd.i32	q15, q15, q4
+
+	// interleave 32-bit words in state n, n+1
+	vzip.32		q8, q9
+	vzip.32		q10, q11
+	vzip.32		q12, q13
+	vzip.32		q14, q15
+
+	// interleave 64-bit words in state n, n+2
+	vswp		d17, d20
+	vswp		d19, d22
+	vswp		d25, d28
+	vswp		d27, d30
+
+	vmov		q4, q1
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q8
+	veor		q1, q1, q12
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q2
+	veor		q1, q1, q6
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q10
+	veor		q1, q1, q14
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q4
+	veor		q1, q1, q5
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q9
+	veor		q1, q1, q13
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	veor		q0, q0, q3
+	veor		q1, q1, q7
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]
+	veor		q0, q0, q11
+	veor		q1, q1, q15
+	vst1.8		{q0-q1}, [r1]
+
+	mov		sp, ip
+	pop		{r4-r6, pc}
+ENDPROC(chacha20_4block_xor_neon)
+
+	.align		4
+CTRINC:	.word		0, 1, 2, 3
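
For reference, each add/xor/shift-insert group above is one leg of the RFC 7539
quarter-round (rotations by 16, 12, 8 and 7). A minimal C model of the scalar
operation the NEON code vectorizes (kernel-style u32 assumed; a sketch, not part
of the patch):

	static inline u32 rotl32(u32 v, int n)
	{
		return (v << n) | (v >> (32 - n));
	}

	/* one ChaCha20 quarter-round over four state words */
	static void chacha20_quarterround(u32 *a, u32 *b, u32 *c, u32 *d)
	{
		*a += *b; *d = rotl32(*d ^ *a, 16);
		*c += *d; *b = rotl32(*b ^ *c, 12);
		*a += *b; *d = rotl32(*d ^ *a, 8);
		*c += *d; *b = rotl32(*b ^ *c, 7);
	}

The ten iterations of .Ldoubleround each cover one column round plus one
diagonal round (the vext instructions rotate words between the two halves),
giving the 20 rounds the specification requires.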

+ 127 - 0
arch/arm/crypto/chacha20-neon-glue.c

@@ -0,0 +1,127 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha20.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
+
+static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
+			    unsigned int bytes)
+{
+	u8 buf[CHACHA20_BLOCK_SIZE];
+
+	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
+		chacha20_4block_xor_neon(state, dst, src);
+		bytes -= CHACHA20_BLOCK_SIZE * 4;
+		src += CHACHA20_BLOCK_SIZE * 4;
+		dst += CHACHA20_BLOCK_SIZE * 4;
+		state[12] += 4;
+	}
+	while (bytes >= CHACHA20_BLOCK_SIZE) {
+		chacha20_block_xor_neon(state, dst, src);
+		bytes -= CHACHA20_BLOCK_SIZE;
+		src += CHACHA20_BLOCK_SIZE;
+		dst += CHACHA20_BLOCK_SIZE;
+		state[12]++;
+	}
+	if (bytes) {
+		memcpy(buf, src, bytes);
+		chacha20_block_xor_neon(state, buf, buf);
+		memcpy(dst, buf, bytes);
+	}
+}
+
+static int chacha20_neon(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	u32 state[16];
+	int err;
+
+	if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd())
+		return crypto_chacha20_crypt(req);
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	crypto_chacha20_init(state, ctx, walk.iv);
+
+	kernel_neon_begin();
+	while (walk.nbytes > 0) {
+		unsigned int nbytes = walk.nbytes;
+
+		if (nbytes < walk.total)
+			nbytes = round_down(nbytes, walk.stride);
+
+		chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
+				nbytes);
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static struct skcipher_alg alg = {
+	.base.cra_name		= "chacha20",
+	.base.cra_driver_name	= "chacha20-neon",
+	.base.cra_priority	= 300,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
+	.base.cra_module	= THIS_MODULE,
+
+	.min_keysize		= CHACHA20_KEY_SIZE,
+	.max_keysize		= CHACHA20_KEY_SIZE,
+	.ivsize			= CHACHA20_IV_SIZE,
+	.chunksize		= CHACHA20_BLOCK_SIZE,
+	.walksize		= 4 * CHACHA20_BLOCK_SIZE,
+	.setkey			= crypto_chacha20_setkey,
+	.encrypt		= chacha20_neon,
+	.decrypt		= chacha20_neon,
+};
+
+static int __init chacha20_simd_mod_init(void)
+{
+	if (!(elf_hwcap & HWCAP_NEON))
+		return -ENODEV;
+
+	return crypto_register_skcipher(&alg);
+}
+
+static void __exit chacha20_simd_mod_fini(void)
+{
+	crypto_unregister_skcipher(&alg);
+}
+
+module_init(chacha20_simd_mod_init);
+module_exit(chacha20_simd_mod_fini);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");
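
Since the driver registers an ordinary skcipher, callers reach it through the
generic crypto API and the core selects the NEON variant by priority. A hedged
usage sketch (error handling trimmed; a synchronous backend is assumed, an
async one would additionally need a completion):

	#include <crypto/chacha20.h>
	#include <crypto/skcipher.h>
	#include <linux/scatterlist.h>

	static int chacha20_demo(const u8 *key, u8 *iv, u8 *buf, unsigned int len)
	{
		struct crypto_skcipher *tfm;
		struct skcipher_request *req;
		struct scatterlist sg;
		int ret;

		tfm = crypto_alloc_skcipher("chacha20", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		ret = crypto_skcipher_setkey(tfm, key, CHACHA20_KEY_SIZE);
		if (ret)
			goto out_tfm;

		req = skcipher_request_alloc(tfm, GFP_KERNEL);
		if (!req) {
			ret = -ENOMEM;
			goto out_tfm;
		}

		sg_init_one(&sg, buf, len);
		skcipher_request_set_crypt(req, &sg, &sg, len, iv);
		ret = crypto_skcipher_encrypt(req);	/* in-place */

		skcipher_request_free(req);
	out_tfm:
		crypto_free_skcipher(tfm);
		return ret;
	}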

+ 0 - 1
arch/arm64/configs/defconfig

@@ -516,4 +516,3 @@ CONFIG_CRYPTO_GHASH_ARM64_CE=y
 CONFIG_CRYPTO_AES_ARM64_CE_CCM=y
 CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
 # CONFIG_CRYPTO_AES_ARM64_NEON_BLK is not set
-CONFIG_CRYPTO_CRC32_ARM64=y

+ 18 - 6
arch/arm64/crypto/Kconfig

@@ -37,10 +37,14 @@ config CRYPTO_CRCT10DIF_ARM64_CE
 	select CRYPTO_HASH
 
 config CRYPTO_CRC32_ARM64_CE
-	tristate "CRC32 and CRC32C digest algorithms using PMULL instructions"
-	depends on KERNEL_MODE_NEON && CRC32
+	tristate "CRC32 and CRC32C digest algorithms using ARMv8 extensions"
+	depends on CRC32
 	select CRYPTO_HASH
 
+config CRYPTO_AES_ARM64
+	tristate "AES core cipher using scalar instructions"
+	select CRYPTO_AES
+
 config CRYPTO_AES_ARM64_CE
 	tristate "AES core cipher using ARMv8 Crypto Extensions"
 	depends on ARM64 && KERNEL_MODE_NEON
@@ -67,9 +71,17 @@ config CRYPTO_AES_ARM64_NEON_BLK
 	select CRYPTO_AES
 	select CRYPTO_SIMD
 
-config CRYPTO_CRC32_ARM64
-	tristate "CRC32 and CRC32C using optional ARMv8 instructions"
-	depends on ARM64
-	select CRYPTO_HASH
+config CRYPTO_CHACHA20_NEON
+	tristate "NEON accelerated ChaCha20 symmetric cipher"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_CHACHA20
+
+config CRYPTO_AES_ARM64_BS
+	tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_AES_ARM64_NEON_BLK
+	select CRYPTO_SIMD
 
 endif

+ 9 - 4
arch/arm64/crypto/Makefile

@@ -41,15 +41,20 @@ sha256-arm64-y := sha256-glue.o sha256-core.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
 sha512-arm64-y := sha512-glue.o sha512-core.o
 
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
+chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
+
+obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
+aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
+
+obj-$(CONFIG_CRYPTO_AES_ARM64_BS) += aes-neon-bs.o
+aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
+
 AFLAGS_aes-ce.o		:= -DINTERLEAVE=4
 AFLAGS_aes-neon.o	:= -DINTERLEAVE=4
 
 CFLAGS_aes-glue-ce.o	:= -DUSE_V8_CRYPTO_EXTENSIONS
 
-obj-$(CONFIG_CRYPTO_CRC32_ARM64) += crc32-arm64.o
-
-CFLAGS_crc32-arm64.o	:= -mcpu=generic+crc
-
 $(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
 	$(call if_changed_rule,cc_o_c)
 

+ 0 - 1
arch/arm64/crypto/aes-ce-ccm-glue.c

@@ -258,7 +258,6 @@ static struct aead_alg ccm_aes_alg = {
 		.cra_priority		= 300,
 		.cra_blocksize		= 1,
 		.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
-		.cra_alignmask		= 7,
 		.cra_module		= THIS_MODULE,
 	},
 	.ivsize		= AES_BLOCK_SIZE,

+ 110 - 0
arch/arm64/crypto/aes-cipher-core.S

@@ -0,0 +1,110 @@
+/*
+ * Scalar AES core transform
+ *
+ * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+
+	rk		.req	x0
+	out		.req	x1
+	in		.req	x2
+	rounds		.req	x3
+	tt		.req	x4
+	lt		.req	x2
+
+	.macro		__pair, enc, reg0, reg1, in0, in1e, in1d, shift
+	ubfx		\reg0, \in0, #\shift, #8
+	.if		\enc
+	ubfx		\reg1, \in1e, #\shift, #8
+	.else
+	ubfx		\reg1, \in1d, #\shift, #8
+	.endif
+	ldr		\reg0, [tt, \reg0, uxtw #2]
+	ldr		\reg1, [tt, \reg1, uxtw #2]
+	.endm
+
+	.macro		__hround, out0, out1, in0, in1, in2, in3, t0, t1, enc
+	ldp		\out0, \out1, [rk], #8
+
+	__pair		\enc, w13, w14, \in0, \in1, \in3, 0
+	__pair		\enc, w15, w16, \in1, \in2, \in0, 8
+	__pair		\enc, w17, w18, \in2, \in3, \in1, 16
+	__pair		\enc, \t0, \t1, \in3, \in0, \in2, 24
+
+	eor		\out0, \out0, w13
+	eor		\out1, \out1, w14
+	eor		\out0, \out0, w15, ror #24
+	eor		\out1, \out1, w16, ror #24
+	eor		\out0, \out0, w17, ror #16
+	eor		\out1, \out1, w18, ror #16
+	eor		\out0, \out0, \t0, ror #8
+	eor		\out1, \out1, \t1, ror #8
+	.endm
+
+	.macro		fround, out0, out1, out2, out3, in0, in1, in2, in3
+	__hround	\out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1
+	__hround	\out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1
+	.endm
+
+	.macro		iround, out0, out1, out2, out3, in0, in1, in2, in3
+	__hround	\out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0
+	__hround	\out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0
+	.endm
+
+	.macro		do_crypt, round, ttab, ltab
+	ldp		w5, w6, [in]
+	ldp		w7, w8, [in, #8]
+	ldp		w9, w10, [rk], #16
+	ldp		w11, w12, [rk, #-8]
+
+CPU_BE(	rev		w5, w5		)
+CPU_BE(	rev		w6, w6		)
+CPU_BE(	rev		w7, w7		)
+CPU_BE(	rev		w8, w8		)
+
+	eor		w5, w5, w9
+	eor		w6, w6, w10
+	eor		w7, w7, w11
+	eor		w8, w8, w12
+
+	adr_l		tt, \ttab
+	adr_l		lt, \ltab
+
+	tbnz		rounds, #1, 1f
+
+0:	\round		w9, w10, w11, w12, w5, w6, w7, w8
+	\round		w5, w6, w7, w8, w9, w10, w11, w12
+
+1:	subs		rounds, rounds, #4
+	\round		w9, w10, w11, w12, w5, w6, w7, w8
+	csel		tt, tt, lt, hi
+	\round		w5, w6, w7, w8, w9, w10, w11, w12
+	b.hi		0b
+
+CPU_BE(	rev		w5, w5		)
+CPU_BE(	rev		w6, w6		)
+CPU_BE(	rev		w7, w7		)
+CPU_BE(	rev		w8, w8		)
+
+	stp		w5, w6, [out]
+	stp		w7, w8, [out, #8]
+	ret
+	.endm
+
+	.align		5
+ENTRY(__aes_arm64_encrypt)
+	do_crypt	fround, crypto_ft_tab, crypto_fl_tab
+ENDPROC(__aes_arm64_encrypt)
+
+	.align		5
+ENTRY(__aes_arm64_decrypt)
+	do_crypt	iround, crypto_it_tab, crypto_il_tab
+ENDPROC(__aes_arm64_decrypt)
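
The __hround macro above is the single-table variant of the classic T-table
round: one lookup table serves all four byte positions, with the other three
contributions derived by rotation. Roughly, per output column (illustrative C
only; tt stands in for crypto_ft_tab, and the c0..c3 byte selection follows the
ShiftRows pattern encoded in the macro arguments):

	/* same as ror32() from <linux/bitops.h>, shown for self-containment */
	static u32 ror32(u32 v, int n)
	{
		return (v >> n) | (v << (32 - n));
	}

	/* one column of a forward round; rk is the fetched round-key word */
	static u32 aes_fround_col(u32 rk, u32 c0, u32 c1, u32 c2, u32 c3,
				  const u32 *tt)
	{
		return rk ^ tt[c0 & 0xff]
			  ^ ror32(tt[(c1 >> 8) & 0xff], 24)
			  ^ ror32(tt[(c2 >> 16) & 0xff], 16)
			  ^ ror32(tt[(c3 >> 24) & 0xff], 8);
	}

The do_crypt macro switches from the full table to the last-round table
(crypto_fl_tab/crypto_il_tab) via the csel, so the final round reuses the same
code path.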

+ 69 - 0
arch/arm64/crypto/aes-cipher-glue.c

@@ -0,0 +1,69 @@
+/*
+ * Scalar AES core transform
+ *
+ * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <crypto/aes.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+EXPORT_SYMBOL(__aes_arm64_encrypt);
+
+asmlinkage void __aes_arm64_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
+EXPORT_SYMBOL(__aes_arm64_decrypt);
+
+static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	int rounds = 6 + ctx->key_length / 4;
+
+	__aes_arm64_encrypt(ctx->key_enc, out, in, rounds);
+}
+
+static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	int rounds = 6 + ctx->key_length / 4;
+
+	__aes_arm64_decrypt(ctx->key_dec, out, in, rounds);
+}
+
+static struct crypto_alg aes_alg = {
+	.cra_name			= "aes",
+	.cra_driver_name		= "aes-arm64",
+	.cra_priority			= 200,
+	.cra_flags			= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize			= AES_BLOCK_SIZE,
+	.cra_ctxsize			= sizeof(struct crypto_aes_ctx),
+	.cra_module			= THIS_MODULE,
+
+	.cra_cipher.cia_min_keysize	= AES_MIN_KEY_SIZE,
+	.cra_cipher.cia_max_keysize	= AES_MAX_KEY_SIZE,
+	.cra_cipher.cia_setkey		= crypto_aes_set_key,
+	.cra_cipher.cia_encrypt		= aes_encrypt,
+	.cra_cipher.cia_decrypt		= aes_decrypt
+};
+
+static int __init aes_init(void)
+{
+	return crypto_register_alg(&aes_alg);
+}
+
+static void __exit aes_fini(void)
+{
+	crypto_unregister_alg(&aes_alg);
+}
+
+module_init(aes_init);
+module_exit(aes_fini);
+
+MODULE_DESCRIPTION("Scalar AES cipher for arm64");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("aes");

+ 269 - 12
arch/arm64/crypto/aes-glue.c

@@ -1,7 +1,7 @@
 /*
  * linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES
  *
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -11,6 +11,7 @@
 #include <asm/neon.h>
 #include <asm/hwcap.h>
 #include <crypto/aes.h>
+#include <crypto/internal/hash.h>
 #include <crypto/internal/simd.h>
 #include <crypto/internal/skcipher.h>
 #include <linux/module.h>
@@ -31,6 +32,7 @@
 #define aes_ctr_encrypt		ce_aes_ctr_encrypt
 #define aes_xts_encrypt		ce_aes_xts_encrypt
 #define aes_xts_decrypt		ce_aes_xts_decrypt
+#define aes_mac_update		ce_aes_mac_update
 MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
 #else
 #define MODE			"neon"
@@ -44,11 +46,15 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
 #define aes_ctr_encrypt		neon_aes_ctr_encrypt
 #define aes_xts_encrypt		neon_aes_xts_encrypt
 #define aes_xts_decrypt		neon_aes_xts_decrypt
+#define aes_mac_update		neon_aes_mac_update
 MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON");
 MODULE_ALIAS_CRYPTO("ecb(aes)");
 MODULE_ALIAS_CRYPTO("cbc(aes)");
 MODULE_ALIAS_CRYPTO("ctr(aes)");
 MODULE_ALIAS_CRYPTO("xts(aes)");
+MODULE_ALIAS_CRYPTO("cmac(aes)");
+MODULE_ALIAS_CRYPTO("xcbc(aes)");
+MODULE_ALIAS_CRYPTO("cbcmac(aes)");
 #endif
 
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
@@ -75,11 +81,25 @@ asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[],
 				int rounds, int blocks, u8 const rk2[], u8 iv[],
 				int first);
 
+asmlinkage void aes_mac_update(u8 const in[], u32 const rk[], int rounds,
+			       int blocks, u8 dg[], int enc_before,
+			       int enc_after);
+
 struct crypto_aes_xts_ctx {
 	struct crypto_aes_ctx key1;
 	struct crypto_aes_ctx __aligned(8) key2;
 };
 
+struct mac_tfm_ctx {
+	struct crypto_aes_ctx key;
+	u8 __aligned(8) consts[];
+};
+
+struct mac_desc_ctx {
+	unsigned int len;
+	u8 dg[AES_BLOCK_SIZE];
+};
+
 static int skcipher_aes_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
 			       unsigned int key_len)
 {
@@ -215,14 +235,15 @@ static int ctr_encrypt(struct skcipher_request *req)
 		u8 *tsrc = walk.src.virt.addr;
 
 		/*
-		 * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need
-		 * to tell aes_ctr_encrypt() to only read half a block.
+		 * Tell aes_ctr_encrypt() to process a tail block.
 		 */
-		blocks = (nbytes <= 8) ? -1 : 1;
+		blocks = -1;
 
-		aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds,
+		aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, rounds,
 				blocks, walk.iv, first);
-		memcpy(tdst, tail, nbytes);
+		if (tdst != tsrc)
+			memcpy(tdst, tsrc, nbytes);
+		crypto_xor(tdst, tail, nbytes);
 		err = skcipher_walk_done(&walk, 0);
 	}
 	kernel_neon_end();
@@ -282,7 +303,6 @@ static struct skcipher_alg aes_algs[] = { {
 		.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.cra_blocksize		= AES_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
-		.cra_alignmask		= 7,
 		.cra_module		= THIS_MODULE,
 	},
 	.min_keysize	= AES_MIN_KEY_SIZE,
@@ -298,7 +318,6 @@ static struct skcipher_alg aes_algs[] = { {
 		.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.cra_blocksize		= AES_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
-		.cra_alignmask		= 7,
 		.cra_module		= THIS_MODULE,
 	},
 	.min_keysize	= AES_MIN_KEY_SIZE,
@@ -315,7 +334,22 @@ static struct skcipher_alg aes_algs[] = { {
 		.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.cra_blocksize		= 1,
 		.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
-		.cra_alignmask		= 7,
+		.cra_module		= THIS_MODULE,
+	},
+	.min_keysize	= AES_MIN_KEY_SIZE,
+	.max_keysize	= AES_MAX_KEY_SIZE,
+	.ivsize		= AES_BLOCK_SIZE,
+	.chunksize	= AES_BLOCK_SIZE,
+	.setkey		= skcipher_aes_setkey,
+	.encrypt	= ctr_encrypt,
+	.decrypt	= ctr_encrypt,
+}, {
+	.base = {
+		.cra_name		= "ctr(aes)",
+		.cra_driver_name	= "ctr-aes-" MODE,
+		.cra_priority		= PRIO - 1,
+		.cra_blocksize		= 1,
+		.cra_ctxsize		= sizeof(struct crypto_aes_ctx),
 		.cra_module		= THIS_MODULE,
 	},
 	.min_keysize	= AES_MIN_KEY_SIZE,
@@ -333,7 +367,6 @@ static struct skcipher_alg aes_algs[] = { {
 		.cra_flags		= CRYPTO_ALG_INTERNAL,
 		.cra_blocksize		= AES_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct crypto_aes_xts_ctx),
-		.cra_alignmask		= 7,
 		.cra_module		= THIS_MODULE,
 	},
 	.min_keysize	= 2 * AES_MIN_KEY_SIZE,
@@ -344,15 +377,228 @@ static struct skcipher_alg aes_algs[] = { {
 	.decrypt	= xts_decrypt,
 } };
 
+static int cbcmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
+			 unsigned int key_len)
+{
+	struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
+	int err;
+
+	err = aes_expandkey(&ctx->key, in_key, key_len);
+	if (err)
+		crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+
+	return err;
+}
+
+static void cmac_gf128_mul_by_x(be128 *y, const be128 *x)
+{
+	u64 a = be64_to_cpu(x->a);
+	u64 b = be64_to_cpu(x->b);
+
+	y->a = cpu_to_be64((a << 1) | (b >> 63));
+	y->b = cpu_to_be64((b << 1) ^ ((a >> 63) ? 0x87 : 0));
+}
+
+static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
+		       unsigned int key_len)
+{
+	struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
+	be128 *consts = (be128 *)ctx->consts;
+	u8 *rk = (u8 *)ctx->key.key_enc;
+	int rounds = 6 + key_len / 4;
+	int err;
+
+	err = cbcmac_setkey(tfm, in_key, key_len);
+	if (err)
+		return err;
+
+	/* encrypt the zero vector */
+	kernel_neon_begin();
+	aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, rk, rounds, 1, 1);
+	kernel_neon_end();
+
+	cmac_gf128_mul_by_x(consts, consts);
+	cmac_gf128_mul_by_x(consts + 1, consts);
+
+	return 0;
+}
+
+static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key,
+		       unsigned int key_len)
+{
+	static u8 const ks[3][AES_BLOCK_SIZE] = {
+		{ [0 ... AES_BLOCK_SIZE - 1] = 0x1 },
+		{ [0 ... AES_BLOCK_SIZE - 1] = 0x2 },
+		{ [0 ... AES_BLOCK_SIZE - 1] = 0x3 },
+	};
+
+	struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
+	u8 *rk = (u8 *)ctx->key.key_enc;
+	int rounds = 6 + key_len / 4;
+	u8 key[AES_BLOCK_SIZE];
+	int err;
+
+	err = cbcmac_setkey(tfm, in_key, key_len);
+	if (err)
+		return err;
+
+	kernel_neon_begin();
+	aes_ecb_encrypt(key, ks[0], rk, rounds, 1, 1);
+	aes_ecb_encrypt(ctx->consts, ks[1], rk, rounds, 2, 0);
+	kernel_neon_end();
+
+	return cbcmac_setkey(tfm, key, sizeof(key));
+}
+
+static int mac_init(struct shash_desc *desc)
+{
+	struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	memset(ctx->dg, 0, AES_BLOCK_SIZE);
+	ctx->len = 0;
+
+	return 0;
+}
+
+static int mac_update(struct shash_desc *desc, const u8 *p, unsigned int len)
+{
+	struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+	struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
+	int rounds = 6 + tctx->key.key_length / 4;
+
+	while (len > 0) {
+		unsigned int l;
+
+		if ((ctx->len % AES_BLOCK_SIZE) == 0 &&
+		    (ctx->len + len) > AES_BLOCK_SIZE) {
+
+			int blocks = len / AES_BLOCK_SIZE;
+
+			len %= AES_BLOCK_SIZE;
+
+			kernel_neon_begin();
+			aes_mac_update(p, tctx->key.key_enc, rounds, blocks,
+				       ctx->dg, (ctx->len != 0), (len != 0));
+			kernel_neon_end();
+
+			p += blocks * AES_BLOCK_SIZE;
+
+			if (!len) {
+				ctx->len = AES_BLOCK_SIZE;
+				break;
+			}
+			ctx->len = 0;
+		}
+
+		l = min(len, AES_BLOCK_SIZE - ctx->len);
+
+		if (l <= AES_BLOCK_SIZE) {
+			crypto_xor(ctx->dg + ctx->len, p, l);
+			ctx->len += l;
+			len -= l;
+			p += l;
+		}
+	}
+
+	return 0;
+}
+
+static int cbcmac_final(struct shash_desc *desc, u8 *out)
+{
+	struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+	struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
+	int rounds = 6 + tctx->key.key_length / 4;
+
+	kernel_neon_begin();
+	aes_mac_update(NULL, tctx->key.key_enc, rounds, 0, ctx->dg, 1, 0);
+	kernel_neon_end();
+
+	memcpy(out, ctx->dg, AES_BLOCK_SIZE);
+
+	return 0;
+}
+
+static int cmac_final(struct shash_desc *desc, u8 *out)
+{
+	struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+	struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
+	int rounds = 6 + tctx->key.key_length / 4;
+	u8 *consts = tctx->consts;
+
+	if (ctx->len != AES_BLOCK_SIZE) {
+		ctx->dg[ctx->len] ^= 0x80;
+		consts += AES_BLOCK_SIZE;
+	}
+
+	kernel_neon_begin();
+	aes_mac_update(consts, tctx->key.key_enc, rounds, 1, ctx->dg, 0, 1);
+	kernel_neon_end();
+
+	memcpy(out, ctx->dg, AES_BLOCK_SIZE);
+
+	return 0;
+}
+
+static struct shash_alg mac_algs[] = { {
+	.base.cra_name		= "cmac(aes)",
+	.base.cra_driver_name	= "cmac-aes-" MODE,
+	.base.cra_priority	= PRIO,
+	.base.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct mac_tfm_ctx) +
+				  2 * AES_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+
+	.digestsize		= AES_BLOCK_SIZE,
+	.init			= mac_init,
+	.update			= mac_update,
+	.final			= cmac_final,
+	.setkey			= cmac_setkey,
+	.descsize		= sizeof(struct mac_desc_ctx),
+}, {
+	.base.cra_name		= "xcbc(aes)",
+	.base.cra_driver_name	= "xcbc-aes-" MODE,
+	.base.cra_priority	= PRIO,
+	.base.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct mac_tfm_ctx) +
+				  2 * AES_BLOCK_SIZE,
+	.base.cra_module	= THIS_MODULE,
+
+	.digestsize		= AES_BLOCK_SIZE,
+	.init			= mac_init,
+	.update			= mac_update,
+	.final			= cmac_final,
+	.setkey			= xcbc_setkey,
+	.descsize		= sizeof(struct mac_desc_ctx),
+}, {
+	.base.cra_name		= "cbcmac(aes)",
+	.base.cra_driver_name	= "cbcmac-aes-" MODE,
+	.base.cra_priority	= PRIO,
+	.base.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct mac_tfm_ctx),
+	.base.cra_module	= THIS_MODULE,
+
+	.digestsize		= AES_BLOCK_SIZE,
+	.init			= mac_init,
+	.update			= mac_update,
+	.final			= cbcmac_final,
+	.setkey			= cbcmac_setkey,
+	.descsize		= sizeof(struct mac_desc_ctx),
+} };
+
 static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
 
 static void aes_exit(void)
 {
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(aes_simd_algs) && aes_simd_algs[i]; i++)
-		simd_skcipher_free(aes_simd_algs[i]);
+	for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++)
+		if (aes_simd_algs[i])
+			simd_skcipher_free(aes_simd_algs[i]);
 
+	crypto_unregister_shashes(mac_algs, ARRAY_SIZE(mac_algs));
 	crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
 }
 
@@ -369,7 +615,14 @@ static int __init aes_init(void)
 	if (err)
 		return err;
 
+	err = crypto_register_shashes(mac_algs, ARRAY_SIZE(mac_algs));
+	if (err)
+		goto unregister_ciphers;
+
 	for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
+		if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
+			continue;
+
 		algname = aes_algs[i].base.cra_name + 2;
 		drvname = aes_algs[i].base.cra_driver_name + 2;
 		basename = aes_algs[i].base.cra_driver_name;
@@ -385,6 +638,8 @@ static int __init aes_init(void)
 
 unregister_simds:
 	aes_exit();
+unregister_ciphers:
+	crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
 	return err;
 }
 
@@ -392,5 +647,7 @@ unregister_simds:
 module_cpu_feature_match(AES, aes_init);
 #else
 module_init(aes_init);
+EXPORT_SYMBOL(neon_aes_ecb_encrypt);
+EXPORT_SYMBOL(neon_aes_cbc_encrypt);
 #endif
 module_exit(aes_exit);
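
The reworked CTR tail path above no longer bounces the data through a
half-block copy: the assembly now deposits one block of raw keystream in
tail, and the (newly alignment-agnostic) crypto_xor() folds it into the
destination. A minimal model of the tail step (illustrative only):

	/* keystream = E_k(counter); nbytes < AES_BLOCK_SIZE */
	static void ctr_tail(u8 *dst, const u8 *src, const u8 *keystream,
			     unsigned int nbytes)
	{
		if (dst != src)
			memcpy(dst, src, nbytes);
		crypto_xor(dst, keystream, nbytes);	/* dst ^= keystream */
	}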

+ 31 - 6
arch/arm64/crypto/aes-modes.S

@@ -1,7 +1,7 @@
 /*
  * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
  *
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -337,7 +337,7 @@ AES_ENTRY(aes_ctr_encrypt)
 
 .Lctrcarrydone:
 	subs		w4, w4, #1
-	bmi		.Lctrhalfblock		/* blocks < 0 means 1/2 block */
+	bmi		.Lctrtailblock		/* blocks <0 means tail block */
 	ld1		{v3.16b}, [x1], #16
 	eor		v3.16b, v0.16b, v3.16b
 	st1		{v3.16b}, [x0], #16
@@ -348,10 +348,8 @@ AES_ENTRY(aes_ctr_encrypt)
 	FRAME_POP
 	ret
 
-.Lctrhalfblock:
-	ld1		{v3.8b}, [x1]
-	eor		v3.8b, v0.8b, v3.8b
-	st1		{v3.8b}, [x0]
+.Lctrtailblock:
+	st1		{v0.16b}, [x0]
 	FRAME_POP
 	ret
 
@@ -527,3 +525,30 @@ AES_ENTRY(aes_xts_decrypt)
 	FRAME_POP
 	ret
 AES_ENDPROC(aes_xts_decrypt)
+
+	/*
+	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
+	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
+	 */
+AES_ENTRY(aes_mac_update)
+	ld1		{v0.16b}, [x4]			/* get dg */
+	enc_prepare	w2, x1, x7
+	cbnz		w5, .Lmacenc
+
+.Lmacloop:
+	cbz		w3, .Lmacout
+	ld1		{v1.16b}, [x0], #16		/* get next pt block */
+	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
+
+	subs		w3, w3, #1
+	csinv		x5, x6, xzr, eq
+	cbz		w5, .Lmacout
+
+.Lmacenc:
+	encrypt_block	v0, w2, x1, x7, w8
+	b		.Lmacloop
+
+.Lmacout:
+	st1		{v0.16b}, [x4]			/* return dg */
+	ret
+AES_ENDPROC(aes_mac_update)
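
The enc_before/enc_after flags let this one routine back CMAC, XCBC and
CBC-MAC alike. A rough C equivalent of the loop (illustrative; the
aes_encrypt_block() helper is hypothetical shorthand for the encrypt_block
macro used above):

	static void mac_update_model(const u8 *in, int blocks, u8 *dg,
				     int enc_before, int enc_after)
	{
		if (enc_before)
			aes_encrypt_block(dg);		/* hypothetical helper */

		while (blocks--) {
			crypto_xor(dg, in, AES_BLOCK_SIZE);
			in += AES_BLOCK_SIZE;
			if (blocks || enc_after)
				aes_encrypt_block(dg);	/* E_k(dg) in place */
		}
	}

The csinv implements the `blocks || enc_after' test without a branch: on the
last block it substitutes enc_after, otherwise an all-ones (true) value.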

+ 100 - 135
arch/arm64/crypto/aes-neon.S

@@ -1,7 +1,7 @@
 /*
  * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
  *
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -17,17 +17,25 @@
 	/* multiply by polynomial 'x' in GF(2^8) */
 	.macro		mul_by_x, out, in, temp, const
 	sshr		\temp, \in, #7
-	add		\out, \in, \in
+	shl		\out, \in, #1
 	and		\temp, \temp, \const
 	eor		\out, \out, \temp
 	.endm
 
+	/* multiply by polynomial 'x^2' in GF(2^8) */
+	.macro		mul_by_x2, out, in, temp, const
+	ushr		\temp, \in, #6
+	shl		\out, \in, #2
+	pmul		\temp, \temp, \const
+	eor		\out, \out, \temp
+	.endm
+
 	/* preload the entire Sbox */
 	.macro		prepare, sbox, shiftrows, temp
 	adr		\temp, \sbox
-	movi		v12.16b, #0x40
+	movi		v12.16b, #0x1b
 	ldr		q13, \shiftrows
-	movi		v14.16b, #0x1b
+	ldr		q14, .Lror32by8
 	ld1		{v16.16b-v19.16b}, [\temp], #64
 	ld1		{v20.16b-v23.16b}, [\temp], #64
 	ld1		{v24.16b-v27.16b}, [\temp], #64
@@ -50,37 +58,31 @@
 
 	/* apply SubBytes transformation using the preloaded Sbox */
 	.macro		sub_bytes, in
-	sub		v9.16b, \in\().16b, v12.16b
+	sub		v9.16b, \in\().16b, v15.16b
 	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
-	sub		v10.16b, v9.16b, v12.16b
+	sub		v10.16b, v9.16b, v15.16b
 	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
-	sub		v11.16b, v10.16b, v12.16b
+	sub		v11.16b, v10.16b, v15.16b
 	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
 	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
 	.endm
 
 	/* apply MixColumns transformation */
-	.macro		mix_columns, in
-	mul_by_x	v10.16b, \in\().16b, v9.16b, v14.16b
-	rev32		v8.8h, \in\().8h
-	eor		\in\().16b, v10.16b, \in\().16b
-	shl		v9.4s, v8.4s, #24
-	shl		v11.4s, \in\().4s, #24
-	sri		v9.4s, v8.4s, #8
-	sri		v11.4s, \in\().4s, #8
-	eor		v9.16b, v9.16b, v8.16b
-	eor		v10.16b, v10.16b, v9.16b
-	eor		\in\().16b, v10.16b, v11.16b
-	.endm
-
+	.macro		mix_columns, in, enc
+	.if		\enc == 0
 	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
-	.macro		inv_mix_columns, in
-	mul_by_x	v11.16b, \in\().16b, v10.16b, v14.16b
-	mul_by_x	v11.16b, v11.16b, v10.16b, v14.16b
-	eor		\in\().16b, \in\().16b, v11.16b
-	rev32		v11.8h, v11.8h
-	eor		\in\().16b, \in\().16b, v11.16b
-	mix_columns	\in
+	mul_by_x2	v8.16b, \in\().16b, v9.16b, v12.16b
+	eor		\in\().16b, \in\().16b, v8.16b
+	rev32		v8.8h, v8.8h
+	eor		\in\().16b, \in\().16b, v8.16b
+	.endif
+
+	mul_by_x	v9.16b, \in\().16b, v8.16b, v12.16b
+	rev32		v8.8h, \in\().8h
+	eor		v8.16b, v8.16b, v9.16b
+	eor		\in\().16b, \in\().16b, v8.16b
+	tbl		\in\().16b, {\in\().16b}, v14.16b
+	eor		\in\().16b, \in\().16b, v8.16b
 	.endm
 
 	.macro		do_block, enc, in, rounds, rk, rkp, i
@@ -88,16 +90,13 @@
 	add		\rkp, \rk, #16
 	mov		\i, \rounds
 1111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
+	movi		v15.16b, #0x40
 	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
 	sub_bytes	\in
-	ld1		{v15.4s}, [\rkp], #16
 	subs		\i, \i, #1
+	ld1		{v15.4s}, [\rkp], #16
 	beq		2222f
-	.if		\enc == 1
-	mix_columns	\in
-	.else
-	inv_mix_columns	\in
-	.endif
+	mix_columns	\in, \enc
 	b		1111b
 2222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
 	.endm
@@ -116,139 +115,114 @@
 	 */
 
 	.macro		sub_bytes_2x, in0, in1
-	sub		v8.16b, \in0\().16b, v12.16b
-	sub		v9.16b, \in1\().16b, v12.16b
+	sub		v8.16b, \in0\().16b, v15.16b
 	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
+	sub		v9.16b, \in1\().16b, v15.16b
 	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
-	sub		v10.16b, v8.16b, v12.16b
-	sub		v11.16b, v9.16b, v12.16b
+	sub		v10.16b, v8.16b, v15.16b
 	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
+	sub		v11.16b, v9.16b, v15.16b
 	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
-	sub		v8.16b, v10.16b, v12.16b
-	sub		v9.16b, v11.16b, v12.16b
+	sub		v8.16b, v10.16b, v15.16b
 	tbx		\in0\().16b, {v24.16b-v27.16b}, v10.16b
+	sub		v9.16b, v11.16b, v15.16b
 	tbx		\in1\().16b, {v24.16b-v27.16b}, v11.16b
 	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
 	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
 	.endm
 
 	.macro		sub_bytes_4x, in0, in1, in2, in3
-	sub		v8.16b, \in0\().16b, v12.16b
+	sub		v8.16b, \in0\().16b, v15.16b
 	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
-	sub		v9.16b, \in1\().16b, v12.16b
+	sub		v9.16b, \in1\().16b, v15.16b
 	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
-	sub		v10.16b, \in2\().16b, v12.16b
+	sub		v10.16b, \in2\().16b, v15.16b
 	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
-	sub		v11.16b, \in3\().16b, v12.16b
+	sub		v11.16b, \in3\().16b, v15.16b
 	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
 	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
 	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
-	sub		v8.16b, v8.16b, v12.16b
+	sub		v8.16b, v8.16b, v15.16b
 	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
-	sub		v9.16b, v9.16b, v12.16b
+	sub		v9.16b, v9.16b, v15.16b
 	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
-	sub		v10.16b, v10.16b, v12.16b
+	sub		v10.16b, v10.16b, v15.16b
 	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
-	sub		v11.16b, v11.16b, v12.16b
+	sub		v11.16b, v11.16b, v15.16b
 	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
-	sub		v8.16b, v8.16b, v12.16b
+	sub		v8.16b, v8.16b, v15.16b
 	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
-	sub		v9.16b, v9.16b, v12.16b
+	sub		v9.16b, v9.16b, v15.16b
 	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
-	sub		v10.16b, v10.16b, v12.16b
+	sub		v10.16b, v10.16b, v15.16b
 	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
-	sub		v11.16b, v11.16b, v12.16b
+	sub		v11.16b, v11.16b, v15.16b
 	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
 	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
 	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
 	.endm
 
 	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
-	sshr		\tmp0\().16b, \in0\().16b,  #7
-	add		\out0\().16b, \in0\().16b,  \in0\().16b
-	sshr		\tmp1\().16b, \in1\().16b,  #7
+	sshr		\tmp0\().16b, \in0\().16b, #7
+	shl		\out0\().16b, \in0\().16b, #1
+	sshr		\tmp1\().16b, \in1\().16b, #7
 	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
-	add		\out1\().16b, \in1\().16b,  \in1\().16b
+	shl		\out1\().16b, \in1\().16b, #1
 	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
 	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
 	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
 	.endm
 
-	.macro		mix_columns_2x, in0, in1
-	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
-	rev32		v10.8h, \in0\().8h
-	rev32		v11.8h, \in1\().8h
-	eor		\in0\().16b, v8.16b, \in0\().16b
-	eor		\in1\().16b, v9.16b, \in1\().16b
-	shl		v12.4s, v10.4s, #24
-	shl		v13.4s, v11.4s, #24
-	eor		v8.16b, v8.16b, v10.16b
-	sri		v12.4s, v10.4s, #8
-	shl		v10.4s, \in0\().4s, #24
-	eor		v9.16b, v9.16b, v11.16b
-	sri		v13.4s, v11.4s, #8
-	shl		v11.4s, \in1\().4s, #24
-	sri		v10.4s, \in0\().4s, #8
-	eor		\in0\().16b, v8.16b, v12.16b
-	sri		v11.4s, \in1\().4s, #8
-	eor		\in1\().16b, v9.16b, v13.16b
-	eor		\in0\().16b, v10.16b, \in0\().16b
-	eor		\in1\().16b, v11.16b, \in1\().16b
+	.macro		mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
+	ushr		\tmp0\().16b, \in0\().16b, #6
+	shl		\out0\().16b, \in0\().16b, #2
+	ushr		\tmp1\().16b, \in1\().16b, #6
+	pmul		\tmp0\().16b, \tmp0\().16b, \const\().16b
+	shl		\out1\().16b, \in1\().16b, #2
+	pmul		\tmp1\().16b, \tmp1\().16b, \const\().16b
+	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
+	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
 	.endm
 
-	.macro		inv_mix_cols_2x, in0, in1
-	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
-	mul_by_x_2x	v8, v9, v8, v9, v10, v11, v14
+	.macro		mix_columns_2x, in0, in1, enc
+	.if		\enc == 0
+	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
+	mul_by_x2_2x	v8, v9, \in0, \in1, v10, v11, v12
 	eor		\in0\().16b, \in0\().16b, v8.16b
-	eor		\in1\().16b, \in1\().16b, v9.16b
 	rev32		v8.8h, v8.8h
-	rev32		v9.8h, v9.8h
-	eor		\in0\().16b, \in0\().16b, v8.16b
-	eor		\in1\().16b, \in1\().16b, v9.16b
-	mix_columns_2x	\in0, \in1
-	.endm
-
-	.macro		inv_mix_cols_4x, in0, in1, in2, in3
-	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
-	mul_by_x_2x	v10, v11, \in2, \in3, v12, v13, v14
-	mul_by_x_2x	v8, v9, v8, v9, v12, v13, v14
-	mul_by_x_2x	v10, v11, v10, v11, v12, v13, v14
-	eor		\in0\().16b, \in0\().16b, v8.16b
 	eor		\in1\().16b, \in1\().16b, v9.16b
-	eor		\in2\().16b, \in2\().16b, v10.16b
-	eor		\in3\().16b, \in3\().16b, v11.16b
-	rev32		v8.8h, v8.8h
 	rev32		v9.8h, v9.8h
-	rev32		v10.8h, v10.8h
-	rev32		v11.8h, v11.8h
 	eor		\in0\().16b, \in0\().16b, v8.16b
 	eor		\in1\().16b, \in1\().16b, v9.16b
-	eor		\in2\().16b, \in2\().16b, v10.16b
-	eor		\in3\().16b, \in3\().16b, v11.16b
-	mix_columns_2x	\in0, \in1
-	mix_columns_2x	\in2, \in3
+	.endif
+
+	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v12
+	rev32		v10.8h, \in0\().8h
+	rev32		v11.8h, \in1\().8h
+	eor		v10.16b, v10.16b, v8.16b
+	eor		v11.16b, v11.16b, v9.16b
+	eor		\in0\().16b, \in0\().16b, v10.16b
+	eor		\in1\().16b, \in1\().16b, v11.16b
+	tbl		\in0\().16b, {\in0\().16b}, v14.16b
+	tbl		\in1\().16b, {\in1\().16b}, v14.16b
+	eor		\in0\().16b, \in0\().16b, v10.16b
+	eor		\in1\().16b, \in1\().16b, v11.16b
 	.endm
 
-	.macro		do_block_2x, enc, in0, in1 rounds, rk, rkp, i
+	.macro		do_block_2x, enc, in0, in1, rounds, rk, rkp, i
 	ld1		{v15.4s}, [\rk]
 	add		\rkp, \rk, #16
 	mov		\i, \rounds
 1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
-	sub_bytes_2x	\in0, \in1
+	movi		v15.16b, #0x40
 	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
-	ld1		{v15.4s}, [\rkp], #16
+	sub_bytes_2x	\in0, \in1
 	subs		\i, \i, #1
+	ld1		{v15.4s}, [\rkp], #16
 	beq		2222f
-	.if		\enc == 1
-	mix_columns_2x	\in0, \in1
-	ldr		q13, .LForward_ShiftRows
-	.else
-	inv_mix_cols_2x	\in0, \in1
-	ldr		q13, .LReverse_ShiftRows
-	.endif
-	movi		v12.16b, #0x40
+	mix_columns_2x	\in0, \in1, \enc
 	b		1111b
 2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
@@ -262,23 +236,17 @@
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
 	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
 	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
-	sub_bytes_4x	\in0, \in1, \in2, \in3
+	movi		v15.16b, #0x40
 	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
-	ld1		{v15.4s}, [\rkp], #16
+	sub_bytes_4x	\in0, \in1, \in2, \in3
 	subs		\i, \i, #1
+	ld1		{v15.4s}, [\rkp], #16
 	beq		2222f
-	.if		\enc == 1
-	mix_columns_2x	\in0, \in1
-	mix_columns_2x	\in2, \in3
-	ldr		q13, .LForward_ShiftRows
-	.else
-	inv_mix_cols_4x	\in0, \in1, \in2, \in3
-	ldr		q13, .LReverse_ShiftRows
-	.endif
-	movi		v12.16b, #0x40
+	mix_columns_2x	\in0, \in1, \enc
+	mix_columns_2x	\in2, \in3, \enc
 	b		1111b
 2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
@@ -305,19 +273,7 @@
 #include "aes-modes.S"
 
 	.text
-	.align		4
-.LForward_ShiftRows:
-CPU_LE(	.byte		0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3	)
-CPU_LE(	.byte		0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb	)
-CPU_BE(	.byte		0xb, 0x6, 0x1, 0xc, 0x7, 0x2, 0xd, 0x8	)
-CPU_BE(	.byte		0x3, 0xe, 0x9, 0x4, 0xf, 0xa, 0x5, 0x0	)
-
-.LReverse_ShiftRows:
-CPU_LE(	.byte		0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb	)
-CPU_LE(	.byte		0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3	)
-CPU_BE(	.byte		0x3, 0x6, 0x9, 0xc, 0xf, 0x2, 0x5, 0x8	)
-CPU_BE(	.byte		0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0	)
-
+	.align		6
 .LForward_Sbox:
 	.byte		0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
 	.byte		0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
@@ -385,3 +341,12 @@ CPU_BE(	.byte		0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0	)
 	.byte		0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
 	.byte		0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
 	.byte		0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+
+.LForward_ShiftRows:
+	.octa		0x0b06010c07020d08030e09040f0a0500
+
+.LReverse_ShiftRows:
+	.octa		0x0306090c0f0205080b0e0104070a0d00
+
+.Lror32by8:
+	.octa		0x0c0f0e0d080b0a090407060500030201
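
The mul_by_x and mul_by_x2 macros added above are GF(2^8) multiplications under
the AES polynomial 0x11b, performed 16 bytes at a time. Per byte they are
equivalent to (illustrative C; the pmul form used by mul_by_x2 folds the two
possible reduction steps into one polynomial multiply by 0x1b):

	static u8 mul_by_x(u8 b)		/* xtime */
	{
		return (b << 1) ^ ((b & 0x80) ? 0x1b : 0);
	}

	static u8 mul_by_x2(u8 b)		/* two xtime steps */
	{
		return mul_by_x(mul_by_x(b));
	}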

+ 972 - 0
arch/arm64/crypto/aes-neonbs-core.S

@@ -0,0 +1,972 @@
+/*
+ * Bit sliced AES using NEON instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * The algorithm implemented here is described in detail by the paper
+ * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
+ * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
+ *
+ * This implementation is based primarily on the OpenSSL implementation
+ * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+
+	rounds		.req	x11
+	bskey		.req	x12
+
+	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
+	eor		\b2, \b2, \b1
+	eor		\b5, \b5, \b6
+	eor		\b3, \b3, \b0
+	eor		\b6, \b6, \b2
+	eor		\b5, \b5, \b0
+	eor		\b6, \b6, \b3
+	eor		\b3, \b3, \b7
+	eor		\b7, \b7, \b5
+	eor		\b3, \b3, \b4
+	eor		\b4, \b4, \b5
+	eor		\b2, \b2, \b7
+	eor		\b3, \b3, \b1
+	eor		\b1, \b1, \b5
+	.endm
+
+	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
+	eor		\b0, \b0, \b6
+	eor		\b1, \b1, \b4
+	eor		\b4, \b4, \b6
+	eor		\b2, \b2, \b0
+	eor		\b6, \b6, \b1
+	eor		\b1, \b1, \b5
+	eor		\b5, \b5, \b3
+	eor		\b3, \b3, \b7
+	eor		\b7, \b7, \b5
+	eor		\b2, \b2, \b5
+	eor		\b4, \b4, \b7
+	.endm
+
+	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
+	eor		\b1, \b1, \b7
+	eor		\b4, \b4, \b7
+	eor		\b7, \b7, \b5
+	eor		\b1, \b1, \b3
+	eor		\b2, \b2, \b5
+	eor		\b3, \b3, \b7
+	eor		\b6, \b6, \b1
+	eor		\b2, \b2, \b0
+	eor		\b5, \b5, \b3
+	eor		\b4, \b4, \b6
+	eor		\b0, \b0, \b6
+	eor		\b1, \b1, \b4
+	.endm
+
+	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
+	eor		\b1, \b1, \b5
+	eor		\b2, \b2, \b7
+	eor		\b3, \b3, \b1
+	eor		\b4, \b4, \b5
+	eor		\b7, \b7, \b5
+	eor		\b3, \b3, \b4
+	eor 		\b5, \b5, \b0
+	eor		\b3, \b3, \b7
+	eor		\b6, \b6, \b2
+	eor		\b2, \b2, \b1
+	eor		\b6, \b6, \b3
+	eor		\b3, \b3, \b0
+	eor		\b5, \b5, \b6
+	.endm
+
+	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
+	eor 		\t0, \y0, \y1
+	and		\t0, \t0, \x0
+	eor		\x0, \x0, \x1
+	and		\t1, \x1, \y0
+	and		\x0, \x0, \y1
+	eor		\x1, \t1, \t0
+	eor		\x0, \x0, \t1
+	.endm
+
+	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
+	eor		\t0, \y0, \y1
+	eor 		\t1, \y2, \y3
+	and		\t0, \t0, \x0
+	and		\t1, \t1, \x2
+	eor		\x0, \x0, \x1
+	eor		\x2, \x2, \x3
+	and		\x1, \x1, \y0
+	and		\x3, \x3, \y2
+	and		\x0, \x0, \y1
+	and		\x2, \x2, \y3
+	eor		\x1, \x1, \x0
+	eor		\x2, \x2, \x3
+	eor		\x0, \x0, \t0
+	eor		\x3, \x3, \t1
+	.endm
+
+	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
+				    y0, y1, y2, y3, t0, t1, t2, t3
+	eor		\t0, \x0, \x2
+	eor		\t1, \x1, \x3
+	mul_gf4  	\x0, \x1, \y0, \y1, \t2, \t3
+	eor		\y0, \y0, \y2
+	eor		\y1, \y1, \y3
+	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
+	eor		\x0, \x0, \t0
+	eor		\x2, \x2, \t0
+	eor		\x1, \x1, \t1
+	eor		\x3, \x3, \t1
+	eor		\t0, \x4, \x6
+	eor		\t1, \x5, \x7
+	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
+	eor		\y0, \y0, \y2
+	eor		\y1, \y1, \y3
+	mul_gf4  	\x4, \x5, \y0, \y1, \t2, \t3
+	eor		\x4, \x4, \t0
+	eor		\x6, \x6, \t0
+	eor		\x5, \x5, \t1
+	eor		\x7, \x7, \t1
+	.endm
+
+	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
+				   t0, t1, t2, t3, s0, s1, s2, s3
+	eor		\t3, \x4, \x6
+	eor		\t0, \x5, \x7
+	eor		\t1, \x1, \x3
+	eor		\s1, \x7, \x6
+	eor		\s0, \x0, \x2
+	eor		\s3, \t3, \t0
+	orr		\t2, \t0, \t1
+	and		\s2, \t3, \s0
+	orr		\t3, \t3, \s0
+	eor		\s0, \s0, \t1
+	and		\t0, \t0, \t1
+	eor		\t1, \x3, \x2
+	and		\s3, \s3, \s0
+	and		\s1, \s1, \t1
+	eor		\t1, \x4, \x5
+	eor		\s0, \x1, \x0
+	eor		\t3, \t3, \s1
+	eor		\t2, \t2, \s1
+	and		\s1, \t1, \s0
+	orr		\t1, \t1, \s0
+	eor		\t3, \t3, \s3
+	eor		\t0, \t0, \s1
+	eor		\t2, \t2, \s2
+	eor		\t1, \t1, \s3
+	eor		\t0, \t0, \s2
+	and		\s0, \x7, \x3
+	eor		\t1, \t1, \s2
+	and		\s1, \x6, \x2
+	and		\s2, \x5, \x1
+	orr		\s3, \x4, \x0
+	eor		\t3, \t3, \s0
+	eor		\t1, \t1, \s2
+	eor		\s0, \t0, \s3
+	eor		\t2, \t2, \s1
+	and		\s2, \t3, \t1
+	eor		\s1, \t2, \s2
+	eor		\s3, \s0, \s2
+	bsl		\s1, \t1, \s0
+	not		\t0, \s0
+	bsl		\s0, \s1, \s3
+	bsl		\t0, \s1, \s3
+	bsl		\s3, \t3, \t2
+	eor		\t3, \t3, \t2
+	and		\s2, \s0, \s3
+	eor		\t1, \t1, \t0
+	eor		\s2, \s2, \t3
+	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
+			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+	.endm
+
+	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
+			      t0, t1, t2, t3, s0, s1, s2, s3
+	in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
+			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
+	inv_gf256	\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
+			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
+			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
+			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
+	out_bs_ch	\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
+			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
+	.endm
+
+	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
+				  t0, t1, t2, t3, s0, s1, s2, s3
+	inv_in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
+			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
+	inv_gf256	\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
+			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
+			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
+			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
+	inv_out_bs_ch	\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
+			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
+	.endm
+
+	.macro		enc_next_rk
+	ldp		q16, q17, [bskey], #128
+	ldp		q18, q19, [bskey, #-96]
+	ldp		q20, q21, [bskey, #-64]
+	ldp		q22, q23, [bskey, #-32]
+	.endm
+
+	.macro		dec_next_rk
+	ldp		q16, q17, [bskey, #-128]!
+	ldp		q18, q19, [bskey, #32]
+	ldp		q20, q21, [bskey, #64]
+	ldp		q22, q23, [bskey, #96]
+	.endm
+
+	.macro		add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
+	eor		\x0\().16b, \x0\().16b, v16.16b
+	eor		\x1\().16b, \x1\().16b, v17.16b
+	eor		\x2\().16b, \x2\().16b, v18.16b
+	eor		\x3\().16b, \x3\().16b, v19.16b
+	eor		\x4\().16b, \x4\().16b, v20.16b
+	eor		\x5\().16b, \x5\().16b, v21.16b
+	eor		\x6\().16b, \x6\().16b, v22.16b
+	eor		\x7\().16b, \x7\().16b, v23.16b
+	.endm
+
+	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
+	tbl		\x0\().16b, {\x0\().16b}, \mask\().16b
+	tbl		\x1\().16b, {\x1\().16b}, \mask\().16b
+	tbl		\x2\().16b, {\x2\().16b}, \mask\().16b
+	tbl		\x3\().16b, {\x3\().16b}, \mask\().16b
+	tbl		\x4\().16b, {\x4\().16b}, \mask\().16b
+	tbl		\x5\().16b, {\x5\().16b}, \mask\().16b
+	tbl		\x6\().16b, {\x6\().16b}, \mask\().16b
+	tbl		\x7\().16b, {\x7\().16b}, \mask\().16b
+	.endm
+
+	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
+				  t0, t1, t2, t3, t4, t5, t6, t7, inv
+	ext		\t0\().16b, \x0\().16b, \x0\().16b, #12
+	ext		\t1\().16b, \x1\().16b, \x1\().16b, #12
+	eor		\x0\().16b, \x0\().16b, \t0\().16b
+	ext		\t2\().16b, \x2\().16b, \x2\().16b, #12
+	eor		\x1\().16b, \x1\().16b, \t1\().16b
+	ext		\t3\().16b, \x3\().16b, \x3\().16b, #12
+	eor		\x2\().16b, \x2\().16b, \t2\().16b
+	ext		\t4\().16b, \x4\().16b, \x4\().16b, #12
+	eor		\x3\().16b, \x3\().16b, \t3\().16b
+	ext		\t5\().16b, \x5\().16b, \x5\().16b, #12
+	eor		\x4\().16b, \x4\().16b, \t4\().16b
+	ext		\t6\().16b, \x6\().16b, \x6\().16b, #12
+	eor		\x5\().16b, \x5\().16b, \t5\().16b
+	ext		\t7\().16b, \x7\().16b, \x7\().16b, #12
+	eor		\x6\().16b, \x6\().16b, \t6\().16b
+	eor		\t1\().16b, \t1\().16b, \x0\().16b
+	eor		\x7\().16b, \x7\().16b, \t7\().16b
+	ext		\x0\().16b, \x0\().16b, \x0\().16b, #8
+	eor		\t2\().16b, \t2\().16b, \x1\().16b
+	eor		\t0\().16b, \t0\().16b, \x7\().16b
+	eor		\t1\().16b, \t1\().16b, \x7\().16b
+	ext		\x1\().16b, \x1\().16b, \x1\().16b, #8
+	eor		\t5\().16b, \t5\().16b, \x4\().16b
+	eor		\x0\().16b, \x0\().16b, \t0\().16b
+	eor		\t6\().16b, \t6\().16b, \x5\().16b
+	eor		\x1\().16b, \x1\().16b, \t1\().16b
+	ext		\t0\().16b, \x4\().16b, \x4\().16b, #8
+	eor		\t4\().16b, \t4\().16b, \x3\().16b
+	ext		\t1\().16b, \x5\().16b, \x5\().16b, #8
+	eor		\t7\().16b, \t7\().16b, \x6\().16b
+	ext		\x4\().16b, \x3\().16b, \x3\().16b, #8
+	eor		\t3\().16b, \t3\().16b, \x2\().16b
+	ext		\x5\().16b, \x7\().16b, \x7\().16b, #8
+	eor		\t4\().16b, \t4\().16b, \x7\().16b
+	ext		\x3\().16b, \x6\().16b, \x6\().16b, #8
+	eor		\t3\().16b, \t3\().16b, \x7\().16b
+	ext		\x6\().16b, \x2\().16b, \x2\().16b, #8
+	eor		\x7\().16b, \t1\().16b, \t5\().16b
+	.ifb		\inv
+	eor		\x2\().16b, \t0\().16b, \t4\().16b
+	eor		\x4\().16b, \x4\().16b, \t3\().16b
+	eor		\x5\().16b, \x5\().16b, \t7\().16b
+	eor		\x3\().16b, \x3\().16b, \t6\().16b
+	eor		\x6\().16b, \x6\().16b, \t2\().16b
+	.else
+	eor		\t3\().16b, \t3\().16b, \x4\().16b
+	eor		\x5\().16b, \x5\().16b, \t7\().16b
+	eor		\x2\().16b, \x3\().16b, \t6\().16b
+	eor		\x3\().16b, \t0\().16b, \t4\().16b
+	eor		\x4\().16b, \x6\().16b, \t2\().16b
+	mov		\x6\().16b, \t3\().16b
+	.endif
+	.endm
+
+	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
+				      t0, t1, t2, t3, t4, t5, t6, t7
+	ext		\t0\().16b, \x0\().16b, \x0\().16b, #8
+	ext		\t6\().16b, \x6\().16b, \x6\().16b, #8
+	ext		\t7\().16b, \x7\().16b, \x7\().16b, #8
+	eor		\t0\().16b, \t0\().16b, \x0\().16b
+	ext		\t1\().16b, \x1\().16b, \x1\().16b, #8
+	eor		\t6\().16b, \t6\().16b, \x6\().16b
+	ext		\t2\().16b, \x2\().16b, \x2\().16b, #8
+	eor		\t7\().16b, \t7\().16b, \x7\().16b
+	ext		\t3\().16b, \x3\().16b, \x3\().16b, #8
+	eor		\t1\().16b, \t1\().16b, \x1\().16b
+	ext		\t4\().16b, \x4\().16b, \x4\().16b, #8
+	eor		\t2\().16b, \t2\().16b, \x2\().16b
+	ext		\t5\().16b, \x5\().16b, \x5\().16b, #8
+	eor		\t3\().16b, \t3\().16b, \x3\().16b
+	eor		\t4\().16b, \t4\().16b, \x4\().16b
+	eor		\t5\().16b, \t5\().16b, \x5\().16b
+	eor		\x0\().16b, \x0\().16b, \t6\().16b
+	eor		\x1\().16b, \x1\().16b, \t6\().16b
+	eor		\x2\().16b, \x2\().16b, \t0\().16b
+	eor		\x4\().16b, \x4\().16b, \t2\().16b
+	eor		\x3\().16b, \x3\().16b, \t1\().16b
+	eor		\x1\().16b, \x1\().16b, \t7\().16b
+	eor		\x2\().16b, \x2\().16b, \t7\().16b
+	eor		\x4\().16b, \x4\().16b, \t6\().16b
+	eor		\x5\().16b, \x5\().16b, \t3\().16b
+	eor		\x3\().16b, \x3\().16b, \t6\().16b
+	eor		\x6\().16b, \x6\().16b, \t4\().16b
+	eor		\x4\().16b, \x4\().16b, \t7\().16b
+	eor		\x5\().16b, \x5\().16b, \t7\().16b
+	eor		\x7\().16b, \x7\().16b, \t5\().16b
+	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
+			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
+	.endm
+
+	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
+	ushr		\t0\().2d, \b0\().2d, #\n
+	ushr		\t1\().2d, \b1\().2d, #\n
+	eor		\t0\().16b, \t0\().16b, \a0\().16b
+	eor		\t1\().16b, \t1\().16b, \a1\().16b
+	and		\t0\().16b, \t0\().16b, \mask\().16b
+	and		\t1\().16b, \t1\().16b, \mask\().16b
+	eor		\a0\().16b, \a0\().16b, \t0\().16b
+	shl		\t0\().2d, \t0\().2d, #\n
+	eor		\a1\().16b, \a1\().16b, \t1\().16b
+	shl		\t1\().2d, \t1\().2d, #\n
+	eor		\b0\().16b, \b0\().16b, \t0\().16b
+	eor		\b1\().16b, \b1\().16b, \t1\().16b
+	.endm
+
+	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
+	movi		\t0\().16b, #0x55
+	movi		\t1\().16b, #0x33
+	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
+	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
+	movi		\t0\().16b, #0x0f
+	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
+	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
+	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
+	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
+	.endm
+
+
+	.align		6
+M0:	.octa		0x0004080c0105090d02060a0e03070b0f
+
+M0SR:	.octa		0x0004080c05090d010a0e02060f03070b
+SR:	.octa		0x0f0e0d0c0a09080b0504070600030201
+SRM0:	.octa		0x01060b0c0207080d0304090e00050a0f
+
+M0ISR:	.octa		0x0004080c0d0105090a0e0206070b0f03
+ISR:	.octa		0x0f0e0d0c080b0a090504070602010003
+ISRM0:	.octa		0x0306090c00070a0d01040b0e0205080f
+
+	/*
+	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
+	 */
+ENTRY(aesbs_convert_key)
+	ld1		{v7.4s}, [x1], #16		// load round 0 key
+	ld1		{v17.4s}, [x1], #16		// load round 1 key
+
+	movi		v8.16b,  #0x01			// bit masks
+	movi		v9.16b,  #0x02
+	movi		v10.16b, #0x04
+	movi		v11.16b, #0x08
+	movi		v12.16b, #0x10
+	movi		v13.16b, #0x20
+	movi		v14.16b, #0x40
+	movi		v15.16b, #0x80
+	ldr		q16, M0
+
+	sub		x2, x2, #1
+	str		q7, [x0], #16		// save round 0 key
+
+.Lkey_loop:
+	tbl		v7.16b, {v17.16b}, v16.16b
+	ld1		{v17.4s}, [x1], #16		// load next round key
+
+	cmtst		v0.16b, v7.16b, v8.16b
+	cmtst		v1.16b, v7.16b, v9.16b
+	cmtst		v2.16b, v7.16b, v10.16b
+	cmtst		v3.16b, v7.16b, v11.16b
+	cmtst		v4.16b, v7.16b, v12.16b
+	cmtst		v5.16b, v7.16b, v13.16b
+	cmtst		v6.16b, v7.16b, v14.16b
+	cmtst		v7.16b, v7.16b, v15.16b
+	not		v0.16b, v0.16b
+	not		v1.16b, v1.16b
+	not		v5.16b, v5.16b
+	not		v6.16b, v6.16b
+
+	subs		x2, x2, #1
+	stp		q0, q1, [x0], #128
+	stp		q2, q3, [x0, #-96]
+	stp		q4, q5, [x0, #-64]
+	stp		q6, q7, [x0, #-32]
+	b.ne		.Lkey_loop
+
+	movi		v7.16b, #0x63			// compose .L63
+	eor		v17.16b, v17.16b, v7.16b
+	str		q17, [x0]
+	ret
+ENDPROC(aesbs_convert_key)
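A user-space sketch of the bit-slicing above: after the tbl byte permutation
(omitted here), the cmtst sequence fans each 128-bit round key out into eight
bit planes, where plane b holds 0xff in byte i exactly when bit b of key byte
i is set. The four "not" instructions then invert planes 0, 1, 5 and 6,
presumably to match the input convention of the bit-sliced S-box. Illustrative
C only; the kernel code does this in NEON registers:

	#include <stdint.h>

	static void bitslice_round_key(uint8_t plane[8][16], const uint8_t rk[16])
	{
		for (int b = 0; b < 8; b++)
			for (int i = 0; i < 16; i++) {
				uint8_t m = (rk[i] >> b) & 1 ? 0xff : 0x00;

				if (b == 0 || b == 1 || b == 5 || b == 6)
					m = ~m;	/* the "not v0/v1/v5/v6" step */
				plane[b][i] = m;
			}
	}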
+
+	.align		4
+aesbs_encrypt8:
+	ldr		q9, [bskey], #16		// round 0 key
+	ldr		q8, M0SR
+	ldr		q24, SR
+
+	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
+	eor		v11.16b, v1.16b, v9.16b
+	tbl		v0.16b, {v10.16b}, v8.16b
+	eor		v12.16b, v2.16b, v9.16b
+	tbl		v1.16b, {v11.16b}, v8.16b
+	eor		v13.16b, v3.16b, v9.16b
+	tbl		v2.16b, {v12.16b}, v8.16b
+	eor		v14.16b, v4.16b, v9.16b
+	tbl		v3.16b, {v13.16b}, v8.16b
+	eor		v15.16b, v5.16b, v9.16b
+	tbl		v4.16b, {v14.16b}, v8.16b
+	eor		v10.16b, v6.16b, v9.16b
+	tbl		v5.16b, {v15.16b}, v8.16b
+	eor		v11.16b, v7.16b, v9.16b
+	tbl		v6.16b, {v10.16b}, v8.16b
+	tbl		v7.16b, {v11.16b}, v8.16b
+
+	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
+
+	sub		rounds, rounds, #1
+	b		.Lenc_sbox
+
+.Lenc_loop:
+	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
+.Lenc_sbox:
+	sbox		v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
+								v13, v14, v15
+	subs		rounds, rounds, #1
+	b.cc		.Lenc_done
+
+	enc_next_rk
+
+	mix_cols	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
+								v13, v14, v15
+
+	add_round_key	v0, v1, v2, v3, v4, v5, v6, v7
+
+	b.ne		.Lenc_loop
+	ldr		q24, SRM0
+	b		.Lenc_loop
+
+.Lenc_done:
+	ldr		q12, [bskey]			// last round key
+
+	bitslice	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11
+
+	eor		v0.16b, v0.16b, v12.16b
+	eor		v1.16b, v1.16b, v12.16b
+	eor		v4.16b, v4.16b, v12.16b
+	eor		v6.16b, v6.16b, v12.16b
+	eor		v3.16b, v3.16b, v12.16b
+	eor		v7.16b, v7.16b, v12.16b
+	eor		v2.16b, v2.16b, v12.16b
+	eor		v5.16b, v5.16b, v12.16b
+	ret
+ENDPROC(aesbs_encrypt8)
+
+	.align		4
+aesbs_decrypt8:
+	lsl		x9, rounds, #7
+	add		bskey, bskey, x9
+
+	ldr		q9, [bskey, #-112]!		// round 0 key
+	ldr		q8, M0ISR
+	ldr		q24, ISR
+
+	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
+	eor		v11.16b, v1.16b, v9.16b
+	tbl		v0.16b, {v10.16b}, v8.16b
+	eor		v12.16b, v2.16b, v9.16b
+	tbl		v1.16b, {v11.16b}, v8.16b
+	eor		v13.16b, v3.16b, v9.16b
+	tbl		v2.16b, {v12.16b}, v8.16b
+	eor		v14.16b, v4.16b, v9.16b
+	tbl		v3.16b, {v13.16b}, v8.16b
+	eor		v15.16b, v5.16b, v9.16b
+	tbl		v4.16b, {v14.16b}, v8.16b
+	eor		v10.16b, v6.16b, v9.16b
+	tbl		v5.16b, {v15.16b}, v8.16b
+	eor		v11.16b, v7.16b, v9.16b
+	tbl		v6.16b, {v10.16b}, v8.16b
+	tbl		v7.16b, {v11.16b}, v8.16b
+
+	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
+
+	sub		rounds, rounds, #1
+	b		.Ldec_sbox
+
+.Ldec_loop:
+	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
+.Ldec_sbox:
+	inv_sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
+								v13, v14, v15
+	subs		rounds, rounds, #1
+	b.cc		.Ldec_done
+
+	dec_next_rk
+
+	add_round_key	v0, v1, v6, v4, v2, v7, v3, v5
+
+	inv_mix_cols	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
+								v13, v14, v15
+
+	b.ne		.Ldec_loop
+	ldr		q24, ISRM0
+	b		.Ldec_loop
+.Ldec_done:
+	ldr		q12, [bskey, #-16]		// last round key
+
+	bitslice	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11
+
+	eor		v0.16b, v0.16b, v12.16b
+	eor		v1.16b, v1.16b, v12.16b
+	eor		v6.16b, v6.16b, v12.16b
+	eor		v4.16b, v4.16b, v12.16b
+	eor		v2.16b, v2.16b, v12.16b
+	eor		v7.16b, v7.16b, v12.16b
+	eor		v3.16b, v3.16b, v12.16b
+	eor		v5.16b, v5.16b, v12.16b
+	ret
+ENDPROC(aesbs_decrypt8)
+
+	/*
+	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks)
+	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks)
+	 */
+	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
+
+99:	mov		x5, #1
+	lsl		x5, x5, x4
+	subs		w4, w4, #8
+	csel		x4, x4, xzr, pl
+	csel		x5, x5, xzr, mi
+
+	ld1		{v0.16b}, [x1], #16
+	tbnz		x5, #1, 0f
+	ld1		{v1.16b}, [x1], #16
+	tbnz		x5, #2, 0f
+	ld1		{v2.16b}, [x1], #16
+	tbnz		x5, #3, 0f
+	ld1		{v3.16b}, [x1], #16
+	tbnz		x5, #4, 0f
+	ld1		{v4.16b}, [x1], #16
+	tbnz		x5, #5, 0f
+	ld1		{v5.16b}, [x1], #16
+	tbnz		x5, #6, 0f
+	ld1		{v6.16b}, [x1], #16
+	tbnz		x5, #7, 0f
+	ld1		{v7.16b}, [x1], #16
+
+0:	mov		bskey, x2
+	mov		rounds, x3
+	bl		\do8
+
+	st1		{\o0\().16b}, [x0], #16
+	tbnz		x5, #1, 1f
+	st1		{\o1\().16b}, [x0], #16
+	tbnz		x5, #2, 1f
+	st1		{\o2\().16b}, [x0], #16
+	tbnz		x5, #3, 1f
+	st1		{\o3\().16b}, [x0], #16
+	tbnz		x5, #4, 1f
+	st1		{\o4\().16b}, [x0], #16
+	tbnz		x5, #5, 1f
+	st1		{\o5\().16b}, [x0], #16
+	tbnz		x5, #6, 1f
+	st1		{\o6\().16b}, [x0], #16
+	tbnz		x5, #7, 1f
+	st1		{\o7\().16b}, [x0], #16
+
+	cbnz		x4, 99b
+
+1:	ldp		x29, x30, [sp], #16
+	ret
+	.endm
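The x5 bit trick at label 99 handles a ragged final batch: x5 = 1 << blocks
when fewer than eight blocks remain, and is zeroed (csel ... mi) for a full
batch, so "tbnz x5, #n" exits the load and store ladders after exactly the
right number of iterations. A scalar model of the same control flow:

	#include <stdio.h>

	static void run_batch(int remaining)
	{
		/* mirrors "mov x5, #1; lsl x5, x5, x4; csel x5, x5, xzr, mi" */
		unsigned long mask = remaining < 8 ? 1UL << remaining : 0;

		for (int n = 0; n < 8; n++) {
			printf("process block %d\n", n);
			if (mask & (1UL << (n + 1)))	/* tbnz x5, #(n+1) */
				break;
		}
	}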
+
+	.align		4
+ENTRY(aesbs_ecb_encrypt)
+	__ecb_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
+ENDPROC(aesbs_ecb_encrypt)
+
+	.align		4
+ENTRY(aesbs_ecb_decrypt)
+	__ecb_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
+ENDPROC(aesbs_ecb_decrypt)
+
+	/*
+	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks, u8 iv[])
+	 */
+	.align		4
+ENTRY(aesbs_cbc_decrypt)
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
+
+99:	mov		x6, #1
+	lsl		x6, x6, x4
+	subs		w4, w4, #8
+	csel		x4, x4, xzr, pl
+	csel		x6, x6, xzr, mi
+
+	ld1		{v0.16b}, [x1], #16
+	mov		v25.16b, v0.16b
+	tbnz		x6, #1, 0f
+	ld1		{v1.16b}, [x1], #16
+	mov		v26.16b, v1.16b
+	tbnz		x6, #2, 0f
+	ld1		{v2.16b}, [x1], #16
+	mov		v27.16b, v2.16b
+	tbnz		x6, #3, 0f
+	ld1		{v3.16b}, [x1], #16
+	mov		v28.16b, v3.16b
+	tbnz		x6, #4, 0f
+	ld1		{v4.16b}, [x1], #16
+	mov		v29.16b, v4.16b
+	tbnz		x6, #5, 0f
+	ld1		{v5.16b}, [x1], #16
+	mov		v30.16b, v5.16b
+	tbnz		x6, #6, 0f
+	ld1		{v6.16b}, [x1], #16
+	mov		v31.16b, v6.16b
+	tbnz		x6, #7, 0f
+	ld1		{v7.16b}, [x1]
+
+0:	mov		bskey, x2
+	mov		rounds, x3
+	bl		aesbs_decrypt8
+
+	ld1		{v24.16b}, [x5]			// load IV
+
+	eor		v1.16b, v1.16b, v25.16b
+	eor		v6.16b, v6.16b, v26.16b
+	eor		v4.16b, v4.16b, v27.16b
+	eor		v2.16b, v2.16b, v28.16b
+	eor		v7.16b, v7.16b, v29.16b
+	eor		v0.16b, v0.16b, v24.16b
+	eor		v3.16b, v3.16b, v30.16b
+	eor		v5.16b, v5.16b, v31.16b
+
+	st1		{v0.16b}, [x0], #16
+	mov		v24.16b, v25.16b
+	tbnz		x6, #1, 1f
+	st1		{v1.16b}, [x0], #16
+	mov		v24.16b, v26.16b
+	tbnz		x6, #2, 1f
+	st1		{v6.16b}, [x0], #16
+	mov		v24.16b, v27.16b
+	tbnz		x6, #3, 1f
+	st1		{v4.16b}, [x0], #16
+	mov		v24.16b, v28.16b
+	tbnz		x6, #4, 1f
+	st1		{v2.16b}, [x0], #16
+	mov		v24.16b, v29.16b
+	tbnz		x6, #5, 1f
+	st1		{v7.16b}, [x0], #16
+	mov		v24.16b, v30.16b
+	tbnz		x6, #6, 1f
+	st1		{v3.16b}, [x0], #16
+	mov		v24.16b, v31.16b
+	tbnz		x6, #7, 1f
+	ld1		{v24.16b}, [x1], #16
+	st1		{v5.16b}, [x0], #16
+1:	st1		{v24.16b}, [x5]			// store IV
+
+	cbnz		x4, 99b
+
+	ldp		x29, x30, [sp], #16
+	ret
+ENDPROC(aesbs_cbc_decrypt)
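In scalar terms, the routine above decrypts up to eight blocks at a time,
XORs each result with the previous ciphertext block (the caller's IV for the
first), and carries the batch's last ciphertext forward as the next IV. A
sketch with aes_decrypt_block() standing in, hypothetically, for one lane of
aesbs_decrypt8:

	#include <stdint.h>
	#include <string.h>

	void aes_decrypt_block(uint8_t out[16], const uint8_t in[16]);	/* hypothetical */

	static void cbc_decrypt_model(uint8_t *dst, const uint8_t *src,
				      int blocks, uint8_t iv[16])
	{
		for (int i = 0; i < blocks; i++) {
			uint8_t buf[16];

			aes_decrypt_block(buf, src + 16 * i);
			for (int j = 0; j < 16; j++)
				dst[16 * i + j] = buf[j] ^ iv[j];
			memcpy(iv, src + 16 * i, 16);	/* ciphertext chains forward */
		}
	}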
+
+	.macro		next_tweak, out, in, const, tmp
+	sshr		\tmp\().2d,  \in\().2d,   #63
+	and		\tmp\().16b, \tmp\().16b, \const\().16b
+	add		\out\().2d,  \in\().2d,   \in\().2d
+	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
+	eor		\out\().16b, \out\().16b, \tmp\().16b
+	.endm
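next_tweak multiplies the tweak by x in GF(2^128) modulo x^128 + x^7 + x^2 +
x + 1: the sshr/and pair turns a carry out of either 64-bit lane into the
constants loaded from .Lxts_mul_x, and the ext routes the high-lane carry
into the reduction. The same operation in C, assuming a little-endian host
as the CPU_LE constant does:

	#include <stdint.h>
	#include <string.h>

	static void xts_next_tweak(uint8_t t[16])
	{
		uint64_t lo, hi, carry;

		memcpy(&lo, t, 8);
		memcpy(&hi, t + 8, 8);
		carry = hi >> 63;			/* bit 127 falls out */
		hi = (hi << 1) | (lo >> 63);
		lo = (lo << 1) ^ (carry * 0x87);	/* reduce by the XTS polynomial */
		memcpy(t, &lo, 8);
		memcpy(t + 8, &hi, 8);
	}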
+
+	.align		4
+.Lxts_mul_x:
+CPU_LE(	.quad		1, 0x87		)
+CPU_BE(	.quad		0x87, 1		)
+
+	/*
+	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks, u8 iv[])
+	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
+	 *		     int blocks, u8 iv[])
+	 */
+__xts_crypt8:
+	mov		x6, #1
+	lsl		x6, x6, x4
+	subs		w4, w4, #8
+	csel		x4, x4, xzr, pl
+	csel		x6, x6, xzr, mi
+
+	ld1		{v0.16b}, [x1], #16
+	next_tweak	v26, v25, v30, v31
+	eor		v0.16b, v0.16b, v25.16b
+	tbnz		x6, #1, 0f
+
+	ld1		{v1.16b}, [x1], #16
+	next_tweak	v27, v26, v30, v31
+	eor		v1.16b, v1.16b, v26.16b
+	tbnz		x6, #2, 0f
+
+	ld1		{v2.16b}, [x1], #16
+	next_tweak	v28, v27, v30, v31
+	eor		v2.16b, v2.16b, v27.16b
+	tbnz		x6, #3, 0f
+
+	ld1		{v3.16b}, [x1], #16
+	next_tweak	v29, v28, v30, v31
+	eor		v3.16b, v3.16b, v28.16b
+	tbnz		x6, #4, 0f
+
+	ld1		{v4.16b}, [x1], #16
+	str		q29, [sp, #16]
+	eor		v4.16b, v4.16b, v29.16b
+	next_tweak	v29, v29, v30, v31
+	tbnz		x6, #5, 0f
+
+	ld1		{v5.16b}, [x1], #16
+	str		q29, [sp, #32]
+	eor		v5.16b, v5.16b, v29.16b
+	next_tweak	v29, v29, v30, v31
+	tbnz		x6, #6, 0f
+
+	ld1		{v6.16b}, [x1], #16
+	str		q29, [sp, #48]
+	eor		v6.16b, v6.16b, v29.16b
+	next_tweak	v29, v29, v30, v31
+	tbnz		x6, #7, 0f
+
+	ld1		{v7.16b}, [x1], #16
+	str		q29, [sp, #64]
+	eor		v7.16b, v7.16b, v29.16b
+	next_tweak	v29, v29, v30, v31
+
+0:	mov		bskey, x2
+	mov		rounds, x3
+	br		x7
+ENDPROC(__xts_crypt8)
+
+	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
+	stp		x29, x30, [sp, #-80]!
+	mov		x29, sp
+
+	ldr		q30, .Lxts_mul_x
+	ld1		{v25.16b}, [x5]
+
+99:	adr		x7, \do8
+	bl		__xts_crypt8
+
+	ldp		q16, q17, [sp, #16]
+	ldp		q18, q19, [sp, #48]
+
+	eor		\o0\().16b, \o0\().16b, v25.16b
+	eor		\o1\().16b, \o1\().16b, v26.16b
+	eor		\o2\().16b, \o2\().16b, v27.16b
+	eor		\o3\().16b, \o3\().16b, v28.16b
+
+	st1		{\o0\().16b}, [x0], #16
+	mov		v25.16b, v26.16b
+	tbnz		x6, #1, 1f
+	st1		{\o1\().16b}, [x0], #16
+	mov		v25.16b, v27.16b
+	tbnz		x6, #2, 1f
+	st1		{\o2\().16b}, [x0], #16
+	mov		v25.16b, v28.16b
+	tbnz		x6, #3, 1f
+	st1		{\o3\().16b}, [x0], #16
+	mov		v25.16b, v29.16b
+	tbnz		x6, #4, 1f
+
+	eor		\o4\().16b, \o4\().16b, v16.16b
+	eor		\o5\().16b, \o5\().16b, v17.16b
+	eor		\o6\().16b, \o6\().16b, v18.16b
+	eor		\o7\().16b, \o7\().16b, v19.16b
+
+	st1		{\o4\().16b}, [x0], #16
+	tbnz		x6, #5, 1f
+	st1		{\o5\().16b}, [x0], #16
+	tbnz		x6, #6, 1f
+	st1		{\o6\().16b}, [x0], #16
+	tbnz		x6, #7, 1f
+	st1		{\o7\().16b}, [x0], #16
+
+	cbnz		x4, 99b
+
+1:	st1		{v25.16b}, [x5]
+	ldp		x29, x30, [sp], #80
+	ret
+	.endm
+
+ENTRY(aesbs_xts_encrypt)
+	__xts_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
+ENDPROC(aesbs_xts_encrypt)
+
+ENTRY(aesbs_xts_decrypt)
+	__xts_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
+ENDPROC(aesbs_xts_decrypt)
+
+	.macro		next_ctr, v
+	mov		\v\().d[1], x8
+	adds		x8, x8, #1
+	mov		\v\().d[0], x7
+	adc		x7, x7, xzr
+	rev64		\v\().16b, \v\().16b
+	.endm
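aesbs_ctr_encrypt below byte-swaps the IV into two host-order words (x7 high,
x8 low) so the counter can be bumped as a plain 128-bit big-endian integer;
next_ctr then rebuilds a block from the pair and lets rev64 restore the byte
order. The increment in C:

	#include <stdint.h>

	/* "adds x8, x8, #1; adc x7, x7, xzr" */
	static void ctr128_inc(uint64_t *hi, uint64_t *lo)
	{
		if (++*lo == 0)
			++*hi;
	}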
+
+	/*
+	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+	 *		     int rounds, int blocks, u8 iv[], u8 final[])
+	 */
+ENTRY(aesbs_ctr_encrypt)
+	stp		x29, x30, [sp, #-16]!
+	mov		x29, sp
+
+	cmp		x6, #0
+	cset		x10, ne
+	add		x4, x4, x10		// do one extra block if final
+
+	ldp		x7, x8, [x5]
+	ld1		{v0.16b}, [x5]
+CPU_LE(	rev		x7, x7		)
+CPU_LE(	rev		x8, x8		)
+	adds		x8, x8, #1
+	adc		x7, x7, xzr
+
+99:	mov		x9, #1
+	lsl		x9, x9, x4
+	subs		w4, w4, #8
+	csel		x4, x4, xzr, pl
+	csel		x9, x9, xzr, le
+
+	tbnz		x9, #1, 0f
+	next_ctr	v1
+	tbnz		x9, #2, 0f
+	next_ctr	v2
+	tbnz		x9, #3, 0f
+	next_ctr	v3
+	tbnz		x9, #4, 0f
+	next_ctr	v4
+	tbnz		x9, #5, 0f
+	next_ctr	v5
+	tbnz		x9, #6, 0f
+	next_ctr	v6
+	tbnz		x9, #7, 0f
+	next_ctr	v7
+
+0:	mov		bskey, x2
+	mov		rounds, x3
+	bl		aesbs_encrypt8
+
+	lsr		x9, x9, x10		// disregard the extra block
+	tbnz		x9, #0, 0f
+
+	ld1		{v8.16b}, [x1], #16
+	eor		v0.16b, v0.16b, v8.16b
+	st1		{v0.16b}, [x0], #16
+	tbnz		x9, #1, 1f
+
+	ld1		{v9.16b}, [x1], #16
+	eor		v1.16b, v1.16b, v9.16b
+	st1		{v1.16b}, [x0], #16
+	tbnz		x9, #2, 2f
+
+	ld1		{v10.16b}, [x1], #16
+	eor		v4.16b, v4.16b, v10.16b
+	st1		{v4.16b}, [x0], #16
+	tbnz		x9, #3, 3f
+
+	ld1		{v11.16b}, [x1], #16
+	eor		v6.16b, v6.16b, v11.16b
+	st1		{v6.16b}, [x0], #16
+	tbnz		x9, #4, 4f
+
+	ld1		{v12.16b}, [x1], #16
+	eor		v3.16b, v3.16b, v12.16b
+	st1		{v3.16b}, [x0], #16
+	tbnz		x9, #5, 5f
+
+	ld1		{v13.16b}, [x1], #16
+	eor		v7.16b, v7.16b, v13.16b
+	st1		{v7.16b}, [x0], #16
+	tbnz		x9, #6, 6f
+
+	ld1		{v14.16b}, [x1], #16
+	eor		v2.16b, v2.16b, v14.16b
+	st1		{v2.16b}, [x0], #16
+	tbnz		x9, #7, 7f
+
+	ld1		{v15.16b}, [x1], #16
+	eor		v5.16b, v5.16b, v15.16b
+	st1		{v5.16b}, [x0], #16
+
+8:	next_ctr	v0
+	cbnz		x4, 99b
+
+0:	st1		{v0.16b}, [x5]
+	ldp		x29, x30, [sp], #16
+	ret
+
+	/*
+	 * If we are handling the tail of the input (x6 != NULL), return the
+	 * final keystream block to the caller.
+	 */
+1:	cbz		x6, 8b
+	st1		{v1.16b}, [x6]
+	b		8b
+2:	cbz		x6, 8b
+	st1		{v4.16b}, [x6]
+	b		8b
+3:	cbz		x6, 8b
+	st1		{v6.16b}, [x6]
+	b		8b
+4:	cbz		x6, 8b
+	st1		{v3.16b}, [x6]
+	b		8b
+5:	cbz		x6, 8b
+	st1		{v7.16b}, [x6]
+	b		8b
+6:	cbz		x6, 8b
+	st1		{v2.16b}, [x6]
+	b		8b
+7:	cbz		x6, 8b
+	st1		{v5.16b}, [x6]
+	b		8b
+ENDPROC(aesbs_ctr_encrypt)

+ 439 - 0
arch/arm64/crypto/aes-neonbs-glue.c

@@ -0,0 +1,439 @@
+/*
+ * Bit sliced AES using NEON instructions
+ *
+ * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/neon.h>
+#include <crypto/aes.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/xts.h>
+#include <linux/module.h>
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+MODULE_ALIAS_CRYPTO("ecb(aes)");
+MODULE_ALIAS_CRYPTO("cbc(aes)");
+MODULE_ALIAS_CRYPTO("ctr(aes)");
+MODULE_ALIAS_CRYPTO("xts(aes)");
+
+asmlinkage void aesbs_convert_key(u8 out[], u32 const rk[], int rounds);
+
+asmlinkage void aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks);
+asmlinkage void aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks);
+
+asmlinkage void aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[]);
+
+asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[], u8 final[]);
+
+asmlinkage void aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[]);
+asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[]);
+
+/* borrowed from aes-neon-blk.ko */
+asmlinkage void neon_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[],
+				     int rounds, int blocks, int first);
+asmlinkage void neon_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[],
+				     int rounds, int blocks, u8 iv[],
+				     int first);
+
+struct aesbs_ctx {
+	u8	rk[13 * (8 * AES_BLOCK_SIZE) + 32];
+	int	rounds;
+} __aligned(AES_BLOCK_SIZE);
+
+struct aesbs_cbc_ctx {
+	struct aesbs_ctx	key;
+	u32			enc[AES_MAX_KEYLENGTH_U32];
+};
+
+struct aesbs_xts_ctx {
+	struct aesbs_ctx	key;
+	u32			twkey[AES_MAX_KEYLENGTH_U32];
+};
+
+static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+			unsigned int key_len)
+{
+	struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct crypto_aes_ctx rk;
+	int err;
+
+	err = crypto_aes_expand_key(&rk, in_key, key_len);
+	if (err)
+		return err;
+
+	ctx->rounds = 6 + key_len / 4;
+
+	kernel_neon_begin();
+	aesbs_convert_key(ctx->rk, rk.key_enc, ctx->rounds);
+	kernel_neon_end();
+
+	return 0;
+}
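The rounds formula is the standard AES key schedule expressed in 32-bit key
words:

	/* rounds = 6 + key_len / 4:
	 *   AES-128: 6 + 16/4 = 10
	 *   AES-192: 6 + 24/4 = 12
	 *   AES-256: 6 + 32/4 = 14
	 */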
+
+static int __ecb_crypt(struct skcipher_request *req,
+		       void (*fn)(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks))
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	kernel_neon_begin();
+	while (walk.nbytes >= AES_BLOCK_SIZE) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+		if (walk.nbytes < walk.total)
+			blocks = round_down(blocks,
+					    walk.stride / AES_BLOCK_SIZE);
+
+		fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->rk,
+		   ctx->rounds, blocks);
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes - blocks * AES_BLOCK_SIZE);
+	}
+	kernel_neon_end();
+
+	return err;
+}
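The round_down keeps every intermediate walk step a whole number of 8-block
batches (walk.stride here equals the .walksize of 8 * AES_BLOCK_SIZE set in
the algorithm definitions below), so only the final step may hand the asm a
partial batch. A sketch of the trimming, using the kernel's round_down()
semantics for a power-of-two divisor:

	unsigned int blocks = nbytes / 16;	/* AES_BLOCK_SIZE == 16 */

	if (nbytes < total)			/* more data follows */
		blocks &= ~7u;			/* round_down(blocks, 8) */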
+
+static int ecb_encrypt(struct skcipher_request *req)
+{
+	return __ecb_crypt(req, aesbs_ecb_encrypt);
+}
+
+static int ecb_decrypt(struct skcipher_request *req)
+{
+	return __ecb_crypt(req, aesbs_ecb_decrypt);
+}
+
+static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+			    unsigned int key_len)
+{
+	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct crypto_aes_ctx rk;
+	int err;
+
+	err = crypto_aes_expand_key(&rk, in_key, key_len);
+	if (err)
+		return err;
+
+	ctx->key.rounds = 6 + key_len / 4;
+
+	memcpy(ctx->enc, rk.key_enc, sizeof(ctx->enc));
+
+	kernel_neon_begin();
+	aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds);
+	kernel_neon_end();
+
+	return 0;
+}
+
+static int cbc_encrypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	int err, first = 1;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	kernel_neon_begin();
+	while (walk.nbytes >= AES_BLOCK_SIZE) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+		/* fall back to the non-bitsliced NEON implementation */
+		neon_aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				     ctx->enc, ctx->key.rounds, blocks, walk.iv,
+				     first);
+		err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
+		first = 0;
+	}
+	kernel_neon_end();
+	return err;
+}
+
+static int cbc_decrypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	kernel_neon_begin();
+	while (walk.nbytes >= AES_BLOCK_SIZE) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+		if (walk.nbytes < walk.total)
+			blocks = round_down(blocks,
+					    walk.stride / AES_BLOCK_SIZE);
+
+		aesbs_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				  ctx->key.rk, ctx->key.rounds, blocks,
+				  walk.iv);
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes - blocks * AES_BLOCK_SIZE);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static int ctr_encrypt(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	u8 buf[AES_BLOCK_SIZE];
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	kernel_neon_begin();
+	while (walk.nbytes > 0) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+		u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL;
+
+		if (walk.nbytes < walk.total) {
+			blocks = round_down(blocks,
+					    walk.stride / AES_BLOCK_SIZE);
+			final = NULL;
+		}
+
+		aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
+				  ctx->rk, ctx->rounds, blocks, walk.iv, final);
+
+		if (final) {
+			u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
+			u8 *src = walk.src.virt.addr + blocks * AES_BLOCK_SIZE;
+
+			if (dst != src)
+				memcpy(dst, src, walk.total % AES_BLOCK_SIZE);
+			crypto_xor(dst, final, walk.total % AES_BLOCK_SIZE);
+
+			err = skcipher_walk_done(&walk, 0);
+			break;
+		}
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes - blocks * AES_BLOCK_SIZE);
+	}
+	kernel_neon_end();
+
+	return err;
+}
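On the last step of a request whose length is not a multiple of 16, the asm
writes one extra keystream block into 'buf', and only the trailing
walk.total % AES_BLOCK_SIZE bytes of it are consumed. The memcpy plus
crypto_xor() above amount to:

	#include <stdint.h>

	static void ctr_tail(uint8_t *dst, const uint8_t *src,
			     const uint8_t keystream[16], unsigned int tail)
	{
		for (unsigned int i = 0; i < tail; i++)
			dst[i] = src[i] ^ keystream[i];
	}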
+
+static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
+			    unsigned int key_len)
+{
+	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct crypto_aes_ctx rk;
+	int err;
+
+	err = xts_verify_key(tfm, in_key, key_len);
+	if (err)
+		return err;
+
+	key_len /= 2;
+	err = crypto_aes_expand_key(&rk, in_key + key_len, key_len);
+	if (err)
+		return err;
+
+	memcpy(ctx->twkey, rk.key_enc, sizeof(ctx->twkey));
+
+	return aesbs_setkey(tfm, in_key, key_len);
+}
+
+static int __xts_crypt(struct skcipher_request *req,
+		       void (*fn)(u8 out[], u8 const in[], u8 const rk[],
+				  int rounds, int blocks, u8 iv[]))
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	int err;
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	kernel_neon_begin();
+
+	neon_aes_ecb_encrypt(walk.iv, walk.iv, ctx->twkey,
+			     ctx->key.rounds, 1, 1);
+
+	while (walk.nbytes >= AES_BLOCK_SIZE) {
+		unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
+
+		if (walk.nbytes < walk.total)
+			blocks = round_down(blocks,
+					    walk.stride / AES_BLOCK_SIZE);
+
+		fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->key.rk,
+		   ctx->key.rounds, blocks, walk.iv);
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes - blocks * AES_BLOCK_SIZE);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+	return __xts_crypt(req, aesbs_xts_encrypt);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+	return __xts_crypt(req, aesbs_xts_decrypt);
+}
+
+static struct skcipher_alg aes_algs[] = { {
+	.base.cra_name		= "__ecb(aes)",
+	.base.cra_driver_name	= "__ecb-aes-neonbs",
+	.base.cra_priority	= 250,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct aesbs_ctx),
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+
+	.min_keysize		= AES_MIN_KEY_SIZE,
+	.max_keysize		= AES_MAX_KEY_SIZE,
+	.walksize		= 8 * AES_BLOCK_SIZE,
+	.setkey			= aesbs_setkey,
+	.encrypt		= ecb_encrypt,
+	.decrypt		= ecb_decrypt,
+}, {
+	.base.cra_name		= "__cbc(aes)",
+	.base.cra_driver_name	= "__cbc-aes-neonbs",
+	.base.cra_priority	= 250,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct aesbs_cbc_ctx),
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+
+	.min_keysize		= AES_MIN_KEY_SIZE,
+	.max_keysize		= AES_MAX_KEY_SIZE,
+	.walksize		= 8 * AES_BLOCK_SIZE,
+	.ivsize			= AES_BLOCK_SIZE,
+	.setkey			= aesbs_cbc_setkey,
+	.encrypt		= cbc_encrypt,
+	.decrypt		= cbc_decrypt,
+}, {
+	.base.cra_name		= "__ctr(aes)",
+	.base.cra_driver_name	= "__ctr-aes-neonbs",
+	.base.cra_priority	= 250,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct aesbs_ctx),
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+
+	.min_keysize		= AES_MIN_KEY_SIZE,
+	.max_keysize		= AES_MAX_KEY_SIZE,
+	.chunksize		= AES_BLOCK_SIZE,
+	.walksize		= 8 * AES_BLOCK_SIZE,
+	.ivsize			= AES_BLOCK_SIZE,
+	.setkey			= aesbs_setkey,
+	.encrypt		= ctr_encrypt,
+	.decrypt		= ctr_encrypt,
+}, {
+	.base.cra_name		= "ctr(aes)",
+	.base.cra_driver_name	= "ctr-aes-neonbs",
+	.base.cra_priority	= 250 - 1,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct aesbs_ctx),
+	.base.cra_module	= THIS_MODULE,
+
+	.min_keysize		= AES_MIN_KEY_SIZE,
+	.max_keysize		= AES_MAX_KEY_SIZE,
+	.chunksize		= AES_BLOCK_SIZE,
+	.walksize		= 8 * AES_BLOCK_SIZE,
+	.ivsize			= AES_BLOCK_SIZE,
+	.setkey			= aesbs_setkey,
+	.encrypt		= ctr_encrypt,
+	.decrypt		= ctr_encrypt,
+}, {
+	.base.cra_name		= "__xts(aes)",
+	.base.cra_driver_name	= "__xts-aes-neonbs",
+	.base.cra_priority	= 250,
+	.base.cra_blocksize	= AES_BLOCK_SIZE,
+	.base.cra_ctxsize	= sizeof(struct aesbs_xts_ctx),
+	.base.cra_module	= THIS_MODULE,
+	.base.cra_flags		= CRYPTO_ALG_INTERNAL,
+
+	.min_keysize		= 2 * AES_MIN_KEY_SIZE,
+	.max_keysize		= 2 * AES_MAX_KEY_SIZE,
+	.walksize		= 8 * AES_BLOCK_SIZE,
+	.ivsize			= AES_BLOCK_SIZE,
+	.setkey			= aesbs_xts_setkey,
+	.encrypt		= xts_encrypt,
+	.decrypt		= xts_decrypt,
+} };
+
+static struct simd_skcipher_alg *aes_simd_algs[ARRAY_SIZE(aes_algs)];
+
+static void aes_exit(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(aes_simd_algs); i++)
+		if (aes_simd_algs[i])
+			simd_skcipher_free(aes_simd_algs[i]);
+
+	crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
+}
+
+static int __init aes_init(void)
+{
+	struct simd_skcipher_alg *simd;
+	const char *basename;
+	const char *algname;
+	const char *drvname;
+	int err;
+	int i;
+
+	if (!(elf_hwcap & HWCAP_ASIMD))
+		return -ENODEV;
+
+	err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
+	if (err)
+		return err;
+
+	for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
+		if (!(aes_algs[i].base.cra_flags & CRYPTO_ALG_INTERNAL))
+			continue;
+
+		algname = aes_algs[i].base.cra_name + 2;
+		drvname = aes_algs[i].base.cra_driver_name + 2;
+		basename = aes_algs[i].base.cra_driver_name;
+		simd = simd_skcipher_create_compat(algname, drvname, basename);
+		err = PTR_ERR(simd);
+		if (IS_ERR(simd))
+			goto unregister_simds;
+
+		aes_simd_algs[i] = simd;
+	}
+	return 0;
+
+unregister_simds:
+	aes_exit();
+	return err;
+}
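The registration loop leans on the convention that internal algorithms carry
a "__" prefix: skipping two characters yields the public names under which
simd_skcipher_create_compat() builds the wrapper. For the first table entry:

	const char *cra_name        = "__ecb(aes)";
	const char *cra_driver_name = "__ecb-aes-neonbs";

	const char *algname  = cra_name + 2;		/* "ecb(aes)"        */
	const char *drvname  = cra_driver_name + 2;	/* "ecb-aes-neonbs"  */
	const char *basename = cra_driver_name;		/* internal backend  */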
+
+module_init(aes_init);
+module_exit(aes_exit);

+ 450 - 0
arch/arm64/crypto/chacha20-neon-core.S

@@ -0,0 +1,450 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+	.text
+	.align		6
+
+ENTRY(chacha20_block_xor_neon)
+	// x0: Input state matrix, s
+	// x1: 1 data block output, o
+	// x2: 1 data block input, i
+
+	//
+	// This function encrypts one ChaCha20 block by loading the state matrix
+	// in four NEON registers. It performs matrix operations on four words in
+	// parallel, but requires shuffling to rearrange the words after each
+	// round.
+	//
+
+	// x0..3 = s0..3
+	adr		x3, ROT8
+	ld1		{v0.4s-v3.4s}, [x0]
+	ld1		{v8.4s-v11.4s}, [x0]
+	ld1		{v12.4s}, [x3]
+
+	mov		x3, #10
+
+.Ldoubleround:
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	rev32		v3.8h, v3.8h
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #12
+	sri		v1.4s, v4.4s, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	tbl		v3.16b, {v3.16b}, v12.16b
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #7
+	sri		v1.4s, v4.4s, #25
+
+	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	ext		v1.16b, v1.16b, v1.16b, #4
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	ext		v2.16b, v2.16b, v2.16b, #8
+	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	ext		v3.16b, v3.16b, v3.16b, #12
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	rev32		v3.8h, v3.8h
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #12
+	sri		v1.4s, v4.4s, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	add		v0.4s, v0.4s, v1.4s
+	eor		v3.16b, v3.16b, v0.16b
+	tbl		v3.16b, {v3.16b}, v12.16b
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	add		v2.4s, v2.4s, v3.4s
+	eor		v4.16b, v1.16b, v2.16b
+	shl		v1.4s, v4.4s, #7
+	sri		v1.4s, v4.4s, #25
+
+	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	ext		v1.16b, v1.16b, v1.16b, #12
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	ext		v2.16b, v2.16b, v2.16b, #8
+	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	ext		v3.16b, v3.16b, v3.16b, #4
+
+	subs		x3, x3, #1
+	b.ne		.Ldoubleround
+
+	ld1		{v4.16b-v7.16b}, [x2]
+
+	// o0 = i0 ^ (x0 + s0)
+	add		v0.4s, v0.4s, v8.4s
+	eor		v0.16b, v0.16b, v4.16b
+
+	// o1 = i1 ^ (x1 + s1)
+	add		v1.4s, v1.4s, v9.4s
+	eor		v1.16b, v1.16b, v5.16b
+
+	// o2 = i2 ^ (x2 + s2)
+	add		v2.4s, v2.4s, v10.4s
+	eor		v2.16b, v2.16b, v6.16b
+
+	// o3 = i3 ^ (x3 + s3)
+	add		v3.4s, v3.4s, v11.4s
+	eor		v3.16b, v3.16b, v7.16b
+
+	st1		{v0.16b-v3.16b}, [x1]
+
+	ret
+ENDPROC(chacha20_block_xor_neon)
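Each .Ldoubleround pass is the usual pair of four ChaCha20 quarter-rounds,
with the rotations specialized per amount: rotl 16 is a rev32, rotl 8 a tbl
through the ROT8 table, and 12 and 7 a shl/sri pair. The scalar
quarter-round being vectorized:

	#include <stdint.h>

	static inline uint32_t rotl32(uint32_t v, int n)
	{
		return (v << n) | (v >> (32 - n));
	}

	static void quarter_round(uint32_t *a, uint32_t *b,
				  uint32_t *c, uint32_t *d)
	{
		*a += *b; *d = rotl32(*d ^ *a, 16);
		*c += *d; *b = rotl32(*b ^ *c, 12);
		*a += *b; *d = rotl32(*d ^ *a, 8);
		*c += *d; *b = rotl32(*b ^ *c, 7);
	}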
+
+	.align		6
+ENTRY(chacha20_4block_xor_neon)
+	// x0: Input state matrix, s
+	// x1: 4 data blocks output, o
+	// x2: 4 data blocks input, i
+
+	//
+	// This function encrypts four consecutive ChaCha20 blocks by loading
+	// the state matrix in NEON registers four times. The algorithm performs
+	// each operation on the corresponding word of each state matrix, hence
+	// requires no word shuffling. For final XORing step we transpose the
+	// matrix by interleaving 32- and then 64-bit words, which allows us to
+	// do XOR in NEON registers.
+	//
+	adr		x3, CTRINC		// ... and ROT8
+	ld1		{v30.4s-v31.4s}, [x3]
+
+	// x0..15[0-3] = s0..3[0..3]
+	mov		x4, x0
+	ld4r		{ v0.4s- v3.4s}, [x4], #16
+	ld4r		{ v4.4s- v7.4s}, [x4], #16
+	ld4r		{ v8.4s-v11.4s}, [x4], #16
+	ld4r		{v12.4s-v15.4s}, [x4]
+
+	// x12 += counter values 0-3
+	add		v12.4s, v12.4s, v30.4s
+
+	mov		x3, #10
+
+.Ldoubleround4:
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	add		v0.4s, v0.4s, v4.4s
+	add		v1.4s, v1.4s, v5.4s
+	add		v2.4s, v2.4s, v6.4s
+	add		v3.4s, v3.4s, v7.4s
+
+	eor		v12.16b, v12.16b, v0.16b
+	eor		v13.16b, v13.16b, v1.16b
+	eor		v14.16b, v14.16b, v2.16b
+	eor		v15.16b, v15.16b, v3.16b
+
+	rev32		v12.8h, v12.8h
+	rev32		v13.8h, v13.8h
+	rev32		v14.8h, v14.8h
+	rev32		v15.8h, v15.8h
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	add		v8.4s, v8.4s, v12.4s
+	add		v9.4s, v9.4s, v13.4s
+	add		v10.4s, v10.4s, v14.4s
+	add		v11.4s, v11.4s, v15.4s
+
+	eor		v16.16b, v4.16b, v8.16b
+	eor		v17.16b, v5.16b, v9.16b
+	eor		v18.16b, v6.16b, v10.16b
+	eor		v19.16b, v7.16b, v11.16b
+
+	shl		v4.4s, v16.4s, #12
+	shl		v5.4s, v17.4s, #12
+	shl		v6.4s, v18.4s, #12
+	shl		v7.4s, v19.4s, #12
+
+	sri		v4.4s, v16.4s, #20
+	sri		v5.4s, v17.4s, #20
+	sri		v6.4s, v18.4s, #20
+	sri		v7.4s, v19.4s, #20
+
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	add		v0.4s, v0.4s, v4.4s
+	add		v1.4s, v1.4s, v5.4s
+	add		v2.4s, v2.4s, v6.4s
+	add		v3.4s, v3.4s, v7.4s
+
+	eor		v12.16b, v12.16b, v0.16b
+	eor		v13.16b, v13.16b, v1.16b
+	eor		v14.16b, v14.16b, v2.16b
+	eor		v15.16b, v15.16b, v3.16b
+
+	tbl		v12.16b, {v12.16b}, v31.16b
+	tbl		v13.16b, {v13.16b}, v31.16b
+	tbl		v14.16b, {v14.16b}, v31.16b
+	tbl		v15.16b, {v15.16b}, v31.16b
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	add		v8.4s, v8.4s, v12.4s
+	add		v9.4s, v9.4s, v13.4s
+	add		v10.4s, v10.4s, v14.4s
+	add		v11.4s, v11.4s, v15.4s
+
+	eor		v16.16b, v4.16b, v8.16b
+	eor		v17.16b, v5.16b, v9.16b
+	eor		v18.16b, v6.16b, v10.16b
+	eor		v19.16b, v7.16b, v11.16b
+
+	shl		v4.4s, v16.4s, #7
+	shl		v5.4s, v17.4s, #7
+	shl		v6.4s, v18.4s, #7
+	shl		v7.4s, v19.4s, #7
+
+	sri		v4.4s, v16.4s, #25
+	sri		v5.4s, v17.4s, #25
+	sri		v6.4s, v18.4s, #25
+	sri		v7.4s, v19.4s, #25
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	add		v0.4s, v0.4s, v5.4s
+	add		v1.4s, v1.4s, v6.4s
+	add		v2.4s, v2.4s, v7.4s
+	add		v3.4s, v3.4s, v4.4s
+
+	eor		v15.16b, v15.16b, v0.16b
+	eor		v12.16b, v12.16b, v1.16b
+	eor		v13.16b, v13.16b, v2.16b
+	eor		v14.16b, v14.16b, v3.16b
+
+	rev32		v15.8h, v15.8h
+	rev32		v12.8h, v12.8h
+	rev32		v13.8h, v13.8h
+	rev32		v14.8h, v14.8h
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	add		v10.4s, v10.4s, v15.4s
+	add		v11.4s, v11.4s, v12.4s
+	add		v8.4s, v8.4s, v13.4s
+	add		v9.4s, v9.4s, v14.4s
+
+	eor		v16.16b, v5.16b, v10.16b
+	eor		v17.16b, v6.16b, v11.16b
+	eor		v18.16b, v7.16b, v8.16b
+	eor		v19.16b, v4.16b, v9.16b
+
+	shl		v5.4s, v16.4s, #12
+	shl		v6.4s, v17.4s, #12
+	shl		v7.4s, v18.4s, #12
+	shl		v4.4s, v19.4s, #12
+
+	sri		v5.4s, v16.4s, #20
+	sri		v6.4s, v17.4s, #20
+	sri		v7.4s, v18.4s, #20
+	sri		v4.4s, v19.4s, #20
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	add		v0.4s, v0.4s, v5.4s
+	add		v1.4s, v1.4s, v6.4s
+	add		v2.4s, v2.4s, v7.4s
+	add		v3.4s, v3.4s, v4.4s
+
+	eor		v15.16b, v15.16b, v0.16b
+	eor		v12.16b, v12.16b, v1.16b
+	eor		v13.16b, v13.16b, v2.16b
+	eor		v14.16b, v14.16b, v3.16b
+
+	tbl		v15.16b, {v15.16b}, v31.16b
+	tbl		v12.16b, {v12.16b}, v31.16b
+	tbl		v13.16b, {v13.16b}, v31.16b
+	tbl		v14.16b, {v14.16b}, v31.16b
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	add		v10.4s, v10.4s, v15.4s
+	add		v11.4s, v11.4s, v12.4s
+	add		v8.4s, v8.4s, v13.4s
+	add		v9.4s, v9.4s, v14.4s
+
+	eor		v16.16b, v5.16b, v10.16b
+	eor		v17.16b, v6.16b, v11.16b
+	eor		v18.16b, v7.16b, v8.16b
+	eor		v19.16b, v4.16b, v9.16b
+
+	shl		v5.4s, v16.4s, #7
+	shl		v6.4s, v17.4s, #7
+	shl		v7.4s, v18.4s, #7
+	shl		v4.4s, v19.4s, #7
+
+	sri		v5.4s, v16.4s, #25
+	sri		v6.4s, v17.4s, #25
+	sri		v7.4s, v18.4s, #25
+	sri		v4.4s, v19.4s, #25
+
+	subs		x3, x3, #1
+	b.ne		.Ldoubleround4
+
+	ld4r		{v16.4s-v19.4s}, [x0], #16
+	ld4r		{v20.4s-v23.4s}, [x0], #16
+
+	// x12 += counter values 0-3
+	add		v12.4s, v12.4s, v30.4s
+
+	// x0[0-3] += s0[0]
+	// x1[0-3] += s0[1]
+	// x2[0-3] += s0[2]
+	// x3[0-3] += s0[3]
+	add		v0.4s, v0.4s, v16.4s
+	add		v1.4s, v1.4s, v17.4s
+	add		v2.4s, v2.4s, v18.4s
+	add		v3.4s, v3.4s, v19.4s
+
+	ld4r		{v24.4s-v27.4s}, [x0], #16
+	ld4r		{v28.4s-v31.4s}, [x0]
+
+	// x4[0-3] += s1[0]
+	// x5[0-3] += s1[1]
+	// x6[0-3] += s1[2]
+	// x7[0-3] += s1[3]
+	add		v4.4s, v4.4s, v20.4s
+	add		v5.4s, v5.4s, v21.4s
+	add		v6.4s, v6.4s, v22.4s
+	add		v7.4s, v7.4s, v23.4s
+
+	// x8[0-3] += s2[0]
+	// x9[0-3] += s2[1]
+	// x10[0-3] += s2[2]
+	// x11[0-3] += s2[3]
+	add		v8.4s, v8.4s, v24.4s
+	add		v9.4s, v9.4s, v25.4s
+	add		v10.4s, v10.4s, v26.4s
+	add		v11.4s, v11.4s, v27.4s
+
+	// x12[0-3] += s3[0]
+	// x13[0-3] += s3[1]
+	// x14[0-3] += s3[2]
+	// x15[0-3] += s3[3]
+	add		v12.4s, v12.4s, v28.4s
+	add		v13.4s, v13.4s, v29.4s
+	add		v14.4s, v14.4s, v30.4s
+	add		v15.4s, v15.4s, v31.4s
+
+	// interleave 32-bit words in state n, n+1
+	zip1		v16.4s, v0.4s, v1.4s
+	zip2		v17.4s, v0.4s, v1.4s
+	zip1		v18.4s, v2.4s, v3.4s
+	zip2		v19.4s, v2.4s, v3.4s
+	zip1		v20.4s, v4.4s, v5.4s
+	zip2		v21.4s, v4.4s, v5.4s
+	zip1		v22.4s, v6.4s, v7.4s
+	zip2		v23.4s, v6.4s, v7.4s
+	zip1		v24.4s, v8.4s, v9.4s
+	zip2		v25.4s, v8.4s, v9.4s
+	zip1		v26.4s, v10.4s, v11.4s
+	zip2		v27.4s, v10.4s, v11.4s
+	zip1		v28.4s, v12.4s, v13.4s
+	zip2		v29.4s, v12.4s, v13.4s
+	zip1		v30.4s, v14.4s, v15.4s
+	zip2		v31.4s, v14.4s, v15.4s
+
+	// interleave 64-bit words in state n, n+2
+	zip1		v0.2d, v16.2d, v18.2d
+	zip2		v4.2d, v16.2d, v18.2d
+	zip1		v8.2d, v17.2d, v19.2d
+	zip2		v12.2d, v17.2d, v19.2d
+	ld1		{v16.16b-v19.16b}, [x2], #64
+
+	zip1		v1.2d, v20.2d, v22.2d
+	zip2		v5.2d, v20.2d, v22.2d
+	zip1		v9.2d, v21.2d, v23.2d
+	zip2		v13.2d, v21.2d, v23.2d
+	ld1		{v20.16b-v23.16b}, [x2], #64
+
+	zip1		v2.2d, v24.2d, v26.2d
+	zip2		v6.2d, v24.2d, v26.2d
+	zip1		v10.2d, v25.2d, v27.2d
+	zip2		v14.2d, v25.2d, v27.2d
+	ld1		{v24.16b-v27.16b}, [x2], #64
+
+	zip1		v3.2d, v28.2d, v30.2d
+	zip2		v7.2d, v28.2d, v30.2d
+	zip1		v11.2d, v29.2d, v31.2d
+	zip2		v15.2d, v29.2d, v31.2d
+	ld1		{v28.16b-v31.16b}, [x2]
+
+	// xor with corresponding input, write to output
+	eor		v16.16b, v16.16b, v0.16b
+	eor		v17.16b, v17.16b, v1.16b
+	eor		v18.16b, v18.16b, v2.16b
+	eor		v19.16b, v19.16b, v3.16b
+	eor		v20.16b, v20.16b, v4.16b
+	eor		v21.16b, v21.16b, v5.16b
+	st1		{v16.16b-v19.16b}, [x1], #64
+	eor		v22.16b, v22.16b, v6.16b
+	eor		v23.16b, v23.16b, v7.16b
+	eor		v24.16b, v24.16b, v8.16b
+	eor		v25.16b, v25.16b, v9.16b
+	st1		{v20.16b-v23.16b}, [x1], #64
+	eor		v26.16b, v26.16b, v10.16b
+	eor		v27.16b, v27.16b, v11.16b
+	eor		v28.16b, v28.16b, v12.16b
+	st1		{v24.16b-v27.16b}, [x1], #64
+	eor		v29.16b, v29.16b, v13.16b
+	eor		v30.16b, v30.16b, v14.16b
+	eor		v31.16b, v31.16b, v15.16b
+	st1		{v28.16b-v31.16b}, [x1]
+
+	ret
+ENDPROC(chacha20_4block_xor_neon)
+
+CTRINC:	.word		0, 1, 2, 3
+ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
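Because ld4r de-interleaves the input so that vector n holds state word n of
all four blocks, the zip1/zip2 ladder before the final XOR is a 4x4
transpose per group of four vectors (a 32-bit pass, then a 64-bit pass) back
to one-block-per-64-bytes order. Its scalar equivalent:

	#include <stdint.h>

	static void transpose4x4(uint32_t m[4][4])
	{
		for (int i = 0; i < 4; i++)
			for (int j = i + 1; j < 4; j++) {
				uint32_t t = m[i][j];

				m[i][j] = m[j][i];
				m[j][i] = t;
			}
	}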

+ 126 - 0
arch/arm64/crypto/chacha20-neon-glue.c

@@ -0,0 +1,126 @@
+/*
+ * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha20.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+
+asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
+asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
+
+static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
+			    unsigned int bytes)
+{
+	u8 buf[CHACHA20_BLOCK_SIZE];
+
+	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
+		chacha20_4block_xor_neon(state, dst, src);
+		bytes -= CHACHA20_BLOCK_SIZE * 4;
+		src += CHACHA20_BLOCK_SIZE * 4;
+		dst += CHACHA20_BLOCK_SIZE * 4;
+		state[12] += 4;
+	}
+	while (bytes >= CHACHA20_BLOCK_SIZE) {
+		chacha20_block_xor_neon(state, dst, src);
+		bytes -= CHACHA20_BLOCK_SIZE;
+		src += CHACHA20_BLOCK_SIZE;
+		dst += CHACHA20_BLOCK_SIZE;
+		state[12]++;
+	}
+	if (bytes) {
+		memcpy(buf, src, bytes);
+		chacha20_block_xor_neon(state, buf, buf);
+		memcpy(dst, buf, bytes);
+	}
+}
+
+static int chacha20_neon(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	u32 state[16];
+	int err;
+
+	if (req->cryptlen <= CHACHA20_BLOCK_SIZE)
+		return crypto_chacha20_crypt(req);
+
+	err = skcipher_walk_virt(&walk, req, true);
+
+	crypto_chacha20_init(state, ctx, walk.iv);
+
+	kernel_neon_begin();
+	while (walk.nbytes > 0) {
+		unsigned int nbytes = walk.nbytes;
+
+		if (nbytes < walk.total)
+			nbytes = round_down(nbytes, walk.stride);
+
+		chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
+				nbytes);
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+	}
+	kernel_neon_end();
+
+	return err;
+}
+
+static struct skcipher_alg alg = {
+	.base.cra_name		= "chacha20",
+	.base.cra_driver_name	= "chacha20-neon",
+	.base.cra_priority	= 300,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
+	.base.cra_module	= THIS_MODULE,
+
+	.min_keysize		= CHACHA20_KEY_SIZE,
+	.max_keysize		= CHACHA20_KEY_SIZE,
+	.ivsize			= CHACHA20_IV_SIZE,
+	.chunksize		= CHACHA20_BLOCK_SIZE,
+	.walksize		= 4 * CHACHA20_BLOCK_SIZE,
+	.setkey			= crypto_chacha20_setkey,
+	.encrypt		= chacha20_neon,
+	.decrypt		= chacha20_neon,
+};
+
+static int __init chacha20_simd_mod_init(void)
+{
+	if (!(elf_hwcap & HWCAP_ASIMD))
+		return -ENODEV;
+
+	return crypto_register_skcipher(&alg);
+}
+
+static void __exit chacha20_simd_mod_fini(void)
+{
+	crypto_unregister_skcipher(&alg);
+}
+
+module_init(chacha20_simd_mod_init);
+module_exit(chacha20_simd_mod_fini);
+
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("chacha20");

+ 0 - 290
arch/arm64/crypto/crc32-arm64.c

@@ -1,290 +0,0 @@
-/*
- * crc32-arm64.c - CRC32 and CRC32C using optional ARMv8 instructions
- *
- * Module based on crypto/crc32c_generic.c
- *
- * CRC32 loop taken from Ed Nevill's Hadoop CRC patch
- * http://mail-archives.apache.org/mod_mbox/hadoop-common-dev/201406.mbox/%3C1403687030.3355.19.camel%40localhost.localdomain%3E
- *
- * Using inline assembly instead of intrinsics in order to be backwards
- * compatible with older compilers.
- *
- * Copyright (C) 2014 Linaro Ltd <yazen.ghannam@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/unaligned/access_ok.h>
-#include <linux/cpufeature.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <crypto/internal/hash.h>
-
-MODULE_AUTHOR("Yazen Ghannam <yazen.ghannam@linaro.org>");
-MODULE_DESCRIPTION("CRC32 and CRC32C using optional ARMv8 instructions");
-MODULE_LICENSE("GPL v2");
-
-#define CRC32X(crc, value) __asm__("crc32x %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32W(crc, value) __asm__("crc32w %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32H(crc, value) __asm__("crc32h %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32B(crc, value) __asm__("crc32b %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
-
-static u32 crc32_arm64_le_hw(u32 crc, const u8 *p, unsigned int len)
-{
-	s64 length = len;
-
-	while ((length -= sizeof(u64)) >= 0) {
-		CRC32X(crc, get_unaligned_le64(p));
-		p += sizeof(u64);
-	}
-
-	/* The following is more efficient than the straight loop */
-	if (length & sizeof(u32)) {
-		CRC32W(crc, get_unaligned_le32(p));
-		p += sizeof(u32);
-	}
-	if (length & sizeof(u16)) {
-		CRC32H(crc, get_unaligned_le16(p));
-		p += sizeof(u16);
-	}
-	if (length & sizeof(u8))
-		CRC32B(crc, *p);
-
-	return crc;
-}
-
-static u32 crc32c_arm64_le_hw(u32 crc, const u8 *p, unsigned int len)
-{
-	s64 length = len;
-
-	while ((length -= sizeof(u64)) >= 0) {
-		CRC32CX(crc, get_unaligned_le64(p));
-		p += sizeof(u64);
-	}
-
-	/* The following is more efficient than the straight loop */
-	if (length & sizeof(u32)) {
-		CRC32CW(crc, get_unaligned_le32(p));
-		p += sizeof(u32);
-	}
-	if (length & sizeof(u16)) {
-		CRC32CH(crc, get_unaligned_le16(p));
-		p += sizeof(u16);
-	}
-	if (length & sizeof(u8))
-		CRC32CB(crc, *p);
-
-	return crc;
-}
-
-#define CHKSUM_BLOCK_SIZE	1
-#define CHKSUM_DIGEST_SIZE	4
-
-struct chksum_ctx {
-	u32 key;
-};
-
-struct chksum_desc_ctx {
-	u32 crc;
-};
-
-static int chksum_init(struct shash_desc *desc)
-{
-	struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
-	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
-	ctx->crc = mctx->key;
-
-	return 0;
-}
-
-/*
- * Setting the seed allows arbitrary accumulators and flexible XOR policy
- * If your algorithm starts with ~0, then XOR with ~0 before you set
- * the seed.
- */
-static int chksum_setkey(struct crypto_shash *tfm, const u8 *key,
-			 unsigned int keylen)
-{
-	struct chksum_ctx *mctx = crypto_shash_ctx(tfm);
-
-	if (keylen != sizeof(mctx->key)) {
-		crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
-		return -EINVAL;
-	}
-	mctx->key = get_unaligned_le32(key);
-	return 0;
-}
-
-static int chksum_update(struct shash_desc *desc, const u8 *data,
-			 unsigned int length)
-{
-	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
-	ctx->crc = crc32_arm64_le_hw(ctx->crc, data, length);
-	return 0;
-}
-
-static int chksumc_update(struct shash_desc *desc, const u8 *data,
-			 unsigned int length)
-{
-	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
-	ctx->crc = crc32c_arm64_le_hw(ctx->crc, data, length);
-	return 0;
-}
-
-static int chksum_final(struct shash_desc *desc, u8 *out)
-{
-	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
-	put_unaligned_le32(ctx->crc, out);
-	return 0;
-}
-
-static int chksumc_final(struct shash_desc *desc, u8 *out)
-{
-	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
-	put_unaligned_le32(~ctx->crc, out);
-	return 0;
-}
-
-static int __chksum_finup(u32 crc, const u8 *data, unsigned int len, u8 *out)
-{
-	put_unaligned_le32(crc32_arm64_le_hw(crc, data, len), out);
-	return 0;
-}
-
-static int __chksumc_finup(u32 crc, const u8 *data, unsigned int len, u8 *out)
-{
-	put_unaligned_le32(~crc32c_arm64_le_hw(crc, data, len), out);
-	return 0;
-}
-
-static int chksum_finup(struct shash_desc *desc, const u8 *data,
-			unsigned int len, u8 *out)
-{
-	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
-	return __chksum_finup(ctx->crc, data, len, out);
-}
-
-static int chksumc_finup(struct shash_desc *desc, const u8 *data,
-			unsigned int len, u8 *out)
-{
-	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
-
-	return __chksumc_finup(ctx->crc, data, len, out);
-}
-
-static int chksum_digest(struct shash_desc *desc, const u8 *data,
-			 unsigned int length, u8 *out)
-{
-	struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
-
-	return __chksum_finup(mctx->key, data, length, out);
-}
-
-static int chksumc_digest(struct shash_desc *desc, const u8 *data,
-			 unsigned int length, u8 *out)
-{
-	struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
-
-	return __chksumc_finup(mctx->key, data, length, out);
-}
-
-static int crc32_cra_init(struct crypto_tfm *tfm)
-{
-	struct chksum_ctx *mctx = crypto_tfm_ctx(tfm);
-
-	mctx->key = 0;
-	return 0;
-}
-
-static int crc32c_cra_init(struct crypto_tfm *tfm)
-{
-	struct chksum_ctx *mctx = crypto_tfm_ctx(tfm);
-
-	mctx->key = ~0;
-	return 0;
-}
-
-static struct shash_alg crc32_alg = {
-	.digestsize		=	CHKSUM_DIGEST_SIZE,
-	.setkey			=	chksum_setkey,
-	.init			=	chksum_init,
-	.update			=	chksum_update,
-	.final			=	chksum_final,
-	.finup			=	chksum_finup,
-	.digest			=	chksum_digest,
-	.descsize		=	sizeof(struct chksum_desc_ctx),
-	.base			=	{
-		.cra_name		=	"crc32",
-		.cra_driver_name	=	"crc32-arm64-hw",
-		.cra_priority		=	300,
-		.cra_blocksize		=	CHKSUM_BLOCK_SIZE,
-		.cra_alignmask		=	0,
-		.cra_ctxsize		=	sizeof(struct chksum_ctx),
-		.cra_module		=	THIS_MODULE,
-		.cra_init		=	crc32_cra_init,
-	}
-};
-
-static struct shash_alg crc32c_alg = {
-	.digestsize		=	CHKSUM_DIGEST_SIZE,
-	.setkey			=	chksum_setkey,
-	.init			=	chksum_init,
-	.update			=	chksumc_update,
-	.final			=	chksumc_final,
-	.finup			=	chksumc_finup,
-	.digest			=	chksumc_digest,
-	.descsize		=	sizeof(struct chksum_desc_ctx),
-	.base			=	{
-		.cra_name		=	"crc32c",
-		.cra_driver_name	=	"crc32c-arm64-hw",
-		.cra_priority		=	300,
-		.cra_blocksize		=	CHKSUM_BLOCK_SIZE,
-		.cra_alignmask		=	0,
-		.cra_ctxsize		=	sizeof(struct chksum_ctx),
-		.cra_module		=	THIS_MODULE,
-		.cra_init		=	crc32c_cra_init,
-	}
-};
-
-static int __init crc32_mod_init(void)
-{
-	int err;
-
-	err = crypto_register_shash(&crc32_alg);
-
-	if (err)
-		return err;
-
-	err = crypto_register_shash(&crc32c_alg);
-
-	if (err) {
-		crypto_unregister_shash(&crc32_alg);
-		return err;
-	}
-
-	return 0;
-}
-
-static void __exit crc32_mod_exit(void)
-{
-	crypto_unregister_shash(&crc32_alg);
-	crypto_unregister_shash(&crc32c_alg);
-}
-
-module_cpu_feature_match(CRC32, crc32_mod_init);
-module_exit(crc32_mod_exit);

+ 39 - 10
arch/arm64/crypto/crc32-ce-glue.c

@@ -72,6 +72,24 @@ static int crc32_pmull_init(struct shash_desc *desc)
 	return 0;
 }
 
+static int crc32_update(struct shash_desc *desc, const u8 *data,
+			unsigned int length)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	*crc = crc32_armv8_le(*crc, data, length);
+	return 0;
+}
+
+static int crc32c_update(struct shash_desc *desc, const u8 *data,
+			 unsigned int length)
+{
+	u32 *crc = shash_desc_ctx(desc);
+
+	*crc = crc32c_armv8_le(*crc, data, length);
+	return 0;
+}
+
 static int crc32_pmull_update(struct shash_desc *desc, const u8 *data,
 			 unsigned int length)
 {
@@ -156,7 +174,7 @@ static int crc32c_pmull_final(struct shash_desc *desc, u8 *out)
 static struct shash_alg crc32_pmull_algs[] = { {
 	.setkey			= crc32_pmull_setkey,
 	.init			= crc32_pmull_init,
-	.update			= crc32_pmull_update,
+	.update			= crc32_update,
 	.final			= crc32_pmull_final,
 	.descsize		= sizeof(u32),
 	.digestsize		= sizeof(u32),
@@ -171,7 +189,7 @@ static struct shash_alg crc32_pmull_algs[] = { {
 }, {
 	.setkey			= crc32_pmull_setkey,
 	.init			= crc32_pmull_init,
-	.update			= crc32c_pmull_update,
+	.update			= crc32c_update,
 	.final			= crc32c_pmull_final,
 	.descsize		= sizeof(u32),
 	.digestsize		= sizeof(u32),
@@ -187,14 +205,20 @@ static struct shash_alg crc32_pmull_algs[] = { {
 
 static int __init crc32_pmull_mod_init(void)
 {
-	if (elf_hwcap & HWCAP_CRC32) {
-		fallback_crc32 = crc32_armv8_le;
-		fallback_crc32c = crc32c_armv8_le;
-	} else {
-		fallback_crc32 = crc32_le;
-		fallback_crc32c = __crc32c_le;
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) {
+		crc32_pmull_algs[0].update = crc32_pmull_update;
+		crc32_pmull_algs[1].update = crc32c_pmull_update;
+
+		if (elf_hwcap & HWCAP_CRC32) {
+			fallback_crc32 = crc32_armv8_le;
+			fallback_crc32c = crc32c_armv8_le;
+		} else {
+			fallback_crc32 = crc32_le;
+			fallback_crc32c = __crc32c_le;
+		}
+	} else if (!(elf_hwcap & HWCAP_CRC32)) {
+		return -ENODEV;
 	}
-
 	return crypto_register_shashes(crc32_pmull_algs,
 				       ARRAY_SIZE(crc32_pmull_algs));
 }
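Net effect of the reworked init: the PMULL-based update hooks are patched in
only when kernel-mode NEON and the PMULL feature are both available;
otherwise the scalar CRC32 instructions must be present or the module
refuses to load. As a decision sketch (the have_* predicates are
hypothetical stand-ins for the feature tests above):

	if (have_neon && have_pmull) {
		crc32_pmull_algs[0].update = crc32_pmull_update;
		crc32_pmull_algs[1].update = crc32c_pmull_update;
		/* fallback_crc32 hooks cover short inputs */
	} else if (!have_crc32_insns) {
		return -ENODEV;		/* no usable instructions at all */
	}
	/* default .update hooks use the scalar CRC32 instructions */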
@@ -205,7 +229,12 @@ static void __exit crc32_pmull_mod_exit(void)
 				  ARRAY_SIZE(crc32_pmull_algs));
 }
 
-module_cpu_feature_match(PMULL, crc32_pmull_mod_init);
+static const struct cpu_feature crc32_cpu_feature[] = {
+	{ cpu_feature(CRC32) }, { cpu_feature(PMULL) }, { }
+};
+MODULE_DEVICE_TABLE(cpu, crc32_cpu_feature);
+
+module_init(crc32_pmull_mod_init);
 module_exit(crc32_pmull_mod_exit);
 
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");

+ 29 - 8
arch/x86/crypto/aesni-intel_asm.S

@@ -46,28 +46,49 @@
 
 #ifdef __x86_64__
 
-.data
+# constants in mergeable sections, linker can reorder and merge
+.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
 .align 16
 .Lgf128mul_x_ble_mask:
 	.octa 0x00000000000000010000000000000087
+.section	.rodata.cst16.POLY, "aM", @progbits, 16
+.align 16
 POLY:   .octa 0xC2000000000000000000000000000001
+.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
+.align 16
 TWOONE: .octa 0x00000001000000000000000000000001
 
-# order of these constants should not change.
-# more specifically, ALL_F should follow SHIFT_MASK,
-# and ZERO should follow ALL_F
-
+.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
+.align 16
 SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
+.section	.rodata.cst16.MASK1, "aM", @progbits, 16
+.align 16
 MASK1:      .octa 0x0000000000000000ffffffffffffffff
+.section	.rodata.cst16.MASK2, "aM", @progbits, 16
+.align 16
 MASK2:      .octa 0xffffffffffffffff0000000000000000
-SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
-ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
-ZERO:       .octa 0x00000000000000000000000000000000
+.section	.rodata.cst16.ONE, "aM", @progbits, 16
+.align 16
 ONE:        .octa 0x00000000000000000000000000000001
+.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
+.align 16
 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
+.section	.rodata.cst16.dec, "aM", @progbits, 16
+.align 16
 dec:        .octa 0x1
+.section	.rodata.cst16.enc, "aM", @progbits, 16
+.align 16
 enc:        .octa 0x2
 
+# order of these constants should not change.
+# more specifically, ALL_F should follow SHIFT_MASK,
+# and zero should follow ALL_F
+.section	.rodata, "a", @progbits
+.align 16
+SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
+ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
+            .octa 0x00000000000000000000000000000000
+
 
 .text
 

+ 24 - 8
arch/x86/crypto/aesni-intel_avx-x86_64.S

@@ -122,23 +122,39 @@
 #include <linux/linkage.h>
 #include <asm/inst.h>
 
-.data
+# constants in mergeable sections, linker can reorder and merge
+.section	.rodata.cst16.POLY, "aM", @progbits, 16
 .align 16
-
 POLY:            .octa     0xC2000000000000000000000000000001
+
+.section	.rodata.cst16.POLY2, "aM", @progbits, 16
+.align 16
 POLY2:           .octa     0xC20000000000000000000001C2000000
-TWOONE:          .octa     0x00000001000000000000000000000001
 
-# order of these constants should not change.
-# more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
+.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
+.align 16
+TWOONE:          .octa     0x00000001000000000000000000000001
 
+.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
+.align 16
 SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F
-SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
-ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
-ZERO:            .octa     0x00000000000000000000000000000000
+
+.section	.rodata.cst16.ONE, "aM", @progbits, 16
+.align 16
 ONE:             .octa     0x00000000000000000000000000000001
+
+.section	.rodata.cst16.ONEf, "aM", @progbits, 16
+.align 16
 ONEf:            .octa     0x01000000000000000000000000000000
 
+# order of these constants should not change.
+# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
+.section	.rodata, "a", @progbits
+.align 16
+SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
+ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
+                 .octa     0x00000000000000000000000000000000
+
 .text
 
 

+ 8 - 4
arch/x86/crypto/aesni-intel_glue.c

@@ -740,9 +740,11 @@ static int helper_rfc4106_encrypt(struct aead_request *req)
 	*((__be32 *)(iv+12)) = counter;
 
 	if (sg_is_last(req->src) &&
-	    req->src->offset + req->src->length <= PAGE_SIZE &&
+	    (!PageHighMem(sg_page(req->src)) ||
+	    req->src->offset + req->src->length <= PAGE_SIZE) &&
 	    sg_is_last(req->dst) &&
-	    req->dst->offset + req->dst->length <= PAGE_SIZE) {
+	    (!PageHighMem(sg_page(req->dst)) ||
+	    req->dst->offset + req->dst->length <= PAGE_SIZE)) {
 		one_entry_in_sg = 1;
 		scatterwalk_start(&src_sg_walk, req->src);
 		assoc = scatterwalk_map(&src_sg_walk);
@@ -822,9 +824,11 @@ static int helper_rfc4106_decrypt(struct aead_request *req)
 	*((__be32 *)(iv+12)) = counter;
 
 	if (sg_is_last(req->src) &&
-	    req->src->offset + req->src->length <= PAGE_SIZE &&
+	    (!PageHighMem(sg_page(req->src)) ||
+	    req->src->offset + req->src->length <= PAGE_SIZE) &&
 	    sg_is_last(req->dst) &&
-	    req->dst->offset + req->dst->length <= PAGE_SIZE) {
+	    (!PageHighMem(sg_page(req->dst)) ||
+	    req->dst->offset + req->dst->length <= PAGE_SIZE)) {
 		one_entry_in_sg = 1;
 		scatterwalk_start(&src_sg_walk, req->src);
 		assoc = scatterwalk_map(&src_sg_walk);
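The added PageHighMem() tests widen the one-segment fast path correctly: a
lowmem page lives in the kernel's linear mapping, so offset + length may
cross a page boundary and still be virtually contiguous, while a highmem
page is only reachable through a single-page mapping. The predicate as a
standalone model:

	#include <stdbool.h>

	static bool one_entry_mappable(bool highmem, unsigned int offset,
				       unsigned int length, unsigned int page_size)
	{
		return !highmem || offset + length <= page_size;
	}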

+ 4 - 1
arch/x86/crypto/camellia-aesni-avx-asm_64.S

@@ -571,7 +571,9 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	vmovdqu y6, 14 * 16(rio); \
 	vmovdqu y7, 15 * 16(rio);
 
-.data
+
+/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
+.section	.rodata.cst16, "aM", @progbits, 16
 .align 16
 
 #define SHUFB_BYTES(idx) \
@@ -711,6 +713,7 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
 
 /* 4-bit mask */
+.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
 .align 4
 .L0f0f0f0f:
 	.long 0x0f0f0f0f

+ 9 - 3
arch/x86/crypto/camellia-aesni-avx2-asm_64.S

@@ -610,20 +610,25 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	vmovdqu y6, 14 * 32(rio); \
 	vmovdqu y7, 15 * 32(rio);
 
-.data
-.align 32
 
+.section	.rodata.cst32.shufb_16x16b, "aM", @progbits, 32
+.align 32
 #define SHUFB_BYTES(idx) \
 	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
-
 .Lshufb_16x16b:
 	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
 	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
 
+.section	.rodata.cst32.pack_bswap, "aM", @progbits, 32
+.align 32
 .Lpack_bswap:
 	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
 	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
 
+/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
+.section	.rodata.cst16, "aM", @progbits, 16
+.align 16
+
 /* For CTR-mode IV byteswap */
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
@@ -750,6 +755,7 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
 	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
 
+.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
 .align 4
 /* 4-bit mask */
 .L0f0f0f0f:

+ 12 - 2
arch/x86/crypto/cast5-avx-x86_64-asm_64.S

@@ -195,19 +195,29 @@
 	vpshufb rmask,	x0, x0;           \
 	vpshufb rmask,	x1, x1;
 
-.data
-
+.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
 .align 16
 .Lbswap_mask:
 	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
+.align 16
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.section	.rodata.cst16.bswap_iv_mask, "aM", @progbits, 16
+.align 16
 .Lbswap_iv_mask:
 	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
+
+.section	.rodata.cst4.16_mask, "aM", @progbits, 4
+.align 4
 .L16_mask:
 	.byte 16, 16, 16, 16
+.section	.rodata.cst4.32_mask, "aM", @progbits, 4
+.align 4
 .L32_mask:
 	.byte 32, 0, 0, 0
+.section	.rodata.cst4.first_mask, "aM", @progbits, 4
+.align 4
 .Lfirst_mask:
 	.byte 0x1f, 0, 0, 0
 

+ 10 - 2
arch/x86/crypto/cast6-avx-x86_64-asm_64.S

@@ -225,8 +225,7 @@
 	vpshufb rmask,		x2, x2;       \
 	vpshufb rmask,		x3, x3;
 
-.data
-
+.section	.rodata.cst16, "aM", @progbits, 16
 .align 16
 .Lxts_gf128mul_and_shl1_mask:
 	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
@@ -244,10 +243,19 @@
 	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
 .Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.section	.rodata.cst4.L16_mask, "aM", @progbits, 4
+.align 4
 .L16_mask:
 	.byte 16, 16, 16, 16
+
+.section	.rodata.cst4.L32_mask, "aM", @progbits, 4
+.align 4
 .L32_mask:
 	.byte 32, 0, 0, 0
+
+.section	.rodata.cst4.first_mask, "aM", @progbits, 4
+.align 4
 .Lfirst_mask:
 	.byte 0x1f, 0, 0, 0
 

+ 7 - 2
arch/x86/crypto/chacha20-avx2-x86_64.S

@@ -11,13 +11,18 @@
 
 #include <linux/linkage.h>
 
-.data
+.section	.rodata.cst32.ROT8, "aM", @progbits, 32
 .align 32
-
 ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
 	.octa 0x0e0d0c0f0a09080b0605040702010003
+
+.section	.rodata.cst32.ROT16, "aM", @progbits, 32
+.align 32
 ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
 	.octa 0x0d0c0f0e09080b0a0504070601000302
+
+.section	.rodata.cst32.CTRINC, "aM", @progbits, 32
+.align 32
 CTRINC:	.octa 0x00000003000000020000000100000000
 	.octa 0x00000007000000060000000500000004
 

+ 5 - 2
arch/x86/crypto/chacha20-ssse3-x86_64.S

@@ -11,11 +11,14 @@
 
 #include <linux/linkage.h>
 
-.data
+.section	.rodata.cst16.ROT8, "aM", @progbits, 16
 .align 16
-
 ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
+.section	.rodata.cst16.ROT16, "aM", @progbits, 16
+.align 16
 ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
+.section	.rodata.cst16.CTRINC, "aM", @progbits, 16
+.align 16
 CTRINC:	.octa 0x00000003000000020000000100000000
 
 .text

+ 33 - 37
arch/x86/crypto/chacha20_glue.c

@@ -11,7 +11,7 @@
 
 #include <crypto/algapi.h>
 #include <crypto/chacha20.h>
-#include <linux/crypto.h>
+#include <crypto/internal/skcipher.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <asm/fpu/api.h>
@@ -63,36 +63,37 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 	}
 }
 
-static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst,
-			 struct scatterlist *src, unsigned int nbytes)
+static int chacha20_simd(struct skcipher_request *req)
 {
-	u32 *state, state_buf[16 + (CHACHA20_STATE_ALIGN / sizeof(u32)) - 1];
-	struct blkcipher_walk walk;
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
+	u32 *state, state_buf[16 + 2] __aligned(8);
+	struct skcipher_walk walk;
 	int err;
 
-	if (nbytes <= CHACHA20_BLOCK_SIZE || !may_use_simd())
-		return crypto_chacha20_crypt(desc, dst, src, nbytes);
+	BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
+	state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
 
-	state = (u32 *)roundup((uintptr_t)state_buf, CHACHA20_STATE_ALIGN);
+	if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd())
+		return crypto_chacha20_crypt(req);
 
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE);
+	err = skcipher_walk_virt(&walk, req, true);
 
-	crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv);
+	crypto_chacha20_init(state, ctx, walk.iv);
 
 	kernel_fpu_begin();
 
 	while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
 		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
 				rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
-		err = blkcipher_walk_done(desc, &walk,
-					  walk.nbytes % CHACHA20_BLOCK_SIZE);
+		err = skcipher_walk_done(&walk,
+					 walk.nbytes % CHACHA20_BLOCK_SIZE);
 	}
 
 	if (walk.nbytes) {
 		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
 				walk.nbytes);
-		err = blkcipher_walk_done(desc, &walk, 0);
+		err = skcipher_walk_done(&walk, 0);
 	}
 
 	kernel_fpu_end();
@@ -100,27 +101,22 @@ static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return err;
 }
 
-static struct crypto_alg alg = {
-	.cra_name		= "chacha20",
-	.cra_driver_name	= "chacha20-simd",
-	.cra_priority		= 300,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= 1,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_ctxsize		= sizeof(struct chacha20_ctx),
-	.cra_alignmask		= sizeof(u32) - 1,
-	.cra_module		= THIS_MODULE,
-	.cra_u			= {
-		.blkcipher = {
-			.min_keysize	= CHACHA20_KEY_SIZE,
-			.max_keysize	= CHACHA20_KEY_SIZE,
-			.ivsize		= CHACHA20_IV_SIZE,
-			.geniv		= "seqiv",
-			.setkey		= crypto_chacha20_setkey,
-			.encrypt	= chacha20_simd,
-			.decrypt	= chacha20_simd,
-		},
-	},
+static struct skcipher_alg alg = {
+	.base.cra_name		= "chacha20",
+	.base.cra_driver_name	= "chacha20-simd",
+	.base.cra_priority	= 300,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
+	.base.cra_alignmask	= sizeof(u32) - 1,
+	.base.cra_module	= THIS_MODULE,
+
+	.min_keysize		= CHACHA20_KEY_SIZE,
+	.max_keysize		= CHACHA20_KEY_SIZE,
+	.ivsize			= CHACHA20_IV_SIZE,
+	.chunksize		= CHACHA20_BLOCK_SIZE,
+	.setkey			= crypto_chacha20_setkey,
+	.encrypt		= chacha20_simd,
+	.decrypt		= chacha20_simd,
 };
 
 static int __init chacha20_simd_mod_init(void)
@@ -133,12 +129,12 @@ static int __init chacha20_simd_mod_init(void)
 			    boot_cpu_has(X86_FEATURE_AVX2) &&
 			    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
 #endif
-	return crypto_register_alg(&alg);
+	return crypto_register_skcipher(&alg);
 }
 
 static void __exit chacha20_simd_mod_fini(void)
 {
-	crypto_unregister_alg(&alg);
+	crypto_unregister_skcipher(&alg);
 }
 
 module_init(chacha20_simd_mod_init);

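A small but easy-to-miss point in the conversion: CHACHA20_STATE_ALIGN is 16, yet the stack only guarantees 8-byte alignment for the array, so it is over-allocated by two u32 words and PTR_ALIGN() rounds the pointer up. An 8-byte-aligned base sits at most 8 bytes (two words) below the next 16-byte boundary, so the slack always suffices. The idiom in isolation (PTR_ALIGN reproduced here in simplified form, demo function hypothetical):

    #include <stdint.h>

    #define PTR_ALIGN(p, a) \
        ((typeof(p))(((uintptr_t)(p) + ((a) - 1)) & ~(uintptr_t)((a) - 1)))

    void state_align_demo(void)
    {
        /* 16 words of ChaCha20 state plus 2 words of realignment slack */
        uint32_t state_buf[16 + 2] __attribute__((aligned(8)));
        uint32_t *state = PTR_ALIGN(state_buf, 16);

        (void)state;    /* now guaranteed 16-byte aligned */
    }
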
+ 1 - 1
arch/x86/crypto/crc32c-pcl-intel-asm_64.S

@@ -312,7 +312,7 @@ do_return:
         ret
 ENDPROC(crc_pcl)
 
-.section	.rodata, "a", %progbits
+.section	.rodata, "a", @progbits
         ################################################################
         ## jump table        Table is 129 entries x 2 bytes each
         ################################################################

+ 11 - 3
arch/x86/crypto/crct10dif-pcl-asm_64.S

@@ -554,12 +554,11 @@ _only_less_than_2:
 
 ENDPROC(crc_t10dif_pcl)
 
-.data
-
+.section	.rodata, "a", @progbits
+.align 16
 # precomputed constants
 # these constants are precomputed from the poly:
 # 0x8bb70000 (0x8bb7 scaled to 32 bits)
-.align 16
 # Q = 0x18BB70000
 # rk1 = 2^(32*3) mod Q << 32
 # rk2 = 2^(32*5) mod Q << 32
@@ -613,14 +612,23 @@ rk20:
 
 
 
+.section	.rodata.cst16.mask1, "aM", @progbits, 16
+.align 16
 mask1:
 .octa 0x80808080808080808080808080808080
+
+.section	.rodata.cst16.mask2, "aM", @progbits, 16
+.align 16
 mask2:
 .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
 
+.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
+.align 16
 SHUF_MASK:
 .octa 0x000102030405060708090A0B0C0D0E0F
 
+.section	.rodata.cst32.pshufb_shf_table, "aM", @progbits, 32
+.align 32
 pshufb_shf_table:
 # use these values for shift constants for the pshufb instruction
 # different alignments result in values as shown:

+ 1 - 1
arch/x86/crypto/des3_ede-asm_64.S

@@ -537,7 +537,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
 	ret;
 ENDPROC(des3_ede_x86_64_crypt_blk_3way)
 
-.data
+.section	.rodata, "a", @progbits
 .align 16
 .L_s1:
 	.quad 0x0010100001010400, 0x0000000000000000

+ 1 - 2
arch/x86/crypto/ghash-clmulni-intel_asm.S

@@ -20,8 +20,7 @@
 #include <asm/inst.h>
 #include <asm/frame.h>
 
-.data
-
+.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
 .align 16
 .Lbswap_mask:
 	.octa 0x000102030405060708090a0b0c0d0e0f

+ 4 - 2
arch/x86/crypto/poly1305-avx2-x86_64.S

@@ -11,11 +11,13 @@
 
 #include <linux/linkage.h>
 
-.data
+.section	.rodata.cst32.ANMASK, "aM", @progbits, 32
 .align 32
-
 ANMASK:	.octa 0x0000000003ffffff0000000003ffffff
 	.octa 0x0000000003ffffff0000000003ffffff
+
+.section	.rodata.cst32.ORMASK, "aM", @progbits, 32
+.align 32
 ORMASK:	.octa 0x00000000010000000000000001000000
 	.octa 0x00000000010000000000000001000000
 

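ANMASK (0x3ffffff) and ORMASK (0x1000000) encode Poly1305's radix-2^26 representation: each 128-bit message block is split into five 26-bit limbs, and the 1 bit the algorithm appends at 2^128 lands at bit 24 of the fifth limb. A portable sketch of the split (illustrative helper; assumes a little-endian host):

    #include <stdint.h>
    #include <string.h>

    static void poly1305_limbs(const uint8_t block[16], uint32_t h[5])
    {
        uint64_t lo, hi;

        memcpy(&lo, block, 8);
        memcpy(&hi, block + 8, 8);

        h[0] = lo & 0x3ffffff;                        /* ANMASK */
        h[1] = (lo >> 26) & 0x3ffffff;
        h[2] = ((lo >> 52) | (hi << 12)) & 0x3ffffff;
        h[3] = (hi >> 14) & 0x3ffffff;
        h[4] = (hi >> 40) | 0x1000000;                /* ORMASK: the 2^128 bit */
    }
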
+ 4 - 2
arch/x86/crypto/poly1305-sse2-x86_64.S

@@ -11,10 +11,12 @@
 
 #include <linux/linkage.h>
 
-.data
+.section	.rodata.cst16.ANMASK, "aM", @progbits, 16
 .align 16
-
 ANMASK:	.octa 0x0000000003ffffff0000000003ffffff
+
+.section	.rodata.cst16.ORMASK, "aM", @progbits, 16
+.align 16
 ORMASK:	.octa 0x00000000010000000000000001000000
 
 .text

+ 3 - 2
arch/x86/crypto/serpent-avx-x86_64-asm_64.S

@@ -29,11 +29,12 @@
 
 .file "serpent-avx-x86_64-asm_64.S"
 
-.data
+.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
 .align 16
-
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.section	.rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
+.align 16
 .Lxts_gf128mul_and_shl1_mask:
 	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
 

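The .Lxts_gf128mul_and_shl1_mask constant drives the per-block XTS tweak update: multiply the 128-bit tweak by x in GF(2^128), folding the carry from bit 127 back in with the polynomial constant 0x87. In portable C the operation looks like this (a sketch, equivalent in spirit to the kernel's gf128mul_x_ble()):

    #include <stdint.h>

    static void xts_mul_x(uint64_t t[2])    /* t[0] low, t[1] high 64 bits */
    {
        uint64_t carry = t[1] >> 63;        /* bit 127, about to shift out */

        t[1] = (t[1] << 1) | (t[0] >> 63);
        t[0] = (t[0] << 1) ^ (carry * 0x87); /* x^128 = x^7 + x^2 + x + 1 */
    }
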
+ 7 - 2
arch/x86/crypto/serpent-avx2-asm_64.S

@@ -20,13 +20,18 @@
 
 .file "serpent-avx2-asm_64.S"
 
-.data
+.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
 .align 16
-
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.section	.rodata.cst16.xts_gf128mul_and_shl1_mask_0, "aM", @progbits, 16
+.align 16
 .Lxts_gf128mul_and_shl1_mask_0:
 	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+
+.section	.rodata.cst16.xts_gf128mul_and_shl1_mask_1, "aM", @progbits, 16
+.align 16
 .Lxts_gf128mul_and_shl1_mask_1:
 	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
 

+ 4 - 2
arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S

@@ -281,11 +281,13 @@ ENTRY(sha1_mb_mgr_get_comp_job_avx2)
 	ret
 ENDPROC(sha1_mb_mgr_get_comp_job_avx2)
 
-.data
-
+.section	.rodata.cst16.clear_low_nibble, "aM", @progbits, 16
 .align 16
 clear_low_nibble:
 .octa	0x000000000000000000000000FFFFFFF0
+
+.section	.rodata.cst8, "aM", @progbits, 8
+.align 8
 one:
 .quad  1
 two:

+ 1 - 2
arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S

@@ -203,8 +203,7 @@ return_null:
 
 ENDPROC(sha1_mb_mgr_submit_avx2)
 
-.data
-
+.section	.rodata.cst16.clear_low_nibble, "aM", @progbits, 16
 .align 16
 clear_low_nibble:
 	.octa	0x000000000000000000000000FFFFFFF0

+ 13 - 2
arch/x86/crypto/sha1-mb/sha1_x8_avx2.S

@@ -461,21 +461,32 @@ lloop:
 ENDPROC(sha1_x8_avx2)
 
 
-.data
-
+.section	.rodata.cst32.K00_19, "aM", @progbits, 32
 .align 32
 K00_19:
 .octa 0x5A8279995A8279995A8279995A827999
 .octa 0x5A8279995A8279995A8279995A827999
+
+.section	.rodata.cst32.K20_39, "aM", @progbits, 32
+.align 32
 K20_39:
 .octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
 .octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+
+.section	.rodata.cst32.K40_59, "aM", @progbits, 32
+.align 32
 K40_59:
 .octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
 .octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+
+.section	.rodata.cst32.K60_79, "aM", @progbits, 32
+.align 32
 K60_79:
 .octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
 .octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+
+.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
 PSHUFFLE_BYTE_FLIP_MASK:
 .octa 0x0c0d0e0f08090a0b0405060700010203
 .octa 0x0c0d0e0f08090a0b0405060700010203

+ 5 - 3
arch/x86/crypto/sha1_ni_asm.S

@@ -293,10 +293,12 @@ ENTRY(sha1_ni_transform)
 	ret
 ENDPROC(sha1_ni_transform)
 
-.data
-
-.align 64
+.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
 PSHUFFLE_BYTE_FLIP_MASK:
 	.octa 0x000102030405060708090a0b0c0d0e0f
+
+.section	.rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16
+.align 16
 UPPER_WORD_MASK:
 	.octa 0xFFFFFFFF000000000000000000000000

+ 8 - 1
arch/x86/crypto/sha256-avx-asm.S

@@ -463,7 +463,7 @@ done_hash:
 	ret
 ENDPROC(sha256_transform_avx)
 
-.data
+.section	.rodata.cst256.K256, "aM", @progbits, 256
 .align 64
 K256:
 	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
@@ -483,14 +483,21 @@ K256:
 	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 
+.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
 PSHUFFLE_BYTE_FLIP_MASK:
 	.octa 0x0c0d0e0f08090a0b0405060700010203
 
+.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
+.align 16
 # shuffle xBxA -> 00BA
 _SHUF_00BA:
 	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
 
+.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
+.align 16
 # shuffle xDxC -> DC00
 _SHUF_DC00:
 	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
+
 #endif

+ 8 - 1
arch/x86/crypto/sha256-avx2-asm.S

@@ -723,7 +723,7 @@ done_hash:
 	ret
 ENDPROC(sha256_transform_rorx)
 
-.data
+.section	.rodata.cst512.K256, "aM", @progbits, 512
 .align 64
 K256:
 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
@@ -759,14 +759,21 @@ K256:
 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 
+.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
 PSHUFFLE_BYTE_FLIP_MASK:
 	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
 
 # shuffle xBxA -> 00BA
+.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
+.align 32
 _SHUF_00BA:
 	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
 
 # shuffle xDxC -> DC00
+.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
+.align 32
 _SHUF_DC00:
 	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
+
 #endif

+ 4 - 2
arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S

@@ -284,11 +284,13 @@ ENTRY(sha256_mb_mgr_get_comp_job_avx2)
 	ret
 ENDPROC(sha256_mb_mgr_get_comp_job_avx2)
 
-.data
-
+.section	.rodata.cst16.clear_low_nibble, "aM", @progbits, 16
 .align 16
 clear_low_nibble:
 .octa	0x000000000000000000000000FFFFFFF0
+
+.section	.rodata.cst8, "aM", @progbits, 8
+.align 8
 one:
 .quad	1
 two:

+ 1 - 2
arch/x86/crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S

@@ -208,8 +208,7 @@ return_null:
 
 ENDPROC(sha256_mb_mgr_submit_avx2)
 
-.data
-
+.section	.rodata.cst16.clear_low_nibble, "aM", @progbits, 16
 .align 16
 clear_low_nibble:
 	.octa	0x000000000000000000000000FFFFFFF0

+ 6 - 1
arch/x86/crypto/sha256-mb/sha256_x8_avx2.S

@@ -437,7 +437,8 @@ Lrounds_16_xx:
 
 	ret
 ENDPROC(sha256_x8_avx2)
-.data
+
+.section	.rodata.K256_8, "a", @progbits
 .align 64
 K256_8:
 	.octa	0x428a2f98428a2f98428a2f98428a2f98
@@ -568,10 +569,14 @@ K256_8:
 	.octa	0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
 	.octa	0xc67178f2c67178f2c67178f2c67178f2
 	.octa	0xc67178f2c67178f2c67178f2c67178f2
+
+.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
 PSHUFFLE_BYTE_FLIP_MASK:
 .octa 0x0c0d0e0f08090a0b0405060700010203
 .octa 0x0c0d0e0f08090a0b0405060700010203
 
+.section	.rodata.cst256.K256, "aM", @progbits, 256
 .align 64
 .global K256
 K256:

+ 7 - 1
arch/x86/crypto/sha256-ssse3-asm.S

@@ -474,7 +474,7 @@ done_hash:
 	ret
 ENDPROC(sha256_transform_ssse3)
 
-.data
+.section	.rodata.cst256.K256, "aM", @progbits, 256
 .align 64
 K256:
         .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
@@ -494,13 +494,19 @@ K256:
         .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
         .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 
+.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
 PSHUFFLE_BYTE_FLIP_MASK:
 	.octa 0x0c0d0e0f08090a0b0405060700010203
 
+.section	.rodata.cst16._SHUF_00BA, "aM", @progbits, 16
+.align 16
 # shuffle xBxA -> 00BA
 _SHUF_00BA:
 	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
 
+.section	.rodata.cst16._SHUF_DC00, "aM", @progbits, 16
+.align 16
 # shuffle xDxC -> DC00
 _SHUF_DC00:
 	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF

+ 3 - 1
arch/x86/crypto/sha256_ni_asm.S

@@ -329,7 +329,7 @@ ENTRY(sha256_ni_transform)
 	ret
 ENDPROC(sha256_ni_transform)
 
-.data
+.section	.rodata.cst256.K256, "aM", @progbits, 256
 .align 64
 K256:
 	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
@@ -349,5 +349,7 @@ K256:
 	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 
+.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+.align 16
 PSHUFFLE_BYTE_FLIP_MASK:
 	.octa 0x0c0d0e0f08090a0b0405060700010203

+ 6 - 3
arch/x86/crypto/sha512-avx-asm.S

@@ -370,14 +370,17 @@ ENDPROC(sha512_transform_avx)
 ########################################################################
 ### Binary Data
 
-.data
-
+.section	.rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
 .align 16
-
 # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
 XMM_QWORD_BSWAP:
 	.octa 0x08090a0b0c0d0e0f0001020304050607
 
+# Mergeable 640-byte rodata section. This allows linker to merge the table
+# with other, exactly the same 640-byte fragment of another rodata section
+# (if such section exists).
+.section	.rodata.cst640.K512, "aM", @progbits, 640
+.align 64
 # K[t] used in SHA512 hashing
 K512:
 	.quad 0x428a2f98d728ae22,0x7137449123ef65cd

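The cst640 naming is deliberate: K512 is 80 round constants of 8 bytes, 640 bytes in total, and the section entsize must equal that for the linker's merge pass to collapse the identical tables emitted by the AVX, AVX2 and SSSE3 objects into a single copy. A C-side check of the arithmetic (table elided):

    #include <stdint.h>

    static const uint64_t K512[80] = {
        0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
        /* ... the remaining 78 SHA-512 round constants ... */
    };

    /* 80 rounds x 8 bytes = 640 bytes, matching .rodata.cst640's entsize */
    _Static_assert(sizeof(K512) == 640, "entsize must equal the table size");
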
+ 8 - 2
arch/x86/crypto/sha512-avx2-asm.S

@@ -684,8 +684,11 @@ ENDPROC(sha512_transform_rorx)
 ########################################################################
 ### Binary Data
 
-.data
 
+# Mergeable 640-byte rodata section. This allows linker to merge the table
+# with other, exactly the same 640-byte fragment of another rodata section
+# (if such section exists).
+.section	.rodata.cst640.K512, "aM", @progbits, 640
 .align 64
 # K[t] used in SHA512 hashing
 K512:
@@ -730,14 +733,17 @@ K512:
 	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
 	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
 
+.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
 .align 32
-
 # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
 PSHUFFLE_BYTE_FLIP_MASK:
 	.octa 0x08090a0b0c0d0e0f0001020304050607
 	.octa 0x18191a1b1c1d1e1f1011121314151617
 
+.section	.rodata.cst32.MASK_YMM_LO, "aM", @progbits, 32
+.align 32
 MASK_YMM_LO:
 	.octa 0x00000000000000000000000000000000
 	.octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
+
 #endif

+ 42 - 22
arch/x86/crypto/sha512-mb/sha512_mb.c

@@ -221,7 +221,7 @@ static struct sha512_hash_ctx *sha512_ctx_mgr_resubmit
 }
 
 static struct sha512_hash_ctx
-		*sha512_ctx_mgr_get_comp_ctx(struct sha512_ctx_mgr *mgr)
+		*sha512_ctx_mgr_get_comp_ctx(struct mcryptd_alg_cstate *cstate)
 {
 	/*
 	 * If get_comp_job returns NULL, there are no jobs complete.
@@ -233,11 +233,17 @@ static struct sha512_hash_ctx
 	 * Otherwise, all jobs currently being managed by the hash_ctx_mgr
 	 * still need processing.
 	 */
+	struct sha512_ctx_mgr *mgr;
 	struct sha512_hash_ctx *ctx;
+	unsigned long flags;
 
+	mgr = cstate->mgr;
+	spin_lock_irqsave(&cstate->work_lock, flags);
 	ctx = (struct sha512_hash_ctx *)
 				sha512_job_mgr_get_comp_job(&mgr->mgr);
-	return sha512_ctx_mgr_resubmit(mgr, ctx);
+	ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+	spin_unlock_irqrestore(&cstate->work_lock, flags);
+	return ctx;
 }
 
 static void sha512_ctx_mgr_init(struct sha512_ctx_mgr *mgr)
@@ -246,12 +252,17 @@ static void sha512_ctx_mgr_init(struct sha512_ctx_mgr *mgr)
 }
 
 static struct sha512_hash_ctx
-			*sha512_ctx_mgr_submit(struct sha512_ctx_mgr *mgr,
+			*sha512_ctx_mgr_submit(struct mcryptd_alg_cstate *cstate,
 					  struct sha512_hash_ctx *ctx,
 					  const void *buffer,
 					  uint32_t len,
 					  int flags)
 {
+	struct sha512_ctx_mgr *mgr;
+	unsigned long irqflags;
+
+	mgr = cstate->mgr;
+	spin_lock_irqsave(&cstate->work_lock, irqflags);
 	if (flags & (~HASH_ENTIRE)) {
 		/*
 		 * User should not pass anything other than FIRST, UPDATE, or
@@ -351,20 +362,26 @@ static struct sha512_hash_ctx
 		}
 	}
 
-	return sha512_ctx_mgr_resubmit(mgr, ctx);
+	ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+	spin_unlock_irqrestore(&cstate->work_lock, irqflags);
+	return ctx;
 }
 
-static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct sha512_ctx_mgr *mgr)
+static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct mcryptd_alg_cstate *cstate)
 {
+	struct sha512_ctx_mgr *mgr;
 	struct sha512_hash_ctx *ctx;
+	unsigned long flags;
 
+	mgr = cstate->mgr;
+	spin_lock_irqsave(&cstate->work_lock, flags);
 	while (1) {
 		ctx = (struct sha512_hash_ctx *)
 					sha512_job_mgr_flush(&mgr->mgr);
 
 		/* If flush returned 0, there are no more jobs in flight. */
 		if (!ctx)
-			return NULL;
+			break;
 
 		/*
 		 * If flush returned a job, resubmit the job to finish
@@ -378,8 +395,10 @@ static struct sha512_hash_ctx *sha512_ctx_mgr_flush(struct sha512_ctx_mgr *mgr)
 		 * the sha512_ctx_mgr still need processing. Loop.
 		 */
 		if (ctx)
-			return ctx;
+			break;
 	}
+	spin_unlock_irqrestore(&cstate->work_lock, flags);
+	return ctx;
 }
 
 static int sha512_mb_init(struct ahash_request *areq)
@@ -439,11 +458,11 @@ static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
 		sha_ctx = (struct sha512_hash_ctx *)
 						ahash_request_ctx(&rctx->areq);
 		kernel_fpu_begin();
-		sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx,
+		sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx,
 						rctx->walk.data, nbytes, flag);
 		if (!sha_ctx) {
 			if (flush)
-				sha_ctx = sha512_ctx_mgr_flush(cstate->mgr);
+				sha_ctx = sha512_ctx_mgr_flush(cstate);
 		}
 		kernel_fpu_end();
 		if (sha_ctx)
@@ -471,11 +490,12 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
 	struct sha512_hash_ctx *sha_ctx;
 	struct mcryptd_hash_request_ctx *req_ctx;
 	int ret;
+	unsigned long flags;
 
 	/* remove from work list */
-	spin_lock(&cstate->work_lock);
+	spin_lock_irqsave(&cstate->work_lock, flags);
 	list_del(&rctx->waiter);
-	spin_unlock(&cstate->work_lock);
+	spin_unlock_irqrestore(&cstate->work_lock, flags);
 
 	if (irqs_disabled())
 		rctx->complete(&req->base, err);
@@ -486,14 +506,14 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
 	}
 
 	/* check to see if there are other jobs that are done */
-	sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate->mgr);
+	sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate);
 	while (sha_ctx) {
 		req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx);
 		ret = sha_finish_walk(&req_ctx, cstate, false);
 		if (req_ctx) {
-			spin_lock(&cstate->work_lock);
+			spin_lock_irqsave(&cstate->work_lock, flags);
 			list_del(&req_ctx->waiter);
-			spin_unlock(&cstate->work_lock);
+			spin_unlock_irqrestore(&cstate->work_lock, flags);
 
 			req = cast_mcryptd_ctx_to_req(req_ctx);
 			if (irqs_disabled())
@@ -504,7 +524,7 @@ static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
 				local_bh_enable();
 			}
 		}
-		sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate->mgr);
+		sha_ctx = sha512_ctx_mgr_get_comp_ctx(cstate);
 	}
 
 	return 0;
@@ -515,6 +535,7 @@ static void sha512_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
 {
 	unsigned long next_flush;
 	unsigned long delay = usecs_to_jiffies(FLUSH_INTERVAL);
+	unsigned long flags;
 
 	/* initialize tag */
 	rctx->tag.arrival = jiffies;    /* tag the arrival time */
@@ -522,9 +543,9 @@ static void sha512_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
 	next_flush = rctx->tag.arrival + delay;
 	rctx->tag.expire = next_flush;
 
-	spin_lock(&cstate->work_lock);
+	spin_lock_irqsave(&cstate->work_lock, flags);
 	list_add_tail(&rctx->waiter, &cstate->work_list);
-	spin_unlock(&cstate->work_lock);
+	spin_unlock_irqrestore(&cstate->work_lock, flags);
 
 	mcryptd_arm_flusher(cstate, delay);
 }
@@ -565,7 +586,7 @@ static int sha512_mb_update(struct ahash_request *areq)
 	sha_ctx = (struct sha512_hash_ctx *) ahash_request_ctx(areq);
 	sha512_mb_add_list(rctx, cstate);
 	kernel_fpu_begin();
-	sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+	sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, rctx->walk.data,
 							nbytes, HASH_UPDATE);
 	kernel_fpu_end();
 
@@ -628,7 +649,7 @@ static int sha512_mb_finup(struct ahash_request *areq)
 	sha512_mb_add_list(rctx, cstate);
 
 	kernel_fpu_begin();
-	sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data,
+	sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, rctx->walk.data,
 								nbytes, flag);
 	kernel_fpu_end();
 
@@ -677,8 +698,7 @@ static int sha512_mb_final(struct ahash_request *areq)
 	/* flag HASH_FINAL and 0 data size */
 	sha512_mb_add_list(rctx, cstate);
 	kernel_fpu_begin();
-	sha_ctx = sha512_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0,
-								HASH_LAST);
+	sha_ctx = sha512_ctx_mgr_submit(cstate, sha_ctx, &data, 0, HASH_LAST);
 	kernel_fpu_end();
 
 	/* check if anything is returned */
@@ -940,7 +960,7 @@ static unsigned long sha512_mb_flusher(struct mcryptd_alg_cstate *cstate)
 			break;
 		kernel_fpu_begin();
 		sha_ctx = (struct sha512_hash_ctx *)
-					sha512_ctx_mgr_flush(cstate->mgr);
+					sha512_ctx_mgr_flush(cstate);
 		kernel_fpu_end();
 		if (!sha_ctx) {
 			pr_err("sha512_mb error: nothing got flushed for"

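The common thread in this file: the job manager is now reached both from request context and from the flusher, so every sha512_job_mgr_* call moves under cstate->work_lock, and the plain spin_lock() calls become spin_lock_irqsave() so the lock is safe to take whether or not the caller already runs with interrupts disabled. The resulting skeleton (a sketch; field names as in the diff above):

    unsigned long flags;

    spin_lock_irqsave(&cstate->work_lock, flags);
    /* all sha512_job_mgr_* calls and work-list updates happen here,
     * so submit, flush and completion can no longer race */
    spin_unlock_irqrestore(&cstate->work_lock, flags);
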
+ 8 - 2
arch/x86/crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S

@@ -280,12 +280,18 @@ ENTRY(sha512_mb_mgr_get_comp_job_avx2)
 	pop     %rbx
         ret
 ENDPROC(sha512_mb_mgr_get_comp_job_avx2)
-.data
 
-.align 16
+.section	.rodata.cst8.one, "aM", @progbits, 8
+.align 8
 one:
 .quad  1
+
+.section	.rodata.cst8.two, "aM", @progbits, 8
+.align 8
 two:
 .quad  2
+
+.section	.rodata.cst8.three, "aM", @progbits, 8
+.align 8
 three:
 .quad  3

+ 3 - 1
arch/x86/crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S

@@ -209,8 +209,9 @@ return_null:
 	xor     job_rax, job_rax
 	jmp     return
 ENDPROC(sha512_mb_mgr_submit_avx2)
-.data
 
+/* UNUSED?
+.section	.rodata.cst16, "aM", @progbits, 16
 .align 16
 H0:     .int  0x6a09e667
 H1:     .int  0xbb67ae85
@@ -220,3 +221,4 @@ H4:     .int  0x510e527f
 H5:     .int  0x9b05688c
 H6:     .int  0x1f83d9ab
 H7:     .int  0x5be0cd19
+*/

+ 3 - 1
arch/x86/crypto/sha512-mb/sha512_x4_avx2.S

@@ -361,7 +361,7 @@ Lrounds_16_xx:
 	ret
 ENDPROC(sha512_x4_avx2)
 
-.data
+.section	.rodata.K512_4, "a", @progbits
 .align 64
 K512_4:
 	.octa 0x428a2f98d728ae22428a2f98d728ae22,\
@@ -525,5 +525,7 @@ K512_4:
 	.octa 0x6c44198c4a4758176c44198c4a475817,\
 		0x6c44198c4a4758176c44198c4a475817
 
+.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
+.align 32
 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x08090a0b0c0d0e0f0001020304050607
                          .octa 0x18191a1b1c1d1e1f1011121314151617

+ 6 - 3
arch/x86/crypto/sha512-ssse3-asm.S

@@ -369,14 +369,17 @@ ENDPROC(sha512_transform_ssse3)
 ########################################################################
 ### Binary Data
 
-.data
-
+.section	.rodata.cst16.XMM_QWORD_BSWAP, "aM", @progbits, 16
 .align 16
-
 # Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
 XMM_QWORD_BSWAP:
 	.octa 0x08090a0b0c0d0e0f0001020304050607
 
+# Mergeable 640-byte rodata section. This allows linker to merge the table
+# with other, exactly the same 640-byte fragment of another rodata section
+# (if such section exists).
+.section	.rodata.cst640.K512, "aM", @progbits, 640
+.align 64
 # K[t] used in SHA512 hashing
 K512:
 	.quad 0x428a2f98d728ae22,0x7137449123ef65cd

+ 4 - 2
arch/x86/crypto/twofish-avx-x86_64-asm_64.S

@@ -29,11 +29,13 @@
 
 .file "twofish-avx-x86_64-asm_64.S"
 
-.data
+.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
 .align 16
-
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.section	.rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
+.align 16
 .Lxts_gf128mul_and_shl1_mask:
 	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
 

+ 19 - 0
crypto/Kconfig

@@ -263,6 +263,7 @@ comment "Authenticated Encryption with Associated Data"
 config CRYPTO_CCM
 	tristate "CCM support"
 	select CRYPTO_CTR
+	select CRYPTO_HASH
 	select CRYPTO_AEAD
 	help
 	  Support for Counter with CBC MAC. Required for IPsec.
@@ -374,6 +375,7 @@ config CRYPTO_XTS
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_MANAGER
 	select CRYPTO_GF128MUL
+	select CRYPTO_ECB
 	help
 	  XTS: IEEE1619/D16 narrow block cipher use with aes-xts-plain,
 	  key size 256, 384 or 512 bits. This implementation currently
@@ -895,6 +897,23 @@ config CRYPTO_AES
 
 	  See <http://csrc.nist.gov/CryptoToolkit/aes/> for more information.
 
+config CRYPTO_AES_TI
+	tristate "Fixed time AES cipher"
+	select CRYPTO_ALGAPI
+	help
+	  This is a generic implementation of AES that attempts to eliminate
+	  data dependent latencies as much as possible without affecting
+	  performance too much. It is intended for use by the generic CCM
+	  and GCM drivers, and other CTR or CMAC/XCBC based modes that rely
+	  solely on encryption (decryption is supported as well, but with a
+	  more dramatic performance hit).
+	  solely on encryption (decryption is supported as well, but with a
+	  more dramatic performance hit).
+
+	  Instead of using 16 lookup tables of 1 KB each (8 for encryption and
+	  8 for decryption), this implementation uses just two S-boxes of
+	  256 bytes each, and attempts to eliminate data dependent latencies by
+	  prefetching the entire table into the cache at the start of each
+	  block.
+
 config CRYPTO_AES_586
 	tristate "AES cipher algorithms (i586)"
 	depends on (X86 || UML_X86) && !64BIT

+ 3 - 0
crypto/Makefile

@@ -75,6 +75,7 @@ obj-$(CONFIG_CRYPTO_SHA256) += sha256_generic.o
 obj-$(CONFIG_CRYPTO_SHA512) += sha512_generic.o
 obj-$(CONFIG_CRYPTO_SHA3) += sha3_generic.o
 obj-$(CONFIG_CRYPTO_WP512) += wp512.o
+CFLAGS_wp512.o := $(call cc-option,-fno-schedule-insns)  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149
 obj-$(CONFIG_CRYPTO_TGR192) += tgr192.o
 obj-$(CONFIG_CRYPTO_GF128MUL) += gf128mul.o
 obj-$(CONFIG_CRYPTO_ECB) += ecb.o
@@ -98,7 +99,9 @@ obj-$(CONFIG_CRYPTO_BLOWFISH_COMMON) += blowfish_common.o
 obj-$(CONFIG_CRYPTO_TWOFISH) += twofish_generic.o
 obj-$(CONFIG_CRYPTO_TWOFISH_COMMON) += twofish_common.o
 obj-$(CONFIG_CRYPTO_SERPENT) += serpent_generic.o
+CFLAGS_serpent_generic.o := $(call cc-option,-fsched-pressure)  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149
 obj-$(CONFIG_CRYPTO_AES) += aes_generic.o
+obj-$(CONFIG_CRYPTO_AES_TI) += aes_ti.o
 obj-$(CONFIG_CRYPTO_CAMELLIA) += camellia_generic.o
 obj-$(CONFIG_CRYPTO_CAST_COMMON) += cast_common.o
 obj-$(CONFIG_CRYPTO_CAST5) += cast5_generic.o

+ 3 - 2
crypto/ablkcipher.c

@@ -19,6 +19,7 @@
 #include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 
 #include <crypto/scatterwalk.h>
@@ -394,7 +395,7 @@ static int crypto_ablkcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_ablkcipher_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_ablkcipher_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	struct ablkcipher_alg *ablkcipher = &alg->cra_ablkcipher;
@@ -468,7 +469,7 @@ static int crypto_givcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_givcipher_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_givcipher_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	struct ablkcipher_alg *ablkcipher = &alg->cra_ablkcipher;

+ 2 - 1
crypto/acompress.c

@@ -20,6 +20,7 @@
 #include <linux/crypto.h>
 #include <crypto/algapi.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 #include <crypto/internal/acompress.h>
 #include <crypto/internal/scompress.h>
@@ -50,7 +51,7 @@ static int crypto_acomp_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_acomp_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 
 static void crypto_acomp_show(struct seq_file *m, struct crypto_alg *alg)
 {

+ 2 - 1
crypto/aead.c

@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 
 #include "internal.h"
@@ -132,7 +133,7 @@ static int crypto_aead_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_aead_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_aead_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	struct aead_alg *aead = container_of(alg, struct aead_alg, base);

+ 32 - 32
crypto/aes_generic.c

@@ -54,6 +54,7 @@
 #include <linux/errno.h>
 #include <linux/crypto.h>
 #include <asm/byteorder.h>
+#include <asm/unaligned.h>
 
 static inline u8 byte(const u32 x, const unsigned n)
 {
@@ -1216,7 +1217,6 @@ EXPORT_SYMBOL_GPL(crypto_il_tab);
 int crypto_aes_expand_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 		unsigned int key_len)
 {
-	const __le32 *key = (const __le32 *)in_key;
 	u32 i, t, u, v, w, j;
 
 	if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
@@ -1225,10 +1225,15 @@ int crypto_aes_expand_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 
 	ctx->key_length = key_len;
 
-	ctx->key_dec[key_len + 24] = ctx->key_enc[0] = le32_to_cpu(key[0]);
-	ctx->key_dec[key_len + 25] = ctx->key_enc[1] = le32_to_cpu(key[1]);
-	ctx->key_dec[key_len + 26] = ctx->key_enc[2] = le32_to_cpu(key[2]);
-	ctx->key_dec[key_len + 27] = ctx->key_enc[3] = le32_to_cpu(key[3]);
+	ctx->key_enc[0] = get_unaligned_le32(in_key);
+	ctx->key_enc[1] = get_unaligned_le32(in_key + 4);
+	ctx->key_enc[2] = get_unaligned_le32(in_key + 8);
+	ctx->key_enc[3] = get_unaligned_le32(in_key + 12);
+
+	ctx->key_dec[key_len + 24] = ctx->key_enc[0];
+	ctx->key_dec[key_len + 25] = ctx->key_enc[1];
+	ctx->key_dec[key_len + 26] = ctx->key_enc[2];
+	ctx->key_dec[key_len + 27] = ctx->key_enc[3];
 
 	switch (key_len) {
 	case AES_KEYSIZE_128:
@@ -1238,17 +1243,17 @@ int crypto_aes_expand_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 		break;
 
 	case AES_KEYSIZE_192:
-		ctx->key_enc[4] = le32_to_cpu(key[4]);
-		t = ctx->key_enc[5] = le32_to_cpu(key[5]);
+		ctx->key_enc[4] = get_unaligned_le32(in_key + 16);
+		t = ctx->key_enc[5] = get_unaligned_le32(in_key + 20);
 		for (i = 0; i < 8; ++i)
 			loop6(i);
 		break;
 
 	case AES_KEYSIZE_256:
-		ctx->key_enc[4] = le32_to_cpu(key[4]);
-		ctx->key_enc[5] = le32_to_cpu(key[5]);
-		ctx->key_enc[6] = le32_to_cpu(key[6]);
-		t = ctx->key_enc[7] = le32_to_cpu(key[7]);
+		ctx->key_enc[4] = get_unaligned_le32(in_key + 16);
+		ctx->key_enc[5] = get_unaligned_le32(in_key + 20);
+		ctx->key_enc[6] = get_unaligned_le32(in_key + 24);
+		t = ctx->key_enc[7] = get_unaligned_le32(in_key + 28);
 		for (i = 0; i < 6; ++i)
 			loop8(i);
 		loop8tophalf(i);
@@ -1329,16 +1334,14 @@ EXPORT_SYMBOL_GPL(crypto_aes_set_key);
 static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
 	const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
-	const __le32 *src = (const __le32 *)in;
-	__le32 *dst = (__le32 *)out;
 	u32 b0[4], b1[4];
 	const u32 *kp = ctx->key_enc + 4;
 	const int key_len = ctx->key_length;
 
-	b0[0] = le32_to_cpu(src[0]) ^ ctx->key_enc[0];
-	b0[1] = le32_to_cpu(src[1]) ^ ctx->key_enc[1];
-	b0[2] = le32_to_cpu(src[2]) ^ ctx->key_enc[2];
-	b0[3] = le32_to_cpu(src[3]) ^ ctx->key_enc[3];
+	b0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in);
+	b0[1] = ctx->key_enc[1] ^ get_unaligned_le32(in + 4);
+	b0[2] = ctx->key_enc[2] ^ get_unaligned_le32(in + 8);
+	b0[3] = ctx->key_enc[3] ^ get_unaligned_le32(in + 12);
 
 	if (key_len > 24) {
 		f_nround(b1, b0, kp);
@@ -1361,10 +1364,10 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	f_nround(b1, b0, kp);
 	f_lround(b0, b1, kp);
 
-	dst[0] = cpu_to_le32(b0[0]);
-	dst[1] = cpu_to_le32(b0[1]);
-	dst[2] = cpu_to_le32(b0[2]);
-	dst[3] = cpu_to_le32(b0[3]);
+	put_unaligned_le32(b0[0], out);
+	put_unaligned_le32(b0[1], out + 4);
+	put_unaligned_le32(b0[2], out + 8);
+	put_unaligned_le32(b0[3], out + 12);
 }
 
 /* decrypt a block of text */
@@ -1401,16 +1404,14 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 {
 	const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
-	const __le32 *src = (const __le32 *)in;
-	__le32 *dst = (__le32 *)out;
 	u32 b0[4], b1[4];
 	const int key_len = ctx->key_length;
 	const u32 *kp = ctx->key_dec + 4;
 
-	b0[0] = le32_to_cpu(src[0]) ^  ctx->key_dec[0];
-	b0[1] = le32_to_cpu(src[1]) ^  ctx->key_dec[1];
-	b0[2] = le32_to_cpu(src[2]) ^  ctx->key_dec[2];
-	b0[3] = le32_to_cpu(src[3]) ^  ctx->key_dec[3];
+	b0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in);
+	b0[1] = ctx->key_dec[1] ^ get_unaligned_le32(in + 4);
+	b0[2] = ctx->key_dec[2] ^ get_unaligned_le32(in + 8);
+	b0[3] = ctx->key_dec[3] ^ get_unaligned_le32(in + 12);
 
 	if (key_len > 24) {
 		i_nround(b1, b0, kp);
@@ -1433,10 +1434,10 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
 	i_nround(b1, b0, kp);
 	i_lround(b0, b1, kp);
 
-	dst[0] = cpu_to_le32(b0[0]);
-	dst[1] = cpu_to_le32(b0[1]);
-	dst[2] = cpu_to_le32(b0[2]);
-	dst[3] = cpu_to_le32(b0[3]);
+	put_unaligned_le32(b0[0], out);
+	put_unaligned_le32(b0[1], out + 4);
+	put_unaligned_le32(b0[2], out + 8);
+	put_unaligned_le32(b0[3], out + 12);
 }
 
 static struct crypto_alg aes_alg = {
@@ -1446,7 +1447,6 @@ static struct crypto_alg aes_alg = {
 	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
 	.cra_blocksize		=	AES_BLOCK_SIZE,
 	.cra_ctxsize		=	sizeof(struct crypto_aes_ctx),
-	.cra_alignmask		=	3,
 	.cra_module		=	THIS_MODULE,
 	.cra_u			=	{
 		.cipher = {

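Switching to get_unaligned_le32()/put_unaligned_le32() is what allows the cra_alignmask = 3 line to go away: the accessors are defined for any byte address, so the crypto core no longer needs to realign caller buffers before invoking the cipher. What the LE load computes, rendered portably (hypothetical name, cf. the kernel's get_unaligned_le32()):

    #include <stdint.h>

    static inline uint32_t load_le32(const uint8_t *p)
    {
        return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
               ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
    }
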
+ 375 - 0
crypto/aes_ti.c

@@ -0,0 +1,375 @@
+/*
+ * Scalar fixed time AES core transform
+ *
+ * Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <crypto/aes.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+#include <asm/unaligned.h>
+
+/*
+ * Emit the sbox as volatile const to prevent the compiler from doing
+ * constant folding on sbox references involving fixed indexes.
+ */
+static volatile const u8 __cacheline_aligned __aesti_sbox[] = {
+	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
+	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
+	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
+	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
+	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
+	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
+	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
+	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
+	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
+	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
+	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
+	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
+	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
+	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
+	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
+	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
+	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
+	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
+	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
+	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
+	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
+	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
+	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
+	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
+	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
+	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
+	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
+	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
+	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
+	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
+	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
+	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
+};
+
+static volatile const u8 __cacheline_aligned __aesti_inv_sbox[] = {
+	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
+	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
+	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
+	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
+	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
+	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
+	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
+	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
+	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
+	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
+	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
+	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
+	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
+	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
+	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
+	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
+	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
+	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
+	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
+	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
+	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
+	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
+	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
+	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
+	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
+	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
+	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
+	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
+	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
+	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
+	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
+	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d,
+};
+
+static u32 mul_by_x(u32 w)
+{
+	u32 x = w & 0x7f7f7f7f;
+	u32 y = w & 0x80808080;
+
+	/* multiply by polynomial 'x' (0b10) in GF(2^8) */
+	return (x << 1) ^ (y >> 7) * 0x1b;
+}
+
+static u32 mul_by_x2(u32 w)
+{
+	u32 x = w & 0x3f3f3f3f;
+	u32 y = w & 0x80808080;
+	u32 z = w & 0x40404040;
+
+	/* multiply by polynomial 'x^2' (0b100) in GF(2^8) */
+	return (x << 2) ^ (y >> 7) * 0x36 ^ (z >> 6) * 0x1b;
+}
+
+static u32 mix_columns(u32 x)
+{
+	/*
+	 * Perform the following matrix multiplication in GF(2^8)
+	 *
+	 * | 0x2 0x3 0x1 0x1 |   | x[0] |
+	 * | 0x1 0x2 0x3 0x1 |   | x[1] |
+	 * | 0x1 0x1 0x2 0x3 | x | x[2] |
+	 * | 0x3 0x1 0x1 0x3 |   | x[3] |
+	 */
+	u32 y = mul_by_x(x) ^ ror32(x, 16);
+
+	return y ^ ror32(x ^ y, 8);
+}
+
+static u32 inv_mix_columns(u32 x)
+{
+	/*
+	 * Perform the following matrix multiplication in GF(2^8)
+	 *
+	 * | 0xe 0xb 0xd 0x9 |   | x[0] |
+	 * | 0x9 0xe 0xb 0xd |   | x[1] |
+	 * | 0xd 0x9 0xe 0xb | x | x[2] |
+	 * | 0xb 0xd 0x9 0xe |   | x[3] |
+	 *
+	 * which can conveniently be reduced to
+	 *
+	 * | 0x2 0x3 0x1 0x1 |   | 0x5 0x0 0x4 0x0 |   | x[0] |
+	 * | 0x1 0x2 0x3 0x1 |   | 0x0 0x5 0x0 0x4 |   | x[1] |
+	 * | 0x1 0x1 0x2 0x3 | x | 0x4 0x0 0x5 0x0 | x | x[2] |
+	 * | 0x3 0x1 0x1 0x2 |   | 0x0 0x4 0x0 0x5 |   | x[3] |
+	 */
+	u32 y = mul_by_x2(x);
+
+	return mix_columns(x ^ y ^ ror32(y, 16));
+}
+
+static __always_inline u32 subshift(u32 in[], int pos)
+{
+	return (__aesti_sbox[in[pos] & 0xff]) ^
+	       (__aesti_sbox[(in[(pos + 1) % 4] >>  8) & 0xff] <<  8) ^
+	       (__aesti_sbox[(in[(pos + 2) % 4] >> 16) & 0xff] << 16) ^
+	       (__aesti_sbox[(in[(pos + 3) % 4] >> 24) & 0xff] << 24);
+}
+
+static __always_inline u32 inv_subshift(u32 in[], int pos)
+{
+	return (__aesti_inv_sbox[in[pos] & 0xff]) ^
+	       (__aesti_inv_sbox[(in[(pos + 3) % 4] >>  8) & 0xff] <<  8) ^
+	       (__aesti_inv_sbox[(in[(pos + 2) % 4] >> 16) & 0xff] << 16) ^
+	       (__aesti_inv_sbox[(in[(pos + 1) % 4] >> 24) & 0xff] << 24);
+}
+
+static u32 subw(u32 in)
+{
+	return (__aesti_sbox[in & 0xff]) ^
+	       (__aesti_sbox[(in >>  8) & 0xff] <<  8) ^
+	       (__aesti_sbox[(in >> 16) & 0xff] << 16) ^
+	       (__aesti_sbox[(in >> 24) & 0xff] << 24);
+}
+
+static int aesti_expand_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
+			    unsigned int key_len)
+{
+	u32 kwords = key_len / sizeof(u32);
+	u32 rc, i, j;
+
+	if (key_len != AES_KEYSIZE_128 &&
+	    key_len != AES_KEYSIZE_192 &&
+	    key_len != AES_KEYSIZE_256)
+		return -EINVAL;
+
+	ctx->key_length = key_len;
+
+	for (i = 0; i < kwords; i++)
+		ctx->key_enc[i] = get_unaligned_le32(in_key + i * sizeof(u32));
+
+	for (i = 0, rc = 1; i < 10; i++, rc = mul_by_x(rc)) {
+		u32 *rki = ctx->key_enc + (i * kwords);
+		u32 *rko = rki + kwords;
+
+		rko[0] = ror32(subw(rki[kwords - 1]), 8) ^ rc ^ rki[0];
+		rko[1] = rko[0] ^ rki[1];
+		rko[2] = rko[1] ^ rki[2];
+		rko[3] = rko[2] ^ rki[3];
+
+		if (key_len == 24) {
+			if (i >= 7)
+				break;
+			rko[4] = rko[3] ^ rki[4];
+			rko[5] = rko[4] ^ rki[5];
+		} else if (key_len == 32) {
+			if (i >= 6)
+				break;
+			rko[4] = subw(rko[3]) ^ rki[4];
+			rko[5] = rko[4] ^ rki[5];
+			rko[6] = rko[5] ^ rki[6];
+			rko[7] = rko[6] ^ rki[7];
+		}
+	}
+
+	/*
+	 * Generate the decryption keys for the Equivalent Inverse Cipher.
+	 * This involves reversing the order of the round keys, and applying
+	 * the Inverse Mix Columns transformation to all but the first and
+	 * the last one.
+	 */
+	ctx->key_dec[0] = ctx->key_enc[key_len + 24];
+	ctx->key_dec[1] = ctx->key_enc[key_len + 25];
+	ctx->key_dec[2] = ctx->key_enc[key_len + 26];
+	ctx->key_dec[3] = ctx->key_enc[key_len + 27];
+
+	for (i = 4, j = key_len + 20; j > 0; i += 4, j -= 4) {
+		ctx->key_dec[i]     = inv_mix_columns(ctx->key_enc[j]);
+		ctx->key_dec[i + 1] = inv_mix_columns(ctx->key_enc[j + 1]);
+		ctx->key_dec[i + 2] = inv_mix_columns(ctx->key_enc[j + 2]);
+		ctx->key_dec[i + 3] = inv_mix_columns(ctx->key_enc[j + 3]);
+	}
+
+	ctx->key_dec[i]     = ctx->key_enc[0];
+	ctx->key_dec[i + 1] = ctx->key_enc[1];
+	ctx->key_dec[i + 2] = ctx->key_enc[2];
+	ctx->key_dec[i + 3] = ctx->key_enc[3];
+
+	return 0;
+}
+
+static int aesti_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+			 unsigned int key_len)
+{
+	struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	int err;
+
+	err = aesti_expand_key(ctx, in_key, key_len);
+	if (err)
+		return err;
+
+	/*
+	 * In order to force the compiler to emit data independent Sbox lookups
+	 * at the start of each block, xor the first round key with values at
+	 * fixed indexes in the Sbox. This will need to be repeated each time
+	 * the key is used, which will pull the entire Sbox into the D-cache
+	 * before any data dependent Sbox lookups are performed.
+	 */
+	ctx->key_enc[0] ^= __aesti_sbox[ 0] ^ __aesti_sbox[128];
+	ctx->key_enc[1] ^= __aesti_sbox[32] ^ __aesti_sbox[160];
+	ctx->key_enc[2] ^= __aesti_sbox[64] ^ __aesti_sbox[192];
+	ctx->key_enc[3] ^= __aesti_sbox[96] ^ __aesti_sbox[224];
+
+	ctx->key_dec[0] ^= __aesti_inv_sbox[ 0] ^ __aesti_inv_sbox[128];
+	ctx->key_dec[1] ^= __aesti_inv_sbox[32] ^ __aesti_inv_sbox[160];
+	ctx->key_dec[2] ^= __aesti_inv_sbox[64] ^ __aesti_inv_sbox[192];
+	ctx->key_dec[3] ^= __aesti_inv_sbox[96] ^ __aesti_inv_sbox[224];
+
+	return 0;
+}
+
+static void aesti_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+	const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	const u32 *rkp = ctx->key_enc + 4;
+	int rounds = 6 + ctx->key_length / 4;
+	u32 st0[4], st1[4];
+	int round;
+
+	st0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in);
+	st0[1] = ctx->key_enc[1] ^ get_unaligned_le32(in + 4);
+	st0[2] = ctx->key_enc[2] ^ get_unaligned_le32(in + 8);
+	st0[3] = ctx->key_enc[3] ^ get_unaligned_le32(in + 12);
+
+	st0[0] ^= __aesti_sbox[ 0] ^ __aesti_sbox[128];
+	st0[1] ^= __aesti_sbox[32] ^ __aesti_sbox[160];
+	st0[2] ^= __aesti_sbox[64] ^ __aesti_sbox[192];
+	st0[3] ^= __aesti_sbox[96] ^ __aesti_sbox[224];
+
+	for (round = 0;; round += 2, rkp += 8) {
+		st1[0] = mix_columns(subshift(st0, 0)) ^ rkp[0];
+		st1[1] = mix_columns(subshift(st0, 1)) ^ rkp[1];
+		st1[2] = mix_columns(subshift(st0, 2)) ^ rkp[2];
+		st1[3] = mix_columns(subshift(st0, 3)) ^ rkp[3];
+
+		if (round == rounds - 2)
+			break;
+
+		st0[0] = mix_columns(subshift(st1, 0)) ^ rkp[4];
+		st0[1] = mix_columns(subshift(st1, 1)) ^ rkp[5];
+		st0[2] = mix_columns(subshift(st1, 2)) ^ rkp[6];
+		st0[3] = mix_columns(subshift(st1, 3)) ^ rkp[7];
+	}
+
+	put_unaligned_le32(subshift(st1, 0) ^ rkp[4], out);
+	put_unaligned_le32(subshift(st1, 1) ^ rkp[5], out + 4);
+	put_unaligned_le32(subshift(st1, 2) ^ rkp[6], out + 8);
+	put_unaligned_le32(subshift(st1, 3) ^ rkp[7], out + 12);
+}
+
+static void aesti_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
+{
+	const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	const u32 *rkp = ctx->key_dec + 4;
+	int rounds = 6 + ctx->key_length / 4;
+	u32 st0[4], st1[4];
+	int round;
+
+	st0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in);
+	st0[1] = ctx->key_dec[1] ^ get_unaligned_le32(in + 4);
+	st0[2] = ctx->key_dec[2] ^ get_unaligned_le32(in + 8);
+	st0[3] = ctx->key_dec[3] ^ get_unaligned_le32(in + 12);
+
+	st0[0] ^= __aesti_inv_sbox[ 0] ^ __aesti_inv_sbox[128];
+	st0[1] ^= __aesti_inv_sbox[32] ^ __aesti_inv_sbox[160];
+	st0[2] ^= __aesti_inv_sbox[64] ^ __aesti_inv_sbox[192];
+	st0[3] ^= __aesti_inv_sbox[96] ^ __aesti_inv_sbox[224];
+
+	for (round = 0;; round += 2, rkp += 8) {
+		st1[0] = inv_mix_columns(inv_subshift(st0, 0)) ^ rkp[0];
+		st1[1] = inv_mix_columns(inv_subshift(st0, 1)) ^ rkp[1];
+		st1[2] = inv_mix_columns(inv_subshift(st0, 2)) ^ rkp[2];
+		st1[3] = inv_mix_columns(inv_subshift(st0, 3)) ^ rkp[3];
+
+		if (round == rounds - 2)
+			break;
+
+		st0[0] = inv_mix_columns(inv_subshift(st1, 0)) ^ rkp[4];
+		st0[1] = inv_mix_columns(inv_subshift(st1, 1)) ^ rkp[5];
+		st0[2] = inv_mix_columns(inv_subshift(st1, 2)) ^ rkp[6];
+		st0[3] = inv_mix_columns(inv_subshift(st1, 3)) ^ rkp[7];
+	}
+
+	put_unaligned_le32(inv_subshift(st1, 0) ^ rkp[4], out);
+	put_unaligned_le32(inv_subshift(st1, 1) ^ rkp[5], out + 4);
+	put_unaligned_le32(inv_subshift(st1, 2) ^ rkp[6], out + 8);
+	put_unaligned_le32(inv_subshift(st1, 3) ^ rkp[7], out + 12);
+}
+
+static struct crypto_alg aes_alg = {
+	.cra_name			= "aes",
+	.cra_driver_name		= "aes-fixed-time",
+	.cra_priority			= 100 + 1,
+	.cra_flags			= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize			= AES_BLOCK_SIZE,
+	.cra_ctxsize			= sizeof(struct crypto_aes_ctx),
+	.cra_module			= THIS_MODULE,
+
+	.cra_cipher.cia_min_keysize	= AES_MIN_KEY_SIZE,
+	.cra_cipher.cia_max_keysize	= AES_MAX_KEY_SIZE,
+	.cra_cipher.cia_setkey		= aesti_set_key,
+	.cra_cipher.cia_encrypt		= aesti_encrypt,
+	.cra_cipher.cia_decrypt		= aesti_decrypt
+};
+
+static int __init aes_init(void)
+{
+	return crypto_register_alg(&aes_alg);
+}
+
+static void __exit aes_fini(void)
+{
+	crypto_unregister_alg(&aes_alg);
+}
+
+module_init(aes_init);
+module_exit(aes_fini);
+
+MODULE_DESCRIPTION("Generic fixed time AES");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
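
Two details in aes_ti.c reward a second look. The first is the pairing of S-box XORs: aesti_set_key() folds __aesti_sbox[0] ^ __aesti_sbox[128] (and so on) into the first four round-key words, and aesti_encrypt() XORs the same values into the state before use, so the two cancel and the true round key is recovered, but only after loads spaced across the whole 256-byte table have pulled it into the D-cache. Reduced to one word (a sketch; names stand in for the objects above):

    #include <stdint.h>

    static uint32_t whiten_first_word(const volatile uint8_t *sbox,
                                      uint32_t stored_key0, uint32_t in0)
    {
        /* stored_key0 already contains sbox[0] ^ sbox[128], folded in at
         * setkey time; XORing it again cancels the disguise and primes
         * the cache as a side effect */
        return (stored_key0 ^ in0) ^ sbox[0] ^ sbox[128];
    }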

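The second is the packed GF(2^8) arithmetic: mul_by_x() doubles all four bytes of a word at once, shifting the low seven bits of each byte left and folding each byte's high bit back in as the reduction constant 0x1b (from the AES polynomial x^8 + x^4 + x^3 + x + 1). A small self-check of the function as written above:

    #include <assert.h>
    #include <stdint.h>

    static uint32_t mul_by_x(uint32_t w)
    {
        uint32_t x = w & 0x7f7f7f7f;    /* bits that shift without overflow */
        uint32_t y = w & 0x80808080;    /* high bits that wrap into 0x1b */

        return (x << 1) ^ (y >> 7) * 0x1b;
    }

    int main(void)
    {
        assert(mul_by_x(0x00000001) == 0x00000002); /* 1 * x = x */
        assert(mul_by_x(0x00000080) == 0x0000001b); /* 0x80 * x mod 0x11b */
        assert(mul_by_x(0x80018001) == 0x1b021b02); /* per byte, packed */
        return 0;
    }
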
+ 2 - 1
crypto/ahash.c

@@ -23,6 +23,7 @@
 #include <linux/slab.h>
 #include <linux/seq_file.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 
 #include "internal.h"
@@ -493,7 +494,7 @@ static int crypto_ahash_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	seq_printf(m, "type         : ahash\n");

+ 2 - 1
crypto/akcipher.c

@@ -17,6 +17,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/crypto.h>
+#include <linux/compiler.h>
 #include <crypto/algapi.h>
 #include <linux/cryptouser.h>
 #include <net/netlink.h>
@@ -47,7 +48,7 @@ static int crypto_akcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_akcipher_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 
 static void crypto_akcipher_show(struct seq_file *m, struct crypto_alg *alg)
 {

+ 50 - 18
crypto/algapi.c

@@ -962,34 +962,66 @@ void crypto_inc(u8 *a, unsigned int size)
 	__be32 *b = (__be32 *)(a + size);
 	u32 c;
 
-	for (; size >= 4; size -= 4) {
-		c = be32_to_cpu(*--b) + 1;
-		*b = cpu_to_be32(c);
-		if (c)
-			return;
-	}
+	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
+	    !((unsigned long)b & (__alignof__(*b) - 1)))
+		for (; size >= 4; size -= 4) {
+			c = be32_to_cpu(*--b) + 1;
+			*b = cpu_to_be32(c);
+			if (c)
+				return;
+		}
 
 	crypto_inc_byte(a, size);
 }
 EXPORT_SYMBOL_GPL(crypto_inc);
 
-static inline void crypto_xor_byte(u8 *a, const u8 *b, unsigned int size)
+void __crypto_xor(u8 *dst, const u8 *src, unsigned int len)
 {
-	for (; size; size--)
-		*a++ ^= *b++;
-}
+	int relalign = 0;
+
+	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
+		int size = sizeof(unsigned long);
+		int d = ((unsigned long)dst ^ (unsigned long)src) & (size - 1);
+
+		relalign = d ? 1 << __ffs(d) : size;
+
+		/*
+		 * If we care about alignment, process as many bytes as
+		 * needed to advance dst and src to values whose alignments
+		 * equal their relative alignment. This will allow us to
+		 * process the remainder of the input using optimal strides.
+		 */
+		while (((unsigned long)dst & (relalign - 1)) && len > 0) {
+			*dst++ ^= *src++;
+			len--;
+		}
+	}
 
-void crypto_xor(u8 *dst, const u8 *src, unsigned int size)
-{
-	u32 *a = (u32 *)dst;
-	u32 *b = (u32 *)src;
+	while (IS_ENABLED(CONFIG_64BIT) && len >= 8 && !(relalign & 7)) {
+		*(u64 *)dst ^= *(u64 *)src;
+		dst += 8;
+		src += 8;
+		len -= 8;
+	}
 
-	for (; size >= 4; size -= 4)
-		*a++ ^= *b++;
+	while (len >= 4 && !(relalign & 3)) {
+		*(u32 *)dst ^= *(u32 *)src;
+		dst += 4;
+		src += 4;
+		len -= 4;
+	}
+
+	while (len >= 2 && !(relalign & 1)) {
+		*(u16 *)dst ^= *(u16 *)src;
+		dst += 2;
+		src += 2;
+		len -= 2;
+	}
 
-	crypto_xor_byte((u8 *)a, (u8 *)b, size);
+	while (len--)
+		*dst++ ^= *src++;
 }
-EXPORT_SYMBOL_GPL(crypto_xor);
+EXPORT_SYMBOL_GPL(__crypto_xor);
 
 unsigned int crypto_alg_extsize(struct crypto_alg *alg)
 {

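The relalign computation above is the heart of the new __crypto_xor(): (dst ^ src) & (wordsize - 1) captures the address bits in which the two pointers can never agree, so the lowest set bit bounds the largest power-of-two stride at which both can be simultaneously aligned; crypto_inc() applies the same idea with a single alignment test. A worked example (using __builtin_ctz() where the kernel uses __ffs(); the addresses are hypothetical):

    #include <stdio.h>

    int main(void)
    {
        unsigned long dst = 0x1003, src = 0x2007;
        int size = sizeof(unsigned long);
        int d = (int)((dst ^ src) & (size - 1));
        int relalign = d ? 1 << __builtin_ctz(d) : size;

        /* d == 4 here, so relalign == 4: once dst reaches a 4-byte
         * boundary, src lands on one too, and 4-byte XORs are safe */
        printf("relalign = %d\n", relalign);
        return 0;
    }
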
+ 1 - 1
crypto/algif_hash.c

@@ -245,7 +245,7 @@ static int hash_accept(struct socket *sock, struct socket *newsock, int flags)
 	struct alg_sock *ask = alg_sk(sk);
 	struct hash_ctx *ctx = ask->private;
 	struct ahash_request *req = &ctx->req;
-	char state[crypto_ahash_statesize(crypto_ahash_reqtfm(req))];
+	char state[crypto_ahash_statesize(crypto_ahash_reqtfm(req)) ? : 1];
 	struct sock *sk2;
 	struct alg_sock *ask2;
 	struct hash_ctx *ctx2;

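The "? : 1" addition guards against a zero-length variable-length array, which is undefined behaviour in C: an ahash transform may legitimately report a statesize of zero. GNU C's "a ? : b" yields a when a is nonzero and b otherwise, so spelled out the line reads (sketch of the equivalent form):

    unsigned int size = crypto_ahash_statesize(crypto_ahash_reqtfm(req));
    char state[size ? size : 1];    /* never a zero-length VLA */
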
+ 4 - 3
crypto/blkcipher.c

@@ -1,6 +1,6 @@
 /*
  * Block chaining cipher operations.
- * 
+ *
  * Generic encrypt/decrypt wrapper for ciphers, handles operations across
  * multiple page boundaries by using temporary blocks.  In user context,
  * the kernel is given a chance to schedule us once per page.
@@ -9,7 +9,7 @@
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option) 
+ * Software Foundation; either version 2 of the License, or (at your option)
  * any later version.
  *
  */
@@ -25,6 +25,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 
 #include "internal.h"
@@ -534,7 +535,7 @@ static int crypto_blkcipher_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_blkcipher_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_blkcipher_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	seq_printf(m, "type         : blkcipher\n");

+ 0 - 3
crypto/cbc.c

@@ -145,9 +145,6 @@ static int crypto_cbc_create(struct crypto_template *tmpl, struct rtattr **tb)
 	inst->alg.base.cra_blocksize = alg->cra_blocksize;
 	inst->alg.base.cra_alignmask = alg->cra_alignmask;
 
-	/* We access the data as u32s when xoring. */
-	inst->alg.base.cra_alignmask |= __alignof__(u32) - 1;
-
 	inst->alg.ivsize = alg->cra_blocksize;
 	inst->alg.min_keysize = alg->cra_cipher.cia_min_keysize;
 	inst->alg.max_keysize = alg->cra_cipher.cia_max_keysize;

+ 247 - 139
crypto/ccm.c

@@ -11,6 +11,7 @@
  */
 
 #include <crypto/internal/aead.h>
+#include <crypto/internal/hash.h>
 #include <crypto/internal/skcipher.h>
 #include <crypto/scatterwalk.h>
 #include <linux/err.h>
@@ -23,11 +24,11 @@
 
 struct ccm_instance_ctx {
 	struct crypto_skcipher_spawn ctr;
-	struct crypto_spawn cipher;
+	struct crypto_ahash_spawn mac;
 };
 
 struct crypto_ccm_ctx {
-	struct crypto_cipher *cipher;
+	struct crypto_ahash *mac;
 	struct crypto_skcipher *ctr;
 };
 
@@ -44,15 +45,21 @@ struct crypto_rfc4309_req_ctx {
 
 struct crypto_ccm_req_priv_ctx {
 	u8 odata[16];
-	u8 idata[16];
 	u8 auth_tag[16];
-	u32 ilen;
 	u32 flags;
 	struct scatterlist src[3];
 	struct scatterlist dst[3];
 	struct skcipher_request skreq;
 };
 
+struct cbcmac_tfm_ctx {
+	struct crypto_cipher *child;
+};
+
+struct cbcmac_desc_ctx {
+	unsigned int len;
+};
+
 static inline struct crypto_ccm_req_priv_ctx *crypto_ccm_reqctx(
 	struct aead_request *req)
 {
@@ -84,7 +91,7 @@ static int crypto_ccm_setkey(struct crypto_aead *aead, const u8 *key,
 {
 	struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead);
 	struct crypto_skcipher *ctr = ctx->ctr;
-	struct crypto_cipher *tfm = ctx->cipher;
+	struct crypto_ahash *mac = ctx->mac;
 	int err = 0;
 
 	crypto_skcipher_clear_flags(ctr, CRYPTO_TFM_REQ_MASK);
@@ -96,11 +103,11 @@ static int crypto_ccm_setkey(struct crypto_aead *aead, const u8 *key,
 	if (err)
 		goto out;
 
-	crypto_cipher_clear_flags(tfm, CRYPTO_TFM_REQ_MASK);
-	crypto_cipher_set_flags(tfm, crypto_aead_get_flags(aead) &
+	crypto_ahash_clear_flags(mac, CRYPTO_TFM_REQ_MASK);
+	crypto_ahash_set_flags(mac, crypto_aead_get_flags(aead) &
 				    CRYPTO_TFM_REQ_MASK);
-	err = crypto_cipher_setkey(tfm, key, keylen);
-	crypto_aead_set_flags(aead, crypto_cipher_get_flags(tfm) &
+	err = crypto_ahash_setkey(mac, key, keylen);
+	crypto_aead_set_flags(aead, crypto_ahash_get_flags(mac) &
 			      CRYPTO_TFM_RES_MASK);
 
 out:
@@ -167,119 +174,61 @@ static int format_adata(u8 *adata, unsigned int a)
 	return len;
 }
 
-static void compute_mac(struct crypto_cipher *tfm, u8 *data, int n,
-		       struct crypto_ccm_req_priv_ctx *pctx)
-{
-	unsigned int bs = 16;
-	u8 *odata = pctx->odata;
-	u8 *idata = pctx->idata;
-	int datalen, getlen;
-
-	datalen = n;
-
-	/* first time in here, block may be partially filled. */
-	getlen = bs - pctx->ilen;
-	if (datalen >= getlen) {
-		memcpy(idata + pctx->ilen, data, getlen);
-		crypto_xor(odata, idata, bs);
-		crypto_cipher_encrypt_one(tfm, odata, odata);
-		datalen -= getlen;
-		data += getlen;
-		pctx->ilen = 0;
-	}
-
-	/* now encrypt rest of data */
-	while (datalen >= bs) {
-		crypto_xor(odata, data, bs);
-		crypto_cipher_encrypt_one(tfm, odata, odata);
-
-		datalen -= bs;
-		data += bs;
-	}
-
-	/* check and see if there's leftover data that wasn't
-	 * enough to fill a block.
-	 */
-	if (datalen) {
-		memcpy(idata + pctx->ilen, data, datalen);
-		pctx->ilen += datalen;
-	}
-}
-
-static void get_data_to_compute(struct crypto_cipher *tfm,
-			       struct crypto_ccm_req_priv_ctx *pctx,
-			       struct scatterlist *sg, unsigned int len)
-{
-	struct scatter_walk walk;
-	u8 *data_src;
-	int n;
-
-	scatterwalk_start(&walk, sg);
-
-	while (len) {
-		n = scatterwalk_clamp(&walk, len);
-		if (!n) {
-			scatterwalk_start(&walk, sg_next(walk.sg));
-			n = scatterwalk_clamp(&walk, len);
-		}
-		data_src = scatterwalk_map(&walk);
-
-		compute_mac(tfm, data_src, n, pctx);
-		len -= n;
-
-		scatterwalk_unmap(data_src);
-		scatterwalk_advance(&walk, n);
-		scatterwalk_done(&walk, 0, len);
-		if (len)
-			crypto_yield(pctx->flags);
-	}
-
-	/* any leftover needs padding and then encrypted */
-	if (pctx->ilen) {
-		int padlen;
-		u8 *odata = pctx->odata;
-		u8 *idata = pctx->idata;
-
-		padlen = 16 - pctx->ilen;
-		memset(idata + pctx->ilen, 0, padlen);
-		crypto_xor(odata, idata, 16);
-		crypto_cipher_encrypt_one(tfm, odata, odata);
-		pctx->ilen = 0;
-	}
-}
-
 static int crypto_ccm_auth(struct aead_request *req, struct scatterlist *plain,
 			   unsigned int cryptlen)
 {
+	struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req);
 	struct crypto_aead *aead = crypto_aead_reqtfm(req);
 	struct crypto_ccm_ctx *ctx = crypto_aead_ctx(aead);
-	struct crypto_ccm_req_priv_ctx *pctx = crypto_ccm_reqctx(req);
-	struct crypto_cipher *cipher = ctx->cipher;
+	AHASH_REQUEST_ON_STACK(ahreq, ctx->mac);
 	unsigned int assoclen = req->assoclen;
-	u8 *odata = pctx->odata;
-	u8 *idata = pctx->idata;
-	int err;
+	struct scatterlist sg[3];
+	u8 odata[16];
+	u8 idata[16];
+	int ilen, err;
 
 	/* format control data for input */
 	err = format_input(odata, req, cryptlen);
 	if (err)
 		goto out;
 
-	/* encrypt first block to use as start in computing mac  */
-	crypto_cipher_encrypt_one(cipher, odata, odata);
+	sg_init_table(sg, 3);
+	sg_set_buf(&sg[0], odata, 16);
 
 	/* format associated data and compute into mac */
 	if (assoclen) {
-		pctx->ilen = format_adata(idata, assoclen);
-		get_data_to_compute(cipher, pctx, req->src, req->assoclen);
+		ilen = format_adata(idata, assoclen);
+		sg_set_buf(&sg[1], idata, ilen);
+		sg_chain(sg, 3, req->src);
 	} else {
-		pctx->ilen = 0;
+		ilen = 0;
+		sg_chain(sg, 2, req->src);
 	}
 
-	/* compute plaintext into mac */
-	if (cryptlen)
-		get_data_to_compute(cipher, pctx, plain, cryptlen);
+	ahash_request_set_tfm(ahreq, ctx->mac);
+	ahash_request_set_callback(ahreq, pctx->flags, NULL, NULL);
+	ahash_request_set_crypt(ahreq, sg, NULL, assoclen + ilen + 16);
+	err = crypto_ahash_init(ahreq);
+	if (err)
+		goto out;
+	err = crypto_ahash_update(ahreq);
+	if (err)
+		goto out;
 
+	/* we need to pad the MAC input to a round multiple of the block size */
+	ilen = 16 - (assoclen + ilen) % 16;
+	if (ilen < 16) {
+		memset(idata, 0, ilen);
+		sg_init_table(sg, 2);
+		sg_set_buf(&sg[0], idata, ilen);
+		if (plain)
+			sg_chain(sg, 2, plain);
+		plain = sg;
+		cryptlen += ilen;
+	}
+
+	ahash_request_set_crypt(ahreq, plain, pctx->odata, cryptlen);
+	err = crypto_ahash_finup(ahreq);
 out:
 	return err;
 }
@@ -453,21 +402,21 @@ static int crypto_ccm_init_tfm(struct crypto_aead *tfm)
 	struct aead_instance *inst = aead_alg_instance(tfm);
 	struct ccm_instance_ctx *ictx = aead_instance_ctx(inst);
 	struct crypto_ccm_ctx *ctx = crypto_aead_ctx(tfm);
-	struct crypto_cipher *cipher;
+	struct crypto_ahash *mac;
 	struct crypto_skcipher *ctr;
 	unsigned long align;
 	int err;
 
-	cipher = crypto_spawn_cipher(&ictx->cipher);
-	if (IS_ERR(cipher))
-		return PTR_ERR(cipher);
+	mac = crypto_spawn_ahash(&ictx->mac);
+	if (IS_ERR(mac))
+		return PTR_ERR(mac);
 
 	ctr = crypto_spawn_skcipher(&ictx->ctr);
 	err = PTR_ERR(ctr);
 	if (IS_ERR(ctr))
-		goto err_free_cipher;
+		goto err_free_mac;
 
-	ctx->cipher = cipher;
+	ctx->mac = mac;
 	ctx->ctr = ctr;
 
 	align = crypto_aead_alignmask(tfm);
@@ -479,8 +428,8 @@ static int crypto_ccm_init_tfm(struct crypto_aead *tfm)
 
 	return 0;
 
-err_free_cipher:
-	crypto_free_cipher(cipher);
+err_free_mac:
+	crypto_free_ahash(mac);
 	return err;
 }
 
@@ -488,7 +437,7 @@ static void crypto_ccm_exit_tfm(struct crypto_aead *tfm)
 {
 	struct crypto_ccm_ctx *ctx = crypto_aead_ctx(tfm);
 
-	crypto_free_cipher(ctx->cipher);
+	crypto_free_ahash(ctx->mac);
 	crypto_free_skcipher(ctx->ctr);
 }
 
@@ -496,7 +445,7 @@ static void crypto_ccm_free(struct aead_instance *inst)
 {
 	struct ccm_instance_ctx *ctx = aead_instance_ctx(inst);
 
-	crypto_drop_spawn(&ctx->cipher);
+	crypto_drop_ahash(&ctx->mac);
 	crypto_drop_skcipher(&ctx->ctr);
 	kfree(inst);
 }
@@ -505,12 +454,13 @@ static int crypto_ccm_create_common(struct crypto_template *tmpl,
 				    struct rtattr **tb,
 				    const char *full_name,
 				    const char *ctr_name,
-				    const char *cipher_name)
+				    const char *mac_name)
 {
 	struct crypto_attr_type *algt;
 	struct aead_instance *inst;
 	struct skcipher_alg *ctr;
-	struct crypto_alg *cipher;
+	struct crypto_alg *mac_alg;
+	struct hash_alg_common *mac;
 	struct ccm_instance_ctx *ictx;
 	int err;
 
@@ -521,25 +471,26 @@ static int crypto_ccm_create_common(struct crypto_template *tmpl,
 	if ((algt->type ^ CRYPTO_ALG_TYPE_AEAD) & algt->mask)
 		return -EINVAL;
 
-	cipher = crypto_alg_mod_lookup(cipher_name,  CRYPTO_ALG_TYPE_CIPHER,
-				       CRYPTO_ALG_TYPE_MASK);
-	if (IS_ERR(cipher))
-		return PTR_ERR(cipher);
+	mac_alg = crypto_find_alg(mac_name, &crypto_ahash_type,
+				  CRYPTO_ALG_TYPE_HASH,
+				  CRYPTO_ALG_TYPE_AHASH_MASK |
+				  CRYPTO_ALG_ASYNC);
+	if (IS_ERR(mac_alg))
+		return PTR_ERR(mac_alg);
 
+	mac = __crypto_hash_alg_common(mac_alg);
 	err = -EINVAL;
-	if (cipher->cra_blocksize != 16)
-		goto out_put_cipher;
+	if (mac->digestsize != 16)
+		goto out_put_mac;
 
 	inst = kzalloc(sizeof(*inst) + sizeof(*ictx), GFP_KERNEL);
 	err = -ENOMEM;
 	if (!inst)
-		goto out_put_cipher;
+		goto out_put_mac;
 
 	ictx = aead_instance_ctx(inst);
-
-	err = crypto_init_spawn(&ictx->cipher, cipher,
-				aead_crypto_instance(inst),
-				CRYPTO_ALG_TYPE_MASK);
+	err = crypto_init_ahash_spawn(&ictx->mac, mac,
+				      aead_crypto_instance(inst));
 	if (err)
 		goto err_free_inst;
 
@@ -548,7 +499,7 @@ static int crypto_ccm_create_common(struct crypto_template *tmpl,
 				   crypto_requires_sync(algt->type,
 							algt->mask));
 	if (err)
-		goto err_drop_cipher;
+		goto err_drop_mac;
 
 	ctr = crypto_spawn_skcipher_alg(&ictx->ctr);
 
@@ -564,18 +515,17 @@ static int crypto_ccm_create_common(struct crypto_template *tmpl,
 	err = -ENAMETOOLONG;
 	if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME,
 		     "ccm_base(%s,%s)", ctr->base.cra_driver_name,
-		     cipher->cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
+		     mac->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME)
 		goto err_drop_ctr;
 
 	memcpy(inst->alg.base.cra_name, full_name, CRYPTO_MAX_ALG_NAME);
 
 	inst->alg.base.cra_flags = ctr->base.cra_flags & CRYPTO_ALG_ASYNC;
-	inst->alg.base.cra_priority = (cipher->cra_priority +
+	inst->alg.base.cra_priority = (mac->base.cra_priority +
 				       ctr->base.cra_priority) / 2;
 	inst->alg.base.cra_blocksize = 1;
-	inst->alg.base.cra_alignmask = cipher->cra_alignmask |
-				       ctr->base.cra_alignmask |
-				       (__alignof__(u32) - 1);
+	inst->alg.base.cra_alignmask = mac->base.cra_alignmask |
+				       ctr->base.cra_alignmask;
 	inst->alg.ivsize = 16;
 	inst->alg.chunksize = crypto_skcipher_alg_chunksize(ctr);
 	inst->alg.maxauthsize = 16;
@@ -593,23 +543,24 @@ static int crypto_ccm_create_common(struct crypto_template *tmpl,
 	if (err)
 		goto err_drop_ctr;
 
-out_put_cipher:
-	crypto_mod_put(cipher);
+out_put_mac:
+	crypto_mod_put(mac_alg);
 	return err;
 
 err_drop_ctr:
 	crypto_drop_skcipher(&ictx->ctr);
-err_drop_cipher:
-	crypto_drop_spawn(&ictx->cipher);
+err_drop_mac:
+	crypto_drop_ahash(&ictx->mac);
 err_free_inst:
 	kfree(inst);
-	goto out_put_cipher;
+	goto out_put_mac;
 }
 
 static int crypto_ccm_create(struct crypto_template *tmpl, struct rtattr **tb)
 {
 	const char *cipher_name;
 	char ctr_name[CRYPTO_MAX_ALG_NAME];
+	char mac_name[CRYPTO_MAX_ALG_NAME];
 	char full_name[CRYPTO_MAX_ALG_NAME];
 
 	cipher_name = crypto_attr_alg_name(tb[1]);
@@ -620,12 +571,16 @@ static int crypto_ccm_create(struct crypto_template *tmpl, struct rtattr **tb)
 		     cipher_name) >= CRYPTO_MAX_ALG_NAME)
 		return -ENAMETOOLONG;
 
+	if (snprintf(mac_name, CRYPTO_MAX_ALG_NAME, "cbcmac(%s)",
+		     cipher_name) >= CRYPTO_MAX_ALG_NAME)
+		return -ENAMETOOLONG;
+
 	if (snprintf(full_name, CRYPTO_MAX_ALG_NAME, "ccm(%s)", cipher_name) >=
 	    CRYPTO_MAX_ALG_NAME)
 		return -ENAMETOOLONG;
 
 	return crypto_ccm_create_common(tmpl, tb, full_name, ctr_name,
-					cipher_name);
+					mac_name);
 }
 
 static struct crypto_template crypto_ccm_tmpl = {
@@ -899,14 +854,164 @@ static struct crypto_template crypto_rfc4309_tmpl = {
 	.module = THIS_MODULE,
 };
 
+static int crypto_cbcmac_digest_setkey(struct crypto_shash *parent,
+				     const u8 *inkey, unsigned int keylen)
+{
+	struct cbcmac_tfm_ctx *ctx = crypto_shash_ctx(parent);
+
+	return crypto_cipher_setkey(ctx->child, inkey, keylen);
+}
+
+static int crypto_cbcmac_digest_init(struct shash_desc *pdesc)
+{
+	struct cbcmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
+	int bs = crypto_shash_digestsize(pdesc->tfm);
+	u8 *dg = (u8 *)ctx + crypto_shash_descsize(pdesc->tfm) - bs;
+
+	ctx->len = 0;
+	memset(dg, 0, bs);
+
+	return 0;
+}
+
+static int crypto_cbcmac_digest_update(struct shash_desc *pdesc, const u8 *p,
+				       unsigned int len)
+{
+	struct crypto_shash *parent = pdesc->tfm;
+	struct cbcmac_tfm_ctx *tctx = crypto_shash_ctx(parent);
+	struct cbcmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
+	struct crypto_cipher *tfm = tctx->child;
+	int bs = crypto_shash_digestsize(parent);
+	u8 *dg = (u8 *)ctx + crypto_shash_descsize(parent) - bs;
+
+	while (len > 0) {
+		unsigned int l = min(len, bs - ctx->len);
+
+		crypto_xor(dg + ctx->len, p, l);
+		ctx->len += l;
+		len -= l;
+		p += l;
+
+		if (ctx->len == bs) {
+			crypto_cipher_encrypt_one(tfm, dg, dg);
+			ctx->len = 0;
+		}
+	}
+
+	return 0;
+}
+
+static int crypto_cbcmac_digest_final(struct shash_desc *pdesc, u8 *out)
+{
+	struct crypto_shash *parent = pdesc->tfm;
+	struct cbcmac_tfm_ctx *tctx = crypto_shash_ctx(parent);
+	struct cbcmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
+	struct crypto_cipher *tfm = tctx->child;
+	int bs = crypto_shash_digestsize(parent);
+	u8 *dg = (u8 *)ctx + crypto_shash_descsize(parent) - bs;
+
+	if (ctx->len)
+		crypto_cipher_encrypt_one(tfm, dg, dg);
+
+	memcpy(out, dg, bs);
+	return 0;
+}
+
+static int cbcmac_init_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_cipher *cipher;
+	struct crypto_instance *inst = (void *)tfm->__crt_alg;
+	struct crypto_spawn *spawn = crypto_instance_ctx(inst);
+	struct cbcmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	cipher = crypto_spawn_cipher(spawn);
+	if (IS_ERR(cipher))
+		return PTR_ERR(cipher);
+
+	ctx->child = cipher;
+
+	return 0;
+}
+
+static void cbcmac_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct cbcmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
+	crypto_free_cipher(ctx->child);
+}
+
+static int cbcmac_create(struct crypto_template *tmpl, struct rtattr **tb)
+{
+	struct shash_instance *inst;
+	struct crypto_alg *alg;
+	int err;
+
+	err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SHASH);
+	if (err)
+		return err;
+
+	alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_CIPHER,
+				  CRYPTO_ALG_TYPE_MASK);
+	if (IS_ERR(alg))
+		return PTR_ERR(alg);
+
+	inst = shash_alloc_instance("cbcmac", alg);
+	err = PTR_ERR(inst);
+	if (IS_ERR(inst))
+		goto out_put_alg;
+
+	err = crypto_init_spawn(shash_instance_ctx(inst), alg,
+				shash_crypto_instance(inst),
+				CRYPTO_ALG_TYPE_MASK);
+	if (err)
+		goto out_free_inst;
+
+	inst->alg.base.cra_priority = alg->cra_priority;
+	inst->alg.base.cra_blocksize = 1;
+
+	inst->alg.digestsize = alg->cra_blocksize;
+	inst->alg.descsize = ALIGN(sizeof(struct cbcmac_desc_ctx),
+				   alg->cra_alignmask + 1) +
+			     alg->cra_blocksize;
+
+	inst->alg.base.cra_ctxsize = sizeof(struct cbcmac_tfm_ctx);
+	inst->alg.base.cra_init = cbcmac_init_tfm;
+	inst->alg.base.cra_exit = cbcmac_exit_tfm;
+
+	inst->alg.init = crypto_cbcmac_digest_init;
+	inst->alg.update = crypto_cbcmac_digest_update;
+	inst->alg.final = crypto_cbcmac_digest_final;
+	inst->alg.setkey = crypto_cbcmac_digest_setkey;
+
+	err = shash_register_instance(tmpl, inst);
+
+out_free_inst:
+	if (err)
+		shash_free_instance(shash_crypto_instance(inst));
+
+out_put_alg:
+	crypto_mod_put(alg);
+	return err;
+}
+
+static struct crypto_template crypto_cbcmac_tmpl = {
+	.name = "cbcmac",
+	.create = cbcmac_create,
+	.free = shash_free_instance,
+	.module = THIS_MODULE,
+};
+
 static int __init crypto_ccm_module_init(void)
 {
 	int err;
 
-	err = crypto_register_template(&crypto_ccm_base_tmpl);
+	err = crypto_register_template(&crypto_cbcmac_tmpl);
 	if (err)
 		goto out;
 
+	err = crypto_register_template(&crypto_ccm_base_tmpl);
+	if (err)
+		goto out_undo_cbcmac;
+
 	err = crypto_register_template(&crypto_ccm_tmpl);
 	if (err)
 		goto out_undo_base;
@@ -922,6 +1027,8 @@ out_undo_ccm:
 	crypto_unregister_template(&crypto_ccm_tmpl);
 out_undo_base:
 	crypto_unregister_template(&crypto_ccm_base_tmpl);
+out_undo_cbcmac:
+	crypto_unregister_template(&crypto_cbcmac_tmpl);
 	goto out;
 }
 
@@ -930,6 +1037,7 @@ static void __exit crypto_ccm_module_exit(void)
 	crypto_unregister_template(&crypto_rfc4309_tmpl);
 	crypto_unregister_template(&crypto_ccm_tmpl);
 	crypto_unregister_template(&crypto_ccm_base_tmpl);
+	crypto_unregister_template(&crypto_cbcmac_tmpl);
 }
 
 module_init(crypto_ccm_module_init);

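The MAC side of CCM is now a standalone cbcmac shash: input bytes are XORed into a running 16-byte state that is encrypted through the block cipher whenever it fills, and a partially filled final block is implicitly zero-padded. A freestanding sketch of that state machine, with block_encrypt() standing in for crypto_cipher_encrypt_one() (names are illustrative):

	#include <stdint.h>
	#include <string.h>

	#define BS 16			/* AES block size */

	struct cbcmac {
		uint8_t dg[BS];		/* running CBC-MAC state */
		unsigned int len;	/* bytes buffered in the current block */
	};

	/* Stand-in for the underlying block cipher's encrypt-one-block op. */
	void block_encrypt(uint8_t out[BS], const uint8_t in[BS]);

	static void cbcmac_update(struct cbcmac *c, const uint8_t *p, size_t n)
	{
		while (n) {
			size_t l = BS - c->len;

			if (l > n)
				l = n;
			for (size_t i = 0; i < l; i++)	/* XOR input into state */
				c->dg[c->len + i] ^= p[i];
			c->len += l;
			p += l;
			n -= l;
			if (c->len == BS) {		/* block full: encrypt */
				block_encrypt(c->dg, c->dg);
				c->len = 0;
			}
		}
	}

	static void cbcmac_final(struct cbcmac *c, uint8_t out[BS])
	{
		/* Leftover bytes: the untouched rest of dg is the zero pad. */
		if (c->len)
			block_encrypt(c->dg, c->dg);
		memcpy(out, c->dg, BS);
	}
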
+ 30 - 43
crypto/chacha20_generic.c

@@ -10,10 +10,9 @@
  */
 
 #include <crypto/algapi.h>
-#include <linux/crypto.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
 #include <crypto/chacha20.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/module.h>
 
 static inline u32 le32_to_cpuvp(const void *p)
 {
@@ -63,10 +62,10 @@ void crypto_chacha20_init(u32 *state, struct chacha20_ctx *ctx, u8 *iv)
 }
 EXPORT_SYMBOL_GPL(crypto_chacha20_init);
 
-int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key,
+int crypto_chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key,
 			   unsigned int keysize)
 {
-	struct chacha20_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
 	int i;
 
 	if (keysize != CHACHA20_KEY_SIZE)
@@ -79,66 +78,54 @@ int crypto_chacha20_setkey(struct crypto_tfm *tfm, const u8 *key,
 }
 EXPORT_SYMBOL_GPL(crypto_chacha20_setkey);
 
-int crypto_chacha20_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-			  struct scatterlist *src, unsigned int nbytes)
+int crypto_chacha20_crypt(struct skcipher_request *req)
 {
-	struct blkcipher_walk walk;
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
 	u32 state[16];
 	int err;
 
-	blkcipher_walk_init(&walk, dst, src, nbytes);
-	err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE);
-
-	crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv);
+	err = skcipher_walk_virt(&walk, req, true);
 
-	while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
-		chacha20_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
-				 rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
-		err = blkcipher_walk_done(desc, &walk,
-					  walk.nbytes % CHACHA20_BLOCK_SIZE);
-	}
+	crypto_chacha20_init(state, ctx, walk.iv);
 
-	if (walk.nbytes) {
+	while (walk.nbytes > 0) {
 		chacha20_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr,
 				 walk.nbytes);
-		err = blkcipher_walk_done(desc, &walk, 0);
+		err = skcipher_walk_done(&walk, 0);
 	}
 
 	return err;
 }
 EXPORT_SYMBOL_GPL(crypto_chacha20_crypt);
 
-static struct crypto_alg alg = {
-	.cra_name		= "chacha20",
-	.cra_driver_name	= "chacha20-generic",
-	.cra_priority		= 100,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
-	.cra_blocksize		= 1,
-	.cra_type		= &crypto_blkcipher_type,
-	.cra_ctxsize		= sizeof(struct chacha20_ctx),
-	.cra_alignmask		= sizeof(u32) - 1,
-	.cra_module		= THIS_MODULE,
-	.cra_u			= {
-		.blkcipher = {
-			.min_keysize	= CHACHA20_KEY_SIZE,
-			.max_keysize	= CHACHA20_KEY_SIZE,
-			.ivsize		= CHACHA20_IV_SIZE,
-			.geniv		= "seqiv",
-			.setkey		= crypto_chacha20_setkey,
-			.encrypt	= crypto_chacha20_crypt,
-			.decrypt	= crypto_chacha20_crypt,
-		},
-	},
+static struct skcipher_alg alg = {
+	.base.cra_name		= "chacha20",
+	.base.cra_driver_name	= "chacha20-generic",
+	.base.cra_priority	= 100,
+	.base.cra_blocksize	= 1,
+	.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
+	.base.cra_alignmask	= sizeof(u32) - 1,
+	.base.cra_module	= THIS_MODULE,
+
+	.min_keysize		= CHACHA20_KEY_SIZE,
+	.max_keysize		= CHACHA20_KEY_SIZE,
+	.ivsize			= CHACHA20_IV_SIZE,
+	.chunksize		= CHACHA20_BLOCK_SIZE,
+	.setkey			= crypto_chacha20_setkey,
+	.encrypt		= crypto_chacha20_crypt,
+	.decrypt		= crypto_chacha20_crypt,
 };
 
 static int __init chacha20_generic_mod_init(void)
 {
-	return crypto_register_alg(&alg);
+	return crypto_register_skcipher(&alg);
 }
 
 static void __exit chacha20_generic_mod_fini(void)
 {
-	crypto_unregister_alg(&alg);
+	crypto_unregister_skcipher(&alg);
 }
 
 module_init(chacha20_generic_mod_init);

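With chacha20 registered through the skcipher interface, in-kernel users drive it via the generic request API instead of the legacy blkcipher one. A hedged sketch of a one-shot in-place encryption (error handling and completion waiting for async implementations are omitted; key, iv, buf and buflen are assumed to exist in the caller):

	#include <crypto/skcipher.h>
	#include <crypto/chacha20.h>

	struct crypto_skcipher *tfm;
	struct skcipher_request *req;
	struct scatterlist sg;

	tfm = crypto_alloc_skcipher("chacha20", 0, 0);
	crypto_skcipher_setkey(tfm, key, CHACHA20_KEY_SIZE);

	req = skcipher_request_alloc(tfm, GFP_KERNEL);
	sg_init_one(&sg, buf, buflen);
	skcipher_request_set_callback(req, 0, NULL, NULL);
	skcipher_request_set_crypt(req, &sg, &sg, buflen, iv);	/* src == dst */
	crypto_skcipher_encrypt(req);	/* may return -EINPROGRESS if async */

	skcipher_request_free(req);
	crypto_free_skcipher(tfm);
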
+ 1 - 2
crypto/cmac.c

@@ -260,8 +260,7 @@ static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb)
 	if (err)
 		goto out_free_inst;
 
-	/* We access the data as u32s when xoring. */
-	alignmask = alg->cra_alignmask | (__alignof__(u32) - 1);
+	alignmask = alg->cra_alignmask;
 	inst->alg.base.cra_alignmask = alignmask;
 	inst->alg.base.cra_priority = alg->cra_priority;
 	inst->alg.base.cra_blocksize = alg->cra_blocksize;

+ 1 - 1
crypto/ctr.c

@@ -209,7 +209,7 @@ static struct crypto_instance *crypto_ctr_alloc(struct rtattr **tb)
 	inst->alg.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER;
 	inst->alg.cra_priority = alg->cra_priority;
 	inst->alg.cra_blocksize = 1;
-	inst->alg.cra_alignmask = alg->cra_alignmask | (__alignof__(u32) - 1);
+	inst->alg.cra_alignmask = alg->cra_alignmask;
 	inst->alg.cra_type = &crypto_blkcipher_type;
 
 	inst->alg.cra_blkcipher.ivsize = alg->cra_blocksize;

+ 3 - 5
crypto/cts.c

@@ -49,6 +49,7 @@
 #include <linux/scatterlist.h>
 #include <crypto/scatterwalk.h>
 #include <linux/slab.h>
+#include <linux/compiler.h>
 
 struct crypto_cts_ctx {
 	struct crypto_skcipher *child;
@@ -103,7 +104,7 @@ static int cts_cbc_encrypt(struct skcipher_request *req)
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct skcipher_request *subreq = &rctx->subreq;
 	int bsize = crypto_skcipher_blocksize(tfm);
-	u8 d[bsize * 2] __attribute__ ((aligned(__alignof__(u32))));
+	u8 d[bsize * 2] __aligned(__alignof__(u32));
 	struct scatterlist *sg;
 	unsigned int offset;
 	int lastn;
@@ -183,7 +184,7 @@ static int cts_cbc_decrypt(struct skcipher_request *req)
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct skcipher_request *subreq = &rctx->subreq;
 	int bsize = crypto_skcipher_blocksize(tfm);
-	u8 d[bsize * 2] __attribute__ ((aligned(__alignof__(u32))));
+	u8 d[bsize * 2] __aligned(__alignof__(u32));
 	struct scatterlist *sg;
 	unsigned int offset;
 	u8 *space;
@@ -373,9 +374,6 @@ static int crypto_cts_create(struct crypto_template *tmpl, struct rtattr **tb)
 	inst->alg.base.cra_blocksize = alg->base.cra_blocksize;
 	inst->alg.base.cra_alignmask = alg->base.cra_alignmask;
 
-	/* We access the data as u32s when xoring. */
-	inst->alg.base.cra_alignmask |= __alignof__(u32) - 1;
-
 	inst->alg.ivsize = alg->base.cra_blocksize;
 	inst->alg.chunksize = crypto_skcipher_alg_chunksize(alg);
 	inst->alg.min_keysize = crypto_skcipher_alg_min_keysize(alg);

+ 2 - 1
crypto/kpp.c

@@ -19,6 +19,7 @@
 #include <linux/crypto.h>
 #include <crypto/algapi.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 #include <crypto/kpp.h>
 #include <crypto/internal/kpp.h>
@@ -47,7 +48,7 @@ static int crypto_kpp_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_kpp_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 
 static void crypto_kpp_show(struct seq_file *m, struct crypto_alg *alg)
 {

+ 2 - 4
crypto/pcbc.c

@@ -20,6 +20,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/compiler.h>
 
 struct crypto_pcbc_ctx {
 	struct crypto_cipher *child;
@@ -146,7 +147,7 @@ static int crypto_pcbc_decrypt_inplace(struct skcipher_request *req,
 	unsigned int nbytes = walk->nbytes;
 	u8 *src = walk->src.virt.addr;
 	u8 *iv = walk->iv;
-	u8 tmpbuf[bsize] __attribute__ ((aligned(__alignof__(u32))));
+	u8 tmpbuf[bsize] __aligned(__alignof__(u32));
 
 	do {
 		memcpy(tmpbuf, src, bsize);
@@ -259,9 +260,6 @@ static int crypto_pcbc_create(struct crypto_template *tmpl, struct rtattr **tb)
 	inst->alg.base.cra_blocksize = alg->cra_blocksize;
 	inst->alg.base.cra_alignmask = alg->cra_alignmask;
 
-	/* We access the data as u32s when xoring. */
-	inst->alg.base.cra_alignmask |= __alignof__(u32) - 1;
-
 	inst->alg.ivsize = alg->cra_blocksize;
 	inst->alg.min_keysize = alg->cra_cipher.cia_min_keysize;
 	inst->alg.max_keysize = alg->cra_cipher.cia_max_keysize;

+ 2 - 1
crypto/rng.c

@@ -23,6 +23,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <net/netlink.h>
 
 #include "internal.h"
@@ -95,7 +96,7 @@ static int crypto_rng_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	seq_printf(m, "type         : rng\n");

+ 2 - 1
crypto/scompress.c

@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/crypto.h>
+#include <linux/compiler.h>
 #include <linux/vmalloc.h>
 #include <crypto/algapi.h>
 #include <linux/cryptouser.h>
@@ -57,7 +58,7 @@ static int crypto_scomp_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_scomp_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 
 static void crypto_scomp_show(struct seq_file *m, struct crypto_alg *alg)
 {

+ 0 - 2
crypto/seqiv.c

@@ -153,8 +153,6 @@ static int seqiv_aead_create(struct crypto_template *tmpl, struct rtattr **tb)
 	if (IS_ERR(inst))
 		return PTR_ERR(inst);
 
-	inst->alg.base.cra_alignmask |= __alignof__(u32) - 1;
-
 	spawn = aead_instance_ctx(inst);
 	alg = crypto_spawn_aead_alg(spawn);
 

+ 5 - 4
crypto/shash.c

@@ -19,6 +19,7 @@
 #include <linux/seq_file.h>
 #include <linux/cryptouser.h>
 #include <net/netlink.h>
+#include <linux/compiler.h>
 
 #include "internal.h"
 
@@ -67,7 +68,7 @@ EXPORT_SYMBOL_GPL(crypto_shash_setkey);
 static inline unsigned int shash_align_buffer_size(unsigned len,
 						   unsigned long mask)
 {
-	typedef u8 __attribute__ ((aligned)) u8_aligned;
+	typedef u8 __aligned_largest u8_aligned;
 	return len + (mask & ~(__alignof__(u8_aligned) - 1));
 }
 
@@ -80,7 +81,7 @@ static int shash_update_unaligned(struct shash_desc *desc, const u8 *data,
 	unsigned int unaligned_len = alignmask + 1 -
 				     ((unsigned long)data & alignmask);
 	u8 ubuf[shash_align_buffer_size(unaligned_len, alignmask)]
-		__attribute__ ((aligned));
+		__aligned_largest;
 	u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1);
 	int err;
 
@@ -116,7 +117,7 @@ static int shash_final_unaligned(struct shash_desc *desc, u8 *out)
 	struct shash_alg *shash = crypto_shash_alg(tfm);
 	unsigned int ds = crypto_shash_digestsize(tfm);
 	u8 ubuf[shash_align_buffer_size(ds, alignmask)]
-		__attribute__ ((aligned));
+		__aligned_largest;
 	u8 *buf = PTR_ALIGN(&ubuf[0], alignmask + 1);
 	int err;
 
@@ -403,7 +404,7 @@ static int crypto_shash_report(struct sk_buff *skb, struct crypto_alg *alg)
 #endif
 
 static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	struct shash_alg *salg = __crypto_shash_alg(alg);

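__aligned_largest abbreviates the argument-less aligned attribute, which GCC interprets as "align to the largest alignment ever used for any data type on the target" (e.g. 16 bytes on x86-64). A short illustration of what the hunks above now expand to; the exact header hosting the #define is an assumption here:

	/* assumed to live in the compiler header alongside __aligned(x): */
	#define __aligned_largest __attribute__((aligned))

	typedef u8 __aligned_largest u8_aligned;	/* max target alignment */
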
+ 14 - 9
crypto/skcipher.c

@@ -19,6 +19,7 @@
 #include <crypto/scatterwalk.h>
 #include <linux/bug.h>
 #include <linux/cryptouser.h>
+#include <linux/compiler.h>
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/rtnetlink.h>
@@ -185,12 +186,12 @@ void skcipher_walk_complete(struct skcipher_walk *walk, int err)
 		data = p->data;
 		if (!data) {
 			data = PTR_ALIGN(&p->buffer[0], walk->alignmask + 1);
-			data = skcipher_get_spot(data, walk->chunksize);
+			data = skcipher_get_spot(data, walk->stride);
 		}
 
 		scatterwalk_copychunks(data, &p->dst, p->len, 1);
 
-		if (offset_in_page(p->data) + p->len + walk->chunksize >
+		if (offset_in_page(p->data) + p->len + walk->stride >
 		    PAGE_SIZE)
 			free_page((unsigned long)p->data);
 
@@ -299,7 +300,7 @@ static int skcipher_next_copy(struct skcipher_walk *walk)
 	p->len = walk->nbytes;
 	skcipher_queue_write(walk, p);
 
-	if (offset_in_page(walk->page) + walk->nbytes + walk->chunksize >
+	if (offset_in_page(walk->page) + walk->nbytes + walk->stride >
 	    PAGE_SIZE)
 		walk->page = NULL;
 	else
@@ -344,7 +345,7 @@ static int skcipher_walk_next(struct skcipher_walk *walk)
 			 SKCIPHER_WALK_DIFF);
 
 	n = walk->total;
-	bsize = min(walk->chunksize, max(n, walk->blocksize));
+	bsize = min(walk->stride, max(n, walk->blocksize));
 	n = scatterwalk_clamp(&walk->in, n);
 	n = scatterwalk_clamp(&walk->out, n);
 
@@ -393,7 +394,7 @@ static int skcipher_copy_iv(struct skcipher_walk *walk)
 	unsigned a = crypto_tfm_ctx_alignment() - 1;
 	unsigned alignmask = walk->alignmask;
 	unsigned ivsize = walk->ivsize;
-	unsigned bs = walk->chunksize;
+	unsigned bs = walk->stride;
 	unsigned aligned_bs;
 	unsigned size;
 	u8 *iv;
@@ -463,7 +464,7 @@ static int skcipher_walk_skcipher(struct skcipher_walk *walk,
 		       SKCIPHER_WALK_SLEEP : 0;
 
 	walk->blocksize = crypto_skcipher_blocksize(tfm);
-	walk->chunksize = crypto_skcipher_chunksize(tfm);
+	walk->stride = crypto_skcipher_walksize(tfm);
 	walk->ivsize = crypto_skcipher_ivsize(tfm);
 	walk->alignmask = crypto_skcipher_alignmask(tfm);
 
@@ -525,7 +526,7 @@ static int skcipher_walk_aead_common(struct skcipher_walk *walk,
 		walk->flags &= ~SKCIPHER_WALK_SLEEP;
 
 	walk->blocksize = crypto_aead_blocksize(tfm);
-	walk->chunksize = crypto_aead_chunksize(tfm);
+	walk->stride = crypto_aead_chunksize(tfm);
 	walk->ivsize = crypto_aead_ivsize(tfm);
 	walk->alignmask = crypto_aead_alignmask(tfm);
 
@@ -807,7 +808,7 @@ static void crypto_skcipher_free_instance(struct crypto_instance *inst)
 }
 
 static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg)
-	__attribute__ ((unused));
+	__maybe_unused;
 static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg)
 {
 	struct skcipher_alg *skcipher = container_of(alg, struct skcipher_alg,
@@ -821,6 +822,7 @@ static void crypto_skcipher_show(struct seq_file *m, struct crypto_alg *alg)
 	seq_printf(m, "max keysize  : %u\n", skcipher->max_keysize);
 	seq_printf(m, "ivsize       : %u\n", skcipher->ivsize);
 	seq_printf(m, "chunksize    : %u\n", skcipher->chunksize);
+	seq_printf(m, "walksize     : %u\n", skcipher->walksize);
 }
 
 #ifdef CONFIG_NET
@@ -893,11 +895,14 @@ static int skcipher_prepare_alg(struct skcipher_alg *alg)
 {
 	struct crypto_alg *base = &alg->base;
 
-	if (alg->ivsize > PAGE_SIZE / 8 || alg->chunksize > PAGE_SIZE / 8)
+	if (alg->ivsize > PAGE_SIZE / 8 || alg->chunksize > PAGE_SIZE / 8 ||
+	    alg->walksize > PAGE_SIZE / 8)
 		return -EINVAL;
 
 	if (!alg->chunksize)
 		alg->chunksize = base->cra_blocksize;
+	if (!alg->walksize)
+		alg->walksize = alg->chunksize;
 
 	base->cra_type = &crypto_skcipher_type2;
 	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;

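walksize gives drivers a knob that chunksize could not express: the walk layer now hands back buffers of up to walksize bytes, letting an implementation batch several blocks per call (e.g. interleaving ChaCha20 blocks across NEON registers), while chunksize keeps describing the algorithm's logical granularity. A hedged sketch of a driver declaring it; the factor of four is illustrative, not taken from any particular driver:

	static struct skcipher_alg neon_alg = {
		.base.cra_name	= "chacha20",
		/* ... */
		.min_keysize	= CHACHA20_KEY_SIZE,
		.max_keysize	= CHACHA20_KEY_SIZE,
		.ivsize		= CHACHA20_IV_SIZE,
		.chunksize	= CHACHA20_BLOCK_SIZE,
		/* Ask skcipher_walk for up to four blocks at a time; the
		 * tail is still delivered in smaller pieces. */
		.walksize	= 4 * CHACHA20_BLOCK_SIZE,
	};
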
+ 6 - 0
crypto/tcrypt.c

@@ -22,6 +22,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <crypto/aead.h>
 #include <crypto/hash.h>
 #include <crypto/skcipher.h>
@@ -1010,6 +1012,8 @@ static inline int tcrypt_test(const char *alg)
 {
 	int ret;
 
+	pr_debug("testing %s\n", alg);
+
 	ret = alg_test(alg, alg, 0, 0);
 	/* non-fips algs return -EINVAL in fips mode */
 	if (fips_enabled && ret == -EINVAL)
@@ -2059,6 +2063,8 @@ static int __init tcrypt_mod_init(void)
 	if (err) {
 		printk(KERN_ERR "tcrypt: one or more tests failed!\n");
 		goto err_free_tv;
+	} else {
+		pr_debug("all tests passed\n");
 	}
 
 	/* We intentionally return -EAGAIN to prevent keeping the module,

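Defining pr_fmt before any include makes every pr_* call in the file prefix its format string with the module name, so the new debug lines log as "tcrypt: testing <alg>" rather than bare text. A minimal illustration (pr_debug output additionally requires DEBUG or dynamic debug to be enabled):

	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
	#include <linux/printk.h>

	pr_debug("testing %s\n", alg);	/* emits: "tcrypt: testing <alg>" */
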
+ 216 - 839
crypto/testmgr.c

@@ -265,6 +265,7 @@ static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 		       const int align_offset)
 {
 	const char *algo = crypto_tfm_alg_driver_name(crypto_ahash_tfm(tfm));
+	size_t digest_size = crypto_ahash_digestsize(tfm);
 	unsigned int i, j, k, temp;
 	struct scatterlist sg[8];
 	char *result;
@@ -275,7 +276,7 @@ static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 	char *xbuf[XBUFSIZE];
 	int ret = -ENOMEM;
 
-	result = kmalloc(MAX_DIGEST_SIZE, GFP_KERNEL);
+	result = kmalloc(digest_size, GFP_KERNEL);
 	if (!result)
 		return ret;
 	key = kmalloc(MAX_KEYLEN, GFP_KERNEL);
@@ -305,7 +306,7 @@ static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 			goto out;
 
 		j++;
-		memset(result, 0, MAX_DIGEST_SIZE);
+		memset(result, 0, digest_size);
 
 		hash_buff = xbuf[0];
 		hash_buff += align_offset;
@@ -380,7 +381,7 @@ static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 			continue;
 
 		j++;
-		memset(result, 0, MAX_DIGEST_SIZE);
+		memset(result, 0, digest_size);
 
 		temp = 0;
 		sg_init_table(sg, template[i].np);
@@ -458,7 +459,7 @@ static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template,
 			continue;
 
 		j++;
-		memset(result, 0, MAX_DIGEST_SIZE);
+		memset(result, 0, digest_size);
 
 		ret = -EINVAL;
 		hash_buff = xbuf[0];
@@ -1463,13 +1464,12 @@ static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate,
 		int ilen = ctemplate[i].inlen;
 		void *input_vec;
 
-		input_vec = kmalloc(ilen, GFP_KERNEL);
+		input_vec = kmemdup(ctemplate[i].input, ilen, GFP_KERNEL);
 		if (!input_vec) {
 			ret = -ENOMEM;
 			goto out;
 		}
 
-		memcpy(input_vec, ctemplate[i].input, ilen);
 		memset(output, 0, dlen);
 		init_completion(&result.completion);
 		sg_init_one(&src, input_vec, ilen);
@@ -1525,13 +1525,12 @@ static int test_acomp(struct crypto_acomp *tfm, struct comp_testvec *ctemplate,
 		int ilen = dtemplate[i].inlen;
 		void *input_vec;
 
-		input_vec = kmalloc(ilen, GFP_KERNEL);
+		input_vec = kmemdup(dtemplate[i].input, ilen, GFP_KERNEL);
 		if (!input_vec) {
 			ret = -ENOMEM;
 			goto out;
 		}
 
-		memcpy(input_vec, dtemplate[i].input, ilen);
 		memset(output, 0, dlen);
 		init_completion(&result.completion);
 		sg_init_one(&src, input_vec, ilen);
@@ -2251,30 +2250,23 @@ static int alg_test_null(const struct alg_test_desc *desc,
 	return 0;
 }
 
+#define __VECS(tv)	{ .vecs = tv, .count = ARRAY_SIZE(tv) }
+
 /* Please keep this list sorted by algorithm name. */
 static const struct alg_test_desc alg_test_descs[] = {
 	{
 		.alg = "ansi_cprng",
 		.test = alg_test_cprng,
 		.suite = {
-			.cprng = {
-				.vecs = ansi_cprng_aes_tv_template,
-				.count = ANSI_CPRNG_AES_TEST_VECTORS
-			}
+			.cprng = __VECS(ansi_cprng_aes_tv_template)
 		}
 	}, {
 		.alg = "authenc(hmac(md5),ecb(cipher_null))",
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs = hmac_md5_ecb_cipher_null_enc_tv_template,
-					.count = HMAC_MD5_ECB_CIPHER_NULL_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = hmac_md5_ecb_cipher_null_dec_tv_template,
-					.count = HMAC_MD5_ECB_CIPHER_NULL_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(hmac_md5_ecb_cipher_null_enc_tv_template),
+				.dec = __VECS(hmac_md5_ecb_cipher_null_dec_tv_template)
 			}
 		}
 	}, {
@@ -2282,12 +2274,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha1_aes_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA1_AES_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha1_aes_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2295,12 +2282,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha1_des_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA1_DES_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha1_des_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2309,12 +2291,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha1_des3_ede_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA1_DES3_EDE_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha1_des3_ede_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2326,18 +2303,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha1_ecb_cipher_null_enc_tv_temp,
-					.count =
-					HMAC_SHA1_ECB_CIPHER_NULL_ENC_TEST_VEC
-				},
-				.dec = {
-					.vecs =
-					hmac_sha1_ecb_cipher_null_dec_tv_temp,
-					.count =
-					HMAC_SHA1_ECB_CIPHER_NULL_DEC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha1_ecb_cipher_null_enc_tv_temp),
+				.dec = __VECS(hmac_sha1_ecb_cipher_null_dec_tv_temp)
 			}
 		}
 	}, {
@@ -2349,12 +2316,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha224_des_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA224_DES_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha224_des_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2363,12 +2325,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha224_des3_ede_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA224_DES3_EDE_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha224_des3_ede_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2377,12 +2334,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha256_aes_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA256_AES_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha256_aes_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2390,12 +2342,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha256_des_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA256_DES_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha256_des_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2404,12 +2351,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha256_des3_ede_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA256_DES3_EDE_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha256_des3_ede_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2425,12 +2367,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha384_des_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA384_DES_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha384_des_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2439,12 +2376,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha384_des3_ede_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA384_DES3_EDE_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha384_des3_ede_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2461,12 +2393,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha512_aes_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA512_AES_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha512_aes_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2474,12 +2401,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha512_des_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA512_DES_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha512_des_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2488,12 +2410,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs =
-					hmac_sha512_des3_ede_cbc_enc_tv_temp,
-					.count =
-					HMAC_SHA512_DES3_EDE_CBC_ENC_TEST_VEC
-				}
+				.enc = __VECS(hmac_sha512_des3_ede_cbc_enc_tv_temp)
 			}
 		}
 	}, {
@@ -2510,14 +2427,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = aes_cbc_enc_tv_template,
-					.count = AES_CBC_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_cbc_dec_tv_template,
-					.count = AES_CBC_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_cbc_enc_tv_template),
+				.dec = __VECS(aes_cbc_dec_tv_template)
 			}
 		}
 	}, {
@@ -2525,14 +2436,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = anubis_cbc_enc_tv_template,
-					.count = ANUBIS_CBC_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = anubis_cbc_dec_tv_template,
-					.count = ANUBIS_CBC_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(anubis_cbc_enc_tv_template),
+				.dec = __VECS(anubis_cbc_dec_tv_template)
 			}
 		}
 	}, {
@@ -2540,14 +2445,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = bf_cbc_enc_tv_template,
-					.count = BF_CBC_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = bf_cbc_dec_tv_template,
-					.count = BF_CBC_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(bf_cbc_enc_tv_template),
+				.dec = __VECS(bf_cbc_dec_tv_template)
 			}
 		}
 	}, {
@@ -2555,14 +2454,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = camellia_cbc_enc_tv_template,
-					.count = CAMELLIA_CBC_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = camellia_cbc_dec_tv_template,
-					.count = CAMELLIA_CBC_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(camellia_cbc_enc_tv_template),
+				.dec = __VECS(camellia_cbc_dec_tv_template)
 			}
 		}
 	}, {
@@ -2570,14 +2463,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = cast5_cbc_enc_tv_template,
-					.count = CAST5_CBC_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = cast5_cbc_dec_tv_template,
-					.count = CAST5_CBC_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(cast5_cbc_enc_tv_template),
+				.dec = __VECS(cast5_cbc_dec_tv_template)
 			}
 		}
 	}, {
@@ -2585,14 +2472,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = cast6_cbc_enc_tv_template,
-					.count = CAST6_CBC_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = cast6_cbc_dec_tv_template,
-					.count = CAST6_CBC_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(cast6_cbc_enc_tv_template),
+				.dec = __VECS(cast6_cbc_dec_tv_template)
 			}
 		}
 	}, {
@@ -2600,14 +2481,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = des_cbc_enc_tv_template,
-					.count = DES_CBC_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = des_cbc_dec_tv_template,
-					.count = DES_CBC_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(des_cbc_enc_tv_template),
+				.dec = __VECS(des_cbc_dec_tv_template)
 			}
 		}
 	}, {
@@ -2616,14 +2491,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = des3_ede_cbc_enc_tv_template,
-					.count = DES3_EDE_CBC_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = des3_ede_cbc_dec_tv_template,
-					.count = DES3_EDE_CBC_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(des3_ede_cbc_enc_tv_template),
+				.dec = __VECS(des3_ede_cbc_dec_tv_template)
 			}
 		}
 	}, {
@@ -2631,14 +2500,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = serpent_cbc_enc_tv_template,
-					.count = SERPENT_CBC_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = serpent_cbc_dec_tv_template,
-					.count = SERPENT_CBC_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(serpent_cbc_enc_tv_template),
+				.dec = __VECS(serpent_cbc_dec_tv_template)
 			}
 		}
 	}, {
@@ -2646,30 +2509,25 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = tf_cbc_enc_tv_template,
-					.count = TF_CBC_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = tf_cbc_dec_tv_template,
-					.count = TF_CBC_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(tf_cbc_enc_tv_template),
+				.dec = __VECS(tf_cbc_dec_tv_template)
 			}
 		}
+	}, {
+		.alg = "cbcmac(aes)",
+		.fips_allowed = 1,
+		.test = alg_test_hash,
+		.suite = {
+			.hash = __VECS(aes_cbcmac_tv_template)
+		}
 	}, {
 		.alg = "ccm(aes)",
 		.test = alg_test_aead,
 		.fips_allowed = 1,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs = aes_ccm_enc_tv_template,
-					.count = AES_CCM_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_ccm_dec_tv_template,
-					.count = AES_CCM_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_ccm_enc_tv_template),
+				.dec = __VECS(aes_ccm_dec_tv_template)
 			}
 		}
 	}, {
@@ -2677,14 +2535,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = chacha20_enc_tv_template,
-					.count = CHACHA20_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = chacha20_enc_tv_template,
-					.count = CHACHA20_ENC_TEST_VECTORS
-				},
+				.enc = __VECS(chacha20_enc_tv_template),
+				.dec = __VECS(chacha20_enc_tv_template),
 			}
 		}
 	}, {
@@ -2692,20 +2544,14 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = aes_cmac128_tv_template,
-				.count = CMAC_AES_TEST_VECTORS
-			}
+			.hash = __VECS(aes_cmac128_tv_template)
 		}
 	}, {
 		.alg = "cmac(des3_ede)",
 		.fips_allowed = 1,
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = des3_ede_cmac64_tv_template,
-				.count = CMAC_DES3_EDE_TEST_VECTORS
-			}
+			.hash = __VECS(des3_ede_cmac64_tv_template)
 		}
 	}, {
 		.alg = "compress_null",
@@ -2714,30 +2560,21 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.alg = "crc32",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = crc32_tv_template,
-				.count = CRC32_TEST_VECTORS
-			}
+			.hash = __VECS(crc32_tv_template)
 		}
 	}, {
 		.alg = "crc32c",
 		.test = alg_test_crc32c,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = crc32c_tv_template,
-				.count = CRC32C_TEST_VECTORS
-			}
+			.hash = __VECS(crc32c_tv_template)
 		}
 	}, {
 		.alg = "crct10dif",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = crct10dif_tv_template,
-				.count = CRCT10DIF_TEST_VECTORS
-			}
+			.hash = __VECS(crct10dif_tv_template)
 		}
 	}, {
 		.alg = "ctr(aes)",
@@ -2745,14 +2582,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = aes_ctr_enc_tv_template,
-					.count = AES_CTR_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_ctr_dec_tv_template,
-					.count = AES_CTR_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_ctr_enc_tv_template),
+				.dec = __VECS(aes_ctr_dec_tv_template)
 			}
 		}
 	}, {
@@ -2760,14 +2591,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = bf_ctr_enc_tv_template,
-					.count = BF_CTR_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = bf_ctr_dec_tv_template,
-					.count = BF_CTR_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(bf_ctr_enc_tv_template),
+				.dec = __VECS(bf_ctr_dec_tv_template)
 			}
 		}
 	}, {
@@ -2775,14 +2600,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = camellia_ctr_enc_tv_template,
-					.count = CAMELLIA_CTR_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = camellia_ctr_dec_tv_template,
-					.count = CAMELLIA_CTR_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(camellia_ctr_enc_tv_template),
+				.dec = __VECS(camellia_ctr_dec_tv_template)
 			}
 		}
 	}, {
@@ -2790,14 +2609,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = cast5_ctr_enc_tv_template,
-					.count = CAST5_CTR_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = cast5_ctr_dec_tv_template,
-					.count = CAST5_CTR_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(cast5_ctr_enc_tv_template),
+				.dec = __VECS(cast5_ctr_dec_tv_template)
 			}
 		}
 	}, {
@@ -2805,14 +2618,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = cast6_ctr_enc_tv_template,
-					.count = CAST6_CTR_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = cast6_ctr_dec_tv_template,
-					.count = CAST6_CTR_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(cast6_ctr_enc_tv_template),
+				.dec = __VECS(cast6_ctr_dec_tv_template)
 			}
 		}
 	}, {
@@ -2820,14 +2627,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = des_ctr_enc_tv_template,
-					.count = DES_CTR_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = des_ctr_dec_tv_template,
-					.count = DES_CTR_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(des_ctr_enc_tv_template),
+				.dec = __VECS(des_ctr_dec_tv_template)
 			}
 		}
 	}, {
@@ -2835,14 +2636,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = des3_ede_ctr_enc_tv_template,
-					.count = DES3_EDE_CTR_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = des3_ede_ctr_dec_tv_template,
-					.count = DES3_EDE_CTR_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(des3_ede_ctr_enc_tv_template),
+				.dec = __VECS(des3_ede_ctr_dec_tv_template)
 			}
 		}
 	}, {
@@ -2850,14 +2645,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = serpent_ctr_enc_tv_template,
-					.count = SERPENT_CTR_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = serpent_ctr_dec_tv_template,
-					.count = SERPENT_CTR_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(serpent_ctr_enc_tv_template),
+				.dec = __VECS(serpent_ctr_dec_tv_template)
 			}
 		}
 	}, {
@@ -2865,14 +2654,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = tf_ctr_enc_tv_template,
-					.count = TF_CTR_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = tf_ctr_dec_tv_template,
-					.count = TF_CTR_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(tf_ctr_enc_tv_template),
+				.dec = __VECS(tf_ctr_dec_tv_template)
 			}
 		}
 	}, {
@@ -2880,14 +2663,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = cts_mode_enc_tv_template,
-					.count = CTS_MODE_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = cts_mode_dec_tv_template,
-					.count = CTS_MODE_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(cts_mode_enc_tv_template),
+				.dec = __VECS(cts_mode_dec_tv_template)
 			}
 		}
 	}, {
@@ -2896,14 +2673,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.comp = {
-				.comp = {
-					.vecs = deflate_comp_tv_template,
-					.count = DEFLATE_COMP_TEST_VECTORS
-				},
-				.decomp = {
-					.vecs = deflate_decomp_tv_template,
-					.count = DEFLATE_DECOMP_TEST_VECTORS
-				}
+				.comp = __VECS(deflate_comp_tv_template),
+				.decomp = __VECS(deflate_decomp_tv_template)
 			}
 		}
 	}, {
@@ -2911,10 +2682,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_kpp,
 		.fips_allowed = 1,
 		.suite = {
-			.kpp = {
-				.vecs = dh_tv_template,
-				.count = DH_TEST_VECTORS
-			}
+			.kpp = __VECS(dh_tv_template)
 		}
 	}, {
 		.alg = "digest_null",
@@ -2924,30 +2692,21 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_drbg,
 		.fips_allowed = 1,
 		.suite = {
-			.drbg = {
-				.vecs = drbg_nopr_ctr_aes128_tv_template,
-				.count = ARRAY_SIZE(drbg_nopr_ctr_aes128_tv_template)
-			}
+			.drbg = __VECS(drbg_nopr_ctr_aes128_tv_template)
 		}
 	}, {
 		.alg = "drbg_nopr_ctr_aes192",
 		.test = alg_test_drbg,
 		.fips_allowed = 1,
 		.suite = {
-			.drbg = {
-				.vecs = drbg_nopr_ctr_aes192_tv_template,
-				.count = ARRAY_SIZE(drbg_nopr_ctr_aes192_tv_template)
-			}
+			.drbg = __VECS(drbg_nopr_ctr_aes192_tv_template)
 		}
 	}, {
 		.alg = "drbg_nopr_ctr_aes256",
 		.test = alg_test_drbg,
 		.fips_allowed = 1,
 		.suite = {
-			.drbg = {
-				.vecs = drbg_nopr_ctr_aes256_tv_template,
-				.count = ARRAY_SIZE(drbg_nopr_ctr_aes256_tv_template)
-			}
+			.drbg = __VECS(drbg_nopr_ctr_aes256_tv_template)
 		}
 	}, {
 		/*
@@ -2962,11 +2721,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_drbg,
 		.fips_allowed = 1,
 		.suite = {
-			.drbg = {
-				.vecs = drbg_nopr_hmac_sha256_tv_template,
-				.count =
-				ARRAY_SIZE(drbg_nopr_hmac_sha256_tv_template)
-			}
+			.drbg = __VECS(drbg_nopr_hmac_sha256_tv_template)
 		}
 	}, {
 		/* covered by drbg_nopr_hmac_sha256 test */
@@ -2986,10 +2741,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_drbg,
 		.fips_allowed = 1,
 		.suite = {
-			.drbg = {
-				.vecs = drbg_nopr_sha256_tv_template,
-				.count = ARRAY_SIZE(drbg_nopr_sha256_tv_template)
-			}
+			.drbg = __VECS(drbg_nopr_sha256_tv_template)
 		}
 	}, {
 		/* covered by drbg_nopr_sha256 test */
@@ -3005,10 +2757,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_drbg,
 		.fips_allowed = 1,
 		.suite = {
-			.drbg = {
-				.vecs = drbg_pr_ctr_aes128_tv_template,
-				.count = ARRAY_SIZE(drbg_pr_ctr_aes128_tv_template)
-			}
+			.drbg = __VECS(drbg_pr_ctr_aes128_tv_template)
 		}
 	}, {
 		/* covered by drbg_pr_ctr_aes128 test */
@@ -3028,10 +2777,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_drbg,
 		.fips_allowed = 1,
 		.suite = {
-			.drbg = {
-				.vecs = drbg_pr_hmac_sha256_tv_template,
-				.count = ARRAY_SIZE(drbg_pr_hmac_sha256_tv_template)
-			}
+			.drbg = __VECS(drbg_pr_hmac_sha256_tv_template)
 		}
 	}, {
 		/* covered by drbg_pr_hmac_sha256 test */
@@ -3051,10 +2797,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_drbg,
 		.fips_allowed = 1,
 		.suite = {
-			.drbg = {
-				.vecs = drbg_pr_sha256_tv_template,
-				.count = ARRAY_SIZE(drbg_pr_sha256_tv_template)
-			}
+			.drbg = __VECS(drbg_pr_sha256_tv_template)
 		}
 	}, {
 		/* covered by drbg_pr_sha256 test */
@@ -3071,14 +2814,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = aes_enc_tv_template,
-					.count = AES_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_dec_tv_template,
-					.count = AES_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_enc_tv_template),
+				.dec = __VECS(aes_dec_tv_template)
 			}
 		}
 	}, {
@@ -3086,14 +2823,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = anubis_enc_tv_template,
-					.count = ANUBIS_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = anubis_dec_tv_template,
-					.count = ANUBIS_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(anubis_enc_tv_template),
+				.dec = __VECS(anubis_dec_tv_template)
 			}
 		}
 	}, {
@@ -3101,14 +2832,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = arc4_enc_tv_template,
-					.count = ARC4_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = arc4_dec_tv_template,
-					.count = ARC4_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(arc4_enc_tv_template),
+				.dec = __VECS(arc4_dec_tv_template)
 			}
 		}
 	}, {
@@ -3116,14 +2841,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = bf_enc_tv_template,
-					.count = BF_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = bf_dec_tv_template,
-					.count = BF_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(bf_enc_tv_template),
+				.dec = __VECS(bf_dec_tv_template)
 			}
 		}
 	}, {
@@ -3131,14 +2850,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = camellia_enc_tv_template,
-					.count = CAMELLIA_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = camellia_dec_tv_template,
-					.count = CAMELLIA_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(camellia_enc_tv_template),
+				.dec = __VECS(camellia_dec_tv_template)
 			}
 		}
 	}, {
@@ -3146,14 +2859,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = cast5_enc_tv_template,
-					.count = CAST5_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = cast5_dec_tv_template,
-					.count = CAST5_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(cast5_enc_tv_template),
+				.dec = __VECS(cast5_dec_tv_template)
 			}
 		}
 	}, {
@@ -3161,14 +2868,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = cast6_enc_tv_template,
-					.count = CAST6_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = cast6_dec_tv_template,
-					.count = CAST6_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(cast6_enc_tv_template),
+				.dec = __VECS(cast6_dec_tv_template)
 			}
 		}
 	}, {
@@ -3179,14 +2880,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = des_enc_tv_template,
-					.count = DES_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = des_dec_tv_template,
-					.count = DES_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(des_enc_tv_template),
+				.dec = __VECS(des_dec_tv_template)
 			}
 		}
 	}, {
@@ -3195,14 +2890,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = des3_ede_enc_tv_template,
-					.count = DES3_EDE_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = des3_ede_dec_tv_template,
-					.count = DES3_EDE_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(des3_ede_enc_tv_template),
+				.dec = __VECS(des3_ede_dec_tv_template)
 			}
 		}
 	}, {
@@ -3225,14 +2914,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = khazad_enc_tv_template,
-					.count = KHAZAD_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = khazad_dec_tv_template,
-					.count = KHAZAD_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(khazad_enc_tv_template),
+				.dec = __VECS(khazad_dec_tv_template)
 			}
 		}
 	}, {
@@ -3240,14 +2923,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = seed_enc_tv_template,
-					.count = SEED_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = seed_dec_tv_template,
-					.count = SEED_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(seed_enc_tv_template),
+				.dec = __VECS(seed_dec_tv_template)
 			}
 		}
 	}, {
@@ -3255,14 +2932,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = serpent_enc_tv_template,
-					.count = SERPENT_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = serpent_dec_tv_template,
-					.count = SERPENT_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(serpent_enc_tv_template),
+				.dec = __VECS(serpent_dec_tv_template)
 			}
 		}
 	}, {
@@ -3270,14 +2941,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = tea_enc_tv_template,
-					.count = TEA_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = tea_dec_tv_template,
-					.count = TEA_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(tea_enc_tv_template),
+				.dec = __VECS(tea_dec_tv_template)
 			}
 		}
 	}, {
@@ -3285,14 +2950,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = tnepres_enc_tv_template,
-					.count = TNEPRES_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = tnepres_dec_tv_template,
-					.count = TNEPRES_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(tnepres_enc_tv_template),
+				.dec = __VECS(tnepres_dec_tv_template)
 			}
 		}
 	}, {
@@ -3300,14 +2959,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = tf_enc_tv_template,
-					.count = TF_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = tf_dec_tv_template,
-					.count = TF_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(tf_enc_tv_template),
+				.dec = __VECS(tf_dec_tv_template)
 			}
 		}
 	}, {
@@ -3315,14 +2968,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = xeta_enc_tv_template,
-					.count = XETA_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = xeta_dec_tv_template,
-					.count = XETA_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(xeta_enc_tv_template),
+				.dec = __VECS(xeta_dec_tv_template)
 			}
 		}
 	}, {
@@ -3330,14 +2977,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = xtea_enc_tv_template,
-					.count = XTEA_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = xtea_dec_tv_template,
-					.count = XTEA_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(xtea_enc_tv_template),
+				.dec = __VECS(xtea_dec_tv_template)
 			}
 		}
 	}, {
@@ -3345,10 +2986,7 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_kpp,
 		.fips_allowed = 1,
 		.suite = {
-			.kpp = {
-				.vecs = ecdh_tv_template,
-				.count = ECDH_TEST_VECTORS
-			}
+			.kpp = __VECS(ecdh_tv_template)
 		}
 	}, {
 		.alg = "gcm(aes)",
@@ -3356,14 +2994,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs = aes_gcm_enc_tv_template,
-					.count = AES_GCM_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_gcm_dec_tv_template,
-					.count = AES_GCM_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_gcm_enc_tv_template),
+				.dec = __VECS(aes_gcm_dec_tv_template)
 			}
 		}
 	}, {
@@ -3371,136 +3003,94 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = ghash_tv_template,
-				.count = GHASH_TEST_VECTORS
-			}
+			.hash = __VECS(ghash_tv_template)
 		}
 	}, {
 		.alg = "hmac(crc32)",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = bfin_crc_tv_template,
-				.count = BFIN_CRC_TEST_VECTORS
-			}
+			.hash = __VECS(bfin_crc_tv_template)
 		}
 	}, {
 		.alg = "hmac(md5)",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = hmac_md5_tv_template,
-				.count = HMAC_MD5_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_md5_tv_template)
 		}
 	}, {
 		.alg = "hmac(rmd128)",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = hmac_rmd128_tv_template,
-				.count = HMAC_RMD128_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_rmd128_tv_template)
 		}
 	}, {
 		.alg = "hmac(rmd160)",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = hmac_rmd160_tv_template,
-				.count = HMAC_RMD160_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_rmd160_tv_template)
 		}
 	}, {
 		.alg = "hmac(sha1)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = hmac_sha1_tv_template,
-				.count = HMAC_SHA1_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_sha1_tv_template)
 		}
 	}, {
 		.alg = "hmac(sha224)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = hmac_sha224_tv_template,
-				.count = HMAC_SHA224_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_sha224_tv_template)
 		}
 	}, {
 		.alg = "hmac(sha256)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = hmac_sha256_tv_template,
-				.count = HMAC_SHA256_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_sha256_tv_template)
 		}
 	}, {
 		.alg = "hmac(sha3-224)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = hmac_sha3_224_tv_template,
-				.count = HMAC_SHA3_224_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_sha3_224_tv_template)
 		}
 	}, {
 		.alg = "hmac(sha3-256)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = hmac_sha3_256_tv_template,
-				.count = HMAC_SHA3_256_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_sha3_256_tv_template)
 		}
 	}, {
 		.alg = "hmac(sha3-384)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = hmac_sha3_384_tv_template,
-				.count = HMAC_SHA3_384_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_sha3_384_tv_template)
 		}
 	}, {
 		.alg = "hmac(sha3-512)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = hmac_sha3_512_tv_template,
-				.count = HMAC_SHA3_512_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_sha3_512_tv_template)
 		}
 	}, {
 		.alg = "hmac(sha384)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = hmac_sha384_tv_template,
-				.count = HMAC_SHA384_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_sha384_tv_template)
 		}
 	}, {
 		.alg = "hmac(sha512)",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = hmac_sha512_tv_template,
-				.count = HMAC_SHA512_TEST_VECTORS
-			}
+			.hash = __VECS(hmac_sha512_tv_template)
 		}
 	}, {
 		.alg = "jitterentropy_rng",
@@ -3512,14 +3102,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = aes_kw_enc_tv_template,
-					.count = ARRAY_SIZE(aes_kw_enc_tv_template)
-				},
-				.dec = {
-					.vecs = aes_kw_dec_tv_template,
-					.count = ARRAY_SIZE(aes_kw_dec_tv_template)
-				}
+				.enc = __VECS(aes_kw_enc_tv_template),
+				.dec = __VECS(aes_kw_dec_tv_template)
 			}
 		}
 	}, {
@@ -3527,14 +3111,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = aes_lrw_enc_tv_template,
-					.count = AES_LRW_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_lrw_dec_tv_template,
-					.count = AES_LRW_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_lrw_enc_tv_template),
+				.dec = __VECS(aes_lrw_dec_tv_template)
 			}
 		}
 	}, {
@@ -3542,14 +3120,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = camellia_lrw_enc_tv_template,
-					.count = CAMELLIA_LRW_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = camellia_lrw_dec_tv_template,
-					.count = CAMELLIA_LRW_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(camellia_lrw_enc_tv_template),
+				.dec = __VECS(camellia_lrw_dec_tv_template)
 			}
 		}
 	}, {
@@ -3557,14 +3129,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = cast6_lrw_enc_tv_template,
-					.count = CAST6_LRW_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = cast6_lrw_dec_tv_template,
-					.count = CAST6_LRW_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(cast6_lrw_enc_tv_template),
+				.dec = __VECS(cast6_lrw_dec_tv_template)
 			}
 		}
 	}, {
@@ -3572,14 +3138,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = serpent_lrw_enc_tv_template,
-					.count = SERPENT_LRW_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = serpent_lrw_dec_tv_template,
-					.count = SERPENT_LRW_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(serpent_lrw_enc_tv_template),
+				.dec = __VECS(serpent_lrw_dec_tv_template)
 			}
 		}
 	}, {
@@ -3587,14 +3147,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = tf_lrw_enc_tv_template,
-					.count = TF_LRW_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = tf_lrw_dec_tv_template,
-					.count = TF_LRW_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(tf_lrw_enc_tv_template),
+				.dec = __VECS(tf_lrw_dec_tv_template)
 			}
 		}
 	}, {
@@ -3603,14 +3157,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.comp = {
-				.comp = {
-					.vecs = lz4_comp_tv_template,
-					.count = LZ4_COMP_TEST_VECTORS
-				},
-				.decomp = {
-					.vecs = lz4_decomp_tv_template,
-					.count = LZ4_DECOMP_TEST_VECTORS
-				}
+				.comp = __VECS(lz4_comp_tv_template),
+				.decomp = __VECS(lz4_decomp_tv_template)
 			}
 		}
 	}, {
@@ -3619,14 +3167,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.comp = {
-				.comp = {
-					.vecs = lz4hc_comp_tv_template,
-					.count = LZ4HC_COMP_TEST_VECTORS
-				},
-				.decomp = {
-					.vecs = lz4hc_decomp_tv_template,
-					.count = LZ4HC_DECOMP_TEST_VECTORS
-				}
+				.comp = __VECS(lz4hc_comp_tv_template),
+				.decomp = __VECS(lz4hc_decomp_tv_template)
 			}
 		}
 	}, {
@@ -3635,42 +3177,27 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.comp = {
-				.comp = {
-					.vecs = lzo_comp_tv_template,
-					.count = LZO_COMP_TEST_VECTORS
-				},
-				.decomp = {
-					.vecs = lzo_decomp_tv_template,
-					.count = LZO_DECOMP_TEST_VECTORS
-				}
+				.comp = __VECS(lzo_comp_tv_template),
+				.decomp = __VECS(lzo_decomp_tv_template)
 			}
 		}
 	}, {
 		.alg = "md4",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = md4_tv_template,
-				.count = MD4_TEST_VECTORS
-			}
+			.hash = __VECS(md4_tv_template)
 		}
 	}, {
 		.alg = "md5",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = md5_tv_template,
-				.count = MD5_TEST_VECTORS
-			}
+			.hash = __VECS(md5_tv_template)
 		}
 	}, {
 		.alg = "michael_mic",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = michael_mic_tv_template,
-				.count = MICHAEL_MIC_TEST_VECTORS
-			}
+			.hash = __VECS(michael_mic_tv_template)
 		}
 	}, {
 		.alg = "ofb(aes)",
@@ -3678,14 +3205,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = aes_ofb_enc_tv_template,
-					.count = AES_OFB_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_ofb_dec_tv_template,
-					.count = AES_OFB_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_ofb_enc_tv_template),
+				.dec = __VECS(aes_ofb_dec_tv_template)
 			}
 		}
 	}, {
@@ -3693,24 +3214,15 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = fcrypt_pcbc_enc_tv_template,
-					.count = FCRYPT_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = fcrypt_pcbc_dec_tv_template,
-					.count = FCRYPT_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(fcrypt_pcbc_enc_tv_template),
+				.dec = __VECS(fcrypt_pcbc_dec_tv_template)
 			}
 		}
 	}, {
 		.alg = "poly1305",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = poly1305_tv_template,
-				.count = POLY1305_TEST_VECTORS
-			}
+			.hash = __VECS(poly1305_tv_template)
 		}
 	}, {
 		.alg = "rfc3686(ctr(aes))",
@@ -3718,14 +3230,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = aes_ctr_rfc3686_enc_tv_template,
-					.count = AES_CTR_3686_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_ctr_rfc3686_dec_tv_template,
-					.count = AES_CTR_3686_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_ctr_rfc3686_enc_tv_template),
+				.dec = __VECS(aes_ctr_rfc3686_dec_tv_template)
 			}
 		}
 	}, {
@@ -3734,14 +3240,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs = aes_gcm_rfc4106_enc_tv_template,
-					.count = AES_GCM_4106_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_gcm_rfc4106_dec_tv_template,
-					.count = AES_GCM_4106_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_gcm_rfc4106_enc_tv_template),
+				.dec = __VECS(aes_gcm_rfc4106_dec_tv_template)
 			}
 		}
 	}, {
@@ -3750,14 +3250,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs = aes_ccm_rfc4309_enc_tv_template,
-					.count = AES_CCM_4309_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_ccm_rfc4309_dec_tv_template,
-					.count = AES_CCM_4309_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_ccm_rfc4309_enc_tv_template),
+				.dec = __VECS(aes_ccm_rfc4309_dec_tv_template)
 			}
 		}
 	}, {
@@ -3765,14 +3259,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs = aes_gcm_rfc4543_enc_tv_template,
-					.count = AES_GCM_4543_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_gcm_rfc4543_dec_tv_template,
-					.count = AES_GCM_4543_DEC_TEST_VECTORS
-				},
+				.enc = __VECS(aes_gcm_rfc4543_enc_tv_template),
+				.dec = __VECS(aes_gcm_rfc4543_dec_tv_template),
 			}
 		}
 	}, {
@@ -3780,14 +3268,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs = rfc7539_enc_tv_template,
-					.count = RFC7539_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = rfc7539_dec_tv_template,
-					.count = RFC7539_DEC_TEST_VECTORS
-				},
+				.enc = __VECS(rfc7539_enc_tv_template),
+				.dec = __VECS(rfc7539_dec_tv_template),
 			}
 		}
 	}, {
@@ -3795,71 +3277,47 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_aead,
 		.suite = {
 			.aead = {
-				.enc = {
-					.vecs = rfc7539esp_enc_tv_template,
-					.count = RFC7539ESP_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = rfc7539esp_dec_tv_template,
-					.count = RFC7539ESP_DEC_TEST_VECTORS
-				},
+				.enc = __VECS(rfc7539esp_enc_tv_template),
+				.dec = __VECS(rfc7539esp_dec_tv_template),
 			}
 		}
 	}, {
 		.alg = "rmd128",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = rmd128_tv_template,
-				.count = RMD128_TEST_VECTORS
-			}
+			.hash = __VECS(rmd128_tv_template)
 		}
 	}, {
 		.alg = "rmd160",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = rmd160_tv_template,
-				.count = RMD160_TEST_VECTORS
-			}
+			.hash = __VECS(rmd160_tv_template)
 		}
 	}, {
 		.alg = "rmd256",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = rmd256_tv_template,
-				.count = RMD256_TEST_VECTORS
-			}
+			.hash = __VECS(rmd256_tv_template)
 		}
 	}, {
 		.alg = "rmd320",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = rmd320_tv_template,
-				.count = RMD320_TEST_VECTORS
-			}
+			.hash = __VECS(rmd320_tv_template)
 		}
 	}, {
 		.alg = "rsa",
 		.test = alg_test_akcipher,
 		.fips_allowed = 1,
 		.suite = {
-			.akcipher = {
-				.vecs = rsa_tv_template,
-				.count = RSA_TEST_VECTORS
-			}
+			.akcipher = __VECS(rsa_tv_template)
 		}
 	}, {
 		.alg = "salsa20",
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = salsa20_stream_enc_tv_template,
-					.count = SALSA20_STREAM_ENC_TEST_VECTORS
-				}
+				.enc = __VECS(salsa20_stream_enc_tv_template)
 			}
 		}
 	}, {
@@ -3867,162 +3325,111 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = sha1_tv_template,
-				.count = SHA1_TEST_VECTORS
-			}
+			.hash = __VECS(sha1_tv_template)
 		}
 	}, {
 		.alg = "sha224",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = sha224_tv_template,
-				.count = SHA224_TEST_VECTORS
-			}
+			.hash = __VECS(sha224_tv_template)
 		}
 	}, {
 		.alg = "sha256",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = sha256_tv_template,
-				.count = SHA256_TEST_VECTORS
-			}
+			.hash = __VECS(sha256_tv_template)
 		}
 	}, {
 		.alg = "sha3-224",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = sha3_224_tv_template,
-				.count = SHA3_224_TEST_VECTORS
-			}
+			.hash = __VECS(sha3_224_tv_template)
 		}
 	}, {
 		.alg = "sha3-256",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = sha3_256_tv_template,
-				.count = SHA3_256_TEST_VECTORS
-			}
+			.hash = __VECS(sha3_256_tv_template)
 		}
 	}, {
 		.alg = "sha3-384",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = sha3_384_tv_template,
-				.count = SHA3_384_TEST_VECTORS
-			}
+			.hash = __VECS(sha3_384_tv_template)
 		}
 	}, {
 		.alg = "sha3-512",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = sha3_512_tv_template,
-				.count = SHA3_512_TEST_VECTORS
-			}
+			.hash = __VECS(sha3_512_tv_template)
 		}
 	}, {
 		.alg = "sha384",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = sha384_tv_template,
-				.count = SHA384_TEST_VECTORS
-			}
+			.hash = __VECS(sha384_tv_template)
 		}
 	}, {
 		.alg = "sha512",
 		.test = alg_test_hash,
 		.fips_allowed = 1,
 		.suite = {
-			.hash = {
-				.vecs = sha512_tv_template,
-				.count = SHA512_TEST_VECTORS
-			}
+			.hash = __VECS(sha512_tv_template)
 		}
 	}, {
 		.alg = "tgr128",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = tgr128_tv_template,
-				.count = TGR128_TEST_VECTORS
-			}
+			.hash = __VECS(tgr128_tv_template)
 		}
 	}, {
 		.alg = "tgr160",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = tgr160_tv_template,
-				.count = TGR160_TEST_VECTORS
-			}
+			.hash = __VECS(tgr160_tv_template)
 		}
 	}, {
 		.alg = "tgr192",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = tgr192_tv_template,
-				.count = TGR192_TEST_VECTORS
-			}
+			.hash = __VECS(tgr192_tv_template)
 		}
 	}, {
 		.alg = "vmac(aes)",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = aes_vmac128_tv_template,
-				.count = VMAC_AES_TEST_VECTORS
-			}
+			.hash = __VECS(aes_vmac128_tv_template)
 		}
 	}, {
 		.alg = "wp256",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = wp256_tv_template,
-				.count = WP256_TEST_VECTORS
-			}
+			.hash = __VECS(wp256_tv_template)
 		}
 	}, {
 		.alg = "wp384",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = wp384_tv_template,
-				.count = WP384_TEST_VECTORS
-			}
+			.hash = __VECS(wp384_tv_template)
 		}
 	}, {
 		.alg = "wp512",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = wp512_tv_template,
-				.count = WP512_TEST_VECTORS
-			}
+			.hash = __VECS(wp512_tv_template)
 		}
 	}, {
 		.alg = "xcbc(aes)",
 		.test = alg_test_hash,
 		.suite = {
-			.hash = {
-				.vecs = aes_xcbc128_tv_template,
-				.count = XCBC_AES_TEST_VECTORS
-			}
+			.hash = __VECS(aes_xcbc128_tv_template)
 		}
 	}, {
 		.alg = "xts(aes)",
@@ -4030,14 +3437,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.fips_allowed = 1,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = aes_xts_enc_tv_template,
-					.count = AES_XTS_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = aes_xts_dec_tv_template,
-					.count = AES_XTS_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(aes_xts_enc_tv_template),
+				.dec = __VECS(aes_xts_dec_tv_template)
 			}
 		}
 	}, {
@@ -4045,14 +3446,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = camellia_xts_enc_tv_template,
-					.count = CAMELLIA_XTS_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = camellia_xts_dec_tv_template,
-					.count = CAMELLIA_XTS_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(camellia_xts_enc_tv_template),
+				.dec = __VECS(camellia_xts_dec_tv_template)
 			}
 		}
 	}, {
@@ -4060,14 +3455,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = cast6_xts_enc_tv_template,
-					.count = CAST6_XTS_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = cast6_xts_dec_tv_template,
-					.count = CAST6_XTS_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(cast6_xts_enc_tv_template),
+				.dec = __VECS(cast6_xts_dec_tv_template)
 			}
 		}
 	}, {
@@ -4075,14 +3464,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = serpent_xts_enc_tv_template,
-					.count = SERPENT_XTS_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = serpent_xts_dec_tv_template,
-					.count = SERPENT_XTS_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(serpent_xts_enc_tv_template),
+				.dec = __VECS(serpent_xts_dec_tv_template)
 			}
 		}
 	}, {
@@ -4090,14 +3473,8 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.test = alg_test_skcipher,
 		.suite = {
 			.cipher = {
-				.enc = {
-					.vecs = tf_xts_enc_tv_template,
-					.count = TF_XTS_ENC_TEST_VECTORS
-				},
-				.dec = {
-					.vecs = tf_xts_dec_tv_template,
-					.count = TF_XTS_DEC_TEST_VECTORS
-				}
+				.enc = __VECS(tf_xts_enc_tv_template),
+				.dec = __VECS(tf_xts_dec_tv_template)
 			}
 		}
 	}

Some files were not shown because too many files changed in this diff
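
Note on the pattern: the entire change above is one mechanical substitution — every open-coded test-vector initializer of the form { .vecs = xxx_tv_template, .count = XXX_TEST_VECTORS } (or .count = ARRAY_SIZE(xxx_tv_template)) becomes __VECS(xxx_tv_template). The macro's definition is not shown in this excerpt; a minimal sketch, assuming it simply pairs the array with its compiler-computed length, would be:

	/*
	 * Hypothetical reconstruction of the __VECS() helper used above:
	 * pair a test-vector array with its element count, computed by the
	 * compiler, so the count can never drift from the array definition.
	 */
	#define __VECS(tv)	{ .vecs = tv, .count = ARRAY_SIZE(tv) }

Under that assumption, an entry such as

	.suite = { .hash = __VECS(sha256_tv_template) }

expands to

	.suite = {
		.hash = {
			.vecs = sha256_tv_template,
			.count = ARRAY_SIZE(sha256_tv_template)
		}
	}

which matches the removed lines hunk for hunk. The payoff is that the hand-maintained count macros (SHA256_TEST_VECTORS and friends) become redundant: the count is derived from the array itself, so adding a vector to a template can no longer leave a stale count behind.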