
Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto update from Herbert Xu:

 - XTS mode optimisation for twofish/cast6/camellia/aes on x86

 - AVX2/x86_64 implementation for blowfish/twofish/serpent/camellia

 - SSSE3/AVX/AVX2 optimisations for sha256/sha512

 - Added driver for SAHARA2 crypto accelerator

 - Fix for GMAC when used in non-IPsec scenarios

 - Added generic CMAC implementation (including IPsec glue)

 - IP update for crypto/atmel

 - Support for more than one device in hwrng/timeriomem

 - Added Broadcom BCM2835 RNG driver

 - Misc fixes
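
The new CMAC template registers under the usual algorithm naming, so "cmac(aes)" can be driven through the synchronous hash (shash) interface. A minimal sketch (error paths shortened; key, data and output names are illustrative):

#include <crypto/hash.h>
#include <linux/slab.h>

/* Hedged sketch: compute a 16-byte AES-CMAC tag over a buffer using the
 * new "cmac(aes)" template.  Illustrative only, not code from this pull. */
static int cmac_digest_example(const u8 *key, unsigned int keylen,
			       const u8 *data, unsigned int len, u8 *out)
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	int err;

	tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_shash_setkey(tfm, key, keylen);
	if (err)
		goto out_free;

	desc = kzalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	if (!desc) {
		err = -ENOMEM;
		goto out_free;
	}
	desc->tfm = tfm;

	err = crypto_shash_digest(desc, data, len, out);
	kfree(desc);
out_free:
	crypto_free_shash(tfm);
	return err;
}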

* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (59 commits)
  crypto: caam - fix job ring cleanup code
  crypto: camellia - add AVX2/AES-NI/x86_64 assembler implementation of camellia cipher
  crypto: serpent - add AVX2/x86_64 assembler implementation of serpent cipher
  crypto: twofish - add AVX2/x86_64 assembler implementation of twofish cipher
  crypto: blowfish - add AVX2/x86_64 implementation of blowfish cipher
  crypto: tcrypt - add async cipher speed tests for blowfish
  crypto: testmgr - extend camellia test-vectors for camellia-aesni/avx2
  crypto: aesni_intel - fix Kconfig problem with CRYPTO_GLUE_HELPER_X86
  crypto: aesni_intel - add more optimized XTS mode for x86-64
  crypto: x86/camellia-aesni-avx - add more optimized XTS code
  crypto: cast6-avx: use new optimized XTS code
  crypto: x86/twofish-avx - use optimized XTS code
  crypto: x86 - add more optimized XTS-mode for serpent-avx
  xfrm: add rfc4494 AES-CMAC-96 support
  crypto: add CMAC support to CryptoAPI
  crypto: testmgr - add empty test vectors for null ciphers
  crypto: testmgr - add AES GMAC test vectors
  crypto: gcm - fix rfc4543 to handle async crypto correctly
  crypto: gcm - make GMAC work when dst and src are different
  hwrng: timeriomem - added devicetree hooks
  ...
Committed by Linus Torvalds 12 years ago, commit 797994f81a.

88 changed files with 15,378 additions and 744 deletions:
   1. Documentation/devicetree/bindings/crypto/fsl-imx-sahara.txt (+15, -0)
   2. Documentation/devicetree/bindings/hwrng/timeriomem_rng.txt (+18, -0)
   3. Documentation/devicetree/bindings/rng/brcm,bcm2835.txt (+13, -0)
   4. Documentation/hw_random.txt (+1, -1)
   5. arch/arm/mach-at91/at91sam9g45_devices.c (+6, -8)
   6. arch/x86/crypto/Makefile (+45, -12)
   7. arch/x86/crypto/aesni-intel_asm.S (+117, -0)
   8. arch/x86/crypto/aesni-intel_glue.c (+80, -0)
   9. arch/x86/crypto/blowfish-avx2-asm_64.S (+449, -0)
  10. arch/x86/crypto/blowfish_avx2_glue.c (+585, -0)
  11. arch/x86/crypto/blowfish_glue.c (+8, -24)
  12. arch/x86/crypto/camellia-aesni-avx-asm_64.S (+179, -1)
  13. arch/x86/crypto/camellia-aesni-avx2-asm_64.S (+1368, -0)
  14. arch/x86/crypto/camellia_aesni_avx2_glue.c (+586, -0)
  15. arch/x86/crypto/camellia_aesni_avx_glue.c (+62, -42)
  16. arch/x86/crypto/cast6-avx-x86_64-asm_64.S (+47, -1)
  17. arch/x86/crypto/cast6_avx_glue.c (+51, -40)
  18. arch/x86/crypto/crc32-pclmul_asm.S (+3, -3)
  19. arch/x86/crypto/crc32c-pcl-intel-asm_64.S (+6, -4)
  20. arch/x86/crypto/glue_helper-asm-avx.S (+60, -1)
  21. arch/x86/crypto/glue_helper-asm-avx2.S (+180, -0)
  22. arch/x86/crypto/glue_helper.c (+96, -1)
  23. arch/x86/crypto/serpent-avx-x86_64-asm_64.S (+43, -2)
  24. arch/x86/crypto/serpent-avx2-asm_64.S (+800, -0)
  25. arch/x86/crypto/serpent_avx2_glue.c (+562, -0)
  26. arch/x86/crypto/serpent_avx_glue.c (+85, -60)
  27. arch/x86/crypto/sha256-avx-asm.S (+496, -0)
  28. arch/x86/crypto/sha256-avx2-asm.S (+772, -0)
  29. arch/x86/crypto/sha256-ssse3-asm.S (+506, -0)
  30. arch/x86/crypto/sha256_ssse3_glue.c (+275, -0)
  31. arch/x86/crypto/sha512-avx-asm.S (+423, -0)
  32. arch/x86/crypto/sha512-avx2-asm.S (+743, -0)
  33. arch/x86/crypto/sha512-ssse3-asm.S (+421, -0)
  34. arch/x86/crypto/sha512_ssse3_glue.c (+282, -0)
  35. arch/x86/crypto/twofish-avx-x86_64-asm_64.S (+47, -1)
  36. arch/x86/crypto/twofish-avx2-asm_64.S (+600, -0)
  37. arch/x86/crypto/twofish_avx2_glue.c (+584, -0)
  38. arch/x86/crypto/twofish_avx_glue.c (+61, -40)
  39. arch/x86/include/asm/cpufeature.h (+1, -0)
  40. arch/x86/include/asm/crypto/blowfish.h (+43, -0)
  41. arch/x86/include/asm/crypto/camellia.h (+19, -0)
  42. arch/x86/include/asm/crypto/glue_helper.h (+24, -0)
  43. arch/x86/include/asm/crypto/serpent-avx.h (+29, -0)
  44. arch/x86/include/asm/crypto/twofish.h (+18, -0)
  45. crypto/Kconfig (+123, -10)
  46. crypto/Makefile (+1, -0)
  47. crypto/cmac.c (+315, -0)
  48. crypto/crypto_user.c (+2, -2)
  49. crypto/gcm.c (+97, -19)
  50. crypto/sha256_generic.c (+6, -5)
  51. crypto/sha512_generic.c (+7, -6)
  52. crypto/tcrypt.c (+29, -1)
  53. crypto/testmgr.c (+93, -2)
  54. crypto/testmgr.h (+1276, -38)
  55. drivers/char/hw_random/Kconfig (+12, -0)
  56. drivers/char/hw_random/Makefile (+1, -0)
  57. drivers/char/hw_random/bcm2835-rng.c (+113, -0)
  58. drivers/char/hw_random/exynos-rng.c (+2, -1)
  59. drivers/char/hw_random/mxc-rnga.c (+4, -17)
  60. drivers/char/hw_random/timeriomem-rng.c (+136, -54)
  61. drivers/crypto/Kconfig (+15, -3)
  62. drivers/crypto/Makefile (+1, -0)
  63. drivers/crypto/atmel-aes.c (+353, -118)
  64. drivers/crypto/atmel-sha-regs.h (+6, -1)
  65. drivers/crypto/atmel-sha.c (+486, -100)
  66. drivers/crypto/atmel-tdes-regs.h (+2, -0)
  67. drivers/crypto/atmel-tdes.c (+341, -53)
  68. drivers/crypto/bfin_crc.c (+3, -3)
  69. drivers/crypto/caam/Kconfig (+1, -1)
  70. drivers/crypto/caam/caamalg.c (+6, -0)
  71. drivers/crypto/caam/caamhash.c (+2, -2)
  72. drivers/crypto/caam/ctrl.c (+3, -0)
  73. drivers/crypto/caam/error.c (+5, -5)
  74. drivers/crypto/caam/intern.h (+1, -0)
  75. drivers/crypto/caam/jr.c (+4, -0)
  76. drivers/crypto/caam/key_gen.c (+1, -1)
  77. drivers/crypto/caam/key_gen.h (+1, -1)
  78. drivers/crypto/caam/regs.h (+3, -1)
  79. drivers/crypto/omap-aes.c (+2, -13)
  80. drivers/crypto/omap-sham.c (+2, -13)
  81. drivers/crypto/picoxcell_crypto.c (+1, -3)
  82. drivers/crypto/sahara.c (+1070, -0)
  83. drivers/crypto/ux500/hash/hash_core.c (+3, -3)
  84. include/crypto/sha.h (+5, -0)
  85. include/linux/platform_data/atmel-aes.h (+0, -22)
  86. include/linux/platform_data/crypto-atmel.h (+22, -0)
  87. include/linux/timeriomem-rng.h (+0, -5)
  88. net/xfrm/xfrm_algo.c (+13, -0)

+ 15 - 0
Documentation/devicetree/bindings/crypto/fsl-imx-sahara.txt

@@ -0,0 +1,15 @@
+Freescale SAHARA Cryptographic Accelerator included in some i.MX chips.
+Currently only i.MX27 is supported.
+
+Required properties:
+- compatible : Should be "fsl,<soc>-sahara"
+- reg : Should contain SAHARA registers location and length
+- interrupts : Should contain SAHARA interrupt number
+
+Example:
+
+sah@10025000 {
+	compatible = "fsl,imx27-sahara";
+	reg = <	0x10025000 0x800>;
+	interrupts = <75>;
+};

+ 18 - 0
Documentation/devicetree/bindings/hwrng/timeriomem_rng.txt

@@ -0,0 +1,18 @@
+HWRNG support for the timeriomem_rng driver
+
+Required properties:
+- compatible : "timeriomem_rng"
+- reg : base address to sample from
+- period : wait time in microseconds to use between samples
+
+N.B. currently 'reg' must be four bytes wide and aligned
+
+Example:
+
+hwrng@44 {
+	#address-cells = <1>;
+	#size-cells = <1>;
+	compatible = "timeriomem_rng";
+	reg = <0x44 0x04>;
+	period = <1000000>;
+};
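
As a consumer-side illustration, a hedged sketch (not the actual timeriomem-rng probe code; names and error handling are simplified) of how a platform driver could pick up these two properties with the standard OF/platform helpers:

#include <linux/platform_device.h>
#include <linux/of.h>
#include <linux/io.h>
#include <linux/err.h>

static int example_rng_probe(struct platform_device *pdev)
{
	struct resource *res;
	void __iomem *base;
	u32 period;

	/* "reg" becomes an IORESOURCE_MEM resource on the platform device */
	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	base = devm_ioremap_resource(&pdev->dev, res);
	if (IS_ERR(base))
		return PTR_ERR(base);

	/* "period" is required by the binding above */
	if (of_property_read_u32(pdev->dev.of_node, "period", &period))
		return -EINVAL;

	/* sample one 32-bit word from 'base', at most every 'period' us */
	return 0;
}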

+ 13 - 0
Documentation/devicetree/bindings/rng/brcm,bcm2835.txt

@@ -0,0 +1,13 @@
+BCM2835 Random number generator
+
+Required properties:
+
+- compatible : should be "brcm,bcm2835-rng"
+- reg : Specifies base physical address and size of the registers.
+
+Example:
+
+rng {
+        compatible = "brcm,bcm2835-rng";
+        reg = <0x7e104000 0x10>;
+};

+ 1 - 1
Documentation/hw_random.txt

@@ -63,7 +63,7 @@ Intel RNG Driver notes:
 
 	* FIXME: support poll(2)
 
-	NOTE: request_mem_region was removed, for two reasons:
+	NOTE: request_mem_region was removed, for three reasons:
 	1) Only one RNG is supported by this driver, 2) The location
 	used by the RNG is a fixed location in MMIO-addressable memory,
 	3) users with properly working BIOS e820 handling will always

+ 6 - 8
arch/arm/mach-at91/at91sam9g45_devices.c

@@ -18,7 +18,7 @@
 #include <linux/platform_device.h>
 #include <linux/i2c-gpio.h>
 #include <linux/atmel-mci.h>
-#include <linux/platform_data/atmel-aes.h>
+#include <linux/platform_data/crypto-atmel.h>
 
 #include <linux/platform_data/at91_adc.h>
 
@@ -1900,7 +1900,8 @@ static void __init at91_add_device_tdes(void) {}
  * -------------------------------------------------------------------- */
 
 #if defined(CONFIG_CRYPTO_DEV_ATMEL_AES) || defined(CONFIG_CRYPTO_DEV_ATMEL_AES_MODULE)
-static struct aes_platform_data aes_data;
+static struct crypto_platform_data aes_data;
+static struct crypto_dma_data alt_atslave;
 static u64 aes_dmamask = DMA_BIT_MASK(32);
 
 static struct resource aes_resources[] = {
@@ -1931,23 +1932,20 @@ static struct platform_device at91sam9g45_aes_device = {
 static void __init at91_add_device_aes(void)
 {
 	struct at_dma_slave	*atslave;
-	struct aes_dma_data	*alt_atslave;
-
-	alt_atslave = kzalloc(sizeof(struct aes_dma_data), GFP_KERNEL);
 
 	/* DMA TX slave channel configuration */
-	atslave = &alt_atslave->txdata;
+	atslave = &alt_atslave.txdata;
 	atslave->dma_dev = &at_hdmac_device.dev;
 	atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE	| ATC_SRC_H2SEL_HW |
 						ATC_SRC_PER(AT_DMA_ID_AES_RX);
 
 	/* DMA RX slave channel configuration */
-	atslave = &alt_atslave->rxdata;
+	atslave = &alt_atslave.rxdata;
 	atslave->dma_dev = &at_hdmac_device.dev;
 	atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE	| ATC_DST_H2SEL_HW |
 						ATC_DST_PER(AT_DMA_ID_AES_TX);
 
-	aes_data.dma_slave = alt_atslave;
+	aes_data.dma_slave = &alt_atslave;
 	platform_device_register(&at91sam9g45_aes_device);
 }
 #else

+ 45 - 12
arch/x86/crypto/Makefile

@@ -2,6 +2,10 @@
 # Arch-specific CryptoAPI modules.
 #
 
+avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
+avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
+					$(comma)4)$(comma)%ymm2,yes,no)
+
 obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o
 obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
 
@@ -12,22 +16,37 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
-obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o
-obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
-obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
-obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
 obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
-obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
 
 obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
 obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
 obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
+obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
+obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
+
+# These modules require assembler to support AVX.
+ifeq ($(avx_supported),yes)
+	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
+						camellia-aesni-avx-x86_64.o
+	obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
+	obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
+	obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
+	obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
+endif
+
+# These modules require assembler to support AVX2.
+ifeq ($(avx2_supported),yes)
+	obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o
+	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
+	obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
+	obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o
+endif
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
 twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
@@ -36,21 +55,35 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
 camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
-camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
-			       camellia_aesni_avx_glue.o
-cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
-cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
 twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
-twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o twofish_avx_glue.o
 salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
 serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
-serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
+
+ifeq ($(avx_supported),yes)
+	camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
+					camellia_aesni_avx_glue.o
+	cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
+	cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
+	twofish-avx-x86_64-y := twofish-avx-x86_64-asm_64.o \
+				twofish_avx_glue.o
+	serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o \
+				serpent_avx_glue.o
+endif
+
+ifeq ($(avx2_supported),yes)
+	blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o
+	camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
+	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
+	twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o
+endif
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
 crc32c-intel-y := crc32c-intel_glue.o
-crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o
+crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o
 crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
+sha256-ssse3-y := sha256-ssse3-asm.o sha256-avx-asm.o sha256-avx2-asm.o sha256_ssse3_glue.o
+sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o

+ 117 - 0
arch/x86/crypto/aesni-intel_asm.S

@@ -34,6 +34,10 @@
 
 #ifdef __x86_64__
 .data
+.align 16
+.Lgf128mul_x_ble_mask:
+	.octa 0x00000000000000010000000000000087
+
 POLY:   .octa 0xC2000000000000000000000000000001
 TWOONE: .octa 0x00000001000000000000000000000001
 
@@ -105,6 +109,8 @@ enc:        .octa 0x2
 #define CTR	%xmm11
 #define INC	%xmm12
 
+#define GF128MUL_MASK %xmm10
+
 #ifdef __x86_64__
 #define AREG	%rax
 #define KEYP	%rdi
@@ -2636,4 +2642,115 @@ ENTRY(aesni_ctr_enc)
 .Lctr_enc_just_ret:
 	ret
 ENDPROC(aesni_ctr_enc)
+
+/*
+ * _aesni_gf128mul_x_ble:		internal ABI
+ *	Multiply in GF(2^128) for XTS IVs
+ * input:
+ *	IV:	current IV
+ *	GF128MUL_MASK == mask with 0x87 and 0x01
+ * output:
+ *	IV:	next IV
+ * changed:
+ *	CTR:	== temporary value
+ */
+#define _aesni_gf128mul_x_ble() \
+	pshufd $0x13, IV, CTR; \
+	paddq IV, IV; \
+	psrad $31, CTR; \
+	pand GF128MUL_MASK, CTR; \
+	pxor CTR, IV;
+
+/*
+ * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ *			 bool enc, u8 *iv)
+ */
+ENTRY(aesni_xts_crypt8)
+	cmpb $0, %cl
+	movl $0, %ecx
+	movl $240, %r10d
+	leaq _aesni_enc4, %r11
+	leaq _aesni_dec4, %rax
+	cmovel %r10d, %ecx
+	cmoveq %rax, %r11
+
+	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+	movups (IVP), IV
+
+	mov 480(KEYP), KLEN
+	addq %rcx, KEYP
+
+	movdqa IV, STATE1
+	pxor 0x00(INP), STATE1
+	movdqu IV, 0x00(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE2
+	pxor 0x10(INP), STATE2
+	movdqu IV, 0x10(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE3
+	pxor 0x20(INP), STATE3
+	movdqu IV, 0x20(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE4
+	pxor 0x30(INP), STATE4
+	movdqu IV, 0x30(OUTP)
+
+	call *%r11
+
+	pxor 0x00(OUTP), STATE1
+	movdqu STATE1, 0x00(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE1
+	pxor 0x40(INP), STATE1
+	movdqu IV, 0x40(OUTP)
+
+	pxor 0x10(OUTP), STATE2
+	movdqu STATE2, 0x10(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE2
+	pxor 0x50(INP), STATE2
+	movdqu IV, 0x50(OUTP)
+
+	pxor 0x20(OUTP), STATE3
+	movdqu STATE3, 0x20(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE3
+	pxor 0x60(INP), STATE3
+	movdqu IV, 0x60(OUTP)
+
+	pxor 0x30(OUTP), STATE4
+	movdqu STATE4, 0x30(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movdqa IV, STATE4
+	pxor 0x70(INP), STATE4
+	movdqu IV, 0x70(OUTP)
+
+	_aesni_gf128mul_x_ble()
+	movups IV, (IVP)
+
+	call *%r11
+
+	pxor 0x40(OUTP), STATE1
+	movdqu STATE1, 0x40(OUTP)
+
+	pxor 0x50(OUTP), STATE2
+	movdqu STATE2, 0x50(OUTP)
+
+	pxor 0x60(OUTP), STATE3
+	movdqu STATE3, 0x60(OUTP)
+
+	pxor 0x70(OUTP), STATE4
+	movdqu STATE4, 0x70(OUTP)
+
+	ret
+ENDPROC(aesni_xts_crypt8)
+
 #endif

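The _aesni_gf128mul_x_ble() macro above steps the XTS tweak from block n to block n+1: the 128-bit tweak is multiplied by x in GF(2^128) using the bit-little-endian ("ble") convention, with the 0x87/0x01 mask folding the carry back in. A plain-C model of the same computation (illustrative only; the kernel's generic gf128mul helpers are separate code):

#include <stdint.h>

/* Illustrative model of the tweak update done by _aesni_gf128mul_x_ble():
 * shift the 128-bit tweak left one bit; if a bit falls off the top, xor
 * 0x87 (x^7 + x^2 + x + 1) into the low byte.  t[0] holds bytes 0..7,
 * t[1] bytes 8..15, little endian. */
static void gf128mul_x_ble_model(uint64_t t[2])
{
	uint64_t carry = t[1] >> 63;		/* bit shifted out of the top */

	t[1] = (t[1] << 1) | (t[0] >> 63);	/* 128-bit left shift */
	t[0] = (t[0] << 1) ^ (carry * 0x87);	/* reduce by the polynomial */
}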
+ 80 - 0
arch/x86/crypto/aesni-intel_glue.c

@@ -39,6 +39,9 @@
 #include <crypto/internal/aead.h>
 #include <linux/workqueue.h>
 #include <linux/spinlock.h>
+#ifdef CONFIG_X86_64
+#include <asm/crypto/glue_helper.h>
+#endif
 
 #if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE)
 #define HAS_PCBC
@@ -102,6 +105,9 @@ void crypto_fpu_exit(void);
 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 
+asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out,
+				 const u8 *in, bool enc, u8 *iv);
+
 /* asmlinkage void aesni_gcm_enc()
  * void *ctx,  AES Key schedule. Starts on a 16 byte boundary.
  * u8 *out, Ciphertext output. Encrypt in-place is allowed.
@@ -510,6 +516,78 @@ static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in)
 	aesni_enc(ctx, out, in);
 }
 
+#ifdef CONFIG_X86_64
+
+static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc));
+}
+
+static void aesni_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_dec));
+}
+
+static void aesni_xts_enc8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, true, (u8 *)iv);
+}
+
+static void aesni_xts_dec8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, false, (u8 *)iv);
+}
+
+static const struct common_glue_ctx aesni_enc_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = 1,
+
+	.funcs = { {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc8) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc) }
+	} }
+};
+
+static const struct common_glue_ctx aesni_dec_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = 1,
+
+	.funcs = { {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec8) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec) }
+	} }
+};
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&aesni_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(aesni_xts_tweak),
+				     aes_ctx(ctx->raw_tweak_ctx),
+				     aes_ctx(ctx->raw_crypt_ctx));
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&aesni_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(aesni_xts_tweak),
+				     aes_ctx(ctx->raw_tweak_ctx),
+				     aes_ctx(ctx->raw_crypt_ctx));
+}
+
+#else
+
 static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
@@ -560,6 +638,8 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 	return ret;
 }
 
+#endif
+
 #ifdef CONFIG_X86_64
 static int rfc4106_init(struct crypto_tfm *tfm)
 {

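The aesni_enc_xts/aesni_dec_xts tables above feed the generic glue helper, which tries the widest batch first (8 blocks via aesni_xts_crypt8) and falls back to single blocks for the tail. A simplified sketch of that dispatch idea (types and names are illustrative; the real glue_xts_crypt_128bit also walks scatterlists, manages the FPU and computes the tweaks):

#include <linux/types.h>

/* Simplified model of the common_glue_ctx dispatch: consume as many full
 * batches as possible at each width before moving to the next entry. */
struct xts_func_model {
	unsigned int num_blocks;
	void (*fn)(void *ctx, u8 *dst, const u8 *src, u8 *iv);
};

static void xts_dispatch_model(const struct xts_func_model *funcs,
			       unsigned int nfuncs, void *ctx, u8 *dst,
			       const u8 *src, unsigned int nblocks, u8 *iv)
{
	unsigned int i;

	for (i = 0; i < nfuncs; i++) {
		while (nblocks >= funcs[i].num_blocks) {
			funcs[i].fn(ctx, dst, src, iv);
			dst += funcs[i].num_blocks * 16;	/* 16-byte AES blocks */
			src += funcs[i].num_blocks * 16;
			nblocks -= funcs[i].num_blocks;
		}
	}
}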
+ 449 - 0
arch/x86/crypto/blowfish-avx2-asm_64.S

@@ -0,0 +1,449 @@
+/*
+ * x86_64/AVX2 assembler optimized version of Blowfish
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/linkage.h>
+
+.file "blowfish-avx2-asm_64.S"
+
+.data
+.align 32
+
+.Lprefetch_mask:
+.long 0*64
+.long 1*64
+.long 2*64
+.long 3*64
+.long 4*64
+.long 5*64
+.long 6*64
+.long 7*64
+
+.Lbswap32_mask:
+.long 0x00010203
+.long 0x04050607
+.long 0x08090a0b
+.long 0x0c0d0e0f
+
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lbswap_iv_mask:
+	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
+
+.text
+/* structure of crypto context */
+#define p	0
+#define s0	((16 + 2) * 4)
+#define s1	((16 + 2 + (1 * 256)) * 4)
+#define s2	((16 + 2 + (2 * 256)) * 4)
+#define s3	((16 + 2 + (3 * 256)) * 4)
+
+/* register macros */
+#define CTX	%rdi
+#define RIO	 %rdx
+
+#define RS0	%rax
+#define RS1	%r8
+#define RS2	%r9
+#define RS3	%r10
+
+#define RLOOP	%r11
+#define RLOOPd	%r11d
+
+#define RXr0	%ymm8
+#define RXr1	%ymm9
+#define RXr2	%ymm10
+#define RXr3	%ymm11
+#define RXl0	%ymm12
+#define RXl1	%ymm13
+#define RXl2	%ymm14
+#define RXl3	%ymm15
+
+/* temp regs */
+#define RT0	%ymm0
+#define RT0x	%xmm0
+#define RT1	%ymm1
+#define RT1x	%xmm1
+#define RIDX0	%ymm2
+#define RIDX1	%ymm3
+#define RIDX1x	%xmm3
+#define RIDX2	%ymm4
+#define RIDX3	%ymm5
+
+/* vpgatherdd mask and '-1' */
+#define RNOT	%ymm6
+
+/* byte mask, (-1 >> 24) */
+#define RBYTE	%ymm7
+
+/***********************************************************************
+ * 32-way AVX2 blowfish
+ ***********************************************************************/
+#define F(xl, xr) \
+	vpsrld $24, xl, RIDX0; \
+	vpsrld $16, xl, RIDX1; \
+	vpsrld $8, xl, RIDX2; \
+	vpand RBYTE, RIDX1, RIDX1; \
+	vpand RBYTE, RIDX2, RIDX2; \
+	vpand RBYTE, xl, RIDX3; \
+	\
+	vpgatherdd RNOT, (RS0, RIDX0, 4), RT0; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	vpcmpeqd RIDX0, RIDX0, RIDX0; \
+	\
+	vpgatherdd RNOT, (RS1, RIDX1, 4), RT1; \
+	vpcmpeqd RIDX1, RIDX1, RIDX1; \
+	vpaddd RT0, RT1, RT0; \
+	\
+	vpgatherdd RIDX0, (RS2, RIDX2, 4), RT1; \
+	vpxor RT0, RT1, RT0; \
+	\
+	vpgatherdd RIDX1, (RS3, RIDX3, 4), RT1; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	vpaddd RT0, RT1, RT0; \
+	\
+	vpxor RT0, xr, xr;
+
+#define add_roundkey(xl, nmem) \
+	vpbroadcastd nmem, RT0; \
+	vpxor RT0, xl ## 0, xl ## 0; \
+	vpxor RT0, xl ## 1, xl ## 1; \
+	vpxor RT0, xl ## 2, xl ## 2; \
+	vpxor RT0, xl ## 3, xl ## 3;
+
+#define round_enc() \
+	add_roundkey(RXr, p(CTX,RLOOP,4)); \
+	F(RXl0, RXr0); \
+	F(RXl1, RXr1); \
+	F(RXl2, RXr2); \
+	F(RXl3, RXr3); \
+	\
+	add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
+	F(RXr0, RXl0); \
+	F(RXr1, RXl1); \
+	F(RXr2, RXl2); \
+	F(RXr3, RXl3);
+
+#define round_dec() \
+	add_roundkey(RXr, p+4*2(CTX,RLOOP,4)); \
+	F(RXl0, RXr0); \
+	F(RXl1, RXr1); \
+	F(RXl2, RXr2); \
+	F(RXl3, RXr3); \
+	\
+	add_roundkey(RXl, p+4(CTX,RLOOP,4)); \
+	F(RXr0, RXl0); \
+	F(RXr1, RXl1); \
+	F(RXr2, RXl2); \
+	F(RXr3, RXl3);
+
+#define init_round_constants() \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	leaq s0(CTX), RS0; \
+	leaq s1(CTX), RS1; \
+	leaq s2(CTX), RS2; \
+	leaq s3(CTX), RS3; \
+	vpsrld $24, RNOT, RBYTE;
+
+#define transpose_2x2(x0, x1, t0) \
+	vpunpckldq x0, x1, t0; \
+	vpunpckhdq x0, x1, x1; \
+	\
+	vpunpcklqdq t0, x1, x0; \
+	vpunpckhqdq t0, x1, x1;
+
+#define read_block(xl, xr) \
+	vbroadcasti128 .Lbswap32_mask, RT1; \
+	\
+	vpshufb RT1, xl ## 0, xl ## 0; \
+	vpshufb RT1, xr ## 0, xr ## 0; \
+	vpshufb RT1, xl ## 1, xl ## 1; \
+	vpshufb RT1, xr ## 1, xr ## 1; \
+	vpshufb RT1, xl ## 2, xl ## 2; \
+	vpshufb RT1, xr ## 2, xr ## 2; \
+	vpshufb RT1, xl ## 3, xl ## 3; \
+	vpshufb RT1, xr ## 3, xr ## 3; \
+	\
+	transpose_2x2(xl ## 0, xr ## 0, RT0); \
+	transpose_2x2(xl ## 1, xr ## 1, RT0); \
+	transpose_2x2(xl ## 2, xr ## 2, RT0); \
+	transpose_2x2(xl ## 3, xr ## 3, RT0);
+
+#define write_block(xl, xr) \
+	vbroadcasti128 .Lbswap32_mask, RT1; \
+	\
+	transpose_2x2(xl ## 0, xr ## 0, RT0); \
+	transpose_2x2(xl ## 1, xr ## 1, RT0); \
+	transpose_2x2(xl ## 2, xr ## 2, RT0); \
+	transpose_2x2(xl ## 3, xr ## 3, RT0); \
+	\
+	vpshufb RT1, xl ## 0, xl ## 0; \
+	vpshufb RT1, xr ## 0, xr ## 0; \
+	vpshufb RT1, xl ## 1, xl ## 1; \
+	vpshufb RT1, xr ## 1, xr ## 1; \
+	vpshufb RT1, xl ## 2, xl ## 2; \
+	vpshufb RT1, xr ## 2, xr ## 2; \
+	vpshufb RT1, xl ## 3, xl ## 3; \
+	vpshufb RT1, xr ## 3, xr ## 3;
+
+.align 8
+__blowfish_enc_blk32:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RXl0..4, RXr0..4: plaintext
+	 * output:
+	 *	RXl0..4, RXr0..4: ciphertext (RXl <=> RXr swapped)
+	 */
+	init_round_constants();
+
+	read_block(RXl, RXr);
+
+	movl $1, RLOOPd;
+	add_roundkey(RXl, p+4*(0)(CTX));
+
+.align 4
+.L__enc_loop:
+	round_enc();
+
+	leal 2(RLOOPd), RLOOPd;
+	cmpl $17, RLOOPd;
+	jne .L__enc_loop;
+
+	add_roundkey(RXr, p+4*(17)(CTX));
+
+	write_block(RXl, RXr);
+
+	ret;
+ENDPROC(__blowfish_enc_blk32)
+
+.align 8
+__blowfish_dec_blk32:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RXl0..4, RXr0..4: ciphertext
+	 * output:
+	 *	RXl0..4, RXr0..4: plaintext (RXl <=> RXr swapped)
+	 */
+	init_round_constants();
+
+	read_block(RXl, RXr);
+
+	movl $14, RLOOPd;
+	add_roundkey(RXl, p+4*(17)(CTX));
+
+.align 4
+.L__dec_loop:
+	round_dec();
+
+	addl $-2, RLOOPd;
+	jns .L__dec_loop;
+
+	add_roundkey(RXr, p+4*(0)(CTX));
+
+	write_block(RXl, RXr);
+
+	ret;
+ENDPROC(__blowfish_dec_blk32)
+
+ENTRY(blowfish_ecb_enc_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	vmovdqu 0*32(%rdx), RXl0;
+	vmovdqu 1*32(%rdx), RXr0;
+	vmovdqu 2*32(%rdx), RXl1;
+	vmovdqu 3*32(%rdx), RXr1;
+	vmovdqu 4*32(%rdx), RXl2;
+	vmovdqu 5*32(%rdx), RXr2;
+	vmovdqu 6*32(%rdx), RXl3;
+	vmovdqu 7*32(%rdx), RXr3;
+
+	call __blowfish_enc_blk32;
+
+	vmovdqu RXr0, 0*32(%rsi);
+	vmovdqu RXl0, 1*32(%rsi);
+	vmovdqu RXr1, 2*32(%rsi);
+	vmovdqu RXl1, 3*32(%rsi);
+	vmovdqu RXr2, 4*32(%rsi);
+	vmovdqu RXl2, 5*32(%rsi);
+	vmovdqu RXr3, 6*32(%rsi);
+	vmovdqu RXl3, 7*32(%rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(blowfish_ecb_enc_32way)
+
+ENTRY(blowfish_ecb_dec_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	vmovdqu 0*32(%rdx), RXl0;
+	vmovdqu 1*32(%rdx), RXr0;
+	vmovdqu 2*32(%rdx), RXl1;
+	vmovdqu 3*32(%rdx), RXr1;
+	vmovdqu 4*32(%rdx), RXl2;
+	vmovdqu 5*32(%rdx), RXr2;
+	vmovdqu 6*32(%rdx), RXl3;
+	vmovdqu 7*32(%rdx), RXr3;
+
+	call __blowfish_dec_blk32;
+
+	vmovdqu RXr0, 0*32(%rsi);
+	vmovdqu RXl0, 1*32(%rsi);
+	vmovdqu RXr1, 2*32(%rsi);
+	vmovdqu RXl1, 3*32(%rsi);
+	vmovdqu RXr2, 4*32(%rsi);
+	vmovdqu RXl2, 5*32(%rsi);
+	vmovdqu RXr3, 6*32(%rsi);
+	vmovdqu RXl3, 7*32(%rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(blowfish_ecb_dec_32way)
+
+ENTRY(blowfish_cbc_dec_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	vmovdqu 0*32(%rdx), RXl0;
+	vmovdqu 1*32(%rdx), RXr0;
+	vmovdqu 2*32(%rdx), RXl1;
+	vmovdqu 3*32(%rdx), RXr1;
+	vmovdqu 4*32(%rdx), RXl2;
+	vmovdqu 5*32(%rdx), RXr2;
+	vmovdqu 6*32(%rdx), RXl3;
+	vmovdqu 7*32(%rdx), RXr3;
+
+	call __blowfish_dec_blk32;
+
+	/* xor with src */
+	vmovq (%rdx), RT0x;
+	vpshufd $0x4f, RT0x, RT0x;
+	vinserti128 $1, 8(%rdx), RT0, RT0;
+	vpxor RT0, RXr0, RXr0;
+	vpxor 0*32+24(%rdx), RXl0, RXl0;
+	vpxor 1*32+24(%rdx), RXr1, RXr1;
+	vpxor 2*32+24(%rdx), RXl1, RXl1;
+	vpxor 3*32+24(%rdx), RXr2, RXr2;
+	vpxor 4*32+24(%rdx), RXl2, RXl2;
+	vpxor 5*32+24(%rdx), RXr3, RXr3;
+	vpxor 6*32+24(%rdx), RXl3, RXl3;
+
+	vmovdqu RXr0, (0*32)(%rsi);
+	vmovdqu RXl0, (1*32)(%rsi);
+	vmovdqu RXr1, (2*32)(%rsi);
+	vmovdqu RXl1, (3*32)(%rsi);
+	vmovdqu RXr2, (4*32)(%rsi);
+	vmovdqu RXl2, (5*32)(%rsi);
+	vmovdqu RXr3, (6*32)(%rsi);
+	vmovdqu RXl3, (7*32)(%rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(blowfish_cbc_dec_32way)
+
+ENTRY(blowfish_ctr_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (big endian, 64bit)
+	 */
+
+	vzeroupper;
+
+	vpcmpeqd RT0, RT0, RT0;
+	vpsrldq $8, RT0, RT0; /* a: -1, b: 0, c: -1, d: 0 */
+
+	vpcmpeqd RT1x, RT1x, RT1x;
+	vpaddq RT1x, RT1x, RT1x; /* a: -2, b: -2 */
+	vpxor RIDX0, RIDX0, RIDX0;
+	vinserti128 $1, RT1x, RIDX0, RIDX0; /* a: 0, b: 0, c: -2, d: -2 */
+
+	vpaddq RIDX0, RT0, RT0; /* a: -1, b: 0, c: -3, d: -2 */
+
+	vpcmpeqd RT1, RT1, RT1;
+	vpaddq RT1, RT1, RT1; /* a: -2, b: -2, c: -2, d: -2 */
+	vpaddq RT1, RT1, RIDX2; /* a: -4, b: -4, c: -4, d: -4 */
+
+	vbroadcasti128 .Lbswap_iv_mask, RIDX0;
+	vbroadcasti128 .Lbswap128_mask, RIDX1;
+
+	/* load IV and byteswap */
+	vmovq (%rcx), RT1x;
+	vinserti128 $1, RT1x, RT1, RT1; /* a: BE, b: 0, c: BE, d: 0 */
+	vpshufb RIDX0, RT1, RT1; /* a: LE, b: LE, c: LE, d: LE */
+
+	/* construct IVs */
+	vpsubq RT0, RT1, RT1;		/* a: le1, b: le0, c: le3, d: le2 */
+	vpshufb RIDX1, RT1, RXl0;	/* a: be0, b: be1, c: be2, d: be3 */
+	vpsubq RIDX2, RT1, RT1;		/* le5, le4, le7, le6 */
+	vpshufb RIDX1, RT1, RXr0;	/* be4, be5, be6, be7 */
+	vpsubq RIDX2, RT1, RT1;
+	vpshufb RIDX1, RT1, RXl1;
+	vpsubq RIDX2, RT1, RT1;
+	vpshufb RIDX1, RT1, RXr1;
+	vpsubq RIDX2, RT1, RT1;
+	vpshufb RIDX1, RT1, RXl2;
+	vpsubq RIDX2, RT1, RT1;
+	vpshufb RIDX1, RT1, RXr2;
+	vpsubq RIDX2, RT1, RT1;
+	vpshufb RIDX1, RT1, RXl3;
+	vpsubq RIDX2, RT1, RT1;
+	vpshufb RIDX1, RT1, RXr3;
+
+	/* store last IV */
+	vpsubq RIDX2, RT1, RT1; /* a: le33, b: le32, ... */
+	vpshufb RIDX1x, RT1x, RT1x; /* a: be32, ... */
+	vmovq RT1x, (%rcx);
+
+	call __blowfish_enc_blk32;
+
+	/* dst = src ^ iv */
+	vpxor 0*32(%rdx), RXr0, RXr0;
+	vpxor 1*32(%rdx), RXl0, RXl0;
+	vpxor 2*32(%rdx), RXr1, RXr1;
+	vpxor 3*32(%rdx), RXl1, RXl1;
+	vpxor 4*32(%rdx), RXr2, RXr2;
+	vpxor 5*32(%rdx), RXl2, RXl2;
+	vpxor 6*32(%rdx), RXr3, RXr3;
+	vpxor 7*32(%rdx), RXl3, RXl3;
+	vmovdqu RXr0, (0*32)(%rsi);
+	vmovdqu RXl0, (1*32)(%rsi);
+	vmovdqu RXr1, (2*32)(%rsi);
+	vmovdqu RXl1, (3*32)(%rsi);
+	vmovdqu RXr2, (4*32)(%rsi);
+	vmovdqu RXl2, (5*32)(%rsi);
+	vmovdqu RXr3, (6*32)(%rsi);
+	vmovdqu RXl3, (7*32)(%rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(blowfish_ctr_32way)

+ 585 - 0
arch/x86/crypto/blowfish_avx2_glue.c

@@ -0,0 +1,585 @@
+/*
+ * Glue Code for x86_64/AVX2 assembler optimized version of Blowfish
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ *   (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/blowfish.h>
+#include <crypto/cryptd.h>
+#include <crypto/ctr.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <asm/crypto/blowfish.h>
+#include <asm/crypto/ablk_helper.h>
+#include <crypto/scatterwalk.h>
+
+#define BF_AVX2_PARALLEL_BLOCKS 32
+
+/* 32-way AVX2 parallel cipher functions */
+asmlinkage void blowfish_ecb_enc_32way(struct bf_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void blowfish_ecb_dec_32way(struct bf_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void blowfish_cbc_dec_32way(struct bf_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void blowfish_ctr_32way(struct bf_ctx *ctx, u8 *dst, const u8 *src,
+				   __be64 *iv);
+
+static inline bool bf_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	if (fpu_enabled)
+		return true;
+
+	/* FPU is only used when chunk to be processed is large enough, so
+	 * do not enable FPU until it is necessary.
+	 */
+	if (nbytes < BF_BLOCK_SIZE * BF_AVX2_PARALLEL_BLOCKS)
+		return false;
+
+	kernel_fpu_begin();
+	return true;
+}
+
+static inline void bf_fpu_end(bool fpu_enabled)
+{
+	if (fpu_enabled)
+		kernel_fpu_end();
+}
+
+static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
+		     bool enc)
+{
+	bool fpu_enabled = false;
+	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = BF_BLOCK_SIZE;
+	unsigned int nbytes;
+	int err;
+
+	err = blkcipher_walk_virt(desc, walk);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	while ((nbytes = walk->nbytes)) {
+		u8 *wsrc = walk->src.virt.addr;
+		u8 *wdst = walk->dst.virt.addr;
+
+		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
+
+		/* Process multi-block AVX2 batch */
+		if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
+			do {
+				if (enc)
+					blowfish_ecb_enc_32way(ctx, wdst, wsrc);
+				else
+					blowfish_ecb_dec_32way(ctx, wdst, wsrc);
+
+				wsrc += bsize * BF_AVX2_PARALLEL_BLOCKS;
+				wdst += bsize * BF_AVX2_PARALLEL_BLOCKS;
+				nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS;
+			} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
+
+			if (nbytes < bsize)
+				goto done;
+		}
+
+		/* Process multi-block batch */
+		if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
+			do {
+				if (enc)
+					blowfish_enc_blk_4way(ctx, wdst, wsrc);
+				else
+					blowfish_dec_blk_4way(ctx, wdst, wsrc);
+
+				wsrc += bsize * BF_PARALLEL_BLOCKS;
+				wdst += bsize * BF_PARALLEL_BLOCKS;
+				nbytes -= bsize * BF_PARALLEL_BLOCKS;
+			} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
+
+			if (nbytes < bsize)
+				goto done;
+		}
+
+		/* Handle leftovers */
+		do {
+			if (enc)
+				blowfish_enc_blk(ctx, wdst, wsrc);
+			else
+				blowfish_dec_blk(ctx, wdst, wsrc);
+
+			wsrc += bsize;
+			wdst += bsize;
+			nbytes -= bsize;
+		} while (nbytes >= bsize);
+
+done:
+		err = blkcipher_walk_done(desc, walk, nbytes);
+	}
+
+	bf_fpu_end(fpu_enabled);
+	return err;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return ecb_crypt(desc, &walk, true);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	return ecb_crypt(desc, &walk, false);
+}
+
+static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
+				  struct blkcipher_walk *walk)
+{
+	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	unsigned int bsize = BF_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u64 *src = (u64 *)walk->src.virt.addr;
+	u64 *dst = (u64 *)walk->dst.virt.addr;
+	u64 *iv = (u64 *)walk->iv;
+
+	do {
+		*dst = *src ^ *iv;
+		blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
+		iv = dst;
+
+		src += 1;
+		dst += 1;
+		nbytes -= bsize;
+	} while (nbytes >= bsize);
+
+	*(u64 *)walk->iv = *iv;
+	return nbytes;
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	while ((nbytes = walk.nbytes)) {
+		nbytes = __cbc_encrypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	return err;
+}
+
+static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
+				  struct blkcipher_walk *walk)
+{
+	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	const unsigned int bsize = BF_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u64 *src = (u64 *)walk->src.virt.addr;
+	u64 *dst = (u64 *)walk->dst.virt.addr;
+	u64 last_iv;
+	int i;
+
+	/* Start of the last block. */
+	src += nbytes / bsize - 1;
+	dst += nbytes / bsize - 1;
+
+	last_iv = *src;
+
+	/* Process multi-block AVX2 batch */
+	if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
+		do {
+			nbytes -= bsize * (BF_AVX2_PARALLEL_BLOCKS - 1);
+			src -= BF_AVX2_PARALLEL_BLOCKS - 1;
+			dst -= BF_AVX2_PARALLEL_BLOCKS - 1;
+
+			blowfish_cbc_dec_32way(ctx, (u8 *)dst, (u8 *)src);
+
+			nbytes -= bsize;
+			if (nbytes < bsize)
+				goto done;
+
+			*dst ^= *(src - 1);
+			src -= 1;
+			dst -= 1;
+		} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* Process multi-block batch */
+	if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
+		u64 ivs[BF_PARALLEL_BLOCKS - 1];
+
+		do {
+			nbytes -= bsize * (BF_PARALLEL_BLOCKS - 1);
+			src -= BF_PARALLEL_BLOCKS - 1;
+			dst -= BF_PARALLEL_BLOCKS - 1;
+
+			for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++)
+				ivs[i] = src[i];
+
+			blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src);
+
+			for (i = 0; i < BF_PARALLEL_BLOCKS - 1; i++)
+				dst[i + 1] ^= ivs[i];
+
+			nbytes -= bsize;
+			if (nbytes < bsize)
+				goto done;
+
+			*dst ^= *(src - 1);
+			src -= 1;
+			dst -= 1;
+		} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* Handle leftovers */
+	for (;;) {
+		blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src);
+
+		nbytes -= bsize;
+		if (nbytes < bsize)
+			break;
+
+		*dst ^= *(src - 1);
+		src -= 1;
+		dst -= 1;
+	}
+
+done:
+	*dst ^= *(u64 *)walk->iv;
+	*(u64 *)walk->iv = last_iv;
+
+	return nbytes;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	bool fpu_enabled = false;
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	while ((nbytes = walk.nbytes)) {
+		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
+		nbytes = __cbc_decrypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	bf_fpu_end(fpu_enabled);
+	return err;
+}
+
+static void ctr_crypt_final(struct blkcipher_desc *desc,
+			    struct blkcipher_walk *walk)
+{
+	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	u8 *ctrblk = walk->iv;
+	u8 keystream[BF_BLOCK_SIZE];
+	u8 *src = walk->src.virt.addr;
+	u8 *dst = walk->dst.virt.addr;
+	unsigned int nbytes = walk->nbytes;
+
+	blowfish_enc_blk(ctx, keystream, ctrblk);
+	crypto_xor(keystream, src, nbytes);
+	memcpy(dst, keystream, nbytes);
+
+	crypto_inc(ctrblk, BF_BLOCK_SIZE);
+}
+
+static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
+				struct blkcipher_walk *walk)
+{
+	struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	unsigned int bsize = BF_BLOCK_SIZE;
+	unsigned int nbytes = walk->nbytes;
+	u64 *src = (u64 *)walk->src.virt.addr;
+	u64 *dst = (u64 *)walk->dst.virt.addr;
+	int i;
+
+	/* Process multi-block AVX2 batch */
+	if (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS) {
+		do {
+			blowfish_ctr_32way(ctx, (u8 *)dst, (u8 *)src,
+					   (__be64 *)walk->iv);
+
+			src += BF_AVX2_PARALLEL_BLOCKS;
+			dst += BF_AVX2_PARALLEL_BLOCKS;
+			nbytes -= bsize * BF_AVX2_PARALLEL_BLOCKS;
+		} while (nbytes >= bsize * BF_AVX2_PARALLEL_BLOCKS);
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* Process four block batch */
+	if (nbytes >= bsize * BF_PARALLEL_BLOCKS) {
+		__be64 ctrblocks[BF_PARALLEL_BLOCKS];
+		u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
+
+		do {
+			/* create ctrblks for parallel encrypt */
+			for (i = 0; i < BF_PARALLEL_BLOCKS; i++) {
+				if (dst != src)
+					dst[i] = src[i];
+
+				ctrblocks[i] = cpu_to_be64(ctrblk++);
+			}
+
+			blowfish_enc_blk_xor_4way(ctx, (u8 *)dst,
+						  (u8 *)ctrblocks);
+
+			src += BF_PARALLEL_BLOCKS;
+			dst += BF_PARALLEL_BLOCKS;
+			nbytes -= bsize * BF_PARALLEL_BLOCKS;
+		} while (nbytes >= bsize * BF_PARALLEL_BLOCKS);
+
+		*(__be64 *)walk->iv = cpu_to_be64(ctrblk);
+
+		if (nbytes < bsize)
+			goto done;
+	}
+
+	/* Handle leftovers */
+	do {
+		u64 ctrblk;
+
+		if (dst != src)
+			*dst = *src;
+
+		ctrblk = *(u64 *)walk->iv;
+		be64_add_cpu((__be64 *)walk->iv, 1);
+
+		blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
+
+		src += 1;
+		dst += 1;
+	} while ((nbytes -= bsize) >= bsize);
+
+done:
+	return nbytes;
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	bool fpu_enabled = false;
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) {
+		fpu_enabled = bf_fpu_begin(fpu_enabled, nbytes);
+		nbytes = __ctr_crypt(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+
+	bf_fpu_end(fpu_enabled);
+
+	if (walk.nbytes) {
+		ctr_crypt_final(desc, &walk);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+
+	return err;
+}
+
+static struct crypto_alg bf_algs[6] = { {
+	.cra_name		= "__ecb-blowfish-avx2",
+	.cra_driver_name	= "__driver-ecb-blowfish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= BF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct bf_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= BF_MIN_KEY_SIZE,
+			.max_keysize	= BF_MAX_KEY_SIZE,
+			.setkey		= blowfish_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__cbc-blowfish-avx2",
+	.cra_driver_name	= "__driver-cbc-blowfish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= BF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct bf_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= BF_MIN_KEY_SIZE,
+			.max_keysize	= BF_MAX_KEY_SIZE,
+			.setkey		= blowfish_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__ctr-blowfish-avx2",
+	.cra_driver_name	= "__driver-ctr-blowfish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct bf_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= BF_MIN_KEY_SIZE,
+			.max_keysize	= BF_MAX_KEY_SIZE,
+			.ivsize		= BF_BLOCK_SIZE,
+			.setkey		= blowfish_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+}, {
+	.cra_name		= "ecb(blowfish)",
+	.cra_driver_name	= "ecb-blowfish-avx2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= BF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= BF_MIN_KEY_SIZE,
+			.max_keysize	= BF_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(blowfish)",
+	.cra_driver_name	= "cbc-blowfish-avx2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= BF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= BF_MIN_KEY_SIZE,
+			.max_keysize	= BF_MAX_KEY_SIZE,
+			.ivsize		= BF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= __ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(blowfish)",
+	.cra_driver_name	= "ctr-blowfish-avx2",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= BF_MIN_KEY_SIZE,
+			.max_keysize	= BF_MAX_KEY_SIZE,
+			.ivsize		= BF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt,
+			.geniv		= "chainiv",
+		},
+	},
+} };
+
+
+static int __init init(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx2 || !cpu_has_osxsave) {
+		pr_info("AVX2 instructions are not detected.\n");
+		return -ENODEV;
+	}
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX detected but unusable.\n");
+		return -ENODEV;
+	}
+
+	return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs));
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs));
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Blowfish Cipher Algorithm, AVX2 optimized");
+MODULE_ALIAS("blowfish");
+MODULE_ALIAS("blowfish-asm");

+ 8 - 24
arch/x86/crypto/blowfish_glue.c

@@ -1,7 +1,7 @@
 /*
  * Glue Code for assembler optimized version of Blowfish
  *
- * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
  *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
@@ -32,40 +32,24 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <crypto/algapi.h>
+#include <asm/crypto/blowfish.h>
 
 /* regular block cipher functions */
 asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
 				   bool xor);
+EXPORT_SYMBOL_GPL(__blowfish_enc_blk);
+
 asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
+EXPORT_SYMBOL_GPL(blowfish_dec_blk);
 
 /* 4-way parallel cipher functions */
 asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
 					const u8 *src, bool xor);
+EXPORT_SYMBOL_GPL(__blowfish_enc_blk_4way);
+
 asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
 				      const u8 *src);
-
-static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
-{
-	__blowfish_enc_blk(ctx, dst, src, false);
-}
-
-static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
-					const u8 *src)
-{
-	__blowfish_enc_blk(ctx, dst, src, true);
-}
-
-static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
-					 const u8 *src)
-{
-	__blowfish_enc_blk_4way(ctx, dst, src, false);
-}
-
-static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
-				      const u8 *src)
-{
-	__blowfish_enc_blk_4way(ctx, dst, src, true);
-}
+EXPORT_SYMBOL_GPL(blowfish_dec_blk_4way);
 
 static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {

+ 179 - 1
arch/x86/crypto/camellia-aesni-avx-asm_64.S

@@ -1,7 +1,7 @@
 /*
  * x86_64/AVX/AES-NI assembler implementation of Camellia
  *
- * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -589,6 +589,10 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 
+/* For XTS mode IV generation */
+.Lxts_gf128mul_and_shl1_mask:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+
 /*
  * pre-SubByte transform
  *
@@ -1090,3 +1094,177 @@ ENTRY(camellia_ctr_16way)
 
 	ret;
 ENDPROC(camellia_ctr_16way)
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+	vpsrad $31, iv, tmp; \
+	vpaddq iv, iv, iv; \
+	vpshufd $0x13, tmp, tmp; \
+	vpand mask, tmp, tmp; \
+	vpxor tmp, iv, iv;
+
+.align 8
+camellia_xts_crypt_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 *	%r8: index for input whitening key
+	 *	%r9: pointer to  __camellia_enc_blk16 or __camellia_dec_blk16
+	 */
+
+	subq $(16 * 16), %rsp;
+	movq %rsp, %rax;
+
+	vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;
+
+	/* load IV */
+	vmovdqu (%rcx), %xmm0;
+	vpxor 0 * 16(%rdx), %xmm0, %xmm15;
+	vmovdqu %xmm15, 15 * 16(%rax);
+	vmovdqu %xmm0, 0 * 16(%rsi);
+
+	/* construct IVs */
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 1 * 16(%rdx), %xmm0, %xmm15;
+	vmovdqu %xmm15, 14 * 16(%rax);
+	vmovdqu %xmm0, 1 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 2 * 16(%rdx), %xmm0, %xmm13;
+	vmovdqu %xmm0, 2 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 3 * 16(%rdx), %xmm0, %xmm12;
+	vmovdqu %xmm0, 3 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 4 * 16(%rdx), %xmm0, %xmm11;
+	vmovdqu %xmm0, 4 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 5 * 16(%rdx), %xmm0, %xmm10;
+	vmovdqu %xmm0, 5 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 6 * 16(%rdx), %xmm0, %xmm9;
+	vmovdqu %xmm0, 6 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 7 * 16(%rdx), %xmm0, %xmm8;
+	vmovdqu %xmm0, 7 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 8 * 16(%rdx), %xmm0, %xmm7;
+	vmovdqu %xmm0, 8 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 9 * 16(%rdx), %xmm0, %xmm6;
+	vmovdqu %xmm0, 9 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 10 * 16(%rdx), %xmm0, %xmm5;
+	vmovdqu %xmm0, 10 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 11 * 16(%rdx), %xmm0, %xmm4;
+	vmovdqu %xmm0, 11 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 12 * 16(%rdx), %xmm0, %xmm3;
+	vmovdqu %xmm0, 12 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 13 * 16(%rdx), %xmm0, %xmm2;
+	vmovdqu %xmm0, 13 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 14 * 16(%rdx), %xmm0, %xmm1;
+	vmovdqu %xmm0, 14 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vpxor 15 * 16(%rdx), %xmm0, %xmm15;
+	vmovdqu %xmm15, 0 * 16(%rax);
+	vmovdqu %xmm0, 15 * 16(%rsi);
+
+	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
+	vmovdqu %xmm0, (%rcx);
+
+	/* inpack16_pre: */
+	vmovq (key_table)(CTX, %r8, 8), %xmm15;
+	vpshufb .Lpack_bswap, %xmm15, %xmm15;
+	vpxor 0 * 16(%rax), %xmm15, %xmm0;
+	vpxor %xmm1, %xmm15, %xmm1;
+	vpxor %xmm2, %xmm15, %xmm2;
+	vpxor %xmm3, %xmm15, %xmm3;
+	vpxor %xmm4, %xmm15, %xmm4;
+	vpxor %xmm5, %xmm15, %xmm5;
+	vpxor %xmm6, %xmm15, %xmm6;
+	vpxor %xmm7, %xmm15, %xmm7;
+	vpxor %xmm8, %xmm15, %xmm8;
+	vpxor %xmm9, %xmm15, %xmm9;
+	vpxor %xmm10, %xmm15, %xmm10;
+	vpxor %xmm11, %xmm15, %xmm11;
+	vpxor %xmm12, %xmm15, %xmm12;
+	vpxor %xmm13, %xmm15, %xmm13;
+	vpxor 14 * 16(%rax), %xmm15, %xmm14;
+	vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+	call *%r9;
+
+	addq $(16 * 16), %rsp;
+
+	vpxor 0 * 16(%rsi), %xmm7, %xmm7;
+	vpxor 1 * 16(%rsi), %xmm6, %xmm6;
+	vpxor 2 * 16(%rsi), %xmm5, %xmm5;
+	vpxor 3 * 16(%rsi), %xmm4, %xmm4;
+	vpxor 4 * 16(%rsi), %xmm3, %xmm3;
+	vpxor 5 * 16(%rsi), %xmm2, %xmm2;
+	vpxor 6 * 16(%rsi), %xmm1, %xmm1;
+	vpxor 7 * 16(%rsi), %xmm0, %xmm0;
+	vpxor 8 * 16(%rsi), %xmm15, %xmm15;
+	vpxor 9 * 16(%rsi), %xmm14, %xmm14;
+	vpxor 10 * 16(%rsi), %xmm13, %xmm13;
+	vpxor 11 * 16(%rsi), %xmm12, %xmm12;
+	vpxor 12 * 16(%rsi), %xmm11, %xmm11;
+	vpxor 13 * 16(%rsi), %xmm10, %xmm10;
+	vpxor 14 * 16(%rsi), %xmm9, %xmm9;
+	vpxor 15 * 16(%rsi), %xmm8, %xmm8;
+	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+		     %xmm8, %rsi);
+
+	ret;
+ENDPROC(camellia_xts_crypt_16way)
+
+ENTRY(camellia_xts_enc_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+	xorl %r8d, %r8d; /* input whitening key, 0 for enc */
+
+	leaq __camellia_enc_blk16, %r9;
+
+	jmp camellia_xts_crypt_16way;
+ENDPROC(camellia_xts_enc_16way)
+
+ENTRY(camellia_xts_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	cmpl $16, key_length(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d;  /* input whitening key, last for dec */
+
+	leaq __camellia_dec_blk16, %r9;
+
+	jmp camellia_xts_crypt_16way;
+ENDPROC(camellia_xts_dec_16way)

+ 1368 - 0
arch/x86/crypto/camellia-aesni-avx2-asm_64.S

@@ -0,0 +1,1368 @@
+/*
+ * x86_64/AVX2/AES-NI assembler implementation of Camellia
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/linkage.h>
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct camellia_ctx: */
+#define key_table 0
+#define key_length CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %rdi
+#define RIO %r8
+
+/**********************************************************************
+  helper macros
+ **********************************************************************/
+#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
+	vpand x, mask4bit, tmp0; \
+	vpandn x, mask4bit, x; \
+	vpsrld $4, x, x; \
+	\
+	vpshufb tmp0, lo_t, tmp0; \
+	vpshufb x, hi_t, x; \
+	vpxor tmp0, x, x;
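
filter_8bit evaluates an 8-bit-to-8-bit transform 32 bytes at a time with two vpshufb table lookups: each byte is split into low and high nibbles, each nibble indexes a 16-entry table, and the results are XORed. This works because the pre/post-lookup transforms used here are affine over GF(2), so the transform of a byte equals the XOR of the transforms of its nibble halves, with the constant folded into one table. A scalar C sketch of the same idea (illustrative; names are hypothetical):

#include <stddef.h>
#include <stdint.h>

/* Scalar model of filter_8bit: for a GF(2)-affine byte transform f,
 * precompute lo_t[n] = f_linear(n) ^ c and hi_t[n] = f_linear(n << 4),
 * so that f(x) = lo_t[x & 0xf] ^ hi_t[x >> 4]. */
static void filter_8bit_model(uint8_t *buf, size_t len,
			      const uint8_t lo_t[16], const uint8_t hi_t[16])
{
	for (size_t i = 0; i < len; i++)
		buf[i] = lo_t[buf[i] & 0x0f] ^ hi_t[buf[i] >> 4];
}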
+
+#define ymm0_x xmm0
+#define ymm1_x xmm1
+#define ymm2_x xmm2
+#define ymm3_x xmm3
+#define ymm4_x xmm4
+#define ymm5_x xmm5
+#define ymm6_x xmm6
+#define ymm7_x xmm7
+#define ymm8_x xmm8
+#define ymm9_x xmm9
+#define ymm10_x xmm10
+#define ymm11_x xmm11
+#define ymm12_x xmm12
+#define ymm13_x xmm13
+#define ymm14_x xmm14
+#define ymm15_x xmm15
+
+/*
+ * AES-NI instructions do not support ymmX registers, so we need splitting and
+ * merging.
+ */
+#define vaesenclast256(zero, yreg, tmp) \
+	vextracti128 $1, yreg, tmp##_x; \
+	vaesenclast zero##_x, yreg##_x, yreg##_x; \
+	vaesenclast zero##_x, tmp##_x, tmp##_x; \
+	vinserti128 $1, tmp##_x, yreg, yreg;
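
Because AESENCLAST accepts only 128-bit xmm operands on this hardware, each ymm register is processed as two independent halves. With an all-zero round key, AESENCLAST degenerates to SubBytes followed by ShiftRows, which is exactly the piece of AES the Camellia S-function borrows. A sketch of the same split/encrypt/merge using AVX2 and AES-NI intrinsics (illustrative only; assumes a compiler with <immintrin.h>, built with -mavx2 -maes):

#include <immintrin.h>

/* Illustrative model of vaesenclast256: split the 256-bit value into
 * its two 128-bit lanes, run AESENCLAST with a zero round key on each
 * (leaving pure SubBytes + ShiftRows), then merge the lanes back. */
static __m256i vaesenclast256_model(__m256i y)
{
	__m128i zero = _mm_setzero_si128();
	__m128i lo = _mm256_castsi256_si128(y);
	__m128i hi = _mm256_extracti128_si256(y, 1);

	lo = _mm_aesenclast_si128(lo, zero);
	hi = _mm_aesenclast_si128(hi, zero);

	return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
}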
+
+/**********************************************************************
+  32-way camellia
+ **********************************************************************/
+
+/*
+ * IN:
+ *   x0..x7: byte-sliced AB state
+ *   mem_cd: register pointer storing CD state
+ *   key: index for key material
+ * OUT:
+ *   x0..x7: new byte-sliced CD state
+ */
+#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
+		  t7, mem_cd, key) \
+	/* \
+	 * S-function with AES subbytes \
+	 */ \
+	vbroadcasti128 .Linv_shift_row, t4; \
+	vpbroadcastb .L0f0f0f0f, t7; \
+	vbroadcasti128 .Lpre_tf_lo_s1, t0; \
+	vbroadcasti128 .Lpre_tf_hi_s1, t1; \
+	\
+	/* AES inverse shift rows */ \
+	vpshufb t4, x0, x0; \
+	vpshufb t4, x7, x7; \
+	vpshufb t4, x1, x1; \
+	vpshufb t4, x4, x4; \
+	vpshufb t4, x2, x2; \
+	vpshufb t4, x5, x5; \
+	vpshufb t4, x3, x3; \
+	vpshufb t4, x6, x6; \
+	\
+	/* prefilter sboxes 1, 2 and 3 */ \
+	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
+	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
+	filter_8bit(x0, t0, t1, t7, t6); \
+	filter_8bit(x7, t0, t1, t7, t6); \
+	filter_8bit(x1, t0, t1, t7, t6); \
+	filter_8bit(x4, t0, t1, t7, t6); \
+	filter_8bit(x2, t0, t1, t7, t6); \
+	filter_8bit(x5, t0, t1, t7, t6); \
+	\
+	/* prefilter sbox 4 */ \
+	vpxor t4##_x, t4##_x, t4##_x; \
+	filter_8bit(x3, t2, t3, t7, t6); \
+	filter_8bit(x6, t2, t3, t7, t6); \
+	\
+	/* AES subbytes + AES shift rows */ \
+	vbroadcasti128 .Lpost_tf_lo_s1, t0; \
+	vbroadcasti128 .Lpost_tf_hi_s1, t1; \
+	vaesenclast256(t4, x0, t5); \
+	vaesenclast256(t4, x7, t5); \
+	vaesenclast256(t4, x1, t5); \
+	vaesenclast256(t4, x4, t5); \
+	vaesenclast256(t4, x2, t5); \
+	vaesenclast256(t4, x5, t5); \
+	vaesenclast256(t4, x3, t5); \
+	vaesenclast256(t4, x6, t5); \
+	\
+	/* postfilter sboxes 1 and 4 */ \
+	vbroadcasti128 .Lpost_tf_lo_s3, t2; \
+	vbroadcasti128 .Lpost_tf_hi_s3, t3; \
+	filter_8bit(x0, t0, t1, t7, t6); \
+	filter_8bit(x7, t0, t1, t7, t6); \
+	filter_8bit(x3, t0, t1, t7, t6); \
+	filter_8bit(x6, t0, t1, t7, t6); \
+	\
+	/* postfilter sbox 3 */ \
+	vbroadcasti128 .Lpost_tf_lo_s2, t4; \
+	vbroadcasti128 .Lpost_tf_hi_s2, t5; \
+	filter_8bit(x2, t2, t3, t7, t6); \
+	filter_8bit(x5, t2, t3, t7, t6); \
+	\
+	vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
+	\
+	/* postfilter sbox 2 */ \
+	filter_8bit(x1, t4, t5, t7, t2); \
+	filter_8bit(x4, t4, t5, t7, t2); \
+	\
+	vpsrldq $1, t0, t1; \
+	vpsrldq $2, t0, t2; \
+	vpsrldq $3, t0, t3; \
+	vpsrldq $4, t0, t4; \
+	vpsrldq $5, t0, t5; \
+	vpsrldq $6, t0, t6; \
+	vpsrldq $7, t0, t7; \
+	vpbroadcastb t0##_x, t0; \
+	vpbroadcastb t1##_x, t1; \
+	vpbroadcastb t2##_x, t2; \
+	vpbroadcastb t3##_x, t3; \
+	vpbroadcastb t4##_x, t4; \
+	vpbroadcastb t6##_x, t6; \
+	vpbroadcastb t5##_x, t5; \
+	vpbroadcastb t7##_x, t7; \
+	\
+	/* P-function */ \
+	vpxor x5, x0, x0; \
+	vpxor x6, x1, x1; \
+	vpxor x7, x2, x2; \
+	vpxor x4, x3, x3; \
+	\
+	vpxor x2, x4, x4; \
+	vpxor x3, x5, x5; \
+	vpxor x0, x6, x6; \
+	vpxor x1, x7, x7; \
+	\
+	vpxor x7, x0, x0; \
+	vpxor x4, x1, x1; \
+	vpxor x5, x2, x2; \
+	vpxor x6, x3, x3; \
+	\
+	vpxor x3, x4, x4; \
+	vpxor x0, x5, x5; \
+	vpxor x1, x6, x6; \
+	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
+	\
+	/* Add key material and result to CD (x becomes new CD) */ \
+	\
+	vpxor t7, x0, x0; \
+	vpxor 4 * 32(mem_cd), x0, x0; \
+	\
+	vpxor t6, x1, x1; \
+	vpxor 5 * 32(mem_cd), x1, x1; \
+	\
+	vpxor t5, x2, x2; \
+	vpxor 6 * 32(mem_cd), x2, x2; \
+	\
+	vpxor t4, x3, x3; \
+	vpxor 7 * 32(mem_cd), x3, x3; \
+	\
+	vpxor t3, x4, x4; \
+	vpxor 0 * 32(mem_cd), x4, x4; \
+	\
+	vpxor t2, x5, x5; \
+	vpxor 1 * 32(mem_cd), x5, x5; \
+	\
+	vpxor t1, x6, x6; \
+	vpxor 2 * 32(mem_cd), x6, x6; \
+	\
+	vpxor t0, x7, x7; \
+	vpxor 3 * 32(mem_cd), x7, x7;
+
+/*
+ * Size optimization... with inlined roundsm32, the binary would be over
+ * 5 times larger and only marginally faster.
+ */
+.align 8
+roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
+	roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		  %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
+		  %rcx, (%r9));
+	ret;
+ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)
+
+.align 8
+roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
+	roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
+		  %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
+		  %rax, (%r9));
+	ret;
+ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
+
+/*
+ * IN/OUT:
+ *  x0..x7: byte-sliced AB state preloaded
+ *  mem_ab: byte-sliced AB state in memory
+ *  mem_cd: byte-sliced CD state in memory
+ */
+#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
+	leaq (key_table + (i) * 8)(CTX), %r9; \
+	call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
+	\
+	vmovdqu x0, 4 * 32(mem_cd); \
+	vmovdqu x1, 5 * 32(mem_cd); \
+	vmovdqu x2, 6 * 32(mem_cd); \
+	vmovdqu x3, 7 * 32(mem_cd); \
+	vmovdqu x4, 0 * 32(mem_cd); \
+	vmovdqu x5, 1 * 32(mem_cd); \
+	vmovdqu x6, 2 * 32(mem_cd); \
+	vmovdqu x7, 3 * 32(mem_cd); \
+	\
+	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
+	call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
+	\
+	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
+
+#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
+
+#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
+	/* Store new AB state */ \
+	vmovdqu x4, 4 * 32(mem_ab); \
+	vmovdqu x5, 5 * 32(mem_ab); \
+	vmovdqu x6, 6 * 32(mem_ab); \
+	vmovdqu x7, 7 * 32(mem_ab); \
+	vmovdqu x0, 0 * 32(mem_ab); \
+	vmovdqu x1, 1 * 32(mem_ab); \
+	vmovdqu x2, 2 * 32(mem_ab); \
+	vmovdqu x3, 3 * 32(mem_ab);
+
+#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, i) \
+	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
+	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
+	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
+
+#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, i) \
+	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
+	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
+	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
+
+/*
+ * IN:
+ *  v0..3: byte-sliced 32-bit integers
+ * OUT:
+ *  v0..3: (IN <<< 1)
+ */
+#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
+	vpcmpgtb v0, zero, t0; \
+	vpaddb v0, v0, v0; \
+	vpabsb t0, t0; \
+	\
+	vpcmpgtb v1, zero, t1; \
+	vpaddb v1, v1, v1; \
+	vpabsb t1, t1; \
+	\
+	vpcmpgtb v2, zero, t2; \
+	vpaddb v2, v2, v2; \
+	vpabsb t2, t2; \
+	\
+	vpor t0, v1, v1; \
+	\
+	vpcmpgtb v3, zero, t0; \
+	vpaddb v3, v3, v3; \
+	vpabsb t0, t0; \
+	\
+	vpor t1, v2, v2; \
+	vpor t2, v3, v3; \
+	vpor t0, v0, v0;
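
rol32_1_32 rotates byte-sliced 32-bit words left by one bit. Each register holds one byte plane of the word, so the bit shifted out of a plane (extracted as 0xff by vpcmpgtb against zero, then reduced to 0x01 by vpabsb) must be ORed into the next-more-significant plane, with v3's carry wrapping around into v0. A scalar sketch of the byte-plane view for a single word (hypothetical helper):

#include <stdint.h>

/* One lane of rol32_1_32: v0..v3 are the byte planes of a 32-bit word,
 * v0 least significant.  Rotating the word left by one shifts every
 * plane left and feeds each outgoing MSB into the next plane; the top
 * plane's bit wraps around to the bottom plane. */
static void rol32_1_byteplanes(uint8_t *v0, uint8_t *v1,
			       uint8_t *v2, uint8_t *v3)
{
	uint8_t c0 = *v0 >> 7, c1 = *v1 >> 7, c2 = *v2 >> 7, c3 = *v3 >> 7;

	*v0 = (*v0 << 1) | c3;
	*v1 = (*v1 << 1) | c0;
	*v2 = (*v2 << 1) | c1;
	*v3 = (*v3 << 1) | c2;
}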
+
+/*
+ * IN:
+ *   l: byte-sliced AB state in memory
+ *   r: byte-sliced CD state in memory
+ * OUT:
+ *   x0..x7: new byte-sliced CD state
+ */
+#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
+	      tt1, tt2, tt3, kll, klr, krl, krr) \
+	/* \
+	 * t0 = kll; \
+	 * t0 &= ll; \
+	 * lr ^= rol32(t0, 1); \
+	 */ \
+	vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
+	vpxor tt0, tt0, tt0; \
+	vpbroadcastb t0##_x, t3; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t2; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t1; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t0; \
+	\
+	vpand l0, t0, t0; \
+	vpand l1, t1, t1; \
+	vpand l2, t2, t2; \
+	vpand l3, t3, t3; \
+	\
+	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+	\
+	vpxor l4, t0, l4; \
+	vmovdqu l4, 4 * 32(l); \
+	vpxor l5, t1, l5; \
+	vmovdqu l5, 5 * 32(l); \
+	vpxor l6, t2, l6; \
+	vmovdqu l6, 6 * 32(l); \
+	vpxor l7, t3, l7; \
+	vmovdqu l7, 7 * 32(l); \
+	\
+	/* \
+	 * t2 = krr; \
+	 * t2 |= rr; \
+	 * rl ^= t2; \
+	 */ \
+	\
+	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
+	vpbroadcastb t0##_x, t3; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t2; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t1; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t0; \
+	\
+	vpor 4 * 32(r), t0, t0; \
+	vpor 5 * 32(r), t1, t1; \
+	vpor 6 * 32(r), t2, t2; \
+	vpor 7 * 32(r), t3, t3; \
+	\
+	vpxor 0 * 32(r), t0, t0; \
+	vpxor 1 * 32(r), t1, t1; \
+	vpxor 2 * 32(r), t2, t2; \
+	vpxor 3 * 32(r), t3, t3; \
+	vmovdqu t0, 0 * 32(r); \
+	vmovdqu t1, 1 * 32(r); \
+	vmovdqu t2, 2 * 32(r); \
+	vmovdqu t3, 3 * 32(r); \
+	\
+	/* \
+	 * t2 = krl; \
+	 * t2 &= rl; \
+	 * rr ^= rol32(t2, 1); \
+	 */ \
+	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
+	vpbroadcastb t0##_x, t3; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t2; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t1; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t0; \
+	\
+	vpand 0 * 32(r), t0, t0; \
+	vpand 1 * 32(r), t1, t1; \
+	vpand 2 * 32(r), t2, t2; \
+	vpand 3 * 32(r), t3, t3; \
+	\
+	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
+	\
+	vpxor 4 * 32(r), t0, t0; \
+	vpxor 5 * 32(r), t1, t1; \
+	vpxor 6 * 32(r), t2, t2; \
+	vpxor 7 * 32(r), t3, t3; \
+	vmovdqu t0, 4 * 32(r); \
+	vmovdqu t1, 5 * 32(r); \
+	vmovdqu t2, 6 * 32(r); \
+	vmovdqu t3, 7 * 32(r); \
+	\
+	/* \
+	 * t0 = klr; \
+	 * t0 |= lr; \
+	 * ll ^= t0; \
+	 */ \
+	\
+	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
+	vpbroadcastb t0##_x, t3; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t2; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t1; \
+	vpsrldq $1, t0, t0; \
+	vpbroadcastb t0##_x, t0; \
+	\
+	vpor l4, t0, t0; \
+	vpor l5, t1, t1; \
+	vpor l6, t2, t2; \
+	vpor l7, t3, t3; \
+	\
+	vpxor l0, t0, l0; \
+	vmovdqu l0, 0 * 32(l); \
+	vpxor l1, t1, l1; \
+	vmovdqu l1, 1 * 32(l); \
+	vpxor l2, t2, l2; \
+	vmovdqu l2, 2 * 32(l); \
+	vpxor l3, t3, l3; \
+	vmovdqu l3, 3 * 32(l);
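
The interleaved comment fragments above spell out the scalar FL/FL⁻¹ layer that fls32 applies to the byte-sliced halves. Collected in one place, the four steps look as follows (a scalar sketch assuming the conventional Camellia FL definition; function and parameter names are hypothetical):

#include <stdint.h>

static uint32_t rol32_1(uint32_t x)
{
	return (x << 1) | (x >> 31);
}

/* Scalar model of the fls32 layer, as read from the in-line comments:
 * FL on the (ll, lr) pair and FL^-1 on the (rl, rr) pair, using the
 * four 32-bit subkeys kll/klr/krl/krr. */
static void camellia_fl_layer(uint32_t *ll, uint32_t *lr,
			      uint32_t *rl, uint32_t *rr,
			      uint32_t kll, uint32_t klr,
			      uint32_t krl, uint32_t krr)
{
	*lr ^= rol32_1(kll & *ll);	/* t0 = kll & ll; lr ^= rol32(t0, 1) */
	*rl ^= krr | *rr;		/* t2 = krr | rr; rl ^= t2           */
	*rr ^= rol32_1(krl & *rl);	/* t2 = krl & rl; rr ^= rol32(t2, 1) */
	*ll ^= klr | *lr;		/* t0 = klr | lr; ll ^= t0           */
}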
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+	vpunpckhdq x1, x0, t2; \
+	vpunpckldq x1, x0, x0; \
+	\
+	vpunpckldq x3, x2, t1; \
+	vpunpckhdq x3, x2, x2; \
+	\
+	vpunpckhqdq t1, x0, x1; \
+	vpunpcklqdq t1, x0, x0; \
+	\
+	vpunpckhqdq x2, t2, x3; \
+	vpunpcklqdq x2, t2, x2;
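
transpose_4x4 is a standard 4×4 transpose of 32-bit elements built from dword and qword unpacks; per 128-bit lane, the four input vectors are the rows of a 4×4 matrix and the outputs are its columns. The scalar equivalent (illustrative):

#include <stdint.h>

/* Scalar equivalent of transpose_4x4: swap rows and columns of a 4x4
 * matrix of 32-bit words (the vector version does this once per
 * 128-bit lane). */
static void transpose_4x4_model(uint32_t m[4][4])
{
	for (int i = 0; i < 4; i++)
		for (int j = i + 1; j < 4; j++) {
			uint32_t t = m[i][j];
			m[i][j] = m[j][i];
			m[j][i] = t;
		}
}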
+
+#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
+			      a3, b3, c3, d3, st0, st1) \
+	vmovdqu d2, st0; \
+	vmovdqu d3, st1; \
+	transpose_4x4(a0, a1, a2, a3, d2, d3); \
+	transpose_4x4(b0, b1, b2, b3, d2, d3); \
+	vmovdqu st0, d2; \
+	vmovdqu st1, d3; \
+	\
+	vmovdqu a0, st0; \
+	vmovdqu a1, st1; \
+	transpose_4x4(c0, c1, c2, c3, a0, a1); \
+	transpose_4x4(d0, d1, d2, d3, a0, a1); \
+	\
+	vbroadcasti128 .Lshufb_16x16b, a0; \
+	vmovdqu st1, a1; \
+	vpshufb a0, a2, a2; \
+	vpshufb a0, a3, a3; \
+	vpshufb a0, b0, b0; \
+	vpshufb a0, b1, b1; \
+	vpshufb a0, b2, b2; \
+	vpshufb a0, b3, b3; \
+	vpshufb a0, a1, a1; \
+	vpshufb a0, c0, c0; \
+	vpshufb a0, c1, c1; \
+	vpshufb a0, c2, c2; \
+	vpshufb a0, c3, c3; \
+	vpshufb a0, d0, d0; \
+	vpshufb a0, d1, d1; \
+	vpshufb a0, d2, d2; \
+	vpshufb a0, d3, d3; \
+	vmovdqu d3, st1; \
+	vmovdqu st0, d3; \
+	vpshufb a0, d3, a0; \
+	vmovdqu d2, st0; \
+	\
+	transpose_4x4(a0, b0, c0, d0, d2, d3); \
+	transpose_4x4(a1, b1, c1, d1, d2, d3); \
+	vmovdqu st0, d2; \
+	vmovdqu st1, d3; \
+	\
+	vmovdqu b0, st0; \
+	vmovdqu b1, st1; \
+	transpose_4x4(a2, b2, c2, d2, b0, b1); \
+	transpose_4x4(a3, b3, c3, d3, b0, b1); \
+	vmovdqu st0, b0; \
+	vmovdqu st1, b1; \
+	/* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		     y6, y7, rio, key) \
+	vpbroadcastq key, x0; \
+	vpshufb .Lpack_bswap, x0, x0; \
+	\
+	vpxor 0 * 32(rio), x0, y7; \
+	vpxor 1 * 32(rio), x0, y6; \
+	vpxor 2 * 32(rio), x0, y5; \
+	vpxor 3 * 32(rio), x0, y4; \
+	vpxor 4 * 32(rio), x0, y3; \
+	vpxor 5 * 32(rio), x0, y2; \
+	vpxor 6 * 32(rio), x0, y1; \
+	vpxor 7 * 32(rio), x0, y0; \
+	vpxor 8 * 32(rio), x0, x7; \
+	vpxor 9 * 32(rio), x0, x6; \
+	vpxor 10 * 32(rio), x0, x5; \
+	vpxor 11 * 32(rio), x0, x4; \
+	vpxor 12 * 32(rio), x0, x3; \
+	vpxor 13 * 32(rio), x0, x2; \
+	vpxor 14 * 32(rio), x0, x1; \
+	vpxor 15 * 32(rio), x0, x0;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		      y6, y7, mem_ab, mem_cd) \
+	byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
+			      y4, y5, y6, y7, (mem_ab), (mem_cd)); \
+	\
+	vmovdqu x0, 0 * 32(mem_ab); \
+	vmovdqu x1, 1 * 32(mem_ab); \
+	vmovdqu x2, 2 * 32(mem_ab); \
+	vmovdqu x3, 3 * 32(mem_ab); \
+	vmovdqu x4, 4 * 32(mem_ab); \
+	vmovdqu x5, 5 * 32(mem_ab); \
+	vmovdqu x6, 6 * 32(mem_ab); \
+	vmovdqu x7, 7 * 32(mem_ab); \
+	vmovdqu y0, 0 * 32(mem_cd); \
+	vmovdqu y1, 1 * 32(mem_cd); \
+	vmovdqu y2, 2 * 32(mem_cd); \
+	vmovdqu y3, 3 * 32(mem_cd); \
+	vmovdqu y4, 4 * 32(mem_cd); \
+	vmovdqu y5, 5 * 32(mem_cd); \
+	vmovdqu y6, 6 * 32(mem_cd); \
+	vmovdqu y7, 7 * 32(mem_cd);
+
+/* de-byteslice, apply post-whitening and store blocks */
+#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
+	byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
+			      y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
+	\
+	vmovdqu x0, stack_tmp0; \
+	\
+	vpbroadcastq key, x0; \
+	vpshufb .Lpack_bswap, x0, x0; \
+	\
+	vpxor x0, y7, y7; \
+	vpxor x0, y6, y6; \
+	vpxor x0, y5, y5; \
+	vpxor x0, y4, y4; \
+	vpxor x0, y3, y3; \
+	vpxor x0, y2, y2; \
+	vpxor x0, y1, y1; \
+	vpxor x0, y0, y0; \
+	vpxor x0, x7, x7; \
+	vpxor x0, x6, x6; \
+	vpxor x0, x5, x5; \
+	vpxor x0, x4, x4; \
+	vpxor x0, x3, x3; \
+	vpxor x0, x2, x2; \
+	vpxor x0, x1, x1; \
+	vpxor stack_tmp0, x0, x0;
+
+#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+		     y6, y7, rio) \
+	vmovdqu x0, 0 * 32(rio); \
+	vmovdqu x1, 1 * 32(rio); \
+	vmovdqu x2, 2 * 32(rio); \
+	vmovdqu x3, 3 * 32(rio); \
+	vmovdqu x4, 4 * 32(rio); \
+	vmovdqu x5, 5 * 32(rio); \
+	vmovdqu x6, 6 * 32(rio); \
+	vmovdqu x7, 7 * 32(rio); \
+	vmovdqu y0, 8 * 32(rio); \
+	vmovdqu y1, 9 * 32(rio); \
+	vmovdqu y2, 10 * 32(rio); \
+	vmovdqu y3, 11 * 32(rio); \
+	vmovdqu y4, 12 * 32(rio); \
+	vmovdqu y5, 13 * 32(rio); \
+	vmovdqu y6, 14 * 32(rio); \
+	vmovdqu y7, 15 * 32(rio);
+
+.data
+.align 32
+
+#define SHUFB_BYTES(idx) \
+	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+
+.Lshufb_16x16b:
+	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+.Lpack_bswap:
+	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For XTS mode */
+.Lxts_gf128mul_and_shl1_mask_0:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+.Lxts_gf128mul_and_shl1_mask_1:
+	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox1, sbox2, sbox3:
+ *   swap_bitendianness(
+ *       isom_map_camellia_to_aes(
+ *           camellia_f(
+ *               swap_bitendianness(in)
+ *           )
+ *       )
+ *   )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s1:
+	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
+	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
+.Lpre_tf_hi_s1:
+	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
+	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
+
+/*
+ * pre-SubByte transform
+ *
+ * pre-lookup for sbox4:
+ *   swap_bitendianness(
+ *       isom_map_camellia_to_aes(
+ *           camellia_f(
+ *               swap_bitendianness(in <<< 1)
+ *           )
+ *       )
+ *   )
+ *
+ * (note: '⊕ 0xc5' inside camellia_f())
+ */
+.Lpre_tf_lo_s4:
+	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
+	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
+.Lpre_tf_hi_s4:
+	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
+	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox1, sbox4:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  )
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s1:
+	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
+	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
+.Lpost_tf_hi_s1:
+	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
+	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox2:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  ) <<< 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s2:
+	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
+	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
+.Lpost_tf_hi_s2:
+	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
+	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
+
+/*
+ * post-SubByte transform
+ *
+ * post-lookup for sbox3:
+ *  swap_bitendianness(
+ *      camellia_h(
+ *          isom_map_aes_to_camellia(
+ *              swap_bitendianness(
+ *                  aes_inverse_affine_transform(in)
+ *              )
+ *          )
+ *      )
+ *  ) >>> 1
+ *
+ * (note: '⊕ 0x6e' inside camellia_h())
+ */
+.Lpost_tf_lo_s3:
+	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
+	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
+.Lpost_tf_hi_s3:
+	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
+	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
+
+/* For isolating SubBytes from AESENCLAST, inverse shift row */
+.Linv_shift_row:
+	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
+	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
+
+.align 4
+/* 4-bit mask */
+.L0f0f0f0f:
+	.long 0x0f0f0f0f
+
+.text
+
+.align 8
+__camellia_enc_blk32:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rax: temporary storage, 512 bytes
+	 *	%ymm0..%ymm15: 32 plaintext blocks
+	 * output:
+	 *	%ymm0..%ymm15: 32 encrypted blocks, order swapped:
+	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+	 */
+
+	leaq 8 * 32(%rax), %rcx;
+
+	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		      %ymm15, %rax, %rcx);
+
+	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 0);
+
+	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+	      %ymm15,
+	      ((key_table + (8) * 8) + 0)(CTX),
+	      ((key_table + (8) * 8) + 4)(CTX),
+	      ((key_table + (8) * 8) + 8)(CTX),
+	      ((key_table + (8) * 8) + 12)(CTX));
+
+	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 8);
+
+	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+	      %ymm15,
+	      ((key_table + (16) * 8) + 0)(CTX),
+	      ((key_table + (16) * 8) + 4)(CTX),
+	      ((key_table + (16) * 8) + 8)(CTX),
+	      ((key_table + (16) * 8) + 12)(CTX));
+
+	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 16);
+
+	movl $24, %r8d;
+	cmpl $16, key_length(CTX);
+	jne .Lenc_max32;
+
+.Lenc_done:
+	/* load CD for output */
+	vmovdqu 0 * 32(%rcx), %ymm8;
+	vmovdqu 1 * 32(%rcx), %ymm9;
+	vmovdqu 2 * 32(%rcx), %ymm10;
+	vmovdqu 3 * 32(%rcx), %ymm11;
+	vmovdqu 4 * 32(%rcx), %ymm12;
+	vmovdqu 5 * 32(%rcx), %ymm13;
+	vmovdqu 6 * 32(%rcx), %ymm14;
+	vmovdqu 7 * 32(%rcx), %ymm15;
+
+	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		    %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
+
+	ret;
+
+.align 8
+.Lenc_max32:
+	movl $32, %r8d;
+
+	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+	      %ymm15,
+	      ((key_table + (24) * 8) + 0)(CTX),
+	      ((key_table + (24) * 8) + 4)(CTX),
+	      ((key_table + (24) * 8) + 8)(CTX),
+	      ((key_table + (24) * 8) + 12)(CTX));
+
+	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 24);
+
+	jmp .Lenc_done;
+ENDPROC(__camellia_enc_blk32)
+
+.align 8
+__camellia_dec_blk32:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rax: temporary storage, 512 bytes
+	 *	%r8d: 24 for 16 byte key, 32 for larger
+	 *	%ymm0..%ymm15: 32 encrypted blocks
+	 * output:
+	 *	%ymm0..%ymm15: 32 plaintext blocks, order swapped:
+	 *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+	 */
+
+	leaq 8 * 32(%rax), %rcx;
+
+	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		      %ymm15, %rax, %rcx);
+
+	cmpl $32, %r8d;
+	je .Ldec_max32;
+
+.Ldec_max24:
+	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 16);
+
+	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+	      %ymm15,
+	      ((key_table + (16) * 8) + 8)(CTX),
+	      ((key_table + (16) * 8) + 12)(CTX),
+	      ((key_table + (16) * 8) + 0)(CTX),
+	      ((key_table + (16) * 8) + 4)(CTX));
+
+	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 8);
+
+	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+	      %ymm15,
+	      ((key_table + (8) * 8) + 8)(CTX),
+	      ((key_table + (8) * 8) + 12)(CTX),
+	      ((key_table + (8) * 8) + 0)(CTX),
+	      ((key_table + (8) * 8) + 4)(CTX));
+
+	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 0);
+
+	/* load CD for output */
+	vmovdqu 0 * 32(%rcx), %ymm8;
+	vmovdqu 1 * 32(%rcx), %ymm9;
+	vmovdqu 2 * 32(%rcx), %ymm10;
+	vmovdqu 3 * 32(%rcx), %ymm11;
+	vmovdqu 4 * 32(%rcx), %ymm12;
+	vmovdqu 5 * 32(%rcx), %ymm13;
+	vmovdqu 6 * 32(%rcx), %ymm14;
+	vmovdqu 7 * 32(%rcx), %ymm15;
+
+	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		    %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
+
+	ret;
+
+.align 8
+.Ldec_max32:
+	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rax, %rcx, 24);
+
+	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+	      %ymm15,
+	      ((key_table + (24) * 8) + 8)(CTX),
+	      ((key_table + (24) * 8) + 12)(CTX),
+	      ((key_table + (24) * 8) + 0)(CTX),
+	      ((key_table + (24) * 8) + 4)(CTX));
+
+	jmp .Ldec_max24;
+ENDPROC(__camellia_dec_blk32)
+
+ENTRY(camellia_ecb_enc_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 */
+
+	vzeroupper;
+
+	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rdx, (key_table)(CTX));
+
+	/* now dst can be used as temporary buffer (even in src == dst case) */
+	movq	%rsi, %rax;
+
+	call __camellia_enc_blk32;
+
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(camellia_ecb_enc_32way)
+
+ENTRY(camellia_ecb_dec_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 */
+
+	vzeroupper;
+
+	cmpl $16, key_length(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d; /* max */
+
+	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));
+
+	/* now dst can be used as temporary buffer (even in src == dst case) */
+	movq	%rsi, %rax;
+
+	call __camellia_dec_blk32;
+
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(camellia_ecb_dec_32way)
+
+ENTRY(camellia_cbc_dec_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 */
+
+	vzeroupper;
+
+	cmpl $16, key_length(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d; /* max */
+
+	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
+		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
+		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));
+
+	movq %rsp, %r10;
+	cmpq %rsi, %rdx;
+	je .Lcbc_dec_use_stack;
+
+	/* dst can be used as temporary storage, src is not overwritten. */
+	movq %rsi, %rax;
+	jmp .Lcbc_dec_continue;
+
+.Lcbc_dec_use_stack:
+	/*
+	 * dst still in-use (because dst == src), so use stack for temporary
+	 * storage.
+	 */
+	subq $(16 * 32), %rsp;
+	movq %rsp, %rax;
+
+.Lcbc_dec_continue:
+	call __camellia_dec_blk32;
+
+	vmovdqu %ymm7, (%rax);
+	vpxor %ymm7, %ymm7, %ymm7;
+	vinserti128 $1, (%rdx), %ymm7, %ymm7;
+	vpxor (%rax), %ymm7, %ymm7;
+	movq %r10, %rsp;
+	vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
+	vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
+	vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
+	vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
+	vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
+	vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
+	vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
+	vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
+	vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
+	vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
+	vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
+	vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
+	vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
+	vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
+	vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(camellia_cbc_dec_32way)
+
+#define inc_le128(x, minus_one, tmp) \
+	vpcmpeqq minus_one, x, tmp; \
+	vpsubq minus_one, x, x; \
+	vpslldq $8, tmp, tmp; \
+	vpsubq tmp, x, x;
+
+#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
+	vpcmpeqq minus_one, x, tmp1; \
+	vpcmpeqq minus_two, x, tmp2; \
+	vpsubq minus_two, x, x; \
+	vpor tmp2, tmp1, tmp1; \
+	vpslldq $8, tmp1, tmp1; \
+	vpsubq tmp1, x, x;
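
inc_le128 and add2_le128 advance the 128-bit little-endian CTR counter. SSE/AVX have no 128-bit add, so the carry out of the low qword is detected with vpcmpeqq against -1 (or against -1 and -2 for the two-step version) and then subtracted from the high qword. The scalar arithmetic being emulated, sketched with a hypothetical two-qword counter type:

#include <stdint.h>

struct ctr128 { uint64_t lo, hi; };	/* little-endian 128-bit counter */

/* Scalar model of inc_le128: increment, carrying into the high qword
 * exactly when the low qword wraps (i.e. was all-ones before). */
static void inc_le128_model(struct ctr128 *x)
{
	if (++x->lo == 0)
		x->hi++;
}

/* Scalar model of add2_le128: add 2, carrying when the low qword was
 * -1 or -2, the two cases the vector code matches with vpcmpeqq. */
static void add2_le128_model(struct ctr128 *x)
{
	uint64_t carry = (x->lo >= (uint64_t)-2);

	x->lo += 2;
	x->hi += carry;
}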
+
+ENTRY(camellia_ctr_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 *	%rcx: iv (little endian, 128bit)
+	 */
+
+	vzeroupper;
+
+	movq %rsp, %r10;
+	cmpq %rsi, %rdx;
+	je .Lctr_use_stack;
+
+	/* dst can be used as temporary storage, src is not overwritten. */
+	movq %rsi, %rax;
+	jmp .Lctr_continue;
+
+.Lctr_use_stack:
+	subq $(16 * 32), %rsp;
+	movq %rsp, %rax;
+
+.Lctr_continue:
+	vpcmpeqd %ymm15, %ymm15, %ymm15;
+	vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
+	vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */
+
+	/* load IV and byteswap */
+	vmovdqu (%rcx), %xmm0;
+	vmovdqa %xmm0, %xmm1;
+	inc_le128(%xmm0, %xmm15, %xmm14);
+	vbroadcasti128 .Lbswap128_mask, %ymm14;
+	vinserti128 $1, %xmm0, %ymm1, %ymm0;
+	vpshufb %ymm14, %ymm0, %ymm13;
+	vmovdqu %ymm13, 15 * 32(%rax);
+
+	/* construct IVs */
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
+	vpshufb %ymm14, %ymm0, %ymm13;
+	vmovdqu %ymm13, 14 * 32(%rax);
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm13;
+	vmovdqu %ymm13, 13 * 32(%rax);
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm13;
+	vmovdqu %ymm13, 12 * 32(%rax);
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm13;
+	vmovdqu %ymm13, 11 * 32(%rax);
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm10;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm9;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm8;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm7;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm6;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm5;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm4;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm3;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm2;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vpshufb %ymm14, %ymm0, %ymm1;
+	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
+	vextracti128 $1, %ymm0, %xmm13;
+	vpshufb %ymm14, %ymm0, %ymm0;
+	inc_le128(%xmm13, %xmm15, %xmm14);
+	vmovdqu %xmm13, (%rcx);
+
+	/* inpack32_pre: */
+	vpbroadcastq (key_table)(CTX), %ymm15;
+	vpshufb .Lpack_bswap, %ymm15, %ymm15;
+	vpxor %ymm0, %ymm15, %ymm0;
+	vpxor %ymm1, %ymm15, %ymm1;
+	vpxor %ymm2, %ymm15, %ymm2;
+	vpxor %ymm3, %ymm15, %ymm3;
+	vpxor %ymm4, %ymm15, %ymm4;
+	vpxor %ymm5, %ymm15, %ymm5;
+	vpxor %ymm6, %ymm15, %ymm6;
+	vpxor %ymm7, %ymm15, %ymm7;
+	vpxor %ymm8, %ymm15, %ymm8;
+	vpxor %ymm9, %ymm15, %ymm9;
+	vpxor %ymm10, %ymm15, %ymm10;
+	vpxor 11 * 32(%rax), %ymm15, %ymm11;
+	vpxor 12 * 32(%rax), %ymm15, %ymm12;
+	vpxor 13 * 32(%rax), %ymm15, %ymm13;
+	vpxor 14 * 32(%rax), %ymm15, %ymm14;
+	vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+	call __camellia_enc_blk32;
+
+	movq %r10, %rsp;
+
+	vpxor 0 * 32(%rdx), %ymm7, %ymm7;
+	vpxor 1 * 32(%rdx), %ymm6, %ymm6;
+	vpxor 2 * 32(%rdx), %ymm5, %ymm5;
+	vpxor 3 * 32(%rdx), %ymm4, %ymm4;
+	vpxor 4 * 32(%rdx), %ymm3, %ymm3;
+	vpxor 5 * 32(%rdx), %ymm2, %ymm2;
+	vpxor 6 * 32(%rdx), %ymm1, %ymm1;
+	vpxor 7 * 32(%rdx), %ymm0, %ymm0;
+	vpxor 8 * 32(%rdx), %ymm15, %ymm15;
+	vpxor 9 * 32(%rdx), %ymm14, %ymm14;
+	vpxor 10 * 32(%rdx), %ymm13, %ymm13;
+	vpxor 11 * 32(%rdx), %ymm12, %ymm12;
+	vpxor 12 * 32(%rdx), %ymm11, %ymm11;
+	vpxor 13 * 32(%rdx), %ymm10, %ymm10;
+	vpxor 14 * 32(%rdx), %ymm9, %ymm9;
+	vpxor 15 * 32(%rdx), %ymm8, %ymm8;
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(camellia_ctr_32way)
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+	vpsrad $31, iv, tmp; \
+	vpaddq iv, iv, iv; \
+	vpshufd $0x13, tmp, tmp; \
+	vpand mask, tmp, tmp; \
+	vpxor tmp, iv, iv;
+
+#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
+	vpsrad $31, iv, tmp0; \
+	vpaddq iv, iv, tmp1; \
+	vpsllq $2, iv, iv; \
+	vpshufd $0x13, tmp0, tmp0; \
+	vpsrad $31, tmp1, tmp1; \
+	vpand mask2, tmp0, tmp0; \
+	vpshufd $0x13, tmp1, tmp1; \
+	vpxor tmp0, iv, iv; \
+	vpand mask1, tmp1, tmp1; \
+	vpxor tmp1, iv, iv;
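
gf128mul_x2_ble doubles the tweak twice in one step: in the 32-way code each ymm lane holds two consecutive tweaks, so the sequence advances by α² per lane. Shifting left by two can expel two bits, and each is reduced separately: bit 127 by 0x87·x = 0x10e (the .Lxts_gf128mul_and_shl1_mask_1 constant) and bit 126 by 0x87. A scalar sketch, reusing the hypothetical lo/hi layout from the earlier gf128mul_x_ble sketch:

#include <stdint.h>

struct tweak128 { uint64_t lo, hi; };

/* Scalar model of gf128mul_x2_ble: multiply by x twice in one step.
 * The bit at position 127 reduces by x * 0x87 = 0x10e, the bit at
 * position 126 by 0x87. */
static void gf128mul_x2_ble_model(struct tweak128 *t)
{
	uint64_t out = t->hi >> 62;	/* the two bits leaving the block */

	t->hi = (t->hi << 2) | (t->lo >> 62);
	t->lo <<= 2;
	if (out & 2)
		t->lo ^= 0x10e;
	if (out & 1)
		t->lo ^= 0x87;
}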
+
+.align 8
+camellia_xts_crypt_32way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 *	%r8: index for input whitening key
+	 *	%r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
+	 */
+
+	vzeroupper;
+
+	subq $(16 * 32), %rsp;
+	movq %rsp, %rax;
+
+	vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;
+
+	/* load IV and construct second IV */
+	vmovdqu (%rcx), %xmm0;
+	vmovdqa %xmm0, %xmm15;
+	gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
+	vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
+	vinserti128 $1, %xmm0, %ymm15, %ymm0;
+	vpxor 0 * 32(%rdx), %ymm0, %ymm15;
+	vmovdqu %ymm15, 15 * 32(%rax);
+	vmovdqu %ymm0, 0 * 32(%rsi);
+
+	/* construct IVs */
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 1 * 32(%rdx), %ymm0, %ymm15;
+	vmovdqu %ymm15, 14 * 32(%rax);
+	vmovdqu %ymm0, 1 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 2 * 32(%rdx), %ymm0, %ymm15;
+	vmovdqu %ymm15, 13 * 32(%rax);
+	vmovdqu %ymm0, 2 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 3 * 32(%rdx), %ymm0, %ymm15;
+	vmovdqu %ymm15, 12 * 32(%rax);
+	vmovdqu %ymm0, 3 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 4 * 32(%rdx), %ymm0, %ymm11;
+	vmovdqu %ymm0, 4 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 5 * 32(%rdx), %ymm0, %ymm10;
+	vmovdqu %ymm0, 5 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 6 * 32(%rdx), %ymm0, %ymm9;
+	vmovdqu %ymm0, 6 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 7 * 32(%rdx), %ymm0, %ymm8;
+	vmovdqu %ymm0, 7 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 8 * 32(%rdx), %ymm0, %ymm7;
+	vmovdqu %ymm0, 8 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 9 * 32(%rdx), %ymm0, %ymm6;
+	vmovdqu %ymm0, 9 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 10 * 32(%rdx), %ymm0, %ymm5;
+	vmovdqu %ymm0, 10 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 11 * 32(%rdx), %ymm0, %ymm4;
+	vmovdqu %ymm0, 11 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 12 * 32(%rdx), %ymm0, %ymm3;
+	vmovdqu %ymm0, 12 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 13 * 32(%rdx), %ymm0, %ymm2;
+	vmovdqu %ymm0, 13 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 14 * 32(%rdx), %ymm0, %ymm1;
+	vmovdqu %ymm0, 14 * 32(%rsi);
+
+	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
+	vpxor 15 * 32(%rdx), %ymm0, %ymm15;
+	vmovdqu %ymm15, 0 * 32(%rax);
+	vmovdqu %ymm0, 15 * 32(%rsi);
+
+	vextracti128 $1, %ymm0, %xmm0;
+	gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
+	vmovdqu %xmm0, (%rcx);
+
+	/* inpack32_pre: */
+	vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
+	vpshufb .Lpack_bswap, %ymm15, %ymm15;
+	vpxor 0 * 32(%rax), %ymm15, %ymm0;
+	vpxor %ymm1, %ymm15, %ymm1;
+	vpxor %ymm2, %ymm15, %ymm2;
+	vpxor %ymm3, %ymm15, %ymm3;
+	vpxor %ymm4, %ymm15, %ymm4;
+	vpxor %ymm5, %ymm15, %ymm5;
+	vpxor %ymm6, %ymm15, %ymm6;
+	vpxor %ymm7, %ymm15, %ymm7;
+	vpxor %ymm8, %ymm15, %ymm8;
+	vpxor %ymm9, %ymm15, %ymm9;
+	vpxor %ymm10, %ymm15, %ymm10;
+	vpxor %ymm11, %ymm15, %ymm11;
+	vpxor 12 * 32(%rax), %ymm15, %ymm12;
+	vpxor 13 * 32(%rax), %ymm15, %ymm13;
+	vpxor 14 * 32(%rax), %ymm15, %ymm14;
+	vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+	call *%r9;
+
+	addq $(16 * 32), %rsp;
+
+	vpxor 0 * 32(%rsi), %ymm7, %ymm7;
+	vpxor 1 * 32(%rsi), %ymm6, %ymm6;
+	vpxor 2 * 32(%rsi), %ymm5, %ymm5;
+	vpxor 3 * 32(%rsi), %ymm4, %ymm4;
+	vpxor 4 * 32(%rsi), %ymm3, %ymm3;
+	vpxor 5 * 32(%rsi), %ymm2, %ymm2;
+	vpxor 6 * 32(%rsi), %ymm1, %ymm1;
+	vpxor 7 * 32(%rsi), %ymm0, %ymm0;
+	vpxor 8 * 32(%rsi), %ymm15, %ymm15;
+	vpxor 9 * 32(%rsi), %ymm14, %ymm14;
+	vpxor 10 * 32(%rsi), %ymm13, %ymm13;
+	vpxor 11 * 32(%rsi), %ymm12, %ymm12;
+	vpxor 12 * 32(%rsi), %ymm11, %ymm11;
+	vpxor 13 * 32(%rsi), %ymm10, %ymm10;
+	vpxor 14 * 32(%rsi), %ymm9, %ymm9;
+	vpxor 15 * 32(%rsi), %ymm8, %ymm8;
+	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+		     %ymm8, %rsi);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(camellia_xts_crypt_32way)
+
+ENTRY(camellia_xts_enc_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	xorl %r8d, %r8d; /* input whitening key, 0 for enc */
+
+	leaq __camellia_enc_blk32, %r9;
+
+	jmp camellia_xts_crypt_32way;
+ENDPROC(camellia_xts_enc_32way)
+
+ENTRY(camellia_xts_dec_32way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	cmpl $16, key_length(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d;  /* input whitening key, last for dec */
+
+	leaq __camellia_dec_blk32, %r9;
+
+	jmp camellia_xts_crypt_32way;
+ENDPROC(camellia_xts_dec_32way)

+ 586 - 0
arch/x86/crypto/camellia_aesni_avx2_glue.c

@@ -0,0 +1,586 @@
+/*
+ * Glue Code for x86_64/AVX2/AES-NI assembler optimized version of Camellia
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/ctr.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <asm/crypto/camellia.h>
+#include <asm/crypto/ablk_helper.h>
+#include <asm/crypto/glue_helper.h>
+
+#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
+#define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32
+
+/* 32-way AVX2/AES-NI parallel cipher functions */
+asmlinkage void camellia_ecb_enc_32way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void camellia_ecb_dec_32way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+
+asmlinkage void camellia_cbc_dec_32way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void camellia_ctr_32way(struct camellia_ctx *ctx, u8 *dst,
+				   const u8 *src, le128 *iv);
+
+asmlinkage void camellia_xts_enc_32way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src, le128 *iv);
+asmlinkage void camellia_xts_dec_32way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src, le128 *iv);
+
+static const struct common_glue_ctx camellia_enc = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_32way) }
+	}, {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) }
+	}, {
+		.num_blocks = 2,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
+	} }
+};
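
The common_glue_ctx tables order their entries widest first; the glue layer walks them and uses the largest num_blocks that still fits the data left in the current scatterlist segment, so bulk data takes the 32-way AVX2 path and the tail falls through to the 2-way and single-block routines. A hypothetical, self-contained model of that dispatch (not the kernel's glue_ecb_crypt_128bit, just the shape of it):

#include <stddef.h>
#include <stdint.h>

typedef void (*ecb_fn_t)(void *ctx, uint8_t *dst, const uint8_t *src);

struct ecb_entry { unsigned int num_blocks; ecb_fn_t fn; };

/* Hypothetical model of fn_u dispatch: entries ordered 32, 16, 2, 1;
 * each is applied while enough whole blocks remain, so the final
 * 1-block entry always finishes the tail. */
static void ecb_dispatch(const struct ecb_entry *tbl, size_t n,
			 void *ctx, uint8_t *dst, const uint8_t *src,
			 size_t nblocks, size_t bsize)
{
	for (size_t i = 0; i < n; i++)
		while (nblocks >= tbl[i].num_blocks) {
			tbl[i].fn(ctx, dst, src);
			dst += (size_t)tbl[i].num_blocks * bsize;
			src += (size_t)tbl[i].num_blocks * bsize;
			nblocks -= tbl[i].num_blocks;
		}
}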
+
+static const struct common_glue_ctx camellia_ctr = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_32way) }
+	}, {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) }
+	}, {
+		.num_blocks = 2,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
+	} }
+};
+
+static const struct common_glue_ctx camellia_enc_xts = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_32way) }
+	}, {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) }
+	} }
+};
+
+static const struct common_glue_ctx camellia_dec = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_32way) }
+	}, {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) }
+	}, {
+		.num_blocks = 2,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
+	} }
+};
+
+static const struct common_glue_ctx camellia_dec_cbc = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_32way) }
+	}, {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) }
+	}, {
+		.num_blocks = 2,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
+	} }
+};
+
+static const struct common_glue_ctx camellia_dec_xts = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_32way) }
+	}, {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) }
+	} }
+};
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc,
+				       dst, src, nbytes);
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src,
+				       nbytes);
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);
+}
+
+static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	return glue_fpu_begin(CAMELLIA_BLOCK_SIZE,
+			      CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled,
+			      nbytes);
+}
+
+static inline void camellia_fpu_end(bool fpu_enabled)
+{
+	glue_fpu_end(fpu_enabled);
+}
+
+static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
+			   unsigned int key_len)
+{
+	return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len,
+				 &tfm->crt_flags);
+}
+
+struct crypt_priv {
+	struct camellia_ctx *ctx;
+	bool fpu_enabled;
+};
+
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) {
+		camellia_ecb_enc_32way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS;
+	}
+
+	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+		camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		camellia_enc_blk(ctx->ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) {
+		camellia_ecb_dec_32way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS;
+	}
+
+	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+		camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+		camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
+		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		camellia_dec_blk(ctx->ctx, srcdst, srcdst);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->camellia_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	camellia_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->camellia_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	camellia_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(camellia_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(camellia_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static struct crypto_alg cmll_algs[10] = { {
+	.cra_name		= "__ecb-camellia-aesni-avx2",
+	.cra_driver_name	= "__driver-ecb-camellia-aesni-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.setkey		= camellia_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__cbc-camellia-aesni-avx2",
+	.cra_driver_name	= "__driver-cbc-camellia-aesni-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.setkey		= camellia_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__ctr-camellia-aesni-avx2",
+	.cra_driver_name	= "__driver-ctr-camellia-aesni-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct camellia_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= camellia_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+}, {
+	.cra_name		= "__lrw-camellia-aesni-avx2",
+	.cra_driver_name	= "__driver-lrw-camellia-aesni-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_exit		= lrw_camellia_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE +
+					  CAMELLIA_BLOCK_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE +
+					  CAMELLIA_BLOCK_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= lrw_camellia_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__xts-camellia-aesni-avx2",
+	.cra_driver_name	= "__driver-xts-camellia-aesni-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_xts_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE * 2,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE * 2,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= xts_camellia_setkey,
+			.encrypt	= xts_encrypt,
+			.decrypt	= xts_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ecb(camellia)",
+	.cra_driver_name	= "ecb-camellia-aesni-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(camellia)",
+	.cra_driver_name	= "cbc-camellia-aesni-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= __ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(camellia)",
+	.cra_driver_name	= "ctr-camellia-aesni-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt,
+			.geniv		= "chainiv",
+		},
+	},
+}, {
+	.cra_name		= "lrw(camellia)",
+	.cra_driver_name	= "lrw-camellia-aesni-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE +
+					  CAMELLIA_BLOCK_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE +
+					  CAMELLIA_BLOCK_SIZE,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "xts(camellia)",
+	.cra_driver_name	= "xts-camellia-aesni-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE * 2,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE * 2,
+			.ivsize		= CAMELLIA_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+} };
+
+static int __init camellia_aesni_init(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) {
+		pr_info("AVX2 or AES-NI instructions are not detected.\n");
+		return -ENODEV;
+	}
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX2 detected but unusable.\n");
+		return -ENODEV;
+	}
+
+	return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
+}
+
+static void __exit camellia_aesni_fini(void)
+{
+	crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
+}
+
+module_init(camellia_aesni_init);
+module_exit(camellia_aesni_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX2 optimized");
+MODULE_ALIAS("camellia");
+MODULE_ALIAS("camellia-asm");

+ 62 - 42
arch/x86/crypto/camellia_aesni_avx_glue.c

@@ -1,7 +1,7 @@
 /*
  * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia
  *
- * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -26,16 +26,44 @@

 #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16

-/* 16-way AES-NI parallel cipher functions */
+/* 16-way parallel cipher functions (avx/aes-ni) */
 asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst,
 				       const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_ecb_enc_16way);
+
 asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst,
 				       const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way);

 asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
 				       const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way);
+
 asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
 				   const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(camellia_ctr_16way);
+
+asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(camellia_xts_enc_16way);
+
+asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(camellia_xts_dec_16way);
+
+void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+				  GLUE_FUNC_CAST(camellia_enc_blk));
+}
+EXPORT_SYMBOL_GPL(camellia_xts_enc);
+
+void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+				  GLUE_FUNC_CAST(camellia_dec_blk));
+}
+EXPORT_SYMBOL_GPL(camellia_xts_dec);

 static const struct common_glue_ctx camellia_enc = {
 	.num_funcs = 3,
@@ -69,6 +97,19 @@ static const struct common_glue_ctx camellia_ctr = {
 	} }
 };

+static const struct common_glue_ctx camellia_enc_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) }
+	} }
+};
+
 static const struct common_glue_ctx camellia_dec = {
 	.num_funcs = 3,
 	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
@@ -101,6 +142,19 @@ static const struct common_glue_ctx camellia_dec_cbc = {
 	} }
 };

+static const struct common_glue_ctx camellia_dec_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) }
+	} }
+};
+
 static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
@@ -261,54 +315,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
-	struct crypt_priv crypt_ctx = {
-		.ctx = &ctx->crypt_ctx,
-		.fpu_enabled = false,
-	};
-	struct xts_crypt_req req = {
-		.tbuf = buf,
-		.tbuflen = sizeof(buf),
-
-		.tweak_ctx = &ctx->tweak_ctx,
-		.tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
-		.crypt_ctx = &crypt_ctx,
-		.crypt_fn = encrypt_callback,
-	};
-	int ret;
-
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-	ret = xts_crypt(desc, dst, src, nbytes, &req);
-	camellia_fpu_end(crypt_ctx.fpu_enabled);
-
-	return ret;
+	return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(camellia_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
 }

 static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
-	struct crypt_priv crypt_ctx = {
-		.ctx = &ctx->crypt_ctx,
-		.fpu_enabled = false,
-	};
-	struct xts_crypt_req req = {
-		.tbuf = buf,
-		.tbuflen = sizeof(buf),
-
-		.tweak_ctx = &ctx->tweak_ctx,
-		.tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
-		.crypt_ctx = &crypt_ctx,
-		.crypt_fn = decrypt_callback,
-	};
-	int ret;
-
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-	ret = xts_crypt(desc, dst, src, nbytes, &req);
-	camellia_fpu_end(crypt_ctx.fpu_enabled);
-
-	return ret;
+	return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(camellia_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
 }

 static struct crypto_alg cmll_algs[10] = { {
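
The camellia_enc_xts and camellia_dec_xts tables above follow the common_glue_ctx convention of listing routines widest-first; the dispatcher added to glue_helper.c in this same series keeps calling the widest routine while a full batch of blocks remains, then falls through to the narrower entries. A worked example with illustrative numbers:

	#include <stdio.h>

	/* Worked example, illustrative numbers only: a 50-block XTS request
	 * against the {16-way, 1-way} function table above. */
	int main(void)
	{
		unsigned int nblocks = 50;

		printf("16-way calls: %u (%u blocks)\n",
		       nblocks / 16, (nblocks / 16) * 16);	/* 3 calls, 48 blocks */
		printf("1-way calls:  %u\n", nblocks % 16);	/* 2 tail blocks */
		return 0;
	}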

+ 47 - 1
arch/x86/crypto/cast6-avx-x86_64-asm_64.S

@@ -4,7 +4,7 @@
  * Copyright (C) 2012 Johannes Goetzfried
  *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
  *
- * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -227,6 +227,8 @@
 .data

 .align 16
+.Lxts_gf128mul_and_shl1_mask:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
 .Lbswap_mask:
 	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 .Lbswap128_mask:
@@ -424,3 +426,47 @@ ENTRY(cast6_ctr_8way)

 	ret;
 ENDPROC(cast6_ctr_8way)
+
+ENTRY(cast6_xts_enc_8way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	movq %rsi, %r11;
+
+	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
+	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
+
+	call __cast6_enc_blk8;
+
+	/* dst <= regs xor IVs(in dst) */
+	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+ENDPROC(cast6_xts_enc_8way)
+
+ENTRY(cast6_xts_dec_8way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	movq %rsi, %r11;
+
+	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
+	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
+
+	call __cast6_dec_blk8;
+
+	/* dst <= regs xor IVs(in dst) */
+	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+ENDPROC(cast6_xts_dec_8way)
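
Both routines above share one trick: load_xts_8way computes the eight per-block tweaks, parks them in dst, and leaves src xor tweak in registers; after the 8-way cipher call, store_xts_8way xors the registers against the tweaks parked in dst. A scalar model of that pairing, under illustrative types (this is a sketch, not kernel code):

	#include <stdint.h>

	typedef struct { uint64_t lo, hi; } u128x;	/* illustrative 128-bit block */

	static u128x xor128(u128x a, u128x b)
	{
		a.lo ^= b.lo;
		a.hi ^= b.hi;
		return a;
	}

	static u128x mulx_ble(u128x t)	/* tweak * x in GF(2^128), LE layout */
	{
		uint64_t carry = t.hi >> 63;

		t.hi = (t.hi << 1) | (t.lo >> 63);
		t.lo = (t.lo << 1) ^ (carry * 0x87);
		return t;
	}

	/* Scalar model of the load_xts_8way / store_xts_8way pairing. */
	static void xts8_model(u128x *dst, const u128x *src, u128x *t,
			       void (*cipher8)(u128x blocks[8]))
	{
		u128x regs[8];
		int i;

		for (i = 0; i < 8; i++) {
			dst[i] = *t;			/* dst <= IVs          */
			regs[i] = xor128(src[i], *t);	/* regs <= src xor IVs */
			*t = mulx_ble(*t);		/* next tweak          */
		}
		cipher8(regs);				/* 8-way enc or dec    */
		for (i = 0; i < 8; i++)
			dst[i] = xor128(regs[i], dst[i]); /* dst <= regs xor IVs */
	}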

+ 51 - 40
arch/x86/crypto/cast6_avx_glue.c

@@ -4,6 +4,8 @@
  * Copyright (C) 2012 Johannes Goetzfried
  *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
  *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -50,6 +52,23 @@ asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst,
 asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src,
 			       le128 *iv);

+asmlinkage void cast6_xts_enc_8way(struct cast6_ctx *ctx, u8 *dst,
+				   const u8 *src, le128 *iv);
+asmlinkage void cast6_xts_dec_8way(struct cast6_ctx *ctx, u8 *dst,
+				   const u8 *src, le128 *iv);
+
+static void cast6_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+				  GLUE_FUNC_CAST(__cast6_encrypt));
+}
+
+static void cast6_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+				  GLUE_FUNC_CAST(__cast6_decrypt));
+}
+
 static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 {
 	be128 ctrblk;
@@ -87,6 +106,19 @@ static const struct common_glue_ctx cast6_ctr = {
 	} }
 };

+static const struct common_glue_ctx cast6_enc_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAST6_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc_8way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc) }
+	} }
+};
+
 static const struct common_glue_ctx cast6_dec = {
 	.num_funcs = 2,
 	.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
@@ -113,6 +145,19 @@ static const struct common_glue_ctx cast6_dec_cbc = {
 	} }
 };

+static const struct common_glue_ctx cast6_dec_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = CAST6_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec_8way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec) }
+	} }
+};
+
 static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
@@ -307,54 +352,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[CAST6_PARALLEL_BLOCKS];
-	struct crypt_priv crypt_ctx = {
-		.ctx = &ctx->crypt_ctx,
-		.fpu_enabled = false,
-	};
-	struct xts_crypt_req req = {
-		.tbuf = buf,
-		.tbuflen = sizeof(buf),
-
-		.tweak_ctx = &ctx->tweak_ctx,
-		.tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt),
-		.crypt_ctx = &crypt_ctx,
-		.crypt_fn = encrypt_callback,
-	};
-	int ret;
-
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-	ret = xts_crypt(desc, dst, src, nbytes, &req);
-	cast6_fpu_end(crypt_ctx.fpu_enabled);
-
-	return ret;
+	return glue_xts_crypt_128bit(&cast6_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(__cast6_encrypt),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
 }

 static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct cast6_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[CAST6_PARALLEL_BLOCKS];
-	struct crypt_priv crypt_ctx = {
-		.ctx = &ctx->crypt_ctx,
-		.fpu_enabled = false,
-	};
-	struct xts_crypt_req req = {
-		.tbuf = buf,
-		.tbuflen = sizeof(buf),
-
-		.tweak_ctx = &ctx->tweak_ctx,
-		.tweak_fn = XTS_TWEAK_CAST(__cast6_encrypt),
-		.crypt_ctx = &crypt_ctx,
-		.crypt_fn = decrypt_callback,
-	};
-	int ret;
-
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-	ret = xts_crypt(desc, dst, src, nbytes, &req);
-	cast6_fpu_end(crypt_ctx.fpu_enabled);
-
-	return ret;
+	return glue_xts_crypt_128bit(&cast6_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(__cast6_encrypt),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
 }

 static struct crypto_alg cast6_algs[10] = { {

+ 3 - 3
arch/x86/crypto/crc32-pclmul_asm.S

@@ -101,9 +101,8 @@
  *      uint crc32_pclmul_le_16(unsigned char const *buffer,
  *	                     size_t len, uint crc32)
  */
-.globl crc32_pclmul_le_16
-.align 4, 0x90
-crc32_pclmul_le_16:/* buffer and buffer size are 16 bytes aligned */
+
+ENTRY(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
 	movdqa  (BUF), %xmm1
 	movdqa  0x10(BUF), %xmm2
 	movdqa  0x20(BUF), %xmm3
@@ -244,3 +243,4 @@ fold_64:
 	pextrd  $0x01, %xmm1, %eax

 	ret
+ENDPROC(crc32_pclmul_le_16)

+ 6 - 4
arch/x86/crypto/crc32c-pcl-intel-asm_64.S

@@ -1,9 +1,10 @@
 /*
  * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
  *
- * The white paper on CRC32C calculations with PCLMULQDQ instruction can be
+ * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
  * downloaded from:
- * http://download.intel.com/design/intarch/papers/323405.pdf
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
  *
  * Copyright (C) 2012 Intel Corporation.
  *
@@ -42,6 +43,7 @@
  * SOFTWARE.
  */

+#include <asm/inst.h>
 #include <linux/linkage.h>

 ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
@@ -225,10 +227,10 @@ LABEL crc_ %i
 	movdqa  (bufp), %xmm0			# 2 consts: K1:K2

 	movq    crc_init, %xmm1			# CRC for block 1
-	pclmulqdq $0x00,%xmm0,%xmm1		# Multiply by K2
+	PCLMULQDQ 0x00,%xmm0,%xmm1		# Multiply by K2

 	movq    crc1, %xmm2			# CRC for block 2
-	pclmulqdq $0x10, %xmm0, %xmm2		# Multiply by K1
+	PCLMULQDQ 0x10, %xmm0, %xmm2		# Multiply by K1

 	pxor    %xmm2,%xmm1
 	movq    %xmm1, %rax

+ 60 - 1
arch/x86/crypto/glue_helper-asm-avx.S

@@ -1,7 +1,7 @@
 /*
  * Shared glue code for 128bit block ciphers, AVX assembler macros
  *
- * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -89,3 +89,62 @@
 	vpxor (6*16)(src), x6, x6; \
 	vpxor (7*16)(src), x7, x7; \
 	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+	vpsrad $31, iv, tmp; \
+	vpaddq iv, iv, iv; \
+	vpshufd $0x13, tmp, tmp; \
+	vpand mask, tmp, tmp; \
+	vpxor tmp, iv, iv;
+
+#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \
+		      t1, xts_gf128mul_and_shl1_mask) \
+	vmovdqa xts_gf128mul_and_shl1_mask, t0; \
+	\
+	/* load IV */ \
+	vmovdqu (iv), tiv; \
+	vpxor (0*16)(src), tiv, x0; \
+	vmovdqu tiv, (0*16)(dst); \
+	\
+	/* construct and store IVs, also xor with source */ \
+	gf128mul_x_ble(tiv, t0, t1); \
+	vpxor (1*16)(src), tiv, x1; \
+	vmovdqu tiv, (1*16)(dst); \
+	\
+	gf128mul_x_ble(tiv, t0, t1); \
+	vpxor (2*16)(src), tiv, x2; \
+	vmovdqu tiv, (2*16)(dst); \
+	\
+	gf128mul_x_ble(tiv, t0, t1); \
+	vpxor (3*16)(src), tiv, x3; \
+	vmovdqu tiv, (3*16)(dst); \
+	\
+	gf128mul_x_ble(tiv, t0, t1); \
+	vpxor (4*16)(src), tiv, x4; \
+	vmovdqu tiv, (4*16)(dst); \
+	\
+	gf128mul_x_ble(tiv, t0, t1); \
+	vpxor (5*16)(src), tiv, x5; \
+	vmovdqu tiv, (5*16)(dst); \
+	\
+	gf128mul_x_ble(tiv, t0, t1); \
+	vpxor (6*16)(src), tiv, x6; \
+	vmovdqu tiv, (6*16)(dst); \
+	\
+	gf128mul_x_ble(tiv, t0, t1); \
+	vpxor (7*16)(src), tiv, x7; \
+	vmovdqu tiv, (7*16)(dst); \
+	\
+	gf128mul_x_ble(tiv, t0, t1); \
+	vmovdqu tiv, (iv);
+
+#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vpxor (0*16)(dst), x0, x0; \
+	vpxor (1*16)(dst), x1, x1; \
+	vpxor (2*16)(dst), x2, x2; \
+	vpxor (3*16)(dst), x3, x3; \
+	vpxor (4*16)(dst), x4, x4; \
+	vpxor (5*16)(dst), x5, x5; \
+	vpxor (6*16)(dst), x6, x6; \
+	vpxor (7*16)(dst), x7, x7; \
+	store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);

+ 180 - 0
arch/x86/crypto/glue_helper-asm-avx2.S

@@ -0,0 +1,180 @@
+/*
+ * Shared glue code for 128bit block ciphers, AVX2 assembler macros
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#define load_16way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vmovdqu (0*32)(src), x0; \
+	vmovdqu (1*32)(src), x1; \
+	vmovdqu (2*32)(src), x2; \
+	vmovdqu (3*32)(src), x3; \
+	vmovdqu (4*32)(src), x4; \
+	vmovdqu (5*32)(src), x5; \
+	vmovdqu (6*32)(src), x6; \
+	vmovdqu (7*32)(src), x7;
+
+#define store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vmovdqu x0, (0*32)(dst); \
+	vmovdqu x1, (1*32)(dst); \
+	vmovdqu x2, (2*32)(dst); \
+	vmovdqu x3, (3*32)(dst); \
+	vmovdqu x4, (4*32)(dst); \
+	vmovdqu x5, (5*32)(dst); \
+	vmovdqu x6, (6*32)(dst); \
+	vmovdqu x7, (7*32)(dst);
+
+#define store_cbc_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7, t0) \
+	vpxor t0, t0, t0; \
+	vinserti128 $1, (src), t0, t0; \
+	vpxor t0, x0, x0; \
+	vpxor (0*32+16)(src), x1, x1; \
+	vpxor (1*32+16)(src), x2, x2; \
+	vpxor (2*32+16)(src), x3, x3; \
+	vpxor (3*32+16)(src), x4, x4; \
+	vpxor (4*32+16)(src), x5, x5; \
+	vpxor (5*32+16)(src), x6, x6; \
+	vpxor (6*32+16)(src), x7, x7; \
+	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define inc_le128(x, minus_one, tmp) \
+	vpcmpeqq minus_one, x, tmp; \
+	vpsubq minus_one, x, x; \
+	vpslldq $8, tmp, tmp; \
+	vpsubq tmp, x, x;
+
+#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
+	vpcmpeqq minus_one, x, tmp1; \
+	vpcmpeqq minus_two, x, tmp2; \
+	vpsubq minus_two, x, x; \
+	vpor tmp2, tmp1, tmp1; \
+	vpslldq $8, tmp1, tmp1; \
+	vpsubq tmp1, x, x;
+
+#define load_ctr_16way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t0x, t1, \
+		       t1x, t2, t2x, t3, t3x, t4, t5) \
+	vpcmpeqd t0, t0, t0; \
+	vpsrldq $8, t0, t0; /* ab: -1:0 ; cd: -1:0 */ \
+	vpaddq t0, t0, t4; /* ab: -2:0 ; cd: -2:0 */\
+	\
+	/* load IV and byteswap */ \
+	vmovdqu (iv), t2x; \
+	vmovdqa t2x, t3x; \
+	inc_le128(t2x, t0x, t1x); \
+	vbroadcasti128 bswap, t1; \
+	vinserti128 $1, t2x, t3, t2; /* ab: le0 ; cd: le1 */ \
+	vpshufb t1, t2, x0; \
+	\
+	/* construct IVs */ \
+	add2_le128(t2, t0, t4, t3, t5); /* ab: le2 ; cd: le3 */ \
+	vpshufb t1, t2, x1; \
+	add2_le128(t2, t0, t4, t3, t5); \
+	vpshufb t1, t2, x2; \
+	add2_le128(t2, t0, t4, t3, t5); \
+	vpshufb t1, t2, x3; \
+	add2_le128(t2, t0, t4, t3, t5); \
+	vpshufb t1, t2, x4; \
+	add2_le128(t2, t0, t4, t3, t5); \
+	vpshufb t1, t2, x5; \
+	add2_le128(t2, t0, t4, t3, t5); \
+	vpshufb t1, t2, x6; \
+	add2_le128(t2, t0, t4, t3, t5); \
+	vpshufb t1, t2, x7; \
+	vextracti128 $1, t2, t2x; \
+	inc_le128(t2x, t0x, t3x); \
+	vmovdqu t2x, (iv);
+
+#define store_ctr_16way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vpxor (0*32)(src), x0, x0; \
+	vpxor (1*32)(src), x1, x1; \
+	vpxor (2*32)(src), x2, x2; \
+	vpxor (3*32)(src), x3, x3; \
+	vpxor (4*32)(src), x4, x4; \
+	vpxor (5*32)(src), x5, x5; \
+	vpxor (6*32)(src), x6, x6; \
+	vpxor (7*32)(src), x7, x7; \
+	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
+
+#define gf128mul_x_ble(iv, mask, tmp) \
+	vpsrad $31, iv, tmp; \
+	vpaddq iv, iv, iv; \
+	vpshufd $0x13, tmp, tmp; \
+	vpand mask, tmp, tmp; \
+	vpxor tmp, iv, iv;
+
+#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
+	vpsrad $31, iv, tmp0; \
+	vpaddq iv, iv, tmp1; \
+	vpsllq $2, iv, iv; \
+	vpshufd $0x13, tmp0, tmp0; \
+	vpsrad $31, tmp1, tmp1; \
+	vpand mask2, tmp0, tmp0; \
+	vpshufd $0x13, tmp1, tmp1; \
+	vpxor tmp0, iv, iv; \
+	vpand mask1, tmp1, tmp1; \
+	vpxor tmp1, iv, iv;
+
+#define load_xts_16way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, \
+		       tivx, t0, t0x, t1, t1x, t2, t2x, t3, \
+		       xts_gf128mul_and_shl1_mask_0, \
+		       xts_gf128mul_and_shl1_mask_1) \
+	vbroadcasti128 xts_gf128mul_and_shl1_mask_0, t1; \
+	\
+	/* load IV and construct second IV */ \
+	vmovdqu (iv), tivx; \
+	vmovdqa tivx, t0x; \
+	gf128mul_x_ble(tivx, t1x, t2x); \
+	vbroadcasti128 xts_gf128mul_and_shl1_mask_1, t2; \
+	vinserti128 $1, tivx, t0, tiv; \
+	vpxor (0*32)(src), tiv, x0; \
+	vmovdqu tiv, (0*32)(dst); \
+	\
+	/* construct and store IVs, also xor with source */ \
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+	vpxor (1*32)(src), tiv, x1; \
+	vmovdqu tiv, (1*32)(dst); \
+	\
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+	vpxor (2*32)(src), tiv, x2; \
+	vmovdqu tiv, (2*32)(dst); \
+	\
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+	vpxor (3*32)(src), tiv, x3; \
+	vmovdqu tiv, (3*32)(dst); \
+	\
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+	vpxor (4*32)(src), tiv, x4; \
+	vmovdqu tiv, (4*32)(dst); \
+	\
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+	vpxor (5*32)(src), tiv, x5; \
+	vmovdqu tiv, (5*32)(dst); \
+	\
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+	vpxor (6*32)(src), tiv, x6; \
+	vmovdqu tiv, (6*32)(dst); \
+	\
+	gf128mul_x2_ble(tiv, t1, t2, t0, t3); \
+	vpxor (7*32)(src), tiv, x7; \
+	vmovdqu tiv, (7*32)(dst); \
+	\
+	vextracti128 $1, tiv, tivx; \
+	gf128mul_x_ble(tivx, t1x, t2x); \
+	vmovdqu tivx, (iv);
+
+#define store_xts_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
+	vpxor (0*32)(dst), x0, x0; \
+	vpxor (1*32)(dst), x1, x1; \
+	vpxor (2*32)(dst), x2, x2; \
+	vpxor (3*32)(dst), x3, x3; \
+	vpxor (4*32)(dst), x4, x4; \
+	vpxor (5*32)(dst), x5, x5; \
+	vpxor (6*32)(dst), x6, x6; \
+	vpxor (7*32)(dst), x7, x7; \
+	store_16way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
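
In these AVX2 variants each ymm register carries two consecutive 128-bit tweaks, so gf128mul_x2_ble has to multiply by x squared per step; that is why a second mask exists: 0x87 times x is 0x10e, which gives the {0x0e, 0x01, ..., 0x02} byte pattern, with the cross-qword carry doubled to 2. The counter macros, by contrast, are plain 128-bit little-endian arithmetic; a scalar equivalent of inc_le128:

	#include <stdint.h>

	/* Scalar equivalent of inc_le128: +1 on a 128-bit little-endian
	 * counter held as two 64-bit halves. add2_le128 is the same idea
	 * with a step of two and a merged carry check. */
	static void inc_le128_scalar(uint64_t *hi, uint64_t *lo)
	{
		if (++(*lo) == 0)	/* carry out of the low qword */
			(*hi)++;
	}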

+ 96 - 1
arch/x86/crypto/glue_helper.c

@@ -1,7 +1,7 @@
 /*
  * Shared glue code for 128bit block ciphers
  *
- * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
  *   Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
@@ -304,4 +304,99 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
 }
 EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit);

+static unsigned int __glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
+					    void *ctx,
+					    struct blkcipher_desc *desc,
+					    struct blkcipher_walk *walk)
+{
+	const unsigned int bsize = 128 / 8;
+	unsigned int nbytes = walk->nbytes;
+	u128 *src = (u128 *)walk->src.virt.addr;
+	u128 *dst = (u128 *)walk->dst.virt.addr;
+	unsigned int num_blocks, func_bytes;
+	unsigned int i;
+
+	/* Process multi-block batch */
+	for (i = 0; i < gctx->num_funcs; i++) {
+		num_blocks = gctx->funcs[i].num_blocks;
+		func_bytes = bsize * num_blocks;
+
+		if (nbytes >= func_bytes) {
+			do {
+				gctx->funcs[i].fn_u.xts(ctx, dst, src,
+							(le128 *)walk->iv);
+
+				src += num_blocks;
+				dst += num_blocks;
+				nbytes -= func_bytes;
+			} while (nbytes >= func_bytes);
+
+			if (nbytes < bsize)
+				goto done;
+		}
+	}
+
+done:
+	return nbytes;
+}
+
+/* for implementations implementing faster XTS IV generator */
+int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
+			  struct blkcipher_desc *desc, struct scatterlist *dst,
+			  struct scatterlist *src, unsigned int nbytes,
+			  void (*tweak_fn)(void *ctx, u8 *dst, const u8 *src),
+			  void *tweak_ctx, void *crypt_ctx)
+{
+	const unsigned int bsize = 128 / 8;
+	bool fpu_enabled = false;
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+
+	err = blkcipher_walk_virt(desc, &walk);
+	nbytes = walk.nbytes;
+	if (!nbytes)
+		return err;
+
+	/* set minimum length to bsize, for tweak_fn */
+	fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
+				     desc, fpu_enabled,
+				     nbytes < bsize ? bsize : nbytes);
+
+	/* calculate first value of T */
+	tweak_fn(tweak_ctx, walk.iv, walk.iv);
+
+	while (nbytes) {
+		nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
+
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+		nbytes = walk.nbytes;
+	}
+
+	glue_fpu_end(fpu_enabled);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
+
+void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv,
+			       common_glue_func_t fn)
+{
+	le128 ivblk = *iv;
+
+	/* generate next IV */
+	le128_gf128mul_x_ble(iv, &ivblk);
+
+	/* CC <- T xor C */
+	u128_xor(dst, src, (u128 *)&ivblk);
+
+	/* PP <- D(Key2,CC) */
+	fn(ctx, (u8 *)dst, (u8 *)dst);
+
+	/* P <- T xor PP */
+	u128_xor(dst, dst, (u128 *)&ivblk);
+}
+EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit_one);
+
 MODULE_LICENSE("GPL");
 MODULE_LICENSE("GPL");

+ 43 - 2
arch/x86/crypto/serpent-avx-x86_64-asm_64.S

@@ -4,8 +4,7 @@
  * Copyright (C) 2012 Johannes Goetzfried
  *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
  *
- * Based on arch/x86/crypto/serpent-sse2-x86_64-asm_64.S by
- *  Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -34,6 +33,8 @@

 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lxts_gf128mul_and_shl1_mask:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

 .text

@@ -739,3 +740,43 @@ ENTRY(serpent_ctr_8way_avx)

 	ret;
 ENDPROC(serpent_ctr_8way_avx)
+
+ENTRY(serpent_xts_enc_8way_avx)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
+	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
+
+	call __serpent_enc_blk8_avx;
+
+	/* dst <= regs xor IVs(in dst) */
+	store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+ENDPROC(serpent_xts_enc_8way_avx)
+
+ENTRY(serpent_xts_dec_8way_avx)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
+	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
+
+	call __serpent_dec_blk8_avx;
+
+	/* dst <= regs xor IVs(in dst) */
+	store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+	ret;
+ENDPROC(serpent_xts_dec_8way_avx)

+ 800 - 0
arch/x86/crypto/serpent-avx2-asm_64.S

@@ -0,0 +1,800 @@
+/*
+ * x86_64/AVX2 assembler optimized version of Serpent
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on AVX assembler implementation of Serpent by:
+ *  Copyright © 2012 Johannes Goetzfried
+ *      <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/linkage.h>
+#include "glue_helper-asm-avx2.S"
+
+.file "serpent-avx2-asm_64.S"
+
+.data
+.align 16
+
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lxts_gf128mul_and_shl1_mask_0:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+.Lxts_gf128mul_and_shl1_mask_1:
+	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
+
+.text
+
+#define CTX %rdi
+
+#define RNOT %ymm0
+#define tp  %ymm1
+
+#define RA1 %ymm2
+#define RA2 %ymm3
+#define RB1 %ymm4
+#define RB2 %ymm5
+#define RC1 %ymm6
+#define RC2 %ymm7
+#define RD1 %ymm8
+#define RD2 %ymm9
+#define RE1 %ymm10
+#define RE2 %ymm11
+
+#define RK0 %ymm12
+#define RK1 %ymm13
+#define RK2 %ymm14
+#define RK3 %ymm15
+
+#define RK0x %xmm12
+#define RK1x %xmm13
+#define RK2x %xmm14
+#define RK3x %xmm15
+
+#define S0_1(x0, x1, x2, x3, x4)      \
+	vpor		x0,   x3, tp; \
+	vpxor		x3,   x0, x0; \
+	vpxor		x2,   x3, x4; \
+	vpxor		RNOT, x4, x4; \
+	vpxor		x1,   tp, x3; \
+	vpand		x0,   x1, x1; \
+	vpxor		x4,   x1, x1; \
+	vpxor		x0,   x2, x2;
+#define S0_2(x0, x1, x2, x3, x4)      \
+	vpxor		x3,   x0, x0; \
+	vpor		x0,   x4, x4; \
+	vpxor		x2,   x0, x0; \
+	vpand		x1,   x2, x2; \
+	vpxor		x2,   x3, x3; \
+	vpxor		RNOT, x1, x1; \
+	vpxor		x4,   x2, x2; \
+	vpxor		x2,   x1, x1;
+
+#define S1_1(x0, x1, x2, x3, x4)      \
+	vpxor		x0,   x1, tp; \
+	vpxor		x3,   x0, x0; \
+	vpxor		RNOT, x3, x3; \
+	vpand		tp,   x1, x4; \
+	vpor		tp,   x0, x0; \
+	vpxor		x2,   x3, x3; \
+	vpxor		x3,   x0, x0; \
+	vpxor		x3,   tp, x1;
+#define S1_2(x0, x1, x2, x3, x4)      \
+	vpxor		x4,   x3, x3; \
+	vpor		x4,   x1, x1; \
+	vpxor		x2,   x4, x4; \
+	vpand		x0,   x2, x2; \
+	vpxor		x1,   x2, x2; \
+	vpor		x0,   x1, x1; \
+	vpxor		RNOT, x0, x0; \
+	vpxor		x2,   x0, x0; \
+	vpxor		x1,   x4, x4;
+
+#define S2_1(x0, x1, x2, x3, x4)      \
+	vpxor		RNOT, x3, x3; \
+	vpxor		x0,   x1, x1; \
+	vpand		x2,   x0, tp; \
+	vpxor		x3,   tp, tp; \
+	vpor		x0,   x3, x3; \
+	vpxor		x1,   x2, x2; \
+	vpxor		x1,   x3, x3; \
+	vpand		tp,   x1, x1;
+#define S2_2(x0, x1, x2, x3, x4)      \
+	vpxor		x2,   tp, tp; \
+	vpand		x3,   x2, x2; \
+	vpor		x1,   x3, x3; \
+	vpxor		RNOT, tp, tp; \
+	vpxor		tp,   x3, x3; \
+	vpxor		tp,   x0, x4; \
+	vpxor		x2,   tp, x0; \
+	vpor		x2,   x1, x1;
+
+#define S3_1(x0, x1, x2, x3, x4)      \
+	vpxor		x3,   x1, tp; \
+	vpor		x0,   x3, x3; \
+	vpand		x0,   x1, x4; \
+	vpxor		x2,   x0, x0; \
+	vpxor		tp,   x2, x2; \
+	vpand		x3,   tp, x1; \
+	vpxor		x3,   x2, x2; \
+	vpor		x4,   x0, x0; \
+	vpxor		x3,   x4, x4;
+#define S3_2(x0, x1, x2, x3, x4)      \
+	vpxor		x0,   x1, x1; \
+	vpand		x3,   x0, x0; \
+	vpand		x4,   x3, x3; \
+	vpxor		x2,   x3, x3; \
+	vpor		x1,   x4, x4; \
+	vpand		x1,   x2, x2; \
+	vpxor		x3,   x4, x4; \
+	vpxor		x3,   x0, x0; \
+	vpxor		x2,   x3, x3;
+
+#define S4_1(x0, x1, x2, x3, x4)      \
+	vpand		x0,   x3, tp; \
+	vpxor		x3,   x0, x0; \
+	vpxor		x2,   tp, tp; \
+	vpor		x3,   x2, x2; \
+	vpxor		x1,   x0, x0; \
+	vpxor		tp,   x3, x4; \
+	vpor		x0,   x2, x2; \
+	vpxor		x1,   x2, x2;
+#define S4_2(x0, x1, x2, x3, x4)      \
+	vpand		x0,   x1, x1; \
+	vpxor		x4,   x1, x1; \
+	vpand		x2,   x4, x4; \
+	vpxor		tp,   x2, x2; \
+	vpxor		x0,   x4, x4; \
+	vpor		x1,   tp, x3; \
+	vpxor		RNOT, x1, x1; \
+	vpxor		x0,   x3, x3;
+
+#define S5_1(x0, x1, x2, x3, x4)      \
+	vpor		x0,   x1, tp; \
+	vpxor		tp,   x2, x2; \
+	vpxor		RNOT, x3, x3; \
+	vpxor		x0,   x1, x4; \
+	vpxor		x2,   x0, x0; \
+	vpand		x4,   tp, x1; \
+	vpor		x3,   x4, x4; \
+	vpxor		x0,   x4, x4;
+#define S5_2(x0, x1, x2, x3, x4)      \
+	vpand		x3,   x0, x0; \
+	vpxor		x3,   x1, x1; \
+	vpxor		x2,   x3, x3; \
+	vpxor		x1,   x0, x0; \
+	vpand		x4,   x2, x2; \
+	vpxor		x2,   x1, x1; \
+	vpand		x0,   x2, x2; \
+	vpxor		x2,   x3, x3;
+
+#define S6_1(x0, x1, x2, x3, x4)      \
+	vpxor		x0,   x3, x3; \
+	vpxor		x2,   x1, tp; \
+	vpxor		x0,   x2, x2; \
+	vpand		x3,   x0, x0; \
+	vpor		x3,   tp, tp; \
+	vpxor		RNOT, x1, x4; \
+	vpxor		tp,   x0, x0; \
+	vpxor		x2,   tp, x1;
+#define S6_2(x0, x1, x2, x3, x4)      \
+	vpxor		x4,   x3, x3; \
+	vpxor		x0,   x4, x4; \
+	vpand		x0,   x2, x2; \
+	vpxor		x1,   x4, x4; \
+	vpxor		x3,   x2, x2; \
+	vpand		x1,   x3, x3; \
+	vpxor		x0,   x3, x3; \
+	vpxor		x2,   x1, x1;
+
+#define S7_1(x0, x1, x2, x3, x4)      \
+	vpxor		RNOT, x1, tp; \
+	vpxor		RNOT, x0, x0; \
+	vpand		x2,   tp, x1; \
+	vpxor		x3,   x1, x1; \
+	vpor		tp,   x3, x3; \
+	vpxor		x2,   tp, x4; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x0,   x3, x3; \
+	vpor		x1,   x0, x0;
+#define S7_2(x0, x1, x2, x3, x4)      \
+	vpand		x0,   x2, x2; \
+	vpxor		x4,   x0, x0; \
+	vpxor		x3,   x4, x4; \
+	vpand		x0,   x3, x3; \
+	vpxor		x1,   x4, x4; \
+	vpxor		x4,   x2, x2; \
+	vpxor		x1,   x3, x3; \
+	vpor		x0,   x4, x4; \
+	vpxor		x1,   x4, x4;
+
+#define SI0_1(x0, x1, x2, x3, x4)     \
+	vpxor		x0,   x1, x1; \
+	vpor		x1,   x3, tp; \
+	vpxor		x1,   x3, x4; \
+	vpxor		RNOT, x0, x0; \
+	vpxor		tp,   x2, x2; \
+	vpxor		x0,   tp, x3; \
+	vpand		x1,   x0, x0; \
+	vpxor		x2,   x0, x0;
+#define SI0_2(x0, x1, x2, x3, x4)     \
+	vpand		x3,   x2, x2; \
+	vpxor		x4,   x3, x3; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x3,   x1, x1; \
+	vpand		x0,   x3, x3; \
+	vpxor		x0,   x1, x1; \
+	vpxor		x2,   x0, x0; \
+	vpxor		x3,   x4, x4;
+
+#define SI1_1(x0, x1, x2, x3, x4)     \
+	vpxor		x3,   x1, x1; \
+	vpxor		x2,   x0, tp; \
+	vpxor		RNOT, x2, x2; \
+	vpor		x1,   x0, x4; \
+	vpxor		x3,   x4, x4; \
+	vpand		x1,   x3, x3; \
+	vpxor		x2,   x1, x1; \
+	vpand		x4,   x2, x2;
+#define SI1_2(x0, x1, x2, x3, x4)     \
+	vpxor		x1,   x4, x4; \
+	vpor		x3,   x1, x1; \
+	vpxor		tp,   x3, x3; \
+	vpxor		tp,   x2, x2; \
+	vpor		x4,   tp, x0; \
+	vpxor		x4,   x2, x2; \
+	vpxor		x0,   x1, x1; \
+	vpxor		x1,   x4, x4;
+
+#define SI2_1(x0, x1, x2, x3, x4)     \
+	vpxor		x1,   x2, x2; \
+	vpxor		RNOT, x3, tp; \
+	vpor		x2,   tp, tp; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x0,   x3, x4; \
+	vpxor		x1,   tp, x3; \
+	vpor		x2,   x1, x1; \
+	vpxor		x0,   x2, x2;
+#define SI2_2(x0, x1, x2, x3, x4)     \
+	vpxor		x4,   x1, x1; \
+	vpor		x3,   x4, x4; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x2,   x4, x4; \
+	vpand		x1,   x2, x2; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x4,   x3, x3; \
+	vpxor		x0,   x4, x4;
+
+#define SI3_1(x0, x1, x2, x3, x4)     \
+	vpxor		x1,   x2, x2; \
+	vpand		x2,   x1, tp; \
+	vpxor		x0,   tp, tp; \
+	vpor		x1,   x0, x0; \
+	vpxor		x3,   x1, x4; \
+	vpxor		x3,   x0, x0; \
+	vpor		tp,   x3, x3; \
+	vpxor		x2,   tp, x1;
+#define SI3_2(x0, x1, x2, x3, x4)     \
+	vpxor		x3,   x1, x1; \
+	vpxor		x2,   x0, x0; \
+	vpxor		x3,   x2, x2; \
+	vpand		x1,   x3, x3; \
+	vpxor		x0,   x1, x1; \
+	vpand		x2,   x0, x0; \
+	vpxor		x3,   x4, x4; \
+	vpxor		x0,   x3, x3; \
+	vpxor		x1,   x0, x0;
+
+#define SI4_1(x0, x1, x2, x3, x4)     \
+	vpxor		x3,   x2, x2; \
+	vpand		x1,   x0, tp; \
+	vpxor		x2,   tp, tp; \
+	vpor		x3,   x2, x2; \
+	vpxor		RNOT, x0, x4; \
+	vpxor		tp,   x1, x1; \
+	vpxor		x2,   tp, x0; \
+	vpand		x4,   x2, x2;
+#define SI4_2(x0, x1, x2, x3, x4)     \
+	vpxor		x0,   x2, x2; \
+	vpor		x4,   x0, x0; \
+	vpxor		x3,   x0, x0; \
+	vpand		x2,   x3, x3; \
+	vpxor		x3,   x4, x4; \
+	vpxor		x1,   x3, x3; \
+	vpand		x0,   x1, x1; \
+	vpxor		x1,   x4, x4; \
+	vpxor		x3,   x0, x0;
+
+#define SI5_1(x0, x1, x2, x3, x4)     \
+	vpor		x2,   x1, tp; \
+	vpxor		x1,   x2, x2; \
+	vpxor		x3,   tp, tp; \
+	vpand		x1,   x3, x3; \
+	vpxor		x3,   x2, x2; \
+	vpor		x0,   x3, x3; \
+	vpxor		RNOT, x0, x0; \
+	vpxor		x2,   x3, x3; \
+	vpor		x0,   x2, x2;
+#define SI5_2(x0, x1, x2, x3, x4)     \
+	vpxor		tp,   x1, x4; \
+	vpxor		x4,   x2, x2; \
+	vpand		x0,   x4, x4; \
+	vpxor		tp,   x0, x0; \
+	vpxor		x3,   tp, x1; \
+	vpand		x2,   x0, x0; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x2,   x0, x0; \
+	vpxor		x4,   x2, x2; \
+	vpxor		x3,   x4, x4;
+
+#define SI6_1(x0, x1, x2, x3, x4)     \
+	vpxor		x2,   x0, x0; \
+	vpand		x3,   x0, tp; \
+	vpxor		x3,   x2, x2; \
+	vpxor		x2,   tp, tp; \
+	vpxor		x1,   x3, x3; \
+	vpor		x0,   x2, x2; \
+	vpxor		x3,   x2, x2; \
+	vpand		tp,   x3, x3;
+#define SI6_2(x0, x1, x2, x3, x4)     \
+	vpxor		RNOT, tp, tp; \
+	vpxor		x1,   x3, x3; \
+	vpand		x2,   x1, x1; \
+	vpxor		tp,   x0, x4; \
+	vpxor		x4,   x3, x3; \
+	vpxor		x2,   x4, x4; \
+	vpxor		x1,   tp, x0; \
+	vpxor		x0,   x2, x2;
+
+#define SI7_1(x0, x1, x2, x3, x4)     \
+	vpand		x0,   x3, tp; \
+	vpxor		x2,   x0, x0; \
+	vpor		x3,   x2, x2; \
+	vpxor		x1,   x3, x4; \
+	vpxor		RNOT, x0, x0; \
+	vpor		tp,   x1, x1; \
+	vpxor		x0,   x4, x4; \
+	vpand		x2,   x0, x0; \
+	vpxor		x1,   x0, x0;
+#define SI7_2(x0, x1, x2, x3, x4)     \
+	vpand		x2,   x1, x1; \
+	vpxor		x2,   tp, x3; \
+	vpxor		x3,   x4, x4; \
+	vpand		x3,   x2, x2; \
+	vpor		x0,   x3, x3; \
+	vpxor		x4,   x1, x1; \
+	vpxor		x4,   x3, x3; \
+	vpand		x0,   x4, x4; \
+	vpxor		x2,   x4, x4;
+
+#define get_key(i,j,t) \
+	vpbroadcastd (4*(i)+(j))*4(CTX), t;
+
+#define K2(x0, x1, x2, x3, x4, i) \
+	get_key(i, 0, RK0); \
+	get_key(i, 1, RK1); \
+	get_key(i, 2, RK2); \
+	get_key(i, 3, RK3); \
+	vpxor RK0,	x0 ## 1, x0 ## 1; \
+	vpxor RK1,	x1 ## 1, x1 ## 1; \
+	vpxor RK2,	x2 ## 1, x2 ## 1; \
+	vpxor RK3,	x3 ## 1, x3 ## 1; \
+		vpxor RK0,	x0 ## 2, x0 ## 2; \
+		vpxor RK1,	x1 ## 2, x1 ## 2; \
+		vpxor RK2,	x2 ## 2, x2 ## 2; \
+		vpxor RK3,	x3 ## 2, x3 ## 2;
+
+#define LK2(x0, x1, x2, x3, x4, i) \
+	vpslld $13,		x0 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 13),	x0 ## 1, x0 ## 1;          \
+	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
+	vpslld $3,		x2 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 3),	x2 ## 1, x2 ## 1;          \
+	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
+		vpslld $13,		x0 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 13),	x0 ## 2, x0 ## 2;          \
+		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
+		vpslld $3,		x2 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 3),	x2 ## 2, x2 ## 2;          \
+		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
+		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
+	vpslld $1,		x1 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 1),	x1 ## 1, x1 ## 1;          \
+	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
+	vpslld $3,		x0 ## 1, x4 ## 1;          \
+	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
+	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
+	get_key(i, 1, RK1); \
+		vpslld $1,		x1 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 1),	x1 ## 2, x1 ## 2;          \
+		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
+		vpslld $3,		x0 ## 2, x4 ## 2;          \
+		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
+		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
+		get_key(i, 3, RK3); \
+	vpslld $7,		x3 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 7),	x3 ## 1, x3 ## 1;          \
+	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
+	vpslld $7,		x1 ## 1, x4 ## 1;          \
+	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
+	get_key(i, 0, RK0); \
+		vpslld $7,		x3 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 7),	x3 ## 2, x3 ## 2;          \
+		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
+		vpslld $7,		x1 ## 2, x4 ## 2;          \
+		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
+		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
+		get_key(i, 2, RK2); \
+	vpxor			RK1, x1 ## 1, x1 ## 1;     \
+	vpxor			RK3, x3 ## 1, x3 ## 1;     \
+	vpslld $5,		x0 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 5),	x0 ## 1, x0 ## 1;          \
+	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
+	vpslld $22,		x2 ## 1, x4 ## 1;          \
+	vpsrld $(32 - 22),	x2 ## 1, x2 ## 1;          \
+	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor			RK0, x0 ## 1, x0 ## 1;     \
+	vpxor			RK2, x2 ## 1, x2 ## 1;     \
+		vpxor			RK1, x1 ## 2, x1 ## 2;     \
+		vpxor			RK3, x3 ## 2, x3 ## 2;     \
+		vpslld $5,		x0 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 5),	x0 ## 2, x0 ## 2;          \
+		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
+		vpslld $22,		x2 ## 2, x4 ## 2;          \
+		vpsrld $(32 - 22),	x2 ## 2, x2 ## 2;          \
+		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
+		vpxor			RK0, x0 ## 2, x0 ## 2;     \
+		vpxor			RK2, x2 ## 2, x2 ## 2;
+
+#define KL2(x0, x1, x2, x3, x4, i) \
+	vpxor			RK0, x0 ## 1, x0 ## 1;     \
+	vpxor			RK2, x2 ## 1, x2 ## 1;     \
+	vpsrld $5,		x0 ## 1, x4 ## 1;          \
+	vpslld $(32 - 5),	x0 ## 1, x0 ## 1;          \
+	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			RK3, x3 ## 1, x3 ## 1;     \
+	vpxor			RK1, x1 ## 1, x1 ## 1;     \
+	vpsrld $22,		x2 ## 1, x4 ## 1;          \
+	vpslld $(32 - 22),	x2 ## 1, x2 ## 1;          \
+	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
+		vpxor			RK0, x0 ## 2, x0 ## 2;     \
+		vpxor			RK2, x2 ## 2, x2 ## 2;     \
+		vpsrld $5,		x0 ## 2, x4 ## 2;          \
+		vpslld $(32 - 5),	x0 ## 2, x0 ## 2;          \
+		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			RK3, x3 ## 2, x3 ## 2;     \
+		vpxor			RK1, x1 ## 2, x1 ## 2;     \
+		vpsrld $22,		x2 ## 2, x4 ## 2;          \
+		vpslld $(32 - 22),	x2 ## 2, x2 ## 2;          \
+		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
+		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
+	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
+	vpslld $7,		x1 ## 1, x4 ## 1;          \
+	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
+	vpsrld $1,		x1 ## 1, x4 ## 1;          \
+	vpslld $(32 - 1),	x1 ## 1, x1 ## 1;          \
+	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
+		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
+		vpslld $7,		x1 ## 2, x4 ## 2;          \
+		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
+		vpsrld $1,		x1 ## 2, x4 ## 2;          \
+		vpslld $(32 - 1),	x1 ## 2, x1 ## 2;          \
+		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
+	vpsrld $7,		x3 ## 1, x4 ## 1;          \
+	vpslld $(32 - 7),	x3 ## 1, x3 ## 1;          \
+	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
+	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
+	vpslld $3,		x0 ## 1, x4 ## 1;          \
+	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
+		vpsrld $7,		x3 ## 2, x4 ## 2;          \
+		vpslld $(32 - 7),	x3 ## 2, x3 ## 2;          \
+		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
+		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
+		vpslld $3,		x0 ## 2, x4 ## 2;          \
+		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
+	vpsrld $13,		x0 ## 1, x4 ## 1;          \
+	vpslld $(32 - 13),	x0 ## 1, x0 ## 1;          \
+	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
+	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
+	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
+	vpsrld $3,		x2 ## 1, x4 ## 1;          \
+	vpslld $(32 - 3),	x2 ## 1, x2 ## 1;          \
+	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
+		vpsrld $13,		x0 ## 2, x4 ## 2;          \
+		vpslld $(32 - 13),	x0 ## 2, x0 ## 2;          \
+		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
+		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
+		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
+		vpsrld $3,		x2 ## 2, x4 ## 2;          \
+		vpslld $(32 - 3),	x2 ## 2, x2 ## 2;          \
+		vpor			x4 ## 2, x2 ## 2, x2 ## 2;
+
+#define S(SBOX, x0, x1, x2, x3, x4) \
+	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
+#define SP(SBOX, x0, x1, x2, x3, x4, i) \
+	get_key(i, 0, RK0); \
+	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	get_key(i, 2, RK2); \
+	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+	get_key(i, 3, RK3); \
+	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+	get_key(i, 1, RK1); \
+	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+
+#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+	vpunpckldq		x1, x0, t0; \
+	vpunpckhdq		x1, x0, t2; \
+	vpunpckldq		x3, x2, t1; \
+	vpunpckhdq		x3, x2, x3; \
+	\
+	vpunpcklqdq		t1, t0, x0; \
+	vpunpckhqdq		t1, t0, x1; \
+	vpunpcklqdq		x3, t2, x2; \
+	vpunpckhqdq		x3, t2, x3;
+
+#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
+	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+.align 8
+__serpent_enc_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
+	 * output:
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
+	 */
+
+	vpcmpeqd RNOT, RNOT, RNOT;
+
+	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+						 K2(RA, RB, RC, RD, RE, 0);
+	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
+	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
+	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
+	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
+	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
+	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
+	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
+	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
+	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9);
+	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
+	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
+	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
+	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
+	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
+	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
+	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
+	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
+	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
+	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
+	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
+	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
+	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
+	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
+	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
+	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
+	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
+	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
+	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
+	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
+	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
+	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
+	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);
+
+	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+	ret;
+ENDPROC(__serpent_enc_blk16)
+
+.align 8
+__serpent_dec_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
+	 * output:
+	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
+	 */
+
+	vpcmpeqd RNOT, RNOT, RNOT;
+
+	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+						 K2(RA, RB, RC, RD, RE, 32);
+	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
+	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
+	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
+	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
+	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
+	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
+	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
+	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
+	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
+	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
+	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
+	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
+	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
+	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
+	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
+	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
+	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
+	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
+	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
+	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
+	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
+	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
+	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
+	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
+	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
+	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
+	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
+	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
+	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
+	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
+	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
+	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);
+
+	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+	ret;
+ENDPROC(__serpent_dec_blk16)
+
+ENTRY(serpent_ecb_enc_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_enc_blk16;
+
+	store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_ecb_enc_16way)
+
+ENTRY(serpent_ecb_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_dec_blk16;
+
+	store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_ecb_dec_16way)
+
+ENTRY(serpent_cbc_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+
+	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	call __serpent_dec_blk16;
+
+	store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
+			RK0);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_cbc_dec_16way)
+
+ENTRY(serpent_ctr_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (little endian, 128bit)
+	 */
+
+	vzeroupper;
+
+	load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
+		       tp);
+
+	call __serpent_enc_blk16;
+
+	store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_ctr_16way)
+
+ENTRY(serpent_xts_enc_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	vzeroupper;
+
+	load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
+		       .Lxts_gf128mul_and_shl1_mask_0,
+		       .Lxts_gf128mul_and_shl1_mask_1);
+
+	call __serpent_enc_blk16;
+
+	store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_xts_enc_16way)
+
+ENTRY(serpent_xts_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	vzeroupper;
+
+	load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
+		       .Lxts_gf128mul_and_shl1_mask_0,
+		       .Lxts_gf128mul_and_shl1_mask_1);
+
+	call __serpent_dec_blk16;
+
+	store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+
+	vzeroupper;
+
+	ret;
+ENDPROC(serpent_xts_dec_16way)
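
Throughout this file each ymm register holds the same Serpent state word for eight blocks, split across the x##1/x##2 halves, so every round needs its key words replicated to all lanes. That is all get_key does; in scalar terms:

	#include <stdint.h>

	/* Scalar view of the get_key(i, j, t) macro above: vpbroadcastd
	 * fetches 32-bit word number 4*i + j of the expanded key (byte
	 * offset (4*(i)+(j))*4 from CTX) and replicates it across all
	 * eight dword lanes, so one round-key word serves eight blocks. */
	static uint32_t get_key_word(const uint32_t *expkey,
				     unsigned int round, unsigned int word)
	{
		return expkey[4 * round + word];
	}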

+ 562 - 0
arch/x86/crypto/serpent_avx2_glue.c

@@ -0,0 +1,562 @@
+/*
+ * Glue Code for x86_64/AVX2 assembler optimized version of Serpent
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/ctr.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
+#include <crypto/serpent.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <asm/crypto/serpent-avx.h>
+#include <asm/crypto/ablk_helper.h>
+#include <asm/crypto/glue_helper.h>
+
+#define SERPENT_AVX2_PARALLEL_BLOCKS 16
+
+/* 16-way AVX2 parallel cipher functions */
+asmlinkage void serpent_ecb_enc_16way(struct serpent_ctx *ctx, u8 *dst,
+				      const u8 *src);
+asmlinkage void serpent_ecb_dec_16way(struct serpent_ctx *ctx, u8 *dst,
+				      const u8 *src);
+asmlinkage void serpent_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src);
+
+asmlinkage void serpent_ctr_16way(void *ctx, u128 *dst, const u128 *src,
+				  le128 *iv);
+asmlinkage void serpent_xts_enc_16way(struct serpent_ctx *ctx, u8 *dst,
+				      const u8 *src, le128 *iv);
+asmlinkage void serpent_xts_dec_16way(struct serpent_ctx *ctx, u8 *dst,
+				      const u8 *src, le128 *iv);
+
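+/*
+ * Each glue context lists implementations widest-first; the glue helper
+ * picks the largest entry that still fits the remaining data: 16 blocks
+ * via AVX2, then 8 via AVX, then one block at a time for the tail.
+ */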
+static const struct common_glue_ctx serpent_enc = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_ctr = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_16way) }
+	},  {
+		.num_blocks = 8,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_enc_xts = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_dec = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_dec_cbc = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
+	} }
+};
+
+static const struct common_glue_ctx serpent_dec_xts = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) }
+	} }
+};
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&serpent_enc, desc, dst, src, nbytes);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&serpent_dec, desc, dst, src, nbytes);
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(__serpent_encrypt), desc,
+				       dst, src, nbytes);
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_decrypt_128bit(&serpent_dec_cbc, desc, dst, src,
+				       nbytes);
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ctr_crypt_128bit(&serpent_ctr, desc, dst, src, nbytes);
+}
+
+static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	/* the 8-way AVX functions are reused, so the FPU is enabled starting at 8 parallel blocks */
+	return glue_fpu_begin(SERPENT_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes);
+}
+
+static inline void serpent_fpu_end(bool fpu_enabled)
+{
+	glue_fpu_end(fpu_enabled);
+}
+
+struct crypt_priv {
+	struct serpent_ctx *ctx;
+	bool fpu_enabled;
+};
+
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
+
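+	/* lrw_crypt() hands over at most tbuflen bytes (16 blocks), so one 16-way pass suffices */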
+	if (nbytes >= SERPENT_AVX2_PARALLEL_BLOCKS * bsize) {
+		serpent_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * SERPENT_AVX2_PARALLEL_BLOCKS;
+		nbytes -= bsize * SERPENT_AVX2_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
+		serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
+		nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		__serpent_encrypt(ctx->ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = SERPENT_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	if (nbytes >= SERPENT_AVX2_PARALLEL_BLOCKS * bsize) {
+		serpent_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * SERPENT_AVX2_PARALLEL_BLOCKS;
+		nbytes -= bsize * SERPENT_AVX2_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) {
+		serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * SERPENT_PARALLEL_BLOCKS;
+		nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		__serpent_decrypt(ctx->ctx, srcdst, srcdst);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[SERPENT_AVX2_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->serpent_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
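+	/* the FPU region disables preemption, so the LRW walk must not sleep */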
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[SERPENT_AVX2_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->serpent_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&serpent_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(__serpent_encrypt),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&serpent_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(__serpent_encrypt),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static struct crypto_alg srp_algs[10] = { {
+	.cra_name		= "__ecb-serpent-avx2",
+	.cra_driver_name	= "__driver-ecb-serpent-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[0].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__cbc-serpent-avx2",
+	.cra_driver_name	= "__driver-cbc-serpent-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[1].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__ctr-serpent-avx2",
+	.cra_driver_name	= "__driver-ctr-serpent-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct serpent_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[2].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= serpent_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+}, {
+	.cra_name		= "__lrw-serpent-avx2",
+	.cra_driver_name	= "__driver-lrw-serpent-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[3].cra_list),
+	.cra_exit		= lrw_serpent_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= lrw_serpent_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__xts-serpent-avx2",
+	.cra_driver_name	= "__driver-xts-serpent-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct serpent_xts_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[4].cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE * 2,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE * 2,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= xts_serpent_setkey,
+			.encrypt	= xts_encrypt,
+			.decrypt	= xts_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ecb(serpent)",
+	.cra_driver_name	= "ecb-serpent-avx2",
+	.cra_priority		= 600,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[5].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(serpent)",
+	.cra_driver_name	= "cbc-serpent-avx2",
+	.cra_priority		= 600,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[6].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= __ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(serpent)",
+	.cra_driver_name	= "ctr-serpent-avx2",
+	.cra_priority		= 600,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[7].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt,
+			.geniv		= "chainiv",
+		},
+	},
+}, {
+	.cra_name		= "lrw(serpent)",
+	.cra_driver_name	= "lrw-serpent-avx2",
+	.cra_priority		= 600,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[8].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE +
+					  SERPENT_BLOCK_SIZE,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "xts(serpent)",
+	.cra_driver_name	= "xts-serpent-avx2",
+	.cra_priority		= 600,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= SERPENT_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(srp_algs[9].cra_list),
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= SERPENT_MIN_KEY_SIZE * 2,
+			.max_keysize	= SERPENT_MAX_KEY_SIZE * 2,
+			.ivsize		= SERPENT_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+} };
+
+static int __init init(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx2 || !cpu_has_osxsave) {
+		pr_info("AVX2 instructions are not detected.\n");
+		return -ENODEV;
+	}
+
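+	/* verify that the OS saves both SSE and YMM state on context switch */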
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX detected but unusable.\n");
+		return -ENODEV;
+	}
+
+	return crypto_register_algs(srp_algs, ARRAY_SIZE(srp_algs));
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_algs(srp_algs, ARRAY_SIZE(srp_algs));
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Serpent Cipher Algorithm, AVX2 optimized");
+MODULE_ALIAS("serpent");
+MODULE_ALIAS("serpent-asm");

+ 85 - 60
arch/x86/crypto/serpent_avx_glue.c

@@ -4,8 +4,7 @@
  * Copyright (C) 2012 Johannes Goetzfried
  *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
  *
 - * Glue code based on serpent_sse2_glue.c by:
 - *  Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 + * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -42,7 +41,32 @@
  #include <asm/crypto/ablk_helper.h>
  #include <asm/crypto/glue_helper.h>
 
 -static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 +/* 8-way parallel cipher functions */
 +asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 +					 const u8 *src);
 +EXPORT_SYMBOL_GPL(serpent_ecb_enc_8way_avx);
 +
 +asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 +					 const u8 *src);
 +EXPORT_SYMBOL_GPL(serpent_ecb_dec_8way_avx);
 +
 +asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 +					 const u8 *src);
 +EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx);
 +
 +asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 +				     const u8 *src, le128 *iv);
 +EXPORT_SYMBOL_GPL(serpent_ctr_8way_avx);
 +
 +asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 +					 const u8 *src, le128 *iv);
 +EXPORT_SYMBOL_GPL(serpent_xts_enc_8way_avx);
 +
 +asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 +					 const u8 *src, le128 *iv);
 +EXPORT_SYMBOL_GPL(serpent_xts_dec_8way_avx);
 +
 +void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
  {
  	be128 ctrblk;
 
@@ -52,6 +76,22 @@ static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
  	__serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
  	u128_xor(dst, src, (u128 *)&ctrblk);
  }
 +EXPORT_SYMBOL_GPL(__serpent_crypt_ctr);
 +
 +void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 +{
 +	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
 +				  GLUE_FUNC_CAST(__serpent_encrypt));
 +}
 +EXPORT_SYMBOL_GPL(serpent_xts_enc);
 +
 +void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
 +{
 +	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
 +				  GLUE_FUNC_CAST(__serpent_decrypt));
 +}
 +EXPORT_SYMBOL_GPL(serpent_xts_dec);
 +
 
  static const struct common_glue_ctx serpent_enc = {
  	.num_funcs = 2,
@@ -75,7 +115,20 @@ static const struct common_glue_ctx serpent_ctr = {
  		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
  	}, {
  		.num_blocks = 1,
 -		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
 +		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) }
 +	} }
 +};
 +
 +static const struct common_glue_ctx serpent_enc_xts = {
 +	.num_funcs = 2,
 +	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
 +
 +	.funcs = { {
 +		.num_blocks = SERPENT_PARALLEL_BLOCKS,
 +		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) }
 +	}, {
 +		.num_blocks = 1,
 +		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) }
  	} }
  };
 
@@ -105,6 +158,19 @@ static const struct common_glue_ctx serpent_dec_cbc = {
  	} }
  };
 
 +static const struct common_glue_ctx serpent_dec_xts = {
 +	.num_funcs = 2,
 +	.fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
 +
 +	.funcs = { {
 +		.num_blocks = SERPENT_PARALLEL_BLOCKS,
 +		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) }
 +	}, {
 +		.num_blocks = 1,
 +		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) }
 +	} }
 +};
 +
  static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  		       struct scatterlist *src, unsigned int nbytes)
  {
@@ -187,13 +253,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
  		__serpent_decrypt(ctx->ctx, srcdst, srcdst);
  }
 
 -struct serpent_lrw_ctx {
 -	struct lrw_table_ctx lrw_table;
 -	struct serpent_ctx serpent_ctx;
 -};
 -
 -static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
 -			      unsigned int keylen)
 +int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
 +		       unsigned int keylen)
  {
  	struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
  	int err;
@@ -206,6 +267,7 @@ static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
  	return lrw_init_table(&ctx->lrw_table, key + keylen -
  						SERPENT_BLOCK_SIZE);
  }
 +EXPORT_SYMBOL_GPL(lrw_serpent_setkey);
 
  static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  		       struct scatterlist *src, unsigned int nbytes)
@@ -259,20 +321,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  	return ret;
  }
 
 -static void lrw_exit_tfm(struct crypto_tfm *tfm)
 +void lrw_serpent_exit_tfm(struct crypto_tfm *tfm)
  {
  	struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
 
  	lrw_free_table(&ctx->lrw_table);
  }
 +EXPORT_SYMBOL_GPL(lrw_serpent_exit_tfm);
 
 -struct serpent_xts_ctx {
 -	struct serpent_ctx tweak_ctx;
 -	struct serpent_ctx crypt_ctx;
 -};
 -
 -static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
 -			      unsigned int keylen)
 +int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
 +		       unsigned int keylen)
  {
  	struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
  	u32 *flags = &tfm->crt_flags;
@@ -294,59 +352,26 @@ static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
  	/* second half of xts-key is for tweak */
  	return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
  }
 +EXPORT_SYMBOL_GPL(xts_serpent_setkey);
 
  static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  		       struct scatterlist *src, unsigned int nbytes)
  {
  	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
 -	be128 buf[SERPENT_PARALLEL_BLOCKS];
 -	struct crypt_priv crypt_ctx = {
 -		.ctx = &ctx->crypt_ctx,
 -		.fpu_enabled = false,
 -	};
 -	struct xts_crypt_req req = {
 -		.tbuf = buf,
 -		.tbuflen = sizeof(buf),
 -
 -		.tweak_ctx = &ctx->tweak_ctx,
 -		.tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
 -		.crypt_ctx = &crypt_ctx,
 -		.crypt_fn = encrypt_callback,
 -	};
 -	int ret;
 -
 -	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
 -	ret = xts_crypt(desc, dst, src, nbytes, &req);
 -	serpent_fpu_end(crypt_ctx.fpu_enabled);
 
 -	return ret;
 +	return glue_xts_crypt_128bit(&serpent_enc_xts, desc, dst, src, nbytes,
 +				     XTS_TWEAK_CAST(__serpent_encrypt),
 +				     &ctx->tweak_ctx, &ctx->crypt_ctx);
  }
 
  static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
  		       struct scatterlist *src, unsigned int nbytes)
  {
  	struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
 -	be128 buf[SERPENT_PARALLEL_BLOCKS];
 -	struct crypt_priv crypt_ctx = {
 -		.ctx = &ctx->crypt_ctx,
 -		.fpu_enabled = false,
 -	};
 -	struct xts_crypt_req req = {
 -		.tbuf = buf,
 -		.tbuflen = sizeof(buf),
 -
 -		.tweak_ctx = &ctx->tweak_ctx,
 -		.tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
 -		.crypt_ctx = &crypt_ctx,
 -		.crypt_fn = decrypt_callback,
 -	};
 -	int ret;
 
 -	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
 -	ret = xts_crypt(desc, dst, src, nbytes, &req);
 -	serpent_fpu_end(crypt_ctx.fpu_enabled);
 -
 -	return ret;
 +	return glue_xts_crypt_128bit(&serpent_dec_xts, desc, dst, src, nbytes,
 +				     XTS_TWEAK_CAST(__serpent_encrypt),
 +				     &ctx->tweak_ctx, &ctx->crypt_ctx);
  }
 
  static struct crypto_alg serpent_algs[10] = { {
@@ -417,7 +442,7 @@ static struct crypto_alg serpent_algs[10] = { {
  	.cra_alignmask		= 0,
  	.cra_type		= &crypto_blkcipher_type,
  	.cra_module		= THIS_MODULE,
 -	.cra_exit		= lrw_exit_tfm,
 +	.cra_exit		= lrw_serpent_exit_tfm,
  	.cra_u = {
  		.blkcipher = {
  			.min_keysize	= SERPENT_MIN_KEY_SIZE +

+ 496 - 0
arch/x86/crypto/sha256-avx-asm.S

@@ -0,0 +1,496 @@
+########################################################################
+# Implement fast SHA-256 with AVX1 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 1 block at a time, with 4 lanes per block
+########################################################################
+
+#ifdef CONFIG_AS_AVX
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define    VMOVDQ vmovdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+	add     \p1, \p2
+	mov     \p2, \p1
+.endm
+
+
+.macro MY_ROR p1 p2
+	shld    $(32-(\p1)), \p2, \p2
+.endm
+
+################################
+
+# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+# Load xmm with mem and byte swap each dword
+.macro COPY_XMM_AND_BSWAP p1 p2 p3
+	VMOVDQ \p2, \p1
+	vpshufb \p3, \p1, \p1
+.endm
+
+################################
+
+X0 = %xmm4
+X1 = %xmm5
+X2 = %xmm6
+X3 = %xmm7
+
+XTMP0 = %xmm0
+XTMP1 = %xmm1
+XTMP2 = %xmm2
+XTMP3 = %xmm3
+XTMP4 = %xmm8
+XFER = %xmm9
+XTMP5 = %xmm11
+
+SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
+SHUF_DC00 = %xmm12      # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %xmm13
+
+NUM_BLKS = %rdx   # 3rd arg
+CTX = %rsi        # 2nd arg
+INP = %rdi        # 1st arg
+
+SRND = %rdi       # clobbers INP
+c = %ecx
+d = %r8d
+e = %edx
+TBL = %rbp
+a = %eax
+b = %ebx
+
+f = %r9d
+g = %r10d
+h = %r11d
+
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_XFER_SIZE = 8
+_XMM_SAVE_SIZE = 0
+
+_INP_END = 0
+_INP            = _INP_END  + _INP_END_SIZE
+_XFER           = _INP      + _INP_SIZE
+_XMM_SAVE       = _XFER     + _XFER_SIZE
+STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+X_ = X0
+X0 = X1
+X1 = X2
+X2 = X3
+X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+	## compute s0 four at a time and s1 two at a time
+	## compute W[-16] + W[-7] 4 at a time
+
+	mov     e, y0			# y0 = e
+	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
+	mov     a, y1                   # y1 = a
+	vpalignr $4, X2, X3, XTMP0      # XTMP0 = W[-7]
+	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	xor     g, y2                   # y2 = f^g
+	vpaddd  X0, XTMP0, XTMP0        # XTMP0 = W[-7] + W[-16]
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	## compute s0
+	vpalignr $4, X0, X1, XTMP1      # XTMP1 = W[-15]
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     y0, y2                  # y2 = S1 + CH
+	add     _XFER(%rsp), y2         # y2 = k + w + S1 + CH
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	vpsrld  $7, XTMP1, XTMP2
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	vpslld  $(32-7), XTMP1, XTMP3
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	vpor    XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+	mov     e, y0                   # y0 = e
+	mov     a, y1                   # y1 = a
+	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
+	vpsrld  $18, XTMP1, XTMP2       #
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor     g, y2                   # y2 = f^g
+	vpsrld  $3, XTMP1, XTMP4        # XTMP4 = W[-15] >> 3
+	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	vpslld  $(32-18), XTMP1, XTMP1
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	vpxor   XTMP1, XTMP3, XTMP3     #
+	add     y0, y2                  # y2 = S1 + CH
+	add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	vpxor   XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	vpxor   XTMP4, XTMP3, XTMP1     # XTMP1 = s0
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	## compute low s1
+	vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	vpaddd  XTMP1, XTMP0, XTMP0     # XTMP0 = W[-16] + W[-7] + s0
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+	mov     e, y0                   # y0 = e
+	mov     a, y1                   # y1 = a
+	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
+	mov     f, y2                   # y2 = f
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	vpsrld  $10, XTMP2, XTMP4       # XTMP4 = W[-2] >> 10 {BBAA}
+	xor     g, y2                   # y2 = f^g
+	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xBxA}
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xBxA}
+	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	vpxor   XTMP3, XTMP2, XTMP2     #
+	add     y0, y2                  # y2 = S1 + CH
+	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	vpxor   XTMP2, XTMP4, XTMP4     # XTMP4 = s1 {xBxA}
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	vpaddd  XTMP4, XTMP0, XTMP0     # XTMP0 = {..., ..., W[1], W[0]}
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	## compute high s1
+	vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+	mov     e, y0                   # y0 = e
+	MY_ROR  (25-11), y0             # y0 = e >> (25-11)
+	mov     a, y1                   # y1 = a
+	MY_ROR  (22-13), y1             # y1 = a >> (22-13)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	vpsrld  $10, XTMP2, XTMP5       # XTMP5 = W[-2] >> 10 {DDCC}
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	xor     g, y2                   # y2 = f^g
+	vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xDxC}
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xDxC}
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	vpxor   XTMP3, XTMP2, XTMP2
+	MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     y0, y2                  # y2 = S1 + CH
+	add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	vpxor   XTMP2, XTMP5, XTMP5     # XTMP5 = s1 {xDxC}
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	vpaddd  XTMP0, XTMP5, X0        # X0 = {W[3], W[2], W[1], W[0]}
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+	rotate_Xs
+.endm
+
+## input is [rsp + _XFER + \round * 4]
+.macro DO_ROUND round
+	mov	e, y0			# y0 = e
+        MY_ROR  (25-11), y0             # y0 = e >> (25-11)
+        mov     a, y1                   # y1 = a
+        xor     e, y0                   # y0 = e ^ (e >> (25-11))
+        MY_ROR  (22-13), y1             # y1 = a >> (22-13)
+        mov     f, y2                   # y2 = f
+        xor     a, y1                   # y1 = a ^ (a >> (22-13))
+        MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
+        xor     g, y2                   # y2 = f^g
+        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+        MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
+        and     e, y2                   # y2 = (f^g)&e
+        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+        MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+        add     y0, y2                  # y2 = S1 + CH
+        MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+        offset = \round * 4 + _XFER     #
+        add     offset(%rsp), y2	# y2 = k + w + S1 + CH
+        mov     a, y0			# y0 = a
+        add     y2, h                   # h = h + S1 + CH + k + w
+        mov     a, y2                   # y2 = a
+        or      c, y0                   # y0 = a|c
+        add     h, d                    # d = d + h + S1 + CH + k + w
+        and     c, y2                   # y2 = a&c
+        and     b, y0                   # y0 = (a|c)&b
+        add     y1, h                   # h = h + S1 + CH + k + w + S0
+        or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+        ROTATE_ARGS
+.endm
+
+########################################################################
+## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
+## arg 1 : pointer to input data
+## arg 2 : pointer to digest
+## arg 3 : Num blocks
+########################################################################
+.text
+ENTRY(sha256_transform_avx)
+.align 32
+	pushq   %rbx
+	pushq   %rbp
+	pushq   %r13
+	pushq   %r14
+	pushq   %r15
+	pushq   %r12
+
+	mov	%rsp, %r12
+	subq    $STACK_SIZE, %rsp	# allocate stack space
+	and	$~15, %rsp		# align stack pointer
+
+	shl     $6, NUM_BLKS		# convert to bytes
+	jz      done_hash
+	add     INP, NUM_BLKS		# pointer to end of data
+	mov     NUM_BLKS, _INP_END(%rsp)
+
+	## load initial digest
+	mov     4*0(CTX), a
+	mov     4*1(CTX), b
+	mov     4*2(CTX), c
+	mov     4*3(CTX), d
+	mov     4*4(CTX), e
+	mov     4*5(CTX), f
+	mov     4*6(CTX), g
+	mov     4*7(CTX), h
+
+	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
+	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00
+loop0:
+	lea     K256(%rip), TBL
+
+	## byte swap first 16 dwords
+	COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK
+
+	mov     INP, _INP(%rsp)
+
+	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
+	mov     $3, SRND
+.align 16
+loop1:
+	vpaddd  (TBL), X0, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddd  1*16(TBL), X0, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddd  2*16(TBL), X0, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddd  3*16(TBL), X0, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	add	$4*16, TBL
+	FOUR_ROUNDS_AND_SCHED
+
+	sub     $1, SRND
+	jne     loop1
+
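+	## the message schedule is complete; the last 16 rounds consume X0..X3 directly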
+	mov     $2, SRND
+loop2:
+	vpaddd  (TBL), X0, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	DO_ROUND        0
+	DO_ROUND        1
+	DO_ROUND        2
+	DO_ROUND        3
+
+	vpaddd  1*16(TBL), X1, XFER
+	vmovdqa XFER, _XFER(%rsp)
+	add     $2*16, TBL
+	DO_ROUND        0
+	DO_ROUND        1
+	DO_ROUND        2
+	DO_ROUND        3
+
+	vmovdqa X2, X0
+	vmovdqa X3, X1
+
+	sub     $1, SRND
+	jne     loop2
+
+	addm    (4*0)(CTX),a
+	addm    (4*1)(CTX),b
+	addm    (4*2)(CTX),c
+	addm    (4*3)(CTX),d
+	addm    (4*4)(CTX),e
+	addm    (4*5)(CTX),f
+	addm    (4*6)(CTX),g
+	addm    (4*7)(CTX),h
+
+	mov     _INP(%rsp), INP
+	add     $64, INP
+	cmp     _INP_END(%rsp), INP
+	jne     loop0
+
+done_hash:
+
+	mov	%r12, %rsp
+
+	popq	%r12
+	popq    %r15
+	popq    %r14
+	popq    %r13
+	popq    %rbp
+	popq    %rbx
+	ret
+ENDPROC(sha256_transform_avx)
+
+.data
+.align 64
+K256:
+	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
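+# vpshufb mask that byte-swaps each 32-bit dword of the input block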
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa 0x0c0d0e0f08090a0b0405060700010203
+
+# shuffle xBxA -> 00BA
+_SHUF_00BA:
+	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+# shuffle xDxC -> DC00
+_SHUF_DC00:
+	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF
+#endif

+ 772 - 0
arch/x86/crypto/sha256-avx2-asm.S

@@ -0,0 +1,772 @@
+########################################################################
+# Implement fast SHA-256 with AVX2 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 2 blocks at a time, with 4 lanes per block
+########################################################################
+
+#ifdef CONFIG_AS_AVX2
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define	VMOVDQ vmovdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+	add	\p1, \p2
+	mov	\p2, \p1
+.endm
+
+################################
+
+X0 = %ymm4
+X1 = %ymm5
+X2 = %ymm6
+X3 = %ymm7
+
+# XMM versions of above
+XWORD0 = %xmm4
+XWORD1 = %xmm5
+XWORD2 = %xmm6
+XWORD3 = %xmm7
+
+XTMP0 = %ymm0
+XTMP1 = %ymm1
+XTMP2 = %ymm2
+XTMP3 = %ymm3
+XTMP4 = %ymm8
+XFER  = %ymm9
+XTMP5 = %ymm11
+
+SHUF_00BA =	%ymm10 # shuffle xBxA -> 00BA
+SHUF_DC00 =	%ymm12 # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %ymm13
+
+X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
+
+NUM_BLKS = %rdx	# 3rd arg
+CTX	= %rsi  # 2nd arg
+INP	= %rdi	# 1st arg
+c	= %ecx
+d	= %r8d
+e       = %edx	# clobbers NUM_BLKS
+y3	= %edi	# clobbers INP
+
+
+TBL	= %rbp
+SRND	= CTX	# SRND is same register as CTX
+
+a = %eax
+b = %ebx
+f = %r9d
+g = %r10d
+h = %r11d
+old_h = %r11d
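+# old_h aliases the pre-rotation h; DO_4ROUNDS defers its final additions
+# into the start of the next round to shorten the dependency chain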
+
+T1 = %r12d
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
+_XMM_SAVE_SIZE	= 0
+_INP_END_SIZE	= 8
+_INP_SIZE	= 8
+_CTX_SIZE	= 8
+_RSP_SIZE	= 8
+
+_XFER		= 0
+_XMM_SAVE	= _XFER     + _XFER_SIZE
+_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
+_INP		= _INP_END  + _INP_END_SIZE
+_CTX		= _INP      + _INP_SIZE
+_RSP		= _CTX      + _CTX_SIZE
+STACK_SIZE	= _RSP      + _RSP_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+	X_ = X0
+	X0 = X1
+	X1 = X2
+	X2 = X3
+	X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+	old_h = h
+	TMP_ = h
+	h = g
+	g = f
+	f = e
+	e = d
+	d = c
+	c = b
+	b = a
+	a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED disp
+################################### RND N + 0 ############################
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+
+	addl	\disp(%rsp, SRND), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+	vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	xor	g, y2		# y2 = f^g                              # CH
+	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	add	h, d		# d = k + w + h + d                     # --
+
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	vpsrld	$7, XTMP1, XTMP2
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+
+	add	y0, y2		# y2 = S1 + CH                          # --
+	vpslld	$(32-7), XTMP1, XTMP3
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7
+
+	vpsrld	$18, XTMP1, XTMP2
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+
+	ROTATE_ARGS
+
+################################### RND N + 1 ############################
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	offset = \disp + 1*4
+	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+
+	vpsrld	$3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	xor	g, y2		# y2 = f^g                              # CH
+
+
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	h, d		# d = k + w + h + d                     # --
+
+	vpslld	$(32-18), XTMP1, XTMP1
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+
+	vpxor	XTMP1, XTMP3, XTMP3
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+
+	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
+	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	vpsrld	$10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
+
+
+	ROTATE_ARGS
+
+################################### RND N + 2 ############################
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	offset = \disp + 2*4
+	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
+
+	vpsrlq	$19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	or	c, y3		# y3 = a|c                              # MAJA
+	mov	f, y2		# y2 = f                                # CH
+	xor	g, y2		# y2 = f^g                              # CH
+
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
+	and	e, y2		# y2 = (f^g)&e                          # CH
+
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	vpxor	XTMP3, XTMP2, XTMP2
+	add	h, d		# d = k + w + h + d                     # --
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+
+	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}
+
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1,h		# h = k + w + h + S0                    # --
+	add	y2,d		# d = k + w + h + d + S1 + CH = d + t1  # --
+	add	y2,h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+	add	y3,h		# h = t1 + S0 + MAJ                     # --
+
+
+	ROTATE_ARGS
+
+################################### RND N + 3 ############################
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	offset = \disp + 3*4
+	addl	offset(%rsp, SRND), h	# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+
+	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	xor	g, y2		# y2 = f^g                              # CH
+
+
+	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	h, d		# d = k + w + h + d                     # --
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+
+	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+
+	vpxor	XTMP3, XTMP2, XTMP2
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}
+
+	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+
+	add	y1, h		# h = k + w + h + S0                    # --
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	ROTATE_ARGS
+	rotate_Xs
+.endm
+
+.macro DO_4ROUNDS disp
+################################### RND N + 0 ###########################
+
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+	addl	\disp(%rsp, SRND), h		# h = k + w + h # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	ROTATE_ARGS
+
+################################### RND N + 1 ###########################
+
+	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+	offset = 4*1 + \disp
+	addl	offset(%rsp, SRND), h		# h = k + w + h # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	ROTATE_ARGS
+
+################################### RND N + 2 ##############################
+
+	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+	offset = 4*2 + \disp
+	addl	offset(%rsp, SRND), h		# h = k + w + h # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	ROTATE_ARGS
+
+################################### RND N + 3 ###########################
+
+	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$25, e, y0	# y0 = e >> 25				# S1A
+	rorx	$11, e, y1	# y1 = e >> 11				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
+	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
+
+	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
+	rorx	$13, a, T1	# T1 = a >> 13				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$22, a, y1	# y1 = a >> 22				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
+	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
+	offset = 4*3 + \disp
+	addl	offset(%rsp, SRND), h		# h = k + w + h # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	ROTATE_ARGS
+
+.endm
+
+########################################################################
+## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
+## arg 1 : pointer to input data
+## arg 2 : pointer to digest
+## arg 3 : Num blocks
+########################################################################
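+## A minimal C-side sketch of a call (using the asmlinkage prototype the
+## sha256 glue code in this series declares for this routine):
+##   sha256_transform_rorx(data, sctx->state, num_blks);
+## where data points to num_blks 64-byte blocks and sctx->state holds the
+## eight u32 words of the running digest.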
+.text
+ENTRY(sha256_transform_rorx)
+.align 32
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+	mov	%rsp, %rax
+	subq	$STACK_SIZE, %rsp
+	and	$-32, %rsp	# align rsp to 32 byte boundary
+	mov	%rax, _RSP(%rsp)
+
+
+	shl	$6, NUM_BLKS	# convert to bytes
+	jz	done_hash
+	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
+	mov	NUM_BLKS, _INP_END(%rsp)
+
+	cmp	NUM_BLKS, INP
+	je	only_one_block
+
+	## load initial digest
+	mov	(CTX), a
+	mov	4*1(CTX), b
+	mov	4*2(CTX), c
+	mov	4*3(CTX), d
+	mov	4*4(CTX), e
+	mov	4*5(CTX), f
+	mov	4*6(CTX), g
+	mov	4*7(CTX), h
+
+	vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+	vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
+	vmovdqa  _SHUF_DC00(%rip), SHUF_DC00
+
+	mov	CTX, _CTX(%rsp)
+
+loop0:
+	lea     K256(%rip), TBL
+
+	## Load first 16 dwords from two blocks
+	VMOVDQ	0*32(INP),XTMP0
+	VMOVDQ	1*32(INP),XTMP1
+	VMOVDQ	2*32(INP),XTMP2
+	VMOVDQ	3*32(INP),XTMP3
+
+	## byte swap data
+	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
+	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
+	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
+	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3
+
+	## transpose data into high/low halves
+	vperm2i128	$0x20, XTMP2, XTMP0, X0
+	vperm2i128	$0x31, XTMP2, XTMP0, X1
+	vperm2i128	$0x20, XTMP3, XTMP1, X2
+	vperm2i128	$0x31, XTMP3, XTMP1, X3
+
+last_block_enter:
+	add	$64, INP
+	mov	INP, _INP(%rsp)
+
+	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
+	xor	SRND, SRND
+
+.align 16
+loop1:
+	vpaddd	0*32(TBL, SRND), X0, XFER
+	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
+	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32
+
+	vpaddd	1*32(TBL, SRND), X0, XFER
+	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
+	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32
+
+	vpaddd	2*32(TBL, SRND), X0, XFER
+	vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
+	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32
+
+	vpaddd	3*32(TBL, SRND), X0, XFER
+	vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
+	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32
+
+	add	$4*32, SRND
+	cmp	$3*4*32, SRND
+	jb	loop1
+
+loop2:
+	## Do last 16 rounds with no scheduling
+	vpaddd	0*32(TBL, SRND), X0, XFER
+	vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
+	DO_4ROUNDS	_XFER + 0*32
+	vpaddd	1*32(TBL, SRND), X1, XFER
+	vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
+	DO_4ROUNDS	_XFER + 1*32
+	add	$2*32, SRND
+
+	vmovdqa	X2, X0
+	vmovdqa	X3, X1
+
+	cmp	$4*4*32, SRND
+	jb	loop2
+
+	mov	_CTX(%rsp), CTX
+	mov	_INP(%rsp), INP
+
+	addm    (4*0)(CTX),a
+	addm    (4*1)(CTX),b
+	addm    (4*2)(CTX),c
+	addm    (4*3)(CTX),d
+	addm    (4*4)(CTX),e
+	addm    (4*5)(CTX),f
+	addm    (4*6)(CTX),g
+	addm    (4*7)(CTX),h
+
+	cmp	_INP_END(%rsp), INP
+	ja	done_hash
+
+	#### Do second block using previously scheduled results
+	xor	SRND, SRND
+.align 16
+loop3:
+	DO_4ROUNDS	 _XFER + 0*32 + 16
+	DO_4ROUNDS	 _XFER + 1*32 + 16
+	add	$2*32, SRND
+	cmp	$4*4*32, SRND
+	jb	loop3
+
+	mov	_CTX(%rsp), CTX
+	mov	_INP(%rsp), INP
+	add	$64, INP
+
+	addm    (4*0)(CTX),a
+	addm    (4*1)(CTX),b
+	addm    (4*2)(CTX),c
+	addm    (4*3)(CTX),d
+	addm    (4*4)(CTX),e
+	addm    (4*5)(CTX),f
+	addm    (4*6)(CTX),g
+	addm    (4*7)(CTX),h
+
+	cmp	_INP_END(%rsp), INP
+	jb	loop0
+	ja	done_hash
+
+do_last_block:
+	#### do last block
+	lea	K256(%rip), TBL
+
+	VMOVDQ	0*16(INP),XWORD0
+	VMOVDQ	1*16(INP),XWORD1
+	VMOVDQ	2*16(INP),XWORD2
+	VMOVDQ	3*16(INP),XWORD3
+
+	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
+	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
+	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
+	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3
+
+	jmp	last_block_enter
+
+only_one_block:
+
+	## load initial digest
+	mov	(4*0)(CTX),a
+	mov	(4*1)(CTX),b
+	mov	(4*2)(CTX),c
+	mov	(4*3)(CTX),d
+	mov	(4*4)(CTX),e
+	mov	(4*5)(CTX),f
+	mov	(4*6)(CTX),g
+	mov	(4*7)(CTX),h
+
+	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
+	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00
+
+	mov	CTX, _CTX(%rsp)
+	jmp	do_last_block
+
+done_hash:
+
+	mov	_RSP(%rsp), %rsp
+
+	popq	%r15
+	popq	%r14
+	popq	%r13
+	popq	%r12
+	popq	%rbp
+	popq	%rbx
+	ret
+ENDPROC(sha256_transform_rorx)
+
+.data
+.align 64
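+# SHA-256 round constants; each group of four is stored twice so a single
+# 256-bit load supplies both 128-bit lanes of the two interleaved blocks.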
+K256:
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
+
+# shuffle xBxA -> 00BA
+_SHUF_00BA:
+	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+# shuffle xDxC -> DC00
+_SHUF_DC00:
+	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
+#endif

+ 506 - 0
arch/x86/crypto/sha256-ssse3-asm.S

@@ -0,0 +1,506 @@
+########################################################################
+# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-256 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#include <linux/linkage.h>
+
+## assume buffers not aligned
+#define    MOVDQ movdqu
+
+################################ Define Macros
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+        add     \p1, \p2
+        mov     \p2, \p1
+.endm
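+# e.g. "addm (4*0)(CTX), a" computes a += [CTX], then stores the sum back
+# to [CTX]; used below to merge each block's result into the digest.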
+
+################################
+
+# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+# Load xmm with mem and byte swap each dword
+.macro COPY_XMM_AND_BSWAP p1 p2 p3
+        MOVDQ \p2, \p1
+        pshufb \p3, \p1
+.endm
+
+################################
+
+X0 = %xmm4
+X1 = %xmm5
+X2 = %xmm6
+X3 = %xmm7
+
+XTMP0 = %xmm0
+XTMP1 = %xmm1
+XTMP2 = %xmm2
+XTMP3 = %xmm3
+XTMP4 = %xmm8
+XFER = %xmm9
+
+SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
+SHUF_DC00 = %xmm11      # shuffle xDxC -> DC00
+BYTE_FLIP_MASK = %xmm12
+
+NUM_BLKS = %rdx   # 3rd arg
+CTX = %rsi        # 2nd arg
+INP = %rdi        # 1st arg
+
+SRND = %rdi       # clobbers INP
+c = %ecx
+d = %r8d
+e = %edx
+TBL = %rbp
+a = %eax
+b = %ebx
+
+f = %r9d
+g = %r10d
+h = %r11d
+
+y0 = %r13d
+y1 = %r14d
+y2 = %r15d
+
+
+
+_INP_END_SIZE = 8
+_INP_SIZE = 8
+_XFER_SIZE = 8
+_XMM_SAVE_SIZE = 0
+
+_INP_END = 0
+_INP            = _INP_END  + _INP_END_SIZE
+_XFER           = _INP      + _INP_SIZE
+_XMM_SAVE       = _XFER     + _XFER_SIZE
+STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE
+
+# rotate_Xs
+# Rotate values of symbols X0...X3
+.macro rotate_Xs
+X_ = X0
+X0 = X1
+X1 = X2
+X2 = X3
+X3 = X_
+.endm
+
+# ROTATE_ARGS
+# Rotate values of symbols a...h
+.macro ROTATE_ARGS
+TMP_ = h
+h = g
+g = f
+f = e
+e = d
+d = c
+c = b
+b = a
+a = TMP_
+.endm
+
+.macro FOUR_ROUNDS_AND_SCHED
+	## compute s0 four at a time and s1 two at a time
+	## compute W[-16] + W[-7] 4 at a time
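+	##
+	## SHA-256 message schedule, for reference:
+	##   s0(x) = (x ror 7) ^ (x ror 18) ^ (x >> 3)
+	##   s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
+	##   W[t]  = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]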
+	movdqa  X3, XTMP0
+	mov     e, y0			# y0 = e
+	ror     $(25-11), y0            # y0 = e >> (25-11)
+	mov     a, y1                   # y1 = a
+	palignr $4, X2, XTMP0           # XTMP0 = W[-7]
+	ror     $(22-13), y1            # y1 = a >> (22-13)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	movdqa  X1, XTMP1
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	xor     g, y2                   # y2 = f^g
+	paddd   X0, XTMP0               # XTMP0 = W[-7] + W[-16]
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	## compute s0
+	palignr $4, X0, XTMP1           # XTMP1 = W[-15]
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	movdqa  XTMP1, XTMP2            # XTMP2 = W[-15]
+	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     y0, y2                  # y2 = S1 + CH
+	add     _XFER(%rsp), y2         # y2 = k + w + S1 + CH
+	movdqa  XTMP1, XTMP3            # XTMP3 = W[-15]
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	pslld   $(32-7), XTMP1          #
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	psrld   $7, XTMP2               #
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	por     XTMP2, XTMP1            # XTMP1 = W[-15] ror 7
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+					#
+	ROTATE_ARGS                     #
+	movdqa  XTMP3, XTMP2            # XTMP2 = W[-15]
+	mov     e, y0                   # y0 = e
+	mov     a, y1                   # y1 = a
+	movdqa  XTMP3, XTMP4            # XTMP4 = W[-15]
+	ror     $(25-11), y0            # y0 = e >> (25-11)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	ror     $(22-13), y1            # y1 = a >> (22-13)
+	pslld   $(32-18), XTMP3         #
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor     g, y2                   # y2 = f^g
+	psrld   $18, XTMP2              #
+	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	pxor    XTMP3, XTMP1
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	psrld   $3, XTMP4               # XTMP4 = W[-15] >> 3
+	add     y0, y2                  # y2 = S1 + CH
+	add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	pxor    XTMP2, XTMP1            # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	pxor    XTMP4, XTMP1            # XTMP1 = s0
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	## compute low s1
+	pshufd  $0b11111010, X3, XTMP2   # XTMP2 = W[-2] {BBAA}
+	and     b, y0			# y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	paddd   XTMP1, XTMP0            # XTMP0 = W[-16] + W[-7] + s0
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+
+	ROTATE_ARGS
+	movdqa  XTMP2, XTMP3            # XTMP3 = W[-2] {BBAA}
+	mov     e, y0                   # y0 = e
+	mov     a, y1                   # y1 = a
+	ror     $(25-11), y0            # y0 = e >> (25-11)
+	movdqa  XTMP2, XTMP4            # XTMP4 = W[-2] {BBAA}
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	ror     $(22-13), y1            # y1 = a >> (22-13)
+	mov     f, y2                   # y2 = f
+	xor     a, y1                   # y1 = a ^ (a >> (22-13))
+	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	psrlq   $17, XTMP2              # XTMP2 = W[-2] ror 17 {xBxA}
+	xor     g, y2                   # y2 = f^g
+	psrlq   $19, XTMP3              # XTMP3 = W[-2] ror 19 {xBxA}
+	xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	psrld   $10, XTMP4              # XTMP4 = W[-2] >> 10 {BBAA}
+	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	pxor    XTMP3, XTMP2
+	add     y0, y2                  # y2 = S1 + CH
+	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	pxor    XTMP2, XTMP4            # XTMP4 = s1 {xBxA}
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	pshufb  SHUF_00BA, XTMP4        # XTMP4 = s1 {00BA}
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	paddd   XTMP4, XTMP0            # XTMP0 = {..., ..., W[1], W[0]}
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	## compute high s1
+	pshufd  $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+					#
+	ROTATE_ARGS                     #
+	movdqa  XTMP2, XTMP3            # XTMP3 = W[-2] {DDCC}
+	mov     e, y0                   # y0 = e
+	ror     $(25-11), y0            # y0 = e >> (25-11)
+	mov     a, y1                   # y1 = a
+	movdqa  XTMP2, X0               # X0    = W[-2] {DDCC}
+	ror     $(22-13), y1            # y1 = a >> (22-13)
+	xor     e, y0                   # y0 = e ^ (e >> (25-11))
+	mov     f, y2                   # y2 = f
+	ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	psrlq   $17, XTMP2              # XTMP2 = W[-2] ror 17 {xDxC}
+	xor	a, y1                   # y1 = a ^ (a >> (22-13))
+	xor     g, y2                   # y2 = f^g
+	psrlq   $19, XTMP3              # XTMP3 = W[-2] ror 19 {xDxC}
+	xor	e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	and     e, y2                   # y2 = (f^g)&e
+	ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	psrld   $10, X0                 # X0 = W[-2] >> 10 {DDCC}
+	xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	xor     g, y2                   # y2 = CH = ((f^g)&e)^g
+	pxor    XTMP3, XTMP2            #
+	ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	add     y0, y2                  # y2 = S1 + CH
+	add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
+	pxor    XTMP2, X0               # X0 = s1 {xDxC}
+	mov     a, y0                   # y0 = a
+	add     y2, h                   # h = h + S1 + CH + k + w
+	mov     a, y2                   # y2 = a
+	pshufb  SHUF_DC00, X0           # X0 = s1 {DC00}
+	or      c, y0                   # y0 = a|c
+	add     h, d                    # d = d + h + S1 + CH + k + w
+	and     c, y2                   # y2 = a&c
+	paddd   XTMP0, X0               # X0 = {W[3], W[2], W[1], W[0]}
+	and     b, y0                   # y0 = (a|c)&b
+	add     y1, h                   # h = h + S1 + CH + k + w + S0
+	or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
+
+	ROTATE_ARGS
+	rotate_Xs
+.endm
+
+## input is [rsp + _XFER + \round * 4]
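+## One SHA-256 round, for reference (computed below on y0/y1/y2):
+##   S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
+##   CH  = (e & f) ^ (~e & g)          (computed here as ((f^g)&e)^g)
+##   S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
+##   MAJ = (a & b) ^ (a & c) ^ (b & c) (computed here as ((a|c)&b)|(a&c))
+##   t1 = h + S1 + CH + K[t] + W[t]; t2 = S0 + MAJ; d += t1; h = t1 + t2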
+.macro DO_ROUND round
+	mov     e, y0                 # y0 = e
+	ror     $(25-11), y0          # y0 = e >> (25-11)
+	mov     a, y1                 # y1 = a
+	xor     e, y0                 # y0 = e ^ (e >> (25-11))
+	ror     $(22-13), y1          # y1 = a >> (22-13)
+	mov     f, y2                 # y2 = f
+	xor     a, y1                 # y1 = a ^ (a >> (22-13))
+	ror     $(11-6), y0           # y0 = (e >> (11-6)) ^ (e >> (25-6))
+	xor     g, y2                 # y2 = f^g
+	xor     e, y0                 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+	ror     $(13-2), y1           # y1 = (a >> (13-2)) ^ (a >> (22-2))
+	and     e, y2                 # y2 = (f^g)&e
+	xor     a, y1                 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror     $6, y0                # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+	xor     g, y2                 # y2 = CH = ((f^g)&e)^g
+	add     y0, y2                # y2 = S1 + CH
+	ror     $2, y1                # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+	offset = \round * 4 + _XFER
+	add     offset(%rsp), y2      # y2 = k + w + S1 + CH
+	mov     a, y0                 # y0 = a
+	add     y2, h                 # h = h + S1 + CH + k + w
+	mov     a, y2                 # y2 = a
+	or      c, y0                 # y0 = a|c
+	add     h, d                  # d = d + h + S1 + CH + k + w
+	and     c, y2                 # y2 = a&c
+	and     b, y0                 # y0 = (a|c)&b
+	add     y1, h                 # h = h + S1 + CH + k + w + S0
+	or      y2, y0		      # y0 = MAJ = ((a|c)&b)|(a&c)
+	add     y0, h		      # h = h + S1 + CH + k + w + S0 + MAJ
+	ROTATE_ARGS
+.endm
+
+########################################################################
+## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks)
+## arg 1 : pointer to input data
+## arg 2 : pointer to digest
+## arg 3 : Num blocks
+########################################################################
+.text
+ENTRY(sha256_transform_ssse3)
+.align 32
+	pushq   %rbx
+	pushq   %rbp
+	pushq   %r13
+	pushq   %r14
+	pushq   %r15
+	pushq   %r12
+
+	mov	%rsp, %r12
+	subq    $STACK_SIZE, %rsp
+	and	$~15, %rsp
+
+	shl     $6, NUM_BLKS		 # convert to bytes
+	jz      done_hash
+	add     INP, NUM_BLKS
+	mov     NUM_BLKS, _INP_END(%rsp) # pointer to end of data
+
+	## load initial digest
+	mov     4*0(CTX), a
+	mov     4*1(CTX), b
+	mov     4*2(CTX), c
+	mov     4*3(CTX), d
+	mov     4*4(CTX), e
+	mov     4*5(CTX), f
+	mov     4*6(CTX), g
+	mov     4*7(CTX), h
+
+	movdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+	movdqa  _SHUF_00BA(%rip), SHUF_00BA
+	movdqa  _SHUF_DC00(%rip), SHUF_DC00
+
+loop0:
+	lea     K256(%rip), TBL
+
+	## byte swap first 16 dwords
+	COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
+	COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK
+
+	mov     INP, _INP(%rsp)
+
+	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
+	mov     $3, SRND
+.align 16
+loop1:
+	movdqa  (TBL), XFER
+	paddd   X0, XFER
+	movdqa  XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa  1*16(TBL), XFER
+	paddd   X0, XFER
+	movdqa  XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa  2*16(TBL), XFER
+	paddd   X0, XFER
+	movdqa  XFER, _XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	movdqa  3*16(TBL), XFER
+	paddd   X0, XFER
+	movdqa  XFER, _XFER(%rsp)
+	add     $4*16, TBL
+	FOUR_ROUNDS_AND_SCHED
+
+	sub     $1, SRND
+	jne     loop1
+
+	mov     $2, SRND
+loop2:
+	paddd   (TBL), X0
+	movdqa  X0, _XFER(%rsp)
+	DO_ROUND        0
+	DO_ROUND        1
+	DO_ROUND        2
+	DO_ROUND        3
+	paddd   1*16(TBL), X1
+	movdqa  X1, _XFER(%rsp)
+	add     $2*16, TBL
+	DO_ROUND        0
+	DO_ROUND        1
+	DO_ROUND        2
+	DO_ROUND        3
+
+	movdqa  X2, X0
+	movdqa  X3, X1
+
+	sub     $1, SRND
+	jne     loop2
+
+	addm    (4*0)(CTX),a
+	addm    (4*1)(CTX),b
+	addm    (4*2)(CTX),c
+	addm    (4*3)(CTX),d
+	addm    (4*4)(CTX),e
+	addm    (4*5)(CTX),f
+	addm    (4*6)(CTX),g
+	addm    (4*7)(CTX),h
+
+	mov     _INP(%rsp), INP
+	add     $64, INP
+	cmp     _INP_END(%rsp), INP
+	jne     loop0
+
+done_hash:
+
+	mov	%r12, %rsp
+
+	popq    %r12
+	popq    %r15
+	popq    %r14
+	popq    %r13
+	popq    %rbp
+	popq    %rbx
+
+	ret
+ENDPROC(sha256_transform_ssse3)
+
+.data
+.align 64
+K256:
+        .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+        .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+        .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+        .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+        .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+        .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+        .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+        .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+        .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+        .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+        .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+        .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+        .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+        .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+        .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+        .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa 0x0c0d0e0f08090a0b0405060700010203
+
+# shuffle xBxA -> 00BA
+_SHUF_00BA:
+	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100
+
+# shuffle xDxC -> DC00
+_SHUF_DC00:
+	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF

+ 275 - 0
arch/x86/crypto/sha256_ssse3_glue.c

@@ -0,0 +1,275 @@
+/*
+ * Cryptographic API.
+ *
+ * Glue code for the SHA256 Secure Hash Algorithm assembler
+ * implementation using supplemental SSE3 / AVX / AVX2 instructions.
+ *
+ * This file is based on sha256_generic.c
+ *
+ * Copyright (C) 2013 Intel Corporation.
+ *
+ * Author:
+ *     Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <linux/string.h>
+
+asmlinkage void sha256_transform_ssse3(const char *data, u32 *digest,
+				     u64 rounds);
+#ifdef CONFIG_AS_AVX
+asmlinkage void sha256_transform_avx(const char *data, u32 *digest,
+				     u64 rounds);
+#endif
+#ifdef CONFIG_AS_AVX2
+asmlinkage void sha256_transform_rorx(const char *data, u32 *digest,
+				     u64 rounds);
+#endif
+
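+/*
+ * Dispatch pointer: sha256_ssse3_mod_init() below points this at the
+ * fastest transform (SSSE3, AVX or AVX2) the CPU and assembler support.
+ */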
+static asmlinkage void (*sha256_transform_asm)(const char *, u32 *, u64);
+
+
+static int sha256_ssse3_init(struct shash_desc *desc)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	sctx->state[0] = SHA256_H0;
+	sctx->state[1] = SHA256_H1;
+	sctx->state[2] = SHA256_H2;
+	sctx->state[3] = SHA256_H3;
+	sctx->state[4] = SHA256_H4;
+	sctx->state[5] = SHA256_H5;
+	sctx->state[6] = SHA256_H6;
+	sctx->state[7] = SHA256_H7;
+	sctx->count = 0;
+
+	return 0;
+}
+
+static int __sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int len, unsigned int partial)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int done = 0;
+
+	sctx->count += len;
+
+	if (partial) {
+		done = SHA256_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha256_transform_asm(sctx->buf, sctx->state, 1);
+	}
+
+	if (len - done >= SHA256_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+		sha256_transform_asm(data + done, sctx->state, (u64) rounds);
+
+		done += rounds * SHA256_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buf, data + done, len - done);
+
+	return 0;
+}
+
+static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
+			     unsigned int len)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+	int res;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA256_BLOCK_SIZE) {
+		sctx->count += len;
+		memcpy(sctx->buf + partial, data, len);
+
+		return 0;
+	}
+
+	if (!irq_fpu_usable()) {
+		res = crypto_sha256_update(desc, data, len);
+	} else {
+		kernel_fpu_begin();
+		res = __sha256_ssse3_update(desc, data, len, partial);
+		kernel_fpu_end();
+	}
+
+	return res;
+}
+
+
+/* Add padding and return the message digest. */
+static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be32 *dst = (__be32 *)out;
+	__be64 bits;
+	static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+	bits = cpu_to_be64(sctx->count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->count % SHA256_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
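+	/* e.g. index 50 -> padlen = 6; index 60 -> padlen = 64+56-60 = 60,
+	 * so the padding spills into a second block */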
+
+	if (!irq_fpu_usable()) {
+		crypto_sha256_update(desc, padding, padlen);
+		crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits));
+	} else {
+		kernel_fpu_begin();
+		/* We need to fill a whole block for __sha256_ssse3_update() */
+		if (padlen <= 56) {
+			sctx->count += padlen;
+			memcpy(sctx->buf + index, padding, padlen);
+		} else {
+			__sha256_ssse3_update(desc, padding, padlen, index);
+		}
+		__sha256_ssse3_update(desc, (const u8 *)&bits,
+					sizeof(bits), 56);
+		kernel_fpu_end();
+	}
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha256_ssse3_export(struct shash_desc *desc, void *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha256_ssse3_import(struct shash_desc *desc, const void *in)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static struct shash_alg alg = {
+	.digestsize	=	SHA256_DIGEST_SIZE,
+	.init		=	sha256_ssse3_init,
+	.update		=	sha256_ssse3_update,
+	.final		=	sha256_ssse3_final,
+	.export		=	sha256_ssse3_export,
+	.import		=	sha256_ssse3_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha256",
+		.cra_driver_name =	"sha256-ssse3",
+		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA256_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+#ifdef CONFIG_AS_AVX
+static bool __init avx_usable(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx || !cpu_has_osxsave)
+		return false;
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX detected but unusable.\n");
+
+		return false;
+	}
+
+	return true;
+}
+#endif
+
+static int __init sha256_ssse3_mod_init(void)
+{
+	/* test for SSE3 first */
+	/* test for SSSE3 first */
+		sha256_transform_asm = sha256_transform_ssse3;
+
+#ifdef CONFIG_AS_AVX
+	/* allow AVX to override SSSE3, it's a little faster */
+	if (avx_usable()) {
+#ifdef CONFIG_AS_AVX2
+		if (boot_cpu_has(X86_FEATURE_AVX2))
+			sha256_transform_asm = sha256_transform_rorx;
+		else
+#endif
+			sha256_transform_asm = sha256_transform_avx;
+	}
+#endif
+
+	if (sha256_transform_asm) {
+#ifdef CONFIG_AS_AVX
+		if (sha256_transform_asm == sha256_transform_avx)
+			pr_info("Using AVX optimized SHA-256 implementation\n");
+#ifdef CONFIG_AS_AVX2
+		else if (sha256_transform_asm == sha256_transform_rorx)
+			pr_info("Using AVX2 optimized SHA-256 implementation\n");
+#endif
+		else
+#endif
+			pr_info("Using SSSE3 optimized SHA-256 implementation\n");
+		return crypto_register_shash(&alg);
+	}
+	pr_info("Neither AVX nor SSSE3 is available/usable.\n");
+
+	return -ENODEV;
+}
+
+static void __exit sha256_ssse3_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(sha256_ssse3_mod_init);
+module_exit(sha256_ssse3_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm, Supplemental SSE3 accelerated");
+
+MODULE_ALIAS("sha256");

+ 423 - 0
arch/x86/crypto/sha512-avx-asm.S

@@ -0,0 +1,423 @@
+########################################################################
+# Implement fast SHA-512 with AVX instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     David Cote <david.m.cote@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#ifdef CONFIG_AS_AVX
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+# ARG1
+msg	= %rdi
+# ARG2
+digest	= %rsi
+# ARG3
+msglen	= %rdx
+T1	= %rcx
+T2	= %r8
+a_64	= %r9
+b_64	= %r10
+c_64	= %r11
+d_64	= %r12
+e_64	= %r13
+f_64	= %r14
+g_64	= %r15
+h_64	= %rbx
+tmp0	= %rax
+
+# Local variables (stack frame)
+
+# Message Schedule
+W_SIZE = 80*8
+# W[t] + K[t] | W[t+1] + K[t+1]
+WK_SIZE = 2*8
+RSPSAVE_SIZE = 1*8
+GPRSAVE_SIZE = 5*8
+
+frame_W = 0
+frame_WK = frame_W + W_SIZE
+frame_RSPSAVE = frame_WK + WK_SIZE
+frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
+frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+
+# Useful QWORD "arrays" for simpler memory references
+# MSG, DIGEST, K_t, W_t are arrays
+# WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
+
+# Input message (arg1)
+#define MSG(i)    8*i(msg)
+
+# Output Digest (arg2)
+#define DIGEST(i) 8*i(digest)
+
+# SHA Constants (static mem)
+#define K_t(i)    8*i+K512(%rip)
+
+# Message Schedule (stack frame)
+#define W_t(i)    8*i+frame_W(%rsp)
+
+# W[t]+K[t] (stack frame)
+#define WK_2(i)   8*((i%2))+frame_WK(%rsp)
+
+.macro RotateState
+	# Rotate symbols a..h right
+	TMP   = h_64
+	h_64  = g_64
+	g_64  = f_64
+	f_64  = e_64
+	e_64  = d_64
+	d_64  = c_64
+	c_64  = b_64
+	b_64  = a_64
+	a_64  = TMP
+.endm
+
+.macro RORQ p1 p2
+	# shld is faster than ror on Sandybridge
+	shld	$(64-\p2), \p1, \p1
+.endm
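+# e.g. "RORQ tmp0, 23" rotates tmp0 right by 23: shld-ing a register with
+# itself by (64-n) rotates it left by 64-n, which equals ror by n.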
+
+.macro SHA512_Round rnd
+	# Compute Round %%t
+	mov     f_64, T1          # T1 = f
+	mov     e_64, tmp0        # tmp = e
+	xor     g_64, T1          # T1 = f ^ g
+	RORQ    tmp0, 23   # 41    # tmp = e ror 23
+	and     e_64, T1          # T1 = (f ^ g) & e
+	xor     e_64, tmp0        # tmp = (e ror 23) ^ e
+	xor     g_64, T1          # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+	idx = \rnd
+	add     WK_2(idx), T1     # W[t] + K[t] from message scheduler
+	RORQ    tmp0, 4   # 18    # tmp = ((e ror 23) ^ e) ror 4
+	xor     e_64, tmp0        # tmp = (((e ror 23) ^ e) ror 4) ^ e
+	mov     a_64, T2          # T2 = a
+	add     h_64, T1          # T1 = CH(e,f,g) + W[t] + K[t] + h
+	RORQ    tmp0, 14  # 14    # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+	add     tmp0, T1          # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+	mov     a_64, tmp0        # tmp = a
+	xor     c_64, T2          # T2 = a ^ c
+	and     c_64, tmp0        # tmp = a & c
+	and     b_64, T2          # T2 = (a ^ c) & b
+	xor     tmp0, T2          # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+	mov     a_64, tmp0        # tmp = a
+	RORQ    tmp0, 5  # 39     # tmp = a ror 5
+	xor     a_64, tmp0        # tmp = (a ror 5) ^ a
+	add     T1, d_64          # e(next_state) = d + T1
+	RORQ    tmp0, 6  # 34     # tmp = ((a ror 5) ^ a) ror 6
+	xor     a_64, tmp0        # tmp = (((a ror 5) ^ a) ror 6) ^ a
+	lea     (T1, T2), h_64    # a(next_state) = T1 + Maj(a,b,c)
+	RORQ    tmp0, 28  # 28    # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+	add     tmp0, h_64        # a(next_state) = T1 + Maj(a,b,c) S0(a)
+	RotateState
+.endm
+
+.macro SHA512_2Sched_2Round_avx rnd
+	# Compute rounds t-2 and t-1
+	# Compute message schedule QWORDS t and t+1
+
+	#   Two rounds are computed based on the values for K[t-2]+W[t-2] and
+	# K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+	# scheduler.
+	#   The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
+	# They are then added to their respective SHA512 constants at
+	# [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)]
+	#   For brevity, the comments following vectored instructions only refer to
+	# the first of a pair of QWORDS.
+	# Eg. XMM4=W[t-2] really means XMM4={W[t-2]|W[t-1]}
+	#   The computation of the message schedule and the rounds are tightly
+	# stitched to take advantage of instruction-level parallelism.
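+	#   SHA-512 message schedule, for reference:
+	#     s0(x) = (x ror 1) ^ (x ror 8) ^ (x >> 7)
+	#     s1(x) = (x ror 19) ^ (x ror 61) ^ (x >> 6)
+	#     W[t]  = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]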
+
+	idx = \rnd - 2
+	vmovdqa	W_t(idx), %xmm4		# XMM4 = W[t-2]
+	idx = \rnd - 15
+	vmovdqu	W_t(idx), %xmm5		# XMM5 = W[t-15]
+	mov	f_64, T1
+	vpsrlq	$61, %xmm4, %xmm0	# XMM0 = W[t-2]>>61
+	mov	e_64, tmp0
+	vpsrlq	$1, %xmm5, %xmm6	# XMM6 = W[t-15]>>1
+	xor	g_64, T1
+	RORQ	tmp0, 23 # 41
+	vpsrlq	$19, %xmm4, %xmm1	# XMM1 = W[t-2]>>19
+	and	e_64, T1
+	xor	e_64, tmp0
+	vpxor	%xmm1, %xmm0, %xmm0	# XMM0 = W[t-2]>>61 ^ W[t-2]>>19
+	xor	g_64, T1
+	idx = \rnd
+	add	WK_2(idx), T1
+	vpsrlq	$8, %xmm5, %xmm7	# XMM7 = W[t-15]>>8
+	RORQ	tmp0, 4 # 18
+	vpsrlq	$6, %xmm4, %xmm2	# XMM2 = W[t-2]>>6
+	xor	e_64, tmp0
+	mov	a_64, T2
+	add	h_64, T1
+	vpxor	%xmm7, %xmm6, %xmm6	# XMM6 = W[t-15]>>1 ^ W[t-15]>>8
+	RORQ	tmp0, 14 # 14
+	add	tmp0, T1
+	vpsrlq	$7, %xmm5, %xmm8	# XMM8 = W[t-15]>>7
+	mov	a_64, tmp0
+	xor	c_64, T2
+	vpsllq	$(64-61), %xmm4, %xmm3  # XMM3 = W[t-2]<<3
+	and	c_64, tmp0
+	and	b_64, T2
+	vpxor	%xmm3, %xmm2, %xmm2	# XMM2 = W[t-2]>>6 ^ W[t-2]<<3
+	xor	tmp0, T2
+	mov	a_64, tmp0
+	vpsllq	$(64-1), %xmm5, %xmm9	# XMM9 = W[t-15]<<63
+	RORQ	tmp0, 5 # 39
+	vpxor	%xmm9, %xmm8, %xmm8	# XMM8 = W[t-15]>>7 ^ W[t-15]<<63
+	xor	a_64, tmp0
+	add	T1, d_64
+	RORQ	tmp0, 6 # 34
+	xor	a_64, tmp0
+	vpxor	%xmm8, %xmm6, %xmm6	# XMM6 = W[t-15]>>1 ^ W[t-15]>>8 ^
+					#  W[t-15]>>7 ^ W[t-15]<<63
+	lea	(T1, T2), h_64
+	RORQ	tmp0, 28 # 28
+	vpsllq	$(64-19), %xmm4, %xmm4  # XMM4 = W[t-2]<<25
+	add	tmp0, h_64
+	RotateState
+	vpxor	%xmm4, %xmm0, %xmm0     # XMM0 = W[t-2]>>61 ^ W[t-2]>>19 ^
+					#        W[t-2]<<25
+	mov	f_64, T1
+	vpxor	%xmm2, %xmm0, %xmm0     # XMM0 = s1(W[t-2])
+	mov	e_64, tmp0
+	xor	g_64, T1
+	idx = \rnd - 16
+	vpaddq	W_t(idx), %xmm0, %xmm0  # XMM0 = s1(W[t-2]) + W[t-16]
+	idx = \rnd - 7
+	vmovdqu	W_t(idx), %xmm1		# XMM1 = W[t-7]
+	RORQ	tmp0, 23 # 41
+	and	e_64, T1
+	xor	e_64, tmp0
+	xor	g_64, T1
+	vpsllq	$(64-8), %xmm5, %xmm5   # XMM5 = W[t-15]<<56
+	idx = \rnd + 1
+	add	WK_2(idx), T1
+	vpxor	%xmm5, %xmm6, %xmm6     # XMM6 = s0(W[t-15])
+	RORQ	tmp0, 4 # 18
+	vpaddq	%xmm6, %xmm0, %xmm0     # XMM0 = s1(W[t-2]) + W[t-16] + s0(W[t-15])
+	xor	e_64, tmp0
+	vpaddq	%xmm1, %xmm0, %xmm0     # XMM0 = W[t] = s1(W[t-2]) + W[t-7] +
+					#               s0(W[t-15]) + W[t-16]
+	mov	a_64, T2
+	add	h_64, T1
+	RORQ	tmp0, 14 # 14
+	add	tmp0, T1
+	idx = \rnd
+	vmovdqa	%xmm0, W_t(idx)		# Store W[t]
+	vpaddq	K_t(idx), %xmm0, %xmm0  # Compute W[t]+K[t]
+	vmovdqa	%xmm0, WK_2(idx)	# Store W[t]+K[t] for next rounds
+	mov	a_64, tmp0
+	xor	c_64, T2
+	and	c_64, tmp0
+	and	b_64, T2
+	xor	tmp0, T2
+	mov	a_64, tmp0
+	RORQ	tmp0, 5 # 39
+	xor	a_64, tmp0
+	add	T1, d_64
+	RORQ	tmp0, 6 # 34
+	xor	a_64, tmp0
+	lea	(T1, T2), h_64
+	RORQ	tmp0, 28 # 28
+	add	tmp0, h_64
+	RotateState
+.endm
+
+########################################################################
+# void sha512_transform_avx(const void* M, void* D, u64 L)
+# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+# The size of the message pointed to by M must be an integer multiple of the
+# SHA512 block size (128 bytes).
+# L is the message length in SHA512 blocks
+########################################################################
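+# A minimal C-side sketch of a call (assuming an asmlinkage prototype
+# matching the signature above):
+#   sha512_transform_avx(block, sctx->state, 1);  /* one 128-byte block */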
+ENTRY(sha512_transform_avx)
+	cmp $0, msglen
+	je nowork
+
+	# Allocate Stack Space
+	mov	%rsp, %rax
+	sub     $frame_size, %rsp
+	and	$~(0x20 - 1), %rsp
+	mov	%rax, frame_RSPSAVE(%rsp)
+
+	# Save GPRs
+	mov     %rbx, frame_GPRSAVE(%rsp)
+	mov     %r12, frame_GPRSAVE +8*1(%rsp)
+	mov     %r13, frame_GPRSAVE +8*2(%rsp)
+	mov     %r14, frame_GPRSAVE +8*3(%rsp)
+	mov     %r15, frame_GPRSAVE +8*4(%rsp)
+
+updateblock:
+
+	# Load state variables
+	mov     DIGEST(0), a_64
+	mov     DIGEST(1), b_64
+	mov     DIGEST(2), c_64
+	mov     DIGEST(3), d_64
+	mov     DIGEST(4), e_64
+	mov     DIGEST(5), f_64
+	mov     DIGEST(6), g_64
+	mov     DIGEST(7), h_64
+
+	t = 0
+	.rept 80/2 + 1
+	# (80 rounds) / (2 rounds/iteration) + (1 iteration)
+	# +1 iteration because the scheduler leads hashing by 1 iteration
+		.if t < 2
+			# BSWAP 2 QWORDS
+			vmovdqa  XMM_QWORD_BSWAP(%rip), %xmm1
+			vmovdqu  MSG(t), %xmm0
+			vpshufb  %xmm1, %xmm0, %xmm0    # BSWAP
+			vmovdqa  %xmm0, W_t(t) # Store Scheduled Pair
+			vpaddq   K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
+			vmovdqa  %xmm0, WK_2(t) # Store into WK for rounds
+		.elseif t < 16
+			# BSWAP 2 QWORDS, Compute 2 Rounds
+			vmovdqu  MSG(t), %xmm0
+			vpshufb  %xmm1, %xmm0, %xmm0    # BSWAP
+			SHA512_Round t-2    # Round t-2
+			vmovdqa  %xmm0, W_t(t) # Store Scheduled Pair
+			vpaddq   K_t(t), %xmm0, %xmm0 # Compute W[t]+K[t]
+			SHA512_Round t-1    # Round t-1
+			vmovdqa  %xmm0, WK_2(t) # Store W[t]+K[t] into WK
+		.elseif t < 79
+			# Schedule 2 QWORDS, Compute 2 Rounds
+			SHA512_2Sched_2Round_avx t
+		.else
+			# Compute 2 Rounds
+			SHA512_Round t-2
+			SHA512_Round t-1
+		.endif
+		t = t+2
+	.endr
+
+	# Update digest
+	add     a_64, DIGEST(0)
+	add     b_64, DIGEST(1)
+	add     c_64, DIGEST(2)
+	add     d_64, DIGEST(3)
+	add     e_64, DIGEST(4)
+	add     f_64, DIGEST(5)
+	add     g_64, DIGEST(6)
+	add     h_64, DIGEST(7)
+
+	# Advance to next message block
+	add     $16*8, msg
+	dec     msglen
+	jnz     updateblock
+
+	# Restore GPRs
+	mov     frame_GPRSAVE(%rsp),      %rbx
+	mov     frame_GPRSAVE +8*1(%rsp), %r12
+	mov     frame_GPRSAVE +8*2(%rsp), %r13
+	mov     frame_GPRSAVE +8*3(%rsp), %r14
+	mov     frame_GPRSAVE +8*4(%rsp), %r15
+
+	# Restore Stack Pointer
+	mov	frame_RSPSAVE(%rsp), %rsp
+
+nowork:
+	ret
+ENDPROC(sha512_transform_avx)
+
+########################################################################
+### Binary Data
+
+.data
+
+.align 16
+
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+	.octa 0x08090a0b0c0d0e0f0001020304050607
+
+# K[t] used in SHA512 hashing
+K512:
+	.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad 0x3956c25bf348b538,0x59f111f1b605d019
+	.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad 0xd807aa98a3030242,0x12835b0145706fbe
+	.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad 0x06ca6351e003826f,0x142929670a0e6e70
+	.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad 0x81c2c92e47edaee6,0x92722c851482353b
+	.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad 0xd192e819d6ef5218,0xd69906245565a910
+	.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad 0x90befffa23631e28,0xa4506cebde82bde9
+	.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad 0xca273eceea26619c,0xd186b8c721c0c207
+	.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad 0x113f9804bef90dae,0x1b710b35131c471b
+	.quad 0x28db77f523047d84,0x32caab7b40c72493
+	.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+#endif

+ 743 - 0
arch/x86/crypto/sha512-avx2-asm.S

@@ -0,0 +1,743 @@
+########################################################################
+# Implement fast SHA-512 with AVX2 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     David Cote <david.m.cote@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+# This code schedules 1 block at a time, with 4 lanes per block
+########################################################################
+
+#ifdef CONFIG_AS_AVX2
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+Y_0 = %ymm4
+Y_1 = %ymm5
+Y_2 = %ymm6
+Y_3 = %ymm7
+
+YTMP0 = %ymm0
+YTMP1 = %ymm1
+YTMP2 = %ymm2
+YTMP3 = %ymm3
+YTMP4 = %ymm8
+XFER  = YTMP0
+
+BYTE_FLIP_MASK  = %ymm9
+
+# 1st arg
+INP         = %rdi
+# 2nd arg
+CTX         = %rsi
+# 3rd arg
+NUM_BLKS    = %rdx
+
+c           = %rcx
+d           = %r8
+e           = %rdx
+y3          = %rdi
+
+TBL   = %rbp
+
+a     = %rax
+b     = %rbx
+
+f     = %r9
+g     = %r10
+h     = %r11
+old_h = %r11
+
+T1    = %r12
+y0    = %r13
+y1    = %r14
+y2    = %r15
+
+y4    = %r12
+
+# Local variables (stack frame)
+XFER_SIZE = 4*8
+SRND_SIZE = 1*8
+INP_SIZE = 1*8
+INPEND_SIZE = 1*8
+RSPSAVE_SIZE = 1*8
+GPRSAVE_SIZE = 6*8
+
+frame_XFER = 0
+frame_SRND = frame_XFER + XFER_SIZE
+frame_INP = frame_SRND + SRND_SIZE
+frame_INPEND = frame_INP + INP_SIZE
+frame_RSPSAVE = frame_INPEND + INPEND_SIZE
+frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
+frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+
+## assume buffers not aligned
+#define	VMOVDQ vmovdqu
+
+# addm [mem], reg
+# Add reg to mem using reg-mem add and store
+.macro addm p1 p2
+	add	\p1, \p2
+	mov	\p2, \p1
+.endm
+
+
+# COPY_YMM_AND_BSWAP ymm, [mem], byte_flip_mask
+# Load ymm with mem and byte swap each qword
+.macro COPY_YMM_AND_BSWAP p1 p2 p3
+	VMOVDQ \p2, \p1
+	vpshufb \p3, \p1, \p1
+.endm
+# rotate_Ys
+# Rotate values of symbols Y0...Y3
+.macro rotate_Ys
+	Y_ = Y_0
+	Y_0 = Y_1
+	Y_1 = Y_2
+	Y_2 = Y_3
+	Y_3 = Y_
+.endm
+
+# RotateState
+.macro RotateState
+	# Rotate symbols a..h right
+	old_h  = h
+	TMP_   = h
+	h      = g
+	g      = f
+	f      = e
+	e      = d
+	d      = c
+	c      = b
+	b      = a
+	a      = TMP_
+.endm
+
+# macro MY_VPALIGNR	YDST, YSRC1, YSRC2, RVAL
+# YDST = {YSRC1, YSRC2} >> RVAL*8
+.macro MY_VPALIGNR YDST YSRC1 YSRC2 RVAL
+	vperm2f128      $0x3, \YSRC2, \YSRC1, \YDST     # YDST = {YS1_LO, YS2_HI}
+	vpalignr        $\RVAL, \YSRC2, \YDST, \YDST    # YDST = {YSRC1, YSRC2} >> RVAL*8
+.endm
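+# e.g. "MY_VPALIGNR YTMP0, Y_3, Y_2, 8" below extracts the W[-7] qwords by
+# shifting the concatenation of the two newest schedule registers right by
+# 8 bytes.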
+
+.macro FOUR_ROUNDS_AND_SCHED
+################################### RND N + 0 #########################################
+
+	# Extract w[t-7]
+	MY_VPALIGNR	YTMP0, Y_3, Y_2, 8		# YTMP0 = W[-7]
+	# Calculate w[t-16] + w[t-7]
+	vpaddq		Y_0, YTMP0, YTMP0		# YTMP0 = W[-7] + W[-16]
+	# Extract w[t-15]
+	MY_VPALIGNR	YTMP1, Y_1, Y_0, 8		# YTMP1 = W[-15]
+
+	# Calculate sigma0
+
+	# Calculate w[t-15] ror 1
+	vpsrlq		$1, YTMP1, YTMP2
+	vpsllq		$(64-1), YTMP1, YTMP3
+	vpor		YTMP2, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1
+	# Calculate w[t-15] shr 7
+	vpsrlq		$7, YTMP1, YTMP4		# YTMP4 = W[-15] >> 7
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	add	frame_XFER(%rsp),h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	xor	g, y2		# y2 = f^g                              # CH
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	add	h, d		# d = k + w + h + d                     # --
+
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+
+	add	y0, y2		# y2 = S1 + CH                          # --
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	RotateState
+
+################################### RND N + 1 #########################################
+
+	# Calculate w[t-15] ror 8
+	vpsrlq		$8, YTMP1, YTMP2
+	vpsllq		$(64-8), YTMP1, YTMP1
+	vpor		YTMP2, YTMP1, YTMP1		# YTMP1 = W[-15] ror 8
+	# XOR the three components
+	vpxor		YTMP4, YTMP3, YTMP3		# YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
+	vpxor		YTMP1, YTMP3, YTMP1		# YTMP1 = s0
+
+
+	# Add three components, w[t-16], w[t-7] and sigma0
+	vpaddq		YTMP1, YTMP0, YTMP0		# YTMP0 = W[-16] + W[-7] + s0
+	# Move to appropriate lanes for calculating w[16] and w[17]
+	vperm2f128	$0x0, YTMP0, YTMP0, Y_0		# Y_0 = W[-16] + W[-7] + s0 {BABA}
+	# Move to appropriate lanes for calculating w[18] and w[19]
+	vpand		MASK_YMM_LO(%rip), YTMP0, YTMP0	# YTMP0 = W[-16] + W[-7] + s0 {DC00}
+
+	# Calculate w[16] and w[17] in both 128 bit lanes
+
+	# Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
+	vperm2f128	$0x11, Y_3, Y_3, YTMP2		# YTMP2 = W[-2] {BABA}
+	vpsrlq		$6, YTMP2, YTMP4		# YTMP4 = W[-2] >> 6 {BABA}
+
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	add	1*8+frame_XFER(%rsp), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	xor	g, y2		# y2 = f^g                              # CH
+
+
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	h, d		# d = k + w + h + d                     # --
+
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	RotateState
+
+
+################################### RND N + 2 #########################################
+
+	vpsrlq		$19, YTMP2, YTMP3		# YTMP3 = W[-2] >> 19 {BABA}
+	vpsllq		$(64-19), YTMP2, YTMP1		# YTMP1 = W[-2] << 19 {BABA}
+	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {BABA}
+	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
+	vpsrlq		$61, YTMP2, YTMP3		# YTMP3 = W[-2] >> 61 {BABA}
+	vpsllq		$(64-61), YTMP2, YTMP1		# YTMP1 = W[-2] << 61 {BABA}
+	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {BABA}
+	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
+							#  (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
+
+	# Add sigma1 to the other components to get w[16] and w[17]
+	vpaddq		YTMP4, Y_0, Y_0			# Y_0 = {W[1], W[0], W[1], W[0]}
+
+	# Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
+	vpsrlq		$6, Y_0, YTMP4			# YTMP4 = W[-2] >> 6 {DC--}
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	add	2*8+frame_XFER(%rsp), h		# h = k + w + h         # --
+
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	or	c, y3		# y3 = a|c                              # MAJA
+	mov	f, y2		# y2 = f                                # CH
+	xor	g, y2		# y2 = f^g                              # CH
+
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	add	h, d		# d = k + w + h + d                     # --
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	RotateState
+
+################################### RND N + 3 #########################################
+
+	vpsrlq		$19, Y_0, YTMP3			# YTMP3 = W[-2] >> 19 {DC--}
+	vpsllq		$(64-19), Y_0, YTMP1		# YTMP1 = W[-2] << 19 {DC--}
+	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 19 {DC--}
+	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
+	vpsrlq		$61, Y_0, YTMP3			# YTMP3 = W[-2] >> 61 {DC--}
+	vpsllq		$(64-61), Y_0, YTMP1		# YTMP1 = W[-2] << 61 {DC--}
+	vpor		YTMP1, YTMP3, YTMP3		# YTMP3 = W[-2] ror 61 {DC--}
+	vpxor		YTMP3, YTMP4, YTMP4		# YTMP4 = s1 = (W[-2] ror 19) ^
+							#  (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
+
+	# Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
+	# to newly calculated sigma1 to get w[18] and w[19]
+	vpaddq		YTMP4, YTMP0, YTMP2		# YTMP2 = {W[3], W[2], --, --}
+
+	# Form w[19], w[18], w[17], w[16]
+	vpblendd		$0xF0, YTMP2, Y_0, Y_0		# Y_0 = {W[3], W[2], W[1], W[0]}
+
+	mov	a, y3		# y3 = a                                # MAJA
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	add	3*8+frame_XFER(%rsp), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	xor	g, y2		# y2 = f^g                              # CH
+
+
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	h, d		# d = k + w + h + d                     # --
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	c, T1		# T1 = a&c                              # MAJB
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+
+	add	y1, h		# h = k + w + h + S0                    # --
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	RotateState
+
+	rotate_Ys
+.endm
+
+.macro DO_4ROUNDS
+
+################################### RND N + 0 #########################################
+
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+	add	frame_XFER(%rsp), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	RotateState
+
+################################### RND N + 1 #########################################
+
+	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+	add	8*1+frame_XFER(%rsp), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	RotateState
+
+################################### RND N + 2 #########################################
+
+	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+	add	8*2+frame_XFER(%rsp), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	RotateState
+
+################################### RND N + 3 #########################################
+
+	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+	mov	f, y2		# y2 = f                                # CH
+	rorx	$41, e, y0	# y0 = e >> 41				# S1A
+	rorx	$18, e, y1	# y1 = e >> 18				# S1B
+	xor	g, y2		# y2 = f^g                              # CH
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18)		# S1
+	rorx	$14, e, y1	# y1 = (e >> 14)			# S1
+	and	e, y2		# y2 = (f^g)&e                          # CH
+	add	y3, old_h	# h = t1 + S0 + MAJ                     # --
+
+	xor	y1, y0		# y0 = (e>>41) ^ (e>>18) ^ (e>>14)	# S1
+	rorx	$34, a, T1	# T1 = a >> 34				# S0B
+	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
+	rorx	$39, a, y1	# y1 = a >> 39				# S0A
+	mov	a, y3		# y3 = a                                # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34)		# S0
+	rorx	$28, a, T1	# T1 = (a >> 28)			# S0
+	add	8*3+frame_XFER(%rsp), h		# h = k + w + h         # --
+	or	c, y3		# y3 = a|c                              # MAJA
+
+	xor	T1, y1		# y1 = (a>>39) ^ (a>>34) ^ (a>>28)	# S0
+	mov	a, T1		# T1 = a                                # MAJB
+	and	b, y3		# y3 = (a|c)&b                          # MAJA
+	and	c, T1		# T1 = a&c                              # MAJB
+	add	y0, y2		# y2 = S1 + CH                          # --
+
+
+	add	h, d		# d = k + w + h + d                     # --
+	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
+	add	y1, h		# h = k + w + h + S0                    # --
+
+	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
+
+	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
+
+	add	y3, h		# h = t1 + S0 + MAJ                     # --
+
+	RotateState
+
+.endm
+
+########################################################################
+# void sha512_transform_rorx(const void* M, void* D, uint64_t L)
+# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+# The size of the message pointed to by M must be an integer multiple of SHA512
+#   message blocks.
+# L is the message length in SHA512 blocks.
+########################################################################
+ENTRY(sha512_transform_rorx)
+	# Allocate Stack Space
+	mov	%rsp, %rax
+	sub	$frame_size, %rsp
+	and	$~(0x20 - 1), %rsp
+	mov	%rax, frame_RSPSAVE(%rsp)
+
+	# Save GPRs
+	mov	%rbp, frame_GPRSAVE(%rsp)
+	mov	%rbx, 8*1+frame_GPRSAVE(%rsp)
+	mov	%r12, 8*2+frame_GPRSAVE(%rsp)
+	mov	%r13, 8*3+frame_GPRSAVE(%rsp)
+	mov	%r14, 8*4+frame_GPRSAVE(%rsp)
+	mov	%r15, 8*5+frame_GPRSAVE(%rsp)
+
+	shl	$7, NUM_BLKS	# convert to bytes
+	jz	done_hash
+	add	INP, NUM_BLKS	# pointer to end of data
+	mov	NUM_BLKS, frame_INPEND(%rsp)
+
+	## load initial digest
+	mov	8*0(CTX),a
+	mov	8*1(CTX),b
+	mov	8*2(CTX),c
+	mov	8*3(CTX),d
+	mov	8*4(CTX),e
+	mov	8*5(CTX),f
+	mov	8*6(CTX),g
+	mov	8*7(CTX),h
+
+	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
+
+loop0:
+	lea	K512(%rip), TBL
+
+	## byte swap first 16 qwords
+	COPY_YMM_AND_BSWAP	Y_0, (INP), BYTE_FLIP_MASK
+	COPY_YMM_AND_BSWAP	Y_1, 1*32(INP), BYTE_FLIP_MASK
+	COPY_YMM_AND_BSWAP	Y_2, 2*32(INP), BYTE_FLIP_MASK
+	COPY_YMM_AND_BSWAP	Y_3, 3*32(INP), BYTE_FLIP_MASK
+
+	mov	INP, frame_INP(%rsp)
+
+	## schedule 64 input qwords, by doing 4 iterations of 16 rounds each
+	movq	$4, frame_SRND(%rsp)
+
+.align 16
+loop1:
+	vpaddq	(TBL), Y_0, XFER
+	vmovdqa XFER, frame_XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddq	1*32(TBL), Y_0, XFER
+	vmovdqa XFER, frame_XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddq	2*32(TBL), Y_0, XFER
+	vmovdqa XFER, frame_XFER(%rsp)
+	FOUR_ROUNDS_AND_SCHED
+
+	vpaddq	3*32(TBL), Y_0, XFER
+	vmovdqa XFER, frame_XFER(%rsp)
+	add	$(4*32), TBL
+	FOUR_ROUNDS_AND_SCHED
+
+	subq	$1, frame_SRND(%rsp)
+	jne	loop1
+
+	movq	$2, frame_SRND(%rsp)
+loop2:
+	vpaddq	(TBL), Y_0, XFER
+	vmovdqa XFER, frame_XFER(%rsp)
+	DO_4ROUNDS
+	vpaddq	1*32(TBL), Y_1, XFER
+	vmovdqa XFER, frame_XFER(%rsp)
+	add	$(2*32), TBL
+	DO_4ROUNDS
+
+	vmovdqa	Y_2, Y_0
+	vmovdqa	Y_3, Y_1
+
+	subq	$1, frame_SRND(%rsp)
+	jne	loop2
+
+	addm	8*0(CTX),a
+	addm	8*1(CTX),b
+	addm	8*2(CTX),c
+	addm	8*3(CTX),d
+	addm	8*4(CTX),e
+	addm	8*5(CTX),f
+	addm	8*6(CTX),g
+	addm	8*7(CTX),h
+
+	mov	frame_INP(%rsp), INP
+	add	$128, INP
+	cmp	frame_INPEND(%rsp), INP
+	jne	loop0
+
+done_hash:
+
+# Restore GPRs
+	mov	frame_GPRSAVE(%rsp)     ,%rbp
+	mov	8*1+frame_GPRSAVE(%rsp) ,%rbx
+	mov	8*2+frame_GPRSAVE(%rsp) ,%r12
+	mov	8*3+frame_GPRSAVE(%rsp) ,%r13
+	mov	8*4+frame_GPRSAVE(%rsp) ,%r14
+	mov	8*5+frame_GPRSAVE(%rsp) ,%r15
+
+	# Restore Stack Pointer
+	mov	frame_RSPSAVE(%rsp), %rsp
+	ret
+ENDPROC(sha512_transform_rorx)
+
+########################################################################
+### Binary Data
+
+.data
+
+.align 64
+# K[t] used in SHA512 hashing
+K512:
+	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad	0x3956c25bf348b538,0x59f111f1b605d019
+	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad	0xd807aa98a3030242,0x12835b0145706fbe
+	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad	0x06ca6351e003826f,0x142929670a0e6e70
+	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad	0x81c2c92e47edaee6,0x92722c851482353b
+	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad	0xd192e819d6ef5218,0xd69906245565a910
+	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad	0x90befffa23631e28,0xa4506cebde82bde9
+	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad	0xca273eceea26619c,0xd186b8c721c0c207
+	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad	0x113f9804bef90dae,0x1b710b35131c471b
+	.quad	0x28db77f523047d84,0x32caab7b40c72493
+	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+.align 32
+
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+PSHUFFLE_BYTE_FLIP_MASK:
+	.octa 0x08090a0b0c0d0e0f0001020304050607
+	.octa 0x18191a1b1c1d1e1f1011121314151617
+
+MASK_YMM_LO:
+	.octa 0x00000000000000000000000000000000
+	.octa 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF
+#endif

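For reference, the per-round arithmetic that FOUR_ROUNDS_AND_SCHED and DO_4ROUNDS interleave with the message schedule above maps onto a few lines of C. The sketch below is illustrative only, not part of the patch; the names mirror the assembler comments (a..h, y0..y3), and "wk" stands for the W[t]+K[t] value staged in frame_XFER:

#include <stdint.h>

static inline uint64_t ror64(uint64_t x, unsigned int n)
{
	return (x >> n) | (x << (64 - n));
}

/* One SHA-512 round; wk = W[t] + K[t], as pre-added into frame_XFER. */
static void sha512_round(uint64_t s[8], uint64_t wk)
{
	uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
	uint64_t e = s[4], f = s[5], g = s[6], h = s[7];

	uint64_t y0 = ror64(e, 14) ^ ror64(e, 18) ^ ror64(e, 41);	/* S1  */
	uint64_t y1 = ror64(a, 28) ^ ror64(a, 34) ^ ror64(a, 39);	/* S0  */
	uint64_t y2 = ((f ^ g) & e) ^ g;				/* CH  */
	uint64_t y3 = ((a | c) & b) | (a & c);				/* MAJ */
	uint64_t t1 = h + wk + y0 + y2;

	/* RotateState: shift a..h right by one and fold in t1. */
	s[7] = g; s[6] = f; s[5] = e;
	s[4] = d + t1;
	s[3] = c; s[2] = b; s[1] = a;
	s[0] = t1 + y1 + y3;
}

The assembler gains its speed by expressing S0/S1 with the flag-preserving rorx instruction (hence the function name) and spreading these dependency chains across four unrolled rounds.
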
+ 421 - 0
arch/x86/crypto/sha512-ssse3-asm.S

@@ -0,0 +1,421 @@
+########################################################################
+# Implement fast SHA-512 with SSSE3 instructions. (x86_64)
+#
+# Copyright (C) 2013 Intel Corporation.
+#
+# Authors:
+#     James Guilford <james.guilford@intel.com>
+#     Kirk Yap <kirk.s.yap@intel.com>
+#     David Cote <david.m.cote@intel.com>
+#     Tim Chen <tim.c.chen@linux.intel.com>
+#
+# This software is available to you under a choice of one of two
+# licenses.  You may choose to be licensed under the terms of the GNU
+# General Public License (GPL) Version 2, available from the file
+# COPYING in the main directory of this source tree, or the
+# OpenIB.org BSD license below:
+#
+#     Redistribution and use in source and binary forms, with or
+#     without modification, are permitted provided that the following
+#     conditions are met:
+#
+#      - Redistributions of source code must retain the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer.
+#
+#      - Redistributions in binary form must reproduce the above
+#        copyright notice, this list of conditions and the following
+#        disclaimer in the documentation and/or other materials
+#        provided with the distribution.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+########################################################################
+#
+# This code is described in an Intel White-Paper:
+# "Fast SHA-512 Implementations on Intel Architecture Processors"
+#
+# To find it, surf to http://www.intel.com/p/en_US/embedded
+# and search for that title.
+#
+########################################################################
+
+#include <linux/linkage.h>
+
+.text
+
+# Virtual Registers
+# ARG1
+msg =		%rdi
+# ARG2
+digest =	%rsi
+# ARG3
+msglen =	%rdx
+T1 =		%rcx
+T2 =		%r8
+a_64 =		%r9
+b_64 =		%r10
+c_64 =		%r11
+d_64 =		%r12
+e_64 =		%r13
+f_64 =		%r14
+g_64 =		%r15
+h_64 =		%rbx
+tmp0 =		%rax
+
+# Local variables (stack frame)
+
+W_SIZE = 80*8
+WK_SIZE = 2*8
+RSPSAVE_SIZE = 1*8
+GPRSAVE_SIZE = 5*8
+
+frame_W = 0
+frame_WK = frame_W + W_SIZE
+frame_RSPSAVE = frame_WK + WK_SIZE
+frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
+frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+
+# Useful QWORD "arrays" for simpler memory references
+# MSG, DIGEST, K_t, W_t are arrays
+# WK_2(t) points to 1 of 2 qwords at frame_WK depending on t being odd/even
+
+# Input message (arg1)
+#define MSG(i)    8*i(msg)
+
+# Output Digest (arg2)
+#define DIGEST(i) 8*i(digest)
+
+# SHA Constants (static mem)
+#define K_t(i)    8*i+K512(%rip)
+
+# Message Schedule (stack frame)
+#define W_t(i)    8*i+frame_W(%rsp)
+
+# W[t]+K[t] (stack frame)
+#define WK_2(i)   8*((i%2))+frame_WK(%rsp)
+
+.macro RotateState
+	# Rotate symbols a..h right
+	TMP   = h_64
+	h_64  = g_64
+	g_64  = f_64
+	f_64  = e_64
+	e_64  = d_64
+	d_64  = c_64
+	c_64  = b_64
+	b_64  = a_64
+	a_64  = TMP
+.endm
+
+.macro SHA512_Round rnd
+
+	# Compute Round %%t
+	mov	f_64, T1          # T1 = f
+	mov	e_64, tmp0        # tmp = e
+	xor	g_64, T1          # T1 = f ^ g
+	ror	$23, tmp0 # 41    # tmp = e ror 23
+	and	e_64, T1          # T1 = (f ^ g) & e
+	xor	e_64, tmp0        # tmp = (e ror 23) ^ e
+	xor	g_64, T1          # T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+	idx = \rnd
+	add	WK_2(idx), T1     # W[t] + K[t] from message scheduler
+	ror	$4, tmp0  # 18    # tmp = ((e ror 23) ^ e) ror 4
+	xor	e_64, tmp0        # tmp = (((e ror 23) ^ e) ror 4) ^ e
+	mov	a_64, T2          # T2 = a
+	add	h_64, T1          # T1 = CH(e,f,g) + W[t] + K[t] + h
+	ror	$14, tmp0 # 14    # tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+	add	tmp0, T1          # T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+	mov	a_64, tmp0        # tmp = a
+	xor	c_64, T2          # T2 = a ^ c
+	and	c_64, tmp0        # tmp = a & c
+	and	b_64, T2          # T2 = (a ^ c) & b
+	xor	tmp0, T2          # T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+	mov	a_64, tmp0        # tmp = a
+	ror	$5, tmp0 # 39     # tmp = a ror 5
+	xor	a_64, tmp0        # tmp = (a ror 5) ^ a
+	add	T1, d_64          # e(next_state) = d + T1
+	ror	$6, tmp0 # 34     # tmp = ((a ror 5) ^ a) ror 6
+	xor	a_64, tmp0        # tmp = (((a ror 5) ^ a) ror 6) ^ a
+	lea	(T1, T2), h_64    # a(next_state) = T1 + Maj(a,b,c)
+	ror	$28, tmp0 # 28    # tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+	add	tmp0, h_64        # a(next_state) = T1 + Maj(a,b,c) + S0(a)
+	RotateState
+.endm
+
+.macro SHA512_2Sched_2Round_sse rnd
+
+	# Compute rounds t-2 and t-1
+	# Compute message schedule QWORDS t and t+1
+
+	#   Two rounds are computed based on the values for K[t-2]+W[t-2] and
+	# K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+	# scheduler.
+	#   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
+	# They are then added to their respective SHA512 constants at
+	# [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
+	#   For brevity, the comments following vectored instructions only refer to
+	# the first of a pair of QWORDS.
+	# E.g. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
+	#   The computation of the message schedule and the rounds are tightly
+	# stitched to take advantage of instruction-level parallelism.
+	# For clarity, integer instructions (for the rounds calculation) are indented
+	# by one tab. Vectored instructions (for the message scheduler) are indented
+	# by two tabs.
+
+	mov	f_64, T1
+	idx = \rnd -2
+	movdqa	W_t(idx), %xmm2		    # XMM2 = W[t-2]
+	xor	g_64, T1
+	and	e_64, T1
+	movdqa	%xmm2, %xmm0	            # XMM0 = W[t-2]
+	xor	g_64, T1
+	idx = \rnd
+	add	WK_2(idx), T1
+	idx = \rnd - 15
+	movdqu	W_t(idx), %xmm5		    # XMM5 = W[t-15]
+	mov	e_64, tmp0
+	ror	$23, tmp0 # 41
+	movdqa	%xmm5, %xmm3	            # XMM3 = W[t-15]
+	xor	e_64, tmp0
+	ror	$4, tmp0 # 18
+	psrlq	$61-19, %xmm0		    # XMM0 = W[t-2] >> 42
+	xor	e_64, tmp0
+	ror	$14, tmp0 # 14
+	psrlq	$(8-7), %xmm3		    # XMM3 = W[t-15] >> 1
+	add	tmp0, T1
+	add	h_64, T1
+	pxor	%xmm2, %xmm0                # XMM0 = (W[t-2] >> 42) ^ W[t-2]
+	mov	a_64, T2
+	xor	c_64, T2
+	pxor	%xmm5, %xmm3                # XMM3 = (W[t-15] >> 1) ^ W[t-15]
+	and	b_64, T2
+	mov	a_64, tmp0
+	psrlq	$(19-6), %xmm0		    # XMM0 = ((W[t-2]>>42)^W[t-2])>>13
+	and	c_64, tmp0
+	xor	tmp0, T2
+	psrlq	$(7-1), %xmm3		    # XMM3 = ((W[t-15]>>1)^W[t-15])>>6
+	mov	a_64, tmp0
+	ror	$5, tmp0 # 39
+	pxor	%xmm2, %xmm0	            # XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
+	xor	a_64, tmp0
+	ror	$6, tmp0 # 34
+	pxor	%xmm5, %xmm3                # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
+	xor	a_64, tmp0
+	ror	$28, tmp0 # 28
+	psrlq	$6, %xmm0                   # XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
+	add	tmp0, T2
+	add	T1, d_64
+	psrlq	$1, %xmm3                   # XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
+	lea	(T1, T2), h_64
+	RotateState
+	movdqa	%xmm2, %xmm1	            # XMM1 = W[t-2]
+	mov	f_64, T1
+	xor	g_64, T1
+	movdqa	%xmm5, %xmm4		    # XMM4 = W[t-15]
+	and	e_64, T1
+	xor	g_64, T1
+	psllq	$(64-19)-(64-61) , %xmm1    # XMM1 = W[t-2] << 42
+	idx = \rnd + 1
+	add	WK_2(idx), T1
+	mov	e_64, tmp0
+	psllq	$(64-1)-(64-8), %xmm4	    # XMM4 = W[t-15] << 7
+	ror	$23, tmp0 # 41
+	xor	e_64, tmp0
+	pxor	%xmm2, %xmm1		    # XMM1 = (W[t-2] << 42)^W[t-2]
+	ror	$4, tmp0 # 18
+	xor	e_64, tmp0
+	pxor	%xmm5, %xmm4		    # XMM4 = (W[t-15]<<7)^W[t-15]
+	ror	$14, tmp0 # 14
+	add	tmp0, T1
+	psllq	$(64-61), %xmm1		    # XMM1 = ((W[t-2] << 42)^W[t-2])<<3
+	add	h_64, T1
+	mov	a_64, T2
+	psllq	$(64-8), %xmm4		    # XMM4 = ((W[t-15]<<7)^W[t-15])<<56
+	xor	c_64, T2
+	and	b_64, T2
+	pxor	%xmm1, %xmm0		    # XMM0 = s1(W[t-2])
+	mov	a_64, tmp0
+	and	c_64, tmp0
+	idx = \rnd - 7
+	movdqu	W_t(idx), %xmm1		    # XMM1 = W[t-7]
+	xor	tmp0, T2
+	pxor	%xmm4, %xmm3                # XMM3 = s0(W[t-15])
+	mov	a_64, tmp0
+	paddq	%xmm3, %xmm0		    # XMM0 = s1(W[t-2]) + s0(W[t-15])
+	ror	$5, tmp0 # 39
+	idx = \rnd - 16
+	paddq	W_t(idx), %xmm0		    # XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
+	xor	a_64, tmp0
+	paddq	%xmm1, %xmm0	            # XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
+	ror	$6, tmp0 # 34
+	movdqa	%xmm0, W_t(\rnd)	    # Store scheduled qwords
+	xor	a_64, tmp0
+	paddq	K_t(\rnd), %xmm0	    # Compute W[t]+K[t]
+	ror	$28, tmp0 # 28
+	idx = \rnd
+	movdqa	%xmm0, WK_2(idx)	    # Store W[t]+K[t] for next rounds
+	add	tmp0, T2
+	add	T1, d_64
+	lea	(T1, T2), h_64
+	RotateState
+.endm
+
+########################################################################
+# void sha512_transform_ssse3(const void* M, void* D, u64 L)
+# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+# The size of the message pointed to by M must be an integer multiple of SHA512
+#   message blocks.
+# L is the message length in SHA512 blocks.
+########################################################################
+ENTRY(sha512_transform_ssse3)
+
+	cmp $0, msglen
+	je nowork
+
+	# Allocate Stack Space
+	mov	%rsp, %rax
+	sub	$frame_size, %rsp
+	and	$~(0x20 - 1), %rsp
+	mov	%rax, frame_RSPSAVE(%rsp)
+
+	# Save GPRs
+	mov	%rbx, frame_GPRSAVE(%rsp)
+	mov	%r12, frame_GPRSAVE +8*1(%rsp)
+	mov	%r13, frame_GPRSAVE +8*2(%rsp)
+	mov	%r14, frame_GPRSAVE +8*3(%rsp)
+	mov	%r15, frame_GPRSAVE +8*4(%rsp)
+
+updateblock:
+
+# Load state variables
+	mov	DIGEST(0), a_64
+	mov	DIGEST(1), b_64
+	mov	DIGEST(2), c_64
+	mov	DIGEST(3), d_64
+	mov	DIGEST(4), e_64
+	mov	DIGEST(5), f_64
+	mov	DIGEST(6), g_64
+	mov	DIGEST(7), h_64
+
+	t = 0
+	.rept 80/2 + 1
+	# (80 rounds) / (2 rounds/iteration) + (1 iteration)
+	# +1 iteration because the scheduler leads hashing by 1 iteration
+		.if t < 2
+			# BSWAP 2 QWORDS
+			movdqa	XMM_QWORD_BSWAP(%rip), %xmm1
+			movdqu	MSG(t), %xmm0
+			pshufb	%xmm1, %xmm0	# BSWAP
+			movdqa	%xmm0, W_t(t)	# Store Scheduled Pair
+			paddq	K_t(t), %xmm0	# Compute W[t]+K[t]
+			movdqa	%xmm0, WK_2(t)	# Store into WK for rounds
+		.elseif t < 16
+			# BSWAP 2 QWORDS; Compute 2 Rounds
+			movdqu	MSG(t), %xmm0
+			pshufb	%xmm1, %xmm0	# BSWAP
+			SHA512_Round t-2	# Round t-2
+			movdqa	%xmm0, W_t(t)	# Store Scheduled Pair
+			paddq	K_t(t), %xmm0	# Compute W[t]+K[t]
+			SHA512_Round t-1	# Round t-1
+			movdqa	%xmm0, WK_2(t)	# Store W[t]+K[t] into WK
+		.elseif t < 79
+			# Schedule 2 QWORDS; Compute 2 Rounds
+			SHA512_2Sched_2Round_sse t
+		.else
+			# Compute 2 Rounds
+			SHA512_Round t-2
+			SHA512_Round t-1
+		.endif
+		t = t+2
+	.endr
+
+	# Update digest
+	add	a_64, DIGEST(0)
+	add	b_64, DIGEST(1)
+	add	c_64, DIGEST(2)
+	add	d_64, DIGEST(3)
+	add	e_64, DIGEST(4)
+	add	f_64, DIGEST(5)
+	add	g_64, DIGEST(6)
+	add	h_64, DIGEST(7)
+
+	# Advance to next message block
+	add	$16*8, msg
+	dec	msglen
+	jnz	updateblock
+
+	# Restore GPRs
+	mov	frame_GPRSAVE(%rsp),      %rbx
+	mov	frame_GPRSAVE +8*1(%rsp), %r12
+	mov	frame_GPRSAVE +8*2(%rsp), %r13
+	mov	frame_GPRSAVE +8*3(%rsp), %r14
+	mov	frame_GPRSAVE +8*4(%rsp), %r15
+
+	# Restore Stack Pointer
+	mov	frame_RSPSAVE(%rsp), %rsp
+
+nowork:
+	ret
+ENDPROC(sha512_transform_ssse3)
+
+########################################################################
+### Binary Data
+
+.data
+
+.align 16
+
+# Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+	.octa 0x08090a0b0c0d0e0f0001020304050607
+
+# K[t] used in SHA512 hashing
+K512:
+	.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad 0x3956c25bf348b538,0x59f111f1b605d019
+	.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad 0xd807aa98a3030242,0x12835b0145706fbe
+	.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad 0x06ca6351e003826f,0x142929670a0e6e70
+	.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad 0x81c2c92e47edaee6,0x92722c851482353b
+	.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad 0xd192e819d6ef5218,0xd69906245565a910
+	.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad 0x90befffa23631e28,0xa4506cebde82bde9
+	.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad 0xca273eceea26619c,0xd186b8c721c0c207
+	.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad 0x113f9804bef90dae,0x1b710b35131c471b
+	.quad 0x28db77f523047d84,0x32caab7b40c72493
+	.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817

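The SSSE3 path above computes the same schedule recurrence two entries at a time in XMM registers; the psrlq/psllq/pxor ladders implement the sigma functions whose rotation counts (1, 8, 7 and 19, 61, 6) appear in the instruction comments. A scalar sketch of the recurrence, illustrative only:

#include <stdint.h>

static inline uint64_t ror64(uint64_t x, unsigned int n)
{
	return (x >> n) | (x << (64 - n));
}

/* sigma0 = ror1 ^ ror8 ^ shr7, sigma1 = ror19 ^ ror61 ^ shr6,
 * matching the shift constants used in SHA512_2Sched_2Round_sse. */
static inline uint64_t sigma0(uint64_t x)
{
	return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7);
}

static inline uint64_t sigma1(uint64_t x)
{
	return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6);
}

/* Extend the first 16 qwords of a block to the full 80-entry schedule. */
static void sha512_schedule(uint64_t W[80])
{
	int t;

	for (t = 16; t < 80; t++)
		W[t] = sigma1(W[t - 2]) + W[t - 7] +
		       sigma0(W[t - 15]) + W[t - 16];
}
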
+ 282 - 0
arch/x86/crypto/sha512_ssse3_glue.c

@@ -0,0 +1,282 @@
+/*
+ * Cryptographic API.
+ *
+ * Glue code for the SHA512 Secure Hash Algorithm assembler
+ * implementation using supplemental SSE3 / AVX / AVX2 instructions.
+ *
+ * This file is based on sha512_generic.c
+ *
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+
+#include <linux/string.h>
+
+asmlinkage void sha512_transform_ssse3(const char *data, u64 *digest,
+				     u64 rounds);
+#ifdef CONFIG_AS_AVX
+asmlinkage void sha512_transform_avx(const char *data, u64 *digest,
+				     u64 rounds);
+#endif
+#ifdef CONFIG_AS_AVX2
+asmlinkage void sha512_transform_rorx(const char *data, u64 *digest,
+				     u64 rounds);
+#endif
+
+static asmlinkage void (*sha512_transform_asm)(const char *, u64 *, u64);
+
+
+static int sha512_ssse3_init(struct shash_desc *desc)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	sctx->state[0] = SHA512_H0;
+	sctx->state[1] = SHA512_H1;
+	sctx->state[2] = SHA512_H2;
+	sctx->state[3] = SHA512_H3;
+	sctx->state[4] = SHA512_H4;
+	sctx->state[5] = SHA512_H5;
+	sctx->state[6] = SHA512_H6;
+	sctx->state[7] = SHA512_H7;
+	sctx->count[0] = sctx->count[1] = 0;
+
+	return 0;
+}
+
+static int __sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int len, unsigned int partial)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	unsigned int done = 0;
+
+	sctx->count[0] += len;
+	if (sctx->count[0] < len)
+		sctx->count[1]++;
+
+	if (partial) {
+		done = SHA512_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha512_transform_asm(sctx->buf, sctx->state, 1);
+	}
+
+	if (len - done >= SHA512_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE;
+
+		sha512_transform_asm(data + done, sctx->state, (u64) rounds);
+
+		done += rounds * SHA512_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buf, data + done, len - done);
+
+	return 0;
+}
+
+static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
+			     unsigned int len)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;
+	int res;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA512_BLOCK_SIZE) {
+		sctx->count[0] += len;
+		if (sctx->count[0] < len)
+			sctx->count[1]++;
+		memcpy(sctx->buf + partial, data, len);
+
+		return 0;
+	}
+
+	if (!irq_fpu_usable()) {
+		res = crypto_sha512_update(desc, data, len);
+	} else {
+		kernel_fpu_begin();
+		res = __sha512_ssse3_update(desc, data, len, partial);
+		kernel_fpu_end();
+	}
+
+	return res;
+}
+
+
+/* Add padding and return the message digest. */
+static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be64 *dst = (__be64 *)out;
+	__be64 bits[2];
+	static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
+
+	/* save number of bits */
+	bits[1] = cpu_to_be64(sctx->count[0] << 3);
+	bits[0] = cpu_to_be64((sctx->count[1] << 3) | (sctx->count[0] >> 61));
+
+	/* Pad out to 112 mod 128 and append length */
+	index = sctx->count[0] & 0x7f;
+	padlen = (index < 112) ? (112 - index) : ((128+112) - index);
+
+	if (!irq_fpu_usable()) {
+		crypto_sha512_update(desc, padding, padlen);
+		crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits));
+	} else {
+		kernel_fpu_begin();
+		/* We need to fill a whole block for __sha512_ssse3_update() */
+		if (padlen <= 112) {
+			sctx->count[0] += padlen;
+			if (sctx->count[0] < padlen)
+				sctx->count[1]++;
+			memcpy(sctx->buf + index, padding, padlen);
+		} else {
+			__sha512_ssse3_update(desc, padding, padlen, index);
+		}
+		__sha512_ssse3_update(desc, (const u8 *)&bits,
+					sizeof(bits), 112);
+		kernel_fpu_end();
+	}
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be64(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha512_ssse3_export(struct shash_desc *desc, void *out)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha512_ssse3_import(struct shash_desc *desc, const void *in)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static struct shash_alg alg = {
+	.digestsize	=	SHA512_DIGEST_SIZE,
+	.init		=	sha512_ssse3_init,
+	.update		=	sha512_ssse3_update,
+	.final		=	sha512_ssse3_final,
+	.export		=	sha512_ssse3_export,
+	.import		=	sha512_ssse3_import,
+	.descsize	=	sizeof(struct sha512_state),
+	.statesize	=	sizeof(struct sha512_state),
+	.base		=	{
+		.cra_name	=	"sha512",
+		.cra_driver_name =	"sha512-ssse3",
+		.cra_priority	=	150,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA512_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+#ifdef CONFIG_AS_AVX
+static bool __init avx_usable(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx || !cpu_has_osxsave)
+		return false;
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX detected but unusable.\n");
+
+		return false;
+	}
+
+	return true;
+}
+#endif
+
+static int __init sha512_ssse3_mod_init(void)
+{
+	/* test for SSSE3 first */
+	if (cpu_has_ssse3)
+		sha512_transform_asm = sha512_transform_ssse3;
+
+#ifdef CONFIG_AS_AVX
+	/* allow AVX to override SSSE3, it's a little faster */
+	if (avx_usable()) {
+#ifdef CONFIG_AS_AVX2
+		if (boot_cpu_has(X86_FEATURE_AVX2))
+			sha512_transform_asm = sha512_transform_rorx;
+		else
+#endif
+			sha512_transform_asm = sha512_transform_avx;
+	}
+#endif
+
+	if (sha512_transform_asm) {
+#ifdef CONFIG_AS_AVX
+		if (sha512_transform_asm == sha512_transform_avx)
+			pr_info("Using AVX optimized SHA-512 implementation\n");
+#ifdef CONFIG_AS_AVX2
+		else if (sha512_transform_asm == sha512_transform_rorx)
+			pr_info("Using AVX2 optimized SHA-512 implementation\n");
+#endif
+		else
+#endif
+			pr_info("Using SSSE3 optimized SHA-512 implementation\n");
+		return crypto_register_shash(&alg);
+	}
+	pr_info("Neither AVX nor SSSE3 is available/usable.\n");
+
+	return -ENODEV;
+}
+
+static void __exit sha512_ssse3_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(sha512_ssse3_mod_init);
+module_exit(sha512_ssse3_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated");
+
+MODULE_ALIAS("sha512");

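Not part of the patch, but for orientation: a caller reaches whichever transform sha512_ssse3_mod_init() selected simply by allocating "sha512" through the crypto API, since cra_priority 150 ranks this driver above sha512-generic. A minimal kernel-side sketch (sha512_digest_example and its error handling are made up for illustration):

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/types.h>

static int sha512_digest_example(const u8 *data, unsigned int len, u8 *out)
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	int ret;

	/* Picks the highest-priority registered "sha512" implementation. */
	tfm = crypto_alloc_shash("sha512", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	if (!desc) {
		crypto_free_shash(tfm);
		return -ENOMEM;
	}
	desc->tfm = tfm;
	desc->flags = 0;

	ret = crypto_shash_digest(desc, data, len, out);

	kfree(desc);
	crypto_free_shash(tfm);
	return ret;
}

On a CPU with neither SSSE3 nor usable AVX the module refuses to load (-ENODEV), so the generic implementation keeps serving the "sha512" name.
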
+ 47 - 1
arch/x86/crypto/twofish-avx-x86_64-asm_64.S

@@ -4,7 +4,7 @@
  * Copyright (C) 2012 Johannes Goetzfried
  *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
  *
- * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -33,6 +33,8 @@
 
 .Lbswap128_mask:
 	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lxts_gf128mul_and_shl1_mask:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
 
 .text
 
@@ -408,3 +410,47 @@ ENTRY(twofish_ctr_8way)
 
 	ret;
 ENDPROC(twofish_ctr_8way)
+
+ENTRY(twofish_xts_enc_8way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	movq %rsi, %r11;
+
+	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
+	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
+		      RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);
+
+	call __twofish_enc_blk8;
+
+	/* dst <= regs xor IVs(in dst) */
+	store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+	ret;
+ENDPROC(twofish_xts_enc_8way)
+
+ENTRY(twofish_xts_dec_8way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+
+	movq %rsi, %r11;
+
+	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
+	load_xts_8way(%rcx, %rdx, %rsi, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2,
+		      RX0, RX1, RY0, .Lxts_gf128mul_and_shl1_mask);
+
+	call __twofish_dec_blk8;
+
+	/* dst <= regs xor IVs(in dst) */
+	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+	ret;
+ENDPROC(twofish_xts_dec_8way)

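The .Lxts_gf128mul_and_shl1_mask constant added above parameterises the tweak computation inside load_xts_8way() (from glue_helper-asm-avx.S): between blocks the XTS tweak is multiplied by α in GF(2¹²⁸), a one-bit left shift with conditional reduction by 0x87 (the first byte of the mask). A scalar sketch of that step, with xts_mul_alpha a name made up for illustration:

#include <stdint.h>

/* Multiply the 128-bit XTS tweak by alpha: shift left one bit and, if a
 * bit fell off the top, reduce by x^128 + x^7 + x^2 + x + 1, i.e. XOR
 * the low byte with 0x87.  t[0] holds the low qword (little endian). */
static void xts_mul_alpha(uint64_t t[2])
{
	uint64_t carry = t[1] >> 63;		/* bit that falls off the top */

	t[1] = (t[1] << 1) | (t[0] >> 63);
	t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
}

The SIMD version performs the equivalent shift-and-reduce on whole registers, using this mask to supply both the 0x87 reduction term and the cross-qword carry.
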
+ 600 - 0
arch/x86/crypto/twofish-avx2-asm_64.S

@@ -0,0 +1,600 @@
+/*
+ * x86_64/AVX2 assembler optimized version of Twofish
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/linkage.h>
+#include "glue_helper-asm-avx2.S"
+
+.file "twofish-avx2-asm_64.S"
+
+.data
+.align 16
+
+.Lvpshufb_mask0:
+.long 0x80808000
+.long 0x80808004
+.long 0x80808008
+.long 0x8080800c
+
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.Lxts_gf128mul_and_shl1_mask_0:
+	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
+.Lxts_gf128mul_and_shl1_mask_1:
+	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
+
+.text
+
+/* structure of crypto context */
+#define s0	0
+#define s1	1024
+#define s2	2048
+#define s3	3072
+#define w	4096
+#define	k	4128
+
+/* register macros */
+#define CTX	%rdi
+
+#define RS0	CTX
+#define RS1	%r8
+#define RS2	%r9
+#define RS3	%r10
+#define RK	%r11
+#define RW	%rax
+#define RROUND  %r12
+#define RROUNDd %r12d
+
+#define RA0	%ymm8
+#define RB0	%ymm9
+#define RC0	%ymm10
+#define RD0	%ymm11
+#define RA1	%ymm12
+#define RB1	%ymm13
+#define RC1	%ymm14
+#define RD1	%ymm15
+
+/* temp regs */
+#define RX0	%ymm0
+#define RY0	%ymm1
+#define RX1	%ymm2
+#define RY1	%ymm3
+#define RT0	%ymm4
+#define RIDX	%ymm5
+
+#define RX0x	%xmm0
+#define RY0x	%xmm1
+#define RX1x	%xmm2
+#define RY1x	%xmm3
+#define RT0x	%xmm4
+
+/* vpgatherdd mask and '-1' */
+#define RNOT	%ymm6
+
+/* byte mask, (-1 >> 24) */
+#define RBYTE	%ymm7
+
+/**********************************************************************
+  16-way AVX2 twofish
+ **********************************************************************/
+#define init_round_constants() \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	vpsrld $24, RNOT, RBYTE; \
+	leaq k(CTX), RK; \
+	leaq w(CTX), RW; \
+	leaq s1(CTX), RS1; \
+	leaq s2(CTX), RS2; \
+	leaq s3(CTX), RS3; \
+
+#define g16(ab, rs0, rs1, rs2, rs3, xy) \
+	vpand RBYTE, ab ## 0, RIDX; \
+	vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+		\
+		vpand RBYTE, ab ## 1, RIDX; \
+		vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \
+		vpcmpeqd RNOT, RNOT, RNOT; \
+	\
+	vpsrld $8, ab ## 0, RIDX; \
+	vpand RBYTE, RIDX, RIDX; \
+	vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	vpxor RT0, xy ## 0, xy ## 0; \
+		\
+		vpsrld $8, ab ## 1, RIDX; \
+		vpand RBYTE, RIDX, RIDX; \
+		vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
+		vpcmpeqd RNOT, RNOT, RNOT; \
+		vpxor RT0, xy ## 1, xy ## 1; \
+	\
+	vpsrld $16, ab ## 0, RIDX; \
+	vpand RBYTE, RIDX, RIDX; \
+	vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	vpxor RT0, xy ## 0, xy ## 0; \
+		\
+		vpsrld $16, ab ## 1, RIDX; \
+		vpand RBYTE, RIDX, RIDX; \
+		vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
+		vpcmpeqd RNOT, RNOT, RNOT; \
+		vpxor RT0, xy ## 1, xy ## 1; \
+	\
+	vpsrld $24, ab ## 0, RIDX; \
+	vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
+	vpcmpeqd RNOT, RNOT, RNOT; \
+	vpxor RT0, xy ## 0, xy ## 0; \
+		\
+		vpsrld $24, ab ## 1, RIDX; \
+		vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
+		vpcmpeqd RNOT, RNOT, RNOT; \
+		vpxor RT0, xy ## 1, xy ## 1;
+
+#define g1_16(a, x) \
+	g16(a, RS0, RS1, RS2, RS3, x);
+
+#define g2_16(b, y) \
+	g16(b, RS1, RS2, RS3, RS0, y);
+
+#define encrypt_round_end16(a, b, c, d, nk) \
+	vpaddd RY0, RX0, RX0; \
+	vpaddd RX0, RY0, RY0; \
+	vpbroadcastd nk(RK,RROUND,8), RT0; \
+	vpaddd RT0, RX0, RX0; \
+	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
+	vpaddd RT0, RY0, RY0; \
+	\
+	vpxor RY0, d ## 0, d ## 0; \
+	\
+	vpxor RX0, c ## 0, c ## 0; \
+	vpsrld $1, c ## 0, RT0; \
+	vpslld $31, c ## 0, c ## 0; \
+	vpor RT0, c ## 0, c ## 0; \
+	\
+		vpaddd RY1, RX1, RX1; \
+		vpaddd RX1, RY1, RY1; \
+		vpbroadcastd nk(RK,RROUND,8), RT0; \
+		vpaddd RT0, RX1, RX1; \
+		vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
+		vpaddd RT0, RY1, RY1; \
+		\
+		vpxor RY1, d ## 1, d ## 1; \
+		\
+		vpxor RX1, c ## 1, c ## 1; \
+		vpsrld $1, c ## 1, RT0; \
+		vpslld $31, c ## 1, c ## 1; \
+		vpor RT0, c ## 1, c ## 1; \
+
+#define encrypt_round16(a, b, c, d, nk) \
+	g2_16(b, RY); \
+	\
+	vpslld $1, b ## 0, RT0; \
+	vpsrld $31, b ## 0, b ## 0; \
+	vpor RT0, b ## 0, b ## 0; \
+	\
+		vpslld $1, b ## 1, RT0; \
+		vpsrld $31, b ## 1, b ## 1; \
+		vpor RT0, b ## 1, b ## 1; \
+	\
+	g1_16(a, RX); \
+	\
+	encrypt_round_end16(a, b, c, d, nk);
+
+#define encrypt_round_first16(a, b, c, d, nk) \
+	vpslld $1, d ## 0, RT0; \
+	vpsrld $31, d ## 0, d ## 0; \
+	vpor RT0, d ## 0, d ## 0; \
+	\
+		vpslld $1, d ## 1, RT0; \
+		vpsrld $31, d ## 1, d ## 1; \
+		vpor RT0, d ## 1, d ## 1; \
+	\
+	encrypt_round16(a, b, c, d, nk);
+
+#define encrypt_round_last16(a, b, c, d, nk) \
+	g2_16(b, RY); \
+	\
+	g1_16(a, RX); \
+	\
+	encrypt_round_end16(a, b, c, d, nk);
+
+#define decrypt_round_end16(a, b, c, d, nk) \
+	vpaddd RY0, RX0, RX0; \
+	vpaddd RX0, RY0, RY0; \
+	vpbroadcastd nk(RK,RROUND,8), RT0; \
+	vpaddd RT0, RX0, RX0; \
+	vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
+	vpaddd RT0, RY0, RY0; \
+	\
+	vpxor RX0, c ## 0, c ## 0; \
+	\
+	vpxor RY0, d ## 0, d ## 0; \
+	vpsrld $1, d ## 0, RT0; \
+	vpslld $31, d ## 0, d ## 0; \
+	vpor RT0, d ## 0, d ## 0; \
+	\
+		vpaddd RY1, RX1, RX1; \
+		vpaddd RX1, RY1, RY1; \
+		vpbroadcastd nk(RK,RROUND,8), RT0; \
+		vpaddd RT0, RX1, RX1; \
+		vpbroadcastd 4+nk(RK,RROUND,8), RT0; \
+		vpaddd RT0, RY1, RY1; \
+		\
+		vpxor RX1, c ## 1, c ## 1; \
+		\
+		vpxor RY1, d ## 1, d ## 1; \
+		vpsrld $1, d ## 1, RT0; \
+		vpslld $31, d ## 1, d ## 1; \
+		vpor RT0, d ## 1, d ## 1;
+
+#define decrypt_round16(a, b, c, d, nk) \
+	g1_16(a, RX); \
+	\
+	vpslld $1, a ## 0, RT0; \
+	vpsrld $31, a ## 0, a ## 0; \
+	vpor RT0, a ## 0, a ## 0; \
+	\
+		vpslld $1, a ## 1, RT0; \
+		vpsrld $31, a ## 1, a ## 1; \
+		vpor RT0, a ## 1, a ## 1; \
+	\
+	g2_16(b, RY); \
+	\
+	decrypt_round_end16(a, b, c, d, nk);
+
+#define decrypt_round_first16(a, b, c, d, nk) \
+	vpslld $1, c ## 0, RT0; \
+	vpsrld $31, c ## 0, c ## 0; \
+	vpor RT0, c ## 0, c ## 0; \
+	\
+		vpslld $1, c ## 1, RT0; \
+		vpsrld $31, c ## 1, c ## 1; \
+		vpor RT0, c ## 1, c ## 1; \
+	\
+	decrypt_round16(a, b, c, d, nk)
+
+#define decrypt_round_last16(a, b, c, d, nk) \
+	g1_16(a, RX); \
+	\
+	g2_16(b, RY); \
+	\
+	decrypt_round_end16(a, b, c, d, nk);
+
+#define encrypt_cycle16() \
+	encrypt_round16(RA, RB, RC, RD, 0); \
+	encrypt_round16(RC, RD, RA, RB, 8);
+
+#define encrypt_cycle_first16() \
+	encrypt_round_first16(RA, RB, RC, RD, 0); \
+	encrypt_round16(RC, RD, RA, RB, 8);
+
+#define encrypt_cycle_last16() \
+	encrypt_round16(RA, RB, RC, RD, 0); \
+	encrypt_round_last16(RC, RD, RA, RB, 8);
+
+#define decrypt_cycle16(n) \
+	decrypt_round16(RC, RD, RA, RB, 8); \
+	decrypt_round16(RA, RB, RC, RD, 0);
+
+#define decrypt_cycle_first16(n) \
+	decrypt_round_first16(RC, RD, RA, RB, 8); \
+	decrypt_round16(RA, RB, RC, RD, 0);
+
+#define decrypt_cycle_last16(n) \
+	decrypt_round16(RC, RD, RA, RB, 8); \
+	decrypt_round_last16(RA, RB, RC, RD, 0);
+
+#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
+	vpunpckhdq x1, x0, t2; \
+	vpunpckldq x1, x0, x0; \
+	\
+	vpunpckldq x3, x2, t1; \
+	vpunpckhdq x3, x2, x2; \
+	\
+	vpunpckhqdq t1,	x0, x1; \
+	vpunpcklqdq t1,	x0, x0; \
+	\
+	vpunpckhqdq x2, t2, x3; \
+	vpunpcklqdq x2,	t2, x2;
+
+#define read_blocks8(offs,a,b,c,d) \
+	transpose_4x4(a, b, c, d, RX0, RY0);
+
+#define write_blocks8(offs,a,b,c,d) \
+	transpose_4x4(a, b, c, d, RX0, RY0);
+
+#define inpack_enc8(a,b,c,d) \
+	vpbroadcastd 4*0(RW), RT0; \
+	vpxor RT0, a, a; \
+	\
+	vpbroadcastd 4*1(RW), RT0; \
+	vpxor RT0, b, b; \
+	\
+	vpbroadcastd 4*2(RW), RT0; \
+	vpxor RT0, c, c; \
+	\
+	vpbroadcastd 4*3(RW), RT0; \
+	vpxor RT0, d, d;
+
+#define outunpack_enc8(a,b,c,d) \
+	vpbroadcastd 4*4(RW), RX0; \
+	vpbroadcastd 4*5(RW), RY0; \
+	vpxor RX0, c, RX0; \
+	vpxor RY0, d, RY0; \
+	\
+	vpbroadcastd 4*6(RW), RT0; \
+	vpxor RT0, a, c; \
+	vpbroadcastd 4*7(RW), RT0; \
+	vpxor RT0, b, d; \
+	\
+	vmovdqa RX0, a; \
+	vmovdqa RY0, b;
+
+#define inpack_dec8(a,b,c,d) \
+	vpbroadcastd 4*4(RW), RX0; \
+	vpbroadcastd 4*5(RW), RY0; \
+	vpxor RX0, a, RX0; \
+	vpxor RY0, b, RY0; \
+	\
+	vpbroadcastd 4*6(RW), RT0; \
+	vpxor RT0, c, a; \
+	vpbroadcastd 4*7(RW), RT0; \
+	vpxor RT0, d, b; \
+	\
+	vmovdqa RX0, c; \
+	vmovdqa RY0, d;
+
+#define outunpack_dec8(a,b,c,d) \
+	vpbroadcastd 4*0(RW), RT0; \
+	vpxor RT0, a, a; \
+	\
+	vpbroadcastd 4*1(RW), RT0; \
+	vpxor RT0, b, b; \
+	\
+	vpbroadcastd 4*2(RW), RT0; \
+	vpxor RT0, c, c; \
+	\
+	vpbroadcastd 4*3(RW), RT0; \
+	vpxor RT0, d, d;
+
+#define read_blocks16(a,b,c,d) \
+	read_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
+	read_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define write_blocks16(a,b,c,d) \
+	write_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
+	write_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define xor_blocks16(a,b,c,d) \
+	xor_blocks8(0, a ## 0, b ## 0, c ## 0, d ## 0); \
+	xor_blocks8(8, a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define inpack_enc16(a,b,c,d) \
+	inpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
+	inpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define outunpack_enc16(a,b,c,d) \
+	outunpack_enc8(a ## 0, b ## 0, c ## 0, d ## 0); \
+	outunpack_enc8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define inpack_dec16(a,b,c,d) \
+	inpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
+	inpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+#define outunpack_dec16(a,b,c,d) \
+	outunpack_dec8(a ## 0, b ## 0, c ## 0, d ## 0); \
+	outunpack_dec8(a ## 1, b ## 1, c ## 1, d ## 1);
+
+.align 8
+__twofish_enc_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
+	 * output:
+	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
+	 */
+	init_round_constants();
+
+	read_blocks16(RA, RB, RC, RD);
+	inpack_enc16(RA, RB, RC, RD);
+
+	xorl RROUNDd, RROUNDd;
+	encrypt_cycle_first16();
+	movl $2, RROUNDd;
+
+.align 4
+.L__enc_loop:
+	encrypt_cycle16();
+
+	addl $2, RROUNDd;
+	cmpl $14, RROUNDd;
+	jne .L__enc_loop;
+
+	encrypt_cycle_last16();
+
+	outunpack_enc16(RA, RB, RC, RD);
+	write_blocks16(RA, RB, RC, RD);
+
+	ret;
+ENDPROC(__twofish_enc_blk16)
+
+.align 8
+__twofish_dec_blk16:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: ciphertext
+	 * output:
+	 *	RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: plaintext
+	 */
+	init_round_constants();
+
+	read_blocks16(RA, RB, RC, RD);
+	inpack_dec16(RA, RB, RC, RD);
+
+	movl $14, RROUNDd;
+	decrypt_cycle_first16();
+	movl $12, RROUNDd;
+
+.align 4
+.L__dec_loop:
+	decrypt_cycle16();
+
+	addl $-2, RROUNDd;
+	jnz .L__dec_loop;
+
+	decrypt_cycle_last16();
+
+	outunpack_dec16(RA, RB, RC, RD);
+	write_blocks16(RA, RB, RC, RD);
+
+	ret;
+ENDPROC(__twofish_dec_blk16)
+
+ENTRY(twofish_ecb_enc_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+	pushq %r12;
+
+	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	call __twofish_enc_blk16;
+
+	store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	popq %r12;
+	vzeroupper;
+
+	ret;
+ENDPROC(twofish_ecb_enc_16way)
+
+ENTRY(twofish_ecb_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+	pushq %r12;
+
+	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	call __twofish_dec_blk16;
+
+	store_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	popq %r12;
+	vzeroupper;
+
+	ret;
+ENDPROC(twofish_ecb_dec_16way)
+
+ENTRY(twofish_cbc_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+
+	vzeroupper;
+	pushq %r12;
+
+	load_16way(%rdx, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	call __twofish_dec_blk16;
+
+	store_cbc_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1,
+			RX0);
+
+	popq %r12;
+	vzeroupper;
+
+	ret;
+ENDPROC(twofish_cbc_dec_16way)
+
+ENTRY(twofish_ctr_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (little endian, 128bit)
+	 */
+
+	vzeroupper;
+	pushq %r12;
+
+	load_ctr_16way(%rcx, .Lbswap128_mask, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
+		       RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
+		       RBYTE);
+
+	call __twofish_enc_blk16;
+
+	store_ctr_16way(%rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	popq %r12;
+	vzeroupper;
+
+	ret;
+ENDPROC(twofish_ctr_16way)
+
+.align 8
+twofish_xts_crypt_16way:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 *	%r8: pointer to __twofish_enc_blk16 or __twofish_dec_blk16
+	 */
+
+	vzeroupper;
+	pushq %r12;
+
+	load_xts_16way(%rcx, %rdx, %rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1,
+		       RD1, RX0, RX0x, RX1, RX1x, RY0, RY0x, RY1, RY1x, RNOT,
+		       .Lxts_gf128mul_and_shl1_mask_0,
+		       .Lxts_gf128mul_and_shl1_mask_1);
+
+	call *%r8;
+
+	store_xts_16way(%rsi, RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1);
+
+	popq %r12;
+	vzeroupper;
+
+	ret;
+ENDPROC(twofish_xts_crypt_16way)
+
+ENTRY(twofish_xts_enc_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+	leaq __twofish_enc_blk16, %r8;
+	jmp twofish_xts_crypt_16way;
+ENDPROC(twofish_xts_enc_16way)
+
+ENTRY(twofish_xts_dec_16way)
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (16 blocks)
+	 *	%rdx: src (16 blocks)
+	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
+	 */
+	leaq __twofish_dec_blk16, %r8;
+	jmp twofish_xts_crypt_16way;
+ENDPROC(twofish_xts_dec_16way)

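A note on the gathers above: vpgatherdd consumes its mask register, which is why every gather in g16 is followed by vpcmpeqd to rebuild the all-ones mask in RNOT. Functionally, each gather ladder is the Twofish g function, four key-dependent byte-indexed table lookups XORed together. A scalar sketch for one 32-bit word (twofish_g is an illustrative name; the tables correspond to the s0..s3 offsets #defined at the top of the file):

#include <stdint.h>

/* Scalar equivalent of the lookups g16/g1_16/g2_16 perform for 16
 * blocks at once: index each key-dependent 256-entry u32 table with
 * one byte of the input word and XOR the results. */
static uint32_t twofish_g(const uint32_t s0[256], const uint32_t s1[256],
			  const uint32_t s2[256], const uint32_t s3[256],
			  uint32_t x)
{
	return s0[x & 0xff] ^		/* vpand RBYTE + gather from rs0 */
	       s1[(x >> 8) & 0xff] ^	/* vpsrld $8  + gather from rs1  */
	       s2[(x >> 16) & 0xff] ^	/* vpsrld $16 + gather from rs2  */
	       s3[x >> 24];		/* vpsrld $24 + gather from rs3  */
}
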
+ 584 - 0
arch/x86/crypto/twofish_avx2_glue.c

@@ -0,0 +1,584 @@
+/*
+ * Glue Code for x86_64/AVX2 assembler optimized version of Twofish
+ *
+ * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/ctr.h>
+#include <crypto/twofish.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <asm/crypto/twofish.h>
+#include <asm/crypto/ablk_helper.h>
+#include <asm/crypto/glue_helper.h>
+#include <crypto/scatterwalk.h>
+
+#define TF_AVX2_PARALLEL_BLOCKS 16
+
+/* 16-way AVX2 parallel cipher functions */
+asmlinkage void twofish_ecb_enc_16way(struct twofish_ctx *ctx, u8 *dst,
+				      const u8 *src);
+asmlinkage void twofish_ecb_dec_16way(struct twofish_ctx *ctx, u8 *dst,
+				      const u8 *src);
+asmlinkage void twofish_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src);
+
+asmlinkage void twofish_ctr_16way(void *ctx, u128 *dst, const u128 *src,
+				  le128 *iv);
+
+asmlinkage void twofish_xts_enc_16way(struct twofish_ctx *ctx, u8 *dst,
+				      const u8 *src, le128 *iv);
+asmlinkage void twofish_xts_dec_16way(struct twofish_ctx *ctx, u8 *dst,
+				      const u8 *src, le128 *iv);
+
+static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	__twofish_enc_blk_3way(ctx, dst, src, false);
+}
+
+static const struct common_glue_ctx twofish_enc = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
+	}, {
+		.num_blocks = 3,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
+	} }
+};
+
+static const struct common_glue_ctx twofish_ctr = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_16way) }
+	},  {
+		.num_blocks = 8,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
+	}, {
+		.num_blocks = 3,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) }
+	} }
+};
+
+static const struct common_glue_ctx twofish_enc_xts = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) }
+	} }
+};
+
+static const struct common_glue_ctx twofish_dec = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
+	}, {
+		.num_blocks = 3,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
+	} }
+};
+
+static const struct common_glue_ctx twofish_dec_cbc = {
+	.num_funcs = 4,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
+	}, {
+		.num_blocks = 3,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
+	} }
+};
+
+static const struct common_glue_ctx twofish_dec_xts = {
+	.num_funcs = 3,
+	.fpu_blocks_limit = 8,
+
+	.funcs = { {
+		.num_blocks = 16,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_16way) }
+	}, {
+		.num_blocks = 8,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) }
+	} }
+};
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&twofish_enc, desc, dst, src, nbytes);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ecb_crypt_128bit(&twofish_dec, desc, dst, src, nbytes);
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(twofish_enc_blk), desc,
+				       dst, src, nbytes);
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_cbc_decrypt_128bit(&twofish_dec_cbc, desc, dst, src,
+				       nbytes);
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		     struct scatterlist *src, unsigned int nbytes)
+{
+	return glue_ctr_crypt_128bit(&twofish_ctr, desc, dst, src, nbytes);
+}
+
+static inline bool twofish_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+	/* since we reuse the AVX (8-way) functions, start using FPU at 8 parallel blocks */
+	return glue_fpu_begin(TF_BLOCK_SIZE, 8, NULL, fpu_enabled, nbytes);
+}
+
+static inline void twofish_fpu_end(bool fpu_enabled)
+{
+	glue_fpu_end(fpu_enabled);
+}
+
+struct crypt_priv {
+	struct twofish_ctx *ctx;
+	bool fpu_enabled;
+};
+
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = TF_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) {
+		twofish_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS;
+		nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= 8 * bsize) {
+		twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * 8;
+		nbytes -= bsize * 8;
+	}
+
+	while (nbytes >= 3 * bsize) {
+		twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * 3;
+		nbytes -= bsize * 3;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		twofish_enc_blk(ctx->ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+	const unsigned int bsize = TF_BLOCK_SIZE;
+	struct crypt_priv *ctx = priv;
+	int i;
+
+	ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
+
+	while (nbytes >= TF_AVX2_PARALLEL_BLOCKS * bsize) {
+		twofish_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * TF_AVX2_PARALLEL_BLOCKS;
+		nbytes -= bsize * TF_AVX2_PARALLEL_BLOCKS;
+	}
+
+	while (nbytes >= 8 * bsize) {
+		twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * 8;
+		nbytes -= bsize * 8;
+	}
+
+	while (nbytes >= 3 * bsize) {
+		twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst);
+		srcdst += bsize * 3;
+		nbytes -= bsize * 3;
+	}
+
+	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+		twofish_dec_blk(ctx->ctx, srcdst, srcdst);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[TF_AVX2_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->twofish_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = encrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	twofish_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	be128 buf[TF_AVX2_PARALLEL_BLOCKS];
+	struct crypt_priv crypt_ctx = {
+		.ctx = &ctx->twofish_ctx,
+		.fpu_enabled = false,
+	};
+	struct lrw_crypt_req req = {
+		.tbuf = buf,
+		.tbuflen = sizeof(buf),
+
+		.table_ctx = &ctx->lrw_table,
+		.crypt_ctx = &crypt_ctx,
+		.crypt_fn = decrypt_callback,
+	};
+	int ret;
+
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+	ret = lrw_crypt(desc, dst, src, nbytes, &req);
+	twofish_fpu_end(crypt_ctx.fpu_enabled);
+
+	return ret;
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(twofish_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+		       struct scatterlist *src, unsigned int nbytes)
+{
+	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+
+	return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(twofish_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
+}
+
+static struct crypto_alg tf_algs[10] = { {
+	.cra_name		= "__ecb-twofish-avx2",
+	.cra_driver_name	= "__driver-ecb-twofish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.setkey		= twofish_setkey,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__cbc-twofish-avx2",
+	.cra_driver_name	= "__driver-cbc-twofish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.setkey		= twofish_setkey,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__ctr-twofish-avx2",
+	.cra_driver_name	= "__driver-ctr-twofish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct twofish_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= twofish_setkey,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+}, {
+	.cra_name		= "__lrw-twofish-avx2",
+	.cra_driver_name	= "__driver-lrw-twofish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_lrw_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_exit		= lrw_twofish_exit_tfm,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE +
+					  TF_BLOCK_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE +
+					  TF_BLOCK_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= lrw_twofish_setkey,
+			.encrypt	= lrw_encrypt,
+			.decrypt	= lrw_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "__xts-twofish-avx2",
+	.cra_driver_name	= "__driver-xts-twofish-avx2",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct twofish_xts_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE * 2,
+			.max_keysize	= TF_MAX_KEY_SIZE * 2,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= xts_twofish_setkey,
+			.encrypt	= xts_encrypt,
+			.decrypt	= xts_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ecb(twofish)",
+	.cra_driver_name	= "ecb-twofish-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(twofish)",
+	.cra_driver_name	= "cbc-twofish-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= __ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(twofish)",
+	.cra_driver_name	= "ctr-twofish-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_encrypt,
+			.geniv		= "chainiv",
+		},
+	},
+}, {
+	.cra_name		= "lrw(twofish)",
+	.cra_driver_name	= "lrw-twofish-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE +
+					  TF_BLOCK_SIZE,
+			.max_keysize	= TF_MAX_KEY_SIZE +
+					  TF_BLOCK_SIZE,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "xts(twofish)",
+	.cra_driver_name	= "xts-twofish-avx2",
+	.cra_priority		= 500,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= TF_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct async_helper_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= ablk_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize	= TF_MIN_KEY_SIZE * 2,
+			.max_keysize	= TF_MAX_KEY_SIZE * 2,
+			.ivsize		= TF_BLOCK_SIZE,
+			.setkey		= ablk_set_key,
+			.encrypt	= ablk_encrypt,
+			.decrypt	= ablk_decrypt,
+		},
+	},
+} };
+
+static int __init init(void)
+{
+	u64 xcr0;
+
+	if (!cpu_has_avx2 || !cpu_has_osxsave) {
+		pr_info("AVX2 instructions are not detected.\n");
+		return -ENODEV;
+	}
+
+	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
+		pr_info("AVX2 detected but unusable.\n");
+		return -ENODEV;
+	}
+
+	return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs));
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs));
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Twofish Cipher Algorithm, AVX2 optimized");
+MODULE_ALIAS("twofish");
+MODULE_ALIAS("twofish-asm");

+ 61 - 40
arch/x86/crypto/twofish_avx_glue.c

@@ -4,6 +4,8 @@
  * Copyright (C) 2012 Johannes Goetzfried
  *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
  *
 + * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 + *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -48,13 +50,26 @@
 /* 8-way parallel cipher functions */
 asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
+EXPORT_SYMBOL_GPL(twofish_ecb_enc_8way);
+
 asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
+EXPORT_SYMBOL_GPL(twofish_ecb_dec_8way);
 
 asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
+EXPORT_SYMBOL_GPL(twofish_cbc_dec_8way);
+
 asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
 				 const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(twofish_ctr_8way);
+
+asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(twofish_xts_enc_8way);
+asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src, le128 *iv);
+EXPORT_SYMBOL_GPL(twofish_xts_dec_8way);
 
 static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 					const u8 *src)
@@ -62,6 +77,20 @@ static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 	__twofish_enc_blk_3way(ctx, dst, src, false);
 }
 
+void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+				  GLUE_FUNC_CAST(twofish_enc_blk));
+}
+EXPORT_SYMBOL_GPL(twofish_xts_enc);
+
+void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+{
+	glue_xts_crypt_128bit_one(ctx, dst, src, iv,
+				  GLUE_FUNC_CAST(twofish_dec_blk));
+}
+EXPORT_SYMBOL_GPL(twofish_xts_dec);
+
 
 static const struct common_glue_ctx twofish_enc = {
 	.num_funcs = 3,
@@ -95,6 +124,19 @@ static const struct common_glue_ctx twofish_ctr = {
 	} }
 };
 
+static const struct common_glue_ctx twofish_enc_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = TWOFISH_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) }
+	} }
+};
+
 static const struct common_glue_ctx twofish_dec = {
 	.num_funcs = 3,
 	.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
@@ -127,6 +169,19 @@ static const struct common_glue_ctx twofish_dec_cbc = {
 	} }
 };
 
+static const struct common_glue_ctx twofish_dec_xts = {
+	.num_funcs = 2,
+	.fpu_blocks_limit = TWOFISH_PARALLEL_BLOCKS,
+
+	.funcs = { {
+		.num_blocks = TWOFISH_PARALLEL_BLOCKS,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) }
+	}, {
+		.num_blocks = 1,
+		.fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) }
+	} }
+};
+
 static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
@@ -275,54 +330,20 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[TWOFISH_PARALLEL_BLOCKS];
-	struct crypt_priv crypt_ctx = {
-		.ctx = &ctx->crypt_ctx,
-		.fpu_enabled = false,
-	};
-	struct xts_crypt_req req = {
-		.tbuf = buf,
-		.tbuflen = sizeof(buf),
 
-		.tweak_ctx = &ctx->tweak_ctx,
-		.tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
-		.crypt_ctx = &crypt_ctx,
-		.crypt_fn = encrypt_callback,
-	};
-	int ret;
-
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-	ret = xts_crypt(desc, dst, src, nbytes, &req);
-	twofish_fpu_end(crypt_ctx.fpu_enabled);
-
-	return ret;
+	return glue_xts_crypt_128bit(&twofish_enc_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(twofish_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
 }
 
 static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 		       struct scatterlist *src, unsigned int nbytes)
 {
 	struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-	be128 buf[TWOFISH_PARALLEL_BLOCKS];
-	struct crypt_priv crypt_ctx = {
-		.ctx = &ctx->crypt_ctx,
-		.fpu_enabled = false,
-	};
-	struct xts_crypt_req req = {
-		.tbuf = buf,
-		.tbuflen = sizeof(buf),
-
-		.tweak_ctx = &ctx->tweak_ctx,
-		.tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
-		.crypt_ctx = &crypt_ctx,
-		.crypt_fn = decrypt_callback,
-	};
-	int ret;
 
-	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-	ret = xts_crypt(desc, dst, src, nbytes, &req);
-	twofish_fpu_end(crypt_ctx.fpu_enabled);
-
-	return ret;
+	return glue_xts_crypt_128bit(&twofish_dec_xts, desc, dst, src, nbytes,
+				     XTS_TWEAK_CAST(twofish_enc_blk),
+				     &ctx->tweak_ctx, &ctx->crypt_ctx);
 }
 
 static struct crypto_alg twofish_algs[10] = { {
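
(One detail worth calling out from both twofish glue files: the xts(twofish) entries advertise min_keysize/max_keysize at twice the cipher's sizes because the supplied key is split into an independent data key and tweak key, the crypt_ctx/tweak_ctx pair. A hedged sketch of that setkey contract with hypothetical names, not the kernel's xts_twofish_setkey:

#include <stdio.h>
#include <string.h>

struct xts_ctx { unsigned char crypt_key[32], tweak_key[32]; };

static int xts_setkey(struct xts_ctx *ctx, const unsigned char *key,
		      unsigned int keylen)
{
	if (keylen % 2 || keylen / 2 > 32)	/* must split evenly */
		return -1;
	memcpy(ctx->crypt_key, key, keylen / 2);	/* data half */
	memcpy(ctx->tweak_key, key + keylen / 2, keylen / 2); /* tweak half */
	return 0;
}

int main(void)
{
	unsigned char key[64];	/* 2 x 256-bit for a 256-bit cipher */
	struct xts_ctx ctx;

	memset(key, 0xa5, sizeof(key));
	printf("setkey: %d\n", xts_setkey(&ctx, key, sizeof(key)));
	return 0;
})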

+ 1 - 0
arch/x86/include/asm/cpufeature.h

@@ -293,6 +293,7 @@ extern const char * const x86_power_flags[32];
 #define cpu_has_ssse3		boot_cpu_has(X86_FEATURE_SSSE3)
 #define cpu_has_aes		boot_cpu_has(X86_FEATURE_AES)
 #define cpu_has_avx		boot_cpu_has(X86_FEATURE_AVX)
+#define cpu_has_avx2		boot_cpu_has(X86_FEATURE_AVX2)
 #define cpu_has_ht		boot_cpu_has(X86_FEATURE_HT)
 #define cpu_has_mp		boot_cpu_has(X86_FEATURE_MP)
 #define cpu_has_nx		boot_cpu_has(X86_FEATURE_NX)
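
(The new cpu_has_avx2 macro is only half of the runtime gate: as in the module init functions above, the kernel also checks via xgetbv(XCR_XFEATURE_ENABLED_MASK) that the OS actually saves SSE and YMM state. In userspace, recent GCC/Clang are expected to fold both checks into one builtin; a hedged sketch assuming those toolchains, not the kernel interface:

#include <stdio.h>

int main(void)
{
	/* __builtin_cpu_supports checks the CPUID bit and, for AVX
	 * features, the OS-enabled XCR0 state as well */
	if (__builtin_cpu_supports("avx2"))
		puts("AVX2 usable");
	else
		puts("AVX2 instructions are not detected or not usable");
	return 0;
})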

+ 43 - 0
arch/x86/include/asm/crypto/blowfish.h

@@ -0,0 +1,43 @@
+#ifndef ASM_X86_BLOWFISH_H
+#define ASM_X86_BLOWFISH_H
+
+#include <linux/crypto.h>
+#include <crypto/blowfish.h>
+
+#define BF_PARALLEL_BLOCKS 4
+
+/* regular block cipher functions */
+asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
+				   bool xor);
+asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src);
+
+/* 4-way parallel cipher functions */
+asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
+					const u8 *src, bool xor);
+asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst,
+				      const u8 *src);
+
+static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src)
+{
+	__blowfish_enc_blk(ctx, dst, src, false);
+}
+
+static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst,
+					const u8 *src)
+{
+	__blowfish_enc_blk(ctx, dst, src, true);
+}
+
+static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst,
+					 const u8 *src)
+{
+	__blowfish_enc_blk_4way(ctx, dst, src, false);
+}
+
+static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst,
+				      const u8 *src)
+{
+	__blowfish_enc_blk_4way(ctx, dst, src, true);
+}
+
+#endif
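+
+(The __blowfish_enc_blk*(..., bool xor) prototypes above use one core routine for two behaviours, plain overwrite or encrypt-and-XOR into dst, which lets CBC/CTR-style callers skip a separate XOR pass. A toy C sketch of the pattern, with a placeholder cipher that is not Blowfish:

#include <stdint.h>
#include <stdio.h>

static uint8_t toy_cipher(uint8_t in) { return (uint8_t)(in * 167 + 13); }

static void __enc_blk(uint8_t *dst, const uint8_t *src, int n, int xor)
{
	for (int i = 0; i < n; i++) {
		uint8_t c = toy_cipher(src[i]);
		dst[i] = xor ? (uint8_t)(dst[i] ^ c) : c;
	}
}

/* thin wrappers mirroring blowfish_enc_blk / blowfish_enc_blk_xor */
static void enc_blk(uint8_t *d, const uint8_t *s, int n)     { __enc_blk(d, s, n, 0); }
static void enc_blk_xor(uint8_t *d, const uint8_t *s, int n) { __enc_blk(d, s, n, 1); }

int main(void)
{
	uint8_t ks[4] = {1, 2, 3, 4}, out[4] = {9, 9, 9, 9};
	enc_blk(out, ks, 4);      /* plain: out = E(ks) */
	enc_blk_xor(out, ks, 4);  /* fused: out ^= E(ks), so all zero */
	printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
	return 0;
})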

+ 19 - 0
arch/x86/include/asm/crypto/camellia.h

@@ -48,6 +48,22 @@ asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
 asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
 				      const u8 *src);
 
+/* 16-way parallel cipher functions (avx/aes-ni) */
+asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+
+asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src);
+asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
+				   const u8 *src, le128 *iv);
+
+asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src, le128 *iv);
+asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst,
+				       const u8 *src, le128 *iv);
+
 static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
 				    const u8 *src)
 {
@@ -79,4 +95,7 @@ extern void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src,
 extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
 				    le128 *iv);
 
+extern void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+extern void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+
 #endif /* ASM_X86_CAMELLIA_H */

+ 24 - 0
arch/x86/include/asm/crypto/glue_helper.h

@@ -14,10 +14,13 @@ typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);
 typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src);
 typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src,
 				       le128 *iv);
+typedef void (*common_glue_xts_func_t)(void *ctx, u128 *dst, const u128 *src,
+				       le128 *iv);
 
 #define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn))
 #define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn))
 #define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn))
+#define GLUE_XTS_FUNC_CAST(fn) ((common_glue_xts_func_t)(fn))
 
 struct common_glue_func_entry {
 	unsigned int num_blocks; /* number of blocks that @fn will process */
@@ -25,6 +28,7 @@ struct common_glue_func_entry {
 		common_glue_func_t ecb;
 		common_glue_cbc_func_t cbc;
 		common_glue_ctr_func_t ctr;
+		common_glue_xts_func_t xts;
 	} fn_u;
 };
 
@@ -96,6 +100,16 @@ static inline void le128_inc(le128 *i)
 	i->b = cpu_to_le64(b);
 }
 
+static inline void le128_gf128mul_x_ble(le128 *dst, const le128 *src)
+{
+	u64 a = le64_to_cpu(src->a);
+	u64 b = le64_to_cpu(src->b);
+	u64 _tt = ((s64)a >> 63) & 0x87;
+
+	dst->a = cpu_to_le64((a << 1) ^ (b >> 63));
+	dst->b = cpu_to_le64((b << 1) ^ _tt);
+}
+
 extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
 				 struct blkcipher_desc *desc,
 				 struct scatterlist *dst,
@@ -118,4 +132,14 @@ extern int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
 				 struct scatterlist *dst,
 				 struct scatterlist *src, unsigned int nbytes);
 
+extern int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
+				 struct blkcipher_desc *desc,
+				 struct scatterlist *dst,
+				 struct scatterlist *src, unsigned int nbytes,
+				 common_glue_func_t tweak_fn, void *tweak_ctx,
+				 void *crypt_ctx);
+
+extern void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src,
+				      le128 *iv, common_glue_func_t fn);
+
 #endif /* _CRYPTO_GLUE_HELPER_H */
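
(le128_gf128mul_x_ble() above is the whole of XTS tweak stepping: multiply the tweak by alpha (x) in GF(2^128), low 64 bits first, reducing by 0x87 when the top bit falls off. The standalone demo below runs the same math with plain integers, dropping the le64_to_cpu byte-order handling, and shows the reduction firing at alpha^128:

#include <inttypes.h>
#include <stdio.h>

struct le128 { uint64_t a, b; };	/* a = high word, b = low word here */

static void gf128mul_x_ble(struct le128 *r, const struct le128 *x)
{
	uint64_t a = x->a, b = x->b;
	uint64_t tt = (uint64_t)((int64_t)a >> 63) & 0x87;

	r->a = (a << 1) ^ (b >> 63);
	r->b = (b << 1) ^ tt;
}

int main(void)
{
	struct le128 t = { .a = 0, .b = 1 };	/* alpha^0 = 1 */

	for (int i = 1; i <= 128; i++) {
		gf128mul_x_ble(&t, &t);
		if (i == 64 || i == 128)
			printf("alpha^%-3d a=%016" PRIx64 " b=%016" PRIx64 "\n",
			       i, t.a, t.b);
	}
	/* prints: alpha^64  a=...0001 b=...0000 (carry into the high word)
	 *         alpha^128 a=...0000 b=...0087 (x^128 = x^7+x^2+x+1)    */
	return 0;
})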

+ 29 - 0
arch/x86/include/asm/crypto/serpent-avx.h

@@ -6,6 +6,16 @@
 
 #define SERPENT_PARALLEL_BLOCKS 8
 
+struct serpent_lrw_ctx {
+	struct lrw_table_ctx lrw_table;
+	struct serpent_ctx serpent_ctx;
+};
+
+struct serpent_xts_ctx {
+	struct serpent_ctx tweak_ctx;
+	struct serpent_ctx crypt_ctx;
+};
+
 asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 					 const u8 *src);
 asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
@@ -16,4 +26,23 @@ asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst,
 				     const u8 *src, le128 *iv);
 
+asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src, le128 *iv);
+asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+					 const u8 *src, le128 *iv);
+
+extern void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src,
+				le128 *iv);
+
+extern void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+extern void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+
+extern int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen);
+
+extern void lrw_serpent_exit_tfm(struct crypto_tfm *tfm);
+
+extern int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+			      unsigned int keylen);
+
 #endif

+ 18 - 0
arch/x86/include/asm/crypto/twofish.h

@@ -28,6 +28,20 @@ asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
 				     const u8 *src);
 
+/* 8-way parallel cipher functions */
+asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src);
+asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src);
+asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src);
+asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
+				 const u8 *src, le128 *iv);
+asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src, le128 *iv);
+asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+				     const u8 *src, le128 *iv);
+
 /* helpers from twofish_x86_64-3way module */
 extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src);
 extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src,
@@ -43,4 +57,8 @@ extern void lrw_twofish_exit_tfm(struct crypto_tfm *tfm);
 extern int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
 			      unsigned int keylen);
 
+/* helpers from twofish-avx module */
+extern void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+extern void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+
 #endif /* ASM_X86_TWOFISH_H */

+ 123 - 10
crypto/Kconfig

@@ -198,6 +198,7 @@ config CRYPTO_GCM
 	select CRYPTO_CTR
 	select CRYPTO_AEAD
 	select CRYPTO_GHASH
+	select CRYPTO_NULL
 	help
 	  Support for Galois/Counter Mode (GCM) and Galois Message
 	  Authentication Code (GMAC). Required for IPSec.
@@ -282,6 +283,17 @@ config CRYPTO_XTS
 
 comment "Hash modes"
 
+config CRYPTO_CMAC
+	tristate "CMAC support"
+	select CRYPTO_HASH
+	select CRYPTO_MANAGER
+	help
+	  Cipher-based Message Authentication Code (CMAC) specified by
+	  The National Institute of Standards and Technology (NIST).
+
+	  https://tools.ietf.org/html/rfc4493
+	  http://csrc.nist.gov/publications/nistpubs/800-38B/SP_800-38B.pdf
+
 config CRYPTO_HMAC
 	tristate "HMAC support"
 	select CRYPTO_HASH
@@ -322,19 +334,9 @@ config CRYPTO_CRC32C
 	  by iSCSI for header and data digests and by others.
 	  See Castagnoli93.  Module will be crc32c.
 
-config CRYPTO_CRC32C_X86_64
-	bool
-	depends on X86 && 64BIT
-	select CRYPTO_HASH
-	help
-	  In Intel processor with SSE4.2 supported, the processor will
-	  support CRC32C calculation using hardware accelerated CRC32
-	  instruction optimized with PCLMULQDQ instruction when available.
-
 config CRYPTO_CRC32C_INTEL
 	tristate "CRC32c INTEL hardware acceleration"
 	depends on X86
-	select CRYPTO_CRC32C_X86_64 if 64BIT
 	select CRYPTO_HASH
 	help
 	  In Intel processor with SSE4.2 supported, the processor will
@@ -480,6 +482,28 @@ config CRYPTO_SHA1_SSSE3
 	  using Supplemental SSE3 (SSSE3) instructions or Advanced Vector
 	  Extensions (AVX), when available.
 
+config CRYPTO_SHA256_SSSE3
+	tristate "SHA256 digest algorithm (SSSE3/AVX/AVX2)"
+	depends on X86 && 64BIT
+	select CRYPTO_SHA256
+	select CRYPTO_HASH
+	help
+	  SHA-256 secure hash standard (DFIPS 180-2) implemented
+	  using Supplemental SSE3 (SSSE3) instructions, or Advanced Vector
+	  Extensions version 1 (AVX1), or Advanced Vector Extensions
+	  version 2 (AVX2) instructions, when available.
+
+config CRYPTO_SHA512_SSSE3
+	tristate "SHA512 digest algorithm (SSSE3/AVX/AVX2)"
+	depends on X86 && 64BIT
+	select CRYPTO_SHA512
+	select CRYPTO_HASH
+	help
+	  SHA-512 secure hash standard (DFIPS 180-2) implemented
+	  using Supplemental SSE3 (SSSE3) instructions, or Advanced Vector
+	  Extensions version 1 (AVX1), or Advanced Vector Extensions
+	  version 2 (AVX2) instructions, when available.
+
 config CRYPTO_SHA1_SPARC64
 	tristate "SHA1 digest algorithm (SPARC64)"
 	depends on SPARC64
@@ -654,6 +678,7 @@ config CRYPTO_AES_NI_INTEL
 	select CRYPTO_CRYPTD
 	select CRYPTO_ABLK_HELPER_X86
 	select CRYPTO_ALGAPI
+	select CRYPTO_GLUE_HELPER_X86 if 64BIT
 	select CRYPTO_LRW
 	select CRYPTO_XTS
 	help
@@ -795,6 +820,24 @@ config CRYPTO_BLOWFISH_X86_64
 	  See also:
 	  <http://www.schneier.com/blowfish.html>
 
+config CRYPTO_BLOWFISH_AVX2_X86_64
+	tristate "Blowfish cipher algorithm (x86_64/AVX2)"
+	depends on X86 && 64BIT
+	select CRYPTO_ALGAPI
+	select CRYPTO_CRYPTD
+	select CRYPTO_ABLK_HELPER_X86
+	select CRYPTO_BLOWFISH_COMMON
+	select CRYPTO_BLOWFISH_X86_64
+	help
+	  Blowfish cipher algorithm (x86_64/AVX2), by Bruce Schneier.
+
+	  This is a variable key length cipher which can use keys from 32
+	  bits to 448 bits in length.  It's fast, simple and specifically
+	  designed for use on "large microprocessors".
+
+	  See also:
+	  <http://www.schneier.com/blowfish.html>
+
 config CRYPTO_CAMELLIA
 	tristate "Camellia cipher algorithms"
 	depends on CRYPTO
@@ -851,6 +894,29 @@ config CRYPTO_CAMELLIA_AESNI_AVX_X86_64
 	  See also:
 	  <https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>
 
+config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64
+	tristate "Camellia cipher algorithm (x86_64/AES-NI/AVX2)"
+	depends on X86 && 64BIT
+	depends on CRYPTO
+	select CRYPTO_ALGAPI
+	select CRYPTO_CRYPTD
+	select CRYPTO_ABLK_HELPER_X86
+	select CRYPTO_GLUE_HELPER_X86
+	select CRYPTO_CAMELLIA_X86_64
+	select CRYPTO_CAMELLIA_AESNI_AVX_X86_64
+	select CRYPTO_LRW
+	select CRYPTO_XTS
+	help
+	  Camellia cipher algorithm module (x86_64/AES-NI/AVX2).
+
+	  Camellia is a symmetric key block cipher developed jointly
+	  at NTT and Mitsubishi Electric Corporation.
+
+	  The Camellia specifies three key sizes: 128, 192 and 256 bits.
+
+	  See also:
+	  <https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html>
+
 config CRYPTO_CAMELLIA_SPARC64
 	tristate "Camellia cipher algorithm (SPARC64)"
 	depends on SPARC64
@@ -1088,6 +1154,29 @@ config CRYPTO_SERPENT_AVX_X86_64
 	  See also:
 	  <http://www.cl.cam.ac.uk/~rja14/serpent.html>
 
+config CRYPTO_SERPENT_AVX2_X86_64
+	tristate "Serpent cipher algorithm (x86_64/AVX2)"
+	depends on X86 && 64BIT
+	select CRYPTO_ALGAPI
+	select CRYPTO_CRYPTD
+	select CRYPTO_ABLK_HELPER_X86
+	select CRYPTO_GLUE_HELPER_X86
+	select CRYPTO_SERPENT
+	select CRYPTO_SERPENT_AVX_X86_64
+	select CRYPTO_LRW
+	select CRYPTO_XTS
+	help
+	  Serpent cipher algorithm, by Anderson, Biham & Knudsen.
+
+	  Keys are allowed to be from 0 to 256 bits in length, in steps
+	  of 8 bits.
+
+	  This module provides Serpent cipher algorithm that processes 16
+	  blocks parallel using AVX2 instruction set.
+
+	  See also:
+	  <http://www.cl.cam.ac.uk/~rja14/serpent.html>
+
 config CRYPTO_TEA
 	tristate "TEA, XTEA and XETA cipher algorithms"
 	select CRYPTO_ALGAPI
@@ -1207,6 +1296,30 @@ config CRYPTO_TWOFISH_AVX_X86_64
 	  See also:
 	  <http://www.schneier.com/twofish.html>
 
+config CRYPTO_TWOFISH_AVX2_X86_64
+	tristate "Twofish cipher algorithm (x86_64/AVX2)"
+	depends on X86 && 64BIT
+	select CRYPTO_ALGAPI
+	select CRYPTO_CRYPTD
+	select CRYPTO_ABLK_HELPER_X86
+	select CRYPTO_GLUE_HELPER_X86
+	select CRYPTO_TWOFISH_COMMON
+	select CRYPTO_TWOFISH_X86_64
+	select CRYPTO_TWOFISH_X86_64_3WAY
+	select CRYPTO_TWOFISH_AVX_X86_64
+	select CRYPTO_LRW
+	select CRYPTO_XTS
+	help
+	  Twofish cipher algorithm (x86_64/AVX2).
+
+	  Twofish was submitted as an AES (Advanced Encryption Standard)
+	  candidate cipher by researchers at CounterPane Systems.  It is a
+	  16 round block cipher supporting key sizes of 128, 192, and 256
+	  bits.
+
+	  See also:
+	  <http://www.schneier.com/twofish.html>
+
 comment "Compression"
 
 config CRYPTO_DEFLATE
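
(For completeness, the options added in this pull are selected like any other crypto entries; a hypothetical .config fragment for an x86_64 build might read:

# hypothetical fragment; "m" builds each as a module
CONFIG_CRYPTO_CMAC=m
CONFIG_CRYPTO_SHA256_SSSE3=m
CONFIG_CRYPTO_SHA512_SSSE3=m
CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64=m
CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m
CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m
CONFIG_CRYPTO_TWOFISH_AVX2_X86_64=m)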

+ 1 - 0
crypto/Makefile

@@ -32,6 +32,7 @@ cryptomgr-y := algboss.o testmgr.o
 
 obj-$(CONFIG_CRYPTO_MANAGER2) += cryptomgr.o
 obj-$(CONFIG_CRYPTO_USER) += crypto_user.o
+obj-$(CONFIG_CRYPTO_CMAC) += cmac.o
 obj-$(CONFIG_CRYPTO_HMAC) += hmac.o
 obj-$(CONFIG_CRYPTO_VMAC) += vmac.o
 obj-$(CONFIG_CRYPTO_XCBC) += xcbc.o

+ 315 - 0
crypto/cmac.c

@@ -0,0 +1,315 @@
+/*
+ * CMAC: Cipher Block Mode for Authentication
+ *
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * Based on work by:
+ *  Copyright © 2013 Tom St Denis <tstdenis@elliptictech.com>
+ * Based on crypto/xcbc.c:
+ *  Copyright © 2006 USAGI/WIDE Project,
+ *   Author: Kazunori Miyazawa <miyazawa@linux-ipv6.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+/*
+ * +------------------------
+ * | <parent tfm>
+ * +------------------------
+ * | cmac_tfm_ctx
+ * +------------------------
+ * | consts (block size * 2)
+ * +------------------------
+ */
+struct cmac_tfm_ctx {
+	struct crypto_cipher *child;
+	u8 ctx[];
+};
+
+/*
+ * +------------------------
+ * | <shash desc>
+ * +------------------------
+ * | cmac_desc_ctx
+ * +------------------------
+ * | odds (block size)
+ * +------------------------
+ * | prev (block size)
+ * +------------------------
+ */
+struct cmac_desc_ctx {
+	unsigned int len;
+	u8 ctx[];
+};
+
+static int crypto_cmac_digest_setkey(struct crypto_shash *parent,
+				     const u8 *inkey, unsigned int keylen)
+{
+	unsigned long alignmask = crypto_shash_alignmask(parent);
+	struct cmac_tfm_ctx *ctx = crypto_shash_ctx(parent);
+	unsigned int bs = crypto_shash_blocksize(parent);
+	__be64 *consts = PTR_ALIGN((void *)ctx->ctx, alignmask + 1);
+	u64 _const[2];
+	int i, err = 0;
+	u8 msb_mask, gfmask;
+
+	err = crypto_cipher_setkey(ctx->child, inkey, keylen);
+	if (err)
+		return err;
+
+	/* encrypt the zero block */
+	memset(consts, 0, bs);
+	crypto_cipher_encrypt_one(ctx->child, (u8 *)consts, (u8 *)consts);
+
+	switch (bs) {
+	case 16:
+		gfmask = 0x87;
+		_const[0] = be64_to_cpu(consts[1]);
+		_const[1] = be64_to_cpu(consts[0]);
+
+		/* gf(2^128) multiply zero-ciphertext with u and u^2 */
+		for (i = 0; i < 4; i += 2) {
+			msb_mask = ((s64)_const[1] >> 63) & gfmask;
+			_const[1] = (_const[1] << 1) | (_const[0] >> 63);
+			_const[0] = (_const[0] << 1) ^ msb_mask;
+
+			consts[i + 0] = cpu_to_be64(_const[1]);
+			consts[i + 1] = cpu_to_be64(_const[0]);
+		}
+
+		break;
+	case 8:
+		gfmask = 0x1B;
+		_const[0] = be64_to_cpu(consts[0]);
+
+		/* gf(2^64) multiply zero-ciphertext with u and u^2 */
+		for (i = 0; i < 2; i++) {
+			msb_mask = ((s64)_const[0] >> 63) & gfmask;
+			_const[0] = (_const[0] << 1) ^ msb_mask;
+
+			consts[i] = cpu_to_be64(_const[0]);
+		}
+
+		break;
+	}
+
+	return 0;
+}
+
+static int crypto_cmac_digest_init(struct shash_desc *pdesc)
+{
+	unsigned long alignmask = crypto_shash_alignmask(pdesc->tfm);
+	struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
+	int bs = crypto_shash_blocksize(pdesc->tfm);
+	u8 *prev = PTR_ALIGN((void *)ctx->ctx, alignmask + 1) + bs;
+
+	ctx->len = 0;
+	memset(prev, 0, bs);
+
+	return 0;
+}
+
+static int crypto_cmac_digest_update(struct shash_desc *pdesc, const u8 *p,
+				     unsigned int len)
+{
+	struct crypto_shash *parent = pdesc->tfm;
+	unsigned long alignmask = crypto_shash_alignmask(parent);
+	struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent);
+	struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
+	struct crypto_cipher *tfm = tctx->child;
+	int bs = crypto_shash_blocksize(parent);
+	u8 *odds = PTR_ALIGN((void *)ctx->ctx, alignmask + 1);
+	u8 *prev = odds + bs;
+
+	/* checking the data can fill the block */
+	if ((ctx->len + len) <= bs) {
+		memcpy(odds + ctx->len, p, len);
+		ctx->len += len;
+		return 0;
+	}
+
+	/* filling odds with new data and encrypting it */
+	memcpy(odds + ctx->len, p, bs - ctx->len);
+	len -= bs - ctx->len;
+	p += bs - ctx->len;
+
+	crypto_xor(prev, odds, bs);
+	crypto_cipher_encrypt_one(tfm, prev, prev);
+
+	/* clearing the length */
+	ctx->len = 0;
+
+	/* encrypting the rest of data */
+	while (len > bs) {
+		crypto_xor(prev, p, bs);
+		crypto_cipher_encrypt_one(tfm, prev, prev);
+		p += bs;
+		len -= bs;
+	}
+
+	/* keeping the surplus of blocksize */
+	if (len) {
+		memcpy(odds, p, len);
+		ctx->len = len;
+	}
+
+	return 0;
+}
+
+static int crypto_cmac_digest_final(struct shash_desc *pdesc, u8 *out)
+{
+	struct crypto_shash *parent = pdesc->tfm;
+	unsigned long alignmask = crypto_shash_alignmask(parent);
+	struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent);
+	struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc);
+	struct crypto_cipher *tfm = tctx->child;
+	int bs = crypto_shash_blocksize(parent);
+	u8 *consts = PTR_ALIGN((void *)tctx->ctx, alignmask + 1);
+	u8 *odds = PTR_ALIGN((void *)ctx->ctx, alignmask + 1);
+	u8 *prev = odds + bs;
+	unsigned int offset = 0;
+
+	if (ctx->len != bs) {
+		unsigned int rlen;
+		u8 *p = odds + ctx->len;
+
+		*p = 0x80;
+		p++;
+
+		rlen = bs - ctx->len - 1;
+		if (rlen)
+			memset(p, 0, rlen);
+
+		offset += bs;
+	}
+
+	crypto_xor(prev, odds, bs);
+	crypto_xor(prev, consts + offset, bs);
+
+	crypto_cipher_encrypt_one(tfm, out, prev);
+
+	return 0;
+}
+
+static int cmac_init_tfm(struct crypto_tfm *tfm)
+{
+	struct crypto_cipher *cipher;
+	struct crypto_instance *inst = (void *)tfm->__crt_alg;
+	struct crypto_spawn *spawn = crypto_instance_ctx(inst);
+	struct cmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	cipher = crypto_spawn_cipher(spawn);
+	if (IS_ERR(cipher))
+		return PTR_ERR(cipher);
+
+	ctx->child = cipher;
+
+	return 0;
+};
+
+static void cmac_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct cmac_tfm_ctx *ctx = crypto_tfm_ctx(tfm);
+	crypto_free_cipher(ctx->child);
+}
+
+static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb)
+{
+	struct shash_instance *inst;
+	struct crypto_alg *alg;
+	unsigned long alignmask;
+	int err;
+
+	err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SHASH);
+	if (err)
+		return err;
+
+	alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_CIPHER,
+				  CRYPTO_ALG_TYPE_MASK);
+	if (IS_ERR(alg))
+		return PTR_ERR(alg);
+
+	switch (alg->cra_blocksize) {
+	case 16:
+	case 8:
+		break;
+	default:
+		goto out_put_alg;
+	}
+
+	inst = shash_alloc_instance("cmac", alg);
+	err = PTR_ERR(inst);
+	if (IS_ERR(inst))
+		goto out_put_alg;
+
+	err = crypto_init_spawn(shash_instance_ctx(inst), alg,
+				shash_crypto_instance(inst),
+				CRYPTO_ALG_TYPE_MASK);
+	if (err)
+		goto out_free_inst;
+
+	alignmask = alg->cra_alignmask | (sizeof(long) - 1);
+	inst->alg.base.cra_alignmask = alignmask;
+	inst->alg.base.cra_priority = alg->cra_priority;
+	inst->alg.base.cra_blocksize = alg->cra_blocksize;
+
+	inst->alg.digestsize = alg->cra_blocksize;
+	inst->alg.descsize =
+		ALIGN(sizeof(struct cmac_desc_ctx), crypto_tfm_ctx_alignment())
+		+ (alignmask & ~(crypto_tfm_ctx_alignment() - 1))
+		+ alg->cra_blocksize * 2;
+
+	inst->alg.base.cra_ctxsize =
+		ALIGN(sizeof(struct cmac_tfm_ctx), alignmask + 1)
+		+ alg->cra_blocksize * 2;
+
+	inst->alg.base.cra_init = cmac_init_tfm;
+	inst->alg.base.cra_exit = cmac_exit_tfm;
+
+	inst->alg.init = crypto_cmac_digest_init;
+	inst->alg.update = crypto_cmac_digest_update;
+	inst->alg.final = crypto_cmac_digest_final;
+	inst->alg.setkey = crypto_cmac_digest_setkey;
+
+	err = shash_register_instance(tmpl, inst);
+	if (err) {
+out_free_inst:
+		shash_free_instance(shash_crypto_instance(inst));
+	}
+
+out_put_alg:
+	crypto_mod_put(alg);
+	return err;
+}
+
+static struct crypto_template crypto_cmac_tmpl = {
+	.name = "cmac",
+	.create = cmac_create,
+	.free = shash_free_instance,
+	.module = THIS_MODULE,
+};
+
+static int __init crypto_cmac_module_init(void)
+{
+	return crypto_register_template(&crypto_cmac_tmpl);
+}
+
+static void __exit crypto_cmac_module_exit(void)
+{
+	crypto_unregister_template(&crypto_cmac_tmpl);
+}
+
+module_init(crypto_cmac_module_init);
+module_exit(crypto_cmac_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("CMAC keyed hash algorithm");

+ 2 - 2
crypto/crypto_user.c

@@ -440,7 +440,7 @@ static const struct nla_policy crypto_policy[CRYPTOCFGA_MAX+1] = {
 
 #undef MSGSIZE
 
-static struct crypto_link {
+static const struct crypto_link {
 	int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **);
 	int (*dump)(struct sk_buff *, struct netlink_callback *);
 	int (*done)(struct netlink_callback *);
@@ -456,7 +456,7 @@ static struct crypto_link {
 static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct nlattr *attrs[CRYPTOCFGA_MAX+1];
-	struct crypto_link *link;
+	const struct crypto_link *link;
 	int type, err;
 
 	type = nlh->nlmsg_type;

+ 97 - 19
crypto/gcm.c

@@ -37,8 +37,14 @@ struct crypto_rfc4106_ctx {
 	u8 nonce[4];
 };
 
+struct crypto_rfc4543_instance_ctx {
+	struct crypto_aead_spawn aead;
+	struct crypto_skcipher_spawn null;
+};
+
 struct crypto_rfc4543_ctx {
 	struct crypto_aead *child;
+	struct crypto_blkcipher *null;
 	u8 nonce[4];
 };
 
@@ -1094,21 +1100,36 @@ static int crypto_rfc4543_setauthsize(struct crypto_aead *parent,
 	return crypto_aead_setauthsize(ctx->child, authsize);
 }
 
+static void crypto_rfc4543_done(struct crypto_async_request *areq, int err)
+{
+	struct aead_request *req = areq->data;
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct crypto_rfc4543_req_ctx *rctx = crypto_rfc4543_reqctx(req);
+
+	if (!err) {
+		scatterwalk_map_and_copy(rctx->auth_tag, req->dst,
+					 req->cryptlen,
+					 crypto_aead_authsize(aead), 1);
+	}
+
+	aead_request_complete(req, err);
+}
+
 static struct aead_request *crypto_rfc4543_crypt(struct aead_request *req,
-						 int enc)
+						 bool enc)
 {
 	struct crypto_aead *aead = crypto_aead_reqtfm(req);
 	struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(aead);
 	struct crypto_rfc4543_req_ctx *rctx = crypto_rfc4543_reqctx(req);
 	struct aead_request *subreq = &rctx->subreq;
-	struct scatterlist *dst = req->dst;
+	struct scatterlist *src = req->src;
 	struct scatterlist *cipher = rctx->cipher;
 	struct scatterlist *payload = rctx->payload;
 	struct scatterlist *assoc = rctx->assoc;
 	unsigned int authsize = crypto_aead_authsize(aead);
 	unsigned int assoclen = req->assoclen;
-	struct page *dstp;
-	u8 *vdst;
+	struct page *srcp;
+	u8 *vsrc;
 	u8 *iv = PTR_ALIGN((u8 *)(rctx + 1) + crypto_aead_reqsize(ctx->child),
 			   crypto_aead_alignmask(ctx->child) + 1);
 
@@ -1119,19 +1140,19 @@ static struct aead_request *crypto_rfc4543_crypt(struct aead_request *req,
 	if (enc)
 		memset(rctx->auth_tag, 0, authsize);
 	else
-		scatterwalk_map_and_copy(rctx->auth_tag, dst,
+		scatterwalk_map_and_copy(rctx->auth_tag, src,
 					 req->cryptlen - authsize,
 					 authsize, 0);
 
 	sg_init_one(cipher, rctx->auth_tag, authsize);
 
 	/* construct the aad */
-	dstp = sg_page(dst);
-	vdst = PageHighMem(dstp) ? NULL : page_address(dstp) + dst->offset;
+	srcp = sg_page(src);
+	vsrc = PageHighMem(srcp) ? NULL : page_address(srcp) + src->offset;
 
 	sg_init_table(payload, 2);
 	sg_set_buf(payload, req->iv, 8);
-	scatterwalk_crypto_chain(payload, dst, vdst == req->iv + 8, 2);
+	scatterwalk_crypto_chain(payload, src, vsrc == req->iv + 8, 2);
 	assoclen += 8 + req->cryptlen - (enc ? 0 : authsize);
 
 	if (req->assoc->length == req->assoclen) {
@@ -1150,14 +1171,27 @@ static struct aead_request *crypto_rfc4543_crypt(struct aead_request *req,
 	scatterwalk_crypto_chain(assoc, payload, 0, 2);
 
 	aead_request_set_tfm(subreq, ctx->child);
-	aead_request_set_callback(subreq, req->base.flags, req->base.complete,
-				  req->base.data);
+	aead_request_set_callback(subreq, req->base.flags, crypto_rfc4543_done,
+				  req);
 	aead_request_set_crypt(subreq, cipher, cipher, enc ? 0 : authsize, iv);
 	aead_request_set_assoc(subreq, assoc, assoclen);
 
 	return subreq;
 }
 
+static int crypto_rfc4543_copy_src_to_dst(struct aead_request *req, bool enc)
+{
+	struct crypto_aead *aead = crypto_aead_reqtfm(req);
+	struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(aead);
+	unsigned int authsize = crypto_aead_authsize(aead);
+	unsigned int nbytes = req->cryptlen - (enc ? 0 : authsize);
+	struct blkcipher_desc desc = {
+		.tfm = ctx->null,
+	};
+
+	return crypto_blkcipher_encrypt(&desc, req->dst, req->src, nbytes);
+}
+
 static int crypto_rfc4543_encrypt(struct aead_request *req)
 {
 	struct crypto_aead *aead = crypto_aead_reqtfm(req);
@@ -1165,7 +1199,13 @@ static int crypto_rfc4543_encrypt(struct aead_request *req)
 	struct aead_request *subreq;
 	int err;
 
-	subreq = crypto_rfc4543_crypt(req, 1);
+	if (req->src != req->dst) {
+		err = crypto_rfc4543_copy_src_to_dst(req, true);
+		if (err)
+			return err;
+	}
+
+	subreq = crypto_rfc4543_crypt(req, true);
 	err = crypto_aead_encrypt(subreq);
 	if (err)
 		return err;
@@ -1178,7 +1218,15 @@ static int crypto_rfc4543_encrypt(struct aead_request *req)
 
 static int crypto_rfc4543_decrypt(struct aead_request *req)
 {
-	req = crypto_rfc4543_crypt(req, 0);
+	int err;
+
+	if (req->src != req->dst) {
+		err = crypto_rfc4543_copy_src_to_dst(req, false);
+		if (err)
+			return err;
+	}
+
+	req = crypto_rfc4543_crypt(req, false);
 
 	return crypto_aead_decrypt(req);
 }
@@ -1186,16 +1234,25 @@ static int crypto_rfc4543_decrypt(struct aead_request *req)
 static int crypto_rfc4543_init_tfm(struct crypto_tfm *tfm)
 {
 	struct crypto_instance *inst = (void *)tfm->__crt_alg;
-	struct crypto_aead_spawn *spawn = crypto_instance_ctx(inst);
+	struct crypto_rfc4543_instance_ctx *ictx = crypto_instance_ctx(inst);
+	struct crypto_aead_spawn *spawn = &ictx->aead;
 	struct crypto_rfc4543_ctx *ctx = crypto_tfm_ctx(tfm);
 	struct crypto_aead *aead;
+	struct crypto_blkcipher *null;
 	unsigned long align;
+	int err = 0;
 
 	aead = crypto_spawn_aead(spawn);
 	if (IS_ERR(aead))
 		return PTR_ERR(aead);
 
+	null = crypto_spawn_blkcipher(&ictx->null.base);
+	err = PTR_ERR(null);
+	if (IS_ERR(null))
+		goto err_free_aead;
+
 	ctx->child = aead;
+	ctx->null = null;
 
 	align = crypto_aead_alignmask(aead);
 	align &= ~(crypto_tfm_ctx_alignment() - 1);
@@ -1205,6 +1262,10 @@ static int crypto_rfc4543_init_tfm(struct crypto_tfm *tfm)
 				align + 16;
 
 	return 0;
+
+err_free_aead:
+	crypto_free_aead(aead);
+	return err;
 }
 
 static void crypto_rfc4543_exit_tfm(struct crypto_tfm *tfm)
@@ -1212,6 +1273,7 @@ static void crypto_rfc4543_exit_tfm(struct crypto_tfm *tfm)
 	struct crypto_rfc4543_ctx *ctx = crypto_tfm_ctx(tfm);
 
 	crypto_free_aead(ctx->child);
+	crypto_free_blkcipher(ctx->null);
 }
 
 static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
@@ -1220,6 +1282,7 @@ static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
 	struct crypto_instance *inst;
 	struct crypto_aead_spawn *spawn;
 	struct crypto_alg *alg;
+	struct crypto_rfc4543_instance_ctx *ctx;
 	const char *ccm_name;
 	int err;
 
@@ -1234,11 +1297,12 @@ static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
 	if (IS_ERR(ccm_name))
 		return ERR_CAST(ccm_name);
 
-	inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
+	inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL);
 	if (!inst)
 		return ERR_PTR(-ENOMEM);
 
-	spawn = crypto_instance_ctx(inst);
+	ctx = crypto_instance_ctx(inst);
+	spawn = &ctx->aead;
 	crypto_set_aead_spawn(spawn, inst);
 	err = crypto_grab_aead(spawn, ccm_name, 0,
 			       crypto_requires_sync(algt->type, algt->mask));
@@ -1247,15 +1311,23 @@ static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
 
 
 	alg = crypto_aead_spawn_alg(spawn);
 	alg = crypto_aead_spawn_alg(spawn);
 
 
+	crypto_set_skcipher_spawn(&ctx->null, inst);
+	err = crypto_grab_skcipher(&ctx->null, "ecb(cipher_null)", 0,
+				   CRYPTO_ALG_ASYNC);
+	if (err)
+		goto out_drop_alg;
+
+	crypto_skcipher_spawn_alg(&ctx->null);
+
 	err = -EINVAL;
 	err = -EINVAL;
 
 
 	/* We only support 16-byte blocks. */
 	/* We only support 16-byte blocks. */
 	if (alg->cra_aead.ivsize != 16)
 	if (alg->cra_aead.ivsize != 16)
-		goto out_drop_alg;
+		goto out_drop_ecbnull;
 
 
 	/* Not a stream cipher? */
 	/* Not a stream cipher? */
 	if (alg->cra_blocksize != 1)
 	if (alg->cra_blocksize != 1)
-		goto out_drop_alg;
+		goto out_drop_ecbnull;
 
 
 	err = -ENAMETOOLONG;
 	err = -ENAMETOOLONG;
 	if (snprintf(inst->alg.cra_name, CRYPTO_MAX_ALG_NAME,
 	if (snprintf(inst->alg.cra_name, CRYPTO_MAX_ALG_NAME,
@@ -1263,7 +1335,7 @@ static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
 	    snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME,
 	    snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME,
 		     "rfc4543(%s)", alg->cra_driver_name) >=
 		     "rfc4543(%s)", alg->cra_driver_name) >=
 	    CRYPTO_MAX_ALG_NAME)
 	    CRYPTO_MAX_ALG_NAME)
-		goto out_drop_alg;
+		goto out_drop_ecbnull;
 
 
 	inst->alg.cra_flags = CRYPTO_ALG_TYPE_AEAD;
 	inst->alg.cra_flags = CRYPTO_ALG_TYPE_AEAD;
 	inst->alg.cra_flags |= alg->cra_flags & CRYPTO_ALG_ASYNC;
 	inst->alg.cra_flags |= alg->cra_flags & CRYPTO_ALG_ASYNC;
@@ -1290,6 +1362,8 @@ static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb)
 out:
 out:
 	return inst;
 	return inst;
 
 
+out_drop_ecbnull:
+	crypto_drop_skcipher(&ctx->null);
 out_drop_alg:
 out_drop_alg:
 	crypto_drop_aead(spawn);
 	crypto_drop_aead(spawn);
 out_free_inst:
 out_free_inst:
@@ -1300,7 +1374,11 @@ out_free_inst:
 
 
 static void crypto_rfc4543_free(struct crypto_instance *inst)
 static void crypto_rfc4543_free(struct crypto_instance *inst)
 {
 {
-	crypto_drop_spawn(crypto_instance_ctx(inst));
+	struct crypto_rfc4543_instance_ctx *ctx = crypto_instance_ctx(inst);
+
+	crypto_drop_aead(&ctx->aead);
+	crypto_drop_skcipher(&ctx->null);
+
 	kfree(inst);
 	kfree(inst);
 }
 }
 
 

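Note on the crypto/gcm.c hunks above: rfc4543 (GMAC) authenticates the plaintext itself, so when a caller hands in different src and dst scatterlists the data must already sit in dst before the underlying GCM transform runs over it. The fix performs that copy through the "ecb(cipher_null)" spawn set up in init_tfm(), which behaves as a scatterlist-aware memcpy. The crypto_rfc4543_copy_src_to_dst() helper itself falls outside this excerpt; a minimal sketch of its plausible shape, reconstructed rather than quoted from the patch:

	static int crypto_rfc4543_copy_src_to_dst(struct aead_request *req, bool enc)
	{
		struct crypto_aead *aead = crypto_aead_reqtfm(req);
		struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(aead);
		unsigned int authsize = crypto_aead_authsize(aead);
		/* On decryption the trailing authentication tag is not copied. */
		unsigned int nbytes = req->cryptlen - (enc ? 0 : authsize);
		struct blkcipher_desc desc = { .tfm = ctx->null };

		/* "Encrypting" with ecb(cipher_null) copies src to dst. */
		return crypto_blkcipher_encrypt(&desc, req->dst, req->src, nbytes);
	}
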
+ 6 - 5
crypto/sha256_generic.c

@@ -246,7 +246,7 @@ static int sha256_init(struct shash_desc *desc)
 	return 0;
 }
 
-static int sha256_update(struct shash_desc *desc, const u8 *data,
+int crypto_sha256_update(struct shash_desc *desc, const u8 *data,
 			  unsigned int len)
 {
 	struct sha256_state *sctx = shash_desc_ctx(desc);
@@ -277,6 +277,7 @@ static int sha256_update(struct shash_desc *desc, const u8 *data,
 
 	return 0;
 }
+EXPORT_SYMBOL(crypto_sha256_update);
 
 static int sha256_final(struct shash_desc *desc, u8 *out)
 {
@@ -293,10 +294,10 @@ static int sha256_final(struct shash_desc *desc, u8 *out)
 	/* Pad out to 56 mod 64. */
 	index = sctx->count & 0x3f;
 	pad_len = (index < 56) ? (56 - index) : ((64+56) - index);
-	sha256_update(desc, padding, pad_len);
+	crypto_sha256_update(desc, padding, pad_len);
 
 	/* Append length (before padding) */
-	sha256_update(desc, (const u8 *)&bits, sizeof(bits));
+	crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits));
 
 	/* Store state in digest */
 	for (i = 0; i < 8; i++)
@@ -339,7 +340,7 @@ static int sha256_import(struct shash_desc *desc, const void *in)
 static struct shash_alg sha256_algs[2] = { {
 	.digestsize	=	SHA256_DIGEST_SIZE,
 	.init		=	sha256_init,
-	.update		=	sha256_update,
+	.update		=	crypto_sha256_update,
 	.final		=	sha256_final,
 	.export		=	sha256_export,
 	.import		=	sha256_import,
@@ -355,7 +356,7 @@ static struct shash_alg sha256_algs[2] = { {
 }, {
 	.digestsize	=	SHA224_DIGEST_SIZE,
 	.init		=	sha224_init,
-	.update		=	sha256_update,
+	.update		=	crypto_sha256_update,
 	.final		=	sha224_final,
 	.descsize	=	sizeof(struct sha256_state),
 	.base		=	{

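The rename-and-export above is what lets architecture-specific SHA-2 glue reuse the generic C implementation for buffering, small inputs, and contexts where the FPU is off limits. A sketch of how a hypothetical x86 glue update function would lean on the export (sha256_ssse3_update and the transform step are illustrative names, not code quoted from the tree):

	static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
				       unsigned int len)
	{
		struct sha256_state *sctx = shash_desc_ctx(desc);

		/* Defer to the exported generic code when the FPU cannot be
		 * used or the input will not complete a 64-byte block. */
		if (!irq_fpu_usable() ||
		    (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE)
			return crypto_sha256_update(desc, data, len);

		kernel_fpu_begin();
		/* ... feed whole blocks to the SSSE3 block transform ... */
		kernel_fpu_end();

		return 0;
	}
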
+ 7 - 6
crypto/sha512_generic.c

@@ -163,8 +163,8 @@ sha384_init(struct shash_desc *desc)
 	return 0;
 }
 
-static int
-sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len)
+int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
+			unsigned int len)
 {
 	struct sha512_state *sctx = shash_desc_ctx(desc);
 
@@ -197,6 +197,7 @@ sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len)
 
 	return 0;
 }
+EXPORT_SYMBOL(crypto_sha512_update);
 
 static int
 sha512_final(struct shash_desc *desc, u8 *hash)
@@ -215,10 +216,10 @@ sha512_final(struct shash_desc *desc, u8 *hash)
 	/* Pad out to 112 mod 128. */
 	index = sctx->count[0] & 0x7f;
 	pad_len = (index < 112) ? (112 - index) : ((128+112) - index);
-	sha512_update(desc, padding, pad_len);
+	crypto_sha512_update(desc, padding, pad_len);
 
 	/* Append length (before padding) */
-	sha512_update(desc, (const u8 *)bits, sizeof(bits));
+	crypto_sha512_update(desc, (const u8 *)bits, sizeof(bits));
 
 	/* Store state in digest */
 	for (i = 0; i < 8; i++)
@@ -245,7 +246,7 @@ static int sha384_final(struct shash_desc *desc, u8 *hash)
 static struct shash_alg sha512_algs[2] = { {
 	.digestsize	=	SHA512_DIGEST_SIZE,
 	.init		=	sha512_init,
-	.update		=	sha512_update,
+	.update		=	crypto_sha512_update,
 	.final		=	sha512_final,
 	.descsize	=	sizeof(struct sha512_state),
 	.base		=	{
@@ -257,7 +258,7 @@ static struct shash_alg sha512_algs[2] = { {
 }, {
 	.digestsize	=	SHA384_DIGEST_SIZE,
 	.init		=	sha384_init,
-	.update		=	sha512_update,
+	.update		=	crypto_sha512_update,
 	.final		=	sha384_final,
 	.descsize	=	sizeof(struct sha512_state),
 	.base		=	{

+ 29 - 1
crypto/tcrypt.c

@@ -1095,7 +1095,6 @@ static int do_test(int m)
 		break;
 
 	case 28:
-
 		ret += tcrypt_test("tgr160");
 		break;
 
@@ -1118,6 +1117,7 @@ static int do_test(int m)
 		ret += tcrypt_test("lrw(camellia)");
 		ret += tcrypt_test("xts(camellia)");
 		break;
+
 	case 33:
 		ret += tcrypt_test("sha224");
 		break;
@@ -1213,6 +1213,7 @@ static int do_test(int m)
 	case 109:
 		ret += tcrypt_test("vmac(aes)");
 		break;
+
 	case 110:
 		ret += tcrypt_test("hmac(crc32)");
 		break;
@@ -1225,6 +1226,18 @@ static int do_test(int m)
 		ret += tcrypt_test("rfc4106(gcm(aes))");
 		break;
 
+	case 152:
+		ret += tcrypt_test("rfc4543(gcm(aes))");
+		break;
+
+	case 153:
+		ret += tcrypt_test("cmac(aes)");
+		break;
+
+	case 154:
+		ret += tcrypt_test("cmac(des3_ede)");
+		break;
+
 	case 200:
 		test_cipher_speed("ecb(aes)", ENCRYPT, sec, NULL, 0,
 				speed_template_16_24_32);
@@ -1755,6 +1768,21 @@ static int do_test(int m)
 				   speed_template_32_64);
 		break;
 
+	case 509:
+		test_acipher_speed("ecb(blowfish)", ENCRYPT, sec, NULL, 0,
+				   speed_template_8_32);
+		test_acipher_speed("ecb(blowfish)", DECRYPT, sec, NULL, 0,
+				   speed_template_8_32);
+		test_acipher_speed("cbc(blowfish)", ENCRYPT, sec, NULL, 0,
+				   speed_template_8_32);
+		test_acipher_speed("cbc(blowfish)", DECRYPT, sec, NULL, 0,
+				   speed_template_8_32);
+		test_acipher_speed("ctr(blowfish)", ENCRYPT, sec, NULL, 0,
+				   speed_template_8_32);
+		test_acipher_speed("ctr(blowfish)", DECRYPT, sec, NULL, 0,
+				   speed_template_8_32);
+		break;
+
 	case 1000:
 		test_available();
 		break;

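Usage note (not part of the diff): tcrypt is driven by its mode= module parameter, so the new cases are run with e.g. "modprobe tcrypt mode=153" for cmac(aes), mode=154 for cmac(des3_ede), mode=152 for rfc4543(gcm(aes)), and "modprobe tcrypt sec=1 mode=509" for the timed async blowfish measurements; the module deliberately fails to stay loaded once the tests complete.
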
+ 93 - 2
crypto/testmgr.c

@@ -1644,19 +1644,31 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "__cbc-serpent-avx",
 		.test = alg_test_null,
+	}, {
+		.alg = "__cbc-serpent-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__cbc-serpent-sse2",
 		.test = alg_test_null,
 	}, {
 		.alg = "__cbc-twofish-avx",
 		.test = alg_test_null,
+	}, {
+		.alg = "__cbc-twofish-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-cbc-aes-aesni",
 		.test = alg_test_null,
 		.fips_allowed = 1,
+	}, {
+		.alg = "__driver-cbc-blowfish-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-cbc-camellia-aesni",
 		.test = alg_test_null,
+	}, {
+		.alg = "__driver-cbc-camellia-aesni-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-cbc-cast5-avx",
 		.test = alg_test_null,
@@ -1666,19 +1678,31 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "__driver-cbc-serpent-avx",
 		.test = alg_test_null,
+	}, {
+		.alg = "__driver-cbc-serpent-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-cbc-serpent-sse2",
 		.test = alg_test_null,
 	}, {
 		.alg = "__driver-cbc-twofish-avx",
 		.test = alg_test_null,
+	}, {
+		.alg = "__driver-cbc-twofish-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-ecb-aes-aesni",
 		.test = alg_test_null,
 		.fips_allowed = 1,
+	}, {
+		.alg = "__driver-ecb-blowfish-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-ecb-camellia-aesni",
 		.test = alg_test_null,
+	}, {
+		.alg = "__driver-ecb-camellia-aesni-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-ecb-cast5-avx",
 		.test = alg_test_null,
@@ -1688,12 +1712,18 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "__driver-ecb-serpent-avx",
 		.test = alg_test_null,
+	}, {
+		.alg = "__driver-ecb-serpent-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__driver-ecb-serpent-sse2",
 		.test = alg_test_null,
 	}, {
 		.alg = "__driver-ecb-twofish-avx",
 		.test = alg_test_null,
+	}, {
+		.alg = "__driver-ecb-twofish-avx2",
+		.test = alg_test_null,
 	}, {
 		.alg = "__ghash-pclmulqdqni",
 		.test = alg_test_null,
@@ -1912,6 +1942,27 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "cmac(aes)",
+		.test = alg_test_hash,
+		.suite = {
+			.hash = {
+				.vecs = aes_cmac128_tv_template,
+				.count = CMAC_AES_TEST_VECTORS
+			}
+		}
+	}, {
+		.alg = "cmac(des3_ede)",
+		.test = alg_test_hash,
+		.suite = {
+			.hash = {
+				.vecs = des3_ede_cmac64_tv_template,
+				.count = CMAC_DES3_EDE_TEST_VECTORS
+			}
+		}
+	}, {
+		.alg = "compress_null",
+		.test = alg_test_null,
 	}, {
 		.alg = "crc32c",
 		.test = alg_test_crc32c,
@@ -1926,16 +1977,31 @@ static const struct alg_test_desc alg_test_descs[] = {
 		.alg = "cryptd(__driver-cbc-aes-aesni)",
 		.test = alg_test_null,
 		.fips_allowed = 1,
+	}, {
+		.alg = "cryptd(__driver-cbc-blowfish-avx2)",
+		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-cbc-camellia-aesni)",
 		.test = alg_test_null,
+	}, {
+		.alg = "cryptd(__driver-cbc-camellia-aesni-avx2)",
+		.test = alg_test_null,
+	}, {
+		.alg = "cryptd(__driver-cbc-serpent-avx2)",
+		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-ecb-aes-aesni)",
 		.test = alg_test_null,
 		.fips_allowed = 1,
+	}, {
+		.alg = "cryptd(__driver-ecb-blowfish-avx2)",
+		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-ecb-camellia-aesni)",
 		.test = alg_test_null,
+	}, {
+		.alg = "cryptd(__driver-ecb-camellia-aesni-avx2)",
+		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-ecb-cast5-avx)",
 		.test = alg_test_null,
@@ -1945,12 +2011,18 @@ static const struct alg_test_desc alg_test_descs[] = {
 	}, {
 		.alg = "cryptd(__driver-ecb-serpent-avx)",
 		.test = alg_test_null,
+	}, {
+		.alg = "cryptd(__driver-ecb-serpent-avx2)",
+		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-ecb-serpent-sse2)",
 		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-ecb-twofish-avx)",
 		.test = alg_test_null,
+	}, {
+		.alg = "cryptd(__driver-ecb-twofish-avx2)",
+		.test = alg_test_null,
 	}, {
 		.alg = "cryptd(__driver-gcm-aes-aesni)",
 		.test = alg_test_null,
@@ -2126,6 +2198,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "digest_null",
+		.test = alg_test_null,
 	}, {
 		.alg = "ecb(__aes-aesni)",
 		.test = alg_test_null,
@@ -2236,6 +2311,9 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "ecb(cipher_null)",
+		.test = alg_test_null,
 	}, {
 		.alg = "ecb(des)",
 		.test = alg_test_skcipher,
@@ -2696,8 +2774,6 @@ static const struct alg_test_desc alg_test_descs[] = {
 			}
 		}
 	}, {
-
-
 		.alg = "rfc4309(ccm(aes))",
 		.test = alg_test_aead,
 		.fips_allowed = 1,
@@ -2713,6 +2789,21 @@ static const struct alg_test_desc alg_test_descs[] = {
 				}
 			}
 		}
+	}, {
+		.alg = "rfc4543(gcm(aes))",
+		.test = alg_test_aead,
+		.suite = {
+			.aead = {
+				.enc = {
+					.vecs = aes_gcm_rfc4543_enc_tv_template,
+					.count = AES_GCM_4543_ENC_TEST_VECTORS
+				},
+				.dec = {
+					.vecs = aes_gcm_rfc4543_dec_tv_template,
+					.count = AES_GCM_4543_DEC_TEST_VECTORS
+				},
+			}
+		}
 	}, {
 		.alg = "rmd128",
 		.test = alg_test_hash,

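Background for the cmac vectors added to crypto/testmgr.h below: they are the NIST SP 800-38B examples, and the operation at the heart of CMAC is deriving the subkeys K1 and K2 by doubling E_K(0) in GF(2^blocksize). A self-contained illustration of that doubling for the 128-bit AES block (demonstration code, not the kernel's cmac implementation):

	#include <stdint.h>

	/*
	 * Multiply a 16-byte value by x in GF(2^128): shift left one bit and,
	 * if a bit fell off the top, fold in the reduction constant 0x87.
	 * (For the 64-bit des3_ede block size the constant is 0x1b instead.)
	 */
	static void gf128_double(uint8_t out[16], const uint8_t in[16])
	{
		int carry = in[0] & 0x80;
		int i;

		for (i = 0; i < 15; i++)
			out[i] = (uint8_t)((in[i] << 1) | (in[i + 1] >> 7));
		out[15] = (uint8_t)(in[15] << 1);
		if (carry)
			out[15] ^= 0x87;
	}

	/* CMAC uses K1 = gf128_double(AES_K(0)) for a final complete block and
	 * K2 = gf128_double(K1) for a final padded block; the .psize = 0
	 * vectors below exercise exactly that empty-message padding path. */
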
+ 1276 - 38
crypto/testmgr.h

@@ -1639,6 +1639,131 @@ static struct hash_testvec hmac_sha256_tv_template[] = {
 	},
 };
 
+#define CMAC_AES_TEST_VECTORS 6
+
+static struct hash_testvec aes_cmac128_tv_template[] = {
+	{ /* From NIST Special Publication 800-38B, AES-128 */
+		.key		= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+				  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.plaintext	= zeroed_string,
+		.digest		= "\xbb\x1d\x69\x29\xe9\x59\x37\x28"
+				  "\x7f\xa3\x7d\x12\x9b\x75\x67\x46",
+		.psize		= 0,
+		.ksize		= 16,
+	}, {
+		.key		= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+				  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a",
+		.digest		= "\x07\x0a\x16\xb4\x6b\x4d\x41\x44"
+				  "\xf7\x9b\xdd\x9d\xd0\x4a\x28\x7c",
+		.psize		= 16,
+		.ksize		= 16,
+	}, {
+		.key		= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+				  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+				  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+				  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+				  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11",
+		.digest		= "\xdf\xa6\x67\x47\xde\x9a\xe6\x30"
+				  "\x30\xca\x32\x61\x14\x97\xc8\x27",
+		.psize		= 40,
+		.ksize		= 16,
+	}, {
+		.key		= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+				  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+				  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+				  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+				  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+				  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+				  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+				  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.digest		= "\x51\xf0\xbe\xbf\x7e\x3b\x9d\x92"
+				  "\xfc\x49\x74\x17\x79\x36\x3c\xfe",
+		.psize		= 64,
+		.ksize		= 16,
+	}, { /* From NIST Special Publication 800-38B, AES-256 */
+		.key		= "\x60\x3d\xeb\x10\x15\xca\x71\xbe"
+				  "\x2b\x73\xae\xf0\x85\x7d\x77\x81"
+				  "\x1f\x35\x2c\x07\x3b\x61\x08\xd7"
+				  "\x2d\x98\x10\xa3\x09\x14\xdf\xf4",
+		.plaintext	= zeroed_string,
+		.digest		= "\x02\x89\x62\xf6\x1b\x7b\xf8\x9e"
+				  "\xfc\x6b\x55\x1f\x46\x67\xd9\x83",
+		.psize		= 0,
+		.ksize		= 32,
+	}, {
+		.key		= "\x60\x3d\xeb\x10\x15\xca\x71\xbe"
+				  "\x2b\x73\xae\xf0\x85\x7d\x77\x81"
+				  "\x1f\x35\x2c\x07\x3b\x61\x08\xd7"
+				  "\x2d\x98\x10\xa3\x09\x14\xdf\xf4",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+				  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+				  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+				  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+				  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+				  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+				  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.digest		= "\xe1\x99\x21\x90\x54\x9f\x6e\xd5"
+				  "\x69\x6a\x2c\x05\x6c\x31\x54\x10",
+		.psize		= 64,
+		.ksize		= 32,
+	}
+};
+
+#define CMAC_DES3_EDE_TEST_VECTORS 4
+
+static struct hash_testvec des3_ede_cmac64_tv_template[] = {
+/*
+ * From NIST Special Publication 800-38B, Three Key TDEA
+ * Corrected test vectors from:
+ *  http://csrc.nist.gov/publications/nistpubs/800-38B/Updated_CMAC_Examples.pdf
+ */
+	{
+		.key		= "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62"
+				  "\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58"
+				  "\xbc\x31\x3d\x4a\x37\x1c\xa8\xb5",
+		.plaintext	= zeroed_string,
+		.digest		= "\xb7\xa6\x88\xe1\x22\xff\xaf\x95",
+		.psize		= 0,
+		.ksize		= 24,
+	}, {
+		.key		= "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62"
+				  "\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58"
+				  "\xbc\x31\x3d\x4a\x37\x1c\xa8\xb5",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96",
+		.digest		= "\x8e\x8f\x29\x31\x36\x28\x37\x97",
+		.psize		= 8,
+		.ksize		= 24,
+	}, {
+		.key		= "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62"
+				  "\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58"
+				  "\xbc\x31\x3d\x4a\x37\x1c\xa8\xb5",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+				  "\xae\x2d\x8a\x57",
+		.digest		= "\x74\x3d\xdb\xe0\xce\x2d\xc2\xed",
+		.psize		= 20,
+		.ksize		= 24,
+	}, {
+		.key		= "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62"
+				  "\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58"
+				  "\xbc\x31\x3d\x4a\x37\x1c\xa8\xb5",
+		.plaintext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+				  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+				  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+				  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51",
+		.digest		= "\x33\xe6\xb1\x09\x24\x00\xea\xe5",
+		.psize		= 32,
+		.ksize		= 24,
+	}
+};
+
 #define XCBC_AES_TEST_VECTORS 6
 
 static struct hash_testvec aes_xcbc128_tv_template[] = {
@@ -12680,6 +12805,8 @@ static struct cipher_testvec cast6_xts_dec_tv_template[] = {
 #define AES_GCM_DEC_TEST_VECTORS 8
 #define AES_GCM_4106_ENC_TEST_VECTORS 7
 #define AES_GCM_4106_DEC_TEST_VECTORS 7
+#define AES_GCM_4543_ENC_TEST_VECTORS 1
+#define AES_GCM_4543_DEC_TEST_VECTORS 2
 #define AES_CCM_ENC_TEST_VECTORS 7
 #define AES_CCM_DEC_TEST_VECTORS 7
 #define AES_CCM_4309_ENC_TEST_VECTORS 7
@@ -18193,6 +18320,93 @@ static struct aead_testvec aes_gcm_rfc4106_dec_tv_template[] = {
 	}
 };
 
+static struct aead_testvec aes_gcm_rfc4543_enc_tv_template[] = {
+	{ /* From draft-mcgrew-gcm-test-01 */
+		.key	= "\x4c\x80\xcd\xef\xbb\x5d\x10\xda"
+			  "\x90\x6a\xc7\x3c\x36\x13\xa6\x34"
+			  "\x22\x43\x3c\x64",
+		.klen	= 20,
+		.iv	= zeroed_string,
+		.assoc	= "\x00\x00\x43\x21\x00\x00\x00\x07",
+		.alen	= 8,
+		.input	= "\x45\x00\x00\x30\xda\x3a\x00\x00"
+			  "\x80\x01\xdf\x3b\xc0\xa8\x00\x05"
+			  "\xc0\xa8\x00\x01\x08\x00\xc6\xcd"
+			  "\x02\x00\x07\x00\x61\x62\x63\x64"
+			  "\x65\x66\x67\x68\x69\x6a\x6b\x6c"
+			  "\x6d\x6e\x6f\x70\x71\x72\x73\x74"
+			  "\x01\x02\x02\x01",
+		.ilen	= 52,
+		.result	= "\x45\x00\x00\x30\xda\x3a\x00\x00"
+			  "\x80\x01\xdf\x3b\xc0\xa8\x00\x05"
+			  "\xc0\xa8\x00\x01\x08\x00\xc6\xcd"
+			  "\x02\x00\x07\x00\x61\x62\x63\x64"
+			  "\x65\x66\x67\x68\x69\x6a\x6b\x6c"
+			  "\x6d\x6e\x6f\x70\x71\x72\x73\x74"
+			  "\x01\x02\x02\x01\xf2\xa9\xa8\x36"
+			  "\xe1\x55\x10\x6a\xa8\xdc\xd6\x18"
+			  "\xe4\x09\x9a\xaa",
+		.rlen	= 68,
+	}
+};
+
+static struct aead_testvec aes_gcm_rfc4543_dec_tv_template[] = {
+	{ /* From draft-mcgrew-gcm-test-01 */
+		.key	= "\x4c\x80\xcd\xef\xbb\x5d\x10\xda"
+			  "\x90\x6a\xc7\x3c\x36\x13\xa6\x34"
+			  "\x22\x43\x3c\x64",
+		.klen	= 20,
+		.iv	= zeroed_string,
+		.assoc	= "\x00\x00\x43\x21\x00\x00\x00\x07",
+		.alen	= 8,
+		.input	= "\x45\x00\x00\x30\xda\x3a\x00\x00"
+			  "\x80\x01\xdf\x3b\xc0\xa8\x00\x05"
+			  "\xc0\xa8\x00\x01\x08\x00\xc6\xcd"
+			  "\x02\x00\x07\x00\x61\x62\x63\x64"
+			  "\x65\x66\x67\x68\x69\x6a\x6b\x6c"
+			  "\x6d\x6e\x6f\x70\x71\x72\x73\x74"
+			  "\x01\x02\x02\x01\xf2\xa9\xa8\x36"
+			  "\xe1\x55\x10\x6a\xa8\xdc\xd6\x18"
+			  "\xe4\x09\x9a\xaa",
+		.ilen	= 68,
+		.result	= "\x45\x00\x00\x30\xda\x3a\x00\x00"
+			  "\x80\x01\xdf\x3b\xc0\xa8\x00\x05"
+			  "\xc0\xa8\x00\x01\x08\x00\xc6\xcd"
+			  "\x02\x00\x07\x00\x61\x62\x63\x64"
+			  "\x65\x66\x67\x68\x69\x6a\x6b\x6c"
+			  "\x6d\x6e\x6f\x70\x71\x72\x73\x74"
+			  "\x01\x02\x02\x01",
+		.rlen	= 52,
+	}, { /* nearly same as previous, but should fail */
+		.key	= "\x4c\x80\xcd\xef\xbb\x5d\x10\xda"
+			  "\x90\x6a\xc7\x3c\x36\x13\xa6\x34"
+			  "\x22\x43\x3c\x64",
+		.klen	= 20,
+		.iv	= zeroed_string,
+		.assoc	= "\x00\x00\x43\x21\x00\x00\x00\x07",
+		.alen	= 8,
+		.input	= "\x45\x00\x00\x30\xda\x3a\x00\x00"
+			  "\x80\x01\xdf\x3b\xc0\xa8\x00\x05"
+			  "\xc0\xa8\x00\x01\x08\x00\xc6\xcd"
+			  "\x02\x00\x07\x00\x61\x62\x63\x64"
+			  "\x65\x66\x67\x68\x69\x6a\x6b\x6c"
+			  "\x6d\x6e\x6f\x70\x71\x72\x73\x74"
+			  "\x01\x02\x02\x01\xf2\xa9\xa8\x36"
+			  "\xe1\x55\x10\x6a\xa8\xdc\xd6\x18"
+			  "\x00\x00\x00\x00",
+		.ilen	= 68,
+		.novrfy = 1,
+		.result	= "\x45\x00\x00\x30\xda\x3a\x00\x00"
+			  "\x80\x01\xdf\x3b\xc0\xa8\x00\x05"
+			  "\xc0\xa8\x00\x01\x08\x00\xc6\xcd"
+			  "\x02\x00\x07\x00\x61\x62\x63\x64"
+			  "\x65\x66\x67\x68\x69\x6a\x6b\x6c"
+			  "\x6d\x6e\x6f\x70\x71\x72\x73\x74"
+			  "\x01\x02\x02\x01",
+		.rlen	= 52,
+	},
+};
+
 static struct aead_testvec aes_ccm_enc_tv_template[] = {
 static struct aead_testvec aes_ccm_enc_tv_template[] = {
 	{ /* From RFC 3610 */
 		.key	= "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7"
 			  "\x86\x1D\xB4\x28\xBF\x56\xED\x61"
 			  "\x86\x1D\xB4\x28\xBF\x56\xED\x61"
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
-			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7",
-		.ilen	= 496,
+			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D",
+		.ilen	= 1008,
 		.result	= "\xED\xCD\xDB\xB8\x68\xCE\xBD\xEA"
 			  "\x9D\x9D\xCD\x9F\x4F\xFC\x4D\xB7"
 			  "\xA5\xFF\x6F\x43\x0F\xBA\x32\x04"
@@ -20846,11 +21124,75 @@ static struct cipher_testvec camellia_enc_tv_template[] = {
 			  "\x2C\x35\x1B\x38\x85\x7D\xE8\xF3"
 			  "\x87\x4F\xDA\xD8\x5F\xFC\xB6\x44"
 			  "\xD0\xE3\x9B\x8B\xBF\xD6\xB8\xC4"
-			  "\x73\xAE\x1D\x8B\x5B\x74\x8B\xCB",
-		.rlen	= 496,
+			  "\x73\xAE\x1D\x8B\x5B\x74\x8B\xCB"
+			  "\xA4\xAD\xCF\x5D\xD4\x58\xC9\xCD"
+			  "\xF7\x90\x68\xCF\xC9\x11\x52\x3E"
+			  "\xE8\xA1\xA3\x78\x8B\xD0\xAC\x0A"
+			  "\xD4\xC9\xA3\xA5\x55\x30\xC8\x3E"
+			  "\xED\x28\x39\xE9\x63\xED\x41\x70"
+			  "\x51\xE3\xC4\xA0\xFC\xD5\x43\xCB"
+			  "\x4D\x65\xC8\xFD\x3A\x91\x8F\x60"
+			  "\x8A\xA6\x6D\x9D\x3E\x01\x23\x4B"
+			  "\x50\x47\xC9\xDC\x9B\xDE\x37\xC5"
+			  "\xBF\x67\xB1\x6B\x78\x38\xD5\x7E"
+			  "\xB6\xFF\x67\x83\x3B\x6E\xBE\x23"
+			  "\x45\xFA\x1D\x69\x44\xFD\xC6\xB9"
+			  "\xD0\x4A\x92\xD1\xBE\xF6\x4A\xB7"
+			  "\xCA\xA8\xA2\x9E\x13\x87\x57\x92"
+			  "\x64\x7C\x85\x0B\xB3\x29\x37\xD8"
+			  "\xE6\xAA\xAF\xC4\x03\x67\xA3\xBF"
+			  "\x2E\x45\x83\xB6\xD8\x54\x00\x89"
+			  "\xF6\xBC\x3A\x7A\x88\x58\x51\xED"
+			  "\xF4\x4E\x01\xA5\xC3\x2E\xD9\x42"
+			  "\xBD\x6E\x0D\x0B\x21\xB0\x1A\xCC"
+			  "\xA4\xD3\x3F\xDC\x9B\x81\xD8\xF1"
+			  "\xEA\x7A\x6A\xB7\x07\xC9\x6D\x91"
+			  "\x6D\x3A\xF5\x5F\xA6\xFF\x87\x1E"
+			  "\x3F\xDD\xC0\x72\xEA\xAC\x08\x15"
+			  "\x21\xE6\xC6\xB6\x0D\xD8\x51\x86"
+			  "\x2A\x03\x73\xF7\x29\xD4\xC4\xE4"
+			  "\x7F\x95\x10\xF7\xAB\x3F\x92\x23"
+			  "\xD3\xCE\x9C\x2E\x46\x3B\x63\x43"
+			  "\xBB\xC2\x82\x7A\x83\xD5\x55\xE2"
+			  "\xE7\x9B\x2F\x92\xAF\xFD\x81\x56"
+			  "\x79\xFD\x3E\xF9\x46\xE0\x25\xD4"
+			  "\x38\xDE\xBC\x2C\xC4\x7A\x2A\x8F"
+			  "\x94\x4F\xD0\xAD\x9B\x37\x18\xD4"
+			  "\x0E\x4D\x0F\x02\x3A\xDC\x5A\xA2"
+			  "\x39\x25\x55\x20\x5A\xA6\x02\x9F"
+			  "\xE6\x77\x21\x77\xE5\x4B\x7B\x0B"
+			  "\x30\xF8\x5F\x33\x0F\x49\xCD\xFF"
+			  "\xF2\xE4\x35\xF9\xF0\x63\xC3\x7E"
+			  "\xF1\xA6\x73\xB4\xDF\xE7\xBB\x78"
+			  "\xFF\x21\xA9\xF3\xF3\xCF\x5D\xBA"
+			  "\xED\x87\x98\xAC\xFE\x48\x97\x6D"
+			  "\xA6\x7F\x69\x31\xB1\xC4\xFF\x14"
+			  "\xC6\x76\xD4\x10\xDD\xF6\x49\x2C"
+			  "\x9C\xC8\x6D\x76\xC0\x8F\x5F\x55"
+			  "\x2F\x3C\x8A\x30\xAA\xC3\x16\x55"
+			  "\xC6\xFC\x8D\x8B\xB9\xE5\x80\x6C"
+			  "\xC8\x7E\xBD\x65\x58\x36\xD5\xBC"
+			  "\xF0\x33\x52\x29\x70\xF9\x5C\xE9"
+			  "\xAC\x1F\xB5\x73\x56\x66\x54\xAF"
+			  "\x1B\x8F\x7D\xED\xAB\x03\xCE\xE3"
+			  "\xAE\x47\xB6\x69\x86\xE9\x01\x31"
+			  "\x83\x18\x3D\xF4\x74\x7B\xF9\x42"
+			  "\x4C\xFD\x75\x4A\x6D\xF0\x03\xA6"
+			  "\x2B\x20\x63\xDA\x49\x65\x5E\x8B"
+			  "\xC0\x19\xE3\x8D\xD9\xF3\xB0\x34"
+			  "\xD3\x52\xFC\x68\x00\x43\x1B\x37"
+			  "\x31\x93\x51\x1C\x63\x97\x70\xB0"
+			  "\x99\x78\x83\x13\xFD\xCF\x53\x81"
+			  "\x36\x46\xB5\x42\x52\x2F\x32\xEB"
+			  "\x4A\x3D\xF1\x8F\x1C\x54\x2E\xFC"
+			  "\x41\x75\x5A\x8C\x8E\x6F\xE7\x1A"
+			  "\xAE\xEF\x3E\x82\x12\x0B\x74\x72"
+			  "\xF8\xB2\xAA\x7A\xD6\xFF\xFA\x55"
+			  "\x33\x1A\xBB\xD3\xA2\x7E\x97\x66",
+		.rlen	= 1008,
 		.also_non_np = 1,
 		.np	= 2,
-		.tap	= { 496 - 16, 16 },
+		.tap	= { 1008 - 16, 16 },
 	},
 };
 
@@ -20955,8 +21297,72 @@ static struct cipher_testvec camellia_dec_tv_template[] = {
 			  "\x2C\x35\x1B\x38\x85\x7D\xE8\xF3"
 			  "\x87\x4F\xDA\xD8\x5F\xFC\xB6\x44"
 			  "\xD0\xE3\x9B\x8B\xBF\xD6\xB8\xC4"
-			  "\x73\xAE\x1D\x8B\x5B\x74\x8B\xCB",
-		.ilen	= 496,
+			  "\x73\xAE\x1D\x8B\x5B\x74\x8B\xCB"
+			  "\xA4\xAD\xCF\x5D\xD4\x58\xC9\xCD"
+			  "\xF7\x90\x68\xCF\xC9\x11\x52\x3E"
+			  "\xE8\xA1\xA3\x78\x8B\xD0\xAC\x0A"
+			  "\xD4\xC9\xA3\xA5\x55\x30\xC8\x3E"
+			  "\xED\x28\x39\xE9\x63\xED\x41\x70"
+			  "\x51\xE3\xC4\xA0\xFC\xD5\x43\xCB"
+			  "\x4D\x65\xC8\xFD\x3A\x91\x8F\x60"
+			  "\x8A\xA6\x6D\x9D\x3E\x01\x23\x4B"
+			  "\x50\x47\xC9\xDC\x9B\xDE\x37\xC5"
+			  "\xBF\x67\xB1\x6B\x78\x38\xD5\x7E"
+			  "\xB6\xFF\x67\x83\x3B\x6E\xBE\x23"
+			  "\x45\xFA\x1D\x69\x44\xFD\xC6\xB9"
+			  "\xD0\x4A\x92\xD1\xBE\xF6\x4A\xB7"
+			  "\xCA\xA8\xA2\x9E\x13\x87\x57\x92"
+			  "\x64\x7C\x85\x0B\xB3\x29\x37\xD8"
+			  "\xE6\xAA\xAF\xC4\x03\x67\xA3\xBF"
+			  "\x2E\x45\x83\xB6\xD8\x54\x00\x89"
+			  "\xF6\xBC\x3A\x7A\x88\x58\x51\xED"
+			  "\xF4\x4E\x01\xA5\xC3\x2E\xD9\x42"
+			  "\xBD\x6E\x0D\x0B\x21\xB0\x1A\xCC"
+			  "\xA4\xD3\x3F\xDC\x9B\x81\xD8\xF1"
+			  "\xEA\x7A\x6A\xB7\x07\xC9\x6D\x91"
+			  "\x6D\x3A\xF5\x5F\xA6\xFF\x87\x1E"
+			  "\x3F\xDD\xC0\x72\xEA\xAC\x08\x15"
+			  "\x21\xE6\xC6\xB6\x0D\xD8\x51\x86"
+			  "\x2A\x03\x73\xF7\x29\xD4\xC4\xE4"
+			  "\x7F\x95\x10\xF7\xAB\x3F\x92\x23"
+			  "\xD3\xCE\x9C\x2E\x46\x3B\x63\x43"
+			  "\xBB\xC2\x82\x7A\x83\xD5\x55\xE2"
+			  "\xE7\x9B\x2F\x92\xAF\xFD\x81\x56"
+			  "\x79\xFD\x3E\xF9\x46\xE0\x25\xD4"
+			  "\x38\xDE\xBC\x2C\xC4\x7A\x2A\x8F"
+			  "\x94\x4F\xD0\xAD\x9B\x37\x18\xD4"
+			  "\x0E\x4D\x0F\x02\x3A\xDC\x5A\xA2"
+			  "\x39\x25\x55\x20\x5A\xA6\x02\x9F"
+			  "\xE6\x77\x21\x77\xE5\x4B\x7B\x0B"
+			  "\x30\xF8\x5F\x33\x0F\x49\xCD\xFF"
+			  "\xF2\xE4\x35\xF9\xF0\x63\xC3\x7E"
+			  "\xF1\xA6\x73\xB4\xDF\xE7\xBB\x78"
+			  "\xFF\x21\xA9\xF3\xF3\xCF\x5D\xBA"
+			  "\xED\x87\x98\xAC\xFE\x48\x97\x6D"
+			  "\xA6\x7F\x69\x31\xB1\xC4\xFF\x14"
+			  "\xC6\x76\xD4\x10\xDD\xF6\x49\x2C"
+			  "\x9C\xC8\x6D\x76\xC0\x8F\x5F\x55"
+			  "\x2F\x3C\x8A\x30\xAA\xC3\x16\x55"
+			  "\xC6\xFC\x8D\x8B\xB9\xE5\x80\x6C"
+			  "\xC8\x7E\xBD\x65\x58\x36\xD5\xBC"
+			  "\xF0\x33\x52\x29\x70\xF9\x5C\xE9"
+			  "\xAC\x1F\xB5\x73\x56\x66\x54\xAF"
+			  "\x1B\x8F\x7D\xED\xAB\x03\xCE\xE3"
+			  "\xAE\x47\xB6\x69\x86\xE9\x01\x31"
+			  "\x83\x18\x3D\xF4\x74\x7B\xF9\x42"
+			  "\x4C\xFD\x75\x4A\x6D\xF0\x03\xA6"
+			  "\x2B\x20\x63\xDA\x49\x65\x5E\x8B"
+			  "\xC0\x19\xE3\x8D\xD9\xF3\xB0\x34"
+			  "\xD3\x52\xFC\x68\x00\x43\x1B\x37"
+			  "\x31\x93\x51\x1C\x63\x97\x70\xB0"
+			  "\x99\x78\x83\x13\xFD\xCF\x53\x81"
+			  "\x36\x46\xB5\x42\x52\x2F\x32\xEB"
+			  "\x4A\x3D\xF1\x8F\x1C\x54\x2E\xFC"
+			  "\x41\x75\x5A\x8C\x8E\x6F\xE7\x1A"
+			  "\xAE\xEF\x3E\x82\x12\x0B\x74\x72"
+			  "\xF8\xB2\xAA\x7A\xD6\xFF\xFA\x55"
+			  "\x33\x1A\xBB\xD3\xA2\x7E\x97\x66",
+		.ilen	= 1008,
 		.result	= "\x56\xED\x84\x1B\x8F\x26\xBD\x31"
 			  "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3"
 			  "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15"
@@ -21018,11 +21424,75 @@ static struct cipher_testvec camellia_dec_tv_template[] = {
 			  "\x86\x1D\xB4\x28\xBF\x56\xED\x61"
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
-			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7",
-		.rlen	= 496,
+			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D",
+		.rlen	= 1008,
 		.also_non_np = 1,
 		.np	= 2,
-		.tap	= { 496 - 16, 16 },
+		.tap	= { 1008 - 16, 16 },
 	},
 };
 
@@ -21123,8 +21593,72 @@ static struct cipher_testvec camellia_cbc_enc_tv_template[] = {
 			  "\x86\x1D\xB4\x28\xBF\x56\xED\x61"
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
-			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7",
-		.ilen	= 496,
+			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D",
+		.ilen	= 1008,
 		.result	= "\xCD\x3E\x2A\x3B\x3E\x94\xC5\x77"
 			  "\xBA\xBB\x5B\xB1\xDE\x7B\xA4\x40"
 			  "\x88\x39\xE3\xFD\x94\x4B\x25\x58"
@@ -21186,11 +21720,75 @@ static struct cipher_testvec camellia_cbc_enc_tv_template[] = {
 			  "\x2D\x1A\x68\xFE\xEC\x92\x94\xDA"
 			  "\x94\x2A\x6F\xD6\xFE\xE5\x76\x97"
 			  "\xF4\x6E\xEE\xCB\x2B\x95\x4E\x36"
-			  "\x5F\x74\x8C\x86\x5B\x71\xD0\x20",
-		.rlen	= 496,
+			  "\x5F\x74\x8C\x86\x5B\x71\xD0\x20"
+			  "\x78\x1A\x7F\x18\x8C\xD9\xCD\xF5"
+			  "\x21\x41\x56\x72\x13\xE1\x86\x07"
+			  "\x07\x26\xF3\x4F\x7B\xEA\xB5\x18"
+			  "\xFE\x94\x2D\x9F\xE0\x72\x18\x65"
+			  "\xB2\xA5\x63\x48\xB4\x13\x22\xF7"
+			  "\x25\xF1\x80\xA8\x7F\x54\x86\x7B"
+			  "\x39\xAE\x95\x0C\x09\x32\x22\x2D"
+			  "\x4D\x73\x39\x0C\x09\x2C\x7C\x10"
+			  "\xD0\x4B\x53\xF6\x90\xC5\x99\x2F"
+			  "\x15\xE1\x7F\xC6\xC5\x7A\x52\x14"
+			  "\x65\xEE\x93\x54\xD0\x66\x15\x3C"
+			  "\x4C\x68\xFD\x64\x0F\xF9\x10\x39"
+			  "\x46\x7A\xDD\x97\x20\xEE\xC7\xD2"
+			  "\x98\x4A\xB6\xE6\xF5\xA8\x1F\x4F"
+			  "\xDB\xAB\x6D\xD5\x9B\x34\x16\x97"
+			  "\x2F\x64\xE5\x37\xEF\x0E\xA1\xE9"
+			  "\xBE\x31\x31\x96\x8B\x40\x18\x75"
+			  "\x11\x75\x14\x32\xA5\x2D\x1B\x6B"
+			  "\xDB\x59\xEB\xFA\x3D\x8E\x7C\xC4"
+			  "\xDE\x68\xC8\x9F\xC9\x99\xE3\xC6"
+			  "\x71\xB0\x12\x57\x89\x0D\xC0\x2B"
+			  "\x9F\x12\x6A\x04\x67\xF1\x95\x31"
+			  "\x59\xFD\x84\x95\x2C\x9C\x5B\xEC"
+			  "\x09\xB0\x43\x96\x4A\x64\x80\x40"
+			  "\xB9\x72\x19\xDD\x70\x42\xFA\xB1"
+			  "\x4A\x2C\x0C\x0A\x60\x6E\xE3\x7C"
+			  "\x37\x5A\xBE\xA4\x62\xCF\x29\xAB"
+			  "\x7F\x4D\xA6\xB3\xE2\xB6\x64\xC6"
+			  "\x33\x0B\xF3\xD5\x01\x38\x74\xA4"
+			  "\x67\x1E\x75\x68\xC3\xAD\x76\xE9"
+			  "\xE9\xBC\xF0\xEB\xD8\xFD\x31\x8A"
+			  "\x5F\xC9\x18\x94\x4B\x86\x66\xFC"
+			  "\xBD\x0B\x3D\xB3\x9F\xFA\x1F\xD9"
+			  "\x78\xC4\xE3\x24\x1C\x67\xA2\xF8"
+			  "\x43\xBC\x76\x75\xBF\x6C\x05\xB3"
+			  "\x32\xE8\x7C\x80\xDB\xC7\xB6\x61"
+			  "\x1A\x3E\x2B\xA7\x25\xED\x8F\xA0"
+			  "\x00\x4B\xF8\x90\xCA\xD8\xFB\x12"
+			  "\xAC\x1F\x18\xE9\xD2\x5E\xA2\x8E"
+			  "\xE4\x84\x6B\x9D\xEB\x1E\x6B\xA3"
+			  "\x7B\xDC\xCE\x15\x97\x27\xB2\x65"
+			  "\xBC\x0E\x47\xAB\x55\x13\x53\xAB"
+			  "\x0E\x34\x55\x02\x5F\x27\xC5\x89"
+			  "\xDF\xC5\x70\xC4\xDD\x76\x82\xEE"
+			  "\x68\xA6\x09\xB0\xE5\x5E\xF1\x0C"
+			  "\xE3\xF3\x09\x9B\xFE\x65\x4B\xB8"
+			  "\x30\xEC\xD5\x7C\x6A\xEC\x1D\xD2"
+			  "\x93\xB7\xA1\x1A\x02\xD4\xC0\xD6"
+			  "\x8D\x4D\x83\x9A\xED\x29\x4E\x14"
+			  "\x86\xD5\x3C\x1A\xD5\xB9\x0A\x6A"
+			  "\x72\x22\xD5\x92\x38\xF1\xA1\x86"
+			  "\xB2\x41\x51\xCA\x4E\xAB\x8F\xD3"
+			  "\x80\x56\xC3\xD7\x65\xE1\xB3\x86"
+			  "\xCB\xCE\x98\xA1\xD4\x59\x1C\x06"
+			  "\x01\xED\xF8\x29\x91\x19\x5C\x9A"
+			  "\xEE\x28\x1B\x48\xD7\x32\xEF\x9F"
+			  "\x6C\x2B\x66\x4E\x78\xD5\x8B\x72"
+			  "\x80\xE7\x29\xDC\x23\x55\x98\x54"
+			  "\xB1\xFF\x3E\x95\x56\xA8\x78\x78"
+			  "\xEF\xC4\xA5\x11\x2D\x2B\xD8\x93"
+			  "\x30\x6E\x7E\x51\xBB\x42\x5F\x03"
+			  "\x43\x94\x23\x7E\xEE\xF0\xA5\x79"
+			  "\x55\x01\xD4\x58\xB2\xF2\x85\x49"
+			  "\x70\xC5\xB9\x0B\x3B\x7A\x6E\x6C",
+		.rlen	= 1008,
 		.also_non_np = 1,
 		.np	= 2,
-		.tap	= { 496 - 16, 16 },
+		.tap	= { 1008 - 16, 16 },
 	},
 };
 
@@ -21291,8 +21889,72 @@ static struct cipher_testvec camellia_cbc_dec_tv_template[] = {
 			  "\x2D\x1A\x68\xFE\xEC\x92\x94\xDA"
 			  "\x94\x2A\x6F\xD6\xFE\xE5\x76\x97"
 			  "\xF4\x6E\xEE\xCB\x2B\x95\x4E\x36"
-			  "\x5F\x74\x8C\x86\x5B\x71\xD0\x20",
-		.ilen	= 496,
+			  "\x5F\x74\x8C\x86\x5B\x71\xD0\x20"
+			  "\x78\x1A\x7F\x18\x8C\xD9\xCD\xF5"
+			  "\x21\x41\x56\x72\x13\xE1\x86\x07"
+			  "\x07\x26\xF3\x4F\x7B\xEA\xB5\x18"
+			  "\xFE\x94\x2D\x9F\xE0\x72\x18\x65"
+			  "\xB2\xA5\x63\x48\xB4\x13\x22\xF7"
+			  "\x25\xF1\x80\xA8\x7F\x54\x86\x7B"
+			  "\x39\xAE\x95\x0C\x09\x32\x22\x2D"
+			  "\x4D\x73\x39\x0C\x09\x2C\x7C\x10"
+			  "\xD0\x4B\x53\xF6\x90\xC5\x99\x2F"
+			  "\x15\xE1\x7F\xC6\xC5\x7A\x52\x14"
+			  "\x65\xEE\x93\x54\xD0\x66\x15\x3C"
+			  "\x4C\x68\xFD\x64\x0F\xF9\x10\x39"
+			  "\x46\x7A\xDD\x97\x20\xEE\xC7\xD2"
+			  "\x98\x4A\xB6\xE6\xF5\xA8\x1F\x4F"
+			  "\xDB\xAB\x6D\xD5\x9B\x34\x16\x97"
+			  "\x2F\x64\xE5\x37\xEF\x0E\xA1\xE9"
+			  "\xBE\x31\x31\x96\x8B\x40\x18\x75"
+			  "\x11\x75\x14\x32\xA5\x2D\x1B\x6B"
+			  "\xDB\x59\xEB\xFA\x3D\x8E\x7C\xC4"
+			  "\xDE\x68\xC8\x9F\xC9\x99\xE3\xC6"
+			  "\x71\xB0\x12\x57\x89\x0D\xC0\x2B"
+			  "\x9F\x12\x6A\x04\x67\xF1\x95\x31"
+			  "\x59\xFD\x84\x95\x2C\x9C\x5B\xEC"
+			  "\x09\xB0\x43\x96\x4A\x64\x80\x40"
+			  "\xB9\x72\x19\xDD\x70\x42\xFA\xB1"
+			  "\x4A\x2C\x0C\x0A\x60\x6E\xE3\x7C"
+			  "\x37\x5A\xBE\xA4\x62\xCF\x29\xAB"
+			  "\x7F\x4D\xA6\xB3\xE2\xB6\x64\xC6"
+			  "\x33\x0B\xF3\xD5\x01\x38\x74\xA4"
+			  "\x67\x1E\x75\x68\xC3\xAD\x76\xE9"
+			  "\xE9\xBC\xF0\xEB\xD8\xFD\x31\x8A"
+			  "\x5F\xC9\x18\x94\x4B\x86\x66\xFC"
+			  "\xBD\x0B\x3D\xB3\x9F\xFA\x1F\xD9"
+			  "\x78\xC4\xE3\x24\x1C\x67\xA2\xF8"
+			  "\x43\xBC\x76\x75\xBF\x6C\x05\xB3"
+			  "\x32\xE8\x7C\x80\xDB\xC7\xB6\x61"
+			  "\x1A\x3E\x2B\xA7\x25\xED\x8F\xA0"
+			  "\x00\x4B\xF8\x90\xCA\xD8\xFB\x12"
+			  "\xAC\x1F\x18\xE9\xD2\x5E\xA2\x8E"
+			  "\xE4\x84\x6B\x9D\xEB\x1E\x6B\xA3"
+			  "\x7B\xDC\xCE\x15\x97\x27\xB2\x65"
+			  "\xBC\x0E\x47\xAB\x55\x13\x53\xAB"
+			  "\x0E\x34\x55\x02\x5F\x27\xC5\x89"
+			  "\xDF\xC5\x70\xC4\xDD\x76\x82\xEE"
+			  "\x68\xA6\x09\xB0\xE5\x5E\xF1\x0C"
+			  "\xE3\xF3\x09\x9B\xFE\x65\x4B\xB8"
+			  "\x30\xEC\xD5\x7C\x6A\xEC\x1D\xD2"
+			  "\x93\xB7\xA1\x1A\x02\xD4\xC0\xD6"
+			  "\x8D\x4D\x83\x9A\xED\x29\x4E\x14"
+			  "\x86\xD5\x3C\x1A\xD5\xB9\x0A\x6A"
+			  "\x72\x22\xD5\x92\x38\xF1\xA1\x86"
+			  "\xB2\x41\x51\xCA\x4E\xAB\x8F\xD3"
+			  "\x80\x56\xC3\xD7\x65\xE1\xB3\x86"
+			  "\xCB\xCE\x98\xA1\xD4\x59\x1C\x06"
+			  "\x01\xED\xF8\x29\x91\x19\x5C\x9A"
+			  "\xEE\x28\x1B\x48\xD7\x32\xEF\x9F"
+			  "\x6C\x2B\x66\x4E\x78\xD5\x8B\x72"
+			  "\x80\xE7\x29\xDC\x23\x55\x98\x54"
+			  "\xB1\xFF\x3E\x95\x56\xA8\x78\x78"
+			  "\xEF\xC4\xA5\x11\x2D\x2B\xD8\x93"
+			  "\x30\x6E\x7E\x51\xBB\x42\x5F\x03"
+			  "\x43\x94\x23\x7E\xEE\xF0\xA5\x79"
+			  "\x55\x01\xD4\x58\xB2\xF2\x85\x49"
+			  "\x70\xC5\xB9\x0B\x3B\x7A\x6E\x6C",
+		.ilen	= 1008,
 		.result	= "\x56\xED\x84\x1B\x8F\x26\xBD\x31"
 			  "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3"
 			  "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15"
@@ -21354,11 +22016,75 @@ static struct cipher_testvec camellia_cbc_dec_tv_template[] = {
 			  "\x86\x1D\xB4\x28\xBF\x56\xED\x61"
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
-			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7",
-		.rlen	= 496,
+			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D",
+		.rlen	= 1008,
 		.also_non_np = 1,
 		.np	= 2,
-		.tap	= { 496 - 16, 16 },
+		.tap	= { 1008 - 16, 16 },
 	},
 };
 
@@ -21567,8 +22293,72 @@ static struct cipher_testvec camellia_ctr_enc_tv_template[] = {
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
 			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
-			  "\x2B\xC2\x59",
-		.ilen	= 499,
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D"
+			  "\xE4\x7B\x12",
+		.ilen	= 1011,
 		.result	= "\xF3\x06\x3A\x84\xCD\xBA\x8E\x11"
 			  "\xB7\x74\x6F\x5C\x97\xFB\x36\xFE"
 			  "\xDE\x71\x58\xD4\x15\xD1\xC1\xA4"
@@ -21631,11 +22421,75 @@ static struct cipher_testvec camellia_ctr_enc_tv_template[] = {
 			  "\x7E\x42\xEC\xB6\x6F\x4D\x6B\x48"
 			  "\xE6\xA6\x50\x80\x78\x9E\xF1\xB0"
 			  "\x4D\xB2\x0D\x3D\xFC\x40\x25\x4D"
-			  "\x93\x11\x1C",
-		.rlen	= 499,
+			  "\x93\x11\x1C\xE9\xD2\x9F\x6E\x90"
+			  "\xE5\x41\x4A\xE2\x3C\x45\x29\x35"
+			  "\xEC\xD6\x47\x50\xCB\x7B\xA2\x32"
+			  "\xF7\x8B\x62\xF1\xE3\x9A\xFE\xC7"
+			  "\x1D\x8C\x02\x72\x68\x09\xE9\xB6"
+			  "\x4A\x80\xE6\xB1\x56\xDF\x90\xD4"
+			  "\x93\x74\xA4\xCE\x20\x23\xBF\x48"
+			  "\xA5\xDE\x1B\xFA\x40\x69\x31\x98"
+			  "\x62\x6E\xA5\xC7\xBF\x0C\x62\xE5"
+			  "\x6D\xE1\x93\xF1\x83\x10\x1C\xCA"
+			  "\xF6\x5C\x19\xF8\x90\x78\xCB\xE4"
+			  "\x0B\x3A\xB5\xF8\x43\x86\xD3\x3F"
+			  "\xBA\x83\x34\x3C\x42\xCC\x7D\x28"
+			  "\x29\x63\x4F\xD8\x02\x17\xC5\x07"
+			  "\x2C\xA4\xAC\x79\xCB\xC3\xA9\x09"
+			  "\x81\x45\x18\xED\xE4\xCB\x42\x3B"
+			  "\x87\x2D\x23\xDC\xC5\xBA\x45\xBD"
+			  "\x92\xE5\x02\x97\x96\xCE\xAD\xEC"
+			  "\xBA\xD8\x76\xF8\xCA\xC1\x31\xEC"
+			  "\x1E\x4F\x3F\x83\xF8\x33\xE8\x6E"
+			  "\xCC\xF8\x5F\xDD\x65\x50\x99\x69"
+			  "\xAF\x48\xCE\xA5\xBA\xB6\x14\x9F"
+			  "\x05\x93\xB2\xE6\x59\xC8\x28\xFE"
+			  "\x8F\x37\xF9\x64\xB9\xA5\x56\x8F"
+			  "\xF1\x1B\x90\xEF\xAE\xEB\xFC\x09"
+			  "\x11\x7A\xF2\x19\x0A\x0A\x9A\x3C"
+			  "\xE2\x5E\x29\xFA\x31\x9B\xC1\x74"
+			  "\x1E\x10\x3E\x07\xA9\x31\x6D\xF8"
+			  "\x81\xF5\xD5\x8A\x04\x23\x51\xAC"
+			  "\xA2\xE2\x63\xFD\x27\x1F\x79\x5B"
+			  "\x1F\xE8\xDA\x11\x49\x4D\x1C\xBA"
+			  "\x54\xCC\x0F\xBA\x92\x69\xE5\xCB"
+			  "\x41\x1A\x67\xA6\x40\x82\x70\x8C"
+			  "\x19\x79\x08\xA4\x51\x20\x7D\xC9"
+			  "\x12\x27\xAE\x20\x0D\x2C\xA1\x6D"
+			  "\xF4\x55\xD4\xE7\xE6\xD4\x28\x08"
+			  "\x00\x70\x12\x56\x56\x50\xAD\x14"
+			  "\x5C\x3E\xA2\xD1\x36\x3F\x36\x48"
+			  "\xED\xB1\x57\x3E\x5D\x15\xF6\x1E"
+			  "\x53\xE9\xA4\x3E\xED\x7D\xCF\x7D"
+			  "\x29\xAF\xF3\x1E\x51\xA8\x9F\x85"
+			  "\x8B\xF0\xBB\xCE\xCC\x39\xC3\x64"
+			  "\x4B\xF2\xAD\x70\x19\xD4\x44\x8F"
+			  "\x91\x76\xE8\x15\x66\x34\x9F\xF6"
+			  "\x0F\x15\xA4\xA8\x24\xF8\x58\xB1"
+			  "\x38\x46\x47\xC7\x9B\xCA\xE9\x42"
+			  "\x44\xAA\xE6\xB5\x9C\x91\xA4\xD3"
+			  "\x16\xA0\xED\x42\xBE\xB5\x06\x19"
+			  "\xBE\x67\xE8\xBC\x22\x32\xA4\x1E"
+			  "\x93\xEB\xBE\xE9\xE1\x93\xE5\x31"
+			  "\x3A\xA2\x75\xDF\xE3\x6B\xE7\xCC"
+			  "\xB4\x70\x20\xE0\x6D\x82\x7C\xC8"
+			  "\x94\x5C\x5E\x37\x18\xAD\xED\x8B"
+			  "\x44\x86\xCA\x5E\x07\xB7\x70\x8D"
+			  "\x40\x48\x19\x73\x7C\x78\x64\x0B"
+			  "\xDB\x01\xCA\xAE\x63\x19\xE9\xD1"
+			  "\x6B\x2C\x84\x10\x45\x42\x2E\xC3"
+			  "\xDF\x7F\xAA\xE8\x87\x1B\x63\x46"
+			  "\x74\x28\x9D\x05\x30\x20\x62\x41"
+			  "\xC0\x9F\x2C\x36\x2B\x78\xD7\x26"
+			  "\xDF\x58\x51\xED\xFA\xDC\x87\x79"
+			  "\xBF\x8C\xBF\xC4\x0F\xE5\x05\xDA"
+			  "\x45\xE3\x35\x0D\x69\x91\x54\x1C"
+			  "\xE7\x2C\x49\x08\x8B\x72\xFA\x5C"
+			  "\xF1\x6B\xD9",
+		.rlen	= 1011,
 		.also_non_np = 1,
 		.np	= 2,
-		.tap	= { 499 - 16, 16 },
+		.tap	= { 1011 - 16, 16 },
 	}, { /* Generated with Crypto++ */
 		.key	= "\x85\x62\x3F\x1C\xF9\xD6\x1C\xF9"
 			  "\xD6\xB3\x90\x6D\x4A\x90\x6D\x4A"
@@ -21705,8 +22559,72 @@ static struct cipher_testvec camellia_ctr_enc_tv_template[] = {
 			  "\x86\x1D\xB4\x28\xBF\x56\xED\x61"
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
-			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7",
-		.ilen	= 496,
+			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D",
+		.ilen	= 1008,
 		.result	= "\x85\x79\x6C\x8B\x2B\x6D\x14\xF9"
 			  "\xA6\x83\xB6\x80\x5B\x3A\xF3\x7E"
 			  "\x30\x29\xEB\x1F\xDC\x19\x5F\xEB"
@@ -21768,8 +22686,72 @@ static struct cipher_testvec camellia_ctr_enc_tv_template[] = {
 			  "\xB4\x3A\x5F\x19\xCF\x42\x1B\x22"
 			  "\x0B\x2D\x7B\xF1\xC5\x43\xF7\x5E"
 			  "\x12\xA8\x01\x64\x16\x0B\x26\x5A"
-			  "\x0C\x95\x0F\x40\xC5\x5A\x06\x7C",
-		.rlen	= 496,
+			  "\x0C\x95\x0F\x40\xC5\x5A\x06\x7C"
+			  "\xCF\xF5\xD5\xB7\x7A\x34\x23\xB6"
+			  "\xAA\x9E\xA8\x98\xA2\xF8\x3D\xD3"
+			  "\x3F\x23\x69\x63\x56\x96\x45\xD6"
+			  "\x74\x23\x1D\x5C\x63\xCC\xD8\x78"
+			  "\x16\xE2\x9C\xD2\x80\x02\xF2\x28"
+			  "\x69\x2F\xC4\xA8\x15\x15\x24\x3B"
+			  "\xCB\xF0\x14\xE4\x62\xC8\xF3\xD1"
+			  "\x03\x58\x1B\x33\x77\x74\x1F\xB4"
+			  "\x07\x86\xF2\x21\xB7\x41\xAE\xBF"
+			  "\x25\xC2\xFF\x51\xEF\xEA\xCE\xC4"
+			  "\x5F\xD9\xB8\x18\x6A\xF0\x0F\x0D"
+			  "\xF8\x04\xBB\x6D\x62\x33\x87\x26"
+			  "\x4F\x2F\x14\x6E\xDC\xDB\x66\x09"
+			  "\x2A\xEF\x7D\x84\x10\xAC\x82\x5E"
+			  "\xD2\xE4\xAD\x74\x7A\x6D\xCC\x3A"
+			  "\x7B\x62\xD8\xD6\x07\x2D\xF7\xDF"
+			  "\x9B\xB3\x82\xCF\x9C\x1D\x76\x5C"
+			  "\xAC\x7B\xD4\x9B\x45\xA1\x64\x11"
+			  "\x66\xF1\xA7\x0B\xF9\xDD\x00\xDD"
+			  "\xA4\x45\x3D\x3E\x03\xC9\x2E\xCB"
+			  "\xC3\x14\x84\x72\xFD\x41\xDC\xBD"
+			  "\x75\xBE\xA8\xE5\x16\x48\x64\x39"
+			  "\xCA\xF3\xE6\xDC\x25\x24\xF1\x6D"
+			  "\xB2\x8D\xC5\x38\x54\xD3\x5D\x6D"
+			  "\x0B\x29\x10\x15\x0E\x13\x3B\xAC"
+			  "\x7E\xCC\x9E\x3E\x18\x48\xA6\x02"
+			  "\xEF\x03\xB2\x2E\xE3\xD2\x70\x21"
+			  "\xB4\x19\x26\xBE\x3A\x3D\x05\xE0"
+			  "\xF8\x09\xAF\xE4\x31\x26\x92\x2F"
+			  "\x8F\x55\xAC\xED\x0B\xB2\xA5\x34"
+			  "\xBE\x50\xB1\x02\x22\x96\xE3\x40"
+			  "\x7B\x70\x50\x6E\x3B\xD5\xE5\xA0"
+			  "\x8E\xA2\xAD\x14\x60\x5C\x7A\x2B"
+			  "\x3D\x1B\x7F\xC1\xC0\x2C\x56\x36"
+			  "\xD2\x0A\x32\x06\x97\x34\xB9\xF4"
+			  "\x6F\x9F\x7E\x80\xD0\x9D\xF7\x6A"
+			  "\x21\xC1\xA2\x6A\xB1\x96\x5B\x4D"
+			  "\x7A\x15\x6C\xC4\x4E\xB8\xE0\x9E"
+			  "\x6C\x50\xF3\x9C\xC9\xB5\x23\xB7"
+			  "\xF1\xD4\x29\x4A\x23\xC4\xAD\x1E"
+			  "\x2C\x07\xD2\x43\x5F\x57\x93\xCA"
+			  "\x85\xF9\x9F\xAD\x4C\xF1\xE4\xB1"
+			  "\x1A\x8E\x28\xA4\xB6\x52\x77\x7E"
+			  "\x68\xC6\x47\xB9\x76\xCC\x65\x5F"
+			  "\x0B\xF9\x67\x93\xD8\x0E\x9A\x37"
+			  "\x5F\x41\xED\x64\x6C\xAD\x5F\xED"
+			  "\x3F\x8D\xFB\x8E\x1E\xA0\xE4\x1F"
+			  "\xC2\xC7\xED\x18\x43\xE1\x20\x86"
+			  "\x5D\xBC\x30\x70\x22\xA1\xDC\x53"
+			  "\x10\x3A\x8D\x47\x82\xCD\x7F\x59"
+			  "\x03\x2D\x6D\xF5\xE7\x79\xD4\x07"
+			  "\x68\x2A\xA5\x42\x19\x4D\xAF\xF5"
+			  "\xED\x47\x83\xBC\x5F\x62\x84\xDA"
+			  "\xDA\x41\xFF\xB0\x1D\x64\xA3\xC8"
+			  "\xBD\x4E\xE0\xB8\x7F\xEE\x55\x0A"
+			  "\x4E\x61\xB2\x51\xF6\x9C\x95\xF6"
+			  "\x92\xBB\xF6\xC5\xF0\x09\x86\xDE"
+			  "\x37\x9E\x29\xF9\x2A\x18\x73\x0D"
+			  "\xDC\x7E\x6B\x7B\x1B\x43\x8C\xEA"
+			  "\x13\xC8\x1A\x47\x0A\x2D\x6D\x56"
+			  "\xCD\xD2\xE7\x53\x1A\xAB\x1C\x3C"
+			  "\xC5\x9B\x03\x70\x29\x2A\x49\x09"
+			  "\x67\xA1\xEA\xD6\x3A\x5B\xBF\x71"
+			  "\x1D\x48\x64\x6C\xFB\xC0\x9E\x36",
+		.rlen	= 1008,
 	},
 };

@@ -21978,8 +22960,72 @@ static struct cipher_testvec camellia_ctr_dec_tv_template[] = {
 			  "\x7E\x42\xEC\xB6\x6F\x4D\x6B\x48"
 			  "\xE6\xA6\x50\x80\x78\x9E\xF1\xB0"
 			  "\x4D\xB2\x0D\x3D\xFC\x40\x25\x4D"
-			  "\x93\x11\x1C",
-		.ilen	= 499,
+			  "\x93\x11\x1C\xE9\xD2\x9F\x6E\x90"
+			  "\xE5\x41\x4A\xE2\x3C\x45\x29\x35"
+			  "\xEC\xD6\x47\x50\xCB\x7B\xA2\x32"
+			  "\xF7\x8B\x62\xF1\xE3\x9A\xFE\xC7"
+			  "\x1D\x8C\x02\x72\x68\x09\xE9\xB6"
+			  "\x4A\x80\xE6\xB1\x56\xDF\x90\xD4"
+			  "\x93\x74\xA4\xCE\x20\x23\xBF\x48"
+			  "\xA5\xDE\x1B\xFA\x40\x69\x31\x98"
+			  "\x62\x6E\xA5\xC7\xBF\x0C\x62\xE5"
+			  "\x6D\xE1\x93\xF1\x83\x10\x1C\xCA"
+			  "\xF6\x5C\x19\xF8\x90\x78\xCB\xE4"
+			  "\x0B\x3A\xB5\xF8\x43\x86\xD3\x3F"
+			  "\xBA\x83\x34\x3C\x42\xCC\x7D\x28"
+			  "\x29\x63\x4F\xD8\x02\x17\xC5\x07"
+			  "\x2C\xA4\xAC\x79\xCB\xC3\xA9\x09"
+			  "\x81\x45\x18\xED\xE4\xCB\x42\x3B"
+			  "\x87\x2D\x23\xDC\xC5\xBA\x45\xBD"
+			  "\x92\xE5\x02\x97\x96\xCE\xAD\xEC"
+			  "\xBA\xD8\x76\xF8\xCA\xC1\x31\xEC"
+			  "\x1E\x4F\x3F\x83\xF8\x33\xE8\x6E"
+			  "\xCC\xF8\x5F\xDD\x65\x50\x99\x69"
+			  "\xAF\x48\xCE\xA5\xBA\xB6\x14\x9F"
+			  "\x05\x93\xB2\xE6\x59\xC8\x28\xFE"
+			  "\x8F\x37\xF9\x64\xB9\xA5\x56\x8F"
+			  "\xF1\x1B\x90\xEF\xAE\xEB\xFC\x09"
+			  "\x11\x7A\xF2\x19\x0A\x0A\x9A\x3C"
+			  "\xE2\x5E\x29\xFA\x31\x9B\xC1\x74"
+			  "\x1E\x10\x3E\x07\xA9\x31\x6D\xF8"
+			  "\x81\xF5\xD5\x8A\x04\x23\x51\xAC"
+			  "\xA2\xE2\x63\xFD\x27\x1F\x79\x5B"
+			  "\x1F\xE8\xDA\x11\x49\x4D\x1C\xBA"
+			  "\x54\xCC\x0F\xBA\x92\x69\xE5\xCB"
+			  "\x41\x1A\x67\xA6\x40\x82\x70\x8C"
+			  "\x19\x79\x08\xA4\x51\x20\x7D\xC9"
+			  "\x12\x27\xAE\x20\x0D\x2C\xA1\x6D"
+			  "\xF4\x55\xD4\xE7\xE6\xD4\x28\x08"
+			  "\x00\x70\x12\x56\x56\x50\xAD\x14"
+			  "\x5C\x3E\xA2\xD1\x36\x3F\x36\x48"
+			  "\xED\xB1\x57\x3E\x5D\x15\xF6\x1E"
+			  "\x53\xE9\xA4\x3E\xED\x7D\xCF\x7D"
+			  "\x29\xAF\xF3\x1E\x51\xA8\x9F\x85"
+			  "\x8B\xF0\xBB\xCE\xCC\x39\xC3\x64"
+			  "\x4B\xF2\xAD\x70\x19\xD4\x44\x8F"
+			  "\x91\x76\xE8\x15\x66\x34\x9F\xF6"
+			  "\x0F\x15\xA4\xA8\x24\xF8\x58\xB1"
+			  "\x38\x46\x47\xC7\x9B\xCA\xE9\x42"
+			  "\x44\xAA\xE6\xB5\x9C\x91\xA4\xD3"
+			  "\x16\xA0\xED\x42\xBE\xB5\x06\x19"
+			  "\xBE\x67\xE8\xBC\x22\x32\xA4\x1E"
+			  "\x93\xEB\xBE\xE9\xE1\x93\xE5\x31"
+			  "\x3A\xA2\x75\xDF\xE3\x6B\xE7\xCC"
+			  "\xB4\x70\x20\xE0\x6D\x82\x7C\xC8"
+			  "\x94\x5C\x5E\x37\x18\xAD\xED\x8B"
+			  "\x44\x86\xCA\x5E\x07\xB7\x70\x8D"
+			  "\x40\x48\x19\x73\x7C\x78\x64\x0B"
+			  "\xDB\x01\xCA\xAE\x63\x19\xE9\xD1"
+			  "\x6B\x2C\x84\x10\x45\x42\x2E\xC3"
+			  "\xDF\x7F\xAA\xE8\x87\x1B\x63\x46"
+			  "\x74\x28\x9D\x05\x30\x20\x62\x41"
+			  "\xC0\x9F\x2C\x36\x2B\x78\xD7\x26"
+			  "\xDF\x58\x51\xED\xFA\xDC\x87\x79"
+			  "\xBF\x8C\xBF\xC4\x0F\xE5\x05\xDA"
+			  "\x45\xE3\x35\x0D\x69\x91\x54\x1C"
+			  "\xE7\x2C\x49\x08\x8B\x72\xFA\x5C"
+			  "\xF1\x6B\xD9",
+		.ilen	= 1011,
 		.result	= "\x56\xED\x84\x1B\x8F\x26\xBD\x31"
 			  "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3"
 			  "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15"
@@ -22042,11 +23088,75 @@ static struct cipher_testvec camellia_ctr_dec_tv_template[] = {
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
 			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
-			  "\x2B\xC2\x59",
-		.rlen	= 499,
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D"
+			  "\xE4\x7B\x12",
+		.rlen	= 1011,
 		.also_non_np = 1,
 		.np	= 2,
-		.tap	= { 499 - 16, 16 },
+		.tap	= { 1011 - 16, 16 },
 	}, { /* Generated with Crypto++ */
 		.key	= "\x85\x62\x3F\x1C\xF9\xD6\x1C\xF9"
 			  "\xD6\xB3\x90\x6D\x4A\x90\x6D\x4A"
@@ -22116,8 +23226,72 @@ static struct cipher_testvec camellia_ctr_dec_tv_template[] = {
 			  "\xB4\x3A\x5F\x19\xCF\x42\x1B\x22"
 			  "\x0B\x2D\x7B\xF1\xC5\x43\xF7\x5E"
 			  "\x12\xA8\x01\x64\x16\x0B\x26\x5A"
-			  "\x0C\x95\x0F\x40\xC5\x5A\x06\x7C",
-		.ilen	= 496,
+			  "\x0C\x95\x0F\x40\xC5\x5A\x06\x7C"
+			  "\xCF\xF5\xD5\xB7\x7A\x34\x23\xB6"
+			  "\xAA\x9E\xA8\x98\xA2\xF8\x3D\xD3"
+			  "\x3F\x23\x69\x63\x56\x96\x45\xD6"
+			  "\x74\x23\x1D\x5C\x63\xCC\xD8\x78"
+			  "\x16\xE2\x9C\xD2\x80\x02\xF2\x28"
+			  "\x69\x2F\xC4\xA8\x15\x15\x24\x3B"
+			  "\xCB\xF0\x14\xE4\x62\xC8\xF3\xD1"
+			  "\x03\x58\x1B\x33\x77\x74\x1F\xB4"
+			  "\x07\x86\xF2\x21\xB7\x41\xAE\xBF"
+			  "\x25\xC2\xFF\x51\xEF\xEA\xCE\xC4"
+			  "\x5F\xD9\xB8\x18\x6A\xF0\x0F\x0D"
+			  "\xF8\x04\xBB\x6D\x62\x33\x87\x26"
+			  "\x4F\x2F\x14\x6E\xDC\xDB\x66\x09"
+			  "\x2A\xEF\x7D\x84\x10\xAC\x82\x5E"
+			  "\xD2\xE4\xAD\x74\x7A\x6D\xCC\x3A"
+			  "\x7B\x62\xD8\xD6\x07\x2D\xF7\xDF"
+			  "\x9B\xB3\x82\xCF\x9C\x1D\x76\x5C"
+			  "\xAC\x7B\xD4\x9B\x45\xA1\x64\x11"
+			  "\x66\xF1\xA7\x0B\xF9\xDD\x00\xDD"
+			  "\xA4\x45\x3D\x3E\x03\xC9\x2E\xCB"
+			  "\xC3\x14\x84\x72\xFD\x41\xDC\xBD"
+			  "\x75\xBE\xA8\xE5\x16\x48\x64\x39"
+			  "\xCA\xF3\xE6\xDC\x25\x24\xF1\x6D"
+			  "\xB2\x8D\xC5\x38\x54\xD3\x5D\x6D"
+			  "\x0B\x29\x10\x15\x0E\x13\x3B\xAC"
+			  "\x7E\xCC\x9E\x3E\x18\x48\xA6\x02"
+			  "\xEF\x03\xB2\x2E\xE3\xD2\x70\x21"
+			  "\xB4\x19\x26\xBE\x3A\x3D\x05\xE0"
+			  "\xF8\x09\xAF\xE4\x31\x26\x92\x2F"
+			  "\x8F\x55\xAC\xED\x0B\xB2\xA5\x34"
+			  "\xBE\x50\xB1\x02\x22\x96\xE3\x40"
+			  "\x7B\x70\x50\x6E\x3B\xD5\xE5\xA0"
+			  "\x8E\xA2\xAD\x14\x60\x5C\x7A\x2B"
+			  "\x3D\x1B\x7F\xC1\xC0\x2C\x56\x36"
+			  "\xD2\x0A\x32\x06\x97\x34\xB9\xF4"
+			  "\x6F\x9F\x7E\x80\xD0\x9D\xF7\x6A"
+			  "\x21\xC1\xA2\x6A\xB1\x96\x5B\x4D"
+			  "\x7A\x15\x6C\xC4\x4E\xB8\xE0\x9E"
+			  "\x6C\x50\xF3\x9C\xC9\xB5\x23\xB7"
+			  "\xF1\xD4\x29\x4A\x23\xC4\xAD\x1E"
+			  "\x2C\x07\xD2\x43\x5F\x57\x93\xCA"
+			  "\x85\xF9\x9F\xAD\x4C\xF1\xE4\xB1"
+			  "\x1A\x8E\x28\xA4\xB6\x52\x77\x7E"
+			  "\x68\xC6\x47\xB9\x76\xCC\x65\x5F"
+			  "\x0B\xF9\x67\x93\xD8\x0E\x9A\x37"
+			  "\x5F\x41\xED\x64\x6C\xAD\x5F\xED"
+			  "\x3F\x8D\xFB\x8E\x1E\xA0\xE4\x1F"
+			  "\xC2\xC7\xED\x18\x43\xE1\x20\x86"
+			  "\x5D\xBC\x30\x70\x22\xA1\xDC\x53"
+			  "\x10\x3A\x8D\x47\x82\xCD\x7F\x59"
+			  "\x03\x2D\x6D\xF5\xE7\x79\xD4\x07"
+			  "\x68\x2A\xA5\x42\x19\x4D\xAF\xF5"
+			  "\xED\x47\x83\xBC\x5F\x62\x84\xDA"
+			  "\xDA\x41\xFF\xB0\x1D\x64\xA3\xC8"
+			  "\xBD\x4E\xE0\xB8\x7F\xEE\x55\x0A"
+			  "\x4E\x61\xB2\x51\xF6\x9C\x95\xF6"
+			  "\x92\xBB\xF6\xC5\xF0\x09\x86\xDE"
+			  "\x37\x9E\x29\xF9\x2A\x18\x73\x0D"
+			  "\xDC\x7E\x6B\x7B\x1B\x43\x8C\xEA"
+			  "\x13\xC8\x1A\x47\x0A\x2D\x6D\x56"
+			  "\xCD\xD2\xE7\x53\x1A\xAB\x1C\x3C"
+			  "\xC5\x9B\x03\x70\x29\x2A\x49\x09"
+			  "\x67\xA1\xEA\xD6\x3A\x5B\xBF\x71"
+			  "\x1D\x48\x64\x6C\xFB\xC0\x9E\x36",
+		.ilen	= 1008,
 		.result	= "\x56\xED\x84\x1B\x8F\x26\xBD\x31"
 			  "\xC8\x5F\xF6\x6A\x01\x98\x0C\xA3"
 			  "\x3A\xD1\x45\xDC\x73\x0A\x7E\x15"
@@ -22179,8 +23353,72 @@ static struct cipher_testvec camellia_ctr_dec_tv_template[] = {
 			  "\x86\x1D\xB4\x28\xBF\x56\xED\x61"
 			  "\xF8\x8F\x03\x9A\x31\xC8\x3C\xD3"
 			  "\x6A\x01\x75\x0C\xA3\x17\xAE\x45"
-			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7",
-		.rlen	= 496,
+			  "\xDC\x50\xE7\x7E\x15\x89\x20\xB7"
+			  "\x2B\xC2\x59\xF0\x64\xFB\x92\x06"
+			  "\x9D\x34\xCB\x3F\xD6\x6D\x04\x78"
+			  "\x0F\xA6\x1A\xB1\x48\xDF\x53\xEA"
+			  "\x81\x18\x8C\x23\xBA\x2E\xC5\x5C"
+			  "\xF3\x67\xFE\x95\x09\xA0\x37\xCE"
+			  "\x42\xD9\x70\x07\x7B\x12\xA9\x1D"
+			  "\xB4\x4B\xE2\x56\xED\x84\x1B\x8F"
+			  "\x26\xBD\x31\xC8\x5F\xF6\x6A\x01"
+			  "\x98\x0C\xA3\x3A\xD1\x45\xDC\x73"
+			  "\x0A\x7E\x15\xAC\x20\xB7\x4E\xE5"
+			  "\x59\xF0\x87\x1E\x92\x29\xC0\x34"
+			  "\xCB\x62\xF9\x6D\x04\x9B\x0F\xA6"
+			  "\x3D\xD4\x48\xDF\x76\x0D\x81\x18"
+			  "\xAF\x23\xBA\x51\xE8\x5C\xF3\x8A"
+			  "\x21\x95\x2C\xC3\x37\xCE\x65\xFC"
+			  "\x70\x07\x9E\x12\xA9\x40\xD7\x4B"
+			  "\xE2\x79\x10\x84\x1B\xB2\x26\xBD"
+			  "\x54\xEB\x5F\xF6\x8D\x01\x98\x2F"
+			  "\xC6\x3A\xD1\x68\xFF\x73\x0A\xA1"
+			  "\x15\xAC\x43\xDA\x4E\xE5\x7C\x13"
+			  "\x87\x1E\xB5\x29\xC0\x57\xEE\x62"
+			  "\xF9\x90\x04\x9B\x32\xC9\x3D\xD4"
+			  "\x6B\x02\x76\x0D\xA4\x18\xAF\x46"
+			  "\xDD\x51\xE8\x7F\x16\x8A\x21\xB8"
+			  "\x2C\xC3\x5A\xF1\x65\xFC\x93\x07"
+			  "\x9E\x35\xCC\x40\xD7\x6E\x05\x79"
+			  "\x10\xA7\x1B\xB2\x49\xE0\x54\xEB"
+			  "\x82\x19\x8D\x24\xBB\x2F\xC6\x5D"
+			  "\xF4\x68\xFF\x96\x0A\xA1\x38\xCF"
+			  "\x43\xDA\x71\x08\x7C\x13\xAA\x1E"
+			  "\xB5\x4C\xE3\x57\xEE\x85\x1C\x90"
+			  "\x27\xBE\x32\xC9\x60\xF7\x6B\x02"
+			  "\x99\x0D\xA4\x3B\xD2\x46\xDD\x74"
+			  "\x0B\x7F\x16\xAD\x21\xB8\x4F\xE6"
+			  "\x5A\xF1\x88\x1F\x93\x2A\xC1\x35"
+			  "\xCC\x63\xFA\x6E\x05\x9C\x10\xA7"
+			  "\x3E\xD5\x49\xE0\x77\x0E\x82\x19"
+			  "\xB0\x24\xBB\x52\xE9\x5D\xF4\x8B"
+			  "\x22\x96\x2D\xC4\x38\xCF\x66\xFD"
+			  "\x71\x08\x9F\x13\xAA\x41\xD8\x4C"
+			  "\xE3\x7A\x11\x85\x1C\xB3\x27\xBE"
+			  "\x55\xEC\x60\xF7\x8E\x02\x99\x30"
+			  "\xC7\x3B\xD2\x69\x00\x74\x0B\xA2"
+			  "\x16\xAD\x44\xDB\x4F\xE6\x7D\x14"
+			  "\x88\x1F\xB6\x2A\xC1\x58\xEF\x63"
+			  "\xFA\x91\x05\x9C\x33\xCA\x3E\xD5"
+			  "\x6C\x03\x77\x0E\xA5\x19\xB0\x47"
+			  "\xDE\x52\xE9\x80\x17\x8B\x22\xB9"
+			  "\x2D\xC4\x5B\xF2\x66\xFD\x94\x08"
+			  "\x9F\x36\xCD\x41\xD8\x6F\x06\x7A"
+			  "\x11\xA8\x1C\xB3\x4A\xE1\x55\xEC"
+			  "\x83\x1A\x8E\x25\xBC\x30\xC7\x5E"
+			  "\xF5\x69\x00\x97\x0B\xA2\x39\xD0"
+			  "\x44\xDB\x72\x09\x7D\x14\xAB\x1F"
+			  "\xB6\x4D\xE4\x58\xEF\x86\x1D\x91"
+			  "\x28\xBF\x33\xCA\x61\xF8\x6C\x03"
+			  "\x9A\x0E\xA5\x3C\xD3\x47\xDE\x75"
+			  "\x0C\x80\x17\xAE\x22\xB9\x50\xE7"
+			  "\x5B\xF2\x89\x20\x94\x2B\xC2\x36"
+			  "\xCD\x64\xFB\x6F\x06\x9D\x11\xA8"
+			  "\x3F\xD6\x4A\xE1\x78\x0F\x83\x1A"
+			  "\xB1\x25\xBC\x53\xEA\x5E\xF5\x8C"
+			  "\x00\x97\x2E\xC5\x39\xD0\x67\xFE"
+			  "\x72\x09\xA0\x14\xAB\x42\xD9\x4D",
+		.rlen	= 1008,
 	},
 };


+ 12 - 0
drivers/char/hw_random/Kconfig

@@ -86,6 +86,18 @@ config HW_RANDOM_BCM63XX

 	  If unsure, say Y.

+config HW_RANDOM_BCM2835
+	tristate "Broadcom BCM2835 Random Number Generator support"
+	depends on HW_RANDOM && ARCH_BCM2835
+	default HW_RANDOM
+	---help---
+	  This driver provides kernel-side support for the Random Number
+	  Generator hardware found on the Broadcom BCM2835 SoCs.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called bcm2835-rng.
+
+	  If unsure, say Y.

 config HW_RANDOM_GEODE
 	tristate "AMD Geode HW Random Number Generator support"

+ 1 - 0
drivers/char/hw_random/Makefile

@@ -26,3 +26,4 @@ obj-$(CONFIG_HW_RANDOM_PPC4XX) += ppc4xx-rng.o
 obj-$(CONFIG_HW_RANDOM_PSERIES) += pseries-rng.o
 obj-$(CONFIG_HW_RANDOM_EXYNOS)	+= exynos-rng.o
 obj-$(CONFIG_HW_RANDOM_TPM) += tpm-rng.o
+obj-$(CONFIG_HW_RANDOM_BCM2835) += bcm2835-rng.o

+ 113 - 0
drivers/char/hw_random/bcm2835-rng.c

@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2010-2012 Broadcom. All rights reserved.
+ * Copyright (c) 2013 Lubomir Rintel
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License ("GPL")
+ * version 2, as published by the Free Software Foundation.
+ */
+
+#include <linux/hw_random.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/printk.h>
+
+#define RNG_CTRL	0x0
+#define RNG_STATUS	0x4
+#define RNG_DATA	0x8
+
+/* enable rng */
+#define RNG_RBGEN	0x1
+
+/* the initial numbers generated are "less random" so will be discarded */
+#define RNG_WARMUP_COUNT 0x40000
+
+static int bcm2835_rng_read(struct hwrng *rng, void *buf, size_t max,
+			       bool wait)
+{
+	void __iomem *rng_base = (void __iomem *)rng->priv;
+
+	while ((__raw_readl(rng_base + RNG_STATUS) >> 24) == 0) {
+		if (!wait)
+			return 0;
+		cpu_relax();
+	}
+
+	*(u32 *)buf = __raw_readl(rng_base + RNG_DATA);
+	return sizeof(u32);
+}
+
+static struct hwrng bcm2835_rng_ops = {
+	.name	= "bcm2835",
+	.read	= bcm2835_rng_read,
+};
+
+static int bcm2835_rng_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct device_node *np = dev->of_node;
+	void __iomem *rng_base;
+	int err;
+
+	/* map peripheral */
+	rng_base = of_iomap(np, 0);
+	if (!rng_base) {
+		dev_err(dev, "failed to remap rng regs");
+		return -ENODEV;
+	}
+	bcm2835_rng_ops.priv = (unsigned long)rng_base;
+
+	/* register driver */
+	err = hwrng_register(&bcm2835_rng_ops);
+	if (err) {
+		dev_err(dev, "hwrng registration failed\n");
+		iounmap(rng_base);
+	} else {
+		dev_info(dev, "hwrng registered\n");
+
+		/* set warm-up count & enable */
+		__raw_writel(RNG_WARMUP_COUNT, rng_base + RNG_STATUS);
+		__raw_writel(RNG_RBGEN, rng_base + RNG_CTRL);
+	}
+	return err;
+}
+
+static int bcm2835_rng_remove(struct platform_device *pdev)
+{
+	void __iomem *rng_base = (void __iomem *)bcm2835_rng_ops.priv;
+
+	/* disable rng hardware */
+	__raw_writel(0, rng_base + RNG_CTRL);
+
+	/* unregister driver */
+	hwrng_unregister(&bcm2835_rng_ops);
+	iounmap(rng_base);
+
+	return 0;
+}
+
+static const struct of_device_id bcm2835_rng_of_match[] = {
+	{ .compatible = "brcm,bcm2835-rng", },
+	{},
+};
+MODULE_DEVICE_TABLE(of, bcm2835_rng_of_match);
+
+static struct platform_driver bcm2835_rng_driver = {
+	.driver = {
+		.name = "bcm2835-rng",
+		.owner = THIS_MODULE,
+		.of_match_table = bcm2835_rng_of_match,
+	},
+	.probe		= bcm2835_rng_probe,
+	.remove		= bcm2835_rng_remove,
+};
+module_platform_driver(bcm2835_rng_driver);
+
+MODULE_AUTHOR("Lubomir Rintel <lkundrak@v3.sk>");
+MODULE_DESCRIPTION("BCM2835 Random Number Generator (RNG) driver");
+MODULE_LICENSE("GPL v2");
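The driver above fills at most one 32-bit word per .read call; the hwrng core is what loops to satisfy larger reads, using the callback signature int (*read)(struct hwrng *, void *, size_t, bool). The following is an illustrative sketch of that consumer loop, not the actual drivers/char/hw_random/core.c code:

/* Illustrative model of how the hwrng core consumes a driver's .read
 * callback such as bcm2835_rng_read(); not the real core.c loop. */
static ssize_t fill_from_hwrng(struct hwrng *rng, void *buf, size_t len)
{
	size_t filled = 0;

	while (filled < len) {
		/* ask for up to len - filled bytes, blocking until ready */
		int n = rng->read(rng, buf + filled, len - filled, true);

		if (n <= 0)
			return filled ? filled : n;	/* no data or error */
		filled += n;
	}
	return filled;
}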

+ 2 - 1
drivers/char/hw_random/exynos-rng.c

@@ -144,6 +144,7 @@ static int exynos_rng_remove(struct platform_device *pdev)
 	return 0;
 }

+#if defined(CONFIG_PM_SLEEP) || defined(CONFIG_PM_RUNTIME)
 static int exynos_rng_runtime_suspend(struct device *dev)
 {
 	struct platform_device *pdev = to_platform_device(dev);
@@ -161,7 +162,7 @@ static int exynos_rng_runtime_resume(struct device *dev)

 	return clk_prepare_enable(exynos_rng->clk);
 }
-
+#endif

 static UNIVERSAL_DEV_PM_OPS(exynos_rng_pm_ops, exynos_rng_runtime_suspend,
 					exynos_rng_runtime_resume, NULL);
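The new preprocessor guard exists because SET_SYSTEM_SLEEP_PM_OPS and SET_RUNTIME_PM_OPS (used inside UNIVERSAL_DEV_PM_OPS) compile away when the corresponding CONFIG options are off, leaving the suspend/resume helpers unreferenced and triggering "defined but not used" warnings. A generic sketch of the pattern, with hypothetical foo_* names:

#include <linux/pm.h>

#if defined(CONFIG_PM_SLEEP) || defined(CONFIG_PM_RUNTIME)
/* Only built when some PM framework can actually reference them,
 * so !PM configurations do not warn about unused functions. */
static int foo_runtime_suspend(struct device *dev)
{
	return 0;	/* gate clocks, cut power, etc. */
}

static int foo_runtime_resume(struct device *dev)
{
	return 0;	/* restore clocks */
}
#endif

static UNIVERSAL_DEV_PM_OPS(foo_pm_ops, foo_runtime_suspend,
			    foo_runtime_resume, NULL);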

+ 4 - 17
drivers/char/hw_random/mxc-rnga.c

@@ -142,7 +142,7 @@ static void mxc_rnga_cleanup(struct hwrng *rng)
 static int __init mxc_rnga_probe(struct platform_device *pdev)
 {
 	int err = -ENODEV;
-	struct resource *res, *mem;
+	struct resource *res;
 	struct mxc_rng *mxc_rng;

 	mxc_rng = devm_kzalloc(&pdev->dev, sizeof(struct mxc_rng),
@@ -172,15 +172,9 @@ static int __init mxc_rnga_probe(struct platform_device *pdev)
 		goto err_region;
 	}

-	mem = request_mem_region(res->start, resource_size(res), pdev->name);
-	if (mem == NULL) {
-		err = -EBUSY;
-		goto err_region;
-	}
-
-	mxc_rng->mem = ioremap(res->start, resource_size(res));
-	if (!mxc_rng->mem) {
-		err = -ENOMEM;
+	mxc_rng->mem = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(mxc_rng->mem)) {
+		err = PTR_ERR(mxc_rng->mem);
 		goto err_ioremap;
 	}

@@ -195,8 +189,6 @@ static int __init mxc_rnga_probe(struct platform_device *pdev)
 	return 0;

 err_ioremap:
-	release_mem_region(res->start, resource_size(res));
-
 err_region:
 	clk_disable_unprepare(mxc_rng->clk);

@@ -206,15 +198,10 @@ out:

 static int __exit mxc_rnga_remove(struct platform_device *pdev)
 {
-	struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	struct mxc_rng *mxc_rng = platform_get_drvdata(pdev);

 	hwrng_unregister(&mxc_rng->rng);

-	iounmap(mxc_rng->mem);
-
-	release_mem_region(res->start, resource_size(res));
-
 	clk_disable_unprepare(mxc_rng->clk);

 	return 0;
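devm_ioremap_resource() both claims the region and maps it, and ties the teardown to device unbind, which is why probe loses its request_mem_region()/ioremap() pair and remove loses iounmap()/release_mem_region(). A minimal probe sketch using the same pattern (hypothetical foo_probe):

#include <linux/err.h>
#include <linux/io.h>
#include <linux/platform_device.h>

static int foo_probe(struct platform_device *pdev)
{
	struct resource *res;
	void __iomem *base;

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	/* claims and maps the region; both are undone automatically when
	 * the device is unbound, so remove() needs no unmap/release */
	base = devm_ioremap_resource(&pdev->dev, res);
	if (IS_ERR(base))
		return PTR_ERR(base);

	/* program registers through 'base' from here on */
	return 0;
}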

+ 136 - 54
drivers/char/hw_random/timeriomem-rng.c

@@ -23,127 +23,209 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/platform_device.h>
+#include <linux/of.h>
 #include <linux/hw_random.h>
 #include <linux/io.h>
+#include <linux/slab.h>
 #include <linux/timeriomem-rng.h>
 #include <linux/jiffies.h>
 #include <linux/sched.h>
 #include <linux/timer.h>
 #include <linux/completion.h>

-static struct timeriomem_rng_data *timeriomem_rng_data;
+struct timeriomem_rng_private_data {
+	void __iomem		*io_base;
+	unsigned int		expires;
+	unsigned int		period;
+	unsigned int		present:1;

-static void timeriomem_rng_trigger(unsigned long);
-static DEFINE_TIMER(timeriomem_rng_timer, timeriomem_rng_trigger, 0, 0);
+	struct timer_list	timer;
+	struct completion	completion;
+
+	struct hwrng		timeriomem_rng_ops;
+};
+
+#define to_rng_priv(rng) \
+		((struct timeriomem_rng_private_data *)rng->priv)

 /*
  * return 1 if we have data, or 0 if we have nothing
  */
 static int timeriomem_rng_data_present(struct hwrng *rng, int wait)
 {
-	if (rng->priv == 0)
-		return 1;
+	struct timeriomem_rng_private_data *priv = to_rng_priv(rng);

-	if (!wait || timeriomem_rng_data->present)
-		return timeriomem_rng_data->present;
+	if (!wait || priv->present)
+		return priv->present;

-	wait_for_completion(&timeriomem_rng_data->completion);
+	wait_for_completion(&priv->completion);

 	return 1;
 }

 static int timeriomem_rng_data_read(struct hwrng *rng, u32 *data)
 {
+	struct timeriomem_rng_private_data *priv = to_rng_priv(rng);
 	unsigned long cur;
 	s32 delay;

-	*data = readl(timeriomem_rng_data->address);
+	*data = readl(priv->io_base);

-	if (rng->priv != 0) {
-		cur = jiffies;
+	cur = jiffies;

-		delay = cur - timeriomem_rng_timer.expires;
-		delay = rng->priv - (delay % rng->priv);

+	delay = cur - priv->expires;
+	delay = priv->period - (delay % priv->period);

-		timeriomem_rng_timer.expires = cur + delay;
-		timeriomem_rng_data->present = 0;
+	priv->expires = cur + delay;
+	priv->present = 0;

-		init_completion(&timeriomem_rng_data->completion);
-		add_timer(&timeriomem_rng_timer);
-	}
+	INIT_COMPLETION(priv->completion);
+	mod_timer(&priv->timer, priv->expires);

 	return 4;
 }

+static void timeriomem_rng_trigger(unsigned long data)
 {
 {
-	complete(&timeriomem_rng_data->completion);
-}
+	struct timeriomem_rng_private_data *priv
+			= (struct timeriomem_rng_private_data *)data;
 
 
-static struct hwrng timeriomem_rng_ops = {
-	.name		= "timeriomem",
-	.data_present	= timeriomem_rng_data_present,
-	.data_read	= timeriomem_rng_data_read,
-	.priv		= 0,
-};
+	priv->present = 1;
+	complete(&priv->completion);
+}
 
 
 static int timeriomem_rng_probe(struct platform_device *pdev)
 static int timeriomem_rng_probe(struct platform_device *pdev)
 {
 {
+	struct timeriomem_rng_data *pdata = pdev->dev.platform_data;
+	struct timeriomem_rng_private_data *priv;
 	struct resource *res;
 	struct resource *res;
+	int err = 0;
+	int period;
 
 
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!pdev->dev.of_node && !pdata) {
+		dev_err(&pdev->dev, "timeriomem_rng_data is missing\n");
+		return -EINVAL;
+	}
 
 
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (!res)
 	if (!res)
+		return -ENXIO;
 
 
-	timeriomem_rng_data = pdev->dev.platform_data;
+	if (res->start % 4 != 0 || resource_size(res) != 4) {
+		dev_err(&pdev->dev,
+			"address must be four bytes wide and aligned\n");
+		return -EINVAL;
+	}
 
 
-	timeriomem_rng_data->address = ioremap(res->start, resource_size(res));
-	if (!timeriomem_rng_data->address)
-		return -EIO;
+	/* Allocate memory for the device structure (and zero it) */
+	priv = kzalloc(sizeof(struct timeriomem_rng_private_data), GFP_KERNEL);
+	if (!priv) {
+		dev_err(&pdev->dev, "failed to allocate device structure.\n");
+		return -ENOMEM;
+	}
+
+	platform_set_drvdata(pdev, priv);
+
+	if (pdev->dev.of_node) {
+		int i;
+
+		if (!of_property_read_u32(pdev->dev.of_node,
+						"period", &i))
+			period = i;
+		else {
+			dev_err(&pdev->dev, "missing period\n");
+			err = -EINVAL;
+			goto out_free;
+		}
+	} else
+		period = pdata->period;
+
+	priv->period = usecs_to_jiffies(period);
+	if (priv->period < 1) {
+		dev_err(&pdev->dev, "period is less than one jiffy\n");
+		err = -EINVAL;
+		goto out_free;
+	}
 
 
-	if (timeriomem_rng_data->period != 0
-		&& usecs_to_jiffies(timeriomem_rng_data->period) > 0) {
-		timeriomem_rng_timer.expires = jiffies;
+	priv->expires	= jiffies;
+	priv->present	= 1;
 
 
-		timeriomem_rng_ops.priv = usecs_to_jiffies(
-						timeriomem_rng_data->period);
+	init_completion(&priv->completion);
+	complete(&priv->completion);
+
+	setup_timer(&priv->timer, timeriomem_rng_trigger, (unsigned long)priv);
+
+	priv->timeriomem_rng_ops.name		= dev_name(&pdev->dev);
+	priv->timeriomem_rng_ops.data_present	= timeriomem_rng_data_present;
+	priv->timeriomem_rng_ops.data_read	= timeriomem_rng_data_read;
+	priv->timeriomem_rng_ops.priv		= (unsigned long)priv;
+
+	if (!request_mem_region(res->start, resource_size(res),
+				dev_name(&pdev->dev))) {
+		dev_err(&pdev->dev, "request_mem_region failed\n");
+		err = -EBUSY;
+		goto out_timer;
 	}
 	}
 
 
-	ret = hwrng_register(&timeriomem_rng_ops);
-	if (ret)
-		goto failed;
+	priv->io_base = ioremap(res->start, resource_size(res));
+	if (priv->io_base == NULL) {
+		dev_err(&pdev->dev, "ioremap failed\n");
+		err = -EIO;
+		goto out_release_io;
+	}
+
+	err = hwrng_register(&priv->timeriomem_rng_ops);
+	if (err) {
+		dev_err(&pdev->dev, "problem registering\n");
+		goto out;
+	}
 
 
 	dev_info(&pdev->dev, "32bits from 0x%p @ %dus\n",
 	dev_info(&pdev->dev, "32bits from 0x%p @ %dus\n",
-			timeriomem_rng_data->period);
+			priv->io_base, period);
 
 
 	return 0;
 	return 0;
 
 
-failed:
-	dev_err(&pdev->dev, "problem registering\n");
-	iounmap(timeriomem_rng_data->address);
-
-	return ret;
+out:
+	iounmap(priv->io_base);
+out_release_io:
+	release_mem_region(res->start, resource_size(res));
+out_timer:
+	del_timer_sync(&priv->timer);
+out_free:
+	platform_set_drvdata(pdev, NULL);
+	kfree(priv);
+	return err;
 }
 }

 static int timeriomem_rng_remove(struct platform_device *pdev)
 {
-	hwrng_unregister(&timeriomem_rng_ops);
+	struct timeriomem_rng_private_data *priv = platform_get_drvdata(pdev);
+	struct resource *res;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 
 
-	iounmap(timeriomem_rng_data->address);
+	hwrng_unregister(&priv->timeriomem_rng_ops);
+
+	del_timer_sync(&priv->timer);
+	iounmap(priv->io_base);
+	release_mem_region(res->start, resource_size(res));
+	platform_set_drvdata(pdev, NULL);
+	kfree(priv);
 
 
 	return 0;
 	return 0;
 }
 }
 
 
+static const struct of_device_id timeriomem_rng_match[] = {
+	{ .compatible = "timeriomem_rng" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, timeriomem_rng_match);
+
 static struct platform_driver timeriomem_rng_driver = {
 static struct platform_driver timeriomem_rng_driver = {
 	.driver = {
 		.name		= "timeriomem_rng",
 		.owner		= THIS_MODULE,
 	},
 	},
 	.probe		= timeriomem_rng_probe,
 	.remove		= timeriomem_rng_remove,
+ 15 - 3
drivers/crypto/Kconfig

@@ -276,6 +276,16 @@ config CRYPTO_DEV_PICOXCELL
 
 
 	  Saying m here will build a module named pipcoxcell_crypto.
 	  Saying m here will build a module named pipcoxcell_crypto.
 
 
+config CRYPTO_DEV_SAHARA
+	tristate "Support for SAHARA crypto accelerator"
+	depends on ARCH_MXC && EXPERIMENTAL && OF
+	select CRYPTO_BLKCIPHER
+	select CRYPTO_AES
+	select CRYPTO_ECB
+	help
+	  This option enables support for the SAHARA HW crypto accelerator
+	  found in some Freescale i.MX chips.
+
 config CRYPTO_DEV_S5P
 config CRYPTO_DEV_S5P
 	tristate "Support for Samsung S5PV210 crypto accelerator"
 	depends on ARCH_S5PV210
 	  will be called atmel-tdes.
 	  will be called atmel-tdes.

 config CRYPTO_DEV_ATMEL_SHA
+	tristate "Support for Atmel SHA hw accelerator"
 	depends on ARCH_AT91
 	depends on ARCH_AT91
 	select CRYPTO_SHA1
 	select CRYPTO_SHA256
 	select CRYPTO_ALGAPI
 	select CRYPTO_ALGAPI
 	help
+	  Some Atmel processors have SHA1/SHA224/SHA256/SHA384/SHA512
+	  hw accelerator.
 	  Select this if you want to use the Atmel module for
 	  Select this if you want to use the Atmel module for
+	  SHA1/SHA224/SHA256/SHA384/SHA512 algorithms.
 
 
 	  To compile this driver as a module, choose M here: the module
 	  To compile this driver as a module, choose M here: the module
 	  will be called atmel-sha.
 	  will be called atmel-sha.

+ 1 - 0
drivers/crypto/Makefile

@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_DEV_PPC4XX) += amcc/
 obj-$(CONFIG_CRYPTO_DEV_OMAP_SHAM) += omap-sham.o
 obj-$(CONFIG_CRYPTO_DEV_OMAP_SHAM) += omap-sham.o
 obj-$(CONFIG_CRYPTO_DEV_OMAP_AES) += omap-aes.o
 obj-$(CONFIG_CRYPTO_DEV_PICOXCELL) += picoxcell_crypto.o
 obj-$(CONFIG_CRYPTO_DEV_S5P) += s5p-sss.o
 obj-$(CONFIG_CRYPTO_DEV_S5P) += s5p-sss.o
 obj-$(CONFIG_CRYPTO_DEV_TEGRA_AES) += tegra-aes.o
 obj-$(CONFIG_CRYPTO_DEV_UX500) += ux500/
+ 353 - 118
drivers/crypto/atmel-aes.c

@@ -38,7 +38,7 @@
 #include <crypto/aes.h>
 #include <crypto/aes.h>
 #include <crypto/hash.h>
 #include <crypto/internal/hash.h>
-#include <linux/platform_data/atmel-aes.h>
+#include <linux/platform_data/crypto-atmel.h>
 #include "atmel-aes-regs.h"

 #define CFB8_BLOCK_SIZE		1
 #define CFB64_BLOCK_SIZE	8
 #define CFB64_BLOCK_SIZE	8

 /* AES flags */
-#define AES_FLAGS_MODE_MASK	0x01ff
+#define AES_FLAGS_MODE_MASK	0x03ff
 #define AES_FLAGS_ENCRYPT	BIT(0)
 #define AES_FLAGS_CBC		BIT(1)
 #define AES_FLAGS_CFB		BIT(2)
 #define AES_FLAGS_CFB16		BIT(4)
 #define AES_FLAGS_CFB16		BIT(4)
 #define AES_FLAGS_CFB32		BIT(5)
 #define AES_FLAGS_CFB64		BIT(6)
-#define AES_FLAGS_OFB		BIT(7)
-#define AES_FLAGS_CTR		BIT(8)
+#define AES_FLAGS_CFB128	BIT(7)
+#define AES_FLAGS_OFB		BIT(8)
+#define AES_FLAGS_CTR		BIT(9)

 #define AES_FLAGS_INIT		BIT(16)
 #define AES_FLAGS_DMA		BIT(17)
 #define AES_FLAGS_BUSY		BIT(18)
+#define AES_FLAGS_FAST		BIT(19)

-#define AES_FLAGS_DUALBUFF	BIT(24)
-
-#define ATMEL_AES_QUEUE_LENGTH	1
-#define ATMEL_AES_CACHE_SIZE	0
+#define ATMEL_AES_QUEUE_LENGTH	50

 #define ATMEL_AES_DMA_THRESHOLD		16


+struct atmel_aes_caps {
+	bool	has_dualbuff;
+	bool	has_cfb64;
+	u32		max_burst_size;
+};
+
 struct atmel_aes_dev;

 struct atmel_aes_ctx {
 
 
 	int		keylen;
 	int		keylen;
 	u32		key[AES_KEYSIZE_256 / sizeof(u32)];
 	u32		key[AES_KEYSIZE_256 / sizeof(u32)];
+
+	u16		block_size;
 };
 };
 
 
 struct atmel_aes_reqctx {
 struct atmel_aes_reqctx {
@@ -112,20 +119,27 @@ struct atmel_aes_dev {
 
 
 	struct scatterlist	*in_sg;
 	struct scatterlist	*in_sg;
 	unsigned int		nb_in_sg;
 	unsigned int		nb_in_sg;
-
+	size_t				in_offset;
 	struct scatterlist	*out_sg;
 	struct scatterlist	*out_sg;
 	unsigned int		nb_out_sg;
 	unsigned int		nb_out_sg;
+	size_t				out_offset;
 
 
 	size_t	bufcnt;
 	size_t	bufcnt;
+	size_t	buflen;
+	size_t	dma_size;
 
 
-	u8	buf_in[ATMEL_AES_DMA_THRESHOLD] __aligned(sizeof(u32));
-	int	dma_in;
+	void	*buf_in;
+	int		dma_in;
+	dma_addr_t	dma_addr_in;
 	struct atmel_aes_dma	dma_lch_in;
 	struct atmel_aes_dma	dma_lch_in;
 
 
-	u8	buf_out[ATMEL_AES_DMA_THRESHOLD] __aligned(sizeof(u32));
-	int	dma_out;
+	void	*buf_out;
+	int		dma_out;
+	dma_addr_t	dma_addr_out;
 	struct atmel_aes_dma	dma_lch_out;
 	struct atmel_aes_dma	dma_lch_out;
 
 
+	struct atmel_aes_caps	caps;
+
 	u32	hw_version;
 	u32	hw_version;
 };
 };
 
 
@@ -165,6 +179,37 @@ static int atmel_aes_sg_length(struct ablkcipher_request *req,
 	return sg_nb;
 	return sg_nb;
 }

+			void *buf, size_t buflen, size_t total, int out)
+{
+	unsigned int count, off = 0;
+
+	while (buflen && total) {
+		count = min((*sg)->length - *offset, total);
+		count = min(count, buflen);
+
+		if (!count)
+			return off;
+
+		scatterwalk_map_and_copy(buf + off, *sg, *offset, count, out);
+
+		off += count;
+		buflen -= count;
+		*offset += count;
+		total -= count;
+
+		if (*offset == (*sg)->length) {
+			*sg = sg_next(*sg);
+			if (*sg)
+				*offset = 0;
+			else
+				total = 0;
+		}
+	}
+
+	return off;
+}
+
 static inline u32 atmel_aes_read(struct atmel_aes_dev *dd, u32 offset)
 {
 	return readl_relaxed(dd->io_base + offset);
@@ -190,14 +235,6 @@ static void atmel_aes_write_n(struct atmel_aes_dev *dd, u32 offset,
 		atmel_aes_write(dd, offset, *value);
 }

-static void atmel_aes_dualbuff_test(struct atmel_aes_dev *dd)
-{
-	atmel_aes_write(dd, AES_MR, AES_MR_DUALBUFF);
-
-	if (atmel_aes_read(dd, AES_MR) & AES_MR_DUALBUFF)
-		dd->flags |= AES_FLAGS_DUALBUFF;
-}
-
 static struct atmel_aes_dev *atmel_aes_find_dev(struct atmel_aes_ctx *ctx)
 {
 	struct atmel_aes_dev *aes_dd = NULL;
@@ -225,7 +262,7 @@ static int atmel_aes_hw_init(struct atmel_aes_dev *dd)

 	if (!(dd->flags & AES_FLAGS_INIT)) {
 		atmel_aes_write(dd, AES_CR, AES_CR_SWRST);
-		atmel_aes_dualbuff_test(dd);
+		atmel_aes_write(dd, AES_MR, 0xE << AES_MR_CKEY_OFFSET);
 		dd->flags |= AES_FLAGS_INIT;
 		dd->err = 0;
 	}
@@ -233,11 +270,19 @@ static int atmel_aes_hw_init(struct atmel_aes_dev *dd)
 	return 0;
 }

+static inline unsigned int atmel_aes_get_version(struct atmel_aes_dev *dd)
+{
+	return atmel_aes_read(dd, AES_HW_VERSION) & 0x00000fff;
+}
+
 static void atmel_aes_hw_version_init(struct atmel_aes_dev *dd)
 {
 	atmel_aes_hw_init(dd);

-	dd->hw_version = atmel_aes_read(dd, AES_HW_VERSION);
+	dd->hw_version = atmel_aes_get_version(dd);
+
+	dev_info(dd->dev,
+			"version: 0x%x\n", dd->hw_version);

 	clk_disable_unprepare(dd->iclk);
 }
@@ -260,50 +305,77 @@ static void atmel_aes_dma_callback(void *data)
 	tasklet_schedule(&dd->done_task);
 }

-static int atmel_aes_crypt_dma(struct atmel_aes_dev *dd)
+static int atmel_aes_crypt_dma(struct atmel_aes_dev *dd,
+		dma_addr_t dma_addr_in, dma_addr_t dma_addr_out, int length)
 {
+	struct scatterlist sg[2];
 	struct dma_async_tx_descriptor	*in_desc, *out_desc;
-	int nb_dma_sg_in, nb_dma_sg_out;

-	dd->nb_in_sg = atmel_aes_sg_length(dd->req, dd->in_sg);
-	if (!dd->nb_in_sg)
-		goto exit_err;
+	dd->dma_size = length;

-	nb_dma_sg_in = dma_map_sg(dd->dev, dd->in_sg, dd->nb_in_sg,
-			DMA_TO_DEVICE);
-	if (!nb_dma_sg_in)
-		goto exit_err;
+	if (!(dd->flags & AES_FLAGS_FAST)) {
+		dma_sync_single_for_device(dd->dev, dma_addr_in, length,
+					   DMA_TO_DEVICE);
+	}

-	in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, dd->in_sg,
-				nb_dma_sg_in, DMA_MEM_TO_DEV,
-				DMA_PREP_INTERRUPT  |  DMA_CTRL_ACK);
+	if (dd->flags & AES_FLAGS_CFB8) {
+		dd->dma_lch_in.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_1_BYTE;
+		dd->dma_lch_out.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_1_BYTE;
+	} else if (dd->flags & AES_FLAGS_CFB16) {
+		dd->dma_lch_in.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_2_BYTES;
+		dd->dma_lch_out.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_2_BYTES;
+	} else {
+		dd->dma_lch_in.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		dd->dma_lch_out.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+	}

-	if (!in_desc)
-		goto unmap_in;
+	if (dd->flags & (AES_FLAGS_CFB8 | AES_FLAGS_CFB16 |
+			AES_FLAGS_CFB32 | AES_FLAGS_CFB64)) {
+		dd->dma_lch_in.dma_conf.src_maxburst = 1;
+		dd->dma_lch_in.dma_conf.dst_maxburst = 1;
+		dd->dma_lch_out.dma_conf.src_maxburst = 1;
+		dd->dma_lch_out.dma_conf.dst_maxburst = 1;
+	} else {
+		dd->dma_lch_in.dma_conf.src_maxburst = dd->caps.max_burst_size;
+		dd->dma_lch_in.dma_conf.dst_maxburst = dd->caps.max_burst_size;
+		dd->dma_lch_out.dma_conf.src_maxburst = dd->caps.max_burst_size;
+		dd->dma_lch_out.dma_conf.dst_maxburst = dd->caps.max_burst_size;
+	}

-	/* callback not needed */
+	dmaengine_slave_config(dd->dma_lch_in.chan, &dd->dma_lch_in.dma_conf);
+	dmaengine_slave_config(dd->dma_lch_out.chan, &dd->dma_lch_out.dma_conf);

-	dd->nb_out_sg = atmel_aes_sg_length(dd->req, dd->out_sg);
-	if (!dd->nb_out_sg)
-		goto unmap_in;
+	dd->flags |= AES_FLAGS_DMA;

-	nb_dma_sg_out = dma_map_sg(dd->dev, dd->out_sg, dd->nb_out_sg,
-			DMA_FROM_DEVICE);
-	if (!nb_dma_sg_out)
-		goto unmap_out;
+	sg_init_table(&sg[0], 1);
+	sg_dma_address(&sg[0]) = dma_addr_in;
+	sg_dma_len(&sg[0]) = length;

-	out_desc = dmaengine_prep_slave_sg(dd->dma_lch_out.chan, dd->out_sg,
-				nb_dma_sg_out, DMA_DEV_TO_MEM,
-				DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+	sg_init_table(&sg[1], 1);
+	sg_dma_address(&sg[1]) = dma_addr_out;
+	sg_dma_len(&sg[1]) = length;
+
+	in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, &sg[0],
+				1, DMA_MEM_TO_DEV,
+				DMA_PREP_INTERRUPT  |  DMA_CTRL_ACK);
+	if (!in_desc)
+		return -EINVAL;

+	out_desc = dmaengine_prep_slave_sg(dd->dma_lch_out.chan, &sg[1],
+				1, DMA_DEV_TO_MEM,
+				DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
 	if (!out_desc)
-		goto unmap_out;
+		return -EINVAL;

 	out_desc->callback = atmel_aes_dma_callback;
 	out_desc->callback_param = dd;

-	dd->total -= dd->req->nbytes;
-
 	dmaengine_submit(out_desc);
 	dma_async_issue_pending(dd->dma_lch_out.chan);

@@ -311,15 +383,6 @@ static int atmel_aes_crypt_dma(struct atmel_aes_dev *dd)
 	dma_async_issue_pending(dd->dma_lch_in.chan);

 	return 0;
-
-unmap_out:
-	dma_unmap_sg(dd->dev, dd->out_sg, dd->nb_out_sg,
-		DMA_FROM_DEVICE);
-unmap_in:
-	dma_unmap_sg(dd->dev, dd->in_sg, dd->nb_in_sg,
-		DMA_TO_DEVICE);
-exit_err:
-	return -EINVAL;
 }

 static int atmel_aes_crypt_cpu_start(struct atmel_aes_dev *dd)
@@ -352,30 +415,66 @@ static int atmel_aes_crypt_cpu_start(struct atmel_aes_dev *dd)

 static int atmel_aes_crypt_dma_start(struct atmel_aes_dev *dd)
 {
-	int err;
+	int err, fast = 0, in, out;
+	size_t count;
+	dma_addr_t addr_in, addr_out;
+
+	if ((!dd->in_offset) && (!dd->out_offset)) {
+		/* check for alignment */
+		in = IS_ALIGNED((u32)dd->in_sg->offset, sizeof(u32)) &&
+			IS_ALIGNED(dd->in_sg->length, dd->ctx->block_size);
+		out = IS_ALIGNED((u32)dd->out_sg->offset, sizeof(u32)) &&
+			IS_ALIGNED(dd->out_sg->length, dd->ctx->block_size);
+		fast = in && out;
+
+		if (sg_dma_len(dd->in_sg) != sg_dma_len(dd->out_sg))
+			fast = 0;
+	}
+
+
+	if (fast)  {
+		count = min(dd->total, sg_dma_len(dd->in_sg));
+		count = min(count, sg_dma_len(dd->out_sg));
+
+		err = dma_map_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
+		if (!err) {
+			dev_err(dd->dev, "dma_map_sg() error\n");
+			return -EINVAL;
+		}
+
+		err = dma_map_sg(dd->dev, dd->out_sg, 1,
+				DMA_FROM_DEVICE);
+		if (!err) {
+			dev_err(dd->dev, "dma_map_sg() error\n");
+			dma_unmap_sg(dd->dev, dd->in_sg, 1,
+				DMA_TO_DEVICE);
+			return -EINVAL;
+		}
+
+		addr_in = sg_dma_address(dd->in_sg);
+		addr_out = sg_dma_address(dd->out_sg);
+
+		dd->flags |= AES_FLAGS_FAST;
 
 
-	if (dd->flags & AES_FLAGS_CFB8) {
-		dd->dma_lch_in.dma_conf.dst_addr_width =
-			DMA_SLAVE_BUSWIDTH_1_BYTE;
-		dd->dma_lch_out.dma_conf.src_addr_width =
-			DMA_SLAVE_BUSWIDTH_1_BYTE;
-	} else if (dd->flags & AES_FLAGS_CFB16) {
-		dd->dma_lch_in.dma_conf.dst_addr_width =
-			DMA_SLAVE_BUSWIDTH_2_BYTES;
-		dd->dma_lch_out.dma_conf.src_addr_width =
-			DMA_SLAVE_BUSWIDTH_2_BYTES;
 	} else {
-		dd->dma_lch_in.dma_conf.dst_addr_width =
-			DMA_SLAVE_BUSWIDTH_4_BYTES;
-		dd->dma_lch_out.dma_conf.src_addr_width =
-			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		/* use cache buffers */
+		count = atmel_aes_sg_copy(&dd->in_sg, &dd->in_offset,
+				dd->buf_in, dd->buflen, dd->total, 0);
+
+		addr_in = dd->dma_addr_in;
+		addr_out = dd->dma_addr_out;
+
+		dd->flags &= ~AES_FLAGS_FAST;
 	}

-	dmaengine_slave_config(dd->dma_lch_in.chan, &dd->dma_lch_in.dma_conf);
-	dmaengine_slave_config(dd->dma_lch_out.chan, &dd->dma_lch_out.dma_conf);
+	dd->total -= count;

-	dd->flags |= AES_FLAGS_DMA;
-	err = atmel_aes_crypt_dma(dd);
+	err = atmel_aes_crypt_dma(dd, addr_in, addr_out, count);
+
+	if (err && (dd->flags & AES_FLAGS_FAST)) {
+		dma_unmap_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
+		dma_unmap_sg(dd->dev, dd->out_sg, 1, DMA_TO_DEVICE);
+	}

 	return err;
 }
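The rewritten atmel_aes_crypt_dma_start() above picks between two strategies: when both scatterlists start word-aligned and carry block-multiple, equal-length data it maps them for DMA directly (AES_FLAGS_FAST); otherwise it bounces through the pre-mapped page-sized buffers. A reduced sketch of just that gate, using a hypothetical helper name:

#include <linux/kernel.h>
#include <linux/scatterlist.h>

/* Reduced model of the fast/slow decision in atmel_aes_crypt_dma_start():
 * direct scatterlist DMA is only attempted when alignment permits. */
static bool can_dma_directly(struct scatterlist *in, struct scatterlist *out,
			     unsigned int block_size)
{
	return IS_ALIGNED(in->offset, sizeof(u32)) &&
	       IS_ALIGNED(out->offset, sizeof(u32)) &&
	       IS_ALIGNED(in->length, block_size) &&
	       IS_ALIGNED(out->length, block_size) &&
	       sg_dma_len(in) == sg_dma_len(out);
}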
@@ -410,6 +509,8 @@ static int atmel_aes_write_ctrl(struct atmel_aes_dev *dd)
 			valmr |= AES_MR_CFBS_32b;
 		else if (dd->flags & AES_FLAGS_CFB64)
 			valmr |= AES_MR_CFBS_64b;
+		else if (dd->flags & AES_FLAGS_CFB128)
+			valmr |= AES_MR_CFBS_128b;
 	} else if (dd->flags & AES_FLAGS_OFB) {
 		valmr |= AES_MR_OPMOD_OFB;
 	} else if (dd->flags & AES_FLAGS_CTR) {
@@ -423,7 +524,7 @@ static int atmel_aes_write_ctrl(struct atmel_aes_dev *dd)

 	if (dd->total > ATMEL_AES_DMA_THRESHOLD) {
 		valmr |= AES_MR_SMOD_IDATAR0;
-		if (dd->flags & AES_FLAGS_DUALBUFF)
+		if (dd->caps.has_dualbuff)
 			valmr |= AES_MR_DUALBUFF;
 	} else {
 		valmr |= AES_MR_SMOD_AUTO;
@@ -477,7 +578,9 @@ static int atmel_aes_handle_queue(struct atmel_aes_dev *dd,
 	/* assign new request to device */
 	dd->req = req;
 	dd->total = req->nbytes;
+	dd->in_offset = 0;
 	dd->in_sg = req->src;
+	dd->out_offset = 0;
 	dd->out_sg = req->dst;

 	rctx = ablkcipher_request_ctx(req);
@@ -506,18 +609,86 @@ static int atmel_aes_handle_queue(struct atmel_aes_dev *dd,
 static int atmel_aes_crypt_dma_stop(struct atmel_aes_dev *dd)
 {
 	int err = -EINVAL;
+	size_t count;

 	if (dd->flags & AES_FLAGS_DMA) {
-		dma_unmap_sg(dd->dev, dd->out_sg,
-			dd->nb_out_sg, DMA_FROM_DEVICE);
-		dma_unmap_sg(dd->dev, dd->in_sg,
-			dd->nb_in_sg, DMA_TO_DEVICE);
 		err = 0;
+		if  (dd->flags & AES_FLAGS_FAST) {
+			dma_unmap_sg(dd->dev, dd->out_sg, 1, DMA_FROM_DEVICE);
+			dma_unmap_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
+		} else {
+			dma_sync_single_for_device(dd->dev, dd->dma_addr_out,
+				dd->dma_size, DMA_FROM_DEVICE);
+
+			/* copy data */
+			count = atmel_aes_sg_copy(&dd->out_sg, &dd->out_offset,
+				dd->buf_out, dd->buflen, dd->dma_size, 1);
+			if (count != dd->dma_size) {
+				err = -EINVAL;
+				pr_err("not all data converted: %u\n", count);
+			}
+		}
 	}
 	}

 	return err;
 }

+static int atmel_aes_buff_init(struct atmel_aes_dev *dd)
+{
+	int err = -ENOMEM;
+
+	dd->buf_in = (void *)__get_free_pages(GFP_KERNEL, 0);
+	dd->buf_out = (void *)__get_free_pages(GFP_KERNEL, 0);
+	dd->buflen = PAGE_SIZE;
+	dd->buflen &= ~(AES_BLOCK_SIZE - 1);
+
+	if (!dd->buf_in || !dd->buf_out) {
+		dev_err(dd->dev, "unable to alloc pages.\n");
+		goto err_alloc;
+	}
+
+	/* MAP here */
+	dd->dma_addr_in = dma_map_single(dd->dev, dd->buf_in,
+					dd->buflen, DMA_TO_DEVICE);
+	if (dma_mapping_error(dd->dev, dd->dma_addr_in)) {
+		dev_err(dd->dev, "dma %d bytes error\n", dd->buflen);
+		err = -EINVAL;
+		goto err_map_in;
+	}
+
+	dd->dma_addr_out = dma_map_single(dd->dev, dd->buf_out,
+					dd->buflen, DMA_FROM_DEVICE);
+	if (dma_mapping_error(dd->dev, dd->dma_addr_out)) {
+		dev_err(dd->dev, "dma %d bytes error\n", dd->buflen);
+		err = -EINVAL;
+		goto err_map_out;
+	}
+
+	return 0;
+
+err_map_out:
+	dma_unmap_single(dd->dev, dd->dma_addr_in, dd->buflen,
+		DMA_TO_DEVICE);
+err_map_in:
+	free_page((unsigned long)dd->buf_out);
+	free_page((unsigned long)dd->buf_in);
+err_alloc:
+	if (err)
+		pr_err("error: %d\n", err);
+	return err;
+}
+
+static void atmel_aes_buff_cleanup(struct atmel_aes_dev *dd)
+{
+	dma_unmap_single(dd->dev, dd->dma_addr_out, dd->buflen,
+			 DMA_FROM_DEVICE);
+	dma_unmap_single(dd->dev, dd->dma_addr_in, dd->buflen,
+		DMA_TO_DEVICE);
+	free_page((unsigned long)dd->buf_out);
+	free_page((unsigned long)dd->buf_in);
+}
+
 static int atmel_aes_crypt(struct ablkcipher_request *req, unsigned long mode)
 static int atmel_aes_crypt(struct ablkcipher_request *req, unsigned long mode)
 {
 	struct atmel_aes_ctx *ctx = crypto_ablkcipher_ctx(
 	struct atmel_aes_reqctx *rctx = ablkcipher_request_ctx(req);
 	struct atmel_aes_reqctx *rctx = ablkcipher_request_ctx(req);
 	struct atmel_aes_dev *dd;

-		pr_err("request size is not exact amount of AES blocks\n");
-		return -EINVAL;
+	if (mode & AES_FLAGS_CFB8) {
+		if (!IS_ALIGNED(req->nbytes, CFB8_BLOCK_SIZE)) {
+			pr_err("request size is not exact amount of CFB8 blocks\n");
+			return -EINVAL;
+		}
+		ctx->block_size = CFB8_BLOCK_SIZE;
+	} else if (mode & AES_FLAGS_CFB16) {
+		if (!IS_ALIGNED(req->nbytes, CFB16_BLOCK_SIZE)) {
+			pr_err("request size is not exact amount of CFB16 blocks\n");
+			return -EINVAL;
+		}
+		ctx->block_size = CFB16_BLOCK_SIZE;
+	} else if (mode & AES_FLAGS_CFB32) {
+		if (!IS_ALIGNED(req->nbytes, CFB32_BLOCK_SIZE)) {
+			pr_err("request size is not exact amount of CFB32 blocks\n");
+			return -EINVAL;
+		}
+		ctx->block_size = CFB32_BLOCK_SIZE;
+	} else {
+		if (!IS_ALIGNED(req->nbytes, AES_BLOCK_SIZE)) {
+			pr_err("request size is not exact amount of AES blocks\n");
+			return -EINVAL;
+		}
+		ctx->block_size = AES_BLOCK_SIZE;
 	}
 
 	dd = atmel_aes_find_dev(ctx);
@@ -551,14 +743,12 @@ static bool atmel_aes_filter(struct dma_chan *chan, void *slave)
 	}
 }
 
-static int atmel_aes_dma_init(struct atmel_aes_dev *dd)
+static int atmel_aes_dma_init(struct atmel_aes_dev *dd,
+	struct crypto_platform_data *pdata)
 {
 	int err = -ENOMEM;
-	struct aes_platform_data	*pdata;
 	dma_cap_mask_t mask_in, mask_out;
 
-	pdata = dd->dev->platform_data;
-
 	if (pdata && pdata->dma_slave->txdata.dma_dev &&
 		pdata->dma_slave->rxdata.dma_dev) {
 
@@ -568,28 +758,38 @@ static int atmel_aes_dma_init(struct atmel_aes_dev *dd)
 
 		dd->dma_lch_in.chan = dma_request_channel(mask_in,
 				atmel_aes_filter, &pdata->dma_slave->rxdata);
+
 		if (!dd->dma_lch_in.chan)
 			goto err_dma_in;
 
 		dd->dma_lch_in.dma_conf.direction = DMA_MEM_TO_DEV;
 		dd->dma_lch_in.dma_conf.dst_addr = dd->phys_base +
 			AES_IDATAR(0);
-		dd->dma_lch_in.dma_conf.src_maxburst = 1;
-		dd->dma_lch_in.dma_conf.dst_maxburst = 1;
+		dd->dma_lch_in.dma_conf.src_maxburst = dd->caps.max_burst_size;
+		dd->dma_lch_in.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		dd->dma_lch_in.dma_conf.dst_maxburst = dd->caps.max_burst_size;
+		dd->dma_lch_in.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
 		dd->dma_lch_in.dma_conf.device_fc = false;
 
 		dma_cap_zero(mask_out);
 		dma_cap_set(DMA_SLAVE, mask_out);
 		dd->dma_lch_out.chan = dma_request_channel(mask_out,
 				atmel_aes_filter, &pdata->dma_slave->txdata);
+
 		if (!dd->dma_lch_out.chan)
 			goto err_dma_out;
 
 		dd->dma_lch_out.dma_conf.direction = DMA_DEV_TO_MEM;
 		dd->dma_lch_out.dma_conf.src_addr = dd->phys_base +
 			AES_ODATAR(0);
-		dd->dma_lch_out.dma_conf.src_maxburst = 1;
-		dd->dma_lch_out.dma_conf.dst_maxburst = 1;
+		dd->dma_lch_out.dma_conf.src_maxburst = dd->caps.max_burst_size;
+		dd->dma_lch_out.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		dd->dma_lch_out.dma_conf.dst_maxburst = dd->caps.max_burst_size;
+		dd->dma_lch_out.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
 		dd->dma_lch_out.dma_conf.device_fc = false;
 
 		return 0;
@@ -665,13 +865,13 @@ static int atmel_aes_ofb_decrypt(struct ablkcipher_request *req)
 static int atmel_aes_cfb_encrypt(struct ablkcipher_request *req)
 {
 	return atmel_aes_crypt(req,
-		AES_FLAGS_ENCRYPT | AES_FLAGS_CFB);
+		AES_FLAGS_ENCRYPT | AES_FLAGS_CFB | AES_FLAGS_CFB128);
 }
 
 static int atmel_aes_cfb_decrypt(struct ablkcipher_request *req)
 {
 	return atmel_aes_crypt(req,
-		AES_FLAGS_CFB);
+		AES_FLAGS_CFB | AES_FLAGS_CFB128);
 }
 
 static int atmel_aes_cfb64_encrypt(struct ablkcipher_request *req)
@@ -753,7 +953,7 @@ static struct crypto_alg aes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_aes_ctx),
-	.cra_alignmask		= 0x0,
+	.cra_alignmask		= 0xf,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_aes_cra_init,
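
The alignmask changes in this and the following hunks track each mode's block size. As an inference from the values rather than something the commit states, cra_alignmask is "required alignment minus one": 0xf requests 16-byte-aligned buffers (one AES block) from the crypto API, and 0x3, 0x1 and 0x7 match the CFB32, CFB16 and CFB64 blocks the DMA engine transfers. A one-line sketch of the test the API performs:

static inline int example_aligned(unsigned long addr, unsigned int alignmask)
{
	return (addr & alignmask) == 0;	/* 0xf: low four bits clear */
}
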
@@ -773,7 +973,7 @@ static struct crypto_alg aes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_aes_ctx),
-	.cra_alignmask		= 0x0,
+	.cra_alignmask		= 0xf,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_aes_cra_init,
@@ -794,7 +994,7 @@ static struct crypto_alg aes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_aes_ctx),
-	.cra_alignmask		= 0x0,
+	.cra_alignmask		= 0xf,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_aes_cra_init,
@@ -815,7 +1015,7 @@ static struct crypto_alg aes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_aes_ctx),
-	.cra_alignmask		= 0x0,
+	.cra_alignmask		= 0xf,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_aes_cra_init,
@@ -836,7 +1036,7 @@ static struct crypto_alg aes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= CFB32_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_aes_ctx),
-	.cra_alignmask		= 0x0,
+	.cra_alignmask		= 0x3,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_aes_cra_init,
@@ -857,7 +1057,7 @@ static struct crypto_alg aes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= CFB16_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_aes_ctx),
-	.cra_alignmask		= 0x0,
+	.cra_alignmask		= 0x1,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_aes_cra_init,
@@ -899,7 +1099,7 @@ static struct crypto_alg aes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_aes_ctx),
-	.cra_alignmask		= 0x0,
+	.cra_alignmask		= 0xf,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_aes_cra_init,
@@ -915,15 +1115,14 @@ static struct crypto_alg aes_algs[] = {
 },
 };
 
-static struct crypto_alg aes_cfb64_alg[] = {
-{
+static struct crypto_alg aes_cfb64_alg = {
 	.cra_name		= "cfb64(aes)",
 	.cra_driver_name	= "atmel-cfb64-aes",
 	.cra_priority		= 100,
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= CFB64_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_aes_ctx),
-	.cra_alignmask		= 0x0,
+	.cra_alignmask		= 0x7,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_aes_cra_init,
@@ -936,7 +1135,6 @@ static struct crypto_alg aes_cfb64_alg[] = {
 		.encrypt	= atmel_aes_cfb64_encrypt,
 		.decrypt	= atmel_aes_cfb64_decrypt,
 	}
-},
 };
 
 static void atmel_aes_queue_task(unsigned long data)
@@ -969,7 +1167,14 @@ static void atmel_aes_done_task(unsigned long data)
 	err = dd->err ? : err;
 
 	if (dd->total && !err) {
-		err = atmel_aes_crypt_dma_start(dd);
+		if (dd->flags & AES_FLAGS_FAST) {
+			dd->in_sg = sg_next(dd->in_sg);
+			dd->out_sg = sg_next(dd->out_sg);
+			if (!dd->in_sg || !dd->out_sg)
+				err = -EINVAL;
+		}
+		if (!err)
+			err = atmel_aes_crypt_dma_start(dd);
 		if (!err)
 			return; /* DMA started. Not finishing. */
 	}
@@ -1003,8 +1208,8 @@ static void atmel_aes_unregister_algs(struct atmel_aes_dev *dd)
 
 	for (i = 0; i < ARRAY_SIZE(aes_algs); i++)
 		crypto_unregister_alg(&aes_algs[i]);
-	if (dd->hw_version >= 0x130)
-		crypto_unregister_alg(&aes_cfb64_alg[0]);
+	if (dd->caps.has_cfb64)
+		crypto_unregister_alg(&aes_cfb64_alg);
 }
 
 static int atmel_aes_register_algs(struct atmel_aes_dev *dd)
@@ -1017,10 +1222,8 @@ static int atmel_aes_register_algs(struct atmel_aes_dev *dd)
 			goto err_aes_algs;
 	}
 
-	atmel_aes_hw_version_init(dd);
-
-	if (dd->hw_version >= 0x130) {
-		err = crypto_register_alg(&aes_cfb64_alg[0]);
+	if (dd->caps.has_cfb64) {
+		err = crypto_register_alg(&aes_cfb64_alg);
 		if (err)
 			goto err_aes_cfb64_alg;
 	}
@@ -1036,10 +1239,32 @@ err_aes_algs:
 	return err;
 }
 
+static void atmel_aes_get_cap(struct atmel_aes_dev *dd)
+{
+	dd->caps.has_dualbuff = 0;
+	dd->caps.has_cfb64 = 0;
+	dd->caps.max_burst_size = 1;
+
+	/* keep only major version number */
+	switch (dd->hw_version & 0xff0) {
+	case 0x130:
+		dd->caps.has_dualbuff = 1;
+		dd->caps.has_cfb64 = 1;
+		dd->caps.max_burst_size = 4;
+		break;
+	case 0x120:
+		break;
+	default:
+		dev_warn(dd->dev,
+				"Unmanaged aes version, set minimum capabilities\n");
+		break;
+	}
+}
+
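
A small sketch, with hypothetical names, of the capability-by-revision pattern that atmel_aes_get_cap() above establishes: the version register is masked to its major number (0x135 & 0xff0 == 0x130) and unknown revisions fall back to the minimum feature set:

static int example_has_cfb64(u32 hw_version)
{
	switch (hw_version & 0xff0) {	/* keep only the major version */
	case 0x130:
		return 1;		/* 1.3x IP gained CFB64 */
	default:
		return 0;		/* unknown IP: assume the minimum */
	}
}
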
 static int atmel_aes_probe(struct platform_device *pdev)
 {
 	struct atmel_aes_dev *aes_dd;
-	struct aes_platform_data	*pdata;
+	struct crypto_platform_data *pdata;
 	struct device *dev = &pdev->dev;
 	struct resource *aes_res;
 	unsigned long aes_phys_size;
@@ -1099,7 +1324,7 @@ static int atmel_aes_probe(struct platform_device *pdev)
 	}
 
 	/* Initializing the clock */
-	aes_dd->iclk = clk_get(&pdev->dev, NULL);
+	aes_dd->iclk = clk_get(&pdev->dev, "aes_clk");
 	if (IS_ERR(aes_dd->iclk)) {
 		dev_err(dev, "clock initialization failed.\n");
 		err = PTR_ERR(aes_dd->iclk);
@@ -1113,7 +1338,15 @@ static int atmel_aes_probe(struct platform_device *pdev)
 		goto aes_io_err;
 	}
 
-	err = atmel_aes_dma_init(aes_dd);
+	atmel_aes_hw_version_init(aes_dd);
+
+	atmel_aes_get_cap(aes_dd);
+
+	err = atmel_aes_buff_init(aes_dd);
+	if (err)
+		goto err_aes_buff;
+
+	err = atmel_aes_dma_init(aes_dd, pdata);
 	if (err)
 		goto err_aes_dma;
 
@@ -1135,6 +1368,8 @@ err_algs:
 	spin_unlock(&atmel_aes.lock);
 	atmel_aes_dma_cleanup(aes_dd);
 err_aes_dma:
+	atmel_aes_buff_cleanup(aes_dd);
+err_aes_buff:
 	iounmap(aes_dd->io_base);
 aes_io_err:
 	clk_put(aes_dd->iclk);

+ 6 - 1
drivers/crypto/atmel-sha-regs.h

@@ -14,10 +14,13 @@
 #define SHA_MR_MODE_MANUAL		0x0
 #define SHA_MR_MODE_AUTO		0x1
 #define SHA_MR_MODE_PDC			0x2
-#define	SHA_MR_DUALBUFF			(1 << 3)
 #define SHA_MR_PROCDLY			(1 << 4)
 #define SHA_MR_ALGO_SHA1		(0 << 8)
 #define SHA_MR_ALGO_SHA256		(1 << 8)
+#define SHA_MR_ALGO_SHA384		(2 << 8)
+#define SHA_MR_ALGO_SHA512		(3 << 8)
+#define SHA_MR_ALGO_SHA224		(4 << 8)
+#define	SHA_MR_DUALBUFF			(1 << 16)
 
 #define SHA_IER				0x10
 #define SHA_IDR				0x14
@@ -33,6 +36,8 @@
 #define SHA_ISR_URAT_MR			(0x2 << 12)
 #define SHA_ISR_URAT_WO			(0x5 << 12)
 
+#define	SHA_HW_VERSION		0xFC
+
 #define SHA_TPR				0x108
 #define SHA_TCR				0x10C
 #define SHA_TNPR			0x118

+ 486 - 100
drivers/crypto/atmel-sha.c

@@ -38,6 +38,7 @@
 #include <crypto/sha.h>
 #include <crypto/hash.h>
 #include <crypto/internal/hash.h>
+#include <linux/platform_data/crypto-atmel.h>
 #include "atmel-sha-regs.h"
 #include "atmel-sha-regs.h"
 
 
 /* SHA flags */
 /* SHA flags */
@@ -52,11 +53,12 @@
 #define SHA_FLAGS_FINUP		BIT(16)
 #define SHA_FLAGS_SG		BIT(17)
 #define SHA_FLAGS_SHA1		BIT(18)
-#define SHA_FLAGS_SHA256	BIT(19)
-#define SHA_FLAGS_ERROR		BIT(20)
-#define SHA_FLAGS_PAD		BIT(21)
-
-#define SHA_FLAGS_DUALBUFF	BIT(24)
+#define SHA_FLAGS_SHA224	BIT(19)
+#define SHA_FLAGS_SHA256	BIT(20)
+#define SHA_FLAGS_SHA384	BIT(21)
+#define SHA_FLAGS_SHA512	BIT(22)
+#define SHA_FLAGS_ERROR		BIT(23)
+#define SHA_FLAGS_PAD		BIT(24)
 
 #define SHA_OP_UPDATE	1
 #define SHA_OP_FINAL	2
@@ -65,6 +67,12 @@
 
 #define ATMEL_SHA_DMA_THRESHOLD		56
 
+struct atmel_sha_caps {
+	bool	has_dma;
+	bool	has_dualbuff;
+	bool	has_sha224;
+	bool	has_sha_384_512;
+};
 
 struct atmel_sha_dev;
 
@@ -73,8 +81,8 @@ struct atmel_sha_reqctx {
 	unsigned long	flags;
 	unsigned long	op;
 
-	u8	digest[SHA256_DIGEST_SIZE] __aligned(sizeof(u32));
-	size_t	digcnt;
+	u8	digest[SHA512_DIGEST_SIZE] __aligned(sizeof(u32));
+	u64	digcnt[2];
 	size_t	bufcnt;
 	size_t	buflen;
 	dma_addr_t	dma_addr;
@@ -84,6 +92,8 @@ struct atmel_sha_reqctx {
 	unsigned int	offset;	/* offset in current sg */
 	unsigned int	total;	/* total request */
 
+	size_t block_size;
+
 	u8	buffer[0] __aligned(sizeof(u32));
 };
 
@@ -97,7 +107,12 @@ struct atmel_sha_ctx {
 
 };
 
-#define ATMEL_SHA_QUEUE_LENGTH	1
+#define ATMEL_SHA_QUEUE_LENGTH	50
+
+struct atmel_sha_dma {
+	struct dma_chan			*chan;
+	struct dma_slave_config dma_conf;
+};
 
 struct atmel_sha_dev {
 	struct list_head	list;
@@ -114,6 +129,12 @@ struct atmel_sha_dev {
 	unsigned long		flags;
 	struct crypto_queue	queue;
 	struct ahash_request	*req;
+
+	struct atmel_sha_dma	dma_lch_in;
+
+	struct atmel_sha_caps	caps;
+
+	u32	hw_version;
 };
 
 struct atmel_sha_drv {
@@ -137,14 +158,6 @@ static inline void atmel_sha_write(struct atmel_sha_dev *dd,
 	writel_relaxed(value, dd->io_base + offset);
 }
 
-static void atmel_sha_dualbuff_test(struct atmel_sha_dev *dd)
-{
-	atmel_sha_write(dd, SHA_MR, SHA_MR_DUALBUFF);
-
-	if (atmel_sha_read(dd, SHA_MR) & SHA_MR_DUALBUFF)
-		dd->flags |= SHA_FLAGS_DUALBUFF;
-}
-
 static size_t atmel_sha_append_sg(struct atmel_sha_reqctx *ctx)
 {
 	size_t count;
@@ -176,31 +189,58 @@ static size_t atmel_sha_append_sg(struct atmel_sha_reqctx *ctx)
 }
 
 /*
- * The purpose of this padding is to ensure that the padded message
- * is a multiple of 512 bits. The bit "1" is appended at the end of
- * the message followed by "padlen-1" zero bits. Then a 64 bits block
- * equals to the message length in bits is appended.
+ * The purpose of this padding is to ensure that the padded message is a
+ * multiple of 512 bits (SHA1/SHA224/SHA256) or 1024 bits (SHA384/SHA512).
+ * The bit "1" is appended at the end of the message followed by
+ * "padlen-1" zero bits. Then a 64 bits block (SHA1/SHA224/SHA256) or
+ * 128 bits block (SHA384/SHA512) equals to the message length in bits
+ * is appended.
  *
- * padlen is calculated as followed:
+ * For SHA1/SHA224/SHA256, padlen is calculated as follows:
  *  - if message length < 56 bytes then padlen = 56 - message length
  *  - else padlen = 64 + 56 - message length
+ *
+ * For SHA384/SHA512, padlen is calculated as follows:
+ *  - if message length < 112 bytes then padlen = 112 - message length
+ *  - else padlen = 128 + 112 - message length
  */
 static void atmel_sha_fill_padding(struct atmel_sha_reqctx *ctx, int length)
 {
 	unsigned int index, padlen;
-	u64 bits;
-	u64 size;
-
-	bits = (ctx->bufcnt + ctx->digcnt + length) << 3;
-	size = cpu_to_be64(bits);
-
-	index = ctx->bufcnt & 0x3f;
-	padlen = (index < 56) ? (56 - index) : ((64+56) - index);
-	*(ctx->buffer + ctx->bufcnt) = 0x80;
-	memset(ctx->buffer + ctx->bufcnt + 1, 0, padlen-1);
-	memcpy(ctx->buffer + ctx->bufcnt + padlen, &size, 8);
-	ctx->bufcnt += padlen + 8;
-	ctx->flags |= SHA_FLAGS_PAD;
+	u64 bits[2];
+	u64 size[2];
+
+	size[0] = ctx->digcnt[0];
+	size[1] = ctx->digcnt[1];
+
+	size[0] += ctx->bufcnt;
+	if (size[0] < ctx->bufcnt)
+		size[1]++;
+
+	size[0] += length;
+	if (size[0] < length)
+		size[1]++;
+
+	bits[1] = cpu_to_be64(size[0] << 3);
+	bits[0] = cpu_to_be64(size[1] << 3 | size[0] >> 61);
+
+	if (ctx->flags & (SHA_FLAGS_SHA384 | SHA_FLAGS_SHA512)) {
+		index = ctx->bufcnt & 0x7f;
+		padlen = (index < 112) ? (112 - index) : ((128+112) - index);
+		*(ctx->buffer + ctx->bufcnt) = 0x80;
+		memset(ctx->buffer + ctx->bufcnt + 1, 0, padlen-1);
+		memcpy(ctx->buffer + ctx->bufcnt + padlen, bits, 16);
+		ctx->bufcnt += padlen + 16;
+		ctx->flags |= SHA_FLAGS_PAD;
+	} else {
+		index = ctx->bufcnt & 0x3f;
+		padlen = (index < 56) ? (56 - index) : ((64+56) - index);
+		*(ctx->buffer + ctx->bufcnt) = 0x80;
+		memset(ctx->buffer + ctx->bufcnt + 1, 0, padlen-1);
+		memcpy(ctx->buffer + ctx->bufcnt + padlen, &bits[1], 8);
+		ctx->bufcnt += padlen + 8;
+		ctx->flags |= SHA_FLAGS_PAD;
+	}
 }
 
 static int atmel_sha_init(struct ahash_request *req)
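
A worked example of the padlen rules in the comment above (an illustrative helper, not code from the commit): for SHA-512 with 120 buffered bytes, index = 120 and padlen = (128 + 112) - 120 = 120; for SHA-256 with index 56, padlen = (64 + 56) - 56 = 64:

static unsigned int example_padlen(unsigned int bufcnt, int sha384_512)
{
	unsigned int index, padlen;

	if (sha384_512) {
		index = bufcnt & 0x7f;	/* 1024-bit blocks */
		padlen = (index < 112) ? (112 - index) : ((128 + 112) - index);
	} else {
		index = bufcnt & 0x3f;	/* 512-bit blocks */
		padlen = (index < 56) ? (56 - index) : ((64 + 56) - index);
	}
	return padlen;
}
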
@@ -231,13 +271,35 @@ static int atmel_sha_init(struct ahash_request *req)
 	dev_dbg(dd->dev, "init: digest size: %d\n",
 	dev_dbg(dd->dev, "init: digest size: %d\n",
 		crypto_ahash_digestsize(tfm));
 		crypto_ahash_digestsize(tfm));
 
 
-	if (crypto_ahash_digestsize(tfm) == SHA1_DIGEST_SIZE)
+	switch (crypto_ahash_digestsize(tfm)) {
+	case SHA1_DIGEST_SIZE:
 		ctx->flags |= SHA_FLAGS_SHA1;
-	else if (crypto_ahash_digestsize(tfm) == SHA256_DIGEST_SIZE)
+		ctx->block_size = SHA1_BLOCK_SIZE;
+		break;
+	case SHA224_DIGEST_SIZE:
+		ctx->flags |= SHA_FLAGS_SHA224;
+		ctx->block_size = SHA224_BLOCK_SIZE;
+		break;
+	case SHA256_DIGEST_SIZE:
 		ctx->flags |= SHA_FLAGS_SHA256;
+		ctx->block_size = SHA256_BLOCK_SIZE;
+		break;
+	case SHA384_DIGEST_SIZE:
+		ctx->flags |= SHA_FLAGS_SHA384;
+		ctx->block_size = SHA384_BLOCK_SIZE;
+		break;
+	case SHA512_DIGEST_SIZE:
+		ctx->flags |= SHA_FLAGS_SHA512;
+		ctx->block_size = SHA512_BLOCK_SIZE;
+		break;
+	default:
+		return -EINVAL;
+	}
 
 	ctx->bufcnt = 0;
-	ctx->digcnt = 0;
+	ctx->digcnt[0] = 0;
+	ctx->digcnt[1] = 0;
 	ctx->buflen = SHA_BUFFER_LEN;
 
 	return 0;
@@ -249,19 +311,28 @@ static void atmel_sha_write_ctrl(struct atmel_sha_dev *dd, int dma)
 	u32 valcr = 0, valmr = SHA_MR_MODE_AUTO;
 
 	if (likely(dma)) {
-		atmel_sha_write(dd, SHA_IER, SHA_INT_TXBUFE);
+		if (!dd->caps.has_dma)
+			atmel_sha_write(dd, SHA_IER, SHA_INT_TXBUFE);
 		valmr = SHA_MR_MODE_PDC;
-		if (dd->flags & SHA_FLAGS_DUALBUFF)
-			valmr = SHA_MR_DUALBUFF;
+		if (dd->caps.has_dualbuff)
+			valmr |= SHA_MR_DUALBUFF;
 	} else {
 		atmel_sha_write(dd, SHA_IER, SHA_INT_DATARDY);
 	}
 
-	if (ctx->flags & SHA_FLAGS_SHA256)
+	if (ctx->flags & SHA_FLAGS_SHA1)
+		valmr |= SHA_MR_ALGO_SHA1;
+	else if (ctx->flags & SHA_FLAGS_SHA224)
+		valmr |= SHA_MR_ALGO_SHA224;
+	else if (ctx->flags & SHA_FLAGS_SHA256)
 		valmr |= SHA_MR_ALGO_SHA256;
+	else if (ctx->flags & SHA_FLAGS_SHA384)
+		valmr |= SHA_MR_ALGO_SHA384;
+	else if (ctx->flags & SHA_FLAGS_SHA512)
+		valmr |= SHA_MR_ALGO_SHA512;
 
 	/* Setting CR_FIRST only for the first iteration */
-	if (!ctx->digcnt)
+	if (!(ctx->digcnt[0] || ctx->digcnt[1]))
 		valcr = SHA_CR_FIRST;
 
 	atmel_sha_write(dd, SHA_CR, valcr);
@@ -275,13 +346,15 @@ static int atmel_sha_xmit_cpu(struct atmel_sha_dev *dd, const u8 *buf,
 	int count, len32;
 	const u32 *buffer = (const u32 *)buf;
 
-	dev_dbg(dd->dev, "xmit_cpu: digcnt: %d, length: %d, final: %d\n",
-						ctx->digcnt, length, final);
+	dev_dbg(dd->dev, "xmit_cpu: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
+		ctx->digcnt[1], ctx->digcnt[0], length, final);
 
 	atmel_sha_write_ctrl(dd, 0);
 
 	/* should be non-zero before next lines to disable clocks later */
-	ctx->digcnt += length;
+	ctx->digcnt[0] += length;
+	if (ctx->digcnt[0] < length)
+		ctx->digcnt[1]++;
 
 	if (final)
 		dd->flags |= SHA_FLAGS_FINAL; /* catch last interrupt */
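
The digcnt[0]/digcnt[1] pair added here is a 128-bit message counter kept in two u64 words; a minimal sketch of the add-with-carry idiom the driver now repeats in the CPU, PDC and DMA paths (an unsigned addition wrapped exactly when the sum is smaller than an operand):

static void example_digcnt_add(u64 digcnt[2], u64 length)
{
	digcnt[0] += length;
	if (digcnt[0] < length)	/* low word wrapped */
		digcnt[1]++;
}
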
@@ -302,8 +375,8 @@ static int atmel_sha_xmit_pdc(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
 	struct atmel_sha_reqctx *ctx = ahash_request_ctx(dd->req);
 	int len32;
 
-	dev_dbg(dd->dev, "xmit_pdc: digcnt: %d, length: %d, final: %d\n",
-						ctx->digcnt, length1, final);
+	dev_dbg(dd->dev, "xmit_pdc: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
+		ctx->digcnt[1], ctx->digcnt[0], length1, final);
 
 	len32 = DIV_ROUND_UP(length1, sizeof(u32));
 	atmel_sha_write(dd, SHA_PTCR, SHA_PTCR_TXTDIS);
@@ -317,7 +390,9 @@ static int atmel_sha_xmit_pdc(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
 	atmel_sha_write_ctrl(dd, 1);
 
 	/* should be non-zero before next lines to disable clocks later */
-	ctx->digcnt += length1;
+	ctx->digcnt[0] += length1;
+	if (ctx->digcnt[0] < length1)
+		ctx->digcnt[1]++;
 
 	if (final)
 		dd->flags |= SHA_FLAGS_FINAL; /* catch last interrupt */
@@ -330,6 +405,86 @@ static int atmel_sha_xmit_pdc(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
 	return -EINPROGRESS;
 }
 
+static void atmel_sha_dma_callback(void *data)
+{
+	struct atmel_sha_dev *dd = data;
+
+	/* dma_lch_in - completed - wait DATRDY */
+	atmel_sha_write(dd, SHA_IER, SHA_INT_DATARDY);
+}
+
+static int atmel_sha_xmit_dma(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
+		size_t length1, dma_addr_t dma_addr2, size_t length2, int final)
+{
+	struct atmel_sha_reqctx *ctx = ahash_request_ctx(dd->req);
+	struct dma_async_tx_descriptor	*in_desc;
+	struct scatterlist sg[2];
+
+	dev_dbg(dd->dev, "xmit_dma: digcnt: 0x%llx 0x%llx, length: %d, final: %d\n",
+		ctx->digcnt[1], ctx->digcnt[0], length1, final);
+
+	if (ctx->flags & (SHA_FLAGS_SHA1 | SHA_FLAGS_SHA224 |
+			SHA_FLAGS_SHA256)) {
+		dd->dma_lch_in.dma_conf.src_maxburst = 16;
+		dd->dma_lch_in.dma_conf.dst_maxburst = 16;
+	} else {
+		dd->dma_lch_in.dma_conf.src_maxburst = 32;
+		dd->dma_lch_in.dma_conf.dst_maxburst = 32;
+	}
+
+	dmaengine_slave_config(dd->dma_lch_in.chan, &dd->dma_lch_in.dma_conf);
+
+	if (length2) {
+		sg_init_table(sg, 2);
+		sg_dma_address(&sg[0]) = dma_addr1;
+		sg_dma_len(&sg[0]) = length1;
+		sg_dma_address(&sg[1]) = dma_addr2;
+		sg_dma_len(&sg[1]) = length2;
+		in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, sg, 2,
+			DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+	} else {
+		sg_init_table(sg, 1);
+		sg_dma_address(&sg[0]) = dma_addr1;
+		sg_dma_len(&sg[0]) = length1;
+		in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, sg, 1,
+			DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+	}
+	if (!in_desc)
+		return -EINVAL;
+
+	in_desc->callback = atmel_sha_dma_callback;
+	in_desc->callback_param = dd;
+
+	atmel_sha_write_ctrl(dd, 1);
+
+	/* should be non-zero before next lines to disable clocks later */
+	ctx->digcnt[0] += length1;
+	if (ctx->digcnt[0] < length1)
+		ctx->digcnt[1]++;
+
+	if (final)
+		dd->flags |= SHA_FLAGS_FINAL; /* catch last interrupt */
+
+	dd->flags |= SHA_FLAGS_DMA_ACTIVE;
+
+	/* Start DMA transfer */
+	dmaengine_submit(in_desc);
+	dma_async_issue_pending(dd->dma_lch_in.chan);
+
+	return -EINPROGRESS;
+}
+
+static int atmel_sha_xmit_start(struct atmel_sha_dev *dd, dma_addr_t dma_addr1,
+		size_t length1, dma_addr_t dma_addr2, size_t length2, int final)
+{
+	if (dd->caps.has_dma)
+		return atmel_sha_xmit_dma(dd, dma_addr1, length1,
+				dma_addr2, length2, final);
+	else
+		return atmel_sha_xmit_pdc(dd, dma_addr1, length1,
+				dma_addr2, length2, final);
+}
+
 static int atmel_sha_update_cpu(struct atmel_sha_dev *dd)
 {
 	struct atmel_sha_reqctx *ctx = ahash_request_ctx(dd->req);
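
The 16/32 maxburst split in atmel_sha_xmit_dma() above matches one hash block expressed in 32-bit bus words -- an inference from the values, not something the commit states: SHA-1/224/256 use 64-byte blocks (16 words), SHA-384/512 use 128-byte blocks (32 words):

static u32 example_maxburst(size_t block_size)
{
	return block_size / 4;	/* 64 / 4 == 16, 128 / 4 == 32 */
}
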
@@ -337,7 +492,6 @@ static int atmel_sha_update_cpu(struct atmel_sha_dev *dd)
 
 	atmel_sha_append_sg(ctx);
 	atmel_sha_fill_padding(ctx, 0);
-
 	bufcnt = ctx->bufcnt;
 	ctx->bufcnt = 0;
 
@@ -349,17 +503,17 @@ static int atmel_sha_xmit_dma_map(struct atmel_sha_dev *dd,
 					size_t length, int final)
 {
 	ctx->dma_addr = dma_map_single(dd->dev, ctx->buffer,
-				ctx->buflen + SHA1_BLOCK_SIZE, DMA_TO_DEVICE);
+				ctx->buflen + ctx->block_size, DMA_TO_DEVICE);
 	if (dma_mapping_error(dd->dev, ctx->dma_addr)) {
 		dev_err(dd->dev, "dma %u bytes error\n", ctx->buflen +
-				SHA1_BLOCK_SIZE);
+				ctx->block_size);
 		return -EINVAL;
 	}
 
 	ctx->flags &= ~SHA_FLAGS_SG;
 
 	/* next call does not fail... so no unmap in the case of error */
-	return atmel_sha_xmit_pdc(dd, ctx->dma_addr, length, 0, 0, final);
+	return atmel_sha_xmit_start(dd, ctx->dma_addr, length, 0, 0, final);
 }
 
 static int atmel_sha_update_dma_slow(struct atmel_sha_dev *dd)
@@ -372,8 +526,8 @@ static int atmel_sha_update_dma_slow(struct atmel_sha_dev *dd)
 
 	final = (ctx->flags & SHA_FLAGS_FINUP) && !ctx->total;
 
-	dev_dbg(dd->dev, "slow: bufcnt: %u, digcnt: %d, final: %d\n",
-					 ctx->bufcnt, ctx->digcnt, final);
+	dev_dbg(dd->dev, "slow: bufcnt: %u, digcnt: 0x%llx 0x%llx, final: %d\n",
+		 ctx->bufcnt, ctx->digcnt[1], ctx->digcnt[0], final);
 
 	if (final)
 		atmel_sha_fill_padding(ctx, 0);
@@ -400,30 +554,25 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
 	if (ctx->bufcnt || ctx->offset)
 		return atmel_sha_update_dma_slow(dd);
 
-	dev_dbg(dd->dev, "fast: digcnt: %d, bufcnt: %u, total: %u\n",
-			ctx->digcnt, ctx->bufcnt, ctx->total);
+	dev_dbg(dd->dev, "fast: digcnt: 0x%llx 0x%llx, bufcnt: %u, total: %u\n",
+		ctx->digcnt[1], ctx->digcnt[0], ctx->bufcnt, ctx->total);
 
 	sg = ctx->sg;
 
 	if (!IS_ALIGNED(sg->offset, sizeof(u32)))
 		return atmel_sha_update_dma_slow(dd);
 
-	if (!sg_is_last(sg) && !IS_ALIGNED(sg->length, SHA1_BLOCK_SIZE))
-		/* size is not SHA1_BLOCK_SIZE aligned */
+	if (!sg_is_last(sg) && !IS_ALIGNED(sg->length, ctx->block_size))
+		/* size is not ctx->block_size aligned */
 		return atmel_sha_update_dma_slow(dd);
 
 	length = min(ctx->total, sg->length);
 
 	if (sg_is_last(sg)) {
 		if (!(ctx->flags & SHA_FLAGS_FINUP)) {
-			/* not last sg must be SHA1_BLOCK_SIZE aligned */
-			tail = length & (SHA1_BLOCK_SIZE - 1);
+			/* not last sg must be ctx->block_size aligned */
+			tail = length & (ctx->block_size - 1);
 			length -= tail;
-			if (length == 0) {
-				/* offset where to start slow */
-				ctx->offset = length;
-				return atmel_sha_update_dma_slow(dd);
-			}
 		}
 	}
 
@@ -434,7 +583,7 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
 
 	/* Add padding */
 	if (final) {
-		tail = length & (SHA1_BLOCK_SIZE - 1);
+		tail = length & (ctx->block_size - 1);
 		length -= tail;
 		ctx->total += tail;
 		ctx->offset = length; /* offset where to start slow */
@@ -445,10 +594,10 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
 		atmel_sha_fill_padding(ctx, length);
 
 		ctx->dma_addr = dma_map_single(dd->dev, ctx->buffer,
-			ctx->buflen + SHA1_BLOCK_SIZE, DMA_TO_DEVICE);
+			ctx->buflen + ctx->block_size, DMA_TO_DEVICE);
 		if (dma_mapping_error(dd->dev, ctx->dma_addr)) {
 			dev_err(dd->dev, "dma %u bytes error\n",
-				ctx->buflen + SHA1_BLOCK_SIZE);
+				ctx->buflen + ctx->block_size);
 			return -EINVAL;
 		}
 
@@ -456,7 +605,7 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
 			ctx->flags &= ~SHA_FLAGS_SG;
 			count = ctx->bufcnt;
 			ctx->bufcnt = 0;
-			return atmel_sha_xmit_pdc(dd, ctx->dma_addr, count, 0,
+			return atmel_sha_xmit_start(dd, ctx->dma_addr, count, 0,
 					0, final);
 		} else {
 			ctx->sg = sg;
@@ -470,7 +619,7 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
 
 			count = ctx->bufcnt;
 			ctx->bufcnt = 0;
-			return atmel_sha_xmit_pdc(dd, sg_dma_address(ctx->sg),
+			return atmel_sha_xmit_start(dd, sg_dma_address(ctx->sg),
 					length, ctx->dma_addr, count, final);
 		}
 	}
@@ -483,7 +632,7 @@ static int atmel_sha_update_dma_start(struct atmel_sha_dev *dd)
 	ctx->flags |= SHA_FLAGS_SG;
 
 	/* next call does not fail... so no unmap in the case of error */
-	return atmel_sha_xmit_pdc(dd, sg_dma_address(ctx->sg), length, 0,
+	return atmel_sha_xmit_start(dd, sg_dma_address(ctx->sg), length, 0,
 								0, final);
 }
 
@@ -498,12 +647,13 @@ static int atmel_sha_update_dma_stop(struct atmel_sha_dev *dd)
 			if (ctx->sg)
 				ctx->offset = 0;
 		}
-		if (ctx->flags & SHA_FLAGS_PAD)
+		if (ctx->flags & SHA_FLAGS_PAD) {
 			dma_unmap_single(dd->dev, ctx->dma_addr,
-				ctx->buflen + SHA1_BLOCK_SIZE, DMA_TO_DEVICE);
+				ctx->buflen + ctx->block_size, DMA_TO_DEVICE);
+		}
 	} else {
 		dma_unmap_single(dd->dev, ctx->dma_addr, ctx->buflen +
-						SHA1_BLOCK_SIZE, DMA_TO_DEVICE);
+						ctx->block_size, DMA_TO_DEVICE);
 	}
 
 	return 0;
@@ -515,8 +665,8 @@ static int atmel_sha_update_req(struct atmel_sha_dev *dd)
 	struct atmel_sha_reqctx *ctx = ahash_request_ctx(req);
 	int err;
 
-	dev_dbg(dd->dev, "update_req: total: %u, digcnt: %d, finup: %d\n",
-		 ctx->total, ctx->digcnt, (ctx->flags & SHA_FLAGS_FINUP) != 0);
+	dev_dbg(dd->dev, "update_req: total: %u, digcnt: 0x%llx 0x%llx\n",
+		ctx->total, ctx->digcnt[1], ctx->digcnt[0]);
 
 	if (ctx->flags & SHA_FLAGS_CPU)
 		err = atmel_sha_update_cpu(dd);
@@ -524,8 +674,8 @@ static int atmel_sha_update_req(struct atmel_sha_dev *dd)
 		err = atmel_sha_update_dma_start(dd);
 
 	/* wait for dma completion before can take more data */
-	dev_dbg(dd->dev, "update: err: %d, digcnt: %d\n",
-			err, ctx->digcnt);
+	dev_dbg(dd->dev, "update: err: %d, digcnt: 0x%llx 0%llx\n",
+			err, ctx->digcnt[1], ctx->digcnt[0]);
 
 	return err;
 }
@@ -562,12 +712,21 @@ static void atmel_sha_copy_hash(struct ahash_request *req)
 	u32 *hash = (u32 *)ctx->digest;
 	int i;
 
-	if (likely(ctx->flags & SHA_FLAGS_SHA1))
+	if (ctx->flags & SHA_FLAGS_SHA1)
 		for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(u32); i++)
 			hash[i] = atmel_sha_read(ctx->dd, SHA_REG_DIGEST(i));
-	else
+	else if (ctx->flags & SHA_FLAGS_SHA224)
+		for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(u32); i++)
+			hash[i] = atmel_sha_read(ctx->dd, SHA_REG_DIGEST(i));
+	else if (ctx->flags & SHA_FLAGS_SHA256)
 		for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(u32); i++)
 			hash[i] = atmel_sha_read(ctx->dd, SHA_REG_DIGEST(i));
+	else if (ctx->flags & SHA_FLAGS_SHA384)
+		for (i = 0; i < SHA384_DIGEST_SIZE / sizeof(u32); i++)
+			hash[i] = atmel_sha_read(ctx->dd, SHA_REG_DIGEST(i));
+	else
+		for (i = 0; i < SHA512_DIGEST_SIZE / sizeof(u32); i++)
+			hash[i] = atmel_sha_read(ctx->dd, SHA_REG_DIGEST(i));
 }
 
 static void atmel_sha_copy_ready_hash(struct ahash_request *req)
@@ -577,10 +736,16 @@ static void atmel_sha_copy_ready_hash(struct ahash_request *req)
 	if (!req->result)
 		return;
 
-	if (likely(ctx->flags & SHA_FLAGS_SHA1))
+	if (ctx->flags & SHA_FLAGS_SHA1)
 		memcpy(req->result, ctx->digest, SHA1_DIGEST_SIZE);
-	else
+	else if (ctx->flags & SHA_FLAGS_SHA224)
+		memcpy(req->result, ctx->digest, SHA224_DIGEST_SIZE);
+	else if (ctx->flags & SHA_FLAGS_SHA256)
 		memcpy(req->result, ctx->digest, SHA256_DIGEST_SIZE);
+	else if (ctx->flags & SHA_FLAGS_SHA384)
+		memcpy(req->result, ctx->digest, SHA384_DIGEST_SIZE);
+	else
+		memcpy(req->result, ctx->digest, SHA512_DIGEST_SIZE);
 }
 
 static int atmel_sha_finish(struct ahash_request *req)
@@ -589,11 +754,11 @@ static int atmel_sha_finish(struct ahash_request *req)
 	struct atmel_sha_dev *dd = ctx->dd;
 	int err = 0;
 
-	if (ctx->digcnt)
+	if (ctx->digcnt[0] || ctx->digcnt[1])
 		atmel_sha_copy_ready_hash(req);
 
-	dev_dbg(dd->dev, "digcnt: %d, bufcnt: %d\n", ctx->digcnt,
-		ctx->bufcnt);
+	dev_dbg(dd->dev, "digcnt: 0x%llx 0x%llx, bufcnt: %d\n", ctx->digcnt[1],
+		ctx->digcnt[0], ctx->bufcnt);
 
 	return err;
 }
@@ -628,9 +793,8 @@ static int atmel_sha_hw_init(struct atmel_sha_dev *dd)
 {
 	clk_prepare_enable(dd->iclk);
 
-	if (SHA_FLAGS_INIT & dd->flags) {
+	if (!(SHA_FLAGS_INIT & dd->flags)) {
 		atmel_sha_write(dd, SHA_CR, SHA_CR_SWRST);
-		atmel_sha_dualbuff_test(dd);
 		dd->flags |= SHA_FLAGS_INIT;
 		dd->err = 0;
 	}
@@ -638,6 +802,23 @@ static int atmel_sha_hw_init(struct atmel_sha_dev *dd)
 	return 0;
 }
 
+static inline unsigned int atmel_sha_get_version(struct atmel_sha_dev *dd)
+{
+	return atmel_sha_read(dd, SHA_HW_VERSION) & 0x00000fff;
+}
+
+static void atmel_sha_hw_version_init(struct atmel_sha_dev *dd)
+{
+	atmel_sha_hw_init(dd);
+
+	dd->hw_version = atmel_sha_get_version(dd);
+
+	dev_info(dd->dev,
+			"version: 0x%x\n", dd->hw_version);
+
+	clk_disable_unprepare(dd->iclk);
+}
+
 static int atmel_sha_handle_queue(struct atmel_sha_dev *dd,
 				  struct ahash_request *req)
 {
@@ -682,10 +863,9 @@ static int atmel_sha_handle_queue(struct atmel_sha_dev *dd,
 
 	if (ctx->op == SHA_OP_UPDATE) {
 		err = atmel_sha_update_req(dd);
-		if (err != -EINPROGRESS && (ctx->flags & SHA_FLAGS_FINUP)) {
+		if (err != -EINPROGRESS && (ctx->flags & SHA_FLAGS_FINUP))
 			/* no final() after finup() */
 			err = atmel_sha_final_req(dd);
-		}
 	} else if (ctx->op == SHA_OP_FINAL) {
 		err = atmel_sha_final_req(dd);
 	}
@@ -808,7 +988,7 @@ static int atmel_sha_cra_init_alg(struct crypto_tfm *tfm, const char *alg_base)
 	}
 	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
 				 sizeof(struct atmel_sha_reqctx) +
-				 SHA_BUFFER_LEN + SHA256_BLOCK_SIZE);
+				 SHA_BUFFER_LEN + SHA512_BLOCK_SIZE);
 
 	return 0;
 }
@@ -826,7 +1006,7 @@ static void atmel_sha_cra_exit(struct crypto_tfm *tfm)
 	tctx->fallback = NULL;
 }
 
-static struct ahash_alg sha_algs[] = {
+static struct ahash_alg sha_1_256_algs[] = {
 {
 	.init		= atmel_sha_init,
 	.update		= atmel_sha_update,
@@ -875,6 +1055,79 @@ static struct ahash_alg sha_algs[] = {
 },
 };
 
+static struct ahash_alg sha_224_alg = {
+	.init		= atmel_sha_init,
+	.update		= atmel_sha_update,
+	.final		= atmel_sha_final,
+	.finup		= atmel_sha_finup,
+	.digest		= atmel_sha_digest,
+	.halg = {
+		.digestsize	= SHA224_DIGEST_SIZE,
+		.base	= {
+			.cra_name		= "sha224",
+			.cra_driver_name	= "atmel-sha224",
+			.cra_priority		= 100,
+			.cra_flags		= CRYPTO_ALG_ASYNC |
+						CRYPTO_ALG_NEED_FALLBACK,
+			.cra_blocksize		= SHA224_BLOCK_SIZE,
+			.cra_ctxsize		= sizeof(struct atmel_sha_ctx),
+			.cra_alignmask		= 0,
+			.cra_module		= THIS_MODULE,
+			.cra_init		= atmel_sha_cra_init,
+			.cra_exit		= atmel_sha_cra_exit,
+		}
+	}
+};
+
+static struct ahash_alg sha_384_512_algs[] = {
+{
+	.init		= atmel_sha_init,
+	.update		= atmel_sha_update,
+	.final		= atmel_sha_final,
+	.finup		= atmel_sha_finup,
+	.digest		= atmel_sha_digest,
+	.halg = {
+		.digestsize	= SHA384_DIGEST_SIZE,
+		.base	= {
+			.cra_name		= "sha384",
+			.cra_driver_name	= "atmel-sha384",
+			.cra_priority		= 100,
+			.cra_flags		= CRYPTO_ALG_ASYNC |
+						CRYPTO_ALG_NEED_FALLBACK,
+			.cra_blocksize		= SHA384_BLOCK_SIZE,
+			.cra_ctxsize		= sizeof(struct atmel_sha_ctx),
+			.cra_alignmask		= 0x3,
+			.cra_module		= THIS_MODULE,
+			.cra_init		= atmel_sha_cra_init,
+			.cra_exit		= atmel_sha_cra_exit,
+		}
+	}
+},
+{
+	.init		= atmel_sha_init,
+	.update		= atmel_sha_update,
+	.final		= atmel_sha_final,
+	.finup		= atmel_sha_finup,
+	.digest		= atmel_sha_digest,
+	.halg = {
+		.digestsize	= SHA512_DIGEST_SIZE,
+		.base	= {
+			.cra_name		= "sha512",
+			.cra_driver_name	= "atmel-sha512",
+			.cra_priority		= 100,
+			.cra_flags		= CRYPTO_ALG_ASYNC |
+						CRYPTO_ALG_NEED_FALLBACK,
+			.cra_blocksize		= SHA512_BLOCK_SIZE,
+			.cra_ctxsize		= sizeof(struct atmel_sha_ctx),
+			.cra_alignmask		= 0x3,
+			.cra_module		= THIS_MODULE,
+			.cra_init		= atmel_sha_cra_init,
+			.cra_exit		= atmel_sha_cra_exit,
+		}
+	}
+},
+};
+
 static void atmel_sha_done_task(unsigned long data)
 {
 	struct atmel_sha_dev *dd = (struct atmel_sha_dev *)data;
@@ -941,32 +1194,142 @@ static void atmel_sha_unregister_algs(struct atmel_sha_dev *dd)
 {
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(sha_algs); i++)
-		crypto_unregister_ahash(&sha_algs[i]);
+	for (i = 0; i < ARRAY_SIZE(sha_1_256_algs); i++)
+		crypto_unregister_ahash(&sha_1_256_algs[i]);
+
+	if (dd->caps.has_sha224)
+		crypto_unregister_ahash(&sha_224_alg);
+
+	if (dd->caps.has_sha_384_512) {
+		for (i = 0; i < ARRAY_SIZE(sha_384_512_algs); i++)
+			crypto_unregister_ahash(&sha_384_512_algs[i]);
+	}
 }
 
 static int atmel_sha_register_algs(struct atmel_sha_dev *dd)
 {
 	int err, i, j;
 
-	for (i = 0; i < ARRAY_SIZE(sha_algs); i++) {
-		err = crypto_register_ahash(&sha_algs[i]);
+	for (i = 0; i < ARRAY_SIZE(sha_1_256_algs); i++) {
+		err = crypto_register_ahash(&sha_1_256_algs[i]);
 		if (err)
-			goto err_sha_algs;
+			goto err_sha_1_256_algs;
+	}
+
+	if (dd->caps.has_sha224) {
+		err = crypto_register_ahash(&sha_224_alg);
+		if (err)
+			goto err_sha_224_algs;
+	}
+
+	if (dd->caps.has_sha_384_512) {
+		for (i = 0; i < ARRAY_SIZE(sha_384_512_algs); i++) {
+			err = crypto_register_ahash(&sha_384_512_algs[i]);
+			if (err)
+				goto err_sha_384_512_algs;
+		}
 	}
 
 	return 0;
 
-err_sha_algs:
+err_sha_384_512_algs:
+	for (j = 0; j < i; j++)
+		crypto_unregister_ahash(&sha_384_512_algs[j]);
+	crypto_unregister_ahash(&sha_224_alg);
+err_sha_224_algs:
+	i = ARRAY_SIZE(sha_1_256_algs);
+err_sha_1_256_algs:
 	for (j = 0; j < i; j++)
-		crypto_unregister_ahash(&sha_algs[j]);
+		crypto_unregister_ahash(&sha_1_256_algs[j]);
 
 	return err;
 }
 
+static bool atmel_sha_filter(struct dma_chan *chan, void *slave)
+{
+	struct at_dma_slave	*sl = slave;
+
+	if (sl && sl->dma_dev == chan->device->dev) {
+		chan->private = sl;
+		return true;
+	} else {
+		return false;
+	}
+}
+
+static int atmel_sha_dma_init(struct atmel_sha_dev *dd,
+				struct crypto_platform_data *pdata)
+{
+	int err = -ENOMEM;
+	dma_cap_mask_t mask_in;
+
+	if (pdata && pdata->dma_slave->rxdata.dma_dev) {
+		/* Try to grab DMA channel */
+		dma_cap_zero(mask_in);
+		dma_cap_set(DMA_SLAVE, mask_in);
+
+		dd->dma_lch_in.chan = dma_request_channel(mask_in,
+				atmel_sha_filter, &pdata->dma_slave->rxdata);
+
+		if (!dd->dma_lch_in.chan)
+			return err;
+
+		dd->dma_lch_in.dma_conf.direction = DMA_MEM_TO_DEV;
+		dd->dma_lch_in.dma_conf.dst_addr = dd->phys_base +
+			SHA_REG_DIN(0);
+		dd->dma_lch_in.dma_conf.src_maxburst = 1;
+		dd->dma_lch_in.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		dd->dma_lch_in.dma_conf.dst_maxburst = 1;
+		dd->dma_lch_in.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		dd->dma_lch_in.dma_conf.device_fc = false;
+
+		return 0;
+	}
+
+	return -ENODEV;
+}
+
+static void atmel_sha_dma_cleanup(struct atmel_sha_dev *dd)
+{
+	dma_release_channel(dd->dma_lch_in.chan);
+}
+
+static void atmel_sha_get_cap(struct atmel_sha_dev *dd)
+{
+
+	dd->caps.has_dma = 0;
+	dd->caps.has_dualbuff = 0;
+	dd->caps.has_sha224 = 0;
+	dd->caps.has_sha_384_512 = 0;
+
+	/* keep only major version number */
+	switch (dd->hw_version & 0xff0) {
+	case 0x410:
+		dd->caps.has_dma = 1;
+		dd->caps.has_dualbuff = 1;
+		dd->caps.has_sha224 = 1;
+		dd->caps.has_sha_384_512 = 1;
+		break;
+	case 0x400:
+		dd->caps.has_dma = 1;
+		dd->caps.has_dualbuff = 1;
+		dd->caps.has_sha224 = 1;
+		break;
+	case 0x320:
+		break;
+	default:
+		dev_warn(dd->dev,
+				"Unmanaged sha version, set minimum capabilities\n");
+		break;
+	}
+}
+
 static int atmel_sha_probe(struct platform_device *pdev)
 {
 	struct atmel_sha_dev *sha_dd;
+	struct crypto_platform_data	*pdata;
 	struct device *dev = &pdev->dev;
 	struct resource *sha_res;
 	unsigned long sha_phys_size;
@@ -1018,7 +1381,7 @@ static int atmel_sha_probe(struct platform_device *pdev)
 	}
 
 	/* Initializing the clock */
-	sha_dd->iclk = clk_get(&pdev->dev, NULL);
+	sha_dd->iclk = clk_get(&pdev->dev, "sha_clk");
 	if (IS_ERR(sha_dd->iclk)) {
 		dev_err(dev, "clock initialization failed.\n");
 		err = PTR_ERR(sha_dd->iclk);
@@ -1032,6 +1395,22 @@ static int atmel_sha_probe(struct platform_device *pdev)
 		goto sha_io_err;
 	}
 
+	atmel_sha_hw_version_init(sha_dd);
+
+	atmel_sha_get_cap(sha_dd);
+
+	if (sha_dd->caps.has_dma) {
+		pdata = pdev->dev.platform_data;
+		if (!pdata) {
+			dev_err(&pdev->dev, "platform data not available\n");
+			err = -ENXIO;
+			goto err_pdata;
+		}
+		err = atmel_sha_dma_init(sha_dd, pdata);
+		if (err)
+			goto err_sha_dma;
+	}
+
 	spin_lock(&atmel_sha.lock);
 	list_add_tail(&sha_dd->list, &atmel_sha.dev_list);
 	spin_unlock(&atmel_sha.lock);
@@ -1048,6 +1427,10 @@ err_algs:
 	spin_lock(&atmel_sha.lock);
 	list_del(&sha_dd->list);
 	spin_unlock(&atmel_sha.lock);
+	if (sha_dd->caps.has_dma)
+		atmel_sha_dma_cleanup(sha_dd);
+err_sha_dma:
+err_pdata:
 	iounmap(sha_dd->io_base);
 sha_io_err:
 	clk_put(sha_dd->iclk);
@@ -1078,6 +1461,9 @@ static int atmel_sha_remove(struct platform_device *pdev)
 
 	tasklet_kill(&sha_dd->done_task);
 
+	if (sha_dd->caps.has_dma)
+		atmel_sha_dma_cleanup(sha_dd);
+
 	iounmap(sha_dd->io_base);
 
 	clk_put(sha_dd->iclk);
@@ -1102,6 +1488,6 @@ static struct platform_driver atmel_sha_driver = {
 
 module_platform_driver(atmel_sha_driver);
 
-MODULE_DESCRIPTION("Atmel SHA1/SHA256 hw acceleration support.");
+MODULE_DESCRIPTION("Atmel SHA (1/256/224/384/512) hw acceleration support.");
 MODULE_LICENSE("GPL v2");
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Nicolas Royer - Eukréa Electromatique");
 MODULE_AUTHOR("Nicolas Royer - Eukréa Electromatique");

+ 2 - 0
drivers/crypto/atmel-tdes-regs.h

@@ -69,6 +69,8 @@
 #define	TDES_XTEARNDR_XTEA_RNDS_MASK	(0x3F << 0)
 #define	TDES_XTEARNDR_XTEA_RNDS_OFFSET	0
 
+#define	TDES_HW_VERSION	0xFC
+
 #define TDES_RPR		0x100
 #define TDES_RCR		0x104
 #define TDES_TPR		0x108

+ 341 - 53
drivers/crypto/atmel-tdes.c

@@ -38,29 +38,35 @@
 #include <crypto/des.h>
 #include <crypto/hash.h>
 #include <crypto/internal/hash.h>
+#include <linux/platform_data/crypto-atmel.h>
 #include "atmel-tdes-regs.h"
 #include "atmel-tdes-regs.h"
 
 
 /* TDES flags  */
 /* TDES flags  */
-#define TDES_FLAGS_MODE_MASK		0x007f
+#define TDES_FLAGS_MODE_MASK		0x00ff
 #define TDES_FLAGS_ENCRYPT	BIT(0)
 #define TDES_FLAGS_CBC		BIT(1)
 #define TDES_FLAGS_CFB		BIT(2)
 #define TDES_FLAGS_CFB8		BIT(3)
 #define TDES_FLAGS_CFB16	BIT(4)
 #define TDES_FLAGS_CFB32	BIT(5)
-#define TDES_FLAGS_OFB		BIT(6)
+#define TDES_FLAGS_CFB64	BIT(6)
+#define TDES_FLAGS_OFB		BIT(7)
 
 #define TDES_FLAGS_INIT		BIT(16)
 #define TDES_FLAGS_FAST		BIT(17)
 #define TDES_FLAGS_BUSY		BIT(18)
+#define TDES_FLAGS_DMA		BIT(19)
 
-#define ATMEL_TDES_QUEUE_LENGTH	1
+#define ATMEL_TDES_QUEUE_LENGTH	50
 
 #define CFB8_BLOCK_SIZE		1
 #define CFB16_BLOCK_SIZE	2
 #define CFB32_BLOCK_SIZE	4
-#define CFB64_BLOCK_SIZE	8
 
+struct atmel_tdes_caps {
+	bool	has_dma;
+	u32		has_cfb_3keys;
+};
 
 struct atmel_tdes_dev;
 
@@ -70,12 +76,19 @@ struct atmel_tdes_ctx {
 	int		keylen;
 	u32		key[3*DES_KEY_SIZE / sizeof(u32)];
 	unsigned long	flags;
+
+	u16		block_size;
 };
 
 struct atmel_tdes_reqctx {
 	unsigned long mode;
 };
 
+struct atmel_tdes_dma {
+	struct dma_chan			*chan;
+	struct dma_slave_config dma_conf;
+};
+
 struct atmel_tdes_dev {
 	struct list_head	list;
 	unsigned long		phys_base;
@@ -99,8 +112,10 @@ struct atmel_tdes_dev {
 	size_t				total;
 
 	struct scatterlist	*in_sg;
+	unsigned int		nb_in_sg;
 	size_t				in_offset;
 	struct scatterlist	*out_sg;
+	unsigned int		nb_out_sg;
 	size_t				out_offset;
 
 	size_t	buflen;
@@ -109,10 +124,16 @@ struct atmel_tdes_dev {
 	void	*buf_in;
 	int		dma_in;
 	dma_addr_t	dma_addr_in;
+	struct atmel_tdes_dma	dma_lch_in;
 
 	void	*buf_out;
 	int		dma_out;
 	dma_addr_t	dma_addr_out;
+	struct atmel_tdes_dma	dma_lch_out;
+
+	struct atmel_tdes_caps	caps;
+
+	u32	hw_version;
 };
 
 struct atmel_tdes_drv {
@@ -207,6 +228,31 @@ static int atmel_tdes_hw_init(struct atmel_tdes_dev *dd)
 	return 0;
 }
 
+static inline unsigned int atmel_tdes_get_version(struct atmel_tdes_dev *dd)
+{
+	return atmel_tdes_read(dd, TDES_HW_VERSION) & 0x00000fff;
+}
+
+static void atmel_tdes_hw_version_init(struct atmel_tdes_dev *dd)
+{
+	atmel_tdes_hw_init(dd);
+
+	dd->hw_version = atmel_tdes_get_version(dd);
+
+	dev_info(dd->dev,
+			"version: 0x%x\n", dd->hw_version);
+
+	clk_disable_unprepare(dd->iclk);
+}
+
+static void atmel_tdes_dma_callback(void *data)
+{
+	struct atmel_tdes_dev *dd = data;
+
+	/* dma_lch_out - completed */
+	tasklet_schedule(&dd->done_task);
+}
+
 static int atmel_tdes_write_ctrl(struct atmel_tdes_dev *dd)
 {
 	int err;
@@ -217,7 +263,9 @@ static int atmel_tdes_write_ctrl(struct atmel_tdes_dev *dd)
 	if (err)
 		return err;
 
-	atmel_tdes_write(dd, TDES_PTCR, TDES_PTCR_TXTDIS|TDES_PTCR_RXTDIS);
+	if (!dd->caps.has_dma)
+		atmel_tdes_write(dd, TDES_PTCR,
+			TDES_PTCR_TXTDIS | TDES_PTCR_RXTDIS);
 
 	/* MR register must be set before IV registers */
 	if (dd->ctx->keylen > (DES_KEY_SIZE << 1)) {
@@ -241,6 +289,8 @@ static int atmel_tdes_write_ctrl(struct atmel_tdes_dev *dd)
 			valmr |= TDES_MR_CFBS_16b;
 		else if (dd->flags & TDES_FLAGS_CFB32)
 			valmr |= TDES_MR_CFBS_32b;
+		else if (dd->flags & TDES_FLAGS_CFB64)
+			valmr |= TDES_MR_CFBS_64b;
 	} else if (dd->flags & TDES_FLAGS_OFB) {
 		valmr |= TDES_MR_OPMOD_OFB;
 	}
@@ -262,7 +312,7 @@ static int atmel_tdes_write_ctrl(struct atmel_tdes_dev *dd)
 	return 0;
 }
 
-static int atmel_tdes_crypt_dma_stop(struct atmel_tdes_dev *dd)
+static int atmel_tdes_crypt_pdc_stop(struct atmel_tdes_dev *dd)
 {
 	int err = 0;
 	size_t count;
@@ -288,7 +338,7 @@ static int atmel_tdes_crypt_dma_stop(struct atmel_tdes_dev *dd)
 	return err;
 }
 
-static int atmel_tdes_dma_init(struct atmel_tdes_dev *dd)
+static int atmel_tdes_buff_init(struct atmel_tdes_dev *dd)
 {
 	int err = -ENOMEM;
 
@@ -333,7 +383,7 @@ err_alloc:
 	return err;
 }
 
-static void atmel_tdes_dma_cleanup(struct atmel_tdes_dev *dd)
+static void atmel_tdes_buff_cleanup(struct atmel_tdes_dev *dd)
 {
 	dma_unmap_single(dd->dev, dd->dma_addr_out, dd->buflen,
 			 DMA_FROM_DEVICE);
@@ -343,7 +393,7 @@ static void atmel_tdes_dma_cleanup(struct atmel_tdes_dev *dd)
 	free_page((unsigned long)dd->buf_in);
 }
 
-static int atmel_tdes_crypt_dma(struct crypto_tfm *tfm, dma_addr_t dma_addr_in,
+static int atmel_tdes_crypt_pdc(struct crypto_tfm *tfm, dma_addr_t dma_addr_in,
 			       dma_addr_t dma_addr_out, int length)
 {
 	struct atmel_tdes_ctx *ctx = crypto_tfm_ctx(tfm);
@@ -379,7 +429,76 @@ static int atmel_tdes_crypt_dma(struct crypto_tfm *tfm, dma_addr_t dma_addr_in,
 	return 0;
 }
 
-static int atmel_tdes_crypt_dma_start(struct atmel_tdes_dev *dd)
+static int atmel_tdes_crypt_dma(struct crypto_tfm *tfm, dma_addr_t dma_addr_in,
+			       dma_addr_t dma_addr_out, int length)
+{
+	struct atmel_tdes_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct atmel_tdes_dev *dd = ctx->dd;
+	struct scatterlist sg[2];
+	struct dma_async_tx_descriptor	*in_desc, *out_desc;
+
+	dd->dma_size = length;
+
+	if (!(dd->flags & TDES_FLAGS_FAST)) {
+		dma_sync_single_for_device(dd->dev, dma_addr_in, length,
+					   DMA_TO_DEVICE);
+	}
+
+	if (dd->flags & TDES_FLAGS_CFB8) {
+		dd->dma_lch_in.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_1_BYTE;
+		dd->dma_lch_out.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_1_BYTE;
+	} else if (dd->flags & TDES_FLAGS_CFB16) {
+		dd->dma_lch_in.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_2_BYTES;
+		dd->dma_lch_out.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_2_BYTES;
+	} else {
+		dd->dma_lch_in.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		dd->dma_lch_out.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+	}
+
+	dmaengine_slave_config(dd->dma_lch_in.chan, &dd->dma_lch_in.dma_conf);
+	dmaengine_slave_config(dd->dma_lch_out.chan, &dd->dma_lch_out.dma_conf);
+
+	dd->flags |= TDES_FLAGS_DMA;
+
+	sg_init_table(&sg[0], 1);
+	sg_dma_address(&sg[0]) = dma_addr_in;
+	sg_dma_len(&sg[0]) = length;
+
+	sg_init_table(&sg[1], 1);
+	sg_dma_address(&sg[1]) = dma_addr_out;
+	sg_dma_len(&sg[1]) = length;
+
+	in_desc = dmaengine_prep_slave_sg(dd->dma_lch_in.chan, &sg[0],
+				1, DMA_MEM_TO_DEV,
+				DMA_PREP_INTERRUPT  |  DMA_CTRL_ACK);
+	if (!in_desc)
+		return -EINVAL;
+
+	out_desc = dmaengine_prep_slave_sg(dd->dma_lch_out.chan, &sg[1],
+				1, DMA_DEV_TO_MEM,
+				DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+	if (!out_desc)
+		return -EINVAL;
+
+	out_desc->callback = atmel_tdes_dma_callback;
+	out_desc->callback_param = dd;
+
+	dmaengine_submit(out_desc);
+	dma_async_issue_pending(dd->dma_lch_out.chan);
+
+	dmaengine_submit(in_desc);
+	dma_async_issue_pending(dd->dma_lch_in.chan);
+
+	return 0;
+}
+
+static int atmel_tdes_crypt_start(struct atmel_tdes_dev *dd)
 {
 	struct crypto_tfm *tfm = crypto_ablkcipher_tfm(
 					crypto_ablkcipher_reqtfm(dd->req));
@@ -387,23 +506,23 @@ static int atmel_tdes_crypt_dma_start(struct atmel_tdes_dev *dd)
 	size_t count;
 	dma_addr_t addr_in, addr_out;
 
-	if (sg_is_last(dd->in_sg) && sg_is_last(dd->out_sg)) {
+	if ((!dd->in_offset) && (!dd->out_offset)) {
 		/* check for alignment */
-		in = IS_ALIGNED((u32)dd->in_sg->offset, sizeof(u32));
-		out = IS_ALIGNED((u32)dd->out_sg->offset, sizeof(u32));
-
+		in = IS_ALIGNED((u32)dd->in_sg->offset, sizeof(u32)) &&
+			IS_ALIGNED(dd->in_sg->length, dd->ctx->block_size);
+		out = IS_ALIGNED((u32)dd->out_sg->offset, sizeof(u32)) &&
+			IS_ALIGNED(dd->out_sg->length, dd->ctx->block_size);
 		fast = in && out;
+
+		if (sg_dma_len(dd->in_sg) != sg_dma_len(dd->out_sg))
+			fast = 0;
 	}
 
+
 	if (fast)  {
 		count = min(dd->total, sg_dma_len(dd->in_sg));
 		count = min(count, sg_dma_len(dd->out_sg));
 
-		if (count != dd->total) {
-			pr_err("request length != buffer length\n");
-			return -EINVAL;
-		}
-
 		err = dma_map_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
 		if (!err) {
 			dev_err(dd->dev, "dma_map_sg() error\n");
@@ -433,13 +552,16 @@ static int atmel_tdes_crypt_dma_start(struct atmel_tdes_dev *dd)
 		addr_out = dd->dma_addr_out;
 
 		dd->flags &= ~TDES_FLAGS_FAST;
-
 	}
 
 	dd->total -= count;
 
-	err = atmel_tdes_crypt_dma(tfm, addr_in, addr_out, count);
-	if (err) {
+	if (dd->caps.has_dma)
+		err = atmel_tdes_crypt_dma(tfm, addr_in, addr_out, count);
+	else
+		err = atmel_tdes_crypt_pdc(tfm, addr_in, addr_out, count);
+
+	if (err && (dd->flags & TDES_FLAGS_FAST)) {
 		dma_unmap_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
 		dma_unmap_sg(dd->dev, dd->out_sg, 1, DMA_FROM_DEVICE);
 	}
@@ -447,7 +569,6 @@ static int atmel_tdes_crypt_dma_start(struct atmel_tdes_dev *dd)
 	return err;
 }
 
-
 static void atmel_tdes_finish_req(struct atmel_tdes_dev *dd, int err)
 {
 	struct ablkcipher_request *req = dd->req;
@@ -506,7 +627,7 @@ static int atmel_tdes_handle_queue(struct atmel_tdes_dev *dd,
 
 	err = atmel_tdes_write_ctrl(dd);
 	if (!err)
-		err = atmel_tdes_crypt_dma_start(dd);
+		err = atmel_tdes_crypt_start(dd);
 	if (err) {
 		/* des_task will not finish it, so do it here */
 		atmel_tdes_finish_req(dd, err);
@@ -516,41 +637,145 @@ static int atmel_tdes_handle_queue(struct atmel_tdes_dev *dd,
 	return ret;
 }
 
+static int atmel_tdes_crypt_dma_stop(struct atmel_tdes_dev *dd)
+{
+	int err = -EINVAL;
+	size_t count;
+
+	if (dd->flags & TDES_FLAGS_DMA) {
+		err = 0;
+		if  (dd->flags & TDES_FLAGS_FAST) {
+			dma_unmap_sg(dd->dev, dd->out_sg, 1, DMA_FROM_DEVICE);
+			dma_unmap_sg(dd->dev, dd->in_sg, 1, DMA_TO_DEVICE);
+		} else {
+			dma_sync_single_for_device(dd->dev, dd->dma_addr_out,
+				dd->dma_size, DMA_FROM_DEVICE);
+
+			/* copy data */
+			count = atmel_tdes_sg_copy(&dd->out_sg, &dd->out_offset,
+				dd->buf_out, dd->buflen, dd->dma_size, 1);
+			if (count != dd->dma_size) {
+				err = -EINVAL;
+				pr_err("not all data converted: %u\n", count);
+			}
+		}
+	}
+	return err;
+}
 
 static int atmel_tdes_crypt(struct ablkcipher_request *req, unsigned long mode)
 {
 	struct atmel_tdes_ctx *ctx = crypto_ablkcipher_ctx(
 			crypto_ablkcipher_reqtfm(req));
 	struct atmel_tdes_reqctx *rctx = ablkcipher_request_ctx(req);
-	struct atmel_tdes_dev *dd;
 
 	if (mode & TDES_FLAGS_CFB8) {
 		if (!IS_ALIGNED(req->nbytes, CFB8_BLOCK_SIZE)) {
 			pr_err("request size is not exact amount of CFB8 blocks\n");
 			return -EINVAL;
 		}
+		ctx->block_size = CFB8_BLOCK_SIZE;
 	} else if (mode & TDES_FLAGS_CFB16) {
 		if (!IS_ALIGNED(req->nbytes, CFB16_BLOCK_SIZE)) {
 			pr_err("request size is not exact amount of CFB16 blocks\n");
 			return -EINVAL;
 		}
+		ctx->block_size = CFB16_BLOCK_SIZE;
 	} else if (mode & TDES_FLAGS_CFB32) {
 		if (!IS_ALIGNED(req->nbytes, CFB32_BLOCK_SIZE)) {
 			pr_err("request size is not exact amount of CFB32 blocks\n");
 			return -EINVAL;
 		}
-	} else if (!IS_ALIGNED(req->nbytes, DES_BLOCK_SIZE)) {
-		pr_err("request size is not exact amount of DES blocks\n");
-		return -EINVAL;
+		ctx->block_size = CFB32_BLOCK_SIZE;
+	} else {
+		if (!IS_ALIGNED(req->nbytes, DES_BLOCK_SIZE)) {
+			pr_err("request size is not exact amount of DES blocks\n");
+			return -EINVAL;
+		}
+		ctx->block_size = DES_BLOCK_SIZE;
 	}
 
-	dd = atmel_tdes_find_dev(ctx);
-	if (!dd)
+	rctx->mode = mode;
+
+	return atmel_tdes_handle_queue(ctx->dd, req);
+}
+
+static bool atmel_tdes_filter(struct dma_chan *chan, void *slave)
+{
+	struct at_dma_slave	*sl = slave;
+
+	if (sl && sl->dma_dev == chan->device->dev) {
+		chan->private = sl;
+		return true;
+	} else {
+		return false;
+	}
+}
+
+static int atmel_tdes_dma_init(struct atmel_tdes_dev *dd,
+			struct crypto_platform_data *pdata)
+{
+	int err = -ENOMEM;
+	dma_cap_mask_t mask_in, mask_out;
+
+	if (pdata && pdata->dma_slave->txdata.dma_dev &&
+		pdata->dma_slave->rxdata.dma_dev) {
+
+		/* Try to grab 2 DMA channels */
+		dma_cap_zero(mask_in);
+		dma_cap_set(DMA_SLAVE, mask_in);
+
+		dd->dma_lch_in.chan = dma_request_channel(mask_in,
+				atmel_tdes_filter, &pdata->dma_slave->rxdata);
+
+		if (!dd->dma_lch_in.chan)
+			goto err_dma_in;
+
+		dd->dma_lch_in.dma_conf.direction = DMA_MEM_TO_DEV;
+		dd->dma_lch_in.dma_conf.dst_addr = dd->phys_base +
+			TDES_IDATA1R;
+		dd->dma_lch_in.dma_conf.src_maxburst = 1;
+		dd->dma_lch_in.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		dd->dma_lch_in.dma_conf.dst_maxburst = 1;
+		dd->dma_lch_in.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		dd->dma_lch_in.dma_conf.device_fc = false;
+
+		dma_cap_zero(mask_out);
+		dma_cap_set(DMA_SLAVE, mask_out);
+		dd->dma_lch_out.chan = dma_request_channel(mask_out,
+				atmel_tdes_filter, &pdata->dma_slave->txdata);
+
+		if (!dd->dma_lch_out.chan)
+			goto err_dma_out;
+
+		dd->dma_lch_out.dma_conf.direction = DMA_DEV_TO_MEM;
+		dd->dma_lch_out.dma_conf.src_addr = dd->phys_base +
+			TDES_ODATA1R;
+		dd->dma_lch_out.dma_conf.src_maxburst = 1;
+		dd->dma_lch_out.dma_conf.src_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		dd->dma_lch_out.dma_conf.dst_maxburst = 1;
+		dd->dma_lch_out.dma_conf.dst_addr_width =
+			DMA_SLAVE_BUSWIDTH_4_BYTES;
+		dd->dma_lch_out.dma_conf.device_fc = false;
+
+		return 0;
+	} else {
 		return -ENODEV;
+	}
 
-	rctx->mode = mode;
+err_dma_out:
+	dma_release_channel(dd->dma_lch_in.chan);
+err_dma_in:
+	return err;
+}
 
 
-	return atmel_tdes_handle_queue(dd, req);
+static void atmel_tdes_dma_cleanup(struct atmel_tdes_dev *dd)
+{
+	dma_release_channel(dd->dma_lch_in.chan);
+	dma_release_channel(dd->dma_lch_out.chan);
 }
 
 static int atmel_des_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
@@ -590,7 +815,8 @@ static int atmel_tdes_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
 	/*
 	 * HW bug in cfb 3-keys mode.
 	 */
-	if (strstr(alg_name, "cfb") && (keylen != 2*DES_KEY_SIZE)) {
+	if (!ctx->dd->caps.has_cfb_3keys && strstr(alg_name, "cfb")
+			&& (keylen != 2*DES_KEY_SIZE)) {
 		crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
 		return -EINVAL;
 	} else if ((keylen != 2*DES_KEY_SIZE) && (keylen != 3*DES_KEY_SIZE)) {
@@ -678,8 +904,15 @@ static int atmel_tdes_ofb_decrypt(struct ablkcipher_request *req)
 
 static int atmel_tdes_cra_init(struct crypto_tfm *tfm)
 {
+	struct atmel_tdes_ctx *ctx = crypto_tfm_ctx(tfm);
+	struct atmel_tdes_dev *dd;
+
 	tfm->crt_ablkcipher.reqsize = sizeof(struct atmel_tdes_reqctx);
 
+	dd = atmel_tdes_find_dev(ctx);
+	if (!dd)
+		return -ENODEV;
+
 	return 0;
 }
 
@@ -695,7 +928,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= DES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x7,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -715,7 +948,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= DES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x7,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -736,7 +969,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= DES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x7,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -778,7 +1011,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= CFB16_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x1,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -799,7 +1032,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= CFB32_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x3,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -820,7 +1053,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= DES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x7,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -841,7 +1074,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= DES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x7,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -861,7 +1094,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= DES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x7,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -882,7 +1115,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= DES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x7,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -924,7 +1157,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= CFB16_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x1,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -945,7 +1178,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= CFB32_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x3,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -966,7 +1199,7 @@ static struct crypto_alg tdes_algs[] = {
 	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
 	.cra_blocksize		= DES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct atmel_tdes_ctx),
-	.cra_alignmask		= 0,
+	.cra_alignmask		= 0x7,
 	.cra_type		= &crypto_ablkcipher_type,
 	.cra_module		= THIS_MODULE,
 	.cra_init		= atmel_tdes_cra_init,
@@ -994,14 +1227,24 @@ static void atmel_tdes_done_task(unsigned long data)
 	struct atmel_tdes_dev *dd = (struct atmel_tdes_dev *) data;
 	int err;
 
-	err = atmel_tdes_crypt_dma_stop(dd);
+	if (!(dd->flags & TDES_FLAGS_DMA))
+		err = atmel_tdes_crypt_pdc_stop(dd);
+	else
+		err = atmel_tdes_crypt_dma_stop(dd);
 
 	err = dd->err ? : err;
 
 	if (dd->total && !err) {
-		err = atmel_tdes_crypt_dma_start(dd);
+		if (dd->flags & TDES_FLAGS_FAST) {
+			dd->in_sg = sg_next(dd->in_sg);
+			dd->out_sg = sg_next(dd->out_sg);
+			if (!dd->in_sg || !dd->out_sg)
+				err = -EINVAL;
+		}
 		if (!err)
-			return;
+			err = atmel_tdes_crypt_start(dd);
+		if (!err)
+			return; /* DMA started. Not finishing. */
 	}
 
 	atmel_tdes_finish_req(dd, err);
@@ -1053,9 +1296,31 @@ err_tdes_algs:
 	return err;
 }
 
+static void atmel_tdes_get_cap(struct atmel_tdes_dev *dd)
+{
+
+	dd->caps.has_dma = 0;
+	dd->caps.has_cfb_3keys = 0;
+
+	/* keep only major version number */
+	switch (dd->hw_version & 0xf00) {
+	case 0x700:
+		dd->caps.has_dma = 1;
+		dd->caps.has_cfb_3keys = 1;
+		break;
+	case 0x600:
+		break;
+	default:
+		dev_warn(dd->dev,
+				"Unmanaged tdes version, set minimum capabilities\n");
+		break;
+	}
+}
+
 static int atmel_tdes_probe(struct platform_device *pdev)
 {
 	struct atmel_tdes_dev *tdes_dd;
+	struct crypto_platform_data	*pdata;
 	struct device *dev = &pdev->dev;
 	struct resource *tdes_res;
 	unsigned long tdes_phys_size;
@@ -1109,7 +1374,7 @@ static int atmel_tdes_probe(struct platform_device *pdev)
 	}
 
 	/* Initializing the clock */
-	tdes_dd->iclk = clk_get(&pdev->dev, NULL);
+	tdes_dd->iclk = clk_get(&pdev->dev, "tdes_clk");
 	if (IS_ERR(tdes_dd->iclk)) {
 		dev_err(dev, "clock initialization failed.\n");
 		err = PTR_ERR(tdes_dd->iclk);
@@ -1123,9 +1388,25 @@ static int atmel_tdes_probe(struct platform_device *pdev)
 		goto tdes_io_err;
 	}
 
-	err = atmel_tdes_dma_init(tdes_dd);
+	atmel_tdes_hw_version_init(tdes_dd);
+
+	atmel_tdes_get_cap(tdes_dd);
+
+	err = atmel_tdes_buff_init(tdes_dd);
 	if (err)
-		goto err_tdes_dma;
+		goto err_tdes_buff;
+
+	if (tdes_dd->caps.has_dma) {
+		pdata = pdev->dev.platform_data;
+		if (!pdata) {
+			dev_err(&pdev->dev, "platform data not available\n");
+			err = -ENXIO;
+			goto err_pdata;
+		}
+		err = atmel_tdes_dma_init(tdes_dd, pdata);
+		if (err)
+			goto err_tdes_dma;
+	}
 
 	spin_lock(&atmel_tdes.lock);
 	list_add_tail(&tdes_dd->list, &atmel_tdes.dev_list);
@@ -1143,8 +1424,12 @@ err_algs:
 	spin_lock(&atmel_tdes.lock);
 	list_del(&tdes_dd->list);
 	spin_unlock(&atmel_tdes.lock);
-	atmel_tdes_dma_cleanup(tdes_dd);
+	if (tdes_dd->caps.has_dma)
+		atmel_tdes_dma_cleanup(tdes_dd);
 err_tdes_dma:
+err_pdata:
+	atmel_tdes_buff_cleanup(tdes_dd);
+err_tdes_buff:
 	iounmap(tdes_dd->io_base);
 tdes_io_err:
 	clk_put(tdes_dd->iclk);
@@ -1178,7 +1463,10 @@ static int atmel_tdes_remove(struct platform_device *pdev)
 	tasklet_kill(&tdes_dd->done_task);
 	tasklet_kill(&tdes_dd->queue_task);
 
-	atmel_tdes_dma_cleanup(tdes_dd);
+	if (tdes_dd->caps.has_dma)
+		atmel_tdes_dma_cleanup(tdes_dd);
+
+	atmel_tdes_buff_cleanup(tdes_dd);
 
 	iounmap(tdes_dd->io_base);
 

+ 3 - 3
drivers/crypto/bfin_crc.c

@@ -151,7 +151,7 @@ static int bfin_crypto_crc_init(struct ahash_request *req)
 	struct bfin_crypto_crc_reqctx *ctx = ahash_request_ctx(req);
 	struct bfin_crypto_crc *crc;
 
-	dev_dbg(crc->dev, "crc_init\n");
+	dev_dbg(ctx->crc->dev, "crc_init\n");
 	spin_lock_bh(&crc_list.lock);
 	list_for_each_entry(crc, &crc_list.dev_list, list) {
 		crc_ctx->crc = crc;
@@ -160,7 +160,7 @@ static int bfin_crypto_crc_init(struct ahash_request *req)
 	spin_unlock_bh(&crc_list.lock);
 
 	if (sg_count(req->src) > CRC_MAX_DMA_DESC) {
-		dev_dbg(crc->dev, "init: requested sg list is too big > %d\n",
+		dev_dbg(ctx->crc->dev, "init: requested sg list is too big > %d\n",
 			CRC_MAX_DMA_DESC);
 		return -EINVAL;
 	}
@@ -175,7 +175,7 @@ static int bfin_crypto_crc_init(struct ahash_request *req)
 	/* init crc results */
 	put_unaligned_le32(crc_ctx->key, req->result);
 
-	dev_dbg(crc->dev, "init: digest size: %d\n",
+	dev_dbg(ctx->crc->dev, "init: digest size: %d\n",
 		crypto_ahash_digestsize(tfm));
 
 	return bfin_crypto_crc_init_hw(crc, crc_ctx->key);

+ 1 - 1
drivers/crypto/caam/Kconfig

@@ -78,7 +78,7 @@ config CRYPTO_DEV_FSL_CAAM_AHASH_API
 	tristate "Register hash algorithm implementations with Crypto API"
 	depends on CRYPTO_DEV_FSL_CAAM
 	default y
-	select CRYPTO_AHASH
+	select CRYPTO_HASH
 	help
 	  Selecting this will offload ahash for users of the
 	  scatterlist crypto API to the SEC4 via job ring.

+ 6 - 0
drivers/crypto/caam/caamalg.c

@@ -1693,6 +1693,7 @@ static struct caam_alg_template driver_algs[] = {
 		.name = "authenc(hmac(sha224),cbc(aes))",
 		.driver_name = "authenc-hmac-sha224-cbc-aes-caam",
 		.blocksize = AES_BLOCK_SIZE,
+		.type = CRYPTO_ALG_TYPE_AEAD,
 		.template_aead = {
 			.setkey = aead_setkey,
 			.setauthsize = aead_setauthsize,
@@ -1732,6 +1733,7 @@ static struct caam_alg_template driver_algs[] = {
 		.name = "authenc(hmac(sha384),cbc(aes))",
 		.driver_name = "authenc-hmac-sha384-cbc-aes-caam",
 		.blocksize = AES_BLOCK_SIZE,
+		.type = CRYPTO_ALG_TYPE_AEAD,
 		.template_aead = {
 			.setkey = aead_setkey,
 			.setauthsize = aead_setauthsize,
@@ -1810,6 +1812,7 @@ static struct caam_alg_template driver_algs[] = {
 		.name = "authenc(hmac(sha224),cbc(des3_ede))",
 		.driver_name = "authenc-hmac-sha224-cbc-des3_ede-caam",
 		.blocksize = DES3_EDE_BLOCK_SIZE,
+		.type = CRYPTO_ALG_TYPE_AEAD,
 		.template_aead = {
 			.setkey = aead_setkey,
 			.setauthsize = aead_setauthsize,
@@ -1849,6 +1852,7 @@ static struct caam_alg_template driver_algs[] = {
 		.name = "authenc(hmac(sha384),cbc(des3_ede))",
 		.driver_name = "authenc-hmac-sha384-cbc-des3_ede-caam",
 		.blocksize = DES3_EDE_BLOCK_SIZE,
+		.type = CRYPTO_ALG_TYPE_AEAD,
 		.template_aead = {
 			.setkey = aead_setkey,
 			.setauthsize = aead_setauthsize,
@@ -1926,6 +1930,7 @@ static struct caam_alg_template driver_algs[] = {
 		.name = "authenc(hmac(sha224),cbc(des))",
 		.driver_name = "authenc-hmac-sha224-cbc-des-caam",
 		.blocksize = DES_BLOCK_SIZE,
+		.type = CRYPTO_ALG_TYPE_AEAD,
 		.template_aead = {
 			.setkey = aead_setkey,
 			.setauthsize = aead_setauthsize,
@@ -1965,6 +1970,7 @@ static struct caam_alg_template driver_algs[] = {
 		.name = "authenc(hmac(sha384),cbc(des))",
 		.driver_name = "authenc-hmac-sha384-cbc-des-caam",
 		.blocksize = DES_BLOCK_SIZE,
+		.type = CRYPTO_ALG_TYPE_AEAD,
 		.template_aead = {
 			.setkey = aead_setkey,
 			.setauthsize = aead_setauthsize,

+ 2 - 2
drivers/crypto/caam/caamhash.c

@@ -411,7 +411,7 @@ static int ahash_set_sh_desc(struct crypto_ahash *ahash)
 	return 0;
 }
 
-static u32 gen_split_hash_key(struct caam_hash_ctx *ctx, const u8 *key_in,
+static int gen_split_hash_key(struct caam_hash_ctx *ctx, const u8 *key_in,
 			      u32 keylen)
 {
 	return gen_split_key(ctx->jrdev, ctx->key, ctx->split_key_len,
@@ -420,7 +420,7 @@ static u32 gen_split_hash_key(struct caam_hash_ctx *ctx, const u8 *key_in,
 }
 
 /* Digest hash size if it is too large */
-static u32 hash_digest_key(struct caam_hash_ctx *ctx, const u8 *key_in,
+static int hash_digest_key(struct caam_hash_ctx *ctx, const u8 *key_in,
 			   u32 *keylen, u8 *key_out, u32 digestsize)
 {
 	struct device *jrdev = ctx->jrdev;

+ 3 - 0
drivers/crypto/caam/ctrl.c

@@ -304,6 +304,9 @@ static int caam_probe(struct platform_device *pdev)
 			caam_remove(pdev);
 			return ret;
 		}
+
+		/* Enable RDB bit so that RNG works faster */
+		setbits32(&topregs->ctrl.scfgr, SCFGR_RDBENABLE);
 	}
 
 	/* NOTE: RTIC detection ought to go here, around Si time */

+ 5 - 5
drivers/crypto/caam/error.c

@@ -36,7 +36,7 @@ static void report_jump_idx(u32 status, char *outstr)
 
 static void report_ccb_status(u32 status, char *outstr)
 {
-	char *cha_id_list[] = {
+	static const char * const cha_id_list[] = {
 		"",
 		"AES",
 		"DES",
@@ -51,7 +51,7 @@ static void report_ccb_status(u32 status, char *outstr)
 		"ZUCE",
 		"ZUCA",
 	};
-	char *err_id_list[] = {
+	static const char * const err_id_list[] = {
 		"No error.",
 		"Mode error.",
 		"Data size error.",
@@ -69,7 +69,7 @@ static void report_ccb_status(u32 status, char *outstr)
 		"Invalid CHA combination was selected",
 		"Invalid CHA selected.",
 	};
-	char *rng_err_id_list[] = {
+	static const char * const rng_err_id_list[] = {
 		"",
 		"",
 		"",
@@ -117,7 +117,7 @@ static void report_jump_status(u32 status, char *outstr)
 
 static void report_deco_status(u32 status, char *outstr)
 {
-	const struct {
+	static const struct {
 		u8 value;
 		char *error_text;
 	} desc_error_list[] = {
@@ -245,7 +245,7 @@ static void report_cond_code_status(u32 status, char *outstr)
 
 char *caam_jr_strstatus(char *outstr, u32 status)
 {
-	struct stat_src {
+	static const struct stat_src {
 		void (*report_ssed)(u32 status, char *outstr);
 		char *error;
 	} status_src[] = {

+ 1 - 0
drivers/crypto/caam/intern.h

@@ -41,6 +41,7 @@ struct caam_jrentry_info {
 /* Private sub-storage for a single JobR */
 struct caam_drv_private_jr {
 	struct device *parentdev;	/* points back to controller dev */
+	struct platform_device *jr_pdev;/* points to platform device for JR */
 	int ridx;
 	struct caam_job_ring __iomem *rregs;	/* JobR's register space */
 	struct tasklet_struct irqtask;

+ 4 - 0
drivers/crypto/caam/jr.c

@@ -407,6 +407,7 @@ int caam_jr_shutdown(struct device *dev)
 	dma_free_coherent(dev, sizeof(struct jr_outentry) * JOBR_DEPTH,
 			  jrp->outring, outbusaddr);
 	kfree(jrp->entinfo);
+	of_device_unregister(jrp->jr_pdev);
 
 	return ret;
 }
@@ -454,6 +455,8 @@ int caam_jr_probe(struct platform_device *pdev, struct device_node *np,
 		kfree(jrpriv);
 		return -EINVAL;
 	}
+
+	jrpriv->jr_pdev = jr_pdev;
 	jrdev = &jr_pdev->dev;
 	dev_set_drvdata(jrdev, jrpriv);
 	ctrlpriv->jrdev[ring] = jrdev;
@@ -472,6 +475,7 @@ int caam_jr_probe(struct platform_device *pdev, struct device_node *np,
 	/* Now do the platform independent part */
 	error = caam_jr_init(jrdev); /* now turn on hardware */
 	if (error) {
+		of_device_unregister(jr_pdev);
 		kfree(jrpriv);
 		return error;
 	}

+ 1 - 1
drivers/crypto/caam/key_gen.c

@@ -44,7 +44,7 @@ Split key generation-----------------------------------------------
 [06] 0x64260028    fifostr: class2 mdsplit-jdk len=40
 			@0xffe04000
 */
-u32 gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len,
+int gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len,
 		  int split_key_pad_len, const u8 *key_in, u32 keylen,
 		  u32 alg_op)
 {

+ 1 - 1
drivers/crypto/caam/key_gen.h

@@ -12,6 +12,6 @@ struct split_key_result {
 
 void split_key_done(struct device *dev, u32 *desc, u32 err, void *context);
 
-u32 gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len,
+int gen_split_key(struct device *jrdev, u8 *key_out, int split_key_len,
 		    int split_key_pad_len, const u8 *key_in, u32 keylen,
 		    u32 alg_op);

+ 3 - 1
drivers/crypto/caam/regs.h

@@ -252,7 +252,8 @@ struct caam_ctrl {
 	/* Read/Writable					        */
 	u32 rsvd1;
 	u32 mcr;		/* MCFG      Master Config Register  */
-	u32 rsvd2[2];
+	u32 rsvd2;
+	u32 scfgr;		/* SCFGR, Security Config Register */
 
 	/* Bus Access Configuration Section			010-11f */
 	/* Read/Writable                                                */
@@ -299,6 +300,7 @@ struct caam_ctrl {
 #define MCFGR_WDFAIL		0x20000000 /* DECO watchdog force-fail */
 #define MCFGR_DMA_RESET		0x10000000
 #define MCFGR_LONG_PTR		0x00010000 /* Use >32-bit desc addressing */
+#define SCFGR_RDBENABLE		0x00000400
 
 /* AXI read cache control */
 #define MCFGR_ARCACHE_SHIFT	12

+ 2 - 13
drivers/crypto/omap-aes.c

@@ -636,7 +636,7 @@ static void omap_aes_finish_req(struct omap_aes_dev *dd, int err)
 
 	pr_debug("err: %d\n", err);
 
-	pm_runtime_put_sync(dd->dev);
+	pm_runtime_put(dd->dev);
 	dd->flags &= ~FLAGS_BUSY;
 
 	req->base.complete(&req->base, err);
@@ -1248,18 +1248,7 @@ static struct platform_driver omap_aes_driver = {
 	},
 };
 
-static int __init omap_aes_mod_init(void)
-{
-	return  platform_driver_register(&omap_aes_driver);
-}
-
-static void __exit omap_aes_mod_exit(void)
-{
-	platform_driver_unregister(&omap_aes_driver);
-}
-
-module_init(omap_aes_mod_init);
-module_exit(omap_aes_mod_exit);
+module_platform_driver(omap_aes_driver);
 
 MODULE_DESCRIPTION("OMAP AES hw acceleration support.");
 MODULE_LICENSE("GPL v2");

+ 2 - 13
drivers/crypto/omap-sham.c

@@ -923,7 +923,7 @@ static void omap_sham_finish_req(struct ahash_request *req, int err)
 	dd->flags &= ~(BIT(FLAGS_BUSY) | BIT(FLAGS_FINAL) | BIT(FLAGS_CPU) |
 			BIT(FLAGS_DMA_READY) | BIT(FLAGS_OUTPUT_READY));
 
-	pm_runtime_put_sync(dd->dev);
+	pm_runtime_put(dd->dev);
 
 	if (req->base.complete)
 		req->base.complete(&req->base, err);
@@ -1813,18 +1813,7 @@ static struct platform_driver omap_sham_driver = {
 	},
 };
 
-static int __init omap_sham_mod_init(void)
-{
-	return platform_driver_register(&omap_sham_driver);
-}
-
-static void __exit omap_sham_mod_exit(void)
-{
-	platform_driver_unregister(&omap_sham_driver);
-}
-
-module_init(omap_sham_mod_init);
-module_exit(omap_sham_mod_exit);
+module_platform_driver(omap_sham_driver);
 
 MODULE_DESCRIPTION("OMAP SHA1/MD5 hw acceleration support.");
 MODULE_LICENSE("GPL v2");

+ 1 - 3
drivers/crypto/picoxcell_crypto.c

@@ -1688,8 +1688,6 @@ static const struct of_device_id spacc_of_id_table[] = {
 	{ .compatible = "picochip,spacc-l2" },
 	{}
 };
-#else /* CONFIG_OF */
-#define spacc_of_id_table NULL
 #endif /* CONFIG_OF */
 
 static bool spacc_is_compatible(struct platform_device *pdev,
@@ -1874,7 +1872,7 @@ static struct platform_driver spacc_driver = {
 #ifdef CONFIG_PM
 		.pm	= &spacc_pm_ops,
 #endif /* CONFIG_PM */
-		.of_match_table	= spacc_of_id_table,
+		.of_match_table	= of_match_ptr(spacc_of_id_table),
 	},
 	.id_table	= spacc_id_table,
 };

+ 1070 - 0
drivers/crypto/sahara.c

@@ -0,0 +1,1070 @@
+/*
+ * Cryptographic API.
+ *
+ * Support for SAHARA cryptographic accelerator.
+ *
+ * Copyright (c) 2013 Vista Silicon S.L.
+ * Author: Javier Martin <javier.martin@vista-silicon.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * Based on omap-aes.c and tegra-aes.c
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/aes.h>
+
+#include <linux/clk.h>
+#include <linux/crypto.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+
+#define SAHARA_NAME "sahara"
+#define SAHARA_VERSION_3	3
+#define SAHARA_TIMEOUT_MS	1000
+#define SAHARA_MAX_HW_DESC	2
+#define SAHARA_MAX_HW_LINK	20
+
+#define FLAGS_MODE_MASK		0x000f
+#define FLAGS_ENCRYPT		BIT(0)
+#define FLAGS_CBC		BIT(1)
+#define FLAGS_NEW_KEY		BIT(3)
+#define FLAGS_BUSY		4
+
+#define SAHARA_HDR_BASE			0x00800000
+#define SAHARA_HDR_SKHA_ALG_AES	0
+#define SAHARA_HDR_SKHA_OP_ENC		(1 << 2)
+#define SAHARA_HDR_SKHA_MODE_ECB	(0 << 3)
+#define SAHARA_HDR_SKHA_MODE_CBC	(1 << 3)
+#define SAHARA_HDR_FORM_DATA		(5 << 16)
+#define SAHARA_HDR_FORM_KEY		(8 << 16)
+#define SAHARA_HDR_LLO			(1 << 24)
+#define SAHARA_HDR_CHA_SKHA		(1 << 28)
+#define SAHARA_HDR_CHA_MDHA		(2 << 28)
+#define SAHARA_HDR_PARITY_BIT		(1 << 31)
+
+/* SAHARA can only process one request at a time */
+#define SAHARA_QUEUE_LENGTH	1
+
+#define SAHARA_REG_VERSION	0x00
+#define SAHARA_REG_DAR		0x04
+#define SAHARA_REG_CONTROL	0x08
+#define		SAHARA_CONTROL_SET_THROTTLE(x)	(((x) & 0xff) << 24)
+#define		SAHARA_CONTROL_SET_MAXBURST(x)	(((x) & 0xff) << 16)
+#define		SAHARA_CONTROL_RNG_AUTORSD	(1 << 7)
+#define		SAHARA_CONTROL_ENABLE_INT	(1 << 4)
+#define SAHARA_REG_CMD		0x0C
+#define		SAHARA_CMD_RESET		(1 << 0)
+#define		SAHARA_CMD_CLEAR_INT		(1 << 8)
+#define		SAHARA_CMD_CLEAR_ERR		(1 << 9)
+#define		SAHARA_CMD_SINGLE_STEP		(1 << 10)
+#define		SAHARA_CMD_MODE_BATCH		(1 << 16)
+#define		SAHARA_CMD_MODE_DEBUG		(1 << 18)
+#define	SAHARA_REG_STATUS	0x10
+#define		SAHARA_STATUS_GET_STATE(x)	((x) & 0x7)
+#define			SAHARA_STATE_IDLE	0
+#define			SAHARA_STATE_BUSY	1
+#define			SAHARA_STATE_ERR	2
+#define			SAHARA_STATE_FAULT	3
+#define			SAHARA_STATE_COMPLETE	4
+#define			SAHARA_STATE_COMP_FLAG	(1 << 2)
+#define		SAHARA_STATUS_DAR_FULL		(1 << 3)
+#define		SAHARA_STATUS_ERROR		(1 << 4)
+#define		SAHARA_STATUS_SECURE		(1 << 5)
+#define		SAHARA_STATUS_FAIL		(1 << 6)
+#define		SAHARA_STATUS_INIT		(1 << 7)
+#define		SAHARA_STATUS_RNG_RESEED	(1 << 8)
+#define		SAHARA_STATUS_ACTIVE_RNG	(1 << 9)
+#define		SAHARA_STATUS_ACTIVE_MDHA	(1 << 10)
+#define		SAHARA_STATUS_ACTIVE_SKHA	(1 << 11)
+#define		SAHARA_STATUS_MODE_BATCH	(1 << 16)
+#define		SAHARA_STATUS_MODE_DEDICATED	(1 << 17)
+#define		SAHARA_STATUS_MODE_DEBUG	(1 << 18)
+#define		SAHARA_STATUS_GET_ISTATE(x)	(((x) >> 24) & 0xff)
+#define SAHARA_REG_ERRSTATUS	0x14
+#define		SAHARA_ERRSTATUS_GET_SOURCE(x)	((x) & 0xf)
+#define			SAHARA_ERRSOURCE_CHA	14
+#define			SAHARA_ERRSOURCE_DMA	15
+#define		SAHARA_ERRSTATUS_DMA_DIR	(1 << 8)
+#define		SAHARA_ERRSTATUS_GET_DMASZ(x)	(((x) >> 9) & 0x3)
+#define		SAHARA_ERRSTATUS_GET_DMASRC(x) (((x) >> 13) & 0x7)
+#define		SAHARA_ERRSTATUS_GET_CHASRC(x)	(((x) >> 16) & 0xfff)
+#define		SAHARA_ERRSTATUS_GET_CHAERR(x)	(((x) >> 28) & 0x3)
+#define SAHARA_REG_FADDR	0x18
+#define SAHARA_REG_CDAR		0x1C
+#define SAHARA_REG_IDAR		0x20
+
+struct sahara_hw_desc {
+	u32		hdr;
+	u32		len1;
+	dma_addr_t	p1;
+	u32		len2;
+	dma_addr_t	p2;
+	dma_addr_t	next;
+};
+
+struct sahara_hw_link {
+	u32		len;
+	dma_addr_t	p;
+	dma_addr_t	next;
+};
+
+struct sahara_ctx {
+	struct sahara_dev *dev;
+	unsigned long flags;
+	int keylen;
+	u8 key[AES_KEYSIZE_128];
+	struct crypto_ablkcipher *fallback;
+};
+
+struct sahara_aes_reqctx {
+	unsigned long mode;
+};
+
+struct sahara_dev {
+	struct device		*device;
+	void __iomem		*regs_base;
+	struct clk		*clk_ipg;
+	struct clk		*clk_ahb;
+
+	struct sahara_ctx	*ctx;
+	spinlock_t		lock;
+	struct crypto_queue	queue;
+	unsigned long		flags;
+
+	struct tasklet_struct	done_task;
+	struct tasklet_struct	queue_task;
+
+	struct sahara_hw_desc	*hw_desc[SAHARA_MAX_HW_DESC];
+	dma_addr_t		hw_phys_desc[SAHARA_MAX_HW_DESC];
+
+	u8			*key_base;
+	dma_addr_t		key_phys_base;
+
+	u8			*iv_base;
+	dma_addr_t		iv_phys_base;
+
+	struct sahara_hw_link	*hw_link[SAHARA_MAX_HW_LINK];
+	dma_addr_t		hw_phys_link[SAHARA_MAX_HW_LINK];
+
+	struct ablkcipher_request *req;
+	size_t			total;
+	struct scatterlist	*in_sg;
+	unsigned int		nb_in_sg;
+	struct scatterlist	*out_sg;
+	unsigned int		nb_out_sg;
+
+	u32			error;
+	struct timer_list	watchdog;
+};
+
+static struct sahara_dev *dev_ptr;
+
+static inline void sahara_write(struct sahara_dev *dev, u32 data, u32 reg)
+{
+	writel(data, dev->regs_base + reg);
+}
+
+static inline unsigned int sahara_read(struct sahara_dev *dev, u32 reg)
+{
+	return readl(dev->regs_base + reg);
+}
+
+static u32 sahara_aes_key_hdr(struct sahara_dev *dev)
+{
+	u32 hdr = SAHARA_HDR_BASE | SAHARA_HDR_SKHA_ALG_AES |
+			SAHARA_HDR_FORM_KEY | SAHARA_HDR_LLO |
+			SAHARA_HDR_CHA_SKHA | SAHARA_HDR_PARITY_BIT;
+
+	if (dev->flags & FLAGS_CBC) {
+		hdr |= SAHARA_HDR_SKHA_MODE_CBC;
+		hdr ^= SAHARA_HDR_PARITY_BIT;
+	}
+
+	if (dev->flags & FLAGS_ENCRYPT) {
+		hdr |= SAHARA_HDR_SKHA_OP_ENC;
+		hdr ^= SAHARA_HDR_PARITY_BIT;
+	}
+
+	return hdr;
+}
+
+static u32 sahara_aes_data_link_hdr(struct sahara_dev *dev)
+{
+	return SAHARA_HDR_BASE | SAHARA_HDR_FORM_DATA |
+			SAHARA_HDR_CHA_SKHA | SAHARA_HDR_PARITY_BIT;
+}
+
+static int sahara_sg_length(struct scatterlist *sg,
+			    unsigned int total)
+{
+	int sg_nb;
+	unsigned int len;
+	struct scatterlist *sg_list;
+
+	sg_nb = 0;
+	sg_list = sg;
+
+	while (total) {
+		len = min(sg_list->length, total);
+
+		sg_nb++;
+		total -= len;
+
+		sg_list = sg_next(sg_list);
+		if (!sg_list)
+			total = 0;
+	}
+
+	return sg_nb;
+}
+
+static char *sahara_err_src[16] = {
+	"No error",
+	"Header error",
+	"Descriptor length error",
+	"Descriptor length or pointer error",
+	"Link length error",
+	"Link pointer error",
+	"Input buffer error",
+	"Output buffer error",
+	"Output buffer starvation",
+	"Internal state fault",
+	"General descriptor problem",
+	"Reserved",
+	"Descriptor address error",
+	"Link address error",
+	"CHA error",
+	"DMA error"
+};
+
+static char *sahara_err_dmasize[4] = {
+	"Byte transfer",
+	"Half-word transfer",
+	"Word transfer",
+	"Reserved"
+};
+
+static char *sahara_err_dmasrc[8] = {
+	"No error",
+	"AHB bus error",
+	"Internal IP bus error",
+	"Parity error",
+	"DMA crosses 256 byte boundary",
+	"DMA is busy",
+	"Reserved",
+	"DMA HW error"
+};
+
+static char *sahara_cha_errsrc[12] = {
+	"Input buffer non-empty",
+	"Illegal address",
+	"Illegal mode",
+	"Illegal data size",
+	"Illegal key size",
+	"Write during processing",
+	"CTX read during processing",
+	"HW error",
+	"Input buffer disabled/underflow",
+	"Output buffer disabled/overflow",
+	"DES key parity error",
+	"Reserved"
+};
+
+static char *sahara_cha_err[4] = { "No error", "SKHA", "MDHA", "RNG" };
+
+static void sahara_decode_error(struct sahara_dev *dev, unsigned int error)
+{
+	u8 source = SAHARA_ERRSTATUS_GET_SOURCE(error);
+	u16 chasrc = ffs(SAHARA_ERRSTATUS_GET_CHASRC(error));
+
+	dev_err(dev->device, "%s: Error Register = 0x%08x\n", __func__, error);
+
+	dev_err(dev->device, "	- %s.\n", sahara_err_src[source]);
+
+	if (source == SAHARA_ERRSOURCE_DMA) {
+		if (error & SAHARA_ERRSTATUS_DMA_DIR)
+			dev_err(dev->device, "		* DMA read.\n");
+		else
+			dev_err(dev->device, "		* DMA write.\n");
+
+		dev_err(dev->device, "		* %s.\n",
+		       sahara_err_dmasize[SAHARA_ERRSTATUS_GET_DMASZ(error)]);
+		dev_err(dev->device, "		* %s.\n",
+		       sahara_err_dmasrc[SAHARA_ERRSTATUS_GET_DMASRC(error)]);
+	} else if (source == SAHARA_ERRSOURCE_CHA) {
+		dev_err(dev->device, "		* %s.\n",
+			sahara_cha_errsrc[chasrc]);
+		dev_err(dev->device, "		* %s.\n",
+		       sahara_cha_err[SAHARA_ERRSTATUS_GET_CHAERR(error)]);
+	}
+	dev_err(dev->device, "\n");
+}
+
+static char *sahara_state[4] = { "Idle", "Busy", "Error", "HW Fault" };
+
+static void sahara_decode_status(struct sahara_dev *dev, unsigned int status)
+{
+	u8 state;
+
+	if (!IS_ENABLED(DEBUG))
+		return;
+
+	state = SAHARA_STATUS_GET_STATE(status);
+
+	dev_dbg(dev->device, "%s: Status Register = 0x%08x\n",
+		__func__, status);
+
+	dev_dbg(dev->device, "	- State = %d:\n", state);
+	if (state & SAHARA_STATE_COMP_FLAG)
+		dev_dbg(dev->device, "		* Descriptor completed. IRQ pending.\n");
+
+	dev_dbg(dev->device, "		* %s.\n",
+	       sahara_state[state & ~SAHARA_STATE_COMP_FLAG]);
+
+	if (status & SAHARA_STATUS_DAR_FULL)
+		dev_dbg(dev->device, "	- DAR Full.\n");
+	if (status & SAHARA_STATUS_ERROR)
+		dev_dbg(dev->device, "	- Error.\n");
+	if (status & SAHARA_STATUS_SECURE)
+		dev_dbg(dev->device, "	- Secure.\n");
+	if (status & SAHARA_STATUS_FAIL)
+		dev_dbg(dev->device, "	- Fail.\n");
+	if (status & SAHARA_STATUS_RNG_RESEED)
+		dev_dbg(dev->device, "	- RNG Reseed Request.\n");
+	if (status & SAHARA_STATUS_ACTIVE_RNG)
+		dev_dbg(dev->device, "	- RNG Active.\n");
+	if (status & SAHARA_STATUS_ACTIVE_MDHA)
+		dev_dbg(dev->device, "	- MDHA Active.\n");
+	if (status & SAHARA_STATUS_ACTIVE_SKHA)
+		dev_dbg(dev->device, "	- SKHA Active.\n");
+
+	if (status & SAHARA_STATUS_MODE_BATCH)
+		dev_dbg(dev->device, "	- Batch Mode.\n");
+	else if (status & SAHARA_STATUS_MODE_DEDICATED)
+		dev_dbg(dev->device, "	- Dedicated Mode.\n");
+	else if (status & SAHARA_STATUS_MODE_DEBUG)
+		dev_dbg(dev->device, "	- Debug Mode.\n");
+
+	dev_dbg(dev->device, "	- Internal state = 0x%02x\n",
+	       SAHARA_STATUS_GET_ISTATE(status));
+
+	dev_dbg(dev->device, "Current DAR: 0x%08x\n",
+		sahara_read(dev, SAHARA_REG_CDAR));
+	dev_dbg(dev->device, "Initial DAR: 0x%08x\n\n",
+		sahara_read(dev, SAHARA_REG_IDAR));
+}
+
+static void sahara_dump_descriptors(struct sahara_dev *dev)
+{
+	int i;
+
+	if (!IS_ENABLED(DEBUG))
+		return;
+
+	for (i = 0; i < SAHARA_MAX_HW_DESC; i++) {
+		dev_dbg(dev->device, "Descriptor (%d) (0x%08x):\n",
+			i, dev->hw_phys_desc[i]);
+		dev_dbg(dev->device, "\thdr = 0x%08x\n", dev->hw_desc[i]->hdr);
+		dev_dbg(dev->device, "\tlen1 = %u\n", dev->hw_desc[i]->len1);
+		dev_dbg(dev->device, "\tp1 = 0x%08x\n", dev->hw_desc[i]->p1);
+		dev_dbg(dev->device, "\tlen2 = %u\n", dev->hw_desc[i]->len2);
+		dev_dbg(dev->device, "\tp2 = 0x%08x\n", dev->hw_desc[i]->p2);
+		dev_dbg(dev->device, "\tnext = 0x%08x\n",
+			dev->hw_desc[i]->next);
+	}
+	dev_dbg(dev->device, "\n");
+}
+
+static void sahara_dump_links(struct sahara_dev *dev)
+{
+	int i;
+
+	if (!IS_ENABLED(DEBUG))
+		return;
+
+	for (i = 0; i < SAHARA_MAX_HW_LINK; i++) {
+		dev_dbg(dev->device, "Link (%d) (0x%08x):\n",
+			i, dev->hw_phys_link[i]);
+		dev_dbg(dev->device, "\tlen = %u\n", dev->hw_link[i]->len);
+		dev_dbg(dev->device, "\tp = 0x%08x\n", dev->hw_link[i]->p);
+		dev_dbg(dev->device, "\tnext = 0x%08x\n",
+			dev->hw_link[i]->next);
+	}
+	dev_dbg(dev->device, "\n");
+}
+
+static void sahara_aes_done_task(unsigned long data)
+{
+	struct sahara_dev *dev = (struct sahara_dev *)data;
+
+	dma_unmap_sg(dev->device, dev->out_sg, dev->nb_out_sg,
+		DMA_TO_DEVICE);
+	dma_unmap_sg(dev->device, dev->in_sg, dev->nb_in_sg,
+		DMA_FROM_DEVICE);
+
+	spin_lock(&dev->lock);
+	clear_bit(FLAGS_BUSY, &dev->flags);
+	spin_unlock(&dev->lock);
+
+	dev->req->base.complete(&dev->req->base, dev->error);
+}
+
+void sahara_watchdog(unsigned long data)
+{
+	struct sahara_dev *dev = (struct sahara_dev *)data;
+	unsigned int err = sahara_read(dev, SAHARA_REG_ERRSTATUS);
+	unsigned int stat = sahara_read(dev, SAHARA_REG_STATUS);
+
+	sahara_decode_status(dev, stat);
+	sahara_decode_error(dev, err);
+	dev->error = -ETIMEDOUT;
+	sahara_aes_done_task(data);
+}
+
+static int sahara_hw_descriptor_create(struct sahara_dev *dev)
+{
+	struct sahara_ctx *ctx = dev->ctx;
+	struct scatterlist *sg;
+	int ret;
+	int i, j;
+
+	/* Copy new key if necessary */
+	if (ctx->flags & FLAGS_NEW_KEY) {
+		memcpy(dev->key_base, ctx->key, ctx->keylen);
+		ctx->flags &= ~FLAGS_NEW_KEY;
+
+		if (dev->flags & FLAGS_CBC) {
+			dev->hw_desc[0]->len1 = AES_BLOCK_SIZE;
+			dev->hw_desc[0]->p1 = dev->iv_phys_base;
+		} else {
+			dev->hw_desc[0]->len1 = 0;
+			dev->hw_desc[0]->p1 = 0;
+		}
+		dev->hw_desc[0]->len2 = ctx->keylen;
+		dev->hw_desc[0]->p2 = dev->key_phys_base;
+		dev->hw_desc[0]->next = dev->hw_phys_desc[1];
+	}
+	dev->hw_desc[0]->hdr = sahara_aes_key_hdr(dev);
+
+	dev->nb_in_sg = sahara_sg_length(dev->in_sg, dev->total);
+	dev->nb_out_sg = sahara_sg_length(dev->out_sg, dev->total);
+	if ((dev->nb_in_sg + dev->nb_out_sg) > SAHARA_MAX_HW_LINK) {
+		dev_err(dev->device, "not enough hw links (%d)\n",
+			dev->nb_in_sg + dev->nb_out_sg);
+		return -EINVAL;
+	}
+
+	ret = dma_map_sg(dev->device, dev->in_sg, dev->nb_in_sg,
+			 DMA_TO_DEVICE);
+	if (ret != dev->nb_in_sg) {
+		dev_err(dev->device, "couldn't map in sg\n");
+		goto unmap_in;
+	}
+	ret = dma_map_sg(dev->device, dev->out_sg, dev->nb_out_sg,
+			 DMA_FROM_DEVICE);
+	if (ret != dev->nb_out_sg) {
+		dev_err(dev->device, "couldn't map out sg\n");
+		goto unmap_out;
+	}
+
+	/* Create input links */
+	dev->hw_desc[1]->p1 = dev->hw_phys_link[0];
+	sg = dev->in_sg;
+	for (i = 0; i < dev->nb_in_sg; i++) {
+		dev->hw_link[i]->len = sg->length;
+		dev->hw_link[i]->p = sg->dma_address;
+		if (i == (dev->nb_in_sg - 1)) {
+			dev->hw_link[i]->next = 0;
+		} else {
+			dev->hw_link[i]->next = dev->hw_phys_link[i + 1];
+			sg = sg_next(sg);
+		}
+	}
+
+	/* Create output links */
+	dev->hw_desc[1]->p2 = dev->hw_phys_link[i];
+	sg = dev->out_sg;
+	for (j = i; j < dev->nb_out_sg + i; j++) {
+		dev->hw_link[j]->len = sg->length;
+		dev->hw_link[j]->p = sg->dma_address;
+		if (j == (dev->nb_out_sg + i - 1)) {
+			dev->hw_link[j]->next = 0;
+		} else {
+			dev->hw_link[j]->next = dev->hw_phys_link[j + 1];
+			sg = sg_next(sg);
+		}
+	}
+
+	/* Fill remaining fields of hw_desc[1] */
+	dev->hw_desc[1]->hdr = sahara_aes_data_link_hdr(dev);
+	dev->hw_desc[1]->len1 = dev->total;
+	dev->hw_desc[1]->len2 = dev->total;
+	dev->hw_desc[1]->next = 0;
+
+	sahara_dump_descriptors(dev);
+	sahara_dump_links(dev);
+
+	/* Start processing descriptor chain. */
+	mod_timer(&dev->watchdog,
+		  jiffies + msecs_to_jiffies(SAHARA_TIMEOUT_MS));
+	sahara_write(dev, dev->hw_phys_desc[0], SAHARA_REG_DAR);
+
+	return 0;
+
+unmap_out:
+	dma_unmap_sg(dev->device, dev->out_sg, dev->nb_out_sg,
+		DMA_TO_DEVICE);
+unmap_in:
+	dma_unmap_sg(dev->device, dev->in_sg, dev->nb_in_sg,
+		DMA_FROM_DEVICE);
+
+	return -EINVAL;
+}
+
+static void sahara_aes_queue_task(unsigned long data)
+{
+	struct sahara_dev *dev = (struct sahara_dev *)data;
+	struct crypto_async_request *async_req, *backlog;
+	struct sahara_ctx *ctx;
+	struct sahara_aes_reqctx *rctx;
+	struct ablkcipher_request *req;
+	int ret;
+
+	spin_lock(&dev->lock);
+	backlog = crypto_get_backlog(&dev->queue);
+	async_req = crypto_dequeue_request(&dev->queue);
+	if (!async_req)
+		clear_bit(FLAGS_BUSY, &dev->flags);
+	spin_unlock(&dev->lock);
+
+	if (!async_req)
+		return;
+
+	if (backlog)
+		backlog->complete(backlog, -EINPROGRESS);
+
+	req = ablkcipher_request_cast(async_req);
+
+	/* Request is ready to be dispatched by the device */
+	dev_dbg(dev->device,
+		"dispatch request (nbytes=%d, src=%p, dst=%p)\n",
+		req->nbytes, req->src, req->dst);
+
+	/* assign new request to device */
+	dev->req = req;
+	dev->total = req->nbytes;
+	dev->in_sg = req->src;
+	dev->out_sg = req->dst;
+
+	rctx = ablkcipher_request_ctx(req);
+	ctx = crypto_ablkcipher_ctx(crypto_ablkcipher_reqtfm(req));
+	rctx->mode &= FLAGS_MODE_MASK;
+	dev->flags = (dev->flags & ~FLAGS_MODE_MASK) | rctx->mode;
+
+	if ((dev->flags & FLAGS_CBC) && req->info)
+		memcpy(dev->iv_base, req->info, AES_KEYSIZE_128);
+
+	/* assign new context to device */
+	ctx->dev = dev;
+	dev->ctx = ctx;
+
+	ret = sahara_hw_descriptor_create(dev);
+	if (ret < 0) {
+		spin_lock(&dev->lock);
+		clear_bit(FLAGS_BUSY, &dev->flags);
+		spin_unlock(&dev->lock);
+		dev->req->base.complete(&dev->req->base, ret);
+	}
+}
+
+static int sahara_aes_setkey(struct crypto_ablkcipher *tfm, const u8 *key,
+			     unsigned int keylen)
+{
+	struct sahara_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+	int ret;
+
+	ctx->keylen = keylen;
+
+	/* SAHARA only supports 128bit keys */
+	if (keylen == AES_KEYSIZE_128) {
+		memcpy(ctx->key, key, keylen);
+		ctx->flags |= FLAGS_NEW_KEY;
+		return 0;
+	}
+
+	if (keylen != AES_KEYSIZE_128 &&
+	    keylen != AES_KEYSIZE_192 && keylen != AES_KEYSIZE_256)
+		return -EINVAL;
+
+	/*
+	 * The requested key size is not supported by HW, do a fallback.
+	 */
+	ctx->fallback->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK;
+	ctx->fallback->base.crt_flags |=
+		(tfm->base.crt_flags & CRYPTO_TFM_REQ_MASK);
+
+	ret = crypto_ablkcipher_setkey(ctx->fallback, key, keylen);
+	if (ret) {
+		struct crypto_tfm *tfm_aux = crypto_ablkcipher_tfm(tfm);
+
+		tfm_aux->crt_flags &= ~CRYPTO_TFM_RES_MASK;
+		tfm_aux->crt_flags |=
+			(ctx->fallback->base.crt_flags & CRYPTO_TFM_RES_MASK);
+	}
+	return ret;
+}
+
+static int sahara_aes_crypt(struct ablkcipher_request *req, unsigned long mode)
+{
+	struct sahara_ctx *ctx = crypto_ablkcipher_ctx(
+		crypto_ablkcipher_reqtfm(req));
+	struct sahara_aes_reqctx *rctx = ablkcipher_request_ctx(req);
+	struct sahara_dev *dev = dev_ptr;
+	int err = 0;
+	int busy;
+
+	dev_dbg(dev->device, "nbytes: %d, enc: %d, cbc: %d\n",
+		req->nbytes, !!(mode & FLAGS_ENCRYPT), !!(mode & FLAGS_CBC));
+
+	if (!IS_ALIGNED(req->nbytes, AES_BLOCK_SIZE)) {
+		dev_err(dev->device,
+			"request size is not exact amount of AES blocks\n");
+		return -EINVAL;
+	}
+
+	ctx->dev = dev;
+
+	rctx->mode = mode;
+	spin_lock_bh(&dev->lock);
+	err = ablkcipher_enqueue_request(&dev->queue, req);
+	busy = test_and_set_bit(FLAGS_BUSY, &dev->flags);
+	spin_unlock_bh(&dev->lock);
+
+	if (!busy)
+		tasklet_schedule(&dev->queue_task);
+
+	return err;
+}
+
+static int sahara_aes_ecb_encrypt(struct ablkcipher_request *req)
+{
+	struct crypto_tfm *tfm =
+		crypto_ablkcipher_tfm(crypto_ablkcipher_reqtfm(req));
+	struct sahara_ctx *ctx = crypto_ablkcipher_ctx(
+		crypto_ablkcipher_reqtfm(req));
+	int err;
+
+	if (unlikely(ctx->keylen != AES_KEYSIZE_128)) {
+		ablkcipher_request_set_tfm(req, ctx->fallback);
+		err = crypto_ablkcipher_encrypt(req);
+		ablkcipher_request_set_tfm(req, __crypto_ablkcipher_cast(tfm));
+		return err;
+	}
+
+	return sahara_aes_crypt(req, FLAGS_ENCRYPT);
+}
+
+static int sahara_aes_ecb_decrypt(struct ablkcipher_request *req)
+{
+	struct crypto_tfm *tfm =
+		crypto_ablkcipher_tfm(crypto_ablkcipher_reqtfm(req));
+	struct sahara_ctx *ctx = crypto_ablkcipher_ctx(
+		crypto_ablkcipher_reqtfm(req));
+	int err;
+
+	if (unlikely(ctx->keylen != AES_KEYSIZE_128)) {
+		ablkcipher_request_set_tfm(req, ctx->fallback);
+		err = crypto_ablkcipher_decrypt(req);
+		ablkcipher_request_set_tfm(req, __crypto_ablkcipher_cast(tfm));
+		return err;
+	}
+
+	return sahara_aes_crypt(req, 0);
+}
+
+static int sahara_aes_cbc_encrypt(struct ablkcipher_request *req)
+{
+	struct crypto_tfm *tfm =
+		crypto_ablkcipher_tfm(crypto_ablkcipher_reqtfm(req));
+	struct sahara_ctx *ctx = crypto_ablkcipher_ctx(
+		crypto_ablkcipher_reqtfm(req));
+	int err;
+
+	if (unlikely(ctx->keylen != AES_KEYSIZE_128)) {
+		ablkcipher_request_set_tfm(req, ctx->fallback);
+		err = crypto_ablkcipher_encrypt(req);
+		ablkcipher_request_set_tfm(req, __crypto_ablkcipher_cast(tfm));
+		return err;
+	}
+
+	return sahara_aes_crypt(req, FLAGS_ENCRYPT | FLAGS_CBC);
+}
+
+static int sahara_aes_cbc_decrypt(struct ablkcipher_request *req)
+{
+	struct crypto_tfm *tfm =
+		crypto_ablkcipher_tfm(crypto_ablkcipher_reqtfm(req));
+	struct sahara_ctx *ctx = crypto_ablkcipher_ctx(
+		crypto_ablkcipher_reqtfm(req));
+	int err;
+
+	if (unlikely(ctx->keylen != AES_KEYSIZE_128)) {
+		ablkcipher_request_set_tfm(req, ctx->fallback);
+		err = crypto_ablkcipher_decrypt(req);
+		ablkcipher_request_set_tfm(req, __crypto_ablkcipher_cast(tfm));
+		return err;
+	}
+
+	return sahara_aes_crypt(req, FLAGS_CBC);
+}
+
+static int sahara_aes_cra_init(struct crypto_tfm *tfm)
+{
+	const char *name = tfm->__crt_alg->cra_name;
+	struct sahara_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->fallback = crypto_alloc_ablkcipher(name, 0,
+				CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK);
+	if (IS_ERR(ctx->fallback)) {
+		pr_err("Error allocating fallback algo %s\n", name);
+		return PTR_ERR(ctx->fallback);
+	}
+
+	tfm->crt_ablkcipher.reqsize = sizeof(struct sahara_aes_reqctx);
+
+	return 0;
+}
+
+static void sahara_aes_cra_exit(struct crypto_tfm *tfm)
+{
+	struct sahara_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	if (ctx->fallback)
+		crypto_free_ablkcipher(ctx->fallback);
+	ctx->fallback = NULL;
+}
+
+static struct crypto_alg aes_algs[] = {
+{
+	.cra_name		= "ecb(aes)",
+	.cra_driver_name	= "sahara-ecb-aes",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER |
+			CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct sahara_ctx),
+	.cra_alignmask		= 0x0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= sahara_aes_cra_init,
+	.cra_exit		= sahara_aes_cra_exit,
+	.cra_u.ablkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.setkey		= sahara_aes_setkey,
+		.encrypt	= sahara_aes_ecb_encrypt,
+		.decrypt	= sahara_aes_ecb_decrypt,
+	}
+}, {
+	.cra_name		= "cbc(aes)",
+	.cra_driver_name	= "sahara-cbc-aes",
+	.cra_priority		= 300,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER |
+			CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct sahara_ctx),
+	.cra_alignmask		= 0x0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_init		= sahara_aes_cra_init,
+	.cra_exit		= sahara_aes_cra_exit,
+	.cra_u.ablkcipher = {
+		.min_keysize	= AES_MIN_KEY_SIZE,
+		.max_keysize	= AES_MAX_KEY_SIZE,
+		.ivsize		= AES_BLOCK_SIZE,
+		.setkey		= sahara_aes_setkey,
+		.encrypt	= sahara_aes_cbc_encrypt,
+		.decrypt	= sahara_aes_cbc_decrypt,
+	}
+}
+};
+
+static irqreturn_t sahara_irq_handler(int irq, void *data)
+{
+	struct sahara_dev *dev = (struct sahara_dev *)data;
+	unsigned int stat = sahara_read(dev, SAHARA_REG_STATUS);
+	unsigned int err = sahara_read(dev, SAHARA_REG_ERRSTATUS);
+
+	del_timer(&dev->watchdog);
+
+	sahara_write(dev, SAHARA_CMD_CLEAR_INT | SAHARA_CMD_CLEAR_ERR,
+		     SAHARA_REG_CMD);
+
+	sahara_decode_status(dev, stat);
+
+	if (SAHARA_STATUS_GET_STATE(stat) == SAHARA_STATE_BUSY) {
+		return IRQ_NONE;
+	} else if (SAHARA_STATUS_GET_STATE(stat) == SAHARA_STATE_COMPLETE) {
+		dev->error = 0;
+	} else {
+		sahara_decode_error(dev, err);
+		dev->error = -EINVAL;
+	}
+
+	tasklet_schedule(&dev->done_task);
+
+	return IRQ_HANDLED;
+}
+
+static int sahara_register_algs(struct sahara_dev *dev)
+{
+	int err, i, j;
+
+	for (i = 0; i < ARRAY_SIZE(aes_algs); i++) {
+		INIT_LIST_HEAD(&aes_algs[i].cra_list);
+		err = crypto_register_alg(&aes_algs[i]);
+		if (err)
+			goto err_aes_algs;
+	}
+
+	return 0;
+
+err_aes_algs:
+	for (j = 0; j < i; j++)
+		crypto_unregister_alg(&aes_algs[j]);
+
+	return err;
+}
+
+static void sahara_unregister_algs(struct sahara_dev *dev)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(aes_algs); i++)
+		crypto_unregister_alg(&aes_algs[i]);
+}
+
+static struct platform_device_id sahara_platform_ids[] = {
+	{ .name = "sahara-imx27" },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(platform, sahara_platform_ids);
+
+static const struct of_device_id sahara_dt_ids[] = {
+	{ .compatible = "fsl,imx27-sahara" },
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, sahara_dt_ids);
+
+static int sahara_probe(struct platform_device *pdev)
+{
+	struct sahara_dev *dev;
+	struct resource *res;
+	u32 version;
+	int irq;
+	int err;
+	int i;
+
+	dev = devm_kzalloc(&pdev->dev, sizeof(struct sahara_dev), GFP_KERNEL);
+	if (dev == NULL) {
+		dev_err(&pdev->dev, "unable to alloc data struct.\n");
+		return -ENOMEM;
+	}
+
+	dev->device = &pdev->dev;
+	platform_set_drvdata(pdev, dev);
+
+	/* Get the base address */
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(&pdev->dev, "failed to get memory region resource\n");
+		return -ENODEV;
+	}
+
+	if (devm_request_mem_region(&pdev->dev, res->start,
+			resource_size(res), SAHARA_NAME) == NULL) {
+		dev_err(&pdev->dev, "failed to request memory region\n");
+		return -ENOENT;
+	}
+	dev->regs_base = devm_ioremap(&pdev->dev, res->start,
+				      resource_size(res));
+	if (!dev->regs_base) {
+		dev_err(&pdev->dev, "failed to ioremap address region\n");
+		return -ENOENT;
+	}
+
+	/* Get the IRQ */
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0) {
+		dev_err(&pdev->dev, "failed to get irq resource\n");
+		return irq;
+	}
+
+	if (devm_request_irq(&pdev->dev, irq, sahara_irq_handler,
+		0, SAHARA_NAME, dev) < 0) {
+		dev_err(&pdev->dev, "failed to request irq\n");
+		return -ENOENT;
+	}
+
+	/* clocks */
+	dev->clk_ipg = devm_clk_get(&pdev->dev, "ipg");
+	if (IS_ERR(dev->clk_ipg)) {
+		dev_err(&pdev->dev, "Could not get ipg clock\n");
+		return PTR_ERR(dev->clk_ipg);
+	}
+
+	dev->clk_ahb = devm_clk_get(&pdev->dev, "ahb");
+	if (IS_ERR(dev->clk_ahb)) {
+		dev_err(&pdev->dev, "Could not get ahb clock\n");
+		return PTR_ERR(dev->clk_ahb);
+	}
+
+	/* Allocate HW descriptors */
+	dev->hw_desc[0] = dma_alloc_coherent(&pdev->dev,
+			SAHARA_MAX_HW_DESC * sizeof(struct sahara_hw_desc),
+			&dev->hw_phys_desc[0], GFP_KERNEL);
+	if (!dev->hw_desc[0]) {
+		dev_err(&pdev->dev, "Could not allocate hw descriptors\n");
+		return -ENOMEM;
+	}
+	dev->hw_desc[1] = dev->hw_desc[0] + 1;
+	dev->hw_phys_desc[1] = dev->hw_phys_desc[0] +
+				sizeof(struct sahara_hw_desc);
+
+	/* Allocate space for iv and key */
+	dev->key_base = dma_alloc_coherent(&pdev->dev, 2 * AES_KEYSIZE_128,
+				&dev->key_phys_base, GFP_KERNEL);
+	if (!dev->key_base) {
+		dev_err(&pdev->dev, "Could not allocate memory for key\n");
+		err = -ENOMEM;
+		goto err_key;
+	}
+	dev->iv_base = dev->key_base + AES_KEYSIZE_128;
+	dev->iv_phys_base = dev->key_phys_base + AES_KEYSIZE_128;
+
+	/* Allocate space for HW links */
+	dev->hw_link[0] = dma_alloc_coherent(&pdev->dev,
+			SAHARA_MAX_HW_LINK * sizeof(struct sahara_hw_link),
+			&dev->hw_phys_link[0], GFP_KERNEL);
+	if (!dev->hw_link[0]) {
+		dev_err(&pdev->dev, "Could not allocate hw links\n");
+		err = -ENOMEM;
+		goto err_link;
+	}
+	for (i = 1; i < SAHARA_MAX_HW_LINK; i++) {
+		dev->hw_phys_link[i] = dev->hw_phys_link[i - 1] +
+					sizeof(struct sahara_hw_link);
+		dev->hw_link[i] = dev->hw_link[i - 1] + 1;
+	}
+
+	crypto_init_queue(&dev->queue, SAHARA_QUEUE_LENGTH);
+
+	dev_ptr = dev;
+
+	tasklet_init(&dev->queue_task, sahara_aes_queue_task,
+		     (unsigned long)dev);
+	tasklet_init(&dev->done_task, sahara_aes_done_task,
+		     (unsigned long)dev);
+
+	init_timer(&dev->watchdog);
+	dev->watchdog.function = &sahara_watchdog;
+	dev->watchdog.data = (unsigned long)dev;
+
+	clk_prepare_enable(dev->clk_ipg);
+	clk_prepare_enable(dev->clk_ahb);
+
+	version = sahara_read(dev, SAHARA_REG_VERSION);
+	if (version != SAHARA_VERSION_3) {
+		dev_err(&pdev->dev, "SAHARA version %d not supported\n",
+			version);
+		err = -ENODEV;
+		goto err_algs;
+	}
+
+	sahara_write(dev, SAHARA_CMD_RESET | SAHARA_CMD_MODE_BATCH,
+		     SAHARA_REG_CMD);
+	sahara_write(dev, SAHARA_CONTROL_SET_THROTTLE(0) |
+			SAHARA_CONTROL_SET_MAXBURST(8) |
+			SAHARA_CONTROL_RNG_AUTORSD |
+			SAHARA_CONTROL_ENABLE_INT,
+			SAHARA_REG_CONTROL);
+
+	err = sahara_register_algs(dev);
+	if (err)
+		goto err_algs;
+
+	dev_info(&pdev->dev, "SAHARA version %d initialized\n", version);
+
+	return 0;
+
+err_algs:
+	dma_free_coherent(&pdev->dev,
+			  SAHARA_MAX_HW_LINK * sizeof(struct sahara_hw_link),
+			  dev->hw_link[0], dev->hw_phys_link[0]);
+	clk_disable_unprepare(dev->clk_ipg);
+	clk_disable_unprepare(dev->clk_ahb);
+	dev_ptr = NULL;
+err_link:
+	dma_free_coherent(&pdev->dev,
+			  2 * AES_KEYSIZE_128,
+			  dev->key_base, dev->key_phys_base);
+err_key:
+	dma_free_coherent(&pdev->dev,
+			  SAHARA_MAX_HW_DESC * sizeof(struct sahara_hw_desc),
+			  dev->hw_desc[0], dev->hw_phys_desc[0]);
+
+	return err;
+}
+
+static int sahara_remove(struct platform_device *pdev)
+{
+	struct sahara_dev *dev = platform_get_drvdata(pdev);
+
+	/* Stop new requests and in-flight tasklets before freeing their buffers */
+	sahara_unregister_algs(dev);
+
+	tasklet_kill(&dev->done_task);
+	tasklet_kill(&dev->queue_task);
+
+	dma_free_coherent(&pdev->dev,
+			  SAHARA_MAX_HW_LINK * sizeof(struct sahara_hw_link),
+			  dev->hw_link[0], dev->hw_phys_link[0]);
+	dma_free_coherent(&pdev->dev,
+			  2 * AES_KEYSIZE_128,
+			  dev->key_base, dev->key_phys_base);
+	dma_free_coherent(&pdev->dev,
+			  SAHARA_MAX_HW_DESC * sizeof(struct sahara_hw_desc),
+			  dev->hw_desc[0], dev->hw_phys_desc[0]);
+
+	clk_disable_unprepare(dev->clk_ipg);
+	clk_disable_unprepare(dev->clk_ahb);
+
+	dev_ptr = NULL;
+
+	return 0;
+}
+
+static struct platform_driver sahara_driver = {
+	.probe		= sahara_probe,
+	.remove		= sahara_remove,
+	.driver		= {
+		.name	= SAHARA_NAME,
+		.owner	= THIS_MODULE,
+		.of_match_table = of_match_ptr(sahara_dt_ids),
+	},
+	.id_table = sahara_platform_ids,
+};
+
+module_platform_driver(sahara_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Javier Martin <javier.martin@vista-silicon.com>");
+MODULE_DESCRIPTION("SAHARA2 HW crypto accelerator");
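
For reference, a minimal sketch of how a kernel caller could exercise the
"cbc(aes)" ablkcipher registered above. The demo_* names are illustrative
and not part of the driver; a real caller would also propagate the status
delivered to the callback instead of discarding it.

	#include <crypto/aes.h>
	#include <linux/completion.h>
	#include <linux/crypto.h>
	#include <linux/scatterlist.h>

	static void demo_cb(struct crypto_async_request *areq, int err)
	{
		if (err != -EINPROGRESS)
			complete(areq->data);
	}

	static int demo_cbc_aes_encrypt(const u8 *key, u8 *iv,
					struct scatterlist *src,
					struct scatterlist *dst,
					unsigned int nbytes)
	{
		DECLARE_COMPLETION_ONSTACK(done);
		struct crypto_ablkcipher *tfm;
		struct ablkcipher_request *req;
		int ret;

		/* Resolves to "sahara-cbc-aes" (priority 300) when the driver is loaded */
		tfm = crypto_alloc_ablkcipher("cbc(aes)", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		ret = crypto_ablkcipher_setkey(tfm, key, AES_KEYSIZE_128);
		if (ret)
			goto out;

		req = ablkcipher_request_alloc(tfm, GFP_KERNEL);
		if (!req) {
			ret = -ENOMEM;
			goto out;
		}

		ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
						demo_cb, &done);
		/* nbytes must be a whole number of AES blocks; the driver enforces this */
		ablkcipher_request_set_crypt(req, src, dst, nbytes, iv);

		ret = crypto_ablkcipher_encrypt(req);
		if (ret == -EINPROGRESS || ret == -EBUSY) {
			wait_for_completion(&done);
			ret = 0;
		}

		ablkcipher_request_free(req);
	out:
		crypto_free_ablkcipher(tfm);
		return ret;
	}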

+ 3 - 3
drivers/crypto/ux500/hash/hash_core.c

@@ -938,6 +938,7 @@ static int hash_dma_final(struct ahash_request *req)
 	if (!ctx->device->dma.nents) {
 		dev_err(device_data->dev, "[%s] "
 				"ctx->device->dma.nents = 0", __func__);
+		ret = ctx->device->dma.nents;
 		goto out;
 	}
 
@@ -945,6 +946,7 @@ static int hash_dma_final(struct ahash_request *req)
 	if (bytes_written != req->nbytes) {
 		dev_err(device_data->dev, "[%s] "
 				"hash_dma_write() failed!", __func__);
+		ret = bytes_written;
 		goto out;
 	}
 
@@ -1367,14 +1369,12 @@ static int hash_setkey(struct crypto_ahash *tfm,
 	/**
 	 * Freed in final.
 	 */
-	ctx->key = kmalloc(keylen, GFP_KERNEL);
+	ctx->key = kmemdup(key, keylen, GFP_KERNEL);
 	if (!ctx->key) {
 		pr_err(DEV_DBG_NAME " [%s] Failed to allocate ctx->key "
 		       "for %d\n", __func__, alg);
 		return -ENOMEM;
 	}
-
-	memcpy(ctx->key, key, keylen);
 	ctx->keylen = keylen;
 
 	return ret;
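
The kmemdup() conversion above is behavior-preserving; side by side, under
the same error handling:

	/* before: allocate, then copy in a separate step */
	ctx->key = kmalloc(keylen, GFP_KERNEL);
	if (!ctx->key)
		return -ENOMEM;
	memcpy(ctx->key, key, keylen);

	/* after: one call with identical allocation and copy semantics */
	ctx->key = kmemdup(key, keylen, GFP_KERNEL);
	if (!ctx->key)
		return -ENOMEM;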

+ 5 - 0
include/crypto/sha.h

@@ -87,4 +87,9 @@ struct shash_desc;
 extern int crypto_sha1_update(struct shash_desc *desc, const u8 *data,
 			      unsigned int len);
 
+extern int crypto_sha256_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int len);
+
+extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int len);
 #endif
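
These symbols are exported so the SIMD SHA glue added elsewhere in this
merge can fall back to the generic C implementation when the FPU is not
usable. A sketch of that pattern; sha256_simd_update() is an illustrative
name, not the actual glue function:

	#include <asm/i387.h>		/* irq_fpu_usable(), kernel_fpu_begin/end() */
	#include <crypto/sha.h>

	static int sha256_simd_update(struct shash_desc *desc, const u8 *data,
				      unsigned int len)
	{
		/* SIMD registers are off limits here; defer to the generic code */
		if (!irq_fpu_usable())
			return crypto_sha256_update(desc, data, len);

		kernel_fpu_begin();
		/* ... feed 64-byte blocks to the SSSE3/AVX transform ... */
		kernel_fpu_end();

		return 0;
	}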

+ 0 - 22
include/linux/platform_data/atmel-aes.h

@@ -1,22 +0,0 @@
-#ifndef __LINUX_ATMEL_AES_H
-#define __LINUX_ATMEL_AES_H
-
-#include <linux/platform_data/dma-atmel.h>
-
-/**
- * struct aes_dma_data - DMA data for AES
- */
-struct aes_dma_data {
-	struct at_dma_slave	txdata;
-	struct at_dma_slave	rxdata;
-};
-
-/**
- * struct aes_platform_data - board-specific AES configuration
- * @dma_slave: DMA slave interface to use in data transfers.
- */
-struct aes_platform_data {
-	struct aes_dma_data	*dma_slave;
-};
-
-#endif /* __LINUX_ATMEL_AES_H */

+ 22 - 0
include/linux/platform_data/crypto-atmel.h

@@ -0,0 +1,22 @@
+#ifndef __LINUX_CRYPTO_ATMEL_H
+#define __LINUX_CRYPTO_ATMEL_H
+
+#include <linux/platform_data/dma-atmel.h>
+
+/**
+ * struct crypto_dma_data - DMA data for AES/TDES/SHA
+ */
+struct crypto_dma_data {
+	struct at_dma_slave	txdata;
+	struct at_dma_slave	rxdata;
+};
+
+/**
+ * struct crypto_platform_data - board-specific AES/TDES/SHA configuration
+ * @dma_slave: DMA slave interface to use in data transfers.
+ */
+struct crypto_platform_data {
+	struct crypto_dma_data	*dma_slave;
+};
+
+#endif /* __LINUX_CRYPTO_ATMEL_H */
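
A board file would hand this to the Atmel crypto drivers roughly as below;
the board_* names are placeholders, and the at_dma_slave members are filled
from the board's DMA controller configuration:

	#include <linux/platform_data/crypto-atmel.h>

	/* txdata/rxdata come from the board's DMA controller setup */
	static struct crypto_dma_data board_crypto_dma;

	static struct crypto_platform_data board_crypto_pdata = {
		.dma_slave = &board_crypto_dma,
	};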

+ 0 - 5
include/linux/timeriomem-rng.h

@@ -8,12 +8,7 @@
  * published by the Free Software Foundation.
  */
 
-#include <linux/completion.h>
-
 struct timeriomem_rng_data {
-	struct completion	completion;
-	unsigned int		present:1;
-
 	void __iomem		*address;
 
 	/* measures in usecs */

+ 13 - 0
net/xfrm/xfrm_algo.c

@@ -311,6 +311,19 @@ static struct xfrm_algo_desc aalg_list[] = {
 		.sadb_alg_maxbits = 128
 	}
 },
+{
+	/* rfc4494 */
+	.name = "cmac(aes)",
+
+	.uinfo = {
+		.auth = {
+			.icv_truncbits = 96,
+			.icv_fullbits = 128,
+		}
+	},
+
+	.pfkey_supported = 0,
+},
 };
 };
 
 static struct xfrm_algo_desc ealg_list[] = {
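
With this entry, SAs created over netlink can select AES-CMAC-96, while
.pfkey_supported = 0 keeps it invisible to PF_KEY. A sketch of the
in-kernel name lookup (aes_cmac_96_available() is illustrative;
xfrm_aalg_get_byname() is the existing resolver in net/xfrm/xfrm_algo.c):

	#include <net/xfrm.h>

	static bool aes_cmac_96_available(void)
	{
		char name[] = "cmac(aes)";
		/* probe = 1 may load the cmac module on demand */
		struct xfrm_algo_desc *desc = xfrm_aalg_get_byname(name, 1);

		return desc && desc->uinfo.auth.icv_truncbits == 96;
	}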