/*
 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
 *
 * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#include <linux/linkage.h>
#include <asm/assembler.h>

#define AES_ENTRY(func)		ENTRY(neon_ ## func)
#define AES_ENDPROC(func)	ENDPROC(neon_ ## func)
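	/*
	 * AES_ENTRY/AES_ENDPROC put the mode routines emitted by the
	 * included aes-modes.S under a neon_ symbol prefix, so this
	 * pure-NEON build can coexist with the Crypto Extensions build of
	 * the same code (aes-ce.S, which uses a ce_ prefix instead).
	 */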
	xtsmask		.req	v7

	.macro		xts_reload_mask, tmp
	xts_load_mask	\tmp
	.endm
	/* multiply by polynomial 'x' in GF(2^8) */
	.macro		mul_by_x, out, in, temp, const
	sshr		\temp, \in, #7
	shl		\out, \in, #1
	and		\temp, \temp, \const
	eor		\out, \out, \temp
	.endm
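	/*
	 * The arithmetic shift by #7 smears each byte's top bit into a
	 * 0x00/0xff mask, so ANDing with the 0x1b constant applies the AES
	 * reduction polynomial exactly for those bytes where the shl
	 * overflowed: out = (in << 1) ^ (in & 0x80 ? 0x1b : 0).
	 */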
	/* multiply by polynomial 'x^2' in GF(2^8) */
	.macro		mul_by_x2, out, in, temp, const
	ushr		\temp, \in, #6
	shl		\out, \in, #2
	pmul		\temp, \temp, \const
	eor		\out, \out, \temp
	.endm
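	/*
	 * Here ushr #6 leaves the two bits shifted out of each byte in bit
	 * positions 0-1, and the carryless pmul by 0x1b expands them into
	 * the reduction terms for x^8 (0x1b) and x^9 (0x36), i.e.
	 * out = (in << 2) ^ pmul(in >> 6, 0x1b).
	 */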
	/* preload the entire Sbox */
	.macro		prepare, sbox, shiftrows, temp
	movi		v12.16b, #0x1b
	ldr_l		q13, \shiftrows, \temp
	ldr_l		q14, .Lror32by8, \temp
	adr_l		\temp, \sbox
	ld1		{v16.16b-v19.16b}, [\temp], #64
	ld1		{v20.16b-v23.16b}, [\temp], #64
	ld1		{v24.16b-v27.16b}, [\temp], #64
	ld1		{v28.16b-v31.16b}, [\temp]
	.endm
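	/*
	 * Fixed register allocation after the preload: v12 holds the 0x1b
	 * reduction constant, v13 the ShiftRows permutation, v14 the
	 * rotate-by-8 permutation used by mix_columns, and v16-v31 the
	 * full 256-byte Sbox. v8-v11 and v15 remain scratch.
	 */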
	/* do preload for encryption */
	.macro		enc_prepare, ignore0, ignore1, temp
	prepare		.LForward_Sbox, .LForward_ShiftRows, \temp
	.endm

	.macro		enc_switch_key, ignore0, ignore1, temp
	/* do nothing */
	.endm

	/* do preload for decryption */
	.macro		dec_prepare, ignore0, ignore1, temp
	prepare		.LReverse_Sbox, .LReverse_ShiftRows, \temp
	.endm
	/* apply SubBytes transformation using the preloaded Sbox */
	.macro		sub_bytes, in
	sub		v9.16b, \in\().16b, v15.16b
	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
	sub		v10.16b, v9.16b, v15.16b
	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v11.16b, v10.16b, v15.16b
	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
	.endm
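	/*
	 * tbl zeroes lanes whose index is out of range for the 64-byte
	 * lookup, while tbx leaves such lanes untouched. One tbl followed
	 * by three tbx ops, each with the indices reduced by another 0x40
	 * (v15 holds 0x40 on entry), therefore covers the full 256-byte
	 * Sbox.
	 */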
	/* apply MixColumns transformation */
	.macro		mix_columns, in, enc
	.if		\enc == 0
	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
	mul_by_x2	v8.16b, \in\().16b, v9.16b, v12.16b
	eor		\in\().16b, \in\().16b, v8.16b
	rev32		v8.8h, v8.8h
	eor		\in\().16b, \in\().16b, v8.16b
	.endif

	mul_by_x	v9.16b, \in\().16b, v8.16b, v12.16b
	rev32		v8.8h, \in\().8h
	eor		v8.16b, v8.16b, v9.16b
	eor		\in\().16b, \in\().16b, v8.16b
	tbl		\in\().16b, {\in\().16b}, v14.16b
	eor		\in\().16b, \in\().16b, v8.16b
	.endm
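	/*
	 * The forward transform evaluates, per 32-bit column,
	 *
	 *	mix_columns(x) = rot8(x ^ rot16(x) ^ 2.x) ^ rot16(x) ^ 2.x
	 *
	 * with rot16 done by rev32 on .8h lanes, rot8 by the tbl with the
	 * .Lror32by8 permutation, and 2.x by mul_by_x. For decryption, the
	 * { 5, 0, 4, 0 } pre-multiply computes x ^ 4.x ^ rot16(4.x), which
	 * turns the forward { 2, 3, 1, 1 } circulant into the inverse
	 * { 0xe, 0xb, 0xd, 0x9 } one.
	 */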
	.macro		do_block, enc, in, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
1111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
	movi		v15.16b, #0x40
	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
	sub_bytes	\in
	subs		\i, \i, #1
	ld1		{v15.4s}, [\rkp], #16
	beq		2222f
	mix_columns	\in, \enc
	b		1111b
2222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
	.endm
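	/*
	 * v15 does double duty in the loop: it carries the current round
	 * key into the eor, is immediately reloaded with the 0x40 constant
	 * consumed by sub_bytes, and then receives the next round key.
	 * ShiftRows commutes with the bytewise SubBytes, so doing it first
	 * is fine. As a C-like sketch, for illustration only:
	 *
	 *	for (i = 0; i < rounds; i++) {
	 *		state ^= rk[i];
	 *		state = sub_bytes(shift_rows(state));
	 *		if (i < rounds - 1)
	 *			state = mix_columns(state);
	 *	}
	 *	state ^= rk[rounds];
	 */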
	.macro		encrypt_block, in, rounds, rk, rkp, i
	do_block	1, \in, \rounds, \rk, \rkp, \i
	.endm

	.macro		decrypt_block, in, rounds, rk, rkp, i
	do_block	0, \in, \rounds, \rk, \rkp, \i
	.endm
	/*
	 * Interleaved versions: functionally equivalent to the
	 * ones above, but applied to 2 or 4 AES states in parallel.
	 */
	.macro		sub_bytes_2x, in0, in1
	sub		v8.16b, \in0\().16b, v15.16b
	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
	sub		v9.16b, \in1\().16b, v15.16b
	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
	sub		v10.16b, v8.16b, v15.16b
	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
	sub		v11.16b, v9.16b, v15.16b
	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v8.16b, v10.16b, v15.16b
	tbx		\in0\().16b, {v24.16b-v27.16b}, v10.16b
	sub		v9.16b, v11.16b, v15.16b
	tbx		\in1\().16b, {v24.16b-v27.16b}, v11.16b
	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
	.endm
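	/*
	 * Each state's tbl/tbx lookups form a serial dependency chain, so
	 * interleaving the instructions of two (or four) independent states
	 * lets the pipeline overlap those latencies; the results are
	 * identical to running sub_bytes once per state.
	 */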
	.macro		sub_bytes_4x, in0, in1, in2, in3
	sub		v8.16b, \in0\().16b, v15.16b
	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
	sub		v9.16b, \in1\().16b, v15.16b
	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
	sub		v10.16b, \in2\().16b, v15.16b
	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
	sub		v11.16b, \in3\().16b, v15.16b
	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
	sub		v8.16b, v8.16b, v15.16b
	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
	sub		v9.16b, v9.16b, v15.16b
	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
	sub		v10.16b, v10.16b, v15.16b
	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
	sub		v11.16b, v11.16b, v15.16b
	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
	sub		v8.16b, v8.16b, v15.16b
	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
	sub		v9.16b, v9.16b, v15.16b
	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
	sub		v10.16b, v10.16b, v15.16b
	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
	sub		v11.16b, v11.16b, v15.16b
	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
	.endm
	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
	sshr		\tmp0\().16b, \in0\().16b, #7
	shl		\out0\().16b, \in0\().16b, #1
	sshr		\tmp1\().16b, \in1\().16b, #7
	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
	shl		\out1\().16b, \in1\().16b, #1
	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
	.endm

	.macro		mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
	ushr		\tmp0\().16b, \in0\().16b, #6
	shl		\out0\().16b, \in0\().16b, #2
	ushr		\tmp1\().16b, \in1\().16b, #6
	pmul		\tmp0\().16b, \tmp0\().16b, \const\().16b
	shl		\out1\().16b, \in1\().16b, #2
	pmul		\tmp1\().16b, \tmp1\().16b, \const\().16b
	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
	.endm
	.macro		mix_columns_2x, in0, in1, enc
	.if		\enc == 0
	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
	mul_by_x2_2x	v8, v9, \in0, \in1, v10, v11, v12
	eor		\in0\().16b, \in0\().16b, v8.16b
	rev32		v8.8h, v8.8h
	eor		\in1\().16b, \in1\().16b, v9.16b
	rev32		v9.8h, v9.8h
	eor		\in0\().16b, \in0\().16b, v8.16b
	eor		\in1\().16b, \in1\().16b, v9.16b
	.endif

	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v12
	rev32		v10.8h, \in0\().8h
	rev32		v11.8h, \in1\().8h
	eor		v10.16b, v10.16b, v8.16b
	eor		v11.16b, v11.16b, v9.16b
	eor		\in0\().16b, \in0\().16b, v10.16b
	eor		\in1\().16b, \in1\().16b, v11.16b
	tbl		\in0\().16b, {\in0\().16b}, v14.16b
	tbl		\in1\().16b, {\in1\().16b}, v14.16b
	eor		\in0\().16b, \in0\().16b, v10.16b
	eor		\in1\().16b, \in1\().16b, v11.16b
	.endm
	.macro		do_block_2x, enc, in0, in1, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	movi		v15.16b, #0x40
	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
	sub_bytes_2x	\in0, \in1
	subs		\i, \i, #1
	ld1		{v15.4s}, [\rkp], #16
	beq		2222f
	mix_columns_2x	\in0, \in1, \enc
	b		1111b
2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	.endm

	.macro		do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
	ld1		{v15.4s}, [\rk]
	add		\rkp, \rk, #16
	mov		\i, \rounds
1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	movi		v15.16b, #0x40
	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
	sub_bytes_4x	\in0, \in1, \in2, \in3
	subs		\i, \i, #1
	ld1		{v15.4s}, [\rkp], #16
	beq		2222f
	mix_columns_2x	\in0, \in1, \enc
	mix_columns_2x	\in2, \in3, \enc
	b		1111b
2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
	.endm
	.macro		encrypt_block2x, in0, in1, rounds, rk, rkp, i
	do_block_2x	1, \in0, \in1, \rounds, \rk, \rkp, \i
	.endm

	.macro		decrypt_block2x, in0, in1, rounds, rk, rkp, i
	do_block_2x	0, \in0, \in1, \rounds, \rk, \rkp, \i
	.endm

	.macro		encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
	.endm

	.macro		decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
	do_block_4x	0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
	.endm
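	/*
	 * aes-modes.S provides the actual cipher mode entry points (ECB,
	 * CBC, CTR, XTS), built on the block macros defined above.
	 */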
#include "aes-modes.S"

	.section	".rodata", "a"
	.align		6
.LForward_Sbox:
	.byte		0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
	.byte		0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
	.byte		0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
	.byte		0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
	.byte		0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
	.byte		0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
	.byte		0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
	.byte		0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
	.byte		0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
	.byte		0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
	.byte		0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
	.byte		0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
	.byte		0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
	.byte		0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
	.byte		0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
	.byte		0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
	.byte		0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
	.byte		0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
	.byte		0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
	.byte		0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
	.byte		0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
	.byte		0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
	.byte		0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
	.byte		0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
	.byte		0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
	.byte		0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
	.byte		0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
	.byte		0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
	.byte		0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
	.byte		0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
	.byte		0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
	.byte		0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
.LReverse_Sbox:
	.byte		0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
	.byte		0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
	.byte		0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
	.byte		0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
	.byte		0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
	.byte		0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
	.byte		0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
	.byte		0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
	.byte		0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
	.byte		0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
	.byte		0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
	.byte		0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
	.byte		0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
	.byte		0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
	.byte		0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
	.byte		0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
	.byte		0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
	.byte		0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
	.byte		0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
	.byte		0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
	.byte		0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
	.byte		0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
	.byte		0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
	.byte		0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
	.byte		0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
	.byte		0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
	.byte		0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
	.byte		0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
	.byte		0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
	.byte		0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
	.byte		0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
	.byte		0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
.LForward_ShiftRows:
	.octa		0x0b06010c07020d08030e09040f0a0500

.LReverse_ShiftRows:
	.octa		0x0306090c0f0205080b0e0104070a0d00

.Lror32by8:
	.octa		0x0c0f0e0d080b0a090407060500030201