aes-modes.S 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533
  1. /*
  2. * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
  3. *
  4. * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License version 2 as
  8. * published by the Free Software Foundation.
  9. */
  10. /* included by aes-ce.S and aes-neon.S */
  11. .text
  12. .align 4
  13. /*
  14. * There are several ways to instantiate this code:
  15. * - no interleave, all inline
  16. * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
  17. * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
  18. * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
  19. * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
  20. *
  21. * Macros imported by this code:
  22. * - enc_prepare - setup NEON registers for encryption
  23. * - dec_prepare - setup NEON registers for decryption
  24. * - enc_switch_key - change to new key after having prepared for encryption
  25. * - encrypt_block - encrypt a single block
  26. * - decrypt_block - decrypt a single block
  27. * - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
  28. * - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
  29. * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
  30. * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
  31. */
  /*
   * Selection of out-of-line vs. inline expansion of the interleaved helpers.
   *
   * INTERLEAVE defined, INTERLEAVE_INLINE not defined:
   *   the 2x or 4x encrypt/decrypt macros are expanded once into the local
   *   subroutines aes_{en,de}crypt_block{2,4}x, and do_*_block{2,4}x become
   *   'bl' calls to them. Because 'bl' overwrites x30, FRAME_PUSH/FRAME_POP
   *   save and restore x29/x30 on the stack around the calling routine.
   * otherwise:
   *   FRAME_PUSH/FRAME_POP expand to nothing and do_*_block{2,4}x expand the
   *   imported macros in place.
   *
   * In both cases the helpers operate on fixed registers: data blocks in
   * v0-v1 (2x) or v0-v3 (4x), w3 and x2 as used by the mode routines below
   * (rounds and round-key pointer), and x6/w7 as two further macro operands
   * (presumably scratch registers - confirm against the macro definitions
   * in the including file).
   *
   * All four do_* macros are defined regardless of the INTERLEAVE value; the
   * code below only expands the pair that matches INTERLEAVE.
   */
  32. #if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
  33. #define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp
  34. #define FRAME_POP ldp x29, x30, [sp],#16
  35. #if INTERLEAVE == 2
  /* out-of-line 2-block encrypt: v0-v1 are processed in place */
  36. aes_encrypt_block2x:
  37. encrypt_block2x v0, v1, w3, x2, x6, w7
  38. ret
  39. ENDPROC(aes_encrypt_block2x)
  /* out-of-line 2-block decrypt: v0-v1 are processed in place */
  40. aes_decrypt_block2x:
  41. decrypt_block2x v0, v1, w3, x2, x6, w7
  42. ret
  43. ENDPROC(aes_decrypt_block2x)
  44. #elif INTERLEAVE == 4
  /* out-of-line 4-block encrypt: v0-v3 are processed in place */
  45. aes_encrypt_block4x:
  46. encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
  47. ret
  48. ENDPROC(aes_encrypt_block4x)
  /* out-of-line 4-block decrypt: v0-v3 are processed in place */
  49. aes_decrypt_block4x:
  50. decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
  51. ret
  52. ENDPROC(aes_decrypt_block4x)
  53. #else
  54. #error INTERLEAVE should equal 2 or 4
  55. #endif
  /* call wrappers: each expands to a single 'bl' to the routine above */
  56. .macro do_encrypt_block2x
  57. bl aes_encrypt_block2x
  58. .endm
  59. .macro do_decrypt_block2x
  60. bl aes_decrypt_block2x
  61. .endm
  62. .macro do_encrypt_block4x
  63. bl aes_encrypt_block4x
  64. .endm
  65. .macro do_decrypt_block4x
  66. bl aes_decrypt_block4x
  67. .endm
  68. #else
  /* no out-of-line calls are made, so no frame record is needed */
  69. #define FRAME_PUSH
  70. #define FRAME_POP
  /* inline wrappers: same fixed register assignment as the out-of-line case */
  71. .macro do_encrypt_block2x
  72. encrypt_block2x v0, v1, w3, x2, x6, w7
  73. .endm
  74. .macro do_decrypt_block2x
  75. decrypt_block2x v0, v1, w3, x2, x6, w7
  76. .endm
  77. .macro do_encrypt_block4x
  78. encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
  79. .endm
  80. .macro do_decrypt_block4x
  81. decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
  82. .endm
  83. #endif
  84. /*
  85. * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
  86. * int blocks, int first)
  87. * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
  88. * int blocks, int first)
  89. */
  /*
   * aes_ecb_encrypt - encrypt 'blocks' 16-byte blocks independently (ECB).
   *
   * Registers on entry, following the prototype's argument order (AAPCS64):
   *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, w5 = first
   *
   * x0 and x1 are post-incremented past the data processed; w4 counts down.
   * When first == 0 the enc_prepare step is skipped, so the routine
   * presumably relies on NEON state left by an earlier call with first != 0
   * (confirm with the caller).
   *
   * NOTE(review): without INTERLEAVE >= 2 there is no check for blocks == 0
   * before the single-block loop; the loop would process one block and then
   * keep going until w4 wraps around. Callers must pass blocks >= 1.
   */
  90. AES_ENTRY(aes_ecb_encrypt)
  91. FRAME_PUSH
  92. cbz w5, .LecbencloopNx
  93. enc_prepare w3, x2, x5
  94. .LecbencloopNx:
  95. #if INTERLEAVE >= 2
  /* w4 -= INTERLEAVE; negative means fewer than INTERLEAVE blocks remain */
  96. subs w4, w4, #INTERLEAVE
  97. bmi .Lecbenc1x
  98. #if INTERLEAVE == 2
  99. ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
  100. do_encrypt_block2x
  101. st1 {v0.16b-v1.16b}, [x0], #32
  102. #else
  103. ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
  104. do_encrypt_block4x
  105. st1 {v0.16b-v3.16b}, [x0], #64
  106. #endif
  107. b .LecbencloopNx
  108. .Lecbenc1x:
  /* undo the speculative subtraction; zero means nothing is left */
  109. adds w4, w4, #INTERLEAVE
  110. beq .Lecbencout
  111. #endif
  /* tail (or only) loop: one block per iteration, v0 processed in place */
  112. .Lecbencloop:
  113. ld1 {v0.16b}, [x1], #16 /* get next pt block */
  114. encrypt_block v0, w3, x2, x5, w6
  115. st1 {v0.16b}, [x0], #16
  116. subs w4, w4, #1
  117. bne .Lecbencloop
  118. .Lecbencout:
  119. FRAME_POP
  120. ret
  121. AES_ENDPROC(aes_ecb_encrypt)
  /*
   * aes_ecb_decrypt - decrypt 'blocks' 16-byte blocks independently (ECB).
   *
   * Registers on entry, following the prototype's argument order (AAPCS64):
   *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, w5 = first
   *
   * Same structure as aes_ecb_encrypt, using dec_prepare and the decrypt
   * macros. When first == 0 the dec_prepare step is skipped, so the routine
   * presumably relies on NEON state left by an earlier call with first != 0
   * (confirm with the caller).
   *
   * NOTE(review): without INTERLEAVE >= 2 there is no check for blocks == 0
   * before the single-block loop. Callers must pass blocks >= 1.
   */
  122. AES_ENTRY(aes_ecb_decrypt)
  123. FRAME_PUSH
  124. cbz w5, .LecbdecloopNx
  125. dec_prepare w3, x2, x5
  126. .LecbdecloopNx:
  127. #if INTERLEAVE >= 2
  /* w4 -= INTERLEAVE; negative means fewer than INTERLEAVE blocks remain */
  128. subs w4, w4, #INTERLEAVE
  129. bmi .Lecbdec1x
  130. #if INTERLEAVE == 2
  131. ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
  132. do_decrypt_block2x
  133. st1 {v0.16b-v1.16b}, [x0], #32
  134. #else
  135. ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
  136. do_decrypt_block4x
  137. st1 {v0.16b-v3.16b}, [x0], #64
  138. #endif
  139. b .LecbdecloopNx
  140. .Lecbdec1x:
  /* undo the speculative subtraction; zero means nothing is left */
  141. adds w4, w4, #INTERLEAVE
  142. beq .Lecbdecout
  143. #endif
  /* tail (or only) loop: one block per iteration, v0 processed in place */
  144. .Lecbdecloop:
  145. ld1 {v0.16b}, [x1], #16 /* get next ct block */
  146. decrypt_block v0, w3, x2, x5, w6
  147. st1 {v0.16b}, [x0], #16
  148. subs w4, w4, #1
  149. bne .Lecbdecloop
  150. .Lecbdecout:
  151. FRAME_POP
  152. ret
  153. AES_ENDPROC(aes_ecb_decrypt)
  154. /*
  155. * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
  156. * int blocks, u8 iv[], int first)
  157. * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
  158. * int blocks, u8 iv[], int first)
  159. */
  /*
   * aes_cbc_encrypt - CBC-encrypt 'blocks' 16-byte blocks, one at a time.
   *
   * Registers on entry, following the prototype's argument order (AAPCS64):
   *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = iv,
   *   w6 = first
   *
   * v0 carries the chaining value: it is loaded from [x5] only when
   * first != 0, and after each iteration it holds the ciphertext block just
   * written. With first == 0 neither the IV load nor enc_prepare runs, so v0
   * (and presumably the prepared key state) must still be live from the
   * previous call. Nothing is written back to the iv buffer here.
   *
   * No FRAME_PUSH/FRAME_POP: only the single-block macro is used, so no
   * 'bl' to an out-of-line helper is issued from this routine.
   *
   * NOTE(review): there is no check for blocks == 0; callers must pass
   * blocks >= 1.
   */
  160. AES_ENTRY(aes_cbc_encrypt)
  161. cbz w6, .Lcbcencloop
  162. ld1 {v0.16b}, [x5] /* get iv */
  163. enc_prepare w3, x2, x5
  164. .Lcbcencloop:
  165. ld1 {v1.16b}, [x1], #16 /* get next pt block */
  166. eor v0.16b, v0.16b, v1.16b /* ..and xor with iv */
  167. encrypt_block v0, w3, x2, x5, w6
  168. st1 {v0.16b}, [x0], #16
  169. subs w4, w4, #1
  170. bne .Lcbcencloop
  171. ret
  172. AES_ENDPROC(aes_cbc_encrypt)
  /*
   * aes_cbc_decrypt - CBC-decrypt 'blocks' 16-byte blocks.
   *
   * Registers on entry, following the prototype's argument order (AAPCS64):
   *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = iv,
   *   w6 = first
   *
   * v7 carries the chaining value (IV, then the previous ciphertext block).
   * It is loaded from [x5] only when first != 0; with first == 0 it must
   * still be live from the previous call, as must whatever dec_prepare set
   * up. Decryption overwrites its input registers, so the ciphertext blocks
   * needed for the XOR step are copied aside before each decrypt.
   *
   * NOTE(review): without INTERLEAVE >= 2 there is no check for blocks == 0
   * before the single-block loop. Callers must pass blocks >= 1.
   */
  173. AES_ENTRY(aes_cbc_decrypt)
  174. FRAME_PUSH
  175. cbz w6, .LcbcdecloopNx
  176. ld1 {v7.16b}, [x5] /* get iv */
  177. dec_prepare w3, x2, x5
  178. .LcbcdecloopNx:
  179. #if INTERLEAVE >= 2
  /* w4 -= INTERLEAVE; negative means fewer than INTERLEAVE blocks remain */
  180. subs w4, w4, #INTERLEAVE
  181. bmi .Lcbcdec1x
  182. #if INTERLEAVE == 2
  183. ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
  /* keep ct0/ct1 in v2/v3: they are the XOR inputs for pt1 and the next iv */
  184. mov v2.16b, v0.16b
  185. mov v3.16b, v1.16b
  186. do_decrypt_block2x
  187. eor v0.16b, v0.16b, v7.16b
  188. eor v1.16b, v1.16b, v2.16b
  189. mov v7.16b, v3.16b
  190. st1 {v0.16b-v1.16b}, [x0], #32
  191. #else
  192. ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
  /* keep ct0..ct2 in v4..v6; ct3 is re-read from memory after the decrypt */
  193. mov v4.16b, v0.16b
  194. mov v5.16b, v1.16b
  195. mov v6.16b, v2.16b
  196. do_decrypt_block4x
  /* step the input pointer back one block so ct3 can be reloaded below */
  197. sub x1, x1, #16
  198. eor v0.16b, v0.16b, v7.16b
  199. eor v1.16b, v1.16b, v4.16b
  /* old v7 was consumed by the eor above; ct3 becomes the next chaining value */
  200. ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */
  201. eor v2.16b, v2.16b, v5.16b
  202. eor v3.16b, v3.16b, v6.16b
  203. st1 {v0.16b-v3.16b}, [x0], #64
  204. #endif
  205. b .LcbcdecloopNx
  206. .Lcbcdec1x:
  /* undo the speculative subtraction; zero means nothing is left */
  207. adds w4, w4, #INTERLEAVE
  208. beq .Lcbcdecout
  209. #endif
  /* tail (or only) loop: one block per iteration */
  210. .Lcbcdecloop:
  211. ld1 {v1.16b}, [x1], #16 /* get next ct block */
  212. mov v0.16b, v1.16b /* ...and copy to v0 */
  213. decrypt_block v0, w3, x2, x5, w6
  214. eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */
  215. mov v7.16b, v1.16b /* ct is next iv */
  216. st1 {v0.16b}, [x0], #16
  217. subs w4, w4, #1
  218. bne .Lcbcdecloop
  219. .Lcbcdecout:
  220. FRAME_POP
  221. ret
  222. AES_ENDPROC(aes_cbc_decrypt)
  223. /*
  224. * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
  225. * int blocks, u8 ctr[], int first)
  226. */
  /*
   * aes_ctr_encrypt - CTR-mode en/decryption (XOR input with the encrypted
   * counter).
   *
   * Registers on entry, following the prototype's argument order (AAPCS64):
   *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = ctr,
   *   w6 = first
   *
   * State kept in registers:
   *   v4 = current 16-byte counter block, in its in-memory byte order
   *   x5 = byte-reversed copy of v4.d[1] (the low 64 counter bits as a
   *        native integer); the ctr pointer in x5 is overwritten by this
   *
   * first != 0: key state is prepared and v4 is loaded from [x5].
   * first == 0: v4 is used without being loaded, so it must still hold the
   *             last counter value used by the previous call; it is
   *             incremented before use.
   *
   * The interleaved path only increments the low 32 bits of the counter, so
   * it is entered only when 'cmn w5, w4' reports no carry out of 32 bits;
   * otherwise the single-block loop, which handles the carry, is used.
   *
   * A negative block count after the decrement in the single-block loop
   * selects the half-block tail, which processes only 8 bytes.
   *
   * On return v4 holds the last counter value that was used; nothing is
   * written back to the ctr buffer here.
   */
  227. AES_ENTRY(aes_ctr_encrypt)
  228. FRAME_PUSH
  229. cbnz w6, .Lctrfirst /* 1st time around? */
  230. umov x5, v4.d[1] /* keep swabbed ctr in reg */
  231. rev x5, x5
  232. #if INTERLEAVE >= 2
  233. cmn w5, w4 /* 32 bit overflow? */
  234. bcs .Lctrinc
  235. add x5, x5, #1 /* increment BE ctr */
  236. b .LctrincNx
  237. #else
  238. b .Lctrinc
  239. #endif
  240. .Lctrfirst:
  /* x6 held 'first', which has already been tested; it is reused as a macro operand */
  241. enc_prepare w3, x2, x6
  242. ld1 {v4.16b}, [x5]
  243. umov x5, v4.d[1] /* keep swabbed ctr in reg */
  244. rev x5, x5
  245. #if INTERLEAVE >= 2
  246. cmn w5, w4 /* 32 bit overflow? */
  247. bcs .Lctrloop
  248. .LctrloopNx:
  /* w4 -= INTERLEAVE; negative means fewer than INTERLEAVE blocks remain */
  249. subs w4, w4, #INTERLEAVE
  250. bmi .Lctr1x
  251. #if INTERLEAVE == 2
  /* build two counter blocks: d[0] copied from v4, d[1] = swabbed x5 and x5+1 */
  252. mov v0.8b, v4.8b
  253. mov v1.8b, v4.8b
  254. rev x7, x5
  255. add x5, x5, #1
  256. ins v0.d[1], x7
  257. rev x7, x5
  258. add x5, x5, #1
  259. ins v1.d[1], x7
  260. ld1 {v2.16b-v3.16b}, [x1], #32 /* get 2 input blocks */
  261. do_encrypt_block2x
  262. eor v0.16b, v0.16b, v2.16b
  263. eor v1.16b, v1.16b, v3.16b
  264. st1 {v0.16b-v1.16b}, [x0], #32
  265. #else
  /* the 128-bit literal is placed in the pool emitted by .ltorg after this routine */
  266. ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */
  /* v7.4s = low 32 counter bits + addends; v8 = the same, byte-swapped per lane */
  267. dup v7.4s, w5
  268. mov v0.16b, v4.16b
  269. add v7.4s, v7.4s, v8.4s
  270. mov v1.16b, v4.16b
  271. rev32 v8.16b, v7.16b
  272. mov v2.16b, v4.16b
  273. mov v3.16b, v4.16b
  /* v0 keeps the current counter; v1..v3 get ctr+1..ctr+3 in their last word */
  274. mov v1.s[3], v8.s[0]
  275. mov v2.s[3], v8.s[1]
  276. mov v3.s[3], v8.s[2]
  277. ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
  278. do_encrypt_block4x
  279. eor v0.16b, v5.16b, v0.16b
  /* v5 has been consumed above, so it is reused for the fourth input block */
  280. ld1 {v5.16b}, [x1], #16 /* get 1 input block */
  281. eor v1.16b, v6.16b, v1.16b
  282. eor v2.16b, v7.16b, v2.16b
  283. eor v3.16b, v5.16b, v3.16b
  284. st1 {v0.16b-v3.16b}, [x0], #64
  285. add x5, x5, #INTERLEAVE
  286. #endif
  287. cbz w4, .LctroutNx
  288. .LctrincNx:
  /* x5 is the next counter to use; write it back into v4 and go again */
  289. rev x7, x5
  290. ins v4.d[1], x7
  291. b .LctrloopNx
  292. .LctroutNx:
  /* x5 is one past the last counter used; step back so v4 = last value used */
  293. sub x5, x5, #1
  294. rev x7, x5
  295. ins v4.d[1], x7
  296. b .Lctrout
  297. .Lctr1x:
  /* undo the speculative subtraction; zero means nothing is left */
  298. adds w4, w4, #INTERLEAVE
  299. beq .Lctrout
  300. #endif
  /* single-block loop: v0 = keystream block produced from the counter in v4 */
  301. .Lctrloop:
  302. mov v0.16b, v4.16b
  303. encrypt_block v0, w3, x2, x6, w7
  304. subs w4, w4, #1
  305. bmi .Lctrhalfblock /* blocks < 0 means 1/2 block */
  306. ld1 {v3.16b}, [x1], #16
  307. eor v3.16b, v0.16b, v3.16b
  308. st1 {v3.16b}, [x0], #16
  /* still testing the flags from 'subs' above; ld1/eor/st1 do not alter them */
  309. beq .Lctrout
  310. .Lctrinc:
  311. adds x5, x5, #1 /* increment BE ctr */
  312. rev x7, x5
  313. ins v4.d[1], x7
  /* rev/ins leave the flags from 'adds' intact, so this tests its carry-out */
  314. bcc .Lctrloop /* no overflow? */
  315. umov x7, v4.d[0] /* load upper word of ctr */
  316. rev x7, x7 /* ... to handle the carry */
  317. add x7, x7, #1
  318. rev x7, x7
  319. ins v4.d[0], x7
  320. b .Lctrloop
  321. .Lctrhalfblock:
  /* 8-byte tail: XOR 8 input bytes with the first 8 keystream bytes; pointers not advanced */
  322. ld1 {v3.8b}, [x1]
  323. eor v3.8b, v0.8b, v3.8b
  324. st1 {v3.8b}, [x0]
  325. .Lctrout:
  326. FRAME_POP
  327. ret
  328. AES_ENDPROC(aes_ctr_encrypt)
  /* emit the literal pool here (holds the 'ldr q8, =...' constant used above) */
  329. .ltorg
  330. /*
  331. * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
  332. * int blocks, u8 const rk2[], u8 iv[], int first)
  333. * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
  334. * int blocks, u8 const rk2[], u8 iv[], int first)
  335. */
  /*
   * next_tweak out, in, const, tmp
   *
   * Derive the next XTS tweak: treat \in as two 64-bit lanes, double each
   * lane, then fold in what was shifted out of each lane:
   *   - tmp = per-lane mask of the top bit (sshr #63), ANDed with \const,
   *     so each lane holds its \const value only if that lane's top bit
   *     was set;
   *   - 'ext #8' swaps the two 64-bit halves of tmp, so each lane's
   *     contribution is applied to the other lane;
   *   - the result is XORed into the doubled value.
   * With \const = .Lxts_mul_x this is a 128-bit shift left by one with the
   * carry between halves propagated and 0x87 folded into the low half when
   * the top bit overflows.
   *
   * Aliasing: \tmp is clobbered and must differ from \in, \out and \const.
   * \out may equal \in. \out may also equal \const (the constant is read
   * before \out is written), but the constant is then destroyed and must be
   * reloaded before the next use.
   */
  336. .macro next_tweak, out, in, const, tmp
  337. sshr \tmp\().2d, \in\().2d, #63
  338. and \tmp\().16b, \tmp\().16b, \const\().16b
  339. add \out\().2d, \in\().2d, \in\().2d
  340. ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
  341. eor \out\().16b, \out\().16b, \tmp\().16b
  342. .endm
  /*
   * 16-byte constant loaded into v7 ('ldr q7, .Lxts_mul_x') and passed as
   * the \const operand of next_tweak: the values 1 and 0x87, one per 64-bit
   * half. The two variants list the halves in opposite order; CPU_LE/CPU_BE
   * are defined elsewhere and presumably emit their argument only for the
   * matching endianness, so that the single 128-bit load yields the same
   * lane layout on both. The data lives in the code section selected by
   * the '.text' directive at the top of the file.
   */
  343. .Lxts_mul_x:
  344. CPU_LE( .quad 1, 0x87 )
  345. CPU_BE( .quad 0x87, 1 )
  /*
   * aes_xts_encrypt - XTS-encrypt 'blocks' 16-byte blocks.
   *
   * Registers on entry, following the prototype's argument order (AAPCS64):
   *   x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = blocks, x5 = rk2,
   *   x6 = iv, w7 = first
   *
   * State kept in registers:
   *   v4 = tweak for the current block (v5..v7 hold the following tweaks
   *        inside the interleaved bodies)
   *   v7 = .Lxts_mul_x constant for next_tweak (except while the 4x body
   *        uses it for the fourth tweak)
   *   v8 = scratch for next_tweak
   *
   * first != 0: the IV is loaded from [x6] and encrypted using rk2 (x5) to
   *             form the first tweak, then the key is switched to rk1 (x2).
   * first == 0: v4 is used without being loaded, so it must still hold the
   *             last tweak used by the previous call; it is advanced once
   *             before use.
   *
   * Every exit path leaves the last tweak used in v4. Each block is XORed
   * with its tweak both before and after the block encryption.
   *
   * NOTE(review): without INTERLEAVE >= 2 there is no check for blocks == 0
   * before the single-block loop. Callers must pass blocks >= 1.
   */
  346. AES_ENTRY(aes_xts_encrypt)
  347. FRAME_PUSH
  348. cbz w7, .LxtsencloopNx
  349. ld1 {v4.16b}, [x6]
  /* x6/w7 (iv pointer and 'first') are no longer needed as such and are reused as macro operands */
  350. enc_prepare w3, x5, x6
  351. encrypt_block v4, w3, x5, x6, w7 /* first tweak */
  352. enc_switch_key w3, x2, x6
  353. ldr q7, .Lxts_mul_x
  354. b .LxtsencNx
  355. .LxtsencloopNx:
  /* (re)load the constant - the 4x body overwrites v7 - and advance the tweak */
  356. ldr q7, .Lxts_mul_x
  357. next_tweak v4, v4, v7, v8
  358. .LxtsencNx:
  359. #if INTERLEAVE >= 2
  /* w4 -= INTERLEAVE; negative means fewer than INTERLEAVE blocks remain */
  360. subs w4, w4, #INTERLEAVE
  361. bmi .Lxtsenc1x
  362. #if INTERLEAVE == 2
  363. ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */
  364. next_tweak v5, v4, v7, v8
  365. eor v0.16b, v0.16b, v4.16b
  366. eor v1.16b, v1.16b, v5.16b
  367. do_encrypt_block2x
  368. eor v0.16b, v0.16b, v4.16b
  369. eor v1.16b, v1.16b, v5.16b
  370. st1 {v0.16b-v1.16b}, [x0], #32
  371. cbz w4, .LxtsencoutNx
  /* v7 is intact in the 2x body, so advance directly without reloading it */
  372. next_tweak v4, v5, v7, v8
  373. b .LxtsencNx
  374. .LxtsencoutNx:
  /* leave the last tweak used in v4 */
  375. mov v4.16b, v5.16b
  376. b .Lxtsencout
  377. #else
  378. ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
  379. next_tweak v5, v4, v7, v8
  380. eor v0.16b, v0.16b, v4.16b
  381. next_tweak v6, v5, v7, v8
  382. eor v1.16b, v1.16b, v5.16b
  383. eor v2.16b, v2.16b, v6.16b
  /* the fourth tweak overwrites the constant in v7; it is reloaded at .LxtsencloopNx */
  384. next_tweak v7, v6, v7, v8
  385. eor v3.16b, v3.16b, v7.16b
  386. do_encrypt_block4x
  387. eor v3.16b, v3.16b, v7.16b
  388. eor v0.16b, v0.16b, v4.16b
  389. eor v1.16b, v1.16b, v5.16b
  390. eor v2.16b, v2.16b, v6.16b
  391. st1 {v0.16b-v3.16b}, [x0], #64
  /* leave the last tweak used in v4 */
  392. mov v4.16b, v7.16b
  393. cbz w4, .Lxtsencout
  394. b .LxtsencloopNx
  395. #endif
  396. .Lxtsenc1x:
  /* undo the speculative subtraction; zero means nothing is left */
  397. adds w4, w4, #INTERLEAVE
  398. beq .Lxtsencout
  399. #endif
  /* tail (or only) loop: one block per iteration, tweak in v4 */
  400. .Lxtsencloop:
  401. ld1 {v1.16b}, [x1], #16
  402. eor v0.16b, v1.16b, v4.16b
  403. encrypt_block v0, w3, x2, x6, w7
  404. eor v0.16b, v0.16b, v4.16b
  405. st1 {v0.16b}, [x0], #16
  406. subs w4, w4, #1
  407. beq .Lxtsencout
  408. next_tweak v4, v4, v7, v8
  409. b .Lxtsencloop
  410. .Lxtsencout:
  411. FRAME_POP
  412. ret
  413. AES_ENDPROC(aes_xts_encrypt)
  /*
   * aes_xts_decrypt - XTS-decrypt 'blocks' 16-byte blocks.
   *
   * Registers on entry, following the prototype's argument order (AAPCS64):
   *   x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = blocks, x5 = rk2,
   *   x6 = iv, w7 = first
   *
   * State kept in registers:
   *   v4 = tweak for the current block (v5..v7 hold the following tweaks
   *        inside the interleaved bodies)
   *   v7 = .Lxts_mul_x constant for next_tweak (except while the 4x body
   *        uses it for the fourth tweak)
   *   v8 = scratch for next_tweak
   *
   * first != 0: the IV is loaded from [x6] and *encrypted* using rk2 (x5)
   *             to form the first tweak - the tweak is produced with the
   *             encryption primitive even when decrypting - then
   *             dec_prepare sets up decryption with rk1 (x2).
   * first == 0: v4 is used without being loaded, so it must still hold the
   *             last tweak used by the previous call; it is advanced once
   *             before use.
   *
   * Every exit path leaves the last tweak used in v4. Each block is XORed
   * with its tweak both before and after the block decryption.
   *
   * NOTE(review): without INTERLEAVE >= 2 there is no check for blocks == 0
   * before the single-block loop. Callers must pass blocks >= 1.
   */
  414. AES_ENTRY(aes_xts_decrypt)
  415. FRAME_PUSH
  416. cbz w7, .LxtsdecloopNx
  417. ld1 {v4.16b}, [x6]
  /* x6/w7 (iv pointer and 'first') are no longer needed as such and are reused as macro operands */
  418. enc_prepare w3, x5, x6
  419. encrypt_block v4, w3, x5, x6, w7 /* first tweak */
  420. dec_prepare w3, x2, x6
  421. ldr q7, .Lxts_mul_x
  422. b .LxtsdecNx
  423. .LxtsdecloopNx:
  /* (re)load the constant - the 4x body overwrites v7 - and advance the tweak */
  424. ldr q7, .Lxts_mul_x
  425. next_tweak v4, v4, v7, v8
  426. .LxtsdecNx:
  427. #if INTERLEAVE >= 2
  /* w4 -= INTERLEAVE; negative means fewer than INTERLEAVE blocks remain */
  428. subs w4, w4, #INTERLEAVE
  429. bmi .Lxtsdec1x
  430. #if INTERLEAVE == 2
  431. ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */
  432. next_tweak v5, v4, v7, v8
  433. eor v0.16b, v0.16b, v4.16b
  434. eor v1.16b, v1.16b, v5.16b
  435. do_decrypt_block2x
  436. eor v0.16b, v0.16b, v4.16b
  437. eor v1.16b, v1.16b, v5.16b
  438. st1 {v0.16b-v1.16b}, [x0], #32
  439. cbz w4, .LxtsdecoutNx
  /* v7 is intact in the 2x body, so advance directly without reloading it */
  440. next_tweak v4, v5, v7, v8
  441. b .LxtsdecNx
  442. .LxtsdecoutNx:
  /* leave the last tweak used in v4 */
  443. mov v4.16b, v5.16b
  444. b .Lxtsdecout
  445. #else
  446. ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
  447. next_tweak v5, v4, v7, v8
  448. eor v0.16b, v0.16b, v4.16b
  449. next_tweak v6, v5, v7, v8
  450. eor v1.16b, v1.16b, v5.16b
  451. eor v2.16b, v2.16b, v6.16b
  /* the fourth tweak overwrites the constant in v7; it is reloaded at .LxtsdecloopNx */
  452. next_tweak v7, v6, v7, v8
  453. eor v3.16b, v3.16b, v7.16b
  454. do_decrypt_block4x
  455. eor v3.16b, v3.16b, v7.16b
  456. eor v0.16b, v0.16b, v4.16b
  457. eor v1.16b, v1.16b, v5.16b
  458. eor v2.16b, v2.16b, v6.16b
  459. st1 {v0.16b-v3.16b}, [x0], #64
  /* leave the last tweak used in v4 */
  460. mov v4.16b, v7.16b
  461. cbz w4, .Lxtsdecout
  462. b .LxtsdecloopNx
  463. #endif
  464. .Lxtsdec1x:
  /* undo the speculative subtraction; zero means nothing is left */
  465. adds w4, w4, #INTERLEAVE
  466. beq .Lxtsdecout
  467. #endif
  /* tail (or only) loop: one block per iteration, tweak in v4 */
  468. .Lxtsdecloop:
  469. ld1 {v1.16b}, [x1], #16
  470. eor v0.16b, v1.16b, v4.16b
  471. decrypt_block v0, w3, x2, x6, w7
  472. eor v0.16b, v0.16b, v4.16b
  473. st1 {v0.16b}, [x0], #16
  474. subs w4, w4, #1
  475. beq .Lxtsdecout
  476. next_tweak v4, v4, v7, v8
  477. b .Lxtsdecloop
  478. .Lxtsdecout:
  479. FRAME_POP
  480. ret
  481. AES_ENDPROC(aes_xts_decrypt)