ghash-ce-core.S

/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        SHASH           .req    q0
        T1              .req    q1
        XL              .req    q2
        XM              .req    q3
        XH              .req    q4
        IN1             .req    q4

        SHASH_L         .req    d0
        SHASH_H         .req    d1
        T1_L            .req    d2
        T1_H            .req    d3
        XL_L            .req    d4
        XL_H            .req    d5
        XM_L            .req    d6
        XM_H            .req    d7
        XH_L            .req    d8

        t0l             .req    d10
        t0h             .req    d11
        t1l             .req    d12
        t1h             .req    d13
        t2l             .req    d14
        t2h             .req    d15
        t3l             .req    d16
        t3h             .req    d17
        t4l             .req    d18
        t4h             .req    d19

        t0q             .req    q5
        t1q             .req    q6
        t2q             .req    q7
        t3q             .req    q8
        t4q             .req    q9
        T2              .req    q9

        s1l             .req    d20
        s1h             .req    d21
        s2l             .req    d22
        s2h             .req    d23
        s3l             .req    d24
        s3h             .req    d25
        s4l             .req    d26
        s4h             .req    d27

        MASK            .req    d28
        SHASH2_p8       .req    d28

        k16             .req    d29
        k32             .req    d30
        k48             .req    d31
        SHASH2_p64      .req    d31

        HH              .req    q10
        HH3             .req    q11
        HH4             .req    q12
        HH34            .req    q13

        HH_L            .req    d20
        HH_H            .req    d21
        HH3_L           .req    d22
        HH3_H           .req    d23
        HH4_L           .req    d24
        HH4_H           .req    d25
        HH34_L          .req    d26
        HH34_H          .req    d27
        SHASH2_H        .req    d29

        XL2             .req    q5
        XM2             .req    q6
        XH2             .req    q7
        T3              .req    q8

        XL2_L           .req    d10
        XL2_H           .req    d11
        XM2_L           .req    d12
        XM2_H           .req    d13
        T3_L            .req    d16
        T3_H            .req    d17
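
        /*
         * Note that several of the aliases above deliberately share physical
         * registers (e.g. IN1/XH are both q4, XL2..T3 reuse t0q..t3q, and
         * SHASH2_p8/SHASH2_p64 overlap MASK/k48): the conflicting names are
         * only used on mutually exclusive code paths.
         */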

        .text
        .fpu            crypto-neon-fp-armv8

        .macro          __pmull_p64, rd, rn, rm, b1, b2, b3, b4
        vmull.p64       \rd, \rn, \rm
        .endm

        /*
         * This implementation of 64x64 -> 128 bit polynomial multiplication
         * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
         * "Fast Software Polynomial Multiplication on ARM Processors Using
         * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
         * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
         *
         * It has been slightly tweaked for in-order performance, and to allow
         * 'rq' to overlap with 'ad' or 'bd'.
         */
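        /*
         * In short: each vmull.p8 computes eight 8x8 polynomial products in
         * parallel, so the 64x64 product is assembled from the lane-wise
         * product D = A*B plus products of byte-rotated copies A1..A4/B1..B4;
         * the k16/k32/k48 masks and the trailing vext instructions shift each
         * partial sum into place before it is folded in with veor.
         */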
        .macro          __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
        vext.8          t0l, \ad, \ad, #1       @ A1
        .ifc            \b1, t4l
        vext.8          t4l, \bd, \bd, #1       @ B1
        .endif
        vmull.p8        t0q, t0l, \bd           @ F = A1*B
        vext.8          t1l, \ad, \ad, #2       @ A2
        vmull.p8        t4q, \ad, \b1           @ E = A*B1
        .ifc            \b2, t3l
        vext.8          t3l, \bd, \bd, #2       @ B2
        .endif
        vmull.p8        t1q, t1l, \bd           @ H = A2*B
        vext.8          t2l, \ad, \ad, #3       @ A3
        vmull.p8        t3q, \ad, \b2           @ G = A*B2
        veor            t0q, t0q, t4q           @ L = E + F
        .ifc            \b3, t4l
        vext.8          t4l, \bd, \bd, #3       @ B3
        .endif
        vmull.p8        t2q, t2l, \bd           @ J = A3*B
        veor            t0l, t0l, t0h           @ t0 = (L) (P0 + P1) << 8
        veor            t1q, t1q, t3q           @ M = G + H
        .ifc            \b4, t3l
        vext.8          t3l, \bd, \bd, #4       @ B4
        .endif
        vmull.p8        t4q, \ad, \b3           @ I = A*B3
        veor            t1l, t1l, t1h           @ t1 = (M) (P2 + P3) << 16
        vmull.p8        t3q, \ad, \b4           @ K = A*B4
        vand            t0h, t0h, k48
        vand            t1h, t1h, k32
        veor            t2q, t2q, t4q           @ N = I + J
        veor            t0l, t0l, t0h
        veor            t1l, t1l, t1h
        veor            t2l, t2l, t2h           @ t2 = (N) (P4 + P5) << 24
        vand            t2h, t2h, k16
        veor            t3l, t3l, t3h           @ t3 = (K) (P6 + P7) << 32
        vmov.i64        t3h, #0
        vext.8          t0q, t0q, t0q, #15
        veor            t2l, t2l, t2h
        vext.8          t1q, t1q, t1q, #14
        vmull.p8        \rq, \ad, \bd           @ D = A*B
        vext.8          t2q, t2q, t2q, #13
        vext.8          t3q, t3q, t3q, #12
        veor            t0q, t0q, t1q
        veor            t2q, t2q, t3q
        veor            \rq, \rq, t0q
        veor            \rq, \rq, t2q
        .endm

        //
        // PMULL (64x64->128) based reduction for CPUs that can do
        // it in a single instruction.
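        // GHASH arithmetic is in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1;
        // MASK (0xe1 shifted left by 57, set up by the p64 entry point below)
        // encodes that polynomial for the two folding multiplications here.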
        //
        .macro          __pmull_reduce_p64
        vmull.p64       T1, XL_L, MASK

        veor            XH_L, XH_L, XM_H
        vext.8          T1, T1, T1, #8
        veor            XL_H, XL_H, XM_L
        veor            T1, T1, XL

        vmull.p64       XL, T1_H, MASK
        .endm

        //
        // Alternative reduction for CPUs that lack support for the
        // 64x64->128 PMULL instruction
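        // The reduction itself is the same: using x^128 == x^7 + x^2 + x + 1,
        // each folding multiplication is expanded into the shift/XOR sequence
        // below.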
        //
        .macro          __pmull_reduce_p8
        veor            XL_H, XL_H, XM_L
        veor            XH_L, XH_L, XM_H

        vshl.i64        T1, XL, #57
        vshl.i64        T2, XL, #62
        veor            T1, T1, T2
        vshl.i64        T2, XL, #63
        veor            T1, T1, T2
        veor            XL_H, XL_H, T1_L
        veor            XH_L, XH_L, T1_H

        vshr.u64        T1, XL, #1
        veor            XH, XH, XL
        veor            XL, XL, T1
        vshr.u64        T1, T1, #6
        vshr.u64        XL, XL, #1
        .endm
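
        /*
         * Main GHASH loop, shared by both entry points.  Each 16-byte block C
         * updates the digest as XL := (XL xor C) * H in GF(2^128), with
         * r0 = number of blocks, r1 -> digest, r2 -> source data and an
         * optional pointer to one extra block passed on the stack.  The key
         * material is expected to have been loaded by the entry points below.
         */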
        .macro          ghash_update, pn
        vld1.64         {XL}, [r1]

        /* do the head block first, if supplied */
        ldr             ip, [sp]
        teq             ip, #0
        beq             0f
        vld1.64         {T1}, [ip]
        teq             r0, #0
        b               3f
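
        /*
         * p64 only: once the remaining block count is a multiple of 4,
         * process four blocks per iteration using the precomputed powers
         * of H, i.e. XL := (XL ^ C0)*H^4 ^ C1*H^3 ^ C2*H^2 ^ C3*H, so that
         * only one reduction is needed for every four blocks.
         */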

0:      .ifc            \pn, p64
        tst             r0, #3                  // skip until #blocks is a
        bne             2f                      // round multiple of 4

        vld1.8          {XL2-XM2}, [r2]!
1:      vld1.8          {T3-T2}, [r2]!
        vrev64.8        XL2, XL2
        vrev64.8        XM2, XM2

        subs            r0, r0, #4

        vext.8          T1, XL2, XL2, #8
        veor            XL2_H, XL2_H, XL_L
        veor            XL, XL, T1

        vrev64.8        T3, T3
        vrev64.8        T1, T2

        vmull.p64       XH, HH4_H, XL_H         // a1 * b1
        veor            XL2_H, XL2_H, XL_H
        vmull.p64       XL, HH4_L, XL_L         // a0 * b0
        vmull.p64       XM, HH34_H, XL2_H       // (a1 + a0)(b1 + b0)

        vmull.p64       XH2, HH3_H, XM2_L       // a1 * b1
        veor            XM2_L, XM2_L, XM2_H
        vmull.p64       XL2, HH3_L, XM2_H       // a0 * b0
        vmull.p64       XM2, HH34_L, XM2_L      // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        vmull.p64       XH2, HH_H, T3_L         // a1 * b1
        veor            T3_L, T3_L, T3_H
        vmull.p64       XL2, HH_L, T3_H         // a0 * b0
        vmull.p64       XM2, SHASH2_H, T3_L     // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        vmull.p64       XH2, SHASH_H, T1_L      // a1 * b1
        veor            T1_L, T1_L, T1_H
        vmull.p64       XL2, SHASH_L, T1_H      // a0 * b0
        vmull.p64       XM2, SHASH2_p64, T1_L   // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        beq             4f

        vld1.8          {XL2-XM2}, [r2]!

        veor            T1, XL, XH
        veor            XM, XM, T1

        __pmull_reduce_p64

        veor            T1, T1, XH
        veor            XL, XL, T1

        b               1b
        .endif
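
        /*
         * Single-block path: used for every block by the p8 variant, and by
         * the p64 variant for leading blocks until the remaining count is a
         * multiple of 4.
         */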
2:      vld1.64         {T1}, [r2]!
        subs            r0, r0, #1

3:      /* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
        vrev64.8        T1, T1
#endif
        vext.8          IN1, T1, T1, #8
        veor            T1_L, T1_L, XL_H
        veor            XL, XL, IN1

        __pmull_\pn     XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h   @ a1 * b1
        veor            T1, T1, XL
        __pmull_\pn     XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l   @ a0 * b0
        __pmull_\pn     XM, T1_L, SHASH2_\pn                    @ (a1+a0)(b1+b0)

4:      veor            T1, XL, XH
        veor            XM, XM, T1

        __pmull_reduce_\pn

        veor            T1, T1, XH
        veor            XL, XL, T1

        bne             0b

        vst1.64         {XL}, [r1]
        bx              lr
        .endm

        /*
         * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
         *                         struct ghash_key const *k, const char *head)
         */
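        /*
         * The p64 variant expects the key to hold the precomputed powers H,
         * H^2, H^3 and H^4 consecutively; the folded halves loaded into
         * SHASH2_p64, SHASH2_H and HH34 feed the (a1 + a0)(b1 + b0) products
         * above.
         */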
ENTRY(pmull_ghash_update_p64)
        vld1.64         {SHASH}, [r3]!
        vld1.64         {HH}, [r3]!
        vld1.64         {HH3-HH4}, [r3]

        veor            SHASH2_p64, SHASH_L, SHASH_H
        veor            SHASH2_H, HH_L, HH_H
        veor            HH34_L, HH3_L, HH3_H
        veor            HH34_H, HH4_L, HH4_H

        vmov.i8         MASK, #0xe1
        vshl.u64        MASK, MASK, #57

        ghash_update    p64
ENDPROC(pmull_ghash_update_p64)
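
        /*
         * The p8 variant only needs H itself; the byte-rotated copies of it
         * (s1l..s4h) and the k16/k32/k48 masks used by __pmull_p8 are set up
         * once here so that the per-block loop can skip the corresponding
         * vext instructions.
         */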
ENTRY(pmull_ghash_update_p8)
        vld1.64         {SHASH}, [r3]
        veor            SHASH2_p8, SHASH_L, SHASH_H

        vext.8          s1l, SHASH_L, SHASH_L, #1
        vext.8          s2l, SHASH_L, SHASH_L, #2
        vext.8          s3l, SHASH_L, SHASH_L, #3
        vext.8          s4l, SHASH_L, SHASH_L, #4
        vext.8          s1h, SHASH_H, SHASH_H, #1
        vext.8          s2h, SHASH_H, SHASH_H, #2
        vext.8          s3h, SHASH_H, SHASH_H, #3
        vext.8          s4h, SHASH_H, SHASH_H, #4

        vmov.i64        k16, #0xffff
        vmov.i64        k32, #0xffffffff
        vmov.i64        k48, #0xffffffffffff

        ghash_update    p8
ENDPROC(pmull_ghash_update_p8)