@@ -46,6 +46,19 @@
 	ss3		.req	v26
 	ss4		.req	v27
 
+	XL2		.req	v8
+	XM2		.req	v9
+	XH2		.req	v10
+	XL3		.req	v11
+	XM3		.req	v12
+	XH3		.req	v13
+	TT3		.req	v14
+	TT4		.req	v15
+	HH		.req	v16
+	HH3		.req	v17
+	HH4		.req	v18
+	HH34		.req	v19
+
 	.text
 	.arch		armv8-a+crypto
 
@@ -134,11 +147,25 @@
 	.endm
 
 	.macro		__pmull_pre_p64
+	add		x8, x3, #16
+	ld1		{HH.2d-HH4.2d}, [x8]
+
+	trn1		SHASH2.2d, SHASH.2d, HH.2d
+	trn2		T1.2d, SHASH.2d, HH.2d
+	eor		SHASH2.16b, SHASH2.16b, T1.16b
+
+	trn1		HH34.2d, HH3.2d, HH4.2d
+	trn2		T1.2d, HH3.2d, HH4.2d
+	eor		HH34.16b, HH34.16b, T1.16b
+
 	movi		MASK.16b, #0xe1
 	shl		MASK.2d, MASK.2d, #57
 	.endm
 
 	.macro		__pmull_pre_p8
+	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
+	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
+
 	// k00_16 := 0x0000000000000000_000000000000ffff
 	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
 	movi		k32_48.2d, #0xffffffff
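
The new __pmull_pre_p64 body assumes the hash key powers are stored back to back (H at [x3], then H^2..H^4 at [x3 + 16]) and pre-folds each power into the lo^hi form consumed by the Karatsuba middle multiply. A minimal C sketch of that precomputation, with illustrative names that are not the driver's:

#include <stdint.h>

/* Assumed layout implied by "add x8, x3, #16; ld1 {HH.2d-HH4.2d}, [x8]":
 * H first, then H^2, H^3, H^4 contiguously.  Field names are hypothetical. */
struct ghash_powers {
	uint64_t h[2];		/* H    -> SHASH */
	uint64_t h2[2];		/* H^2  -> HH    */
	uint64_t h3[2];		/* H^3  -> HH3   */
	uint64_t h4[2];		/* H^4  -> HH4   */
};

/* What each trn1/trn2/eor triplet computes: pack lo^hi of two key powers
 * into one vector, e.g. SHASH2 = { fold(H), fold(H^2) } and
 * HH34 = { fold(H^3), fold(H^4) }, ready for the (a1 + a0)(b1 + b0) pmull. */
static void fold_pair(const uint64_t a[2], const uint64_t b[2], uint64_t out[2])
{
	out[0] = a[0] ^ a[1];	/* element 0: trn1 ^ trn2 of the first power  */
	out[1] = b[0] ^ b[1];	/* element 1: trn1 ^ trn2 of the second power */
}
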
@@ -215,8 +242,6 @@
 	.macro		__pmull_ghash, pn
 	ld1		{SHASH.2d}, [x3]
 	ld1		{XL.2d}, [x1]
-	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
-	eor		SHASH2.16b, SHASH2.16b, SHASH.16b
 
 	__pmull_pre_\pn
 
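
SHASH2 and HH34 carry the precomputed (b1 + b0) halves for the Karatsuba split applied to every block in the hunks below: one 128x128 carry-less product is assembled from three 64x64 PMULLs. A standalone C model of that split, with clmul64 as a bit-serial stand-in for PMULL (a sketch, not kernel code):

#include <stdint.h>

typedef struct { uint64_t lo, hi; } u128;

/* Bit-serial 64x64 carry-less multiply; a slow stand-in for one PMULL. */
static u128 clmul64(uint64_t a, uint64_t b)
{
	u128 r = { 0, 0 };
	int i;

	for (i = 0; i < 64; i++)
		if ((b >> i) & 1) {
			r.lo ^= a << i;
			if (i)
				r.hi ^= a >> (64 - i);
		}
	return r;
}

/* 128x128 -> 256-bit carry-less product from three 64x64 multiplies:
 * the a1*b1 / a0*b0 / (a1+a0)(b1+b0) pattern of each pmull group. */
static void clmul128(uint64_t a_lo, uint64_t a_hi,
		     uint64_t b_lo, uint64_t b_hi, uint64_t out[4])
{
	u128 lo  = clmul64(a_lo, b_lo);			/* XL: a0 * b0 */
	u128 hi  = clmul64(a_hi, b_hi);			/* XH: a1 * b1 */
	u128 mid = clmul64(a_lo ^ a_hi, b_lo ^ b_hi);	/* XM           */

	mid.lo ^= lo.lo ^ hi.lo;			/* XM ^= XL ^ XH */
	mid.hi ^= lo.hi ^ hi.hi;

	out[0] = lo.lo;
	out[1] = lo.hi ^ mid.lo;			/* middle term at bit 64 */
	out[2] = hi.lo ^ mid.hi;
	out[3] = hi.hi;
}
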
@@ -224,12 +249,79 @@
 	cbz		x4, 0f
 	ld1		{T1.2d}, [x4]
 	mov		x4, xzr
-	b		1f
+	b		3f
+
+0:	.ifc		\pn, p64
+	tbnz		w0, #0, 2f		// skip until #blocks is a
+	tbnz		w0, #1, 2f		// round multiple of 4
+
+1:	ld1		{XM3.16b-TT4.16b}, [x2], #64
+
+	sub		w0, w0, #4
+
+	rev64		T1.16b, XM3.16b
+	rev64		T2.16b, XH3.16b
+	rev64		TT4.16b, TT4.16b
+	rev64		TT3.16b, TT3.16b
+
+	ext		IN1.16b, TT4.16b, TT4.16b, #8
+	ext		XL3.16b, TT3.16b, TT3.16b, #8
+
+	eor		TT4.16b, TT4.16b, IN1.16b
+	pmull2		XH2.1q, SHASH.2d, IN1.2d	// a1 * b1
+	pmull		XL2.1q, SHASH.1d, IN1.1d	// a0 * b0
+	pmull		XM2.1q, SHASH2.1d, TT4.1d	// (a1 + a0)(b1 + b0)
+
+	eor		TT3.16b, TT3.16b, XL3.16b
+	pmull2		XH3.1q, HH.2d, XL3.2d		// a1 * b1
+	pmull		XL3.1q, HH.1d, XL3.1d		// a0 * b0
+	pmull2		XM3.1q, SHASH2.2d, TT3.2d	// (a1 + a0)(b1 + b0)
+
+	ext		IN1.16b, T2.16b, T2.16b, #8
+	eor		XL2.16b, XL2.16b, XL3.16b
+	eor		XH2.16b, XH2.16b, XH3.16b
+	eor		XM2.16b, XM2.16b, XM3.16b
+
+	eor		T2.16b, T2.16b, IN1.16b
+	pmull2		XH3.1q, HH3.2d, IN1.2d		// a1 * b1
+	pmull		XL3.1q, HH3.1d, IN1.1d		// a0 * b0
+	pmull		XM3.1q, HH34.1d, T2.1d		// (a1 + a0)(b1 + b0)
 
-0:	ld1		{T1.2d}, [x2], #16
+	eor		XL2.16b, XL2.16b, XL3.16b
+	eor		XH2.16b, XH2.16b, XH3.16b
+	eor		XM2.16b, XM2.16b, XM3.16b
+
+	ext		IN1.16b, T1.16b, T1.16b, #8
+	ext		TT3.16b, XL.16b, XL.16b, #8
+	eor		XL.16b, XL.16b, IN1.16b
+	eor		T1.16b, T1.16b, TT3.16b
+
+	pmull2		XH.1q, HH4.2d, XL.2d		// a1 * b1
+	eor		T1.16b, T1.16b, XL.16b
+	pmull		XL.1q, HH4.1d, XL.1d		// a0 * b0
+	pmull2		XM.1q, HH34.2d, T1.2d		// (a1 + a0)(b1 + b0)
+
+	eor		XL.16b, XL.16b, XL2.16b
+	eor		XH.16b, XH.16b, XH2.16b
+	eor		XM.16b, XM.16b, XM2.16b
+
+	eor		T2.16b, XL.16b, XH.16b
+	ext		T1.16b, XL.16b, XH.16b, #8
+	eor		XM.16b, XM.16b, T2.16b
+
+	__pmull_reduce_p64
+
+	eor		T2.16b, T2.16b, XH.16b
+	eor		XL.16b, XL.16b, T2.16b
+
+	cbz		w0, 5f
+	b		1b
+	.endif
+
+2:	ld1		{T1.2d}, [x2], #16
 	sub		w0, w0, #1
 
-1:	/* multiply XL by SHASH in GF(2^128) */
+3:	/* multiply XL by SHASH in GF(2^128) */
 CPU_LE(	rev64		T1.16b, T1.16b	)
 
 	ext		T2.16b, XL.16b, XL.16b, #8
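
Algebraically, the new 1: loop is 4-way aggregation: four chained X = (X ^ B_i) * H updates are regrouped as (X ^ B0)*H^4 ^ B1*H^3 ^ B2*H^2 ^ B3*H, so the four products are independent and a single reduction covers four blocks. A bit-serial C reference model of that identity, assuming the usual GCM bit ordering (illustrative only, not the kernel's gf128 helpers):

#include <stdint.h>

typedef struct { uint64_t hi, lo; } be128m;	/* hi = first 8 bytes of the block */

/* Bit-serial GHASH multiply in SP 800-38D bit order; a slow reference only. */
static be128m gf128_mul(be128m x, be128m y)
{
	be128m z = { 0, 0 }, v = y;
	int i;

	for (i = 0; i < 128; i++) {
		uint64_t bit = i < 64 ? (x.hi >> (63 - i)) & 1
				      : (x.lo >> (127 - i)) & 1;
		uint64_t carry = v.lo & 1;

		if (bit) {
			z.hi ^= v.hi;
			z.lo ^= v.lo;
		}
		v.lo = (v.lo >> 1) | (v.hi << 63);	/* v = v * x ...          */
		v.hi >>= 1;
		if (carry)
			v.hi ^= 0xe100000000000000ULL;	/* ... mod the GCM poly   */
	}
	return z;
}

static be128m xor128(be128m a, be128m b)
{
	a.hi ^= b.hi;
	a.lo ^= b.lo;
	return a;
}

/* Four chained updates X = (X ^ B_i) * H regrouped into independent
 * products against H^4..H, reduced once, as the 1: loop does. */
static be128m ghash_update4(be128m x, const be128m b[4], const be128m h[4])
{						/* h[0] = H .. h[3] = H^4 */
	be128m acc = gf128_mul(xor128(x, b[0]), h[3]);	/* (X ^ B0) * H^4 */

	acc = xor128(acc, gf128_mul(b[1], h[2]));	/* ^ B1 * H^3 */
	acc = xor128(acc, gf128_mul(b[2], h[1]));	/* ^ B2 * H^2 */
	acc = xor128(acc, gf128_mul(b[3], h[0]));	/* ^ B3 * H   */
	return acc;
}
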
@@ -242,7 +334,7 @@ CPU_LE(	rev64		T1.16b, T1.16b	)
 	__pmull_\pn 	XL, XL, SHASH			// a0 * b0
 	__pmull_\pn 	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)
 
-	eor		T2.16b, XL.16b, XH.16b
+4:	eor		T2.16b, XL.16b, XH.16b
 	ext		T1.16b, XL.16b, XH.16b, #8
 	eor		XM.16b, XM.16b, T2.16b
 
@@ -253,7 +345,7 @@ CPU_LE(	rev64		T1.16b, T1.16b	)
 
 	cbnz		w0, 0b
 
-	st1		{XL.2d}, [x1]
+5:	st1		{XL.2d}, [x1]
 	ret
 	.endm
 
@@ -269,14 +361,10 @@ ENTRY(pmull_ghash_update_p8)
 	__pmull_ghash	p8
 ENDPROC(pmull_ghash_update_p8)
 
-	KS0		.req	v8
-	KS1		.req	v9
-	INP0		.req	v10
-	INP1		.req	v11
-	HH		.req	v12
-	XL2		.req	v13
-	XM2		.req	v14
-	XH2		.req	v15
+	KS0		.req	v12
+	KS1		.req	v13
+	INP0		.req	v14
+	INP1		.req	v15
 
 	.macro		load_round_keys, rounds, rk
 	cmp		\rounds, #12
@@ -310,8 +398,8 @@ ENDPROC(pmull_ghash_update_p8)
 	.endm
 
 	.macro		pmull_gcm_do_crypt, enc
-	ld1		{HH.2d}, [x4], #16
-	ld1		{SHASH.2d}, [x4]
+	ld1		{SHASH.2d}, [x4], #16
+	ld1		{HH.2d}, [x4]
 	ld1		{XL.2d}, [x1]
 	ldr		x8, [x5, #8]			// load lower counter