|
@@ -80,7 +80,46 @@
|
|
|
|
|
|
vzr .req v13
|
|
|
|
|
|
-ENTRY(crc_t10dif_pmull)
|
|
|
+ .macro fold64, p, reg1, reg2 // fold the next 32 input bytes at [arg2] into \reg1 and \reg2; \p picks the __pmull_\p multiply helper
|
|
|
+ ldp q11, q12, [arg2], #0x20 // q11/q12 = next 32 bytes of input; arg2 is post-incremented by 0x20
|
|
|
+
|
|
|
+ __pmull_\p v8, \reg1, v10, 2 // v8 = \reg1 x v10 in the "2" form (for p64: pmull2, upper 64-bit lanes); must run before \reg1 is overwritten below
|
|
|
+ __pmull_\p \reg1, \reg1, v10 // \reg1 = \reg1 x v10 in the plain form (for p64: pmull, lower 64-bit lanes)
|
|
|
+
|
|
|
+CPU_LE( rev64 v11.16b, v11.16b ) // reverse the bytes inside each 64-bit half of q11; CPU_LE is defined outside this view, presumably little-endian builds only
|
|
|
+CPU_LE( rev64 v12.16b, v12.16b ) // same byte reversal for q12
|
|
|
+
|
|
|
+ __pmull_\p v9, \reg2, v10, 2 // v9 = \reg2 x v10 in the "2" form, captured before \reg2 is overwritten
|
|
|
+ __pmull_\p \reg2, \reg2, v10 // \reg2 = \reg2 x v10 in the plain form
|
|
|
+
|
|
|
+CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 ) // swap the two 64-bit halves of q11; together with the rev64 above this reverses all 16 bytes
|
|
|
+CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) // same half swap for q12
|
|
|
+
|
|
|
+ eor \reg1\().16b, \reg1\().16b, v8.16b // \reg1 ^= its other product half (v8)
|
|
|
+ eor \reg2\().16b, \reg2\().16b, v9.16b // \reg2 ^= its other product half (v9)
|
|
|
+ eor \reg1\().16b, \reg1\().16b, v11.16b // \reg1 ^= first 16 bytes of the new input
|
|
|
+ eor \reg2\().16b, \reg2\().16b, v12.16b // \reg2 ^= second 16 bytes of the new input
|
|
|
+ .endm
|
|
|
+
|
|
|
+ .macro fold16, p, reg, rk // fold \reg into the accumulator v7 using the constants in v10; if \rk is given, q10 is reloaded from it for the next fold
|
|
|
+ __pmull_\p v8, \reg, v10 // v8 = \reg x v10 in the plain form (for p64: pmull, lower 64-bit lanes)
|
|
|
+ __pmull_\p \reg, \reg, v10, 2 // \reg = \reg x v10 in the "2" form (for p64: pmull2, upper 64-bit lanes)
|
|
|
+ .ifnb \rk
|
|
|
+ ldr_l q10, \rk, x8 // emitted only when \rk is non-blank; placed after both multiplies so they still see the old q10. x8 is presumably ldr_l's scratch register -- TODO confirm
|
|
|
+ .endif
|
|
|
+ eor v7.16b, v7.16b, v8.16b // v7 ^= first product
|
|
|
+ eor v7.16b, v7.16b, \reg\().16b // v7 ^= second product
|
|
|
+ .endm
|
|
|
+
|
|
|
+ .macro __pmull_p64, rd, rn, rm, n // \rd.1q = polynomial product of one 64-bit lane of \rn and \rm; \n selects the lane
|
|
|
+ .ifb \n
|
|
|
+ pmull \rd\().1q, \rn\().1d, \rm\().1d // \n omitted: multiply the lower 64-bit lanes
|
|
|
+ .else
|
|
|
+ pmull2 \rd\().1q, \rn\().2d, \rm\().2d // \n supplied (callers in this file pass 2): multiply the upper 64-bit lanes
|
|
|
+ .endif
|
|
|
+ .endm
|
|
|
+
|
|
|
+ .macro crc_t10dif_pmull, p
|
|
|
frame_push 3, 128
|
|
|
|
|
|
mov arg1_low32, w0
|
|
@@ -96,7 +135,7 @@ ENTRY(crc_t10dif_pmull)
|
|
|
cmp arg3, #256
|
|
|
|
|
|
// for sizes less than 128, we can't fold 64B at a time...
|
|
|
- b.lt _less_than_128
|
|
|
+ b.lt .L_less_than_128_\@
|
|
|
|
|
|
// load the initial crc value
|
|
|
// crc value does not need to be byte-reflected, but it needs
|
|
@@ -147,41 +186,19 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
|
|
|
// buffer. The _fold_64_B_loop will fold 64B at a time
|
|
|
// until we have 64+y Bytes of buffer
|
|
|
|
|
|
-
|
|
|
// fold 64B at a time. This section of the code folds 4 vector
|
|
|
// registers in parallel
|
|
|
-_fold_64_B_loop:
|
|
|
-
|
|
|
- .macro fold64, reg1, reg2
|
|
|
- ldp q11, q12, [arg2], #0x20
|
|
|
-
|
|
|
- pmull2 v8.1q, \reg1\().2d, v10.2d
|
|
|
- pmull \reg1\().1q, \reg1\().1d, v10.1d
|
|
|
-
|
|
|
-CPU_LE( rev64 v11.16b, v11.16b )
|
|
|
-CPU_LE( rev64 v12.16b, v12.16b )
|
|
|
-
|
|
|
- pmull2 v9.1q, \reg2\().2d, v10.2d
|
|
|
- pmull \reg2\().1q, \reg2\().1d, v10.1d
|
|
|
-
|
|
|
-CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
|
|
|
-CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
|
|
|
-
|
|
|
- eor \reg1\().16b, \reg1\().16b, v8.16b
|
|
|
- eor \reg2\().16b, \reg2\().16b, v9.16b
|
|
|
- eor \reg1\().16b, \reg1\().16b, v11.16b
|
|
|
- eor \reg2\().16b, \reg2\().16b, v12.16b
|
|
|
- .endm
|
|
|
+.L_fold_64_B_loop_\@:
|
|
|
|
|
|
- fold64 v0, v1
|
|
|
- fold64 v2, v3
|
|
|
- fold64 v4, v5
|
|
|
- fold64 v6, v7
|
|
|
+ fold64 \p, v0, v1
|
|
|
+ fold64 \p, v2, v3
|
|
|
+ fold64 \p, v4, v5
|
|
|
+ fold64 \p, v6, v7
|
|
|
|
|
|
subs arg3, arg3, #128
|
|
|
|
|
|
// check if there is another 64B in the buffer to be able to fold
|
|
|
- b.lt _fold_64_B_end
|
|
|
+ b.lt .L_fold_64_B_end_\@
|
|
|
|
|
|
if_will_cond_yield_neon
|
|
|
stp q0, q1, [sp, #.Lframe_local_offset]
|
|
@@ -197,9 +214,9 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
|
|
|
movi vzr.16b, #0 // init zero register
|
|
|
endif_yield_neon
|
|
|
|
|
|
- b _fold_64_B_loop
|
|
|
+ b .L_fold_64_B_loop_\@
|
|
|
|
|
|
-_fold_64_B_end:
|
|
|
+.L_fold_64_B_end_\@:
|
|
|
// at this point, the buffer pointer is pointing at the last y Bytes
|
|
|
// of the buffer the 64B of folded data is in 4 of the vector
|
|
|
// registers: v0, v1, v2, v3
|
|
@@ -209,37 +226,27 @@ _fold_64_B_end:
|
|
|
|
|
|
ldr_l q10, rk9, x8
|
|
|
|
|
|
- .macro fold16, reg, rk
|
|
|
- pmull v8.1q, \reg\().1d, v10.1d
|
|
|
- pmull2 \reg\().1q, \reg\().2d, v10.2d
|
|
|
- .ifnb \rk
|
|
|
- ldr_l q10, \rk, x8
|
|
|
- .endif
|
|
|
- eor v7.16b, v7.16b, v8.16b
|
|
|
- eor v7.16b, v7.16b, \reg\().16b
|
|
|
- .endm
|
|
|
-
|
|
|
- fold16 v0, rk11
|
|
|
- fold16 v1, rk13
|
|
|
- fold16 v2, rk15
|
|
|
- fold16 v3, rk17
|
|
|
- fold16 v4, rk19
|
|
|
- fold16 v5, rk1
|
|
|
- fold16 v6
|
|
|
+ fold16 \p, v0, rk11
|
|
|
+ fold16 \p, v1, rk13
|
|
|
+ fold16 \p, v2, rk15
|
|
|
+ fold16 \p, v3, rk17
|
|
|
+ fold16 \p, v4, rk19
|
|
|
+ fold16 \p, v5, rk1
|
|
|
+ fold16 \p, v6
|
|
|
|
|
|
// instead of 64, we add 48 to the loop counter to save 1 instruction
|
|
|
// from the loop instead of a cmp instruction, we use the negative
|
|
|
// flag with the jl instruction
|
|
|
adds arg3, arg3, #(128-16)
|
|
|
- b.lt _final_reduction_for_128
|
|
|
+ b.lt .L_final_reduction_for_128_\@
|
|
|
|
|
|
// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
|
|
|
// and the rest is in memory. We can fold 16 bytes at a time if y>=16
|
|
|
// continue folding 16B at a time
|
|
|
|
|
|
-_16B_reduction_loop:
|
|
|
- pmull v8.1q, v7.1d, v10.1d
|
|
|
- pmull2 v7.1q, v7.2d, v10.2d
|
|
|
+.L_16B_reduction_loop_\@:
|
|
|
+ __pmull_\p v8, v7, v10
|
|
|
+ __pmull_\p v7, v7, v10, 2
|
|
|
eor v7.16b, v7.16b, v8.16b
|
|
|
|
|
|
ldr q0, [arg2], #16
|
|
@@ -251,22 +258,22 @@ CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
|
|
|
// instead of a cmp instruction, we utilize the flags with the
|
|
|
// jge instruction equivalent of: cmp arg3, 16-16
|
|
|
// check if there is any more 16B in the buffer to be able to fold
|
|
|
- b.ge _16B_reduction_loop
|
|
|
+ b.ge .L_16B_reduction_loop_\@
|
|
|
|
|
|
// now we have 16+z bytes left to reduce, where 0<= z < 16.
|
|
|
// first, we reduce the data in the xmm7 register
|
|
|
|
|
|
-_final_reduction_for_128:
|
|
|
+.L_final_reduction_for_128_\@:
|
|
|
// check if any more data to fold. If not, compute the CRC of
|
|
|
// the final 128 bits
|
|
|
adds arg3, arg3, #16
|
|
|
- b.eq _128_done
|
|
|
+ b.eq .L_128_done_\@
|
|
|
|
|
|
// here we are getting data that is less than 16 bytes.
|
|
|
// since we know that there was data before the pointer, we can
|
|
|
// offset the input pointer before the actual point, to receive
|
|
|
// exactly 16 bytes. after that the registers need to be adjusted.
|
|
|
-_get_last_two_regs:
|
|
|
+.L_get_last_two_regs_\@:
|
|
|
add arg2, arg2, arg3
|
|
|
ldr q1, [arg2, #-16]
|
|
|
CPU_LE( rev64 v1.16b, v1.16b )
|
|
@@ -291,47 +298,46 @@ CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
|
|
|
bsl v0.16b, v2.16b, v1.16b
|
|
|
|
|
|
// fold 16 Bytes
|
|
|
- pmull v8.1q, v7.1d, v10.1d
|
|
|
- pmull2 v7.1q, v7.2d, v10.2d
|
|
|
+ __pmull_\p v8, v7, v10
|
|
|
+ __pmull_\p v7, v7, v10, 2
|
|
|
eor v7.16b, v7.16b, v8.16b
|
|
|
eor v7.16b, v7.16b, v0.16b
|
|
|
|
|
|
-_128_done:
|
|
|
+.L_128_done_\@:
|
|
|
// compute crc of a 128-bit value
|
|
|
ldr_l q10, rk5, x8 // rk5 and rk6 in xmm10
|
|
|
|
|
|
// 64b fold
|
|
|
ext v0.16b, vzr.16b, v7.16b, #8
|
|
|
mov v7.d[0], v7.d[1]
|
|
|
- pmull v7.1q, v7.1d, v10.1d
|
|
|
+ __pmull_\p v7, v7, v10
|
|
|
eor v7.16b, v7.16b, v0.16b
|
|
|
|
|
|
// 32b fold
|
|
|
ext v0.16b, v7.16b, vzr.16b, #4
|
|
|
mov v7.s[3], vzr.s[0]
|
|
|
- pmull2 v0.1q, v0.2d, v10.2d
|
|
|
+ __pmull_\p v0, v0, v10, 2
|
|
|
eor v7.16b, v7.16b, v0.16b
|
|
|
|
|
|
// barrett reduction
|
|
|
-_barrett:
|
|
|
ldr_l q10, rk7, x8
|
|
|
mov v0.d[0], v7.d[1]
|
|
|
|
|
|
- pmull v0.1q, v0.1d, v10.1d
|
|
|
+ __pmull_\p v0, v0, v10
|
|
|
ext v0.16b, vzr.16b, v0.16b, #12
|
|
|
- pmull2 v0.1q, v0.2d, v10.2d
|
|
|
+ __pmull_\p v0, v0, v10, 2
|
|
|
ext v0.16b, vzr.16b, v0.16b, #12
|
|
|
eor v7.16b, v7.16b, v0.16b
|
|
|
mov w0, v7.s[1]
|
|
|
|
|
|
-_cleanup:
|
|
|
+.L_cleanup_\@:
|
|
|
// scale the result back to 16 bits
|
|
|
lsr x0, x0, #16
|
|
|
frame_pop
|
|
|
ret
|
|
|
|
|
|
-_less_than_128:
|
|
|
- cbz arg3, _cleanup
|
|
|
+.L_less_than_128_\@:
|
|
|
+ cbz arg3, .L_cleanup_\@
|
|
|
|
|
|
movi v0.16b, #0
|
|
|
mov v0.s[3], arg1_low32 // get the initial crc value
|
|
@@ -342,20 +348,20 @@ CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
|
|
|
eor v7.16b, v7.16b, v0.16b // xor the initial crc value
|
|
|
|
|
|
cmp arg3, #16
|
|
|
- b.eq _128_done // exactly 16 left
|
|
|
- b.lt _less_than_16_left
|
|
|
+ b.eq .L_128_done_\@ // exactly 16 left
|
|
|
+ b.lt .L_less_than_16_left_\@
|
|
|
|
|
|
ldr_l q10, rk1, x8 // rk1 and rk2 in xmm10
|
|
|
|
|
|
// update the counter. subtract 32 instead of 16 to save one
|
|
|
// instruction from the loop
|
|
|
subs arg3, arg3, #32
|
|
|
- b.ge _16B_reduction_loop
|
|
|
+ b.ge .L_16B_reduction_loop_\@
|
|
|
|
|
|
add arg3, arg3, #16
|
|
|
- b _get_last_two_regs
|
|
|
+ b .L_get_last_two_regs_\@
|
|
|
|
|
|
-_less_than_16_left:
|
|
|
+.L_less_than_16_left_\@:
|
|
|
// shl r9, 4
|
|
|
adr_l x0, tbl_shf_table + 16
|
|
|
sub x0, x0, arg3
|
|
@@ -363,8 +369,12 @@ _less_than_16_left:
|
|
|
movi v9.16b, #0x80
|
|
|
eor v0.16b, v0.16b, v9.16b
|
|
|
tbl v7.16b, {v7.16b}, v0.16b
|
|
|
- b _128_done
|
|
|
-ENDPROC(crc_t10dif_pmull)
|
|
|
+ b .L_128_done_\@
|
|
|
+ .endm
|
|
|
+
|
|
|
+ENTRY(crc_t10dif_pmull_p64)
|
|
|
+ crc_t10dif_pmull p64 // expand the crc_t10dif_pmull macro body with \p = p64, so every __pmull_\p inside it resolves to __pmull_p64
|
|
|
+ENDPROC(crc_t10dif_pmull_p64)
|
|
|
|
|
|
// precomputed constants
|
|
|
// these constants are precomputed from the poly:
|