crypto: arm64/crct10dif - move literal data to .rodata section

Move the CRC-T10DIF literal data to the .rodata section where it is
safe from being exploited by speculative execution.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
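
Note: ldr_l and adr_l are assembler helpers defined in arch/arm64/include/asm/assembler.h. The plain adr and literal-form ldr instructions they replace can only reach targets within roughly +/-1 MB of the PC, which is what forced these constants to live in .text next to the code; the macro forms address the symbol through an adrp/:lo12: pair and can therefore reach data placed in .rodata. The following sketch illustrates the idea; it is not a verbatim copy of the kernel's definitions:

	// adr_l: compute the address of \sym via adrp/add, giving
	// +/- 4 GB of reach instead of adr's +/- 1 MB
	.macro	adr_l, dst, sym
	adrp	\dst, \sym			// address of the 4 KB page holding \sym
	add	\dst, \dst, :lo12:\sym		// add the offset within the page
	.endm

	// ldr_l: load \sym through a general-purpose scratch register.
	// The scratch operand matters in this patch: the destination q10
	// is a SIMD register and cannot serve as the base of the load.
	.macro	ldr_l, dst, sym, tmp
	adrp	\tmp, \sym			// page address of \sym
	ldr	\dst, [\tmp, :lo12:\sym]	// load from page base + offset
	.endm

This is why every converted load in the diff below gains x8 as a third operand: it is the scratch register used for the address computation.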
Ard Biesheuvel committed 7 years ago
commit 325f562d8f

1 changed file with 9 additions and 8 deletions:
  arch/arm64/crypto/crct10dif-ce-core.S (+9, -8)

@@ -128,7 +128,7 @@ CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
 	// XOR the initial_crc value
 	eor		v0.16b, v0.16b, v10.16b
 
-	ldr		q10, rk3	// xmm10 has rk3 and rk4
+	ldr_l		q10, rk3, x8	// xmm10 has rk3 and rk4
 					// type of pmull instruction
 					// will determine which constant to use
 
@@ -184,13 +184,13 @@ CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
 	// fold the 8 vector registers to 1 vector register with different
 	// constants
 
-	ldr		q10, rk9
+	ldr_l		q10, rk9, x8
 
 	.macro		fold16, reg, rk
 	pmull		v8.1q, \reg\().1d, v10.1d
 	pmull2		\reg\().1q, \reg\().2d, v10.2d
 	.ifnb		\rk
-	ldr		q10, \rk
+	ldr_l		q10, \rk, x8
 	.endif
 	eor		v7.16b, v7.16b, v8.16b
 	eor		v7.16b, v7.16b, \reg\().16b
@@ -251,7 +251,7 @@ CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
 
 	// get rid of the extra data that was loaded before
 	// load the shift constant
-	adr		x4, tbl_shf_table + 16
+	adr_l		x4, tbl_shf_table + 16
 	sub		x4, x4, arg3
 	ld1		{v0.16b}, [x4]
 
@@ -275,7 +275,7 @@ CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
 
 _128_done:
 	// compute crc of a 128-bit value
-	ldr		q10, rk5		// rk5 and rk6 in xmm10
+	ldr_l		q10, rk5, x8		// rk5 and rk6 in xmm10
 
 	// 64b fold
 	ext		v0.16b, vzr.16b, v7.16b, #8
@@ -291,7 +291,7 @@ _128_done:
 
 	// barrett reduction
 _barrett:
-	ldr		q10, rk7
+	ldr_l		q10, rk7, x8
 	mov		v0.d[0], v7.d[1]
 
 	pmull		v0.1q, v0.1d, v10.1d
@@ -321,7 +321,7 @@ CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
 	b.eq		_128_done		// exactly 16 left
 	b.lt		_less_than_16_left
 
-	ldr		q10, rk1		// rk1 and rk2 in xmm10
+	ldr_l		q10, rk1, x8		// rk1 and rk2 in xmm10
 
 	// update the counter. subtract 32 instead of 16 to save one
 	// instruction from the loop
@@ -333,7 +333,7 @@ CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
 
 _less_than_16_left:
 	// shl r9, 4
-	adr		x0, tbl_shf_table + 16
+	adr_l		x0, tbl_shf_table + 16
 	sub		x0, x0, arg3
 	ld1		{v0.16b}, [x0]
 	movi		v9.16b, #0x80
@@ -345,6 +345,7 @@ ENDPROC(crc_t10dif_pmull)
 // precomputed constants
 // these constants are precomputed from the poly:
 // 0x8bb70000 (0x8bb7 scaled to 32 bits)
+	.section	".rodata", "a"
 	.align		4
 // Q = 0x18BB70000
 // rk1 = 2^(32*3) mod Q << 32
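
With the constants in .rodata, each former one-instruction literal load becomes a two-instruction sequence. For illustration, the rk3 load from the first hunk would expand along these lines (hypothetical expansion; the assembler chooses the actual encodings):

	adrp	x8, rk3			// page address of rk3, now in .rodata
	ldr	q10, [x8, :lo12:rk3]	// load the rk3/rk4 constant pair into q10

One extra instruction per load is the cost of moving the data out of the executable .text section, which is what the commit message means by keeping it safe from speculative execution.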