@@ -9,6 +9,7 @@
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
+#include <asm/ppc-opcode.h>

#define off8 r6
#define off16 r7
@@ -27,12 +28,73 @@
#define LH lhbrx
#define LW lwbrx
#define LD ldbrx
+#define LVS lvsr
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH lhzx
#define LW lwzx
#define LD ldx
+#define LVS lvsl
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRA,_VRB,_VRC
#endif

+#define VMX_THRESH 4096
+#define ENTER_VMX_OPS \
+	mflr r0; \
+	std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+	std r0,16(r1); \
+	stdu r1,-STACKFRAMESIZE(r1); \
+	bl enter_vmx_ops; \
+	cmpwi cr1,r3,0; \
+	ld r0,STACKFRAMESIZE+16(r1); \
+	ld r3,STK_REG(R31)(r1); \
+	ld r4,STK_REG(R30)(r1); \
+	ld r5,STK_REG(R29)(r1); \
+	addi r1,r1,STACKFRAMESIZE; \
+	mtlr r0
+
+#define EXIT_VMX_OPS \
+	mflr r0; \
+	std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+	std r0,16(r1); \
+	stdu r1,-STACKFRAMESIZE(r1); \
+	bl exit_vmx_ops; \
+	ld r0,STACKFRAMESIZE+16(r1); \
+	ld r3,STK_REG(R31)(r1); \
+	ld r4,STK_REG(R30)(r1); \
+	ld r5,STK_REG(R29)(r1); \
+	addi r1,r1,STACKFRAMESIZE; \
+	mtlr r0
+
+/*
+ * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned to
+ * a 16-byte boundary, and permutes the result with the 1st 16 bytes.
+
+ * | y y y y  y y y y  y y y y  y 0 1 2 | 3 4 5 6  7 8 9 a  b c d e  f z z z |
+ * ^                                    ^                                    ^
+ * 0xbbbb10                         0xbbbb20                          0xbbbb30
+ *                                ^
+ *                                _vaddr
+ *
+ *
+ * _vmask is the mask generated by LVS
+ * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded.
+ *   for example: 0xyyyyyyyyyyyyy012 for big endian
+ * _v2nd_qw is the 2nd aligned QW of the current _vaddr, to be loaded.
+ *   for example: 0x3456789abcdefzzz for big endian
+ * The permute result is saved in _v_res.
+ *   for example: 0x0123456789abcdef for big endian.
+ */
+#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
+	lvx _v2nd_qw,_vaddr,off16; \
+	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
+
/*
 * There are 2 categories for memcmp:
 * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
@@ -40,7 +102,7 @@
 * 2) src/dst has different offset to the 8 bytes boundary. The handlers
 * are named like .Ldiffoffset_xxxx
 */
-_GLOBAL(memcmp)
+_GLOBAL_TOC(memcmp)
	cmpdi cr1,r5,0

	/* Use the short loop if the src/dst addresses are not
@@ -132,7 +194,7 @@ _GLOBAL(memcmp)
	bgt cr6,.Llong

.Lcmp_lt32bytes:
-	/* compare 1 ~ 32 bytes, at least r3 addr is 8 bytes aligned now */
+	/* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */
	cmpdi cr5,r5,7
	srdi r0,r5,3
	ble cr5,.Lcmp_rest_lt8bytes
@@ -173,6 +235,15 @@ _GLOBAL(memcmp)
	blr

.Llong:
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	/* Try to use the vmx loop if length is equal to or greater than 4K */
+	cmpldi cr6,r5,VMX_THRESH
+	bge cr6,.Lsameoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Llong_novmx_cmp:
+#endif
	/* At least s1 addr is aligned with 8 bytes */
	li off8,8
	li off16,16
@@ -330,7 +401,97 @@ _GLOBAL(memcmp)
	li r3,-1
	blr

+#ifdef CONFIG_ALTIVEC
+.Lsameoffset_vmx_cmp:
+	/* Entered with src/dst addrs that have the same offset to the
+	 * 8-byte align boundary
+	 */
+	ENTER_VMX_OPS
+	beq cr1,.Llong_novmx_cmp
+
+3:
+	/* need to check whether r4 has the same offset as r3
+	 * to the 16-byte boundary.
+	 */
+	xor r0,r3,r4
+	andi. r0,r0,0xf
+	bne .Ldiffoffset_vmx_cmp_start
+
+	/* len is no less than 4KB. Need to further align with 16 bytes.
+	 */
+	andi. rA,r3,8
+	LD rA,0,r3
+	beq 4f
+	LD rB,0,r4
+	cmpld cr0,rA,rB
+	addi r3,r3,8
+	addi r4,r4,8
+	addi r5,r5,-8
+
+	beq cr0,4f
+	/* save and restore cr0 */
+	mfocrf r5,128
+	EXIT_VMX_OPS
+	mtocrf 128,r5
+	b .LcmpAB_lightweight
+
+4:
+	/* compare 32 bytes for each loop */
+	srdi r0,r5,5
+	mtctr r0
+	clrldi r5,r5,59
+	li off16,16
+
+.balign 16
+5:
+	lvx v0,0,r3
+	lvx v1,0,r4
+	VCMPEQUD_RC(v0,v0,v1)
+	bnl cr6,7f
+	lvx v0,off16,r3
+	lvx v1,off16,r4
+	VCMPEQUD_RC(v0,v0,v1)
+	bnl cr6,6f
+	addi r3,r3,32
+	addi r4,r4,32
+	bdnz 5b
+
+	EXIT_VMX_OPS
+	cmpdi r5,0
+	beq .Lzero
+	b .Lcmp_lt32bytes
+
+6:
+	addi r3,r3,16
+	addi r4,r4,16
+
+7:
+	/* diff the last 16 bytes */
+	EXIT_VMX_OPS
+	LD rA,0,r3
+	LD rB,0,r4
+	cmpld cr0,rA,rB
+	li off8,8
+	bne cr0,.LcmpAB_lightweight
+
+	LD rA,off8,r3
+	LD rB,off8,r4
+	cmpld cr0,rA,rB
+	bne cr0,.LcmpAB_lightweight
+	b .Lzero
+#endif
+
.Ldiffoffset_8bytes_make_align_start:
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	/* only do vmx ops when the size is equal to or greater than 4K bytes */
+	cmpdi cr5,r5,VMX_THRESH
+	bge cr5,.Ldiffoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+.Ldiffoffset_novmx_cmp:
+#endif
+
	/* now try to align s1 with 8 bytes */
	rlwinm r6,r3,3,26,28
	beq .Ldiffoffset_align_s1_8bytes
@@ -356,6 +517,82 @@ _GLOBAL(memcmp)
	/* now s1 is aligned with 8 bytes. */
	cmpdi cr5,r5,31
	ble cr5,.Lcmp_lt32bytes
+
+#ifdef CONFIG_ALTIVEC
+	b .Llong_novmx_cmp
+#else
	b .Llong
+#endif
+
+#ifdef CONFIG_ALTIVEC
+.Ldiffoffset_vmx_cmp:
+	ENTER_VMX_OPS
+	beq cr1,.Ldiffoffset_novmx_cmp
+
+.Ldiffoffset_vmx_cmp_start:
+	/* Firstly try to align r3 with 16 bytes */
+	andi. r6,r3,0xf
+	li off16,16
+	beq .Ldiffoffset_vmx_s1_16bytes_align
+
+	LVS v3,0,r3
+	LVS v4,0,r4
+
+	lvx v5,0,r3
+	lvx v6,0,r4
+	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+
+	VCMPEQUB_RC(v7,v9,v10)
+	bnl cr6,.Ldiffoffset_vmx_diff_found
+
+	subfic r6,r6,16
+	subf r5,r6,r5
+	add r3,r3,r6
+	add r4,r4,r6
+
+.Ldiffoffset_vmx_s1_16bytes_align:
+	/* now s1 is aligned with 16 bytes */
+	lvx v6,0,r4
+	LVS v4,0,r4
+	srdi r6,r5,5	/* loop for 32 bytes each */
+	clrldi r5,r5,59
+	mtctr r6
+
+.balign 16
+.Ldiffoffset_vmx_32bytesloop:
+	/* the first qw of r4 was saved in v6 */
+	lvx v9,0,r3
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+	VCMPEQUB_RC(v7,v9,v10)
+	vor v6,v8,v8
+	bnl cr6,.Ldiffoffset_vmx_diff_found
+
+	addi r3,r3,16
+	addi r4,r4,16
+
+	lvx v9,0,r3
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+	VCMPEQUB_RC(v7,v9,v10)
+	vor v6,v8,v8
+	bnl cr6,.Ldiffoffset_vmx_diff_found
+
+	addi r3,r3,16
+	addi r4,r4,16
+
+	bdnz .Ldiffoffset_vmx_32bytesloop
+
+	EXIT_VMX_OPS
+
+	cmpdi r5,0
+	beq .Lzero
+	b .Lcmp_lt32bytes
+
+.Ldiffoffset_vmx_diff_found:
+	EXIT_VMX_OPS
+	/* either way, the diff will appear in the next 16 bytes */
+	li r5,16
+	b .Lcmp_lt32bytes
+
+#endif
EXPORT_SYMBOL(memcmp)
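
As a reading aid, here is a rough, compilable C sketch of the dispatch this patch adds to memcmp(): lengths below VMX_THRESH stay on the existing scalar path, and longer buffers take the same-offset or different-offset VMX loop depending on whether the two pointers share their offset to a 16-byte boundary. The function and helper names below are illustrative stand-ins, not kernel symbols, and both "VMX" helpers are stubbed with a plain byte loop so the sketch stays self-contained.

#include <stddef.h>

#define VMX_THRESH 4096	/* only lengths >= 4K take the VMX path */

/* stands in for the existing scalar 8-bytes-per-iteration loop */
static int byte_cmp(const unsigned char *a, const unsigned char *b, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (a[i] != b[i])
			return a[i] < b[i] ? -1 : 1;
	return 0;
}

/* .Lsameoffset_vmx_cmp: both buffers share their offset to 16 bytes, so the
 * real code uses plain lvx + vcmpequd. and compares 32 bytes per iteration. */
static int vmx_cmp_sameoffset(const unsigned char *a, const unsigned char *b,
			      size_t n)
{
	return byte_cmp(a, b, n);	/* stub: see the assembly above */
}

/* .Ldiffoffset_vmx_cmp: s1 is aligned to 16 bytes first, then lvsr/lvsl plus
 * vperm reassemble the unaligned s2 stream before vcmpequb. */
static int vmx_cmp_diffoffset(const unsigned char *a, const unsigned char *b,
			      size_t n)
{
	return byte_cmp(a, b, n);	/* stub: see the assembly above */
}

int memcmp_sketch(const void *s1, const void *s2, size_t n)
{
	const unsigned char *a = s1, *b = s2;

	/* ENTER_VMX_OPS may refuse (enter_vmx_ops() returning 0); the real
	 * code then falls back to the scalar .L*_novmx_cmp paths. */
	if (n >= VMX_THRESH) {
		if ((((unsigned long)a ^ (unsigned long)b) & 0xf) == 0)
			return vmx_cmp_sameoffset(a, b, n);
		return vmx_cmp_diffoffset(a, b, n);
	}
	return byte_cmp(a, b, n);
}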