8 жил өмнө · de696a2643
--- a/arch/powerpc/crypto/crc32-vpmsum_core.S
+++ b/arch/powerpc/crypto/crc32-vpmsum_core.S
@@ -0,0 +1,726 @@
 
				+/*
			
 
				+ * Core of the accelerated CRC algorithm.
			
 
				+ * In your file, define the constants and CRC_FUNCTION_NAME.
			
 
				+ * Then include this file.
			
 
				+ *
			
 
				+ * Calculate the checksum of data that is 16 byte aligned and a multiple of
			
 
				+ * 16 bytes.
			
 
				+ *
			
 
				+ * The first step is to reduce it to 1024 bits. We do this in 8 parallel
			
 
				+ * chunks in order to mask the latency of the vpmsum instructions. If we
			
 
				+ * have more than 32 kB of data to checksum we repeat this step multiple
			
 
				+ * times, passing in the previous 1024 bits.
			
 
				+ *
			
 
				+ * The next step is to reduce the 1024 bits to 64 bits. This step adds
			
 
				+ * 32 bits of 0s to the end - this matches what a CRC does. We just
			
 
				+ * calculate constants that land the data in this 32 bits.
			
 
				+ *
			
 
				+ * We then use fixed point Barrett reduction to compute a mod n over GF(2)
			
 
				+ * for n = CRC using POWER8 instructions. We use x = 32.
			
 
				+ *
			
 
				+ * http://en.wikipedia.org/wiki/Barrett_reduction
			
 
				+ *
			
 
				+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of the GNU General Public License
			
 
				+ * as published by the Free Software Foundation; either version
			
 
				+ * 2 of the License, or (at your option) any later version.
			
 
				+*/
			
 
				+
			
 
				+#include <asm/ppc_asm.h>
			
 
				+#include <asm/ppc-opcode.h>
			
 
				+
			
 
				+#define MAX_SIZE	32768
			
 
				+
			
 
				+	.text
			
 
				+
			
 
				+#if defined(__BIG_ENDIAN__)
			
 
				+#define BYTESWAP_DATA
			
 
				+#else
			
 
				+#undef BYTESWAP_DATA
			
 
				+#endif
			
 
				+
			
 
				+#define off16		r25
			
 
				+#define off32		r26
			
 
				+#define off48		r27
			
 
				+#define off64		r28
			
 
				+#define off80		r29
			
 
				+#define off96		r30
			
 
				+#define off112		r31
			
 
				+
			
 
				+#define const1		v24
			
 
				+#define const2		v25
			
 
				+
			
 
				+#define byteswap	v26
			
 
				+#define	mask_32bit	v27
			
 
				+#define	mask_64bit	v28
			
 
				+#define zeroes		v29
			
 
				+
			
 
				+#ifdef BYTESWAP_DATA
			
 
				+#define VPERM(A, B, C, D) vperm	A, B, C, D
			
 
				+#else
			
 
				+#define VPERM(A, B, C, D)
			
 
				+#endif
			
 
				+
			
 
				+/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
			
 
				+FUNC_START(CRC_FUNCTION_NAME)
			
 
				+	std	r31,-8(r1)
			
 
				+	std	r30,-16(r1)
			
 
				+	std	r29,-24(r1)
			
 
				+	std	r28,-32(r1)
			
 
				+	std	r27,-40(r1)
			
 
				+	std	r26,-48(r1)
			
 
				+	std	r25,-56(r1)
			
 
				+
			
 
				+	li	off16,16
			
 
				+	li	off32,32
			
 
				+	li	off48,48
			
 
				+	li	off64,64
			
 
				+	li	off80,80
			
 
				+	li	off96,96
			
 
				+	li	off112,112
			
 
				+	li	r0,0
			
 
				+
			
 
				+	/* Enough room for saving 10 non-volatile VMX registers */
			
 
				+	subi	r6,r1,56+10*16
			
 
				+	subi	r7,r1,56+2*16
			
 
				+
			
 
				+	stvx	v20,0,r6
			
 
				+	stvx	v21,off16,r6
			
 
				+	stvx	v22,off32,r6
			
 
				+	stvx	v23,off48,r6
			
 
				+	stvx	v24,off64,r6
			
 
				+	stvx	v25,off80,r6
			
 
				+	stvx	v26,off96,r6
			
 
				+	stvx	v27,off112,r6
			
 
				+	stvx	v28,0,r7
			
 
				+	stvx	v29,off16,r7
			
 
				+
			
 
				+	mr	r10,r3
			
 
				+
			
 
				+	vxor	zeroes,zeroes,zeroes
			
 
				+	vspltisw v0,-1
			
 
				+
			
 
				+	vsldoi	mask_32bit,zeroes,v0,4
			
 
				+	vsldoi	mask_64bit,zeroes,v0,8
			
 
				+
			
 
				+	/* Get the initial value into v8 */
			
 
				+	vxor	v8,v8,v8
			
 
				+	MTVRD(v8, R3)
			
 
				+	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
			
 
				+
			
 
				+#ifdef BYTESWAP_DATA
			
 
				+	addis	r3,r2,.byteswap_constant@toc@ha
			
 
				+	addi	r3,r3,.byteswap_constant@toc@l
			
 
				+
			
 
				+	lvx	byteswap,0,r3
			
 
				+	addi	r3,r3,16
			
 
				+#endif
			
 
				+
			
 
				+	cmpdi	r5,256
			
 
				+	blt	.Lshort
			
 
				+
			
 
				+	rldicr	r6,r5,0,56
			
 
				+
			
 
				+	/* Checksum in blocks of MAX_SIZE */
			
 
				+1:	lis	r7,MAX_SIZE@h
			
 
				+	ori	r7,r7,MAX_SIZE@l
			
 
				+	mr	r9,r7
			
 
				+	cmpd	r6,r7
			
 
				+	bgt	2f
			
 
				+	mr	r7,r6
			
 
				+2:	subf	r6,r7,r6
			
 
				+
			
 
				+	/* our main loop does 128 bytes at a time */
			
 
				+	srdi	r7,r7,7
			
 
				+
			
 
				+	/*
			
 
				+	 * Work out the offset into the constants table to start at. Each
			
 
				+	 * constant is 16 bytes, and it is used against 128 bytes of input
			
 
				+	 * data - 128 / 16 = 8
			
 
				+	 */
			
 
				+	sldi	r8,r7,4
			
 
				+	srdi	r9,r9,3
			
 
				+	subf	r8,r8,r9
			
 
				+
			
 
				+	/* We reduce our final 128 bytes in a separate step */
			
 
				+	addi	r7,r7,-1
			
 
				+	mtctr	r7
			
 
				+
			
 
				+	addis	r3,r2,.constants@toc@ha
			
 
				+	addi	r3,r3,.constants@toc@l
			
 
				+
			
 
				+	/* Find the start of our constants */
			
 
				+	add	r3,r3,r8
			
 
				+
			
 
				+	/* zero v0-v7 which will contain our checksums */
			
 
				+	vxor	v0,v0,v0
			
 
				+	vxor	v1,v1,v1
			
 
				+	vxor	v2,v2,v2
			
 
				+	vxor	v3,v3,v3
			
 
				+	vxor	v4,v4,v4
			
 
				+	vxor	v5,v5,v5
			
 
				+	vxor	v6,v6,v6
			
 
				+	vxor	v7,v7,v7
			
 
				+
			
 
				+	lvx	const1,0,r3
			
 
				+
			
 
				+	/*
			
 
				+	 * If we are looping back to consume more data we use the values
			
 
				+	 * already in v16-v23.
			
 
				+	 */
			
 
				+	cmpdi	r0,1
			
 
				+	beq	2f
			
 
				+
			
 
				+	/* First warm up pass */
			
 
				+	lvx	v16,0,r4
			
 
				+	lvx	v17,off16,r4
			
 
				+	VPERM(v16,v16,v16,byteswap)
			
 
				+	VPERM(v17,v17,v17,byteswap)
			
 
				+	lvx	v18,off32,r4
			
 
				+	lvx	v19,off48,r4
			
 
				+	VPERM(v18,v18,v18,byteswap)
			
 
				+	VPERM(v19,v19,v19,byteswap)
			
 
				+	lvx	v20,off64,r4
			
 
				+	lvx	v21,off80,r4
			
 
				+	VPERM(v20,v20,v20,byteswap)
			
 
				+	VPERM(v21,v21,v21,byteswap)
			
 
				+	lvx	v22,off96,r4
			
 
				+	lvx	v23,off112,r4
			
 
				+	VPERM(v22,v22,v22,byteswap)
			
 
				+	VPERM(v23,v23,v23,byteswap)
			
 
				+	addi	r4,r4,8*16
			
 
				+
			
 
				+	/* xor in initial value */
			
 
				+	vxor	v16,v16,v8
			
 
				+
			
 
				+2:	bdz	.Lfirst_warm_up_done
			
 
				+
			
 
				+	addi	r3,r3,16
			
 
				+	lvx	const2,0,r3
			
 
				+
			
 
				+	/* Second warm up pass */
			
 
				+	VPMSUMD(v8,v16,const1)
			
 
				+	lvx	v16,0,r4
			
 
				+	VPERM(v16,v16,v16,byteswap)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	VPMSUMD(v9,v17,const1)
			
 
				+	lvx	v17,off16,r4
			
 
				+	VPERM(v17,v17,v17,byteswap)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	VPMSUMD(v10,v18,const1)
			
 
				+	lvx	v18,off32,r4
			
 
				+	VPERM(v18,v18,v18,byteswap)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	VPMSUMD(v11,v19,const1)
			
 
				+	lvx	v19,off48,r4
			
 
				+	VPERM(v19,v19,v19,byteswap)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	VPMSUMD(v12,v20,const1)
			
 
				+	lvx	v20,off64,r4
			
 
				+	VPERM(v20,v20,v20,byteswap)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	VPMSUMD(v13,v21,const1)
			
 
				+	lvx	v21,off80,r4
			
 
				+	VPERM(v21,v21,v21,byteswap)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	VPMSUMD(v14,v22,const1)
			
 
				+	lvx	v22,off96,r4
			
 
				+	VPERM(v22,v22,v22,byteswap)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	VPMSUMD(v15,v23,const1)
			
 
				+	lvx	v23,off112,r4
			
 
				+	VPERM(v23,v23,v23,byteswap)
			
 
				+
			
 
				+	addi	r4,r4,8*16
			
 
				+
			
 
				+	bdz	.Lfirst_cool_down
			
 
				+
			
 
				+	/*
			
 
				+	 * main loop. We modulo schedule it such that it takes three iterations
			
 
				+	 * to complete - first iteration load, second iteration vpmsum, third
			
 
				+	 * iteration xor.
			
 
				+	 */
			
 
				+	.balign	16
			
 
				+4:	lvx	const1,0,r3
			
 
				+	addi	r3,r3,16
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	vxor	v0,v0,v8
			
 
				+	VPMSUMD(v8,v16,const2)
			
 
				+	lvx	v16,0,r4
			
 
				+	VPERM(v16,v16,v16,byteswap)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	vxor	v1,v1,v9
			
 
				+	VPMSUMD(v9,v17,const2)
			
 
				+	lvx	v17,off16,r4
			
 
				+	VPERM(v17,v17,v17,byteswap)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	vxor	v2,v2,v10
			
 
				+	VPMSUMD(v10,v18,const2)
			
 
				+	lvx	v18,off32,r4
			
 
				+	VPERM(v18,v18,v18,byteswap)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	vxor	v3,v3,v11
			
 
				+	VPMSUMD(v11,v19,const2)
			
 
				+	lvx	v19,off48,r4
			
 
				+	VPERM(v19,v19,v19,byteswap)
			
 
				+	lvx	const2,0,r3
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	vxor	v4,v4,v12
			
 
				+	VPMSUMD(v12,v20,const1)
			
 
				+	lvx	v20,off64,r4
			
 
				+	VPERM(v20,v20,v20,byteswap)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	vxor	v5,v5,v13
			
 
				+	VPMSUMD(v13,v21,const1)
			
 
				+	lvx	v21,off80,r4
			
 
				+	VPERM(v21,v21,v21,byteswap)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	vxor	v6,v6,v14
			
 
				+	VPMSUMD(v14,v22,const1)
			
 
				+	lvx	v22,off96,r4
			
 
				+	VPERM(v22,v22,v22,byteswap)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	vxor	v7,v7,v15
			
 
				+	VPMSUMD(v15,v23,const1)
			
 
				+	lvx	v23,off112,r4
			
 
				+	VPERM(v23,v23,v23,byteswap)
			
 
				+
			
 
				+	addi	r4,r4,8*16
			
 
				+
			
 
				+	bdnz	4b
			
 
				+
			
 
				+.Lfirst_cool_down:
			
 
				+	/* First cool down pass */
			
 
				+	lvx	const1,0,r3
			
 
				+	addi	r3,r3,16
			
 
				+
			
 
				+	vxor	v0,v0,v8
			
 
				+	VPMSUMD(v8,v16,const1)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	vxor	v1,v1,v9
			
 
				+	VPMSUMD(v9,v17,const1)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	vxor	v2,v2,v10
			
 
				+	VPMSUMD(v10,v18,const1)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	vxor	v3,v3,v11
			
 
				+	VPMSUMD(v11,v19,const1)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	vxor	v4,v4,v12
			
 
				+	VPMSUMD(v12,v20,const1)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	vxor	v5,v5,v13
			
 
				+	VPMSUMD(v13,v21,const1)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	vxor	v6,v6,v14
			
 
				+	VPMSUMD(v14,v22,const1)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+	vxor	v7,v7,v15
			
 
				+	VPMSUMD(v15,v23,const1)
			
 
				+	ori	r2,r2,0
			
 
				+
			
 
				+.Lsecond_cool_down:
			
 
				+	/* Second cool down pass */
			
 
				+	vxor	v0,v0,v8
			
 
				+	vxor	v1,v1,v9
			
 
				+	vxor	v2,v2,v10
			
 
				+	vxor	v3,v3,v11
			
 
				+	vxor	v4,v4,v12
			
 
				+	vxor	v5,v5,v13
			
 
				+	vxor	v6,v6,v14
			
 
				+	vxor	v7,v7,v15
			
 
				+
			
 
				+	/*
			
 
				+	 * vpmsumd produces a 96 bit result in the least significant bits
			
 
				+	 * of the register. Since we are bit reflected we have to shift it
			
 
				+	 * left 32 bits so it occupies the least significant bits in the
			
 
				+	 * bit reflected domain.
			
 
				+	 */
			
 
				+	vsldoi	v0,v0,zeroes,4
			
 
				+	vsldoi	v1,v1,zeroes,4
			
 
				+	vsldoi	v2,v2,zeroes,4
			
 
				+	vsldoi	v3,v3,zeroes,4
			
 
				+	vsldoi	v4,v4,zeroes,4
			
 
				+	vsldoi	v5,v5,zeroes,4
			
 
				+	vsldoi	v6,v6,zeroes,4
			
 
				+	vsldoi	v7,v7,zeroes,4
			
 
				+
			
 
				+	/* xor with last 1024 bits */
			
 
				+	lvx	v8,0,r4
			
 
				+	lvx	v9,off16,r4
			
 
				+	VPERM(v8,v8,v8,byteswap)
			
 
				+	VPERM(v9,v9,v9,byteswap)
			
 
				+	lvx	v10,off32,r4
			
 
				+	lvx	v11,off48,r4
			
 
				+	VPERM(v10,v10,v10,byteswap)
			
 
				+	VPERM(v11,v11,v11,byteswap)
			
 
				+	lvx	v12,off64,r4
			
 
				+	lvx	v13,off80,r4
			
 
				+	VPERM(v12,v12,v12,byteswap)
			
 
				+	VPERM(v13,v13,v13,byteswap)
			
 
				+	lvx	v14,off96,r4
			
 
				+	lvx	v15,off112,r4
			
 
				+	VPERM(v14,v14,v14,byteswap)
			
 
				+	VPERM(v15,v15,v15,byteswap)
			
 
				+
			
 
				+	addi	r4,r4,8*16
			
 
				+
			
 
				+	vxor	v16,v0,v8
			
 
				+	vxor	v17,v1,v9
			
 
				+	vxor	v18,v2,v10
			
 
				+	vxor	v19,v3,v11
			
 
				+	vxor	v20,v4,v12
			
 
				+	vxor	v21,v5,v13
			
 
				+	vxor	v22,v6,v14
			
 
				+	vxor	v23,v7,v15
			
 
				+
			
 
				+	li	r0,1
			
 
				+	cmpdi	r6,0
			
 
				+	addi	r6,r6,128
			
 
				+	bne	1b
			
 
				+
			
 
				+	/* Work out how many bytes we have left */
			
 
				+	andi.	r5,r5,127
			
 
				+
			
 
				+	/* Calculate where in the constant table we need to start */
			
 
				+	subfic	r6,r5,128
			
 
				+	add	r3,r3,r6
			
 
				+
			
 
				+	/* How many 16 byte chunks are in the tail */
			
 
				+	srdi	r7,r5,4
			
 
				+	mtctr	r7
			
 
				+
			
 
				+	/*
			
 
				+	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
			
 
				+	 * 32 bits to include the trailing 32 bits of zeros
			
 
				+	 */
			
 
				+	lvx	v0,0,r3
			
 
				+	lvx	v1,off16,r3
			
 
				+	lvx	v2,off32,r3
			
 
				+	lvx	v3,off48,r3
			
 
				+	lvx	v4,off64,r3
			
 
				+	lvx	v5,off80,r3
			
 
				+	lvx	v6,off96,r3
			
 
				+	lvx	v7,off112,r3
			
 
				+	addi	r3,r3,8*16
			
 
				+
			
 
				+	VPMSUMW(v0,v16,v0)
			
 
				+	VPMSUMW(v1,v17,v1)
			
 
				+	VPMSUMW(v2,v18,v2)
			
 
				+	VPMSUMW(v3,v19,v3)
			
 
				+	VPMSUMW(v4,v20,v4)
			
 
				+	VPMSUMW(v5,v21,v5)
			
 
				+	VPMSUMW(v6,v22,v6)
			
 
				+	VPMSUMW(v7,v23,v7)
			
 
				+
			
 
				+	/* Now reduce the tail (0 - 112 bytes) */
			
 
				+	cmpdi	r7,0
			
 
				+	beq	1f
			
 
				+
			
 
				+	lvx	v16,0,r4
			
 
				+	lvx	v17,0,r3
			
 
				+	VPERM(v16,v16,v16,byteswap)
			
 
				+	VPMSUMW(v16,v16,v17)
			
 
				+	vxor	v0,v0,v16
			
 
				+	bdz	1f
			
 
				+
			
 
				+	lvx	v16,off16,r4
			
 
				+	lvx	v17,off16,r3
			
 
				+	VPERM(v16,v16,v16,byteswap)
			
 
				+	VPMSUMW(v16,v16,v17)
			
 
				+	vxor	v0,v0,v16
			
 
				+	bdz	1f
			
 
				+
			
 
				+	lvx	v16,off32,r4
			
 
				+	lvx	v17,off32,r3
			
 
				+	VPERM(v16,v16,v16,byteswap)
			
 
				+	VPMSUMW(v16,v16,v17)
			
 
				+	vxor	v0,v0,v16
			
 
				+	bdz	1f
			
 
				+
			
 
				+	lvx	v16,off48,r4
			
 
				+	lvx	v17,off48,r3
			
 
				+	VPERM(v16,v16,v16,byteswap)
			
 
				+	VPMSUMW(v16,v16,v17)
			
 
				+	vxor	v0,v0,v16
			
 
				+	bdz	1f
			
 
				+
			
 
				+	lvx	v16,off64,r4
			
 
				+	lvx	v17,off64,r3
			
 
				+	VPERM(v16,v16,v16,byteswap)
			
 
				+	VPMSUMW(v16,v16,v17)
			
 
				+	vxor	v0,v0,v16
			
 
				+	bdz	1f
			
 
				+
			
 
				+	lvx	v16,off80,r4
			
 
				+	lvx	v17,off80,r3
			
 
				+	VPERM(v16,v16,v16,byteswap)
			
 
				+	VPMSUMW(v16,v16,v17)
			
 
				+	vxor	v0,v0,v16
			
 
				+	bdz	1f
			
 
				+
			
 
				+	lvx	v16,off96,r4
			
 
				+	lvx	v17,off96,r3
			
 
				+	VPERM(v16,v16,v16,byteswap)
			
 
				+	VPMSUMW(v16,v16,v17)
			
 
				+	vxor	v0,v0,v16
			
 
				+
			
 
				+	/* Now xor all the parallel chunks together */
			
 
				+1:	vxor	v0,v0,v1
			
 
				+	vxor	v2,v2,v3
			
 
				+	vxor	v4,v4,v5
			
 
				+	vxor	v6,v6,v7
			
 
				+
			
 
				+	vxor	v0,v0,v2
			
 
				+	vxor	v4,v4,v6
			
 
				+
			
 
				+	vxor	v0,v0,v4
			
 
				+
			
 
				+.Lbarrett_reduction:
			
 
				+	/* Barrett constants */
			
 
				+	addis	r3,r2,.barrett_constants@toc@ha
			
 
				+	addi	r3,r3,.barrett_constants@toc@l
			
 
				+
			
 
				+	lvx	const1,0,r3
			
 
				+	lvx	const2,off16,r3
			
 
				+
			
 
				+	vsldoi	v1,v0,v0,8
			
 
				+	vxor	v0,v0,v1		/* xor two 64 bit results together */
			
 
				+
			
 
				+	/* shift left one bit */
			
 
				+	vspltisb v1,1
			
 
				+	vsl	v0,v0,v1
			
 
				+
			
 
				+	vand	v0,v0,mask_64bit
			
 
				+
			
 
				+	/*
			
 
				+	 * The reflected version of Barrett reduction. Instead of bit
			
 
				+	 * reflecting our data (which is expensive to do), we bit reflect our
			
 
				+	 * constants and our algorithm, which means the intermediate data in
			
 
				+	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
			
 
				+	 * the algorithm because we don't carry in mod 2 arithmetic.
			
 
				+	 */
			
 
				+	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
			
 
				+	VPMSUMD(v1,v1,const1)		/* ma */
			
 
				+	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
			
 
				+	VPMSUMD(v1,v1,const2)		/* qn */
			
 
				+	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */
			
 
				+
			
 
				+	/*
			
 
				+	 * Since we are bit reflected, the result (ie the low 32 bits) is in
			
 
				+	 * the high 32 bits of the low doubleword. Shift it left 4 bytes:
			
 
				+	 * V0 [ 0 1 X 3 ]
			
 
				+	 * V0 [ 0 X 2 3 ]
			
 
				+	 */
			
 
				+	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of v0 */
			
 
				+
			
 
				+	/* Get it into r3 */
			
 
				+	MFVRD(R3, v0)
			
 
				+
			
 
				+.Lout:
			
 
				+	subi	r6,r1,56+10*16
			
 
				+	subi	r7,r1,56+2*16
			
 
				+
			
 
				+	lvx	v20,0,r6
			
 
				+	lvx	v21,off16,r6
			
 
				+	lvx	v22,off32,r6
			
 
				+	lvx	v23,off48,r6
			
 
				+	lvx	v24,off64,r6
			
 
				+	lvx	v25,off80,r6
			
 
				+	lvx	v26,off96,r6
			
 
				+	lvx	v27,off112,r6
			
 
				+	lvx	v28,0,r7
			
 
				+	lvx	v29,off16,r7
			
 
				+
			
 
				+	ld	r31,-8(r1)
			
 
				+	ld	r30,-16(r1)
			
 
				+	ld	r29,-24(r1)
			
 
				+	ld	r28,-32(r1)
			
 
				+	ld	r27,-40(r1)
			
 
				+	ld	r26,-48(r1)
			
 
				+	ld	r25,-56(r1)
			
 
				+
			
 
				+	blr
			
 
				+
			
 
				+.Lfirst_warm_up_done:
			
 
				+	lvx	const1,0,r3
			
 
				+	addi	r3,r3,16
			
 
				+
			
 
				+	VPMSUMD(v8,v16,const1)
			
 
				+	VPMSUMD(v9,v17,const1)
			
 
				+	VPMSUMD(v10,v18,const1)
			
 
				+	VPMSUMD(v11,v19,const1)
			
 
				+	VPMSUMD(v12,v20,const1)
			
 
				+	VPMSUMD(v13,v21,const1)
			
 
				+	VPMSUMD(v14,v22,const1)
			
 
				+	VPMSUMD(v15,v23,const1)
			
 
				+
			
 
				+	b	.Lsecond_cool_down
			
 
				+
			
 
				+.Lshort:
			
 
				+	cmpdi	r5,0
			
 
				+	beq	.Lzero
			
 
				+
			
 
				+	addis	r3,r2,.short_constants@toc@ha
			
 
				+	addi	r3,r3,.short_constants@toc@l
			
 
				+
			
 
				+	/* Calculate where in the constant table we need to start */
			
 
				+	subfic	r6,r5,256
			
 
				+	add	r3,r3,r6
			
 
				+
			
 
				+	/* How many 16 byte chunks? */
			
 
				+	srdi	r7,r5,4
			
 
				+	mtctr	r7
			
 
				+
			
 
				+	vxor	v19,v19,v19
			
 
				+	vxor	v20,v20,v20
			
 
				+
			
 
				+	lvx	v0,0,r4
			
 
				+	lvx	v16,0,r3
			
 
				+	VPERM(v0,v0,v16,byteswap)
			
 
				+	vxor	v0,v0,v8	/* xor in initial value */
			
 
				+	VPMSUMW(v0,v0,v16)
			
 
				+	bdz	.Lv0
			
 
				+
			
 
				+	lvx	v1,off16,r4
			
 
				+	lvx	v17,off16,r3
			
 
				+	VPERM(v1,v1,v17,byteswap)
			
 
				+	VPMSUMW(v1,v1,v17)
			
 
				+	bdz	.Lv1
			
 
				+
			
 
				+	lvx	v2,off32,r4
			
 
				+	lvx	v16,off32,r3
			
 
				+	VPERM(v2,v2,v16,byteswap)
			
 
				+	VPMSUMW(v2,v2,v16)
			
 
				+	bdz	.Lv2
			
 
				+
			
 
				+	lvx	v3,off48,r4
			
 
				+	lvx	v17,off48,r3
			
 
				+	VPERM(v3,v3,v17,byteswap)
			
 
				+	VPMSUMW(v3,v3,v17)
			
 
				+	bdz	.Lv3
			
 
				+
			
 
				+	lvx	v4,off64,r4
			
 
				+	lvx	v16,off64,r3
			
 
				+	VPERM(v4,v4,v16,byteswap)
			
 
				+	VPMSUMW(v4,v4,v16)
			
 
				+	bdz	.Lv4
			
 
				+
			
 
				+	lvx	v5,off80,r4
			
 
				+	lvx	v17,off80,r3
			
 
				+	VPERM(v5,v5,v17,byteswap)
			
 
				+	VPMSUMW(v5,v5,v17)
			
 
				+	bdz	.Lv5
			
 
				+
			
 
				+	lvx	v6,off96,r4
			
 
				+	lvx	v16,off96,r3
			
 
				+	VPERM(v6,v6,v16,byteswap)
			
 
				+	VPMSUMW(v6,v6,v16)
			
 
				+	bdz	.Lv6
			
 
				+
			
 
				+	lvx	v7,off112,r4
			
 
				+	lvx	v17,off112,r3
			
 
				+	VPERM(v7,v7,v17,byteswap)
			
 
				+	VPMSUMW(v7,v7,v17)
			
 
				+	bdz	.Lv7
			
 
				+
			
 
				+	addi	r3,r3,128
			
 
				+	addi	r4,r4,128
			
 
				+
			
 
				+	lvx	v8,0,r4
			
 
				+	lvx	v16,0,r3
			
 
				+	VPERM(v8,v8,v16,byteswap)
			
 
				+	VPMSUMW(v8,v8,v16)
			
 
				+	bdz	.Lv8
			
 
				+
			
 
				+	lvx	v9,off16,r4
			
 
				+	lvx	v17,off16,r3
			
 
				+	VPERM(v9,v9,v17,byteswap)
			
 
				+	VPMSUMW(v9,v9,v17)
			
 
				+	bdz	.Lv9
			
 
				+
			
 
				+	lvx	v10,off32,r4
			
 
				+	lvx	v16,off32,r3
			
 
				+	VPERM(v10,v10,v16,byteswap)
			
 
				+	VPMSUMW(v10,v10,v16)
			
 
				+	bdz	.Lv10
			
 
				+
			
 
				+	lvx	v11,off48,r4
			
 
				+	lvx	v17,off48,r3
			
 
				+	VPERM(v11,v11,v17,byteswap)
			
 
				+	VPMSUMW(v11,v11,v17)
			
 
				+	bdz	.Lv11
			
 
				+
			
 
				+	lvx	v12,off64,r4
			
 
				+	lvx	v16,off64,r3
			
 
				+	VPERM(v12,v12,v16,byteswap)
			
 
				+	VPMSUMW(v12,v12,v16)
			
 
				+	bdz	.Lv12
			
 
				+
			
 
				+	lvx	v13,off80,r4
			
 
				+	lvx	v17,off80,r3
			
 
				+	VPERM(v13,v13,v17,byteswap)
			
 
				+	VPMSUMW(v13,v13,v17)
			
 
				+	bdz	.Lv13
			
 
				+
			
 
				+	lvx	v14,off96,r4
			
 
				+	lvx	v16,off96,r3
			
 
				+	VPERM(v14,v14,v16,byteswap)
			
 
				+	VPMSUMW(v14,v14,v16)
			
 
				+	bdz	.Lv14
			
 
				+
			
 
				+	lvx	v15,off112,r4
			
 
				+	lvx	v17,off112,r3
			
 
				+	VPERM(v15,v15,v17,byteswap)
			
 
				+	VPMSUMW(v15,v15,v17)
			
 
				+
			
 
				+.Lv15:	vxor	v19,v19,v15
			
 
				+.Lv14:	vxor	v20,v20,v14
			
 
				+.Lv13:	vxor	v19,v19,v13
			
 
				+.Lv12:	vxor	v20,v20,v12
			
 
				+.Lv11:	vxor	v19,v19,v11
			
 
				+.Lv10:	vxor	v20,v20,v10
			
 
				+.Lv9:	vxor	v19,v19,v9
			
 
				+.Lv8:	vxor	v20,v20,v8
			
 
				+.Lv7:	vxor	v19,v19,v7
			
 
				+.Lv6:	vxor	v20,v20,v6
			
 
				+.Lv5:	vxor	v19,v19,v5
			
 
				+.Lv4:	vxor	v20,v20,v4
			
 
				+.Lv3:	vxor	v19,v19,v3
			
 
				+.Lv2:	vxor	v20,v20,v2
			
 
				+.Lv1:	vxor	v19,v19,v1
			
 
				+.Lv0:	vxor	v20,v20,v0
			
 
				+
			
 
				+	vxor	v0,v19,v20
			
 
				+
			
 
				+	b	.Lbarrett_reduction
			
 
				+
			
 
				+.Lzero:
			
 
				+	mr	r3,r10
			
 
				+	b	.Lout
			
 
				+
			
 
				+FUNC_END(CRC_FUNCTION_NAME)
			
--- a/arch/powerpc/crypto/crc32c-vpmsum_asm.S
+++ b/arch/powerpc/crypto/crc32c-vpmsum_asm.S
@@ -1,20 +1,5 @@
 
				 /*
			
 
				- * Calculate the checksum of data that is 16 byte aligned and a multiple of
			
 
				- * 16 bytes.
			
 
				- *
			
 
				- * The first step is to reduce it to 1024 bits. We do this in 8 parallel
			
 
				- * chunks in order to mask the latency of the vpmsum instructions. If we
			
 
				- * have more than 32 kB of data to checksum we repeat this step multiple
			
 
				- * times, passing in the previous 1024 bits.
			
 
				- *
			
 
				- * The next step is to reduce the 1024 bits to 64 bits. This step adds
			
 
				- * 32 bits of 0s to the end - this matches what a CRC does. We just
			
 
				- * calculate constants that land the data in this 32 bits.
			
 
				- *
			
 
				- * We then use fixed point Barrett reduction to compute a mod n over GF(2)
			
 
				- * for n = CRC using POWER8 instructions. We use x = 32.
			
 
				- *
			
 
				- * http://en.wikipedia.org/wiki/Barrett_reduction
			
 
				+ * Calculate a crc32c with vpmsum acceleration
			
 
				  *
			
 
				  * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
			
 
				  *
			
@@ -23,9 +8,6 @@
 
				  * as published by the Free Software Foundation; either version
			
 
				  * 2 of the License, or (at your option) any later version.
			
 
				  */
			
 
				-#include <asm/ppc_asm.h>
			
 
				-#include <asm/ppc-opcode.h>
			
 
				-
			
 
				 	.section	.rodata
			
 
				 .balign 16
			
 
				 
			
@@ -33,7 +15,6 @@
 
				 	/* byte reverse permute constant */
			
 
				 	.octa 0x0F0E0D0C0B0A09080706050403020100
			
 
				 
			
 
				-#define MAX_SIZE	32768
			
 
				 .constants:
			
 
				 
			
 
				 	/* Reduce 262144 kbits to 1024 bits */
			
@@ -860,694 +841,5 @@
 
				 	/* 33 bit reflected Barrett constant n */
			
 
				 	.octa 0x00000000000000000000000105ec76f1
			
 
				 
			
 
				-	.text
			
 
				-
			
 
				-#if defined(__BIG_ENDIAN__)
			
 
				-#define BYTESWAP_DATA
			
 
				-#else
			
 
				-#undef BYTESWAP_DATA
			
 
				-#endif
			
 
				-
			
 
				-#define off16		r25
			
 
				-#define off32		r26
			
 
				-#define off48		r27
			
 
				-#define off64		r28
			
 
				-#define off80		r29
			
 
				-#define off96		r30
			
 
				-#define off112		r31
			
 
				-
			
 
				-#define const1		v24
			
 
				-#define const2		v25
			
 
				-
			
 
				-#define byteswap	v26
			
 
				-#define	mask_32bit	v27
			
 
				-#define	mask_64bit	v28
			
 
				-#define zeroes		v29
			
 
				-
			
 
				-#ifdef BYTESWAP_DATA
			
 
				-#define VPERM(A, B, C, D) vperm	A, B, C, D
			
 
				-#else
			
 
				-#define VPERM(A, B, C, D)
			
 
				-#endif
			
 
				-
			
 
				-/* unsigned int __crc32c_vpmsum(unsigned int crc, void *p, unsigned long len) */
			
 
				-FUNC_START(__crc32c_vpmsum)
			
 
				-	std	r31,-8(r1)
			
 
				-	std	r30,-16(r1)
			
 
				-	std	r29,-24(r1)
			
 
				-	std	r28,-32(r1)
			
 
				-	std	r27,-40(r1)
			
 
				-	std	r26,-48(r1)
			
 
				-	std	r25,-56(r1)
			
 
				-
			
 
				-	li	off16,16
			
 
				-	li	off32,32
			
 
				-	li	off48,48
			
 
				-	li	off64,64
			
 
				-	li	off80,80
			
 
				-	li	off96,96
			
 
				-	li	off112,112
			
 
				-	li	r0,0
			
 
				-
			
 
				-	/* Enough room for saving 10 non volatile VMX registers */
			
 
				-	subi	r6,r1,56+10*16
			
 
				-	subi	r7,r1,56+2*16
			
 
				-
			
 
				-	stvx	v20,0,r6
			
 
				-	stvx	v21,off16,r6
			
 
				-	stvx	v22,off32,r6
			
 
				-	stvx	v23,off48,r6
			
 
				-	stvx	v24,off64,r6
			
 
				-	stvx	v25,off80,r6
			
 
				-	stvx	v26,off96,r6
			
 
				-	stvx	v27,off112,r6
			
 
				-	stvx	v28,0,r7
			
 
				-	stvx	v29,off16,r7
			
 
				-
			
 
				-	mr	r10,r3
			
 
				-
			
 
				-	vxor	zeroes,zeroes,zeroes
			
 
				-	vspltisw v0,-1
			
 
				-
			
 
				-	vsldoi	mask_32bit,zeroes,v0,4
			
 
				-	vsldoi	mask_64bit,zeroes,v0,8
			
 
				-
			
 
				-	/* Get the initial value into v8 */
			
 
				-	vxor	v8,v8,v8
			
 
				-	MTVRD(v8, R3)
			
 
				-	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
			
 
				-
			
 
				-#ifdef BYTESWAP_DATA
			
 
				-	addis	r3,r2,.byteswap_constant@toc@ha
			
 
				-	addi	r3,r3,.byteswap_constant@toc@l
			
 
				-
			
 
				-	lvx	byteswap,0,r3
			
 
				-	addi	r3,r3,16
			
 
				-#endif
			
 
				-
			
 
				-	cmpdi	r5,256
			
 
				-	blt	.Lshort
			
 
				-
			
 
				-	rldicr	r6,r5,0,56
			
 
				-
			
 
				-	/* Checksum in blocks of MAX_SIZE */
			
 
				-1:	lis	r7,MAX_SIZE@h
			
 
				-	ori	r7,r7,MAX_SIZE@l
			
 
				-	mr	r9,r7
			
 
				-	cmpd	r6,r7
			
 
				-	bgt	2f
			
 
				-	mr	r7,r6
			
 
				-2:	subf	r6,r7,r6
			
 
				-
			
 
				-	/* our main loop does 128 bytes at a time */
			
 
				-	srdi	r7,r7,7
			
 
				-
			
 
				-	/*
			
 
				-	 * Work out the offset into the constants table to start at. Each
			
 
				-	 * constant is 16 bytes, and it is used against 128 bytes of input
			
 
				-	 * data - 128 / 16 = 8
			
 
				-	 */
			
 
				-	sldi	r8,r7,4
			
 
				-	srdi	r9,r9,3
			
 
				-	subf	r8,r8,r9
			
 
				-
			
 
				-	/* We reduce our final 128 bytes in a separate step */
			
 
				-	addi	r7,r7,-1
			
 
				-	mtctr	r7
			
 
				-
			
 
				-	addis	r3,r2,.constants@toc@ha
			
 
				-	addi	r3,r3,.constants@toc@l
			
 
				-
			
 
				-	/* Find the start of our constants */
			
 
				-	add	r3,r3,r8
			
 
				-
			
 
				-	/* zero v0-v7 which will contain our checksums */
			
 
				-	vxor	v0,v0,v0
			
 
				-	vxor	v1,v1,v1
			
 
				-	vxor	v2,v2,v2
			
 
				-	vxor	v3,v3,v3
			
 
				-	vxor	v4,v4,v4
			
 
				-	vxor	v5,v5,v5
			
 
				-	vxor	v6,v6,v6
			
 
				-	vxor	v7,v7,v7
			
 
				-
			
 
				-	lvx	const1,0,r3
			
 
				-
			
 
				-	/*
			
 
				-	 * If we are looping back to consume more data we use the values
			
 
				-	 * already in v16-v23.
			
 
				-	 */
			
 
				-	cmpdi	r0,1
			
 
				-	beq	2f
			
 
				-
			
 
				-	/* First warm up pass */
			
 
				-	lvx	v16,0,r4
			
 
				-	lvx	v17,off16,r4
			
 
				-	VPERM(v16,v16,v16,byteswap)
			
 
				-	VPERM(v17,v17,v17,byteswap)
			
 
				-	lvx	v18,off32,r4
			
 
				-	lvx	v19,off48,r4
			
 
				-	VPERM(v18,v18,v18,byteswap)
			
 
				-	VPERM(v19,v19,v19,byteswap)
			
 
				-	lvx	v20,off64,r4
			
 
				-	lvx	v21,off80,r4
			
 
				-	VPERM(v20,v20,v20,byteswap)
			
 
				-	VPERM(v21,v21,v21,byteswap)
			
 
				-	lvx	v22,off96,r4
			
 
				-	lvx	v23,off112,r4
			
 
				-	VPERM(v22,v22,v22,byteswap)
			
 
				-	VPERM(v23,v23,v23,byteswap)
			
 
				-	addi	r4,r4,8*16
			
 
				-
			
 
				-	/* xor in initial value */
			
 
				-	vxor	v16,v16,v8
			
 
				-
			
 
				-2:	bdz	.Lfirst_warm_up_done
			
 
				-
			
 
				-	addi	r3,r3,16
			
 
				-	lvx	const2,0,r3
			
 
				-
			
 
				-	/* Second warm up pass */
			
 
				-	VPMSUMD(v8,v16,const1)
			
 
				-	lvx	v16,0,r4
			
 
				-	VPERM(v16,v16,v16,byteswap)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	VPMSUMD(v9,v17,const1)
			
 
				-	lvx	v17,off16,r4
			
 
				-	VPERM(v17,v17,v17,byteswap)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	VPMSUMD(v10,v18,const1)
			
 
				-	lvx	v18,off32,r4
			
 
				-	VPERM(v18,v18,v18,byteswap)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	VPMSUMD(v11,v19,const1)
			
 
				-	lvx	v19,off48,r4
			
 
				-	VPERM(v19,v19,v19,byteswap)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	VPMSUMD(v12,v20,const1)
			
 
				-	lvx	v20,off64,r4
			
 
				-	VPERM(v20,v20,v20,byteswap)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	VPMSUMD(v13,v21,const1)
			
 
				-	lvx	v21,off80,r4
			
 
				-	VPERM(v21,v21,v21,byteswap)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	VPMSUMD(v14,v22,const1)
			
 
				-	lvx	v22,off96,r4
			
 
				-	VPERM(v22,v22,v22,byteswap)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	VPMSUMD(v15,v23,const1)
			
 
				-	lvx	v23,off112,r4
			
 
				-	VPERM(v23,v23,v23,byteswap)
			
 
				-
			
 
				-	addi	r4,r4,8*16
			
 
				-
			
 
				-	bdz	.Lfirst_cool_down
			
 
				-
			
 
				-	/*
			
 
				-	 * main loop. We modulo schedule it such that it takes three iterations
			
 
				-	 * to complete - first iteration load, second iteration vpmsum, third
			
 
				-	 * iteration xor.
			
 
				-	 */
			
 
				-	.balign	16
			
 
				-4:	lvx	const1,0,r3
			
 
				-	addi	r3,r3,16
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	vxor	v0,v0,v8
			
 
				-	VPMSUMD(v8,v16,const2)
			
 
				-	lvx	v16,0,r4
			
 
				-	VPERM(v16,v16,v16,byteswap)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	vxor	v1,v1,v9
			
 
				-	VPMSUMD(v9,v17,const2)
			
 
				-	lvx	v17,off16,r4
			
 
				-	VPERM(v17,v17,v17,byteswap)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	vxor	v2,v2,v10
			
 
				-	VPMSUMD(v10,v18,const2)
			
 
				-	lvx	v18,off32,r4
			
 
				-	VPERM(v18,v18,v18,byteswap)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	vxor	v3,v3,v11
			
 
				-	VPMSUMD(v11,v19,const2)
			
 
				-	lvx	v19,off48,r4
			
 
				-	VPERM(v19,v19,v19,byteswap)
			
 
				-	lvx	const2,0,r3
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	vxor	v4,v4,v12
			
 
				-	VPMSUMD(v12,v20,const1)
			
 
				-	lvx	v20,off64,r4
			
 
				-	VPERM(v20,v20,v20,byteswap)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	vxor	v5,v5,v13
			
 
				-	VPMSUMD(v13,v21,const1)
			
 
				-	lvx	v21,off80,r4
			
 
				-	VPERM(v21,v21,v21,byteswap)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	vxor	v6,v6,v14
			
 
				-	VPMSUMD(v14,v22,const1)
			
 
				-	lvx	v22,off96,r4
			
 
				-	VPERM(v22,v22,v22,byteswap)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	vxor	v7,v7,v15
			
 
				-	VPMSUMD(v15,v23,const1)
			
 
				-	lvx	v23,off112,r4
			
 
				-	VPERM(v23,v23,v23,byteswap)
			
 
				-
			
 
				-	addi	r4,r4,8*16
			
 
				-
			
 
				-	bdnz	4b
			
 
				-
			
 
				-.Lfirst_cool_down:
			
 
				-	/* First cool down pass */
			
 
				-	lvx	const1,0,r3
			
 
				-	addi	r3,r3,16
			
 
				-
			
 
				-	vxor	v0,v0,v8
			
 
				-	VPMSUMD(v8,v16,const1)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	vxor	v1,v1,v9
			
 
				-	VPMSUMD(v9,v17,const1)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	vxor	v2,v2,v10
			
 
				-	VPMSUMD(v10,v18,const1)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	vxor	v3,v3,v11
			
 
				-	VPMSUMD(v11,v19,const1)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	vxor	v4,v4,v12
			
 
				-	VPMSUMD(v12,v20,const1)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	vxor	v5,v5,v13
			
 
				-	VPMSUMD(v13,v21,const1)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	vxor	v6,v6,v14
			
 
				-	VPMSUMD(v14,v22,const1)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-	vxor	v7,v7,v15
			
 
				-	VPMSUMD(v15,v23,const1)
			
 
				-	ori	r2,r2,0
			
 
				-
			
 
				-.Lsecond_cool_down:
			
 
				-	/* Second cool down pass */
			
 
				-	vxor	v0,v0,v8
			
 
				-	vxor	v1,v1,v9
			
 
				-	vxor	v2,v2,v10
			
 
				-	vxor	v3,v3,v11
			
 
				-	vxor	v4,v4,v12
			
 
				-	vxor	v5,v5,v13
			
 
				-	vxor	v6,v6,v14
			
 
				-	vxor	v7,v7,v15
			
 
				-
			
 
				-	/*
			
 
				-	 * vpmsumd produces a 96 bit result in the least significant bits
			
 
				-	 * of the register. Since we are bit reflected we have to shift it
			
 
				-	 * left 32 bits so it occupies the least significant bits in the
			
 
				-	 * bit reflected domain.
			
 
				-	 */
			
 
				-	vsldoi	v0,v0,zeroes,4
			
 
				-	vsldoi	v1,v1,zeroes,4
			
 
				-	vsldoi	v2,v2,zeroes,4
			
 
				-	vsldoi	v3,v3,zeroes,4
			
 
				-	vsldoi	v4,v4,zeroes,4
			
 
				-	vsldoi	v5,v5,zeroes,4
			
 
				-	vsldoi	v6,v6,zeroes,4
			
 
				-	vsldoi	v7,v7,zeroes,4
			
 
				-
			
 
				-	/* xor with last 1024 bits */
			
 
				-	lvx	v8,0,r4
			
 
				-	lvx	v9,off16,r4
			
 
				-	VPERM(v8,v8,v8,byteswap)
			
 
				-	VPERM(v9,v9,v9,byteswap)
			
 
				-	lvx	v10,off32,r4
			
 
				-	lvx	v11,off48,r4
			
 
				-	VPERM(v10,v10,v10,byteswap)
			
 
				-	VPERM(v11,v11,v11,byteswap)
			
 
				-	lvx	v12,off64,r4
			
 
				-	lvx	v13,off80,r4
			
 
				-	VPERM(v12,v12,v12,byteswap)
			
 
				-	VPERM(v13,v13,v13,byteswap)
			
 
				-	lvx	v14,off96,r4
			
 
				-	lvx	v15,off112,r4
			
 
				-	VPERM(v14,v14,v14,byteswap)
			
 
				-	VPERM(v15,v15,v15,byteswap)
			
 
				-
			
 
				-	addi	r4,r4,8*16
			
 
				-
			
 
				-	vxor	v16,v0,v8
			
 
				-	vxor	v17,v1,v9
			
 
				-	vxor	v18,v2,v10
			
 
				-	vxor	v19,v3,v11
			
 
				-	vxor	v20,v4,v12
			
 
				-	vxor	v21,v5,v13
			
 
				-	vxor	v22,v6,v14
			
 
				-	vxor	v23,v7,v15
			
 
				-
			
 
				-	li	r0,1
			
 
				-	cmpdi	r6,0
			
 
				-	addi	r6,r6,128
			
 
				-	bne	1b
			
 
				-
			
 
				-	/* Work out how many bytes we have left */
			
 
				-	andi.	r5,r5,127
			
 
				-
			
 
				-	/* Calculate where in the constant table we need to start */
			
 
				-	subfic	r6,r5,128
			
 
				-	add	r3,r3,r6
			
 
				-
			
 
				-	/* How many 16 byte chunks are in the tail */
			
 
				-	srdi	r7,r5,4
			
 
				-	mtctr	r7
			
 
				-
			
 
				-	/*
			
 
				-	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
			
 
				-	 * 32 bits to include the trailing 32 bits of zeros
			
 
				-	 */
			
 
				-	lvx	v0,0,r3
			
 
				-	lvx	v1,off16,r3
			
 
				-	lvx	v2,off32,r3
			
 
				-	lvx	v3,off48,r3
			
 
				-	lvx	v4,off64,r3
			
 
				-	lvx	v5,off80,r3
			
 
				-	lvx	v6,off96,r3
			
 
				-	lvx	v7,off112,r3
			
 
				-	addi	r3,r3,8*16
			
 
				-
			
 
				-	VPMSUMW(v0,v16,v0)
			
 
				-	VPMSUMW(v1,v17,v1)
			
 
				-	VPMSUMW(v2,v18,v2)
			
 
				-	VPMSUMW(v3,v19,v3)
			
 
				-	VPMSUMW(v4,v20,v4)
			
 
				-	VPMSUMW(v5,v21,v5)
			
 
				-	VPMSUMW(v6,v22,v6)
			
 
				-	VPMSUMW(v7,v23,v7)
			
 
				-
			
 
				-	/* Now reduce the tail (0 - 112 bytes) */
			
 
				-	cmpdi	r7,0
			
 
				-	beq	1f
			
 
				-
			
 
				-	lvx	v16,0,r4
			
 
				-	lvx	v17,0,r3
			
 
				-	VPERM(v16,v16,v16,byteswap)
			
 
				-	VPMSUMW(v16,v16,v17)
			
 
				-	vxor	v0,v0,v16
			
 
				-	bdz	1f
			
 
				-
			
 
				-	lvx	v16,off16,r4
			
 
				-	lvx	v17,off16,r3
			
 
				-	VPERM(v16,v16,v16,byteswap)
			
 
				-	VPMSUMW(v16,v16,v17)
			
 
				-	vxor	v0,v0,v16
			
 
				-	bdz	1f
			
 
				-
			
 
				-	lvx	v16,off32,r4
			
 
				-	lvx	v17,off32,r3
			
 
				-	VPERM(v16,v16,v16,byteswap)
			
 
				-	VPMSUMW(v16,v16,v17)
			
 
				-	vxor	v0,v0,v16
			
 
				-	bdz	1f
			
 
				-
			
 
				-	lvx	v16,off48,r4
			
 
				-	lvx	v17,off48,r3
			
 
				-	VPERM(v16,v16,v16,byteswap)
			
 
				-	VPMSUMW(v16,v16,v17)
			
 
				-	vxor	v0,v0,v16
			
 
				-	bdz	1f
			
 
				-
			
 
				-	lvx	v16,off64,r4
			
 
				-	lvx	v17,off64,r3
			
 
				-	VPERM(v16,v16,v16,byteswap)
			
 
				-	VPMSUMW(v16,v16,v17)
			
 
				-	vxor	v0,v0,v16
			
 
				-	bdz	1f
			
 
				-
			
 
				-	lvx	v16,off80,r4
			
 
				-	lvx	v17,off80,r3
			
 
				-	VPERM(v16,v16,v16,byteswap)
			
 
				-	VPMSUMW(v16,v16,v17)
			
 
				-	vxor	v0,v0,v16
			
 
				-	bdz	1f
			
 
				-
			
 
				-	lvx	v16,off96,r4
			
 
				-	lvx	v17,off96,r3
			
 
				-	VPERM(v16,v16,v16,byteswap)
			
 
				-	VPMSUMW(v16,v16,v17)
			
 
				-	vxor	v0,v0,v16
			
 
				-
			
 
				-	/* Now xor all the parallel chunks together */
			
 
				-1:	vxor	v0,v0,v1
			
 
				-	vxor	v2,v2,v3
			
 
				-	vxor	v4,v4,v5
			
 
				-	vxor	v6,v6,v7
			
 
				-
			
 
				-	vxor	v0,v0,v2
			
 
				-	vxor	v4,v4,v6
			
 
				-
			
 
				-	vxor	v0,v0,v4
			
 
				-
			
 
				-.Lbarrett_reduction:
			
 
				-	/* Barrett constants */
			
 
				-	addis	r3,r2,.barrett_constants@toc@ha
			
 
				-	addi	r3,r3,.barrett_constants@toc@l
			
 
				-
			
 
				-	lvx	const1,0,r3
			
 
				-	lvx	const2,off16,r3
			
 
				-
			
 
				-	vsldoi	v1,v0,v0,8
			
 
				-	vxor	v0,v0,v1		/* xor two 64 bit results together */
			
 
				-
			
 
				-	/* shift left one bit */
			
 
				-	vspltisb v1,1
			
 
				-	vsl	v0,v0,v1
			
 
				-
			
 
				-	vand	v0,v0,mask_64bit
			
 
				-
			
 
				-	/*
			
 
				-	 * The reflected version of Barrett reduction. Instead of bit
			
 
				-	 * reflecting our data (which is expensive to do), we bit reflect our
			
 
				-	 * constants and our algorithm, which means the intermediate data in
			
 
				-	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
			
 
				-	 * the algorithm because we don't carry in mod 2 arithmetic.
			
 
				-	 */
			
 
				-	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
			
 
				-	VPMSUMD(v1,v1,const1)		/* ma */
			
 
				-	vand	v1,v1,mask_32bit	/* bottom 32bits of ma */
			
 
				-	VPMSUMD(v1,v1,const2)		/* qn */
			
 
				-	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */
			
 
				-
			
 
				-	/*
			
 
				-	 * Since we are bit reflected, the result (ie the low 32 bits) is in
			
 
				-	 * the high 32 bits. We just need to shift it left 4 bytes
			
 
				-	 * V0 [ 0 1 X 3 ]
			
 
				-	 * V0 [ 0 X 2 3 ]
			
 
				-	 */
			
 
				-	vsldoi	v0,v0,zeroes,4		/* shift result into top 64 bits of v0 */
			
 
				-
			
 
				-	/* Get it into r3 */
			
 
				-	MFVRD(R3, v0)
			
 
				-
			
 
				-.Lout:
			
 
				-	subi	r6,r1,56+10*16
			
 
				-	subi	r7,r1,56+2*16
			
 
				-
			
 
				-	lvx	v20,0,r6
			
 
				-	lvx	v21,off16,r6
			
 
				-	lvx	v22,off32,r6
			
 
				-	lvx	v23,off48,r6
			
 
				-	lvx	v24,off64,r6
			
 
				-	lvx	v25,off80,r6
			
 
				-	lvx	v26,off96,r6
			
 
				-	lvx	v27,off112,r6
			
 
				-	lvx	v28,0,r7
			
 
				-	lvx	v29,off16,r7
			
 
				-
			
 
				-	ld	r31,-8(r1)
			
 
				-	ld	r30,-16(r1)
			
 
				-	ld	r29,-24(r1)
			
 
				-	ld	r28,-32(r1)
			
 
				-	ld	r27,-40(r1)
			
 
				-	ld	r26,-48(r1)
			
 
				-	ld	r25,-56(r1)
			
 
				-
			
 
				-	blr
			
 
				-
			
 
				-.Lfirst_warm_up_done:
			
 
				-	lvx	const1,0,r3
			
 
				-	addi	r3,r3,16
			
 
				-
			
 
				-	VPMSUMD(v8,v16,const1)
			
 
				-	VPMSUMD(v9,v17,const1)
			
 
				-	VPMSUMD(v10,v18,const1)
			
 
				-	VPMSUMD(v11,v19,const1)
			
 
				-	VPMSUMD(v12,v20,const1)
			
 
				-	VPMSUMD(v13,v21,const1)
			
 
				-	VPMSUMD(v14,v22,const1)
			
 
				-	VPMSUMD(v15,v23,const1)
			
 
				-
			
 
				-	b	.Lsecond_cool_down
			
 
				-
			
 
				-.Lshort:
			
 
				-	cmpdi	r5,0
			
 
				-	beq	.Lzero
			
 
				-
			
 
				-	addis	r3,r2,.short_constants@toc@ha
			
 
				-	addi	r3,r3,.short_constants@toc@l
			
 
				-
			
 
				-	/* Calculate where in the constant table we need to start */
			
 
				-	subfic	r6,r5,256
			
 
				-	add	r3,r3,r6
			
 
				-
			
 
				-	/* How many 16 byte chunks? */
			
 
				-	srdi	r7,r5,4
			
 
				-	mtctr	r7
			
 
				-
			
 
				-	vxor	v19,v19,v19
			
 
				-	vxor	v20,v20,v20
			
 
				-
			
 
				-	lvx	v0,0,r4
			
 
				-	lvx	v16,0,r3
			
 
				-	VPERM(v0,v0,v16,byteswap)
			
 
				-	vxor	v0,v0,v8	/* xor in initial value */
			
 
				-	VPMSUMW(v0,v0,v16)
			
 
				-	bdz	.Lv0
			
 
				-
			
 
				-	lvx	v1,off16,r4
			
 
				-	lvx	v17,off16,r3
			
 
				-	VPERM(v1,v1,v17,byteswap)
			
 
				-	VPMSUMW(v1,v1,v17)
			
 
				-	bdz	.Lv1
			
 
				-
			
 
				-	lvx	v2,off32,r4
			
 
				-	lvx	v16,off32,r3
			
 
				-	VPERM(v2,v2,v16,byteswap)
			
 
				-	VPMSUMW(v2,v2,v16)
			
 
				-	bdz	.Lv2
			
 
				-
			
 
				-	lvx	v3,off48,r4
			
 
				-	lvx	v17,off48,r3
			
 
				-	VPERM(v3,v3,v17,byteswap)
			
 
				-	VPMSUMW(v3,v3,v17)
			
 
				-	bdz	.Lv3
			
 
				-
			
 
				-	lvx	v4,off64,r4
			
 
				-	lvx	v16,off64,r3
			
 
				-	VPERM(v4,v4,v16,byteswap)
			
 
				-	VPMSUMW(v4,v4,v16)
			
 
				-	bdz	.Lv4
			
 
				-
			
 
				-	lvx	v5,off80,r4
			
 
				-	lvx	v17,off80,r3
			
 
				-	VPERM(v5,v5,v17,byteswap)
			
 
				-	VPMSUMW(v5,v5,v17)
			
 
				-	bdz	.Lv5
			
 
				-
			
 
				-	lvx	v6,off96,r4
			
 
				-	lvx	v16,off96,r3
			
 
				-	VPERM(v6,v6,v16,byteswap)
			
 
				-	VPMSUMW(v6,v6,v16)
			
 
				-	bdz	.Lv6
			
 
				-
			
 
				-	lvx	v7,off112,r4
			
 
				-	lvx	v17,off112,r3
			
 
				-	VPERM(v7,v7,v17,byteswap)
			
 
				-	VPMSUMW(v7,v7,v17)
			
 
				-	bdz	.Lv7
			
 
				-
			
 
				-	addi	r3,r3,128
			
 
				-	addi	r4,r4,128
			
 
				-
			
 
				-	lvx	v8,0,r4
			
 
				-	lvx	v16,0,r3
			
 
				-	VPERM(v8,v8,v16,byteswap)
			
 
				-	VPMSUMW(v8,v8,v16)
			
 
				-	bdz	.Lv8
			
 
				-
			
 
				-	lvx	v9,off16,r4
			
 
				-	lvx	v17,off16,r3
			
 
				-	VPERM(v9,v9,v17,byteswap)
			
 
				-	VPMSUMW(v9,v9,v17)
			
 
				-	bdz	.Lv9
			
 
				-
			
 
				-	lvx	v10,off32,r4
			
 
				-	lvx	v16,off32,r3
			
 
				-	VPERM(v10,v10,v16,byteswap)
			
 
				-	VPMSUMW(v10,v10,v16)
			
 
				-	bdz	.Lv10
			
 
				-
			
 
				-	lvx	v11,off48,r4
			
 
				-	lvx	v17,off48,r3
			
 
				-	VPERM(v11,v11,v17,byteswap)
			
 
				-	VPMSUMW(v11,v11,v17)
			
 
				-	bdz	.Lv11
			
 
				-
			
 
				-	lvx	v12,off64,r4
			
 
				-	lvx	v16,off64,r3
			
 
				-	VPERM(v12,v12,v16,byteswap)
			
 
				-	VPMSUMW(v12,v12,v16)
			
 
				-	bdz	.Lv12
			
 
				-
			
 
				-	lvx	v13,off80,r4
			
 
				-	lvx	v17,off80,r3
			
 
				-	VPERM(v13,v13,v17,byteswap)
			
 
				-	VPMSUMW(v13,v13,v17)
			
 
				-	bdz	.Lv13
			
 
				-
			
 
				-	lvx	v14,off96,r4
			
 
				-	lvx	v16,off96,r3
			
 
				-	VPERM(v14,v14,v16,byteswap)
			
 
				-	VPMSUMW(v14,v14,v16)
			
 
				-	bdz	.Lv14
			
 
				-
			
 
				-	lvx	v15,off112,r4
			
 
				-	lvx	v17,off112,r3
			
 
				-	VPERM(v15,v15,v17,byteswap)
			
 
				-	VPMSUMW(v15,v15,v17)
			
 
				-
			
 
				-.Lv15:	vxor	v19,v19,v15
			
 
				-.Lv14:	vxor	v20,v20,v14
			
 
				-.Lv13:	vxor	v19,v19,v13
			
 
				-.Lv12:	vxor	v20,v20,v12
			
 
				-.Lv11:	vxor	v19,v19,v11
			
 
				-.Lv10:	vxor	v20,v20,v10
			
 
				-.Lv9:	vxor	v19,v19,v9
			
 
				-.Lv8:	vxor	v20,v20,v8
			
 
				-.Lv7:	vxor	v19,v19,v7
			
 
				-.Lv6:	vxor	v20,v20,v6
			
 
				-.Lv5:	vxor	v19,v19,v5
			
 
				-.Lv4:	vxor	v20,v20,v4
			
 
				-.Lv3:	vxor	v19,v19,v3
			
 
				-.Lv2:	vxor	v20,v20,v2
			
 
				-.Lv1:	vxor	v19,v19,v1
			
 
				-.Lv0:	vxor	v20,v20,v0
			
 
				-
			
 
				-	vxor	v0,v19,v20
			
 
				-
			
 
				-	b	.Lbarrett_reduction
			
 
				-
			
 
				-.Lzero:
			
 
				-	mr	r3,r10
			
 
				-	b	.Lout
			
 
				-
			
 
				-FUNC_END(__crc32_vpmsum)
			
 
				+#define CRC_FUNCTION_NAME __crc32c_vpmsum
			
 
				+#include "crc32-vpmsum_core.S"