|
@@ -1,20 +1,5 @@
|
|
|
/*
|
|
|
- * Calculate the checksum of data that is 16 byte aligned and a multiple of
|
|
|
- * 16 bytes.
|
|
|
- *
|
|
|
- * The first step is to reduce it to 1024 bits. We do this in 8 parallel
|
|
|
- * chunks in order to mask the latency of the vpmsum instructions. If we
|
|
|
- * have more than 32 kB of data to checksum we repeat this step multiple
|
|
|
- * times, passing in the previous 1024 bits.
|
|
|
- *
|
|
|
- * The next step is to reduce the 1024 bits to 64 bits. This step adds
|
|
|
- * 32 bits of 0s to the end - this matches what a CRC does. We just
|
|
|
- * calculate constants that land the data in this 32 bits.
|
|
|
- *
|
|
|
- * We then use fixed point Barrett reduction to compute a mod n over GF(2)
|
|
|
- * for n = CRC using POWER8 instructions. We use x = 32.
|
|
|
- *
|
|
|
- * http://en.wikipedia.org/wiki/Barrett_reduction
|
|
|
+ * Calculate a crc32c with vpmsum acceleration
|
|
|
*
|
|
|
* Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
|
|
|
*
|
|
@@ -23,9 +8,6 @@
|
|
|
* as published by the Free Software Foundation; either version
|
|
|
* 2 of the License, or (at your option) any later version.
|
|
|
*/
|
|
|
-#include <asm/ppc_asm.h>
|
|
|
-#include <asm/ppc-opcode.h>
|
|
|
-
|
|
|
.section .rodata
|
|
|
.balign 16
|
|
|
|
|
@@ -33,7 +15,6 @@
|
|
|
/* byte reverse permute constant */
|
|
|
.octa 0x0F0E0D0C0B0A09080706050403020100
|
|
|
|
|
|
-#define MAX_SIZE 32768
|
|
|
.constants:
|
|
|
|
|
|
/* Reduce 262144 kbits to 1024 bits */
|
|
@@ -860,694 +841,5 @@
|
|
|
/* 33 bit reflected Barrett constant n */
|
|
|
.octa 0x00000000000000000000000105ec76f1
|
|
|
|
|
|
- .text
|
|
|
-
|
|
|
-#if defined(__BIG_ENDIAN__)
|
|
|
-#define BYTESWAP_DATA
|
|
|
-#else
|
|
|
-#undef BYTESWAP_DATA
|
|
|
-#endif
|
|
|
-
|
|
|
-#define off16 r25
|
|
|
-#define off32 r26
|
|
|
-#define off48 r27
|
|
|
-#define off64 r28
|
|
|
-#define off80 r29
|
|
|
-#define off96 r30
|
|
|
-#define off112 r31
|
|
|
-
|
|
|
-#define const1 v24
|
|
|
-#define const2 v25
|
|
|
-
|
|
|
-#define byteswap v26
|
|
|
-#define mask_32bit v27
|
|
|
-#define mask_64bit v28
|
|
|
-#define zeroes v29
|
|
|
-
|
|
|
-#ifdef BYTESWAP_DATA
|
|
|
-#define VPERM(A, B, C, D) vperm A, B, C, D
|
|
|
-#else
|
|
|
-#define VPERM(A, B, C, D)
|
|
|
-#endif
|
|
|
-
|
|
|
-/* unsigned int __crc32c_vpmsum(unsigned int crc, void *p, unsigned long len) */
|
|
|
-FUNC_START(__crc32c_vpmsum)
|
|
|
- std r31,-8(r1)
|
|
|
- std r30,-16(r1)
|
|
|
- std r29,-24(r1)
|
|
|
- std r28,-32(r1)
|
|
|
- std r27,-40(r1)
|
|
|
- std r26,-48(r1)
|
|
|
- std r25,-56(r1)
|
|
|
-
|
|
|
- li off16,16
|
|
|
- li off32,32
|
|
|
- li off48,48
|
|
|
- li off64,64
|
|
|
- li off80,80
|
|
|
- li off96,96
|
|
|
- li off112,112
|
|
|
- li r0,0
|
|
|
-
|
|
|
- /* Enough room for saving 10 non volatile VMX registers */
|
|
|
- subi r6,r1,56+10*16
|
|
|
- subi r7,r1,56+2*16
|
|
|
-
|
|
|
- stvx v20,0,r6
|
|
|
- stvx v21,off16,r6
|
|
|
- stvx v22,off32,r6
|
|
|
- stvx v23,off48,r6
|
|
|
- stvx v24,off64,r6
|
|
|
- stvx v25,off80,r6
|
|
|
- stvx v26,off96,r6
|
|
|
- stvx v27,off112,r6
|
|
|
- stvx v28,0,r7
|
|
|
- stvx v29,off16,r7
|
|
|
-
|
|
|
- mr r10,r3
|
|
|
-
|
|
|
- vxor zeroes,zeroes,zeroes
|
|
|
- vspltisw v0,-1
|
|
|
-
|
|
|
- vsldoi mask_32bit,zeroes,v0,4
|
|
|
- vsldoi mask_64bit,zeroes,v0,8
|
|
|
-
|
|
|
- /* Get the initial value into v8 */
|
|
|
- vxor v8,v8,v8
|
|
|
- MTVRD(v8, R3)
|
|
|
- vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */
|
|
|
-
|
|
|
-#ifdef BYTESWAP_DATA
|
|
|
- addis r3,r2,.byteswap_constant@toc@ha
|
|
|
- addi r3,r3,.byteswap_constant@toc@l
|
|
|
-
|
|
|
- lvx byteswap,0,r3
|
|
|
- addi r3,r3,16
|
|
|
-#endif
|
|
|
-
|
|
|
- cmpdi r5,256
|
|
|
- blt .Lshort
|
|
|
-
|
|
|
- rldicr r6,r5,0,56
|
|
|
-
|
|
|
- /* Checksum in blocks of MAX_SIZE */
|
|
|
-1: lis r7,MAX_SIZE@h
|
|
|
- ori r7,r7,MAX_SIZE@l
|
|
|
- mr r9,r7
|
|
|
- cmpd r6,r7
|
|
|
- bgt 2f
|
|
|
- mr r7,r6
|
|
|
-2: subf r6,r7,r6
|
|
|
-
|
|
|
- /* our main loop does 128 bytes at a time */
|
|
|
- srdi r7,r7,7
|
|
|
-
|
|
|
- /*
|
|
|
- * Work out the offset into the constants table to start at. Each
|
|
|
- * constant is 16 bytes, and it is used against 128 bytes of input
|
|
|
- * data - 128 / 16 = 8
|
|
|
- */
|
|
|
- sldi r8,r7,4
|
|
|
- srdi r9,r9,3
|
|
|
- subf r8,r8,r9
|
|
|
-
|
|
|
- /* We reduce our final 128 bytes in a separate step */
|
|
|
- addi r7,r7,-1
|
|
|
- mtctr r7
|
|
|
-
|
|
|
- addis r3,r2,.constants@toc@ha
|
|
|
- addi r3,r3,.constants@toc@l
|
|
|
-
|
|
|
- /* Find the start of our constants */
|
|
|
- add r3,r3,r8
|
|
|
-
|
|
|
- /* zero v0-v7 which will contain our checksums */
|
|
|
- vxor v0,v0,v0
|
|
|
- vxor v1,v1,v1
|
|
|
- vxor v2,v2,v2
|
|
|
- vxor v3,v3,v3
|
|
|
- vxor v4,v4,v4
|
|
|
- vxor v5,v5,v5
|
|
|
- vxor v6,v6,v6
|
|
|
- vxor v7,v7,v7
|
|
|
-
|
|
|
- lvx const1,0,r3
|
|
|
-
|
|
|
- /*
|
|
|
- * If we are looping back to consume more data we use the values
|
|
|
- * already in v16-v23.
|
|
|
- */
|
|
|
- cmpdi r0,1
|
|
|
- beq 2f
|
|
|
-
|
|
|
- /* First warm up pass */
|
|
|
- lvx v16,0,r4
|
|
|
- lvx v17,off16,r4
|
|
|
- VPERM(v16,v16,v16,byteswap)
|
|
|
- VPERM(v17,v17,v17,byteswap)
|
|
|
- lvx v18,off32,r4
|
|
|
- lvx v19,off48,r4
|
|
|
- VPERM(v18,v18,v18,byteswap)
|
|
|
- VPERM(v19,v19,v19,byteswap)
|
|
|
- lvx v20,off64,r4
|
|
|
- lvx v21,off80,r4
|
|
|
- VPERM(v20,v20,v20,byteswap)
|
|
|
- VPERM(v21,v21,v21,byteswap)
|
|
|
- lvx v22,off96,r4
|
|
|
- lvx v23,off112,r4
|
|
|
- VPERM(v22,v22,v22,byteswap)
|
|
|
- VPERM(v23,v23,v23,byteswap)
|
|
|
- addi r4,r4,8*16
|
|
|
-
|
|
|
- /* xor in initial value */
|
|
|
- vxor v16,v16,v8
|
|
|
-
|
|
|
-2: bdz .Lfirst_warm_up_done
|
|
|
-
|
|
|
- addi r3,r3,16
|
|
|
- lvx const2,0,r3
|
|
|
-
|
|
|
- /* Second warm up pass */
|
|
|
- VPMSUMD(v8,v16,const1)
|
|
|
- lvx v16,0,r4
|
|
|
- VPERM(v16,v16,v16,byteswap)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- VPMSUMD(v9,v17,const1)
|
|
|
- lvx v17,off16,r4
|
|
|
- VPERM(v17,v17,v17,byteswap)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- VPMSUMD(v10,v18,const1)
|
|
|
- lvx v18,off32,r4
|
|
|
- VPERM(v18,v18,v18,byteswap)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- VPMSUMD(v11,v19,const1)
|
|
|
- lvx v19,off48,r4
|
|
|
- VPERM(v19,v19,v19,byteswap)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- VPMSUMD(v12,v20,const1)
|
|
|
- lvx v20,off64,r4
|
|
|
- VPERM(v20,v20,v20,byteswap)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- VPMSUMD(v13,v21,const1)
|
|
|
- lvx v21,off80,r4
|
|
|
- VPERM(v21,v21,v21,byteswap)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- VPMSUMD(v14,v22,const1)
|
|
|
- lvx v22,off96,r4
|
|
|
- VPERM(v22,v22,v22,byteswap)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- VPMSUMD(v15,v23,const1)
|
|
|
- lvx v23,off112,r4
|
|
|
- VPERM(v23,v23,v23,byteswap)
|
|
|
-
|
|
|
- addi r4,r4,8*16
|
|
|
-
|
|
|
- bdz .Lfirst_cool_down
|
|
|
-
|
|
|
- /*
|
|
|
- * main loop. We modulo schedule it such that it takes three iterations
|
|
|
- * to complete - first iteration load, second iteration vpmsum, third
|
|
|
- * iteration xor.
|
|
|
- */
|
|
|
- .balign 16
|
|
|
-4: lvx const1,0,r3
|
|
|
- addi r3,r3,16
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- vxor v0,v0,v8
|
|
|
- VPMSUMD(v8,v16,const2)
|
|
|
- lvx v16,0,r4
|
|
|
- VPERM(v16,v16,v16,byteswap)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- vxor v1,v1,v9
|
|
|
- VPMSUMD(v9,v17,const2)
|
|
|
- lvx v17,off16,r4
|
|
|
- VPERM(v17,v17,v17,byteswap)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- vxor v2,v2,v10
|
|
|
- VPMSUMD(v10,v18,const2)
|
|
|
- lvx v18,off32,r4
|
|
|
- VPERM(v18,v18,v18,byteswap)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- vxor v3,v3,v11
|
|
|
- VPMSUMD(v11,v19,const2)
|
|
|
- lvx v19,off48,r4
|
|
|
- VPERM(v19,v19,v19,byteswap)
|
|
|
- lvx const2,0,r3
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- vxor v4,v4,v12
|
|
|
- VPMSUMD(v12,v20,const1)
|
|
|
- lvx v20,off64,r4
|
|
|
- VPERM(v20,v20,v20,byteswap)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- vxor v5,v5,v13
|
|
|
- VPMSUMD(v13,v21,const1)
|
|
|
- lvx v21,off80,r4
|
|
|
- VPERM(v21,v21,v21,byteswap)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- vxor v6,v6,v14
|
|
|
- VPMSUMD(v14,v22,const1)
|
|
|
- lvx v22,off96,r4
|
|
|
- VPERM(v22,v22,v22,byteswap)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- vxor v7,v7,v15
|
|
|
- VPMSUMD(v15,v23,const1)
|
|
|
- lvx v23,off112,r4
|
|
|
- VPERM(v23,v23,v23,byteswap)
|
|
|
-
|
|
|
- addi r4,r4,8*16
|
|
|
-
|
|
|
- bdnz 4b
|
|
|
-
|
|
|
-.Lfirst_cool_down:
|
|
|
- /* First cool down pass */
|
|
|
- lvx const1,0,r3
|
|
|
- addi r3,r3,16
|
|
|
-
|
|
|
- vxor v0,v0,v8
|
|
|
- VPMSUMD(v8,v16,const1)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- vxor v1,v1,v9
|
|
|
- VPMSUMD(v9,v17,const1)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- vxor v2,v2,v10
|
|
|
- VPMSUMD(v10,v18,const1)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- vxor v3,v3,v11
|
|
|
- VPMSUMD(v11,v19,const1)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- vxor v4,v4,v12
|
|
|
- VPMSUMD(v12,v20,const1)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- vxor v5,v5,v13
|
|
|
- VPMSUMD(v13,v21,const1)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- vxor v6,v6,v14
|
|
|
- VPMSUMD(v14,v22,const1)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
- vxor v7,v7,v15
|
|
|
- VPMSUMD(v15,v23,const1)
|
|
|
- ori r2,r2,0
|
|
|
-
|
|
|
-.Lsecond_cool_down:
|
|
|
- /* Second cool down pass */
|
|
|
- vxor v0,v0,v8
|
|
|
- vxor v1,v1,v9
|
|
|
- vxor v2,v2,v10
|
|
|
- vxor v3,v3,v11
|
|
|
- vxor v4,v4,v12
|
|
|
- vxor v5,v5,v13
|
|
|
- vxor v6,v6,v14
|
|
|
- vxor v7,v7,v15
|
|
|
-
|
|
|
- /*
|
|
|
- * vpmsumd produces a 96 bit result in the least significant bits
|
|
|
- * of the register. Since we are bit reflected we have to shift it
|
|
|
- * left 32 bits so it occupies the least significant bits in the
|
|
|
- * bit reflected domain.
|
|
|
- */
|
|
|
- vsldoi v0,v0,zeroes,4
|
|
|
- vsldoi v1,v1,zeroes,4
|
|
|
- vsldoi v2,v2,zeroes,4
|
|
|
- vsldoi v3,v3,zeroes,4
|
|
|
- vsldoi v4,v4,zeroes,4
|
|
|
- vsldoi v5,v5,zeroes,4
|
|
|
- vsldoi v6,v6,zeroes,4
|
|
|
- vsldoi v7,v7,zeroes,4
|
|
|
-
|
|
|
- /* xor with last 1024 bits */
|
|
|
- lvx v8,0,r4
|
|
|
- lvx v9,off16,r4
|
|
|
- VPERM(v8,v8,v8,byteswap)
|
|
|
- VPERM(v9,v9,v9,byteswap)
|
|
|
- lvx v10,off32,r4
|
|
|
- lvx v11,off48,r4
|
|
|
- VPERM(v10,v10,v10,byteswap)
|
|
|
- VPERM(v11,v11,v11,byteswap)
|
|
|
- lvx v12,off64,r4
|
|
|
- lvx v13,off80,r4
|
|
|
- VPERM(v12,v12,v12,byteswap)
|
|
|
- VPERM(v13,v13,v13,byteswap)
|
|
|
- lvx v14,off96,r4
|
|
|
- lvx v15,off112,r4
|
|
|
- VPERM(v14,v14,v14,byteswap)
|
|
|
- VPERM(v15,v15,v15,byteswap)
|
|
|
-
|
|
|
- addi r4,r4,8*16
|
|
|
-
|
|
|
- vxor v16,v0,v8
|
|
|
- vxor v17,v1,v9
|
|
|
- vxor v18,v2,v10
|
|
|
- vxor v19,v3,v11
|
|
|
- vxor v20,v4,v12
|
|
|
- vxor v21,v5,v13
|
|
|
- vxor v22,v6,v14
|
|
|
- vxor v23,v7,v15
|
|
|
-
|
|
|
- li r0,1
|
|
|
- cmpdi r6,0
|
|
|
- addi r6,r6,128
|
|
|
- bne 1b
|
|
|
-
|
|
|
- /* Work out how many bytes we have left */
|
|
|
- andi. r5,r5,127
|
|
|
-
|
|
|
- /* Calculate where in the constant table we need to start */
|
|
|
- subfic r6,r5,128
|
|
|
- add r3,r3,r6
|
|
|
-
|
|
|
- /* How many 16 byte chunks are in the tail */
|
|
|
- srdi r7,r5,4
|
|
|
- mtctr r7
|
|
|
-
|
|
|
- /*
|
|
|
- * Reduce the previously calculated 1024 bits to 64 bits, shifting
|
|
|
- * 32 bits to include the trailing 32 bits of zeros
|
|
|
- */
|
|
|
- lvx v0,0,r3
|
|
|
- lvx v1,off16,r3
|
|
|
- lvx v2,off32,r3
|
|
|
- lvx v3,off48,r3
|
|
|
- lvx v4,off64,r3
|
|
|
- lvx v5,off80,r3
|
|
|
- lvx v6,off96,r3
|
|
|
- lvx v7,off112,r3
|
|
|
- addi r3,r3,8*16
|
|
|
-
|
|
|
- VPMSUMW(v0,v16,v0)
|
|
|
- VPMSUMW(v1,v17,v1)
|
|
|
- VPMSUMW(v2,v18,v2)
|
|
|
- VPMSUMW(v3,v19,v3)
|
|
|
- VPMSUMW(v4,v20,v4)
|
|
|
- VPMSUMW(v5,v21,v5)
|
|
|
- VPMSUMW(v6,v22,v6)
|
|
|
- VPMSUMW(v7,v23,v7)
|
|
|
-
|
|
|
- /* Now reduce the tail (0 - 112 bytes) */
|
|
|
- cmpdi r7,0
|
|
|
- beq 1f
|
|
|
-
|
|
|
- lvx v16,0,r4
|
|
|
- lvx v17,0,r3
|
|
|
- VPERM(v16,v16,v16,byteswap)
|
|
|
- VPMSUMW(v16,v16,v17)
|
|
|
- vxor v0,v0,v16
|
|
|
- bdz 1f
|
|
|
-
|
|
|
- lvx v16,off16,r4
|
|
|
- lvx v17,off16,r3
|
|
|
- VPERM(v16,v16,v16,byteswap)
|
|
|
- VPMSUMW(v16,v16,v17)
|
|
|
- vxor v0,v0,v16
|
|
|
- bdz 1f
|
|
|
-
|
|
|
- lvx v16,off32,r4
|
|
|
- lvx v17,off32,r3
|
|
|
- VPERM(v16,v16,v16,byteswap)
|
|
|
- VPMSUMW(v16,v16,v17)
|
|
|
- vxor v0,v0,v16
|
|
|
- bdz 1f
|
|
|
-
|
|
|
- lvx v16,off48,r4
|
|
|
- lvx v17,off48,r3
|
|
|
- VPERM(v16,v16,v16,byteswap)
|
|
|
- VPMSUMW(v16,v16,v17)
|
|
|
- vxor v0,v0,v16
|
|
|
- bdz 1f
|
|
|
-
|
|
|
- lvx v16,off64,r4
|
|
|
- lvx v17,off64,r3
|
|
|
- VPERM(v16,v16,v16,byteswap)
|
|
|
- VPMSUMW(v16,v16,v17)
|
|
|
- vxor v0,v0,v16
|
|
|
- bdz 1f
|
|
|
-
|
|
|
- lvx v16,off80,r4
|
|
|
- lvx v17,off80,r3
|
|
|
- VPERM(v16,v16,v16,byteswap)
|
|
|
- VPMSUMW(v16,v16,v17)
|
|
|
- vxor v0,v0,v16
|
|
|
- bdz 1f
|
|
|
-
|
|
|
- lvx v16,off96,r4
|
|
|
- lvx v17,off96,r3
|
|
|
- VPERM(v16,v16,v16,byteswap)
|
|
|
- VPMSUMW(v16,v16,v17)
|
|
|
- vxor v0,v0,v16
|
|
|
-
|
|
|
- /* Now xor all the parallel chunks together */
|
|
|
-1: vxor v0,v0,v1
|
|
|
- vxor v2,v2,v3
|
|
|
- vxor v4,v4,v5
|
|
|
- vxor v6,v6,v7
|
|
|
-
|
|
|
- vxor v0,v0,v2
|
|
|
- vxor v4,v4,v6
|
|
|
-
|
|
|
- vxor v0,v0,v4
|
|
|
-
|
|
|
-.Lbarrett_reduction:
|
|
|
- /* Barrett constants */
|
|
|
- addis r3,r2,.barrett_constants@toc@ha
|
|
|
- addi r3,r3,.barrett_constants@toc@l
|
|
|
-
|
|
|
- lvx const1,0,r3
|
|
|
- lvx const2,off16,r3
|
|
|
-
|
|
|
- vsldoi v1,v0,v0,8
|
|
|
- vxor v0,v0,v1 /* xor two 64 bit results together */
|
|
|
-
|
|
|
- /* shift left one bit */
|
|
|
- vspltisb v1,1
|
|
|
- vsl v0,v0,v1
|
|
|
-
|
|
|
- vand v0,v0,mask_64bit
|
|
|
-
|
|
|
- /*
|
|
|
- * The reflected version of Barrett reduction. Instead of bit
|
|
|
- * reflecting our data (which is expensive to do), we bit reflect our
|
|
|
- * constants and our algorithm, which means the intermediate data in
|
|
|
- * our vector registers goes from 0-63 instead of 63-0. We can reflect
|
|
|
- * the algorithm because we don't carry in mod 2 arithmetic.
|
|
|
- */
|
|
|
- vand v1,v0,mask_32bit /* bottom 32 bits of a */
|
|
|
- VPMSUMD(v1,v1,const1) /* ma */
|
|
|
- vand v1,v1,mask_32bit /* bottom 32bits of ma */
|
|
|
- VPMSUMD(v1,v1,const2) /* qn */
|
|
|
- vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */
|
|
|
-
|
|
|
- /*
|
|
|
- * Since we are bit reflected, the result (ie the low 32 bits) is in
|
|
|
- * the high 32 bits. We just need to shift it left 4 bytes
|
|
|
- * V0 [ 0 1 X 3 ]
|
|
|
- * V0 [ 0 X 2 3 ]
|
|
|
- */
|
|
|
- vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */
|
|
|
-
|
|
|
- /* Get it into r3 */
|
|
|
- MFVRD(R3, v0)
|
|
|
-
|
|
|
-.Lout:
|
|
|
- subi r6,r1,56+10*16
|
|
|
- subi r7,r1,56+2*16
|
|
|
-
|
|
|
- lvx v20,0,r6
|
|
|
- lvx v21,off16,r6
|
|
|
- lvx v22,off32,r6
|
|
|
- lvx v23,off48,r6
|
|
|
- lvx v24,off64,r6
|
|
|
- lvx v25,off80,r6
|
|
|
- lvx v26,off96,r6
|
|
|
- lvx v27,off112,r6
|
|
|
- lvx v28,0,r7
|
|
|
- lvx v29,off16,r7
|
|
|
-
|
|
|
- ld r31,-8(r1)
|
|
|
- ld r30,-16(r1)
|
|
|
- ld r29,-24(r1)
|
|
|
- ld r28,-32(r1)
|
|
|
- ld r27,-40(r1)
|
|
|
- ld r26,-48(r1)
|
|
|
- ld r25,-56(r1)
|
|
|
-
|
|
|
- blr
|
|
|
-
|
|
|
-.Lfirst_warm_up_done:
|
|
|
- lvx const1,0,r3
|
|
|
- addi r3,r3,16
|
|
|
-
|
|
|
- VPMSUMD(v8,v16,const1)
|
|
|
- VPMSUMD(v9,v17,const1)
|
|
|
- VPMSUMD(v10,v18,const1)
|
|
|
- VPMSUMD(v11,v19,const1)
|
|
|
- VPMSUMD(v12,v20,const1)
|
|
|
- VPMSUMD(v13,v21,const1)
|
|
|
- VPMSUMD(v14,v22,const1)
|
|
|
- VPMSUMD(v15,v23,const1)
|
|
|
-
|
|
|
- b .Lsecond_cool_down
|
|
|
-
|
|
|
-.Lshort:
|
|
|
- cmpdi r5,0
|
|
|
- beq .Lzero
|
|
|
-
|
|
|
- addis r3,r2,.short_constants@toc@ha
|
|
|
- addi r3,r3,.short_constants@toc@l
|
|
|
-
|
|
|
- /* Calculate where in the constant table we need to start */
|
|
|
- subfic r6,r5,256
|
|
|
- add r3,r3,r6
|
|
|
-
|
|
|
- /* How many 16 byte chunks? */
|
|
|
- srdi r7,r5,4
|
|
|
- mtctr r7
|
|
|
-
|
|
|
- vxor v19,v19,v19
|
|
|
- vxor v20,v20,v20
|
|
|
-
|
|
|
- lvx v0,0,r4
|
|
|
- lvx v16,0,r3
|
|
|
- VPERM(v0,v0,v16,byteswap)
|
|
|
- vxor v0,v0,v8 /* xor in initial value */
|
|
|
- VPMSUMW(v0,v0,v16)
|
|
|
- bdz .Lv0
|
|
|
-
|
|
|
- lvx v1,off16,r4
|
|
|
- lvx v17,off16,r3
|
|
|
- VPERM(v1,v1,v17,byteswap)
|
|
|
- VPMSUMW(v1,v1,v17)
|
|
|
- bdz .Lv1
|
|
|
-
|
|
|
- lvx v2,off32,r4
|
|
|
- lvx v16,off32,r3
|
|
|
- VPERM(v2,v2,v16,byteswap)
|
|
|
- VPMSUMW(v2,v2,v16)
|
|
|
- bdz .Lv2
|
|
|
-
|
|
|
- lvx v3,off48,r4
|
|
|
- lvx v17,off48,r3
|
|
|
- VPERM(v3,v3,v17,byteswap)
|
|
|
- VPMSUMW(v3,v3,v17)
|
|
|
- bdz .Lv3
|
|
|
-
|
|
|
- lvx v4,off64,r4
|
|
|
- lvx v16,off64,r3
|
|
|
- VPERM(v4,v4,v16,byteswap)
|
|
|
- VPMSUMW(v4,v4,v16)
|
|
|
- bdz .Lv4
|
|
|
-
|
|
|
- lvx v5,off80,r4
|
|
|
- lvx v17,off80,r3
|
|
|
- VPERM(v5,v5,v17,byteswap)
|
|
|
- VPMSUMW(v5,v5,v17)
|
|
|
- bdz .Lv5
|
|
|
-
|
|
|
- lvx v6,off96,r4
|
|
|
- lvx v16,off96,r3
|
|
|
- VPERM(v6,v6,v16,byteswap)
|
|
|
- VPMSUMW(v6,v6,v16)
|
|
|
- bdz .Lv6
|
|
|
-
|
|
|
- lvx v7,off112,r4
|
|
|
- lvx v17,off112,r3
|
|
|
- VPERM(v7,v7,v17,byteswap)
|
|
|
- VPMSUMW(v7,v7,v17)
|
|
|
- bdz .Lv7
|
|
|
-
|
|
|
- addi r3,r3,128
|
|
|
- addi r4,r4,128
|
|
|
-
|
|
|
- lvx v8,0,r4
|
|
|
- lvx v16,0,r3
|
|
|
- VPERM(v8,v8,v16,byteswap)
|
|
|
- VPMSUMW(v8,v8,v16)
|
|
|
- bdz .Lv8
|
|
|
-
|
|
|
- lvx v9,off16,r4
|
|
|
- lvx v17,off16,r3
|
|
|
- VPERM(v9,v9,v17,byteswap)
|
|
|
- VPMSUMW(v9,v9,v17)
|
|
|
- bdz .Lv9
|
|
|
-
|
|
|
- lvx v10,off32,r4
|
|
|
- lvx v16,off32,r3
|
|
|
- VPERM(v10,v10,v16,byteswap)
|
|
|
- VPMSUMW(v10,v10,v16)
|
|
|
- bdz .Lv10
|
|
|
-
|
|
|
- lvx v11,off48,r4
|
|
|
- lvx v17,off48,r3
|
|
|
- VPERM(v11,v11,v17,byteswap)
|
|
|
- VPMSUMW(v11,v11,v17)
|
|
|
- bdz .Lv11
|
|
|
-
|
|
|
- lvx v12,off64,r4
|
|
|
- lvx v16,off64,r3
|
|
|
- VPERM(v12,v12,v16,byteswap)
|
|
|
- VPMSUMW(v12,v12,v16)
|
|
|
- bdz .Lv12
|
|
|
-
|
|
|
- lvx v13,off80,r4
|
|
|
- lvx v17,off80,r3
|
|
|
- VPERM(v13,v13,v17,byteswap)
|
|
|
- VPMSUMW(v13,v13,v17)
|
|
|
- bdz .Lv13
|
|
|
-
|
|
|
- lvx v14,off96,r4
|
|
|
- lvx v16,off96,r3
|
|
|
- VPERM(v14,v14,v16,byteswap)
|
|
|
- VPMSUMW(v14,v14,v16)
|
|
|
- bdz .Lv14
|
|
|
-
|
|
|
- lvx v15,off112,r4
|
|
|
- lvx v17,off112,r3
|
|
|
- VPERM(v15,v15,v17,byteswap)
|
|
|
- VPMSUMW(v15,v15,v17)
|
|
|
-
|
|
|
-.Lv15: vxor v19,v19,v15
|
|
|
-.Lv14: vxor v20,v20,v14
|
|
|
-.Lv13: vxor v19,v19,v13
|
|
|
-.Lv12: vxor v20,v20,v12
|
|
|
-.Lv11: vxor v19,v19,v11
|
|
|
-.Lv10: vxor v20,v20,v10
|
|
|
-.Lv9: vxor v19,v19,v9
|
|
|
-.Lv8: vxor v20,v20,v8
|
|
|
-.Lv7: vxor v19,v19,v7
|
|
|
-.Lv6: vxor v20,v20,v6
|
|
|
-.Lv5: vxor v19,v19,v5
|
|
|
-.Lv4: vxor v20,v20,v4
|
|
|
-.Lv3: vxor v19,v19,v3
|
|
|
-.Lv2: vxor v20,v20,v2
|
|
|
-.Lv1: vxor v19,v19,v1
|
|
|
-.Lv0: vxor v20,v20,v0
|
|
|
-
|
|
|
- vxor v0,v19,v20
|
|
|
-
|
|
|
- b .Lbarrett_reduction
|
|
|
-
|
|
|
-.Lzero:
|
|
|
- mr r3,r10
|
|
|
- b .Lout
|
|
|
-
|
|
|
-FUNC_END(__crc32_vpmsum)
|
|
|
+#define CRC_FUNCTION_NAME __crc32c_vpmsum
|
|
|
+#include "crc32-vpmsum_core.S"
|