hai 8 anos · 6ec4e2514d
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -121,6 +121,7 @@ extern const struct raid6_recov_calls raid6_recov_ssse3;
 
				 extern const struct raid6_recov_calls raid6_recov_avx2;
			
 
				 extern const struct raid6_recov_calls raid6_recov_avx512;
			
 
				 extern const struct raid6_recov_calls raid6_recov_s390xc;
			
 
				+extern const struct raid6_recov_calls raid6_recov_neon;
			
 
				 
			
 
				 extern const struct raid6_calls raid6_neonx1;
			
 
				 extern const struct raid6_calls raid6_neonx2;
			
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -5,7 +5,7 @@ raid6_pq-y	+= algos.o recov.o tables.o int1.o int2.o int4.o \
 
				 
			
 
				 raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o avx512.o recov_avx512.o
			
 
				 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
			
 
				-raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
			
 
				+raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
			
 
				 raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
			
 
				 raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o
			
 
				 
			
@@ -26,7 +26,9 @@ NEON_FLAGS := -ffreestanding
 
				 ifeq ($(ARCH),arm)
			
 
				 NEON_FLAGS += -mfloat-abi=softfp -mfpu=neon
			
 
				 endif
			
 
				+CFLAGS_recov_neon_inner.o += $(NEON_FLAGS)
			
 
				 ifeq ($(ARCH),arm64)
			
 
				+CFLAGS_REMOVE_recov_neon_inner.o += -mgeneral-regs-only
			
 
				 CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only
			
 
				 CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only
			
 
				 CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only
			
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -112,6 +112,9 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = {
 
				 #endif
			
 
				 #ifdef CONFIG_S390
			
 
				 	&raid6_recov_s390xc,
			
 
				+#endif
			
 
				+#if defined(CONFIG_KERNEL_MODE_NEON)
			
 
				+	&raid6_recov_neon,
			
 
				 #endif
			
 
				 	&raid6_recov_intx1,
			
 
				 	NULL
			
--- a/lib/raid6/recov_neon.c
+++ b/lib/raid6/recov_neon.c
@@ -0,0 +1,110 @@
 
				+/*
			
 
				+ * Copyright (C) 2012 Intel Corporation
			
 
				+ * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of the GNU General Public License
			
 
				+ * as published by the Free Software Foundation; version 2
			
 
				+ * of the License.
			
 
				+ */
			
 
				+
			
 
				+#include <linux/raid/pq.h>
			
 
				+
			
 
				+#ifdef __KERNEL__
			
 
				+#include <asm/neon.h>
			
 
				+#else
			
 
				+#define kernel_neon_begin()
			
 
				+#define kernel_neon_end()
			
 
				+#define cpu_has_neon()		(1)
			
 
				+#endif
			
 
				+
			
 
				+/*
				+ * Availability probe used as the .valid hook of raid6_recov_neon.
				+ * Returns whatever cpu_has_neon() reports, converted to int.
				+ */
				+static int raid6_has_neon(void)
				+{
				+	int neon_ok = cpu_has_neon();
				+
				+	return neon_ok;
				+}
			
 
				+
			
 
				+void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
			
 
				+			      uint8_t *dq, const uint8_t *pbmul,
			
 
				+			      const uint8_t *qmul);
			
 
				+
			
 
				+void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
			
 
				+			      const uint8_t *qmul);
			
 
				+
			
 
				+/*
				+ * Recover two failed data pages using the NEON inner routine.
				+ *
				+ * @disks: number of entries in @ptrs; ptrs[disks - 2] is P and
				+ *         ptrs[disks - 1] is Q
				+ * @bytes: length of each page, forwarded to gen_syndrome and to the
				+ *         inner routine (which takes it as int)
				+ * @faila: index in @ptrs of the first failed data page
				+ * @failb: index in @ptrs of the second failed data page
				+ *         (failb - faila indexes raid6_gfexi, so presumably
				+ *         faila < failb -- confirm against callers)
				+ * @ptrs:  page pointer table; modified temporarily and restored
				+ *         before the NEON section starts
				+ *
				+ * Only the inner routine runs between kernel_neon_begin() and
				+ * kernel_neon_end(); gen_syndrome is invoked outside that section.
				+ */
				+static void raid6_2data_recov_neon(int disks, size_t bytes, int faila,
				+		int failb, void **ptrs)
				+{
				+	const int p_slot = disks - 2;
				+	const int q_slot = disks - 1;
				+	u8 *p = (u8 *)ptrs[p_slot];
				+	u8 *q = (u8 *)ptrs[q_slot];
				+	u8 *dp, *dq;
				+	const u8 *pbmul;	/* P multiplier table for B data */
				+	const u8 *qmul;		/* Q multiplier table (for both) */
				+
				+	/*
				+	 * Feed zero pages in place of the two missing data pages and let
				+	 * the syndrome be written into the dead pages themselves, so that
				+	 * they end up holding delta p and delta q.
				+	 */
				+	dp = (u8 *)ptrs[faila];
				+	ptrs[faila] = (void *)raid6_empty_zero_page;
				+	ptrs[p_slot] = dp;
				+
				+	dq = (u8 *)ptrs[failb];
				+	ptrs[failb] = (void *)raid6_empty_zero_page;
				+	ptrs[q_slot] = dq;
				+
				+	raid6_call.gen_syndrome(disks, bytes, ptrs);
				+
				+	/* Put the caller's pointer table back the way it was */
				+	ptrs[faila]  = dp;
				+	ptrs[failb]  = dq;
				+	ptrs[p_slot] = p;
				+	ptrs[q_slot] = q;
				+
				+	/* Select the multiplication tables for this pair of failures */
				+	pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]];
				+	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
				+					 raid6_gfexp[failb]]];
				+
				+	kernel_neon_begin();
				+	__raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul);
				+	kernel_neon_end();
				+}
			
 
				+
			
 
				+/*
				+ * Recover one failed data page together with P using the NEON inner
				+ * routine.
				+ *
				+ * @disks: number of entries in @ptrs; ptrs[disks - 2] is P and
				+ *         ptrs[disks - 1] is Q
				+ * @bytes: length of each page, forwarded to gen_syndrome and to the
				+ *         inner routine (which takes it as int)
				+ * @faila: index in @ptrs of the failed data page
				+ * @ptrs:  page pointer table; modified temporarily and restored
				+ *         before the NEON section starts
				+ *
				+ * Only the inner routine runs between kernel_neon_begin() and
				+ * kernel_neon_end(); gen_syndrome is invoked outside that section.
				+ */
				+static void raid6_datap_recov_neon(int disks, size_t bytes, int faila,
				+		void **ptrs)
				+{
				+	const int q_slot = disks - 1;
				+	u8 *p = (u8 *)ptrs[disks - 2];
				+	u8 *q = (u8 *)ptrs[q_slot];
				+	u8 *dq;
				+	const u8 *qmul;		/* Q multiplier table */
				+
				+	/*
				+	 * Feed a zero page in place of the missing data page and let the
				+	 * Q syndrome be written into the dead page, which then holds
				+	 * delta q.
				+	 */
				+	dq = (u8 *)ptrs[faila];
				+	ptrs[faila] = (void *)raid6_empty_zero_page;
				+	ptrs[q_slot] = dq;
				+
				+	raid6_call.gen_syndrome(disks, bytes, ptrs);
				+
				+	/* Put the caller's pointer table back the way it was */
				+	ptrs[faila]  = dq;
				+	ptrs[q_slot] = q;
				+
				+	/* Select the multiplication table for this failure */
				+	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
				+
				+	kernel_neon_begin();
				+	__raid6_datap_recov_neon(bytes, p, q, dq, qmul);
				+	kernel_neon_end();
				+}
			
 
				+
			
 
				+/*
				+ * Descriptor for the NEON recovery implementation. It is listed in
				+ * raid6_recov_algos[] when CONFIG_KERNEL_MODE_NEON is defined.
				+ */
				+const struct raid6_recov_calls raid6_recov_neon = {
				+	.name		= "neon",
				+	.priority	= 10,
				+	.valid		= raid6_has_neon,
				+	.data2		= raid6_2data_recov_neon,
				+	.datap		= raid6_datap_recov_neon,
				+};
			
--- a/lib/raid6/recov_neon_inner.c
+++ b/lib/raid6/recov_neon_inner.c
@@ -0,0 +1,117 @@
 
				+/*
			
 
				+ * Copyright (C) 2012 Intel Corporation
			
 
				+ * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of the GNU General Public License
			
 
				+ * as published by the Free Software Foundation; version 2
			
 
				+ * of the License.
			
 
				+ */
			
 
				+
			
 
				+#include <arm_neon.h>
			
 
				+
			
 
				+/* All sixteen lanes are 0x0f: AND-ing with it keeps each byte's low nibble. */
				+static const uint8x16_t x0f = {
				+	0x0f, 0x0f, 0x0f, 0x0f,
				+	0x0f, 0x0f, 0x0f, 0x0f,
				+	0x0f, 0x0f, 0x0f, 0x0f,
				+	0x0f, 0x0f, 0x0f, 0x0f,
				+};
			
 
				+
			
 
				+#ifdef CONFIG_ARM
				+/*
				+ * Fallback for AArch32, which lacks this intrinsic because the
				+ * underlying instruction is not implemented there. Only the 64-bit wide
				+ * vtbl.8 is available, so the 128-bit lookup is assembled from two
				+ * 64-bit lookups into the same 16-byte table.
				+ *
				+ * @a: 16-byte lookup table
				+ * @b: sixteen byte indices into @a
				+ */
				+static uint8x16_t vqtbl1q_u8(uint8x16_t a, uint8x16_t b)
				+{
				+	/* View the 128-bit table as the register pair vtbl2_u8() expects */
				+	union {
				+		uint8x16_t	val;
				+		uint8x8x2_t	pair;
				+	} table = { .val = a };
				+	uint8x8_t lo = vtbl2_u8(table.pair, vget_low_u8(b));
				+	uint8x8_t hi = vtbl2_u8(table.pair, vget_high_u8(b));
				+
				+	return vcombine_u8(lo, hi);
				+}
				+#endif
			
 
				+
			
 
				+/*
				+ * NEON inner loop for two-data-page recovery, 16 bytes per iteration.
				+ *
				+ * Conceptually equivalent to the scalar loop:
				+ *
				+ *	while (bytes--) {
				+ *		uint8_t px, qx, db;
				+ *
				+ *		px    = *p ^ *dp;
				+ *		qx    = qmul[*q ^ *dq];
				+ *		*dq++ = db = pbmul[px] ^ qx;
				+ *		*dp++ = db ^ px;
				+ *		p++; q++;
				+ *	}
				+ *
				+ * @bytes: number of bytes to process; the loop steps by 16 and only
				+ *         stops when the count reaches exactly zero, so this must be
				+ *         a non-negative multiple of 16
				+ * @p:     P page (read only here)
				+ * @q:     Q page (read only here)
				+ * @dp:    delta p on entry, overwritten with one recovered page
				+ * @dq:    delta q on entry, overwritten with the other recovered page
				+ * @pbmul: 32-byte table: bytes 0-15 are indexed by the low nibble,
				+ *         bytes 16-31 by the high nibble
				+ * @qmul:  32-byte table with the same layout
				+ */
				+void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp,
				+			      uint8_t *dq, const uint8_t *pbmul,
				+			      const uint8_t *qmul)
				+{
				+	uint8x16_t pm0 = vld1q_u8(pbmul);
				+	uint8x16_t pm1 = vld1q_u8(pbmul + 16);
				+	uint8x16_t qm0 = vld1q_u8(qmul);
				+	uint8x16_t qm1 = vld1q_u8(qmul + 16);
				+
				+	while (bytes) {
				+		uint8x16_t vx, vy, px, qx, db;
				+
				+		px = veorq_u8(vld1q_u8(p), vld1q_u8(dp));
				+		vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
				+
				+		/*
				+		 * qx = qmul[q ^ dq]: look up each nibble separately and
				+		 * combine. An unsigned per-byte shift already leaves just
				+		 * the high nibble, so it needs no mask before the lookup.
				+		 */
				+		vy = vshrq_n_u8(vx, 4);
				+		vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
				+		vy = vqtbl1q_u8(qm1, vy);
				+		qx = veorq_u8(vx, vy);
				+
				+		/* db = pbmul[px] ^ qx, using the same nibble split */
				+		vy = vshrq_n_u8(px, 4);
				+		vx = vqtbl1q_u8(pm0, vandq_u8(px, x0f));
				+		vy = vqtbl1q_u8(pm1, vy);
				+		vx = veorq_u8(vx, vy);
				+		db = veorq_u8(vx, qx);
				+
				+		vst1q_u8(dq, db);
				+		vst1q_u8(dp, veorq_u8(db, px));
				+
				+		bytes -= 16;
				+		p += 16;
				+		q += 16;
				+		dp += 16;
				+		dq += 16;
				+	}
				+}
			
 
				+
			
 
				+/*
				+ * NEON inner loop for data+P recovery, 16 bytes per iteration.
				+ *
				+ * Conceptually equivalent to the scalar loop:
				+ *
				+ *	while (bytes--) {
				+ *		*p++ ^= *dq = qmul[*q ^ *dq];
				+ *		q++; dq++;
				+ *	}
				+ *
				+ * @bytes: number of bytes to process; the loop steps by 16 and only
				+ *         stops when the count reaches exactly zero, so this must be
				+ *         a non-negative multiple of 16
				+ * @p:     P page, updated in place by XOR with the recovered data
				+ * @q:     Q page (read only here)
				+ * @dq:    delta q on entry, overwritten with the recovered data page
				+ * @qmul:  32-byte table: bytes 0-15 are indexed by the low nibble,
				+ *         bytes 16-31 by the high nibble
				+ */
				+void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq,
				+			      const uint8_t *qmul)
				+{
				+	uint8x16_t qm0 = vld1q_u8(qmul);
				+	uint8x16_t qm1 = vld1q_u8(qmul + 16);
				+
				+	while (bytes) {
				+		uint8x16_t vx, vy;
				+
				+		vx = veorq_u8(vld1q_u8(q), vld1q_u8(dq));
				+
				+		/*
				+		 * vx = qmul[q ^ dq]: look up each nibble separately and
				+		 * combine. An unsigned per-byte shift already leaves just
				+		 * the high nibble, so it needs no mask before the lookup.
				+		 */
				+		vy = vshrq_n_u8(vx, 4);
				+		vx = vqtbl1q_u8(qm0, vandq_u8(vx, x0f));
				+		vy = vqtbl1q_u8(qm1, vy);
				+		vx = veorq_u8(vx, vy);
				+
				+		/* Fold the recovered data back into P */
				+		vy = veorq_u8(vx, vld1q_u8(p));
				+
				+		vst1q_u8(dq, vx);
				+		vst1q_u8(p, vy);
				+
				+		bytes -= 16;
				+		p += 16;
				+		q += 16;
				+		dq += 16;
				+	}
				+}