@@ -0,0 +1,797 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright 2016 Tom aan de Wiel
+ * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
+ *
+ * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
+ *
+ * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
+ * R.D. Brown, 1977
+ */
+
+#include <linux/string.h>
+#include "vicodec-codec.h"
+
+#define ALL_ZEROS 15
+#define DEADZONE_WIDTH 20
+
+static const uint8_t zigzag[64] = {
+	0,
+	1, 8,
+	2, 9, 16,
+	3, 10, 17, 24,
+	4, 11, 18, 25, 32,
+	5, 12, 19, 26, 33, 40,
+	6, 13, 20, 27, 34, 41, 48,
+	7, 14, 21, 28, 35, 42, 49, 56,
+	15, 22, 29, 36, 43, 50, 57,
+	23, 30, 37, 44, 51, 58,
+	31, 38, 45, 52, 59,
+	39, 46, 53, 60,
+	47, 54, 61,
+	55, 62,
+	63,
+};
+
+
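+/*
+ * Run-length encode one 8x8 block of coefficients into 16-bit code words.
+ * The first word is a header: PFRAME_BIT marks a P-block, and the low
+ * DUPS_MASK bits are later used by encode_plane() to count duplicate
+ * blocks. Each following word packs a 4-bit run of zeros (in zigzag
+ * order) with a 12-bit coefficient; a run value of ALL_ZEROS (15) means
+ * that the rest of the block is zero. Returns the number of 16-bit words
+ * written.
+ */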
+static int rlc(const s16 *in, __be16 *output, int blocktype)
+{
+	s16 block[8 * 8];
+	s16 *wp = block;
+	int i = 0;
+	int x, y;
+	int ret = 0;
+
+	/* read in block from framebuffer */
+	int lastzero_run = 0;
+	int to_encode;
+
+	for (y = 0; y < 8; y++) {
+		for (x = 0; x < 8; x++) {
+			*wp = in[x + y * 8];
+			wp++;
+		}
+	}
+
+	/* keep track of the number of trailing zeros */
+	for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
+		lastzero_run++;
+
+	*output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
+	ret++;
+
+	to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);
+
+	i = 0;
+	while (i < to_encode) {
+		int cnt = 0;
+		int tmp;
+
+		/* count leading zeros */
+		while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
+			cnt++;
+			i++;
+			if (i == to_encode) {
+				cnt--;
+				break;
+			}
+		}
+		/* 4 bits for run, 12 for coefficient (quantization by 4) */
+		*output++ = htons(cnt | tmp << 4);
+		i++;
+		ret++;
+	}
+	if (lastzero_run > 14) {
+		*output = htons(ALL_ZEROS | 0);
+		ret++;
+	}
+
+	return ret;
+}
+
+/*
+ * In the worst case this function consumes 65 * 2 bytes from rlc_in:
+ * one s16 value for the header and 8 * 8 coefficients of type s16.
+ */
+static s16 derlc(const __be16 **rlc_in, s16 *dwht_out)
+{
+	/* header */
+	const __be16 *input = *rlc_in;
+	s16 ret = ntohs(*input++);
+	int dec_count = 0;
+	s16 block[8 * 8 + 16];
+	s16 *wp = block;
+	int i;
+
+	/*
+	 * Now de-compress: each code word expands to up to 15 coefficients
+	 * (or fills the remainder of the 64 coefficients with zeroes if it
+	 * is the last code word).
+	 *
+	 * So block has to hold 8 * 8 + 16 values, the '+ 16' is to
+	 * allow for overflow if the incoming data was malformed.
+	 */
+	while (dec_count < 8 * 8) {
+		s16 in = ntohs(*input++);
+		int length = in & 0xf;
+		int coeff = in >> 4;
+
+		/* fill the remainder with zeros */
+		if (length == 15) {
+			for (i = 0; i < 64 - dec_count; i++)
+				*wp++ = 0;
+			break;
+		}
+
+		for (i = 0; i < length; i++)
+			*wp++ = 0;
+		*wp++ = coeff;
+		dec_count += length + 1;
+	}
+
+	wp = block;
+
+	for (i = 0; i < 64; i++) {
+		int pos = zigzag[i];
+		int y = pos / 8;
+		int x = pos % 8;
+
+		dwht_out[x + y * 8] = *wp++;
+	}
+	*rlc_in = input;
+	return ret;
+}
+
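+/*
+ * Quantization tables: each entry is the number of bits a coefficient at
+ * that position is shifted right (and shifted left again when
+ * dequantizing), so quantization is by powers of two. Higher-frequency
+ * coefficients towards the bottom-right corner are quantized more
+ * coarsely; the P-frame table uses larger shifts since P-block deltas
+ * carry twice the range of I-block samples.
+ */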
+static const int quant_table[] = {
+	2, 2, 2, 2, 2, 2, 2, 2,
+	2, 2, 2, 2, 2, 2, 2, 2,
+	2, 2, 2, 2, 2, 2, 2, 3,
+	2, 2, 2, 2, 2, 2, 3, 6,
+	2, 2, 2, 2, 2, 3, 6, 6,
+	2, 2, 2, 2, 3, 6, 6, 6,
+	2, 2, 2, 3, 6, 6, 6, 6,
+	2, 2, 3, 6, 6, 6, 6, 8,
+};
+
+static const int quant_table_p[] = {
+	3, 3, 3, 3, 3, 3, 3, 3,
+	3, 3, 3, 3, 3, 3, 3, 3,
+	3, 3, 3, 3, 3, 3, 3, 3,
+	3, 3, 3, 3, 3, 3, 3, 6,
+	3, 3, 3, 3, 3, 3, 6, 6,
+	3, 3, 3, 3, 3, 6, 6, 9,
+	3, 3, 3, 3, 6, 6, 9, 9,
+	3, 3, 3, 6, 6, 9, 9, 10,
+};
+
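+/*
+ * quantize_intra() and quantize_inter() shift each coefficient right by
+ * the table value and zero anything that then falls inside the dead zone
+ * of +/- DEADZONE_WIDTH. de_coeff receives the dequantized value the
+ * decoder will reconstruct, so the encoder can keep a matching reference.
+ */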
+static void quantize_intra(s16 *coeff, s16 *de_coeff)
+{
+	const int *quant = quant_table;
+	int i, j;
+
+	for (j = 0; j < 8; j++) {
+		for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
+			*coeff >>= *quant;
+			if (*coeff >= -DEADZONE_WIDTH &&
+			    *coeff <= DEADZONE_WIDTH)
+				*coeff = *de_coeff = 0;
+			else
+				*de_coeff = *coeff << *quant;
+		}
+	}
+}
+
+static void dequantize_intra(s16 *coeff)
+{
+	const int *quant = quant_table;
+	int i, j;
+
+	for (j = 0; j < 8; j++)
+		for (i = 0; i < 8; i++, quant++, coeff++)
+			*coeff <<= *quant;
+}
+
+static void quantize_inter(s16 *coeff, s16 *de_coeff)
+{
+	const int *quant = quant_table_p;
+	int i, j;
+
+	for (j = 0; j < 8; j++) {
+		for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
+			*coeff >>= *quant;
+			if (*coeff >= -DEADZONE_WIDTH &&
+			    *coeff <= DEADZONE_WIDTH)
+				*coeff = *de_coeff = 0;
+			else
+				*de_coeff = *coeff << *quant;
+		}
+	}
+}
+
+static void dequantize_inter(s16 *coeff)
+{
+	const int *quant = quant_table_p;
+	int i, j;
+
+	for (j = 0; j < 8; j++)
+		for (i = 0; i < 8; i++, quant++, coeff++)
+			*coeff <<= *quant;
+}
+
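+/*
+ * Forward 8x8 FWHT: three butterfly stages over each row, then three over
+ * each column. input_step is 1 when the plane samples are contiguous and
+ * 2 when they are interleaved with another plane (only those two cases
+ * are handled below). For intra blocks, 256 is subtracted from every
+ * pairwise sum in stage 1, which is equivalent to subtracting 128 from
+ * each input sample, so the transform operates on zero-centred data;
+ * inter blocks already hold signed deltas and get no offset.
+ */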
+static void fwht(const u8 *block, s16 *output_block, unsigned int stride,
+		 unsigned int input_step, bool intra)
+{
+	/* we'll need more than 8 bits for the transformed coefficients */
+	s32 workspace1[8], workspace2[8];
+	const u8 *tmp = block;
+	s16 *out = output_block;
+	int add = intra ? 256 : 0;
+	unsigned int i;
+
+	/* stage 1 */
+	stride *= input_step;
+
+	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
+		if (input_step == 1) {
+			workspace1[0] = tmp[0] + tmp[1] - add;
+			workspace1[1] = tmp[0] - tmp[1];
+
+			workspace1[2] = tmp[2] + tmp[3] - add;
+			workspace1[3] = tmp[2] - tmp[3];
+
+			workspace1[4] = tmp[4] + tmp[5] - add;
+			workspace1[5] = tmp[4] - tmp[5];
+
+			workspace1[6] = tmp[6] + tmp[7] - add;
+			workspace1[7] = tmp[6] - tmp[7];
+		} else {
+			workspace1[0] = tmp[0] + tmp[2] - add;
+			workspace1[1] = tmp[0] - tmp[2];
+
+			workspace1[2] = tmp[4] + tmp[6] - add;
+			workspace1[3] = tmp[4] - tmp[6];
+
+			workspace1[4] = tmp[8] + tmp[10] - add;
+			workspace1[5] = tmp[8] - tmp[10];
+
+			workspace1[6] = tmp[12] + tmp[14] - add;
+			workspace1[7] = tmp[12] - tmp[14];
+		}
+
+		/* stage 2 */
+		workspace2[0] = workspace1[0] + workspace1[2];
+		workspace2[1] = workspace1[0] - workspace1[2];
+		workspace2[2] = workspace1[1] - workspace1[3];
+		workspace2[3] = workspace1[1] + workspace1[3];
+
+		workspace2[4] = workspace1[4] + workspace1[6];
+		workspace2[5] = workspace1[4] - workspace1[6];
+		workspace2[6] = workspace1[5] - workspace1[7];
+		workspace2[7] = workspace1[5] + workspace1[7];
+
+		/* stage 3 */
+		out[0] = workspace2[0] + workspace2[4];
+		out[1] = workspace2[0] - workspace2[4];
+		out[2] = workspace2[1] - workspace2[5];
+		out[3] = workspace2[1] + workspace2[5];
+		out[4] = workspace2[2] + workspace2[6];
+		out[5] = workspace2[2] - workspace2[6];
+		out[6] = workspace2[3] - workspace2[7];
+		out[7] = workspace2[3] + workspace2[7];
+	}
+
+	out = output_block;
+
+	for (i = 0; i < 8; i++, out++) {
+		/* stage 1 */
+		workspace1[0] = out[0] + out[1 * 8];
+		workspace1[1] = out[0] - out[1 * 8];
+
+		workspace1[2] = out[2 * 8] + out[3 * 8];
+		workspace1[3] = out[2 * 8] - out[3 * 8];
+
+		workspace1[4] = out[4 * 8] + out[5 * 8];
+		workspace1[5] = out[4 * 8] - out[5 * 8];
+
+		workspace1[6] = out[6 * 8] + out[7 * 8];
+		workspace1[7] = out[6 * 8] - out[7 * 8];
+
+		/* stage 2 */
+		workspace2[0] = workspace1[0] + workspace1[2];
+		workspace2[1] = workspace1[0] - workspace1[2];
+		workspace2[2] = workspace1[1] - workspace1[3];
+		workspace2[3] = workspace1[1] + workspace1[3];
+
+		workspace2[4] = workspace1[4] + workspace1[6];
+		workspace2[5] = workspace1[4] - workspace1[6];
+		workspace2[6] = workspace1[5] - workspace1[7];
+		workspace2[7] = workspace1[5] + workspace1[7];
+		/* stage 3 */
+		out[0 * 8] = workspace2[0] + workspace2[4];
+		out[1 * 8] = workspace2[0] - workspace2[4];
+		out[2 * 8] = workspace2[1] - workspace2[5];
+		out[3 * 8] = workspace2[1] + workspace2[5];
+		out[4 * 8] = workspace2[2] + workspace2[6];
+		out[5 * 8] = workspace2[2] - workspace2[6];
+		out[6 * 8] = workspace2[3] - workspace2[7];
+		out[7 * 8] = workspace2[3] + workspace2[7];
+	}
+}
+
+
+/*
+ * Not the nicest way of doing it, but P-blocks get twice the range of
+ * the I-blocks. Therefore we need a type bigger than 8 bits.
+ * Furthermore the values can be negative... This is just a version that
+ * works with 16-bit signed data.
+ */
+static void fwht16(const s16 *block, s16 *output_block, int stride, int intra)
+{
+	/* we'll need more than 8 bits for the transformed coefficients */
+	s32 workspace1[8], workspace2[8];
+	const s16 *tmp = block;
+	s16 *out = output_block;
+	int i;
+
+	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
+		/* stage 1 */
+		workspace1[0] = tmp[0] + tmp[1];
+		workspace1[1] = tmp[0] - tmp[1];
+
+		workspace1[2] = tmp[2] + tmp[3];
+		workspace1[3] = tmp[2] - tmp[3];
+
+		workspace1[4] = tmp[4] + tmp[5];
+		workspace1[5] = tmp[4] - tmp[5];
+
+		workspace1[6] = tmp[6] + tmp[7];
+		workspace1[7] = tmp[6] - tmp[7];
+
+		/* stage 2 */
+		workspace2[0] = workspace1[0] + workspace1[2];
+		workspace2[1] = workspace1[0] - workspace1[2];
+		workspace2[2] = workspace1[1] - workspace1[3];
+		workspace2[3] = workspace1[1] + workspace1[3];
+
+		workspace2[4] = workspace1[4] + workspace1[6];
+		workspace2[5] = workspace1[4] - workspace1[6];
+		workspace2[6] = workspace1[5] - workspace1[7];
+		workspace2[7] = workspace1[5] + workspace1[7];
+
+		/* stage 3 */
+		out[0] = workspace2[0] + workspace2[4];
+		out[1] = workspace2[0] - workspace2[4];
+		out[2] = workspace2[1] - workspace2[5];
+		out[3] = workspace2[1] + workspace2[5];
+		out[4] = workspace2[2] + workspace2[6];
+		out[5] = workspace2[2] - workspace2[6];
+		out[6] = workspace2[3] - workspace2[7];
+		out[7] = workspace2[3] + workspace2[7];
+	}
+
+	out = output_block;
+
+	for (i = 0; i < 8; i++, out++) {
+		/* stage 1 */
+		workspace1[0] = out[0] + out[1 * 8];
+		workspace1[1] = out[0] - out[1 * 8];
+
+		workspace1[2] = out[2 * 8] + out[3 * 8];
+		workspace1[3] = out[2 * 8] - out[3 * 8];
+
+		workspace1[4] = out[4 * 8] + out[5 * 8];
+		workspace1[5] = out[4 * 8] - out[5 * 8];
+
+		workspace1[6] = out[6 * 8] + out[7 * 8];
+		workspace1[7] = out[6 * 8] - out[7 * 8];
+
+		/* stage 2 */
+		workspace2[0] = workspace1[0] + workspace1[2];
+		workspace2[1] = workspace1[0] - workspace1[2];
+		workspace2[2] = workspace1[1] - workspace1[3];
+		workspace2[3] = workspace1[1] + workspace1[3];
+
+		workspace2[4] = workspace1[4] + workspace1[6];
+		workspace2[5] = workspace1[4] - workspace1[6];
+		workspace2[6] = workspace1[5] - workspace1[7];
+		workspace2[7] = workspace1[5] + workspace1[7];
+
+		/* stage 3 */
+		out[0 * 8] = workspace2[0] + workspace2[4];
+		out[1 * 8] = workspace2[0] - workspace2[4];
+		out[2 * 8] = workspace2[1] - workspace2[5];
+		out[3 * 8] = workspace2[1] + workspace2[5];
+		out[4 * 8] = workspace2[2] + workspace2[6];
+		out[5 * 8] = workspace2[2] - workspace2[6];
+		out[6 * 8] = workspace2[3] - workspace2[7];
+		out[7 * 8] = workspace2[3] + workspace2[7];
+	}
+}
+
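+/*
+ * Inverse 8x8 FWHT. The final shift right by 6 rescales the result back
+ * to sample range; for intra blocks 128 is then added to return to
+ * unsigned pixels, while inter blocks remain signed deltas that
+ * add_deltas() later adds to the reference block.
+ */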
+static void ifwht(const s16 *block, s16 *output_block, int intra)
+{
+	/*
+	 * we'll need more than 8 bits for the transformed coefficients,
+	 * so use the native word size of the CPU
+	 */
+	int workspace1[8], workspace2[8];
+	int inter = intra ? 0 : 1;
+	const s16 *tmp = block;
+	s16 *out = output_block;
+	int i;
+
+	for (i = 0; i < 8; i++, tmp += 8, out += 8) {
+		/* stage 1 */
+		workspace1[0] = tmp[0] + tmp[1];
+		workspace1[1] = tmp[0] - tmp[1];
+
+		workspace1[2] = tmp[2] + tmp[3];
+		workspace1[3] = tmp[2] - tmp[3];
+
+		workspace1[4] = tmp[4] + tmp[5];
+		workspace1[5] = tmp[4] - tmp[5];
+
+		workspace1[6] = tmp[6] + tmp[7];
+		workspace1[7] = tmp[6] - tmp[7];
+
+		/* stage 2 */
+		workspace2[0] = workspace1[0] + workspace1[2];
+		workspace2[1] = workspace1[0] - workspace1[2];
+		workspace2[2] = workspace1[1] - workspace1[3];
+		workspace2[3] = workspace1[1] + workspace1[3];
+
+		workspace2[4] = workspace1[4] + workspace1[6];
+		workspace2[5] = workspace1[4] - workspace1[6];
+		workspace2[6] = workspace1[5] - workspace1[7];
+		workspace2[7] = workspace1[5] + workspace1[7];
+
+		/* stage 3 */
+		out[0] = workspace2[0] + workspace2[4];
+		out[1] = workspace2[0] - workspace2[4];
+		out[2] = workspace2[1] - workspace2[5];
+		out[3] = workspace2[1] + workspace2[5];
+		out[4] = workspace2[2] + workspace2[6];
+		out[5] = workspace2[2] - workspace2[6];
+		out[6] = workspace2[3] - workspace2[7];
+		out[7] = workspace2[3] + workspace2[7];
+	}
+
+	out = output_block;
+
+	for (i = 0; i < 8; i++, out++) {
+		/* stage 1 */
+		workspace1[0] = out[0] + out[1 * 8];
+		workspace1[1] = out[0] - out[1 * 8];
+
+		workspace1[2] = out[2 * 8] + out[3 * 8];
+		workspace1[3] = out[2 * 8] - out[3 * 8];
+
+		workspace1[4] = out[4 * 8] + out[5 * 8];
+		workspace1[5] = out[4 * 8] - out[5 * 8];
+
+		workspace1[6] = out[6 * 8] + out[7 * 8];
+		workspace1[7] = out[6 * 8] - out[7 * 8];
+
+		/* stage 2 */
+		workspace2[0] = workspace1[0] + workspace1[2];
+		workspace2[1] = workspace1[0] - workspace1[2];
+		workspace2[2] = workspace1[1] - workspace1[3];
+		workspace2[3] = workspace1[1] + workspace1[3];
+
+		workspace2[4] = workspace1[4] + workspace1[6];
+		workspace2[5] = workspace1[4] - workspace1[6];
+		workspace2[6] = workspace1[5] - workspace1[7];
+		workspace2[7] = workspace1[5] + workspace1[7];
+
+		/* stage 3 */
+		if (inter) {
+			int d;
+
+			out[0 * 8] = workspace2[0] + workspace2[4];
+			out[1 * 8] = workspace2[0] - workspace2[4];
+			out[2 * 8] = workspace2[1] - workspace2[5];
+			out[3 * 8] = workspace2[1] + workspace2[5];
+			out[4 * 8] = workspace2[2] + workspace2[6];
+			out[5 * 8] = workspace2[2] - workspace2[6];
+			out[6 * 8] = workspace2[3] - workspace2[7];
+			out[7 * 8] = workspace2[3] + workspace2[7];
+
+			for (d = 0; d < 8; d++)
+				out[8 * d] >>= 6;
+		} else {
+			int d;
+
+			out[0 * 8] = workspace2[0] + workspace2[4];
+			out[1 * 8] = workspace2[0] - workspace2[4];
+			out[2 * 8] = workspace2[1] - workspace2[5];
+			out[3 * 8] = workspace2[1] + workspace2[5];
+			out[4 * 8] = workspace2[2] + workspace2[6];
+			out[5 * 8] = workspace2[2] - workspace2[6];
+			out[6 * 8] = workspace2[3] - workspace2[7];
+			out[7 * 8] = workspace2[3] + workspace2[7];
+
+			for (d = 0; d < 8; d++) {
+				out[8 * d] >>= 6;
+				out[8 * d] += 128;
+			}
+		}
+	}
+}
+
+static void fill_encoder_block(const u8 *input, s16 *dst,
+			       unsigned int stride, unsigned int input_step)
+{
+	int i, j;
+
+	for (i = 0; i < 8; i++) {
+		for (j = 0; j < 8; j++, input += input_step)
+			*dst++ = *input;
+		input += (stride - 8) * input_step;
+	}
+}
+
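+/*
+ * var_intra() returns the sum of absolute deviations of the block from
+ * its mean, var_inter() the sum of absolute differences between the old
+ * and new blocks; decide_blocktype() compares the two to pick I or P
+ * coding.
+ */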
+static int var_intra(const s16 *input)
+{
+	int32_t mean = 0;
+	int32_t ret = 0;
+	const s16 *tmp = input;
+	int i;
+
+	for (i = 0; i < 8 * 8; i++, tmp++)
+		mean += *tmp;
+	mean /= 64;
+	tmp = input;
+	for (i = 0; i < 8 * 8; i++, tmp++)
+		ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean);
+	return ret;
+}
+
+static int var_inter(const s16 *old, const s16 *new)
+{
+	int32_t ret = 0;
+	int i;
+
+	for (i = 0; i < 8 * 8; i++, old++, new++)
+		ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new);
+	return ret;
+}
+
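+/*
+ * Fill deltablock with the difference between the current block and the
+ * reference block, then choose an I-block when the intra measure is no
+ * larger than the inter measure, and a P-block otherwise.
+ */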
+static int decide_blocktype(const u8 *current, const u8 *reference,
+			    s16 *deltablock, unsigned int stride,
+			    unsigned int input_step)
+{
+	s16 tmp[64];
+	s16 old[64];
+	s16 *work = tmp;
+	unsigned int k, l;
+	int vari;
+	int vard;
+
+	fill_encoder_block(current, tmp, stride, input_step);
+	fill_encoder_block(reference, old, 8, 1);
+	vari = var_intra(tmp);
+
+	for (k = 0; k < 8; k++) {
+		for (l = 0; l < 8; l++) {
+			*deltablock = *work - *reference;
+			deltablock++;
+			work++;
+			reference++;
+		}
+	}
+	deltablock -= 64;
+	vard = var_inter(old, tmp);
+	return vari <= vard ? IBLOCK : PBLOCK;
+}
+
+static void fill_decoder_block(u8 *dst, const s16 *input, int stride)
+{
+	int i, j;
+
+	for (i = 0; i < 8; i++) {
+		for (j = 0; j < 8; j++)
+			*dst++ = *input++;
+		dst += stride - 8;
+	}
+}
+
+static void add_deltas(s16 *deltas, const u8 *ref, int stride)
+{
+	int k, l;
+
+	for (k = 0; k < 8; k++) {
+		for (l = 0; l < 8; l++) {
+			*deltas += *ref++;
+			/*
+			 * Due to quantizing, it is possible that the
+			 * decoded coefficients are slightly out of range.
+			 */
+			if (*deltas < 0)
+				*deltas = 0;
+			else if (*deltas > 255)
+				*deltas = 255;
+			deltas++;
+		}
+		ref += stride - 8;
+	}
+}
+
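+/*
+ * Encode one plane. After run-length coding each block, a block whose
+ * RLC output matches that of the previous block is not stored again:
+ * instead the duplicate counter in the previous header (the DUPS_MASK
+ * bits) is bumped by 2. If the output pointer reaches rlco_max the plane
+ * is stored uncompressed instead and FRAME_UNENCODED is set in the
+ * returned value.
+ */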
+static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
+			struct cframe *cf, u32 height, u32 width,
+			unsigned int input_step,
+			bool is_intra, bool next_is_intra)
+{
+	u8 *input_start = input;
+	__be16 *rlco_start = *rlco;
+	s16 deltablock[64];
+	__be16 pframe_bit = htons(PFRAME_BIT);
+	u32 encoding = 0;
+	unsigned int last_size = 0;
+	unsigned int i, j;
+
+	for (j = 0; j < height / 8; j++) {
+		for (i = 0; i < width / 8; i++) {
+			/* intra code: the first frame is always intra coded */
+			int blocktype = IBLOCK;
+			unsigned int size;
+
+			if (!is_intra)
+				blocktype = decide_blocktype(input, refp,
+					deltablock, width, input_step);
+			if (is_intra || blocktype == IBLOCK) {
+				fwht(input, cf->coeffs, width, input_step, 1);
+				quantize_intra(cf->coeffs, cf->de_coeffs);
+				blocktype = IBLOCK;
+			} else {
+				/* inter code */
+				encoding |= FRAME_PCODED;
+				fwht16(deltablock, cf->coeffs, 8, 0);
+				quantize_inter(cf->coeffs, cf->de_coeffs);
+			}
+			if (!next_is_intra) {
+				ifwht(cf->de_coeffs, cf->de_fwht, blocktype);
+
+				if (blocktype == PBLOCK)
+					add_deltas(cf->de_fwht, refp, 8);
+				fill_decoder_block(refp, cf->de_fwht, 8);
+			}
+
+			input += 8 * input_step;
+			refp += 8 * 8;
+
+			if (encoding & FRAME_UNENCODED)
+				continue;
+
+			size = rlc(cf->coeffs, *rlco, blocktype);
+			if (last_size == size &&
+			    !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) {
+				__be16 *last_rlco = *rlco - size;
+				s16 hdr = ntohs(*last_rlco);
+
+				if (!((*last_rlco ^ **rlco) & pframe_bit) &&
+				    (hdr & DUPS_MASK) < DUPS_MASK)
+					*last_rlco = htons(hdr + 2);
+				else
+					*rlco += size;
+			} else {
+				*rlco += size;
+			}
+			if (*rlco >= rlco_max)
+				encoding |= FRAME_UNENCODED;
+			last_size = size;
+		}
+		input += width * 7 * input_step;
+	}
+	if (encoding & FRAME_UNENCODED) {
+		u8 *out = (u8 *)rlco_start;
+
+		input = input_start;
+		/*
+		 * The compressed stream should never contain the magic
+		 * header, so when we copy the YUV data we replace 0xff
+		 * by 0xfe. Since YUV is limited range, such values
+		 * shouldn't appear anyway.
+		 */
+		for (i = 0; i < height * width; i++, input += input_step)
+			*out++ = (*input == 0xff) ? 0xfe : *input;
+		*rlco = (__be16 *)out;
+	}
+	return encoding;
+}
+
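+/*
+ * Encode all three planes. rlco_max is set 256 words short of the space
+ * an uncompressed plane would occupy, so encode_plane() notices when it
+ * is about to overflow and falls back to storing the plane uncompressed;
+ * each per-plane FRAME_UNENCODED result is translated into the
+ * corresponding LUMA/CB/CR_UNENCODED flag of the returned value.
+ */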
+u32 encode_frame(struct raw_frame *frm, struct raw_frame *ref_frm,
+		 struct cframe *cf, bool is_intra, bool next_is_intra)
+{
+	unsigned int size = frm->height * frm->width;
+	__be16 *rlco = cf->rlc_data;
+	__be16 *rlco_max;
+	u32 encoding;
+
+	rlco_max = rlco + size / 2 - 256;
+	encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
+				frm->height, frm->width,
+				1, is_intra, next_is_intra);
+	if (encoding & FRAME_UNENCODED)
+		encoding |= LUMA_UNENCODED;
+	encoding &= ~FRAME_UNENCODED;
+	rlco_max = rlco + size / 8 - 256;
+	encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max, cf,
+				 frm->height / 2, frm->width / 2,
+				 frm->chroma_step, is_intra, next_is_intra);
+	if (encoding & FRAME_UNENCODED)
+		encoding |= CB_UNENCODED;
+	encoding &= ~FRAME_UNENCODED;
+	rlco_max = rlco + size / 8 - 256;
+	encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max, cf,
+				 frm->height / 2, frm->width / 2,
+				 frm->chroma_step, is_intra, next_is_intra);
+	if (encoding & FRAME_UNENCODED)
+		encoding |= CR_UNENCODED;
+	encoding &= ~FRAME_UNENCODED;
+	cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
+	return encoding;
+}
+
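+/*
+ * Decode one plane. The header of each run-length coded block tells the
+ * decoder whether to use the inter or intra dequantization table
+ * (PFRAME_BIT) and how many of the following blocks are exact copies of
+ * this one (the DUPS_MASK counter), in which case the already decoded
+ * coefficients are simply reused.
+ */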
+static void decode_plane(struct cframe *cf, const __be16 **rlco, u8 *ref,
+			 u32 height, u32 width, bool uncompressed)
+{
+	unsigned int copies = 0;
+	s16 copy[8 * 8];
+	s16 stat;
+	unsigned int i, j;
+
+	if (uncompressed) {
+		memcpy(ref, *rlco, width * height);
+		*rlco += width * height / 2;
+		return;
+	}
+
+	/*
+	 * When decoding each macroblock the rlco pointer will be increased
+	 * by 65 * 2 bytes worst-case.
+	 * To avoid overflow the buffer has to be 65/64th of the actual raw
+	 * image size, just in case someone feeds it malicious data.
+	 */
+	for (j = 0; j < height / 8; j++) {
+		for (i = 0; i < width / 8; i++) {
+			u8 *refp = ref + j * 8 * width + i * 8;
+
+			if (copies) {
+				memcpy(cf->de_fwht, copy, sizeof(copy));
+				if (stat & PFRAME_BIT)
+					add_deltas(cf->de_fwht, refp, width);
+				fill_decoder_block(refp, cf->de_fwht, width);
+				copies--;
+				continue;
+			}
+
+			stat = derlc(rlco, cf->coeffs);
+
+			if (stat & PFRAME_BIT)
+				dequantize_inter(cf->coeffs);
+			else
+				dequantize_intra(cf->coeffs);
+
+			ifwht(cf->coeffs, cf->de_fwht,
+			      (stat & PFRAME_BIT) ? 0 : 1);
+
+			copies = (stat & DUPS_MASK) >> 1;
+			if (copies)
+				memcpy(copy, cf->de_fwht, sizeof(copy));
+			if (stat & PFRAME_BIT)
+				add_deltas(cf->de_fwht, refp, width);
+			fill_decoder_block(refp, cf->de_fwht, width);
+		}
+	}
+}
+
+void decode_frame(struct cframe *cf, struct raw_frame *ref, u32 hdr_flags)
+{
+	const __be16 *rlco = cf->rlc_data;
+
+	decode_plane(cf, &rlco, ref->luma, cf->height, cf->width,
+		     hdr_flags & VICODEC_FL_LUMA_IS_UNCOMPRESSED);
+	decode_plane(cf, &rlco, ref->cb, cf->height / 2, cf->width / 2,
+		     hdr_flags & VICODEC_FL_CB_IS_UNCOMPRESSED);
+	decode_plane(cf, &rlco, ref->cr, cf->height / 2, cf->width / 2,
+		     hdr_flags & VICODEC_FL_CR_IS_UNCOMPRESSED);
+}