@@ -36,166 +36,39 @@
  * Returns:
  *	x0 - dest
  */
-dstin	.req	x0
-src	.req	x1
-count	.req	x2
-tmp1	.req	x3
-tmp1w	.req	w3
-tmp2	.req	x4
-tmp2w	.req	w4
-tmp3	.req	x5
-tmp3w	.req	w5
-dst	.req	x6
+	.macro ldrb1 ptr, regB, val
+	ldrb \ptr, [\regB], \val
+	.endm
 
-A_l	.req	x7
-A_h	.req	x8
-B_l	.req	x9
-B_h	.req	x10
-C_l	.req	x11
-C_h	.req	x12
-D_l	.req	x13
-D_h	.req	x14
+	.macro strb1 ptr, regB, val
+	strb \ptr, [\regB], \val
+	.endm
 
-ENTRY(memcpy)
-	mov	dst, dstin
-	cmp	count, #16
-	/*When memory length is less than 16, the accessed are not aligned.*/
-	b.lo	.Ltiny15
+	.macro ldrh1 ptr, regB, val
+	ldrh \ptr, [\regB], \val
+	.endm
 
-	neg	tmp2, src
-	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */
-	b.eq	.LSrcAligned
-	sub	count, count, tmp2
-	/*
-	* Copy the leading memory data from src to dst in an increasing
-	* address order.By this way,the risk of overwritting the source
-	* memory data is eliminated when the distance between src and
-	* dst is less than 16. The memory accesses here are alignment.
-	*/
-	tbz	tmp2, #0, 1f
-	ldrb	tmp1w, [src], #1
-	strb	tmp1w, [dst], #1
-1:
-	tbz	tmp2, #1, 2f
-	ldrh	tmp1w, [src], #2
-	strh	tmp1w, [dst], #2
-2:
-	tbz	tmp2, #2, 3f
-	ldr	tmp1w, [src], #4
-	str	tmp1w, [dst], #4
-3:
-	tbz	tmp2, #3, .LSrcAligned
-	ldr	tmp1, [src],#8
-	str	tmp1, [dst],#8
+	.macro strh1 ptr, regB, val
+	strh \ptr, [\regB], \val
+	.endm
 
-.LSrcAligned:
-	cmp	count, #64
-	b.ge	.Lcpy_over64
-	/*
-	* Deal with small copies quickly by dropping straight into the
-	* exit block.
-	*/
-.Ltail63:
-	/*
-	* Copy up to 48 bytes of data. At this point we only need the
-	* bottom 6 bits of count to be accurate.
-	*/
-	ands	tmp1, count, #0x30
-	b.eq	.Ltiny15
-	cmp	tmp1w, #0x20
-	b.eq	1f
-	b.lt	2f
-	ldp	A_l, A_h, [src], #16
-	stp	A_l, A_h, [dst], #16
-1:
-	ldp	A_l, A_h, [src], #16
-	stp	A_l, A_h, [dst], #16
-2:
-	ldp	A_l, A_h, [src], #16
-	stp	A_l, A_h, [dst], #16
-.Ltiny15:
-	/*
-	* Prefer to break one ldp/stp into several load/store to access
-	* memory in an increasing address order,rather than to load/store 16
-	* bytes from (src-16) to (dst-16) and to backward the src to aligned
-	* address,which way is used in original cortex memcpy. If keeping
-	* the original memcpy process here, memmove need to satisfy the
-	* precondition that src address is at least 16 bytes bigger than dst
-	* address,otherwise some source data will be overwritten when memove
-	* call memcpy directly. To make memmove simpler and decouple the
-	* memcpy's dependency on memmove, withdrew the original process.
-	*/
-	tbz	count, #3, 1f
-	ldr	tmp1, [src], #8
-	str	tmp1, [dst], #8
-1:
-	tbz	count, #2, 2f
-	ldr	tmp1w, [src], #4
-	str	tmp1w, [dst], #4
-2:
-	tbz	count, #1, 3f
-	ldrh	tmp1w, [src], #2
-	strh	tmp1w, [dst], #2
-3:
-	tbz	count, #0, .Lexitfunc
-	ldrb	tmp1w, [src]
-	strb	tmp1w, [dst]
+	.macro ldr1 ptr, regB, val
+	ldr \ptr, [\regB], \val
+	.endm
 
-.Lexitfunc:
-	ret
+	.macro str1 ptr, regB, val
+	str \ptr, [\regB], \val
+	.endm
 
-.Lcpy_over64:
-	subs	count, count, #128
-	b.ge	.Lcpy_body_large
-	/*
-	* Less than 128 bytes to copy, so handle 64 here and then jump
-	* to the tail.
-	*/
-	ldp	A_l, A_h, [src],#16
-	stp	A_l, A_h, [dst],#16
-	ldp	B_l, B_h, [src],#16
-	ldp	C_l, C_h, [src],#16
-	stp	B_l, B_h, [dst],#16
-	stp	C_l, C_h, [dst],#16
-	ldp	D_l, D_h, [src],#16
-	stp	D_l, D_h, [dst],#16
+	.macro ldp1 ptr, regB, regC, val
+	ldp \ptr, \regB, [\regC], \val
+	.endm
 
-	tst	count, #0x3f
-	b.ne	.Ltail63
-	ret
+	.macro stp1 ptr, regB, regC, val
+	stp \ptr, \regB, [\regC], \val
+	.endm
 
-	/*
-	* Critical loop.  Start at a new cache line boundary.  Assuming
-	* 64 bytes per line this ensures the entire loop is in one line.
-	*/
-	.p2align	L1_CACHE_SHIFT
-.Lcpy_body_large:
-	/* pre-get 64 bytes data. */
-	ldp	A_l, A_h, [src],#16
-	ldp	B_l, B_h, [src],#16
-	ldp	C_l, C_h, [src],#16
-	ldp	D_l, D_h, [src],#16
-1:
-	/*
-	* interlace the load of next 64 bytes data block with store of the last
-	* loaded 64 bytes data.
-	*/
-	stp	A_l, A_h, [dst],#16
-	ldp	A_l, A_h, [src],#16
-	stp	B_l, B_h, [dst],#16
-	ldp	B_l, B_h, [src],#16
-	stp	C_l, C_h, [dst],#16
-	ldp	C_l, C_h, [src],#16
-	stp	D_l, D_h, [dst],#16
-	ldp	D_l, D_h, [src],#16
-	subs	count, count, #64
-	b.ge	1b
-	stp	A_l, A_h, [dst],#16
-	stp	B_l, B_h, [dst],#16
-	stp	C_l, C_h, [dst],#16
-	stp	D_l, D_h, [dst],#16
-
-	tst	count, #0x3f
-	b.ne	.Ltail63
+ENTRY(memcpy)
+#include "copy_template.S"
 	ret
 ENDPROC(memcpy)
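
The point of funneling every memory access through the ldrb1/strb1/ldrh1/strh1/ldr1/str1/ldp1/stp1 macros is that copy_template.S can now be shared with the uaccess copy routines: memcpy defines them as the plain post-indexed accesses above, while a user-copy caller can redefine the access side to carry an exception-table annotation without touching the copy logic. A minimal sketch of that idea (hypothetical, not the actual copy_from_user.S; it assumes the arm64 USER() extable helper and the tmp1w/src/dst/count aliases set up by the template):

	/*
	 * Hypothetical uaccess variant: same macro name, but the load now
	 * gets an exception-table entry, so a fault branches to 9998f
	 * instead of oopsing.
	 */
	.macro ldrb1 ptr, regB, val
	USER(9998f, ldrb \ptr, [\regB], \val)
	.endm

	/* A tail fragment as the shared template might emit it: */
	tbz	count, #0, 1f		/* odd trailing byte left? */
	ldrb1	tmp1w, src, #1		/* expands via the macro above */
	strb1	tmp1w, dst, #1
1:

Because the template is textually included, each of memcpy, copy_from_user, and copy_to_user gets its own expansion with whatever access semantics its macro definitions choose.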