|
@@ -212,18 +212,18 @@ EXC( STORE t2, UNIT(6)(dst), s_exc_p10u)
|
|
|
ADD src, src, 16*NBYTES
|
|
|
EXC( STORE t3, UNIT(7)(dst), s_exc_p9u)
|
|
|
ADD dst, dst, 16*NBYTES
|
|
|
-EXC( LOAD t0, UNIT(-8)(src), l_exc_copy)
|
|
|
-EXC( LOAD t1, UNIT(-7)(src), l_exc_copy)
|
|
|
-EXC( LOAD t2, UNIT(-6)(src), l_exc_copy)
|
|
|
-EXC( LOAD t3, UNIT(-5)(src), l_exc_copy)
|
|
|
+EXC( LOAD t0, UNIT(-8)(src), l_exc_copy_rewind16)
|
|
|
+EXC( LOAD t1, UNIT(-7)(src), l_exc_copy_rewind16)
|
|
|
+EXC( LOAD t2, UNIT(-6)(src), l_exc_copy_rewind16)
|
|
|
+EXC( LOAD t3, UNIT(-5)(src), l_exc_copy_rewind16)
|
|
|
EXC( STORE t0, UNIT(-8)(dst), s_exc_p8u)
|
|
|
EXC( STORE t1, UNIT(-7)(dst), s_exc_p7u)
|
|
|
EXC( STORE t2, UNIT(-6)(dst), s_exc_p6u)
|
|
|
EXC( STORE t3, UNIT(-5)(dst), s_exc_p5u)
|
|
|
-EXC( LOAD t0, UNIT(-4)(src), l_exc_copy)
|
|
|
-EXC( LOAD t1, UNIT(-3)(src), l_exc_copy)
|
|
|
-EXC( LOAD t2, UNIT(-2)(src), l_exc_copy)
|
|
|
-EXC( LOAD t3, UNIT(-1)(src), l_exc_copy)
|
|
|
+EXC( LOAD t0, UNIT(-4)(src), l_exc_copy_rewind16)
|
|
|
+EXC( LOAD t1, UNIT(-3)(src), l_exc_copy_rewind16)
|
|
|
+EXC( LOAD t2, UNIT(-2)(src), l_exc_copy_rewind16)
|
|
|
+EXC( LOAD t3, UNIT(-1)(src), l_exc_copy_rewind16)
|
|
|
EXC( STORE t0, UNIT(-4)(dst), s_exc_p4u)
|
|
|
EXC( STORE t1, UNIT(-3)(dst), s_exc_p3u)
|
|
|
EXC( STORE t2, UNIT(-2)(dst), s_exc_p2u)
|
|
@@ -387,6 +387,10 @@ done:
|
|
|
nop
|
|
|
END(memcpy)
|
|
|
|
|
|
+l_exc_copy_rewind16:
|
|
|
+ /* src/dst were already advanced by 16*NBYTES: undo, fall into l_exc_copy */
|
|
|
+ SUB src, src, 16*NBYTES
|
|
|
+ SUB dst, dst, 16*NBYTES
|
|
|
l_exc_copy:
|
|
|
/*
|
|
|
* Copy bytes from src until faulting load address (or until a
|