copyuser_power7.S

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>
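
/*
 * LVS/VPERM wrap lvsl/lvsr and vperm so the unaligned VMX loop below
 * works on either endianness: the permute control vector is generated
 * from the source address, and on little-endian the vperm source
 * operands are swapped so the same loop body still merges the two
 * aligned quadwords correctly.
 */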
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif
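
/*
 * Each errN macro tags the user access that follows it with an
 * exception table entry.  A fault branches to the matching .Ldo_errN
 * label, which undoes whatever state that stage of the copy had set up
 * (saved non-volatile GPRs, the stack frame, the VMX context) and then
 * falls back to __copy_tofrom_user_base with the original arguments so
 * the correct number of uncopied bytes is returned.
 */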
.macro err1
100:
	EX_TABLE(100b,.Ldo_err1)
.endm

.macro err2
200:
	EX_TABLE(200b,.Ldo_err2)
.endm

#ifdef CONFIG_ALTIVEC
.macro err3
300:
	EX_TABLE(300b,.Ldo_err3)
.endm

.macro err4
400:
	EX_TABLE(400b,.Ldo_err4)
.endm

.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	exit_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base
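
/*
 * __copy_tofrom_user_power7(to, from, n)
 *   r3 = destination, r4 = source, r5 = byte count
 * Returns 0 on success; on a fault the fixup code above falls back to
 * __copy_tofrom_user_base, which returns the number of uncopied bytes.
 */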
_GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,3328

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy
	bge	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy
#endif
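
/*
 * GPR fallback path: used when the copy is too small for the VMX path
 * (below 3328 bytes), when CONFIG_ALTIVEC is disabled, or when VMX
 * cannot be entered.  Align the source to 8 bytes, then move 128-byte
 * blocks with integer loads and stores.
 */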
.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
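/*
 * VMX path: save LR, open a stack frame and call enter_vmx_usercopy to
 * enable Altivec.  If it returns 0 we unwind the frame and fall back
 * to the GPR copy above.
 */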
.Lvmx_copy:
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_usercopy
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
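	/*
	 * The TH field of each touch selects what it describes: 0b01000
	 * programs the stream's start address, 0b01010 programs its
	 * length, prefetch depth and control bits; the final dcbt with
	 * GO set starts all programmed streams.
	 */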
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF

1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

	/* setup read stream 0 */
	dcbt	0,r6,0b01000	/* addr from */
	dcbt	0,r7,0b01010	/* length and depth from */
	/* setup write stream 1 */
	dcbtst	0,r9,0b01000	/* addr to */
	dcbtst	0,r10,0b01010	/* length and depth to */
	eieio
	dcbt	0,r8,0b01010	/* all streams GO */

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6
	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)
	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
err4;	lvx	v6,r4,r9
err4;	lvx	v5,r4,r10
err4;	lvx	v4,r4,r11
err4;	lvx	v3,r4,r12
err4;	lvx	v2,r4,r14
err4;	lvx	v1,r4,r15
err4;	lvx	v0,r4,r16
	addi	r4,r4,128
err4;	stvx	v7,0,r3
err4;	stvx	v6,r3,r9
err4;	stvx	v5,r3,r10
err4;	stvx	v4,r3,r11
err4;	stvx	v3,r3,r12
err4;	stvx	v2,r3,r14
err4;	stvx	v1,r3,r15
err4;	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */
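
/*
 * Unaligned permute path: source and destination differ in their low
 * four address bits, so each aligned 16B store is assembled from two
 * adjacent source quadwords merged with VPERM, using the control
 * vector that LVS generates from the source address.
 */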
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6
	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)
	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
err3;	lvx	v0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
err4;	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
err4;	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
err4;	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
err4;	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
err4;	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
err4;	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
err4;	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
err4;	stvx	v8,0,r3
err4;	stvx	v9,r3,r9
err4;	stvx	v10,r3,r10
err4;	stvx	v11,r3,r11
err4;	stvx	v12,r3,r12
err4;	stvx	v13,r3,r14
err4;	stvx	v14,r3,r15
err4;	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */