copyuser_power7.S

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>
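
/*
 * On little-endian the shift vector is generated with lvsr and the vperm
 * inputs are swapped, so the same realignment sequence below works for
 * both endiannesses.
 */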
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif
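
/*
 * The errN macros tag the user access that follows with an exception
 * table entry, so a fault in that access branches to the matching
 * .Ldo_errN fixup below.
 */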
.macro err1
100:
	EX_TABLE(100b,.Ldo_err1)
.endm

.macro err2
200:
	EX_TABLE(200b,.Ldo_err2)
.endm

#ifdef CONFIG_ALTIVEC
.macro err3
300:
	EX_TABLE(300b,.Ldo_err3)
.endm

.macro err4
400:
	EX_TABLE(400b,.Ldo_err4)
.endm


.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	exit_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */
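
/*
 * Fixup cascade: each .Ldo_errN undoes the state set up by its copy loop
 * (non-volatile GPRs, VMX state, stack frame), then .Ldo_err1 reloads the
 * original dest/src/len arguments and redoes the copy with
 * __copy_tofrom_user_base, which returns the number of bytes not copied.
 */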
.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base
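
/*
 * __copy_tofrom_user_power7(to=r3, from=r4, len=r5)
 * Returns 0 on success; on a fault the fixups above rerun the copy via
 * __copy_tofrom_user_base, so the caller sees the number of uncopied bytes.
 */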
_GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy
#endif
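
/*
 * GPR copy path: align the source to 8 bytes, copy full 128B cachelines
 * with integer loads/stores, then mop up the remainder in 64/32/16 byte
 * steps and a final sub-16B tail.
 */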
.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)
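
	/*
	 * The low bits of the alignment count are now in CR7; each bf
	 * below skips the 1, 2 or 4 byte move when that bit is clear.
	 */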
	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f
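
	/*
	 * At least 128B remain: create a stack frame and save the
	 * non-volatile GPRs used as scratch in the cacheline loop.
	 */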
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
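/*
 * VMX copy path, used for copies larger than 4kB: save LR, take a stack
 * frame and call enter_vmx_usercopy to enable Altivec; if that fails
 * (returns 0), fall back to the GPR copy above.
 */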
.Lvmx_copy:
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_usercopy
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

.machine push
.machine "power4"
	/* setup read stream 0 */
	dcbt	r0,r6,0b01000	/* addr from */
	dcbt	r0,r7,0b01010	/* length and depth from */
	/* setup write stream 1 */
	dcbtst	r0,r9,0b01000	/* addr to */
	dcbtst	r0,r10,0b01010	/* length and depth to */
	eieio
	dcbt	r0,r8,0b01010	/* all streams GO */
.machine pop

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6
	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)
	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	v1,r0,r4
	addi	r4,r4,16
err3;	stvx	v1,r0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	v1,r0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,r0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,r0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,r0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,r0,r4
err4;	lvx	v6,r4,r9
err4;	lvx	v5,r4,r10
err4;	lvx	v4,r4,r11
err4;	lvx	v3,r4,r12
err4;	lvx	v2,r4,r14
err4;	lvx	v1,r4,r15
err4;	lvx	v0,r4,r16
	addi	r4,r4,128
err4;	stvx	v7,r0,r3
err4;	stvx	v6,r3,r9
err4;	stvx	v5,r3,r10
err4;	stvx	v4,r3,r11
err4;	stvx	v3,r3,r12
err4;	stvx	v2,r3,r14
err4;	stvx	v1,r3,r15
err4;	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,r0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,r0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,r0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,r0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,r0,r4
	addi	r4,r4,16
err3;	stvx	v1,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */
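
/*
 * Relatively misaligned copy: loads are done at 16B-aligned source
 * addresses and each stored quadword is assembled from two adjacent
 * loads with vperm, using the control vector set up by LVS.
 */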
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6
	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)
	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
err3;	lvx	v0,0,r4
	addi	r4,r4,16
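
	/*
	 * v0 carries the previously loaded aligned quadword between
	 * steps; each VPERM combines it with the next load to form one
	 * destination-aligned quadword.
	 */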
	bf	cr7*4+3,5f
err3;	lvx	v1,r0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,r0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

5:	bf	cr7*4+2,6f
err3;	lvx	v1,r0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,r0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,r0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,r0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,r0,r4
	VPERM(v8,v0,v7,v16)
err4;	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
err4;	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
err4;	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
err4;	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
err4;	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
err4;	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
err4;	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
err4;	stvx	v8,r0,r3
err4;	stvx	v9,r3,r9
err4;	stvx	v10,r3,r10
err4;	stvx	v11,r3,r11
err4;	stvx	v12,r3,r12
err4;	stvx	v13,r3,r14
err4;	stvx	v14,r3,r15
err4;	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,r0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,r0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,r0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,r0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,r0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */