copyuser_power7.S
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif
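/*
 * LVS builds the permute control vector for a misaligned source:
 * lvsl on big-endian, lvsr on little-endian.  On little-endian the
 * vperm inputs are also swapped, so the unaligned copy loops below
 * can use the same VPERM sequence on either endianness.
 */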

.macro err1
100:
	EX_TABLE(100b,.Ldo_err1)
.endm

.macro err2
200:
	EX_TABLE(200b,.Ldo_err2)
.endm
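
/*
 * Each errN macro marks the instruction on the same line with an
 * exception-table entry.  err1 faults happen before a stack frame is
 * created, err2 after the non-VMX loop has saved GPRs on the stack,
 * err3 (CONFIG_ALTIVEC) after VMX has been enabled, and err4 after the
 * VMX loop has additionally saved r14-r16.  The .Ldo_err* handlers
 * below unwind exactly that much state and then retry via
 * __copy_tofrom_user_base with the original arguments.
 */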

#ifdef CONFIG_ALTIVEC
.macro err3
300:
	EX_TABLE(300b,.Ldo_err3)
.endm

.macro err4
400:
	EX_TABLE(400b,.Ldo_err4)
.endm


.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	exit_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base
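
/*
 * __copy_tofrom_user_power7(to=r3, from=r4, n=r5)
 *
 * Returns 0 on success.  On a fault the handlers above restore the
 * original arguments and fall back to __copy_tofrom_user_base, which
 * produces the number of bytes left uncopied.  Copies of 16 bytes or
 * less take the short path; copies larger than 3328 bytes may use the
 * VMX path when Altivec is available.
 */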
_GLOBAL(__copy_tofrom_user_power7)
	cmpldi	r5,16
	cmpldi	cr1,r5,3328

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1,.Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)
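	/*
	 * r6 now holds the number of bytes (0-7) needed to 8B-align the
	 * source; its low bits were copied into cr7, so each bf below
	 * conditionally copies 1, 2 or 4 bytes.
	 */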
	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6
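	/*
	 * Bits 6:4 of the remaining length are now in cr7; each bf below
	 * conditionally copies a 64B, 32B or 16B block.
	 */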
6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_usercopy
	cmpwi	cr1,r3,0
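	/*
	 * enter_vmx_usercopy returns nonzero when it is safe to use VMX
	 * here.  The result is kept in cr1 and only tested once the
	 * prefetch streams below have been started; a zero return then
	 * sends us down the non-VMX path.
	 */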
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

	/* setup read stream 0 */
	dcbt	0,r6,0b01000	/* addr from */
	dcbt	0,r7,0b01010	/* length and depth from */
	/* setup write stream 1 */
	dcbtst	0,r9,0b01000	/* addr to */
	dcbtst	0,r10,0b01010	/* length and depth to */
	eieio
	dcbt	0,r8,0b01010	/* all streams GO */

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48
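	/*
	 * r9-r11 (and later r12, r14-r16) hold constant byte offsets used
	 * as the index register of lvx/stvx, which have no immediate
	 * displacement form.
	 */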
	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
err4;	lvx	v6,r4,r9
err4;	lvx	v5,r4,r10
err4;	lvx	v4,r4,r11
err4;	lvx	v3,r4,r12
err4;	lvx	v2,r4,r14
err4;	lvx	v1,r4,r15
err4;	lvx	v0,r4,r16
	addi	r4,r4,128
err4;	stvx	v7,0,r3
err4;	stvx	v6,r3,r9
err4;	stvx	v5,r3,r10
err4;	stvx	v4,r3,r11
err4;	stvx	v3,r3,r12
err4;	stvx	v2,r3,r14
err4;	stvx	v1,r3,r15
err4;	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
err3;	lvx	v0,0,r4
	addi	r4,r4,16
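	/*
	 * v0 is primed with the first aligned quadword; from here on each
	 * aligned lvx is combined with the previous one via VPERM to yield
	 * the next 16 misaligned source bytes.  The source pointer runs one
	 * quadword ahead as a result, which is undone by the
	 * "addi r4,r4,-16" before the final sub-16B tail copy.
	 */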
	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
err4;	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
err4;	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
err4;	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
err4;	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
err4;	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
err4;	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
err4;	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
err4;	stvx	v8,0,r3
err4;	stvx	v9,r3,r9
err4;	stvx	v10,r3,r10
err4;	stvx	v11,r3,r11
err4;	stvx	v12,r3,r12
err4;	stvx	v13,r3,r14
err4;	stvx	v14,r3,r15
err4;	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */