/*
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/asm-compat.h>
#include <asm/feature-fixups.h>

#ifndef SELFTEST_CASE
/* 0 == most CPUs, 1 == POWER6, 2 == Cell */
#define SELFTEST_CASE	0
#endif

#ifdef __BIG_ENDIAN__
#define sLd sld		/* Shift towards low-numbered address. */
#define sHd srd		/* Shift towards high-numbered address. */
#else
#define sLd srd		/* Shift towards low-numbered address. */
#define sHd sld		/* Shift towards high-numbered address. */
#endif
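
/*
 * Worked example (editor's illustration, not from the original source):
 * with the source 3 bytes past an 8-byte boundary, each aligned
 * destination doubleword is built from two consecutive aligned source
 * doublewords a and b as
 *	dest = sLd(a, 24) | sHd(b, 40)
 * i.e. discard a's first 3 bytes in address order and append b's first
 * 3 bytes; the macros above make the same formula work on either
 * endianness.
 */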

/*
 * These macros are used to generate exception table entries.
 * The exception handlers below use the original arguments
 * (stored on the stack) and the point where we're up to in
 * the destination buffer, i.e. the address of the first
 * unmodified byte.  Generally r3 points into the destination
 * buffer, but the first unmodified byte is at a variable
 * offset from r3.  In the code below, the symbol r3_offset
 * is set to indicate the current offset at each point in
 * the code.  This offset is then used as a negative offset
 * from the exception handler code, and those instructions
 * before the exception handlers are addi instructions that
 * adjust r3 to point to the correct place.
 */
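
/*
 * For example (editor's note): at a point where r3_offset = 16, lex
 * records .Lld_exc - 16 as the handler.  That address falls on the
 * "adjust by 16" instructions ahead of .Lld_exc (two addi r3,r3,8 plus
 * two nops: 16 bytes of code that add 16 to r3), and execution then
 * falls through into the common handler.
 */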
	.macro	lex		/* exception handler for load */
100:	EX_TABLE(100b, .Lld_exc - r3_offset)
	.endm

	.macro	stex		/* exception handler for store */
100:	EX_TABLE(100b, .Lst_exc - r3_offset)
	.endm
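
/*
 * Arguments: r3 = destination, r4 = source, r5 = number of bytes
 * (saved to the stack below so the exception handlers can recover
 * them).  Returns 0 on success, otherwise the number of bytes not
 * copied.
 */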
	.align	7
_GLOBAL_TOC(__copy_tofrom_user)
#ifdef CONFIG_PPC_BOOK3S_64
BEGIN_FTR_SECTION
	nop
FTR_SECTION_ELSE
	b	__copy_tofrom_user_power7
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
#endif
_GLOBAL(__copy_tofrom_user_base)
	/* first check for a 4kB copy on a 4kB boundary */
	cmpldi	cr1,r5,16
	cmpdi	cr6,r5,4096
	or	r0,r3,r4
	neg	r6,r3		/* LS 3 bits = # bytes to 8-byte dest bdry */
	andi.	r0,r0,4095
	std	r3,-24(r1)
	crand	cr0*4+2,cr0*4+2,cr6*4+2
	std	r4,-16(r1)
	std	r5,-8(r1)
	dcbt	0,r4
	beq	.Lcopy_page_4K
	andi.	r6,r6,7
	PPC_MTOCRF(0x01,r5)
	blt	cr1,.Lshort_copy

/*
 * Below we want to nop out the bne if we're on a CPU that has the
 * CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
 * cleared.
 * At the time of writing the only CPU that has this combination of bits
 * set is Power6.
 */
test_feature = (SELFTEST_CASE == 1)
BEGIN_FTR_SECTION
	nop
FTR_SECTION_ELSE
	bne	.Ldst_unaligned
ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
		    CPU_FTR_UNALIGNED_LD_STD)

.Ldst_aligned:
	addi	r3,r3,-16
r3_offset = 16
test_feature = (SELFTEST_CASE == 0)
BEGIN_FTR_SECTION
	andi.	r0,r4,7
	bne	.Lsrc_unaligned
END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
	blt	cr1,.Ldo_tail	/* if < 16 bytes to copy */
	srdi	r0,r5,5
	cmpdi	cr1,r0,0
lex;	ld	r7,0(r4)
lex;	ld	r6,8(r4)
	addi	r4,r4,16
	mtctr	r0
	andi.	r0,r5,0x10
	beq	22f
	addi	r3,r3,16
r3_offset = 0
	addi	r4,r4,-16
	mr	r9,r7
	mr	r8,r6
	beq	cr1,72f
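
/*
 * Main loop: 32 bytes per iteration, software-pipelined so that the
 * loads at 16(r4)/24(r4) run one pair of doublewords ahead of the
 * stores.
 */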
21:
lex;	ld	r7,16(r4)
lex;	ld	r6,24(r4)
	addi	r4,r4,32
stex;	std	r9,0(r3)
r3_offset = 8
stex;	std	r8,8(r3)
r3_offset = 16
22:
lex;	ld	r9,0(r4)
lex;	ld	r8,8(r4)
stex;	std	r7,16(r3)
r3_offset = 24
stex;	std	r6,24(r3)
	addi	r3,r3,32
r3_offset = 0
	bdnz	21b
72:
stex;	std	r9,0(r3)
r3_offset = 8
stex;	std	r8,8(r3)
r3_offset = 16
	andi.	r5,r5,0xf
	beq+	3f
	addi	r4,r4,16
.Ldo_tail:
	addi	r3,r3,16
r3_offset = 0
	bf	cr7*4+0,246f
lex;	ld	r9,0(r4)
	addi	r4,r4,8
stex;	std	r9,0(r3)
	addi	r3,r3,8
246:	bf	cr7*4+1,1f
lex;	lwz	r9,0(r4)
	addi	r4,r4,4
stex;	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
lex;	lhz	r9,0(r4)
	addi	r4,r4,2
stex;	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
lex;	lbz	r9,0(r4)
stex;	stb	r9,0(r3)
3:	li	r3,0
	blr
.Lsrc_unaligned:
r3_offset = 16
	srdi	r6,r5,3
	addi	r5,r5,-16
	subf	r4,r0,r4
	srdi	r7,r5,4
	sldi	r10,r0,3
	cmpldi	cr6,r6,3
	andi.	r5,r5,7
	mtctr	r7
	subfic	r11,r10,64
	add	r5,r5,r0
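	/*
	 * r0 = source misalignment in bytes; r10 = misalignment in bits
	 * (r0 * 8) and r11 = 64 - r10, used as shift counts by the
	 * sLd/sHd pairs below.
	 */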
	bt	cr7*4+0,28f

lex;	ld	r9,0(r4)	/* 3+2n loads, 2+2n stores */
lex;	ld	r0,8(r4)
	sLd	r6,r9,r10
lex;	ldu	r9,16(r4)
	sHd	r7,r0,r11
	sLd	r8,r0,r10
	or	r7,r7,r6
	blt	cr6,79f
lex;	ld	r0,8(r4)
	b	2f

28:
lex;	ld	r0,0(r4)	/* 4+2n loads, 3+2n stores */
lex;	ldu	r9,8(r4)
	sLd	r8,r0,r10
	addi	r3,r3,-8
r3_offset = 24
	blt	cr6,5f
lex;	ld	r0,8(r4)
	sHd	r12,r9,r11
	sLd	r6,r9,r10
lex;	ldu	r9,16(r4)
	or	r12,r8,r12
	sHd	r7,r0,r11
	sLd	r8,r0,r10
	addi	r3,r3,16
r3_offset = 8
	beq	cr6,78f

1:	or	r7,r7,r6
lex;	ld	r0,8(r4)
stex;	std	r12,8(r3)
r3_offset = 16
2:	sHd	r12,r9,r11
	sLd	r6,r9,r10
lex;	ldu	r9,16(r4)
	or	r12,r8,r12
stex;	stdu	r7,16(r3)
r3_offset = 8
	sHd	r7,r0,r11
	sLd	r8,r0,r10
	bdnz	1b

78:
stex;	std	r12,8(r3)
r3_offset = 16
	or	r7,r7,r6
79:
stex;	std	r7,16(r3)
r3_offset = 24
5:	sHd	r12,r9,r11
	or	r12,r8,r12
stex;	std	r12,24(r3)
r3_offset = 32
	bne	6f
	li	r3,0
	blr
6:	cmpwi	cr1,r5,8
	addi	r3,r3,32
r3_offset = 0
	sLd	r9,r9,r10
	ble	cr1,7f
lex;	ld	r0,8(r4)
	sHd	r7,r0,r11
	or	r9,r7,r9
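	/*
	 * 1-7 tail bytes remain in r9.  The rotates below move the next
	 * bytes to be stored to whichever end of the register stw/sth/stb
	 * take them from on this endianness.
	 */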
7:
	bf	cr7*4+1,1f
#ifdef __BIG_ENDIAN__
	rotldi	r9,r9,32
#endif
stex;	stw	r9,0(r3)
#ifdef __LITTLE_ENDIAN__
	rotrdi	r9,r9,32
#endif
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
#ifdef __BIG_ENDIAN__
	rotldi	r9,r9,16
#endif
stex;	sth	r9,0(r3)
#ifdef __LITTLE_ENDIAN__
	rotrdi	r9,r9,16
#endif
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
#ifdef __BIG_ENDIAN__
	rotldi	r9,r9,8
#endif
stex;	stb	r9,0(r3)
#ifdef __LITTLE_ENDIAN__
	rotrdi	r9,r9,8
#endif
3:	li	r3,0
	blr
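
/*
 * Destination is not 8-byte aligned: copy 1-7 bytes to reach an 8-byte
 * boundary, then rejoin the aligned path.  r7 tracks progress while r3
 * still points at the start, so faults here use the .Lld_exc_r7 and
 * .Lst_exc_r7 handlers, which add r7 to r3 first.
 */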
.Ldst_unaligned:
r3_offset = 0
	PPC_MTOCRF(0x01,r6)	/* put #bytes to 8B bdry into cr7 */
	subf	r5,r6,r5
	li	r7,0
	cmpldi	cr1,r5,16
	bf	cr7*4+3,1f
100:	EX_TABLE(100b, .Lld_exc_r7)
	lbz	r0,0(r4)
100:	EX_TABLE(100b, .Lst_exc_r7)
	stb	r0,0(r3)
	addi	r7,r7,1
1:	bf	cr7*4+2,2f
100:	EX_TABLE(100b, .Lld_exc_r7)
	lhzx	r0,r7,r4
100:	EX_TABLE(100b, .Lst_exc_r7)
	sthx	r0,r7,r3
	addi	r7,r7,2
2:	bf	cr7*4+1,3f
100:	EX_TABLE(100b, .Lld_exc_r7)
	lwzx	r0,r7,r4
100:	EX_TABLE(100b, .Lst_exc_r7)
	stwx	r0,r7,r3
3:	PPC_MTOCRF(0x01,r5)
	add	r4,r6,r4
	add	r3,r6,r3
	b	.Ldst_aligned

.Lshort_copy:
r3_offset = 0
	bf	cr7*4+0,1f
lex;	lwz	r0,0(r4)
lex;	lwz	r9,4(r4)
	addi	r4,r4,8
stex;	stw	r0,0(r3)
stex;	stw	r9,4(r3)
	addi	r3,r3,8
1:	bf	cr7*4+1,2f
lex;	lwz	r0,0(r4)
	addi	r4,r4,4
stex;	stw	r0,0(r3)
	addi	r3,r3,4
2:	bf	cr7*4+2,3f
lex;	lhz	r0,0(r4)
	addi	r4,r4,2
stex;	sth	r0,0(r3)
	addi	r3,r3,2
3:	bf	cr7*4+3,4f
lex;	lbz	r0,0(r4)
stex;	stb	r0,0(r3)
4:	li	r3,0
	blr

/*
 * exception handlers follow
 * we have to return the number of bytes not copied
 * for an exception on a load, we copy as many of the remaining bytes
 * as we can before returning (see .Lld_exc below); the destination is
 * not zeroed here
 * Note that the number of bytes of instructions for adjusting r3 needs
 * to equal the amount of the adjustment, due to the trick of using
 * .Lld_exc - r3_offset as the handler address.
 */
.Lld_exc_r7:
	add	r3,r3,r7
	b	.Lld_exc

	/* adjust by 24 */
	addi	r3,r3,8
	nop
	/* adjust by 16 */
	addi	r3,r3,8
	nop
	/* adjust by 8 */
	addi	r3,r3,8
	nop

/*
 * Here we have had a fault on a load and r3 points to the first
 * unmodified byte of the destination.  We use the original arguments
 * and r3 to work out how much wasn't copied.  Since we load some
 * distance ahead of the stores, we continue copying byte-by-byte until
 * we hit the load fault again in order to copy as much as possible.
 */
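/*
 * Roughly, in C (an editor's sketch for illustration only):
 *
 *	copied = r3 - orig_dst;		// bytes already stored
 *	dst  = orig_dst + copied;
 *	src  = orig_src + copied;
 *	left = orig_len - copied;
 *	while (left--)			// the load that faults again
 *		*dst++ = *src++;	// diverts us to .Ldone below
 *	return 0;			// .Ldone instead returns left
 */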
.Lld_exc:
	ld	r6,-24(r1)
	ld	r4,-16(r1)
	ld	r5,-8(r1)
	subf	r6,r6,r3
	add	r4,r4,r6
	subf	r5,r6,r5	/* #bytes left to go */

/*
 * first see if we can copy any more bytes before hitting another exception
 */
	mtctr	r5
r3_offset = 0
100:	EX_TABLE(100b, .Ldone)
43:	lbz	r0,0(r4)
	addi	r4,r4,1
stex;	stb	r0,0(r3)
	addi	r3,r3,1
	bdnz	43b
	li	r3,0		/* huh? all copied successfully this time? */
	blr

/*
 * here we have trapped again, amount remaining is in ctr.
 */
.Ldone:
	mfctr	r3
	blr

/*
 * exception handlers for stores: we need to work out how many bytes
 * weren't copied, and we may need to copy some more.
 * Note that the number of bytes of instructions for adjusting r3 needs
 * to equal the amount of the adjustment, due to the trick of using
 * .Lst_exc - r3_offset as the handler address.
 */
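
/*
 * For example (editor's note): a stex recorded with r3_offset = 16
 * enters 16 bytes before .Lst_exc and executes addi r3,r3,8; nop;
 * addi r3,r3,4; addi r3,r3,4 (16 bytes of code adding 16 to r3)
 * before reaching the common handler.
 */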
.Lst_exc_r7:
	add	r3,r3,r7
	b	.Lst_exc

	/* adjust by 24 */
	addi	r3,r3,8
	nop
	/* adjust by 16 */
	addi	r3,r3,8
	nop
	/* adjust by 8 */
	addi	r3,r3,4
	/* adjust by 4 */
	addi	r3,r3,4
.Lst_exc:
	ld	r6,-24(r1)	/* original destination pointer */
	ld	r4,-16(r1)	/* original source pointer */
	ld	r5,-8(r1)	/* original number of bytes */
	add	r7,r6,r5
/*
 * If the destination pointer isn't 8-byte aligned,
 * we may have got the exception as a result of a
 * store that overlapped a page boundary, so we may be
 * able to copy a few more bytes.
 */
17:	andi.	r0,r3,7
	beq	19f
	subf	r8,r6,r3	/* #bytes copied */
100:	EX_TABLE(100b,19f)
	lbzx	r0,r8,r4
100:	EX_TABLE(100b,19f)
	stb	r0,0(r3)
	addi	r3,r3,1
	cmpld	r3,r7
	blt	17b
19:	subf	r3,r3,r7	/* #bytes not copied in r3 */
	blr

/*
 * Routine to copy a whole page of data, optimized for POWER4.
 * On POWER4 it is more than 50% faster than the simple loop
 * above (following the .Ldst_aligned label).
 */
	.macro	exc
100:	EX_TABLE(100b, .Labort)
	.endm
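
/*
 * The copy loop below is software-pipelined and walks six streams
 * spaced 128 bytes apart (offsets 0, 128, ..., 640), which presumably
 * keeps several of POWER4's 128-byte cache lines in flight at once.
 */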
.Lcopy_page_4K:
	std	r31,-32(1)
	std	r30,-40(1)
	std	r29,-48(1)
	std	r28,-56(1)
	std	r27,-64(1)
	std	r26,-72(1)
	std	r25,-80(1)
	std	r24,-88(1)
	std	r23,-96(1)
	std	r22,-104(1)
	std	r21,-112(1)
	std	r20,-120(1)
	li	r5,4096/32 - 1
	addi	r3,r3,-8
	li	r0,5
0:	addi	r5,r5,-24
	mtctr	r0
exc;	ld	r22,640(4)
exc;	ld	r21,512(4)
exc;	ld	r20,384(4)
exc;	ld	r11,256(4)
exc;	ld	r9,128(4)
exc;	ld	r7,0(4)
exc;	ld	r25,648(4)
exc;	ld	r24,520(4)
exc;	ld	r23,392(4)
exc;	ld	r10,264(4)
exc;	ld	r8,136(4)
exc;	ldu	r6,8(4)
	cmpwi	r5,24
1:
exc;	std	r22,648(3)
exc;	std	r21,520(3)
exc;	std	r20,392(3)
exc;	std	r11,264(3)
exc;	std	r9,136(3)
exc;	std	r7,8(3)
exc;	ld	r28,648(4)
exc;	ld	r27,520(4)
exc;	ld	r26,392(4)
exc;	ld	r31,264(4)
exc;	ld	r30,136(4)
exc;	ld	r29,8(4)
exc;	std	r25,656(3)
exc;	std	r24,528(3)
exc;	std	r23,400(3)
exc;	std	r10,272(3)
exc;	std	r8,144(3)
exc;	std	r6,16(3)
exc;	ld	r22,656(4)
exc;	ld	r21,528(4)
exc;	ld	r20,400(4)
exc;	ld	r11,272(4)
exc;	ld	r9,144(4)
exc;	ld	r7,16(4)
exc;	std	r28,664(3)
exc;	std	r27,536(3)
exc;	std	r26,408(3)
exc;	std	r31,280(3)
exc;	std	r30,152(3)
exc;	stdu	r29,24(3)
exc;	ld	r25,664(4)
exc;	ld	r24,536(4)
exc;	ld	r23,408(4)
exc;	ld	r10,280(4)
exc;	ld	r8,152(4)
exc;	ldu	r6,24(4)
	bdnz	1b
exc;	std	r22,648(3)
exc;	std	r21,520(3)
exc;	std	r20,392(3)
exc;	std	r11,264(3)
exc;	std	r9,136(3)
exc;	std	r7,8(3)
	addi	r4,r4,640
	addi	r3,r3,648
	bge	0b
	mtctr	r5
exc;	ld	r7,0(4)
exc;	ld	r8,8(4)
exc;	ldu	r9,16(4)
3:
exc;	ld	r10,8(4)
exc;	std	r7,8(3)
exc;	ld	r7,16(4)
exc;	std	r8,16(3)
exc;	ld	r8,24(4)
exc;	std	r9,24(3)
exc;	ldu	r9,32(4)
exc;	stdu	r10,32(3)
	bdnz	3b
4:
exc;	ld	r10,8(4)
exc;	std	r7,8(3)
exc;	std	r8,16(3)
exc;	std	r9,24(3)
exc;	std	r10,32(3)
9:	ld	r20,-120(1)
	ld	r21,-112(1)
	ld	r22,-104(1)
	ld	r23,-96(1)
	ld	r24,-88(1)
	ld	r25,-80(1)
	ld	r26,-72(1)
	ld	r27,-64(1)
	ld	r28,-56(1)
	ld	r29,-48(1)
	ld	r30,-40(1)
	ld	r31,-32(1)
	li	r3,0
	blr

/*
 * on an exception, restore the original arguments and jump back into
 * the standard __copy_tofrom_user path (at .Ldst_aligned), whose
 * byte-exact exception handlers then work out how many bytes were
 * not copied
 */
.Labort:
	ld	r20,-120(1)
	ld	r21,-112(1)
	ld	r22,-104(1)
	ld	r23,-96(1)
	ld	r24,-88(1)
	ld	r25,-80(1)
	ld	r26,-72(1)
	ld	r27,-64(1)
	ld	r28,-56(1)
	ld	r29,-48(1)
	ld	r30,-40(1)
	ld	r31,-32(1)
	ld	r3,-24(r1)
	ld	r4,-16(r1)
	li	r5,4096
	b	.Ldst_aligned
EXPORT_SYMBOL(__copy_tofrom_user)