/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)
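/*
 * Roughly, in C (a sketch, not the kernel's code; src/dst here stand
 * for r4/r6, which point one word before the data so the update-form
 * lwzu/stwu can advance them):
 *
 *	u32 a = src[1], b = src[2], c = src[3], d = src[4];
 *	dst[1] = a; dst[2] = b; dst[3] = c; dst[4] = d;
 *	src += 4; dst += 4;
 *
 * All four loads issue before any store, giving the load latency
 * time to overlap with the rest of the line.
 */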
#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)
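/*
 * Identical to COPY_16_BYTES, except that every load and store
 * carries its own numeric label (8n0..8n7) so COPY_16_BYTES_EXCODE
 * below can attach an exception-table fixup to each instruction.
 */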
#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
	.section __ex_table,"a";		\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text
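/*
 * Fixup bodies for iteration n of the cacheline loop: the 16*n bytes
 * already copied by earlier COPY_16_BYTES_WITHEX expansions in this
 * line are subtracted from r5, then control branches to the read
 * (104f) or write (105f) fault handler.  The __ex_table entries pair
 * each faulting load with 9n0 and each faulting store with 9n1.
 */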
	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)
_GLOBAL(memset)
	rlwimi	r4,r4,8,16,23	/* spread fill byte through low halfword */
	rlwimi	r4,r4,16,0,15	/* and through the whole word */
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3		/* round pointer down to a word boundary */
	add	r5,r0,r5
	subf	r6,r0,r6
	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr
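/*
 * The fill-value setup above, sketched in C (illustrative only):
 *
 *	unsigned long w = c & 0xff;
 *	w |= w << 8;		// first rlwimi: byte -> halfword
 *	w |= w << 16;		// second rlwimi: halfword -> word
 *
 * The word loop then stores w with stwu, and the byte loop (label 8)
 * handles any tail.  Rounding the pointer down re-stores a few bytes
 * already covered by the first unaligned stwu, which is harmless
 * here since every byte receives the same fill value.
 */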
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b
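/*
 * Shape of the copy above, as a C sketch (assumes the regions do not
 * overlap; n is the byte count in r5):
 *
 *	while (n >= 8) { copy two words; n -= 8; }	// loop 1
 *	if (n >= 4)    { copy one word;  n -= 4; }	// label 2
 *	while (n--)    { copy one byte; }		// labels 3/4
 *
 * with a byte-at-a-time prologue (labels 5/6) that first brings the
 * destination up to word alignment.
 */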
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b
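/*
 * Copying from the top down makes memmove safe when the destination
 * overlaps the end of the source, e.g. memmove(buf + 1, buf, n):
 * a forward copy would overwrite source bytes before reading them,
 * which is why memmove branches here whenever dst > src.
 */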
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text
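/*
 * Each __ex_table entry above is a (faulting insn, fixup) address
 * pair: if a user access faults, the exception handler looks up the
 * faulting address and resumes execution at the fixup, e.g. a fault
 * at 70b (byte load) continues at 100f.
 */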
58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */
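/*
 * The prefetch-distance choice above, as a C sketch ('lines' is the
 * whole-cacheline count in r0):
 *
 *	int ahead;
 *	if (lines <= 1)
 *		ahead = 0;			// not worth prefetching
 *	else if (lines <= MAX_COPY_PREFETCH)
 *		ahead = 1;			// small transfer
 *	else
 *		ahead = MAX_COPY_PREFETCH;	// large transfer
 *
 * 'ahead' source cachelines are then touched with dcbt before the
 * copy loop starts.
 */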
114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b

	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr
/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * This code handles faults in the cacheline loop and branches to
 * either 104f (if in the read part) or 105f (if in the write part),
 * after updating r5.
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
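/*
 * For example, after a fault in the cacheline loop r3 holds
 * LG_CACHELINE_BYTES, so with 32-byte lines (r3 = 5) and two
 * uncopied lines counted in r0, the total below works out to
 * r5 + (2 << 5) = r5 + 64 bytes not copied.
 */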
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text