checksum_32.S

/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

	.text

/*
 * computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit)
 *
 * __csum_partial(buff, len, sum)
 */
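/*
 * For reference, the arithmetic below is modelled by this C sketch
 * (illustrative only: it assumes a big-endian target and ignores
 * alignment, and is not the kernel's generic implementation).  The
 * assembly sums 32-bit words with the hardware carry chain instead,
 * but because 1's complement addition is associative both approaches
 * fold to the same 16-bit checksum:
 *
 *	u32 csum_ref(const unsigned char *buff, int len, u32 sum)
 *	{
 *		u64 s = sum;
 *
 *		for (; len > 1; len -= 2, buff += 2)
 *			s += (buff[0] << 8) | buff[1];	// 16-bit words
 *		if (len)
 *			s += buff[0] << 8;	// trailing byte: upper half
 *		while (s >> 32)			// end-around carry
 *			s = (s & 0xffffffff) + (s >> 32);
 *		return (u32)s;
 *	}
 */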
_GLOBAL(__csum_partial)
	subi	r3,r3,4
	srawi.	r6,r4,2		/* Divide len by 4 and also clear carry */
	beq	3f		/* if we're doing < 4 bytes */
	andi.	r0,r3,2		/* Align buffer to longword boundary */
	beq+	1f
	lhz	r0,4(r3)	/* do 2 bytes to get aligned */
	subi	r4,r4,2
	addi	r3,r3,2
	srwi.	r6,r4,2		/* # words to do */
	adde	r5,r5,r0
	beq	3f
1:	andi.	r6,r6,3		/* Prepare to handle words 4 by 4 */
	beq	21f
	mtctr	r6
2:	lwzu	r0,4(r3)
	adde	r5,r5,r0
	bdnz	2b
21:	srwi.	r6,r4,4		/* # blocks of 4 words to do */
	beq	3f
	mtctr	r6
22:	lwz	r0,4(r3)
	lwz	r6,8(r3)
	lwz	r7,12(r3)
	lwzu	r8,16(r3)
	adde	r5,r5,r0
	adde	r5,r5,r6
	adde	r5,r5,r7
	adde	r5,r5,r8
	bdnz	22b
3:	andi.	r0,r4,2
	beq+	4f
	lhz	r0,4(r3)
	addi	r3,r3,2
	adde	r5,r5,r0
4:	andi.	r0,r4,1
	beq+	5f
	lbz	r0,4(r3)
	slwi	r0,r0,8		/* Upper byte of word */
	adde	r5,r5,r0
5:	addze	r3,r5		/* add in final carry */
	blr
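/*
 * The 32-bit value returned above is only a partial sum; callers
 * reduce it to the final 16-bit Internet checksum with a fold step,
 * roughly (a sketch of the usual csum_fold logic, not code from this
 * file):
 *
 *	u16 fold(u32 sum)
 *	{
 *		sum = (sum >> 16) + (sum & 0xffff);	// add halves
 *		sum += sum >> 16;			// fold new carry
 *		return ~sum;				// 1's complement
 *	}
 */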
/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively, and (for an error on
 * src) zeroes the rest of dst.
 *
 * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err)
 */
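/*
 * Fault handling below relies on the kernel's exception-table
 * mechanism: each load from src and store to dst is tagged with a
 * numeric label, and a matching (faulting address, fixup address)
 * pair is emitted into the __ex_table section.  When one of those
 * instructions takes an access exception, the trap handler looks the
 * instruction address up in the table and resumes execution at
 * src_error or dst_error instead of treating the fault as fatal.
 */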
#define CSUM_COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
	adde	r12,r12,r7;	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
	adde	r12,r12,r8;	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
	adde	r12,r12,r9;	\
8 ## n ## 7:			\
	stwu	r10,16(r6);	\
	adde	r12,r12,r10

#define CSUM_COPY_16_BYTES_EXCODE(n)		\
	.section __ex_table,"a";		\
	.align	2;				\
	.long	8 ## n ## 0b,src_error;		\
	.long	8 ## n ## 1b,src_error;		\
	.long	8 ## n ## 2b,src_error;		\
	.long	8 ## n ## 3b,src_error;		\
	.long	8 ## n ## 4b,dst_error;		\
	.long	8 ## n ## 5b,dst_error;		\
	.long	8 ## n ## 6b,dst_error;		\
	.long	8 ## n ## 7b,dst_error;		\
	.text
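/*
 * The token pasting keeps the copy code and its fixups in step: for
 * n = 1, CSUM_COPY_16_BYTES_WITHEX emits labels 810: through 817: on
 * the four loads and four stores of one 16-byte chunk, and
 * CSUM_COPY_16_BYTES_EXCODE(1) emits the eight matching __ex_table
 * entries (810b-813b -> src_error, 814b-817b -> dst_error).
 */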
	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"checksum_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

_GLOBAL(csum_partial_copy_generic)
	stwu	r1,-16(r1)
	stw	r7,12(r1)	/* save src_err pointer */
	stw	r8,8(r1)	/* save dst_err pointer */

	rlwinm	r0,r4,3,0x8	/* r0 = 8 if dst is odd, else 0 */
	rlwnm	r6,r6,r0,0,31	/* odd destination address: rotate one byte */
	cmplwi	cr7,r0,0	/* is destination address even ? */
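/*
 * Why rotating works: the 16-bit 1's complement sum is invariant
 * under swapping the bytes of every 16-bit word, so a copy to an odd
 * destination can be checksummed as if it were even, provided the
 * incoming sum is byte-rotated here and the result is rotated back
 * before the final blr.  Illustrative numbers: 0x1234 + 0x5678 =
 * 0x68ac, while the byte-swapped words give 0x3412 + 0x7856 = 0xac68,
 * which is exactly 0x68ac byte-swapped.
 */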
	addic	r12,r6,0	/* copy sum to r12 and clear carry */
	addi	r6,r4,-4
	neg	r0,r4
	addi	r4,r3,-4
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
	li	r3,0
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	slwi	r3,r3,8
	rlwimi	r3,r9,0,24,31
71:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	70b
	adde	r12,r12,r3
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	adde	r12,r12,r9
73:	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */
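/*
 * In the cacheline loop, dcbt touches the source a few lines ahead so
 * it streams into the data cache, while dcbz establishes the next
 * destination line in the cache as zeroes without first reading its
 * old contents from memory; every byte of that line is then
 * overwritten by the stores in CSUM_COPY_16_BYTES_WITHEX.
 */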
114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
/* the main body of the cacheline loop */
	CSUM_COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_WITHEX(2)
	CSUM_COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_WITHEX(4)
	CSUM_COPY_16_BYTES_WITHEX(5)
	CSUM_COPY_16_BYTES_WITHEX(6)
	CSUM_COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	adde	r12,r12,r0
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,2
	beq+	65f
40:	lhz	r0,4(r4)
	addi	r4,r4,2
41:	sth	r0,4(r6)
	adde	r12,r12,r0
	addi	r6,r6,2
65:	andi.	r0,r5,1
	beq+	66f
50:	lbz	r0,4(r4)
51:	stb	r0,4(r6)
	slwi	r0,r0,8
	adde	r12,r12,r0
66:	addze	r3,r12
	addi	r1,r1,16
	beqlr+	cr7
	rlwinm	r3,r3,8,0,31	/* odd destination address: rotate one byte */
	blr
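/*
 * The error paths below reload the src_err/dst_err pointers that the
 * prologue stashed at 12(r1) and 8(r1); a NULL pointer means the
 * caller did not ask to be notified of faults on that side.
 */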
/* read fault */
src_error:
	lwz	r7,12(r1)
	addi	r1,r1,16
	cmpwi	cr0,r7,0
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r7)
	blr
/* write fault */
dst_error:
	lwz	r8,8(r1)
	addi	r1,r1,16
	cmpwi	cr0,r8,0
	beqlr
	li	r0,-EFAULT
	stw	r0,0(r8)
	blr

	.section __ex_table,"a"
	.align	2
	.long	70b,src_error
	.long	71b,dst_error
	.long	72b,src_error
	.long	73b,dst_error
	.long	54b,dst_error
	.text

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * src_error (if in read part) or dst_error (if in write part)
 */
	CSUM_COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_EXCODE(2)
	CSUM_COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_EXCODE(4)
	CSUM_COPY_16_BYTES_EXCODE(5)
	CSUM_COPY_16_BYTES_EXCODE(6)
	CSUM_COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

	.section __ex_table,"a"
	.align	2
	.long	30b,src_error
	.long	31b,dst_error
	.long	40b,src_error
	.long	41b,dst_error
	.long	50b,src_error
	.long	51b,dst_error