/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Fill in the buffer with character c (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - buf
 *	x1 - c
 *	x2 - n
 * Returns:
 *	x0 - buf
 */

dstin		.req	x0
val		.req	w1
count		.req	x2
tmp1		.req	x3
tmp1w		.req	w3
tmp2		.req	x4
tmp2w		.req	w4
zva_len_x	.req	x5
zva_len		.req	w5
zva_bits_x	.req	x6
A_l		.req	x7
A_lw		.req	w7
dst		.req	x8
tmp3w		.req	w9
tmp3		.req	x9
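
/*
 * Note: dst is the working write pointer; dstin is never written after the
 * initial mov, so the original buffer address can be returned unchanged in x0.
 */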
ENTRY(memset)
	mov	dst, dstin	/* Preserve return value. */
	and	A_lw, val, #255
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32
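	/*
	 * Note: the three orr instructions above broadcast the low byte of c
	 * across all 64 bits of A_l. A_lw and A_l alias w7/x7, and the 32-bit
	 * writes zero the upper half, so e.g. c = 0xAB yields
	 * A_l = 0xABABABABABABABAB.
	 */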
	cmp	count, #15
	b.hi	.Lover16_proc
	/* count is at most 15 here; all stores may be unaligned. */
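	/*
	 * Note: each set bit of count (bit 3 down to bit 0) selects the
	 * corresponding 8-, 4-, 2- or 1-byte store below, so the remaining
	 * bytes are written exactly once without a loop.
	 */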
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 2f
	str	A_lw, [dst], #4
2:
	tbz	count, #1, 3f
	strh	A_lw, [dst], #2
3:
	tbz	count, #0, 4f
	strb	A_lw, [dst]
4:
	ret

.Lover16_proc:
	/* Check whether the start address is 16-byte aligned. */
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	.Laligned
	/*
	 * count is at least 16, so we can use an stp to store the first 16
	 * bytes unaligned, then advance dst to the next 16-byte boundary.
	 */
	stp	A_l, A_l, [dst]	/* unaligned store */
	/* make dst 16-byte aligned */
	sub	count, count, tmp2
	add	dst, dst, tmp2
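	/*
	 * Note: tmp2 = 16 - (dst & 15) bytes have now been filled by the
	 * unaligned stp above; e.g. for dst ending in 0x3, tmp2 = 13 and the
	 * last 3 bytes of that store are simply rewritten by the first aligned
	 * store, which is harmless for memset.
	 */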
.Laligned:
	cbz	A_l, .Lzero_mem

.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short

.Ltail63:
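	/*
	 * Note: fewer than 64 bytes remain here (count may be negative when
	 * entered from .Lnot_short, but its low six bits still encode the
	 * remaining length). Bits 5:4 select how many 16-byte stp stores to
	 * execute via the fall-through chain below (three, two, one or none);
	 * label 3: then handles the last 0-15 bytes.
	 */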
	ands	tmp1, count, #0x30
	b.eq	3f
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst], #16
1:
	stp	A_l, A_l, [dst], #16
2:
	stp	A_l, A_l, [dst], #16
	/*
	 * The remaining length is less than 16; use one more stp to write the
	 * last 16 bytes. This rewrites some bytes and the access may be
	 * unaligned.
	 */
3:
	ands	count, count, #15
	cbz	count, 4f
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
4:
	ret

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lnot_short:
	sub	dst, dst, #16	/* Pre-bias. */
	sub	count, count, #64
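	/*
	 * Note: dst is pre-biased by -16 so the last store of each iteration
	 * can use the writeback form [dst, #64]!, advancing dst by 64 with a
	 * single update; count is pre-decremented so the subs/b.ge pair tests
	 * whether another full 64-byte block remains.
	 */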
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	.Ltail63
.Lexitfunc:
	ret

	/*
	 * For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.
	 */
.Lzero_mem:
	cmp	count, #63
	b.le	.Ltail63
	/*
	 * For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.
	 */
	cmp	count, #128
	b.lt	.Lnot_short	/* from here on, count is at least 128 bytes */
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved. */
	lsl	zva_len, tmp3w, zva_len
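	/*
	 * Note: DCZID_EL0.DZP (bit 4, tested above) set means DC ZVA is
	 * prohibited. DCZID_EL0.BS (bits 3:0) is log2 of the ZVA block size in
	 * words, so 4 << BS gives the size in bytes; e.g. BS = 4 yields the
	 * common 64-byte block.
	 */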
	ands	tmp3w, zva_len, #63
	/*
	 * Ensure zva_len is not less than 64; using ZVA is not worthwhile if
	 * the block size is smaller than 64 bytes.
	 */
	b.ne	.Lnot_short
.Lzero_by_line:
	/*
	 * Compute how far we need to go to become suitably aligned. We're
	 * already at quad-word alignment.
	 */
	cmp	count, zva_len_x
	b.lt	.Lnot_short	/* Not enough to reach alignment. */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	2f	/* Already aligned. */
	/* Not aligned; check that there's enough to zero after alignment. */
	sub	tmp1, count, tmp2
	/*
	 * Guarantee that the length remaining after alignment is at least 64
	 * bytes and at least one ZVA block, so the code after 2f does not
	 * write past the end of the buffer.
	 */
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
	b.lt	.Lnot_short
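	/*
	 * Note: if the first comparison above was ge (tmp1 >= 64), ccmp
	 * compares tmp1 with zva_len_x; otherwise it sets NZCV to 0b1000,
	 * which also makes the b.lt taken. Net effect: fall through only when
	 * tmp1 >= 64 and tmp1 >= zva_len_x.
	 */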
	/*
	 * We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.
	 */
	mov	count, tmp1
1:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	1b
	/* We've overrun a bit, so adjust dst downwards. */
	add	dst, dst, tmp2
2:
	sub	count, count, zva_len_x
3:
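	/*
	 * Note: DC ZVA zeroes one whole zva_len-byte block per instruction
	 * without first reading the line in. count was pre-decremented by one
	 * block at 2:, so the subs/b.ge below tests whether another full
	 * block remains.
	 */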
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
ENDPROC(memset)