#ifndef _ASM_X86_XOR_H
#define _ASM_X86_XOR_H

/*
 * Optimized RAID-5 checksumming functions for SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyway.
 */

#include <asm/fpu/api.h>

#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif
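
/*
 * Building blocks for the unrolled loops below: OFFS() and PF_OFFS()
 * form the byte offset of the x'th 16-byte chunk (PF_OFFS() reaches one
 * 256-byte block ahead), LD/ST move 16 bytes between buffer 1 and
 * %xmm<y>, XOn XORs 16 bytes of buffer n+1 into %xmm<y>, and PFn issues
 * a non-temporal prefetch into buffer n+1 so the streamed data causes
 * minimal cache pollution.
 */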
#define OFFS(x)     "16*("#x")"
#define PF_OFFS(x)  "256+16*("#x")"
#define PF0(x)      " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
#define LD(x, y)    " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
#define ST(x, y)    " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x)      " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x)      " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x)      " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x)      " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
#define XO1(x, y)   " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x, y)   " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y)   " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y)   " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
#define NOP(x)
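
/*
 * BLK64() covers one 64-byte chunk: a single prefetch followed by the
 * same operation applied to four consecutive 16-byte pieces held in
 * %xmm0-%xmm3. Passing NOP as the prefetch op suppresses it (used for
 * the store pass, which writes back data that is already resident).
 */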
#define BLK64(pf, op, i) \
        pf(i) \
        op(i, 0) \
        op(i + 1, 1) \
        op(i + 2, 2) \
        op(i + 3, 3)
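
/*
 * xor_sse_2() computes p1 ^= p2 over 'bytes' bytes. Each loop iteration
 * streams one 256-byte block through %xmm0-%xmm3 while prefetching ahead
 * in both buffers; 'bytes' is assumed to be a multiple of 256, as the
 * low bits are dropped by the 'bytes >> 8' line count.
 */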
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                LD(i, 0) \
                LD(i + 1, 1) \
                PF1(i) \
                PF1(i + 2) \
                LD(i + 2, 2) \
                LD(i + 3, 3) \
                PF0(i + 4) \
                PF0(i + 6) \
                XO1(i, 0) \
                XO1(i + 1, 1) \
                XO1(i + 2, 2) \
                XO1(i + 3, 3) \
                ST(i, 0) \
                ST(i + 1, 1) \
                ST(i + 2, 2) \
                ST(i + 3, 3)

                PF0(0)
                PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"
                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)
        " add %[inc], %[p1] ;\n"
        " add %[inc], %[p2] ;\n"
        " dec %[cnt] ;\n"
        " jnz 1b ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
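
/*
 * The _pf64 variants perform the same XOR as their plain counterparts
 * but issue one prefetch per 64-byte chunk via BLK64() instead of
 * batching prefetches at the top of each 256-byte block. Which schedule
 * wins is machine dependent; the boot-time xor benchmark chooses between
 * the corresponding templates.
 */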
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                BLK64(PF0, LD, i) \
                BLK64(PF1, XO1, i) \
                BLK64(NOP, ST, i)

        " .align 32 ;\n"
        " 1: ;\n"
                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)
        " add %[inc], %[p1] ;\n"
        " add %[inc], %[p2] ;\n"
        " dec %[cnt] ;\n"
        " jnz 1b ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
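
/* p1 ^= p2 ^ p3, otherwise structured like xor_sse_2(). */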
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i) \
                PF1(i + 2) \
                LD(i, 0) \
                LD(i + 1, 1) \
                LD(i + 2, 2) \
                LD(i + 3, 3) \
                PF2(i) \
                PF2(i + 2) \
                PF0(i + 4) \
                PF0(i + 6) \
                XO1(i, 0) \
                XO1(i + 1, 1) \
                XO1(i + 2, 2) \
                XO1(i + 3, 3) \
                XO2(i, 0) \
                XO2(i + 1, 1) \
                XO2(i + 2, 2) \
                XO2(i + 3, 3) \
                ST(i, 0) \
                ST(i + 1, 1) \
                ST(i + 2, 2) \
                ST(i + 3, 3)

                PF0(0)
                PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"
                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)
        " add %[inc], %[p1] ;\n"
        " add %[inc], %[p2] ;\n"
        " add %[inc], %[p3] ;\n"
        " dec %[cnt] ;\n"
        " jnz 1b ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
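
/* Three-buffer XOR with per-64-byte-chunk prefetching. */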
static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
               unsigned long *p3)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                BLK64(PF0, LD, i) \
                BLK64(PF1, XO1, i) \
                BLK64(PF2, XO2, i) \
                BLK64(NOP, ST, i)

        " .align 32 ;\n"
        " 1: ;\n"
                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)
        " add %[inc], %[p1] ;\n"
        " add %[inc], %[p2] ;\n"
        " add %[inc], %[p3] ;\n"
        " dec %[cnt] ;\n"
        " jnz 1b ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
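
/* p1 ^= p2 ^ p3 ^ p4 */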
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i) \
                PF1(i + 2) \
                LD(i, 0) \
                LD(i + 1, 1) \
                LD(i + 2, 2) \
                LD(i + 3, 3) \
                PF2(i) \
                PF2(i + 2) \
                XO1(i, 0) \
                XO1(i + 1, 1) \
                XO1(i + 2, 2) \
                XO1(i + 3, 3) \
                PF3(i) \
                PF3(i + 2) \
                PF0(i + 4) \
                PF0(i + 6) \
                XO2(i, 0) \
                XO2(i + 1, 1) \
                XO2(i + 2, 2) \
                XO2(i + 3, 3) \
                XO3(i, 0) \
                XO3(i + 1, 1) \
                XO3(i + 2, 2) \
                XO3(i + 3, 3) \
                ST(i, 0) \
                ST(i + 1, 1) \
                ST(i + 2, 2) \
                ST(i + 3, 3)

                PF0(0)
                PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"
                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)
        " add %[inc], %[p1] ;\n"
        " add %[inc], %[p2] ;\n"
        " add %[inc], %[p3] ;\n"
        " add %[inc], %[p4] ;\n"
        " dec %[cnt] ;\n"
        " jnz 1b ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1),
          [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
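
/* Four-buffer XOR with per-64-byte-chunk prefetching. */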
static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
               unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                BLK64(PF0, LD, i) \
                BLK64(PF1, XO1, i) \
                BLK64(PF2, XO2, i) \
                BLK64(PF3, XO3, i) \
                BLK64(NOP, ST, i)

        " .align 32 ;\n"
        " 1: ;\n"
                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)
        " add %[inc], %[p1] ;\n"
        " add %[inc], %[p2] ;\n"
        " add %[inc], %[p3] ;\n"
        " add %[inc], %[p4] ;\n"
        " dec %[cnt] ;\n"
        " jnz 1b ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1),
          [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
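
/* p1 ^= p2 ^ p3 ^ p4 ^ p5 */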
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i) \
                PF1(i + 2) \
                LD(i, 0) \
                LD(i + 1, 1) \
                LD(i + 2, 2) \
                LD(i + 3, 3) \
                PF2(i) \
                PF2(i + 2) \
                XO1(i, 0) \
                XO1(i + 1, 1) \
                XO1(i + 2, 2) \
                XO1(i + 3, 3) \
                PF3(i) \
                PF3(i + 2) \
                XO2(i, 0) \
                XO2(i + 1, 1) \
                XO2(i + 2, 2) \
                XO2(i + 3, 3) \
                PF4(i) \
                PF4(i + 2) \
                PF0(i + 4) \
                PF0(i + 6) \
                XO3(i, 0) \
                XO3(i + 1, 1) \
                XO3(i + 2, 2) \
                XO3(i + 3, 3) \
                XO4(i, 0) \
                XO4(i + 1, 1) \
                XO4(i + 2, 2) \
                XO4(i + 3, 3) \
                ST(i, 0) \
                ST(i + 1, 1) \
                ST(i + 2, 2) \
                ST(i + 3, 3)

                PF0(0)
                PF0(2)

        " .align 32 ;\n"
        " 1: ;\n"
                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)
        " add %[inc], %[p1] ;\n"
        " add %[inc], %[p2] ;\n"
        " add %[inc], %[p3] ;\n"
        " add %[inc], %[p4] ;\n"
        " add %[inc], %[p5] ;\n"
        " dec %[cnt] ;\n"
        " jnz 1b ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
          [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
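
/* Five-buffer XOR with per-64-byte-chunk prefetching. */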
static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
               unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                BLK64(PF0, LD, i) \
                BLK64(PF1, XO1, i) \
                BLK64(PF2, XO2, i) \
                BLK64(PF3, XO3, i) \
                BLK64(PF4, XO4, i) \
                BLK64(NOP, ST, i)

        " .align 32 ;\n"
        " 1: ;\n"
                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)
        " add %[inc], %[p1] ;\n"
        " add %[inc], %[p2] ;\n"
        " add %[inc], %[p3] ;\n"
        " add %[inc], %[p4] ;\n"
        " add %[inc], %[p5] ;\n"
        " dec %[cnt] ;\n"
        " jnz 1b ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
          [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
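
/*
 * Template handing the prefetch64 routines to the generic xor code,
 * which benchmarks the registered templates and picks the fastest.
 */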
static struct xor_block_template xor_block_sse_pf64 = {
        .name = "prefetch64-sse",
        .do_2 = xor_sse_2_pf64,
        .do_3 = xor_sse_3_pf64,
        .do_4 = xor_sse_4_pf64,
        .do_5 = xor_sse_5_pf64,
};

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif
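
/*
 * AVX_SELECT() comes from the headers included above; it is expected to
 * return the AVX template when the CPU supports AVX and to fall back to
 * FASTEST otherwise.
 */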
#define XOR_SELECT_TEMPLATE(FASTEST) \
        AVX_SELECT(FASTEST)

#endif /* _ASM_X86_XOR_H */