/* memcmp_64.S */

/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

#define off8    r6
#define off16   r7
#define off24   r8

#define rA      r9
#define rB      r10
#define rC      r11
#define rD      r27
#define rE      r28
#define rF      r29
#define rG      r30
#define rH      r31

#ifdef __LITTLE_ENDIAN__
#define LH      lhbrx
#define LW      lwbrx
#define LD      ldbrx
#define LVS     lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
        vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH      lhzx
#define LW      lwzx
#define LD      ldx
#define LVS     lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
        vperm _VRT,_VRA,_VRB,_VRC
#endif
#define VMX_THRESH 4096

#define ENTER_VMX_OPS \
        mflr    r0; \
        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
        std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
        std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
        std     r0,16(r1); \
        stdu    r1,-STACKFRAMESIZE(r1); \
        bl      enter_vmx_ops; \
        cmpwi   cr1,r3,0; \
        ld      r0,STACKFRAMESIZE+16(r1); \
        ld      r3,STK_REG(R31)(r1); \
        ld      r4,STK_REG(R30)(r1); \
        ld      r5,STK_REG(R29)(r1); \
        addi    r1,r1,STACKFRAMESIZE; \
        mtlr    r0

#define EXIT_VMX_OPS \
        mflr    r0; \
        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
        std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
        std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
        std     r0,16(r1); \
        stdu    r1,-STACKFRAMESIZE(r1); \
        bl      exit_vmx_ops; \
        ld      r0,STACKFRAMESIZE+16(r1); \
        ld      r3,STK_REG(R31)(r1); \
        ld      r4,STK_REG(R30)(r1); \
        ld      r5,STK_REG(R29)(r1); \
        addi    r1,r1,STACKFRAMESIZE; \
        mtlr    r0
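
/*
 * Both helpers above create a temporary stack frame, save r3/r4/r5 and the
 * link register, call enter_vmx_ops/exit_vmx_ops, then restore everything.
 * ENTER_VMX_OPS additionally compares the return value of enter_vmx_ops with
 * zero in cr1, so callers can "beq cr1,<label>" to a scalar fallback when the
 * VMX unit cannot be used.
 */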

/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned to
 * a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 *    | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *    ^                                  ^                                 ^
 * 0xbbbb10                          0xbbbb20                           0xbbbb30
 *                                ^
 *                             _vaddr
 *
 *
 * _vmask is the mask generated by LVS.
 * _v1st_qw is the 1st aligned QW of the current addr, already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of the current _vaddr, to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
        lvx     _v2nd_qw,_vaddr,off16; \
        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
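
/*
 * In C terms the macro above produces, roughly, the 16 unaligned bytes
 * starting at _vaddr (a sketch, not a literal equivalent):
 *
 *      memcpy(&_v_res, (const void *)_vaddr, 16);
 *
 * assembled from two aligned lvx loads plus a vperm, so no unaligned vector
 * load is ever issued.
 */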

/*
 * There are 2 categories for memcmp:
 * 1) src/dst have the same offset from an 8-byte boundary. The handlers
 *    are named .Lsameoffset_xxxx
 * 2) src/dst have different offsets from an 8-byte boundary. The handlers
 *    are named .Ldiffoffset_xxxx
 */
_GLOBAL_TOC(memcmp)
        cmpdi   cr1,r5,0

        /* Use the short loop if the src/dst addresses do not have the same
         * offset from an 8-byte alignment boundary.
         */
        xor     r6,r3,r4
        andi.   r6,r6,7

        /* Fall back to the short loop if comparing at aligned addresses
         * with fewer than 8 bytes.
         */
        cmpdi   cr6,r5,7

        beq     cr1,.Lzero
        bgt     cr6,.Lno_short

.Lshort:
        mtctr   r5
1:      lbz     rA,0(r3)
        lbz     rB,0(r4)
        subf.   rC,rB,rA
        bne     .Lnon_zero
        bdz     .Lzero

        lbz     rA,1(r3)
        lbz     rB,1(r4)
        subf.   rC,rB,rA
        bne     .Lnon_zero
        bdz     .Lzero

        lbz     rA,2(r3)
        lbz     rB,2(r4)
        subf.   rC,rB,rA
        bne     .Lnon_zero
        bdz     .Lzero

        lbz     rA,3(r3)
        lbz     rB,3(r4)
        subf.   rC,rB,rA
        bne     .Lnon_zero

        addi    r3,r3,4
        addi    r4,r4,4

        bdnz    1b

.Lzero:
        li      r3,0
        blr
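
/*
 * The short path above is, roughly, the plain C byte loop below (a sketch,
 * not a drop-in replacement), unrolled four times to amortise the loop
 * overhead:
 *
 *      while (n--) {
 *              int d = *s1++ - *s2++;  // s1/s2 are unsigned char pointers
 *              if (d)
 *                      return d;
 *      }
 *      return 0;
 */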

.Lno_short:
        dcbt    0,r3
        dcbt    0,r4
        bne     .Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
        /* attempt to compare the bytes not aligned to 8 bytes so that the
         * rest of the comparison can run on an 8-byte alignment.
         */
        andi.   r6,r3,7

        /* Try to compare the first double word which is not 8 bytes aligned:
         * load the first double word at (src & ~7UL) and shift left the
         * appropriate number of bits before the comparison.
         */
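        /* A worked example of the step below (big-endian view; the
         * byte-reversing LD makes little endian behave the same): if
         * src = 0x1005, then r6 = (src & 7) * 8 = 40. rA/rB are loaded from
         * the rounded-down 8-byte-aligned addresses, and the sld by 40 bits
         * discards the five bytes that precede the buffers, so only the
         * remaining three bytes take part in the compare. On a match,
         * 8 - (src & 7) = 3 bytes have been consumed and the pointers move
         * on to the next aligned doubleword.
         */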
        rlwinm  r6,r3,3,26,28
        beq     .Lsameoffset_8bytes_aligned
        clrrdi  r3,r3,3
        clrrdi  r4,r4,3
        LD      rA,0,r3
        LD      rB,0,r4
        sld     rA,rA,r6
        sld     rB,rB,r6
        cmpld   cr0,rA,rB
        srwi    r6,r6,3
        bne     cr0,.LcmpAB_lightweight
        subfic  r6,r6,8
        subf.   r5,r6,r5
        addi    r3,r3,8
        addi    r4,r4,8
        beq     .Lzero

.Lsameoffset_8bytes_aligned:
        /* now we are aligned to 8 bytes.
         * Use the .Llong loop if 32 or more bytes remain to be compared.
         */
        cmpdi   cr6,r5,31
        bgt     cr6,.Llong

.Lcmp_lt32bytes:
        /* compare 1 ~ 31 bytes; at least the r3 addr is 8-byte aligned now */
        cmpdi   cr5,r5,7
        srdi    r0,r5,3
        ble     cr5,.Lcmp_rest_lt8bytes

        /* handle 8 ~ 31 bytes */
        clrldi  r5,r5,61
        mtctr   r0
2:
        LD      rA,0,r3
        LD      rB,0,r4
        cmpld   cr0,rA,rB
        addi    r3,r3,8
        addi    r4,r4,8
        bne     cr0,.LcmpAB_lightweight
        bdnz    2b

        cmpwi   r5,0
        beq     .Lzero
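
        /* A sketch of the tail handling below: r5 (at most 7) bytes remain,
         * so r6 = (8 - r5) * 8 bits. A full doubleword is loaded from each
         * buffer and shifted right by r6 bits, which discards the bytes
         * beyond the requested length; thanks to the byte-reversing LD on
         * little endian, only the first r5 bytes in memory order take part
         * in the compare.
         */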
.Lcmp_rest_lt8bytes:
        /* Here fewer than 8 bytes remain to be compared; at least the s1
         * address is aligned to 8 bytes.
         * The next double words are loaded and shifted right by the
         * appropriate number of bits.
         */
        subfic  r6,r5,8
        slwi    r6,r6,3
        LD      rA,0,r3
        LD      rB,0,r4
        srd     rA,rA,r6
        srd     rB,rB,r6
        cmpld   cr0,rA,rB
        bne     cr0,.LcmpAB_lightweight
        b       .Lzero

.Lnon_zero:
        mr      r3,rC
        blr

.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
        /* Try to use the vmx loop if the length is equal to or greater than 4K */
        cmpldi  cr6,r5,VMX_THRESH
        bge     cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
        /* At least the s1 addr is aligned to 8 bytes */
        li      off8,8
        li      off16,16
        li      off24,24

        std     r31,-8(r1)
        std     r30,-16(r1)
        std     r29,-24(r1)
        std     r28,-32(r1)
        std     r27,-40(r1)

        srdi    r0,r5,5
        mtctr   r0
        andi.   r5,r5,31
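
        /* The loop below compares 32 bytes per iteration using four pairs of
         * doublewords (rA/rB, rC/rD, rE/rF, rG/rH). Loads for the next
         * iteration are issued before the compares of the current one are
         * resolved (a simple software pipeline), with four CR fields
         * (cr0/cr1/cr6/cr7) holding the outstanding compare results; any
         * mismatch is reported via .LcmpAB/.LcmpCD/.LcmpEF/.LcmpGH.
         */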
        LD      rA,0,r3
        LD      rB,0,r4

        LD      rC,off8,r3
        LD      rD,off8,r4

        LD      rE,off16,r3
        LD      rF,off16,r4

        LD      rG,off24,r3
        LD      rH,off24,r4
        cmpld   cr0,rA,rB

        addi    r3,r3,32
        addi    r4,r4,32

        bdz     .Lfirst32

        LD      rA,0,r3
        LD      rB,0,r4
        cmpld   cr1,rC,rD

        LD      rC,off8,r3
        LD      rD,off8,r4
        cmpld   cr6,rE,rF

        LD      rE,off16,r3
        LD      rF,off16,r4
        cmpld   cr7,rG,rH
        bne     cr0,.LcmpAB

        LD      rG,off24,r3
        LD      rH,off24,r4
        cmpld   cr0,rA,rB
        bne     cr1,.LcmpCD

        addi    r3,r3,32
        addi    r4,r4,32

        bdz     .Lsecond32

        .balign 16

1:      LD      rA,0,r3
        LD      rB,0,r4
        cmpld   cr1,rC,rD
        bne     cr6,.LcmpEF

        LD      rC,off8,r3
        LD      rD,off8,r4
        cmpld   cr6,rE,rF
        bne     cr7,.LcmpGH

        LD      rE,off16,r3
        LD      rF,off16,r4
        cmpld   cr7,rG,rH
        bne     cr0,.LcmpAB

        LD      rG,off24,r3
        LD      rH,off24,r4
        cmpld   cr0,rA,rB
        bne     cr1,.LcmpCD

        addi    r3,r3,32
        addi    r4,r4,32

        bdnz    1b

.Lsecond32:
        cmpld   cr1,rC,rD
        bne     cr6,.LcmpEF

        cmpld   cr6,rE,rF
        bne     cr7,.LcmpGH

        cmpld   cr7,rG,rH
        bne     cr0,.LcmpAB

        bne     cr1,.LcmpCD
        bne     cr6,.LcmpEF
        bne     cr7,.LcmpGH

.Ltail:
        ld      r31,-8(r1)
        ld      r30,-16(r1)
        ld      r29,-24(r1)
        ld      r28,-32(r1)
        ld      r27,-40(r1)

        cmpdi   r5,0
        beq     .Lzero
        b       .Lshort

.Lfirst32:
        cmpld   cr1,rC,rD
        cmpld   cr6,rE,rF
        cmpld   cr7,rG,rH

        bne     cr0,.LcmpAB
        bne     cr1,.LcmpCD
        bne     cr6,.LcmpEF
        bne     cr7,.LcmpGH

        b       .Ltail

.LcmpAB:
        li      r3,1
        bgt     cr0,.Lout
        li      r3,-1
        b       .Lout

.LcmpCD:
        li      r3,1
        bgt     cr1,.Lout
        li      r3,-1
        b       .Lout

.LcmpEF:
        li      r3,1
        bgt     cr6,.Lout
        li      r3,-1
        b       .Lout

.LcmpGH:
        li      r3,1
        bgt     cr7,.Lout
        li      r3,-1

.Lout:
        ld      r31,-8(r1)
        ld      r30,-16(r1)
        ld      r29,-24(r1)
        ld      r28,-32(r1)
        ld      r27,-40(r1)
        blr

.LcmpAB_lightweight:    /* skip NV GPRS restore */
        li      r3,1
        bgtlr
        li      r3,-1
        blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
        /* Enter with the src/dst addrs having the same offset from an
         * 8-byte alignment boundary.
         *
         * There is an optimization based on the following fact: memcmp()
         * tends to fail early, within the first 32 bytes.
         * Before applying VMX instructions, which would incur the penalty
         * of saving/restoring 32x128-bit VMX regs, we compare the first 32
         * bytes so that we can catch the ~80% of calls that fail there.
         */
        li      r0,4
        mtctr   r0
.Lsameoffset_prechk_32B_loop:
        LD      rA,0,r3
        LD      rB,0,r4
        cmpld   cr0,rA,rB
        addi    r3,r3,8
        addi    r4,r4,8
        bne     cr0,.LcmpAB_lightweight
        addi    r5,r5,-8
        bdnz    .Lsameoffset_prechk_32B_loop

        ENTER_VMX_OPS
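        /* cr1 was set by ENTER_VMX_OPS (the return value of enter_vmx_ops
         * compared with zero); take the scalar .Llong path if the VMX unit
         * cannot be used here.
         */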
        beq     cr1,.Llong_novmx_cmp

3:
        /* need to check whether r4 has the same offset as r3
         * from a 16-byte boundary.
         */
        xor     r0,r3,r4
        andi.   r0,r0,0xf
        bne     .Ldiffoffset_vmx_cmp_start

        /* len is no less than 4KB. Align further, to a 16-byte boundary.
         */
        andi.   rA,r3,8
        LD      rA,0,r3
        beq     4f
        LD      rB,0,r4
        cmpld   cr0,rA,rB
        addi    r3,r3,8
        addi    r4,r4,8
        addi    r5,r5,-8

        beq     cr0,4f
        /* save and restore cr0 */
        mfocrf  r5,128
        EXIT_VMX_OPS
        mtocrf  128,r5
        b       .LcmpAB_lightweight

4:
        /* compare 32 bytes per loop iteration */
        srdi    r0,r5,5
        mtctr   r0
        clrldi  r5,r5,59
        li      off16,16
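
        /* VCMPEQUD_RC (from ppc-opcode.h) is vcmpequd with the record bit
         * set: the "all elements equal" result lands in the LT bit of cr6,
         * so "bnl cr6" is taken when at least one doubleword pair differs.
         */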
        .balign 16
5:
        lvx     v0,0,r3
        lvx     v1,0,r4
        VCMPEQUD_RC(v0,v0,v1)
        bnl     cr6,7f
        lvx     v0,off16,r3
        lvx     v1,off16,r4
        VCMPEQUD_RC(v0,v0,v1)
        bnl     cr6,6f
        addi    r3,r3,32
        addi    r4,r4,32
        bdnz    5b

        EXIT_VMX_OPS
        cmpdi   r5,0
        beq     .Lzero
        b       .Lcmp_lt32bytes

6:
        addi    r3,r3,16
        addi    r4,r4,16

7:
        /* the mismatch is within the 16 bytes at r3/r4; redo the compare
         * with scalar loads
         */
        EXIT_VMX_OPS
        LD      rA,0,r3
        LD      rB,0,r4
        cmpld   cr0,rA,rB
        li      off8,8
        bne     cr0,.LcmpAB_lightweight
        LD      rA,off8,r3
        LD      rB,off8,r4
        cmpld   cr0,rA,rB
        bne     cr0,.LcmpAB_lightweight
        b       .Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
        /* now try to align s1 with 8 bytes */
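        /* A sketch of the step below: with off = s1 & 7, rA is loaded from
         * the rounded-down, aligned s1 and rB from the (possibly unaligned)
         * s2. The sld/srd pair zeroes the leading "off" bytes of rA, and the
         * srd on rB drops its trailing "off" bytes, so both registers hold
         * the same 8 - off bytes of their buffers for the compare. On a
         * match, 8 - off bytes are consumed: s2 advances by that amount and
         * s1 moves on to the next aligned doubleword.
         */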
        rlwinm  r6,r3,3,26,28
        beq     .Ldiffoffset_align_s1_8bytes
        clrrdi  r3,r3,3
        LD      rA,0,r3
        LD      rB,0,r4         /* unaligned load */
        sld     rA,rA,r6
        srd     rA,rA,r6
        srd     rB,rB,r6
        cmpld   cr0,rA,rB
        srwi    r6,r6,3
        bne     cr0,.LcmpAB_lightweight
        subfic  r6,r6,8
        subf.   r5,r6,r5
        addi    r3,r3,8
        add     r4,r4,r6
        beq     .Lzero

.Ldiffoffset_align_s1_8bytes:
        /* now s1 is aligned to 8 bytes. */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
        /* only do vmx ops when the size is equal to or greater than 4K bytes */
        cmpdi   cr5,r5,VMX_THRESH
        bge     cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif

        cmpdi   cr5,r5,31
        ble     cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
        b       .Llong_novmx_cmp
#else
        b       .Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
        /* perform a 32-byte pre-check before
         * enabling VMX operations.
         */
        li      r0,4
        mtctr   r0
.Ldiffoffset_prechk_32B_loop:
        LD      rA,0,r3
        LD      rB,0,r4
        cmpld   cr0,rA,rB
        addi    r3,r3,8
        addi    r4,r4,8
        bne     cr0,.LcmpAB_lightweight
        addi    r5,r5,-8
        bdnz    .Ldiffoffset_prechk_32B_loop

        ENTER_VMX_OPS
        beq     cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
        /* First try to align r3 to 16 bytes */
        andi.   r6,r3,0xf
        li      off16,16
        beq     .Ldiffoffset_vmx_s1_16bytes_align

        LVS     v3,0,r3
        LVS     v4,0,r4

        lvx     v5,0,r3
        lvx     v6,0,r4

        LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
        LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

        VCMPEQUB_RC(v7,v9,v10)
        bnl     cr6,.Ldiffoffset_vmx_diff_found

        subfic  r6,r6,16
        subf    r5,r6,r5
        add     r3,r3,r6
        add     r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
        /* now s1 is aligned to 16 bytes */
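        /* From here r3 is 16-byte aligned but r4 in general is not, so the
         * loop below keeps using LD_VSR_CROSS16B on r4. v6 carries r4's
         * current aligned quadword across iterations (refreshed by the
         * "vor v6,v8,v8" copies), so each step needs only one new lvx
         * from r4.
         */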
        lvx     v6,0,r4
        LVS     v4,0,r4

        srdi    r6,r5,5         /* process 32 bytes per loop iteration */
        clrldi  r5,r5,59
        mtctr   r6

        .balign 16
.Ldiffoffset_vmx_32bytesloop:
        /* the first qw of r4 was saved in v6 */
        lvx     v9,0,r3
        LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
        VCMPEQUB_RC(v7,v9,v10)
        vor     v6,v8,v8
        bnl     cr6,.Ldiffoffset_vmx_diff_found

        addi    r3,r3,16
        addi    r4,r4,16

        lvx     v9,0,r3
        LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
        VCMPEQUB_RC(v7,v9,v10)
        vor     v6,v8,v8
        bnl     cr6,.Ldiffoffset_vmx_diff_found

        addi    r3,r3,16
        addi    r4,r4,16

        bdnz    .Ldiffoffset_vmx_32bytesloop

        EXIT_VMX_OPS

        cmpdi   r5,0
        beq     .Lzero
        b       .Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
        EXIT_VMX_OPS
        /* either way, the difference is within the next 16 bytes */
        li      r5,16
        b       .Lcmp_lt32bytes
#endif

EXPORT_SYMBOL(memcmp)