arith.fuc 2.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. /*
  2. * Copyright 2014 Martin Peres <martin.peres@free.fr>
  3. *
  4. * Permission is hereby granted, free of charge, to any person obtaining a
  5. * copy of this software and associated documentation files (the "Software"),
  6. * to deal in the Software without restriction, including without limitation
  7. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8. * and/or sell copies of the Software, and to permit persons to whom the
  9. * Software is furnished to do so, subject to the following conditions:
  10. *
  11. * The above copyright notice and this permission notice shall be included in
  12. * all copies or substantial portions of the Software.
  13. *
  14. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  17. * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18. * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19. * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20. * OTHER DEALINGS IN THE SOFTWARE.
  21. *
  22. * Authors: Martin Peres
  23. */
  24. /******************************************************************************
  25. * arith data segment
  26. *****************************************************************************/
  27. #ifdef INCLUDE_PROC
  28. #endif
  29. #ifdef INCLUDE_DATA
  30. #endif
  31. /******************************************************************************
  32. * arith code segment
  33. *****************************************************************************/
  34. #ifdef INCLUDE_CODE
  35. // does a 32x32 -> 64 multiplication
  36. //
  37. // A * B = A_lo * B_lo
  38. // + ( A_hi * B_lo ) << 16
  39. // + ( A_lo * B_hi ) << 16
  40. // + ( A_hi * B_hi ) << 32
  41. //
  42. // $r15 - current
  43. // $r14 - A
  44. // $r13 - B
  45. // $r12 - mul_lo (return)
  46. // $r11 - mul_hi (return)
  47. // $r0 - zero
  48. mulu32_32_64:
  49. push $r1 // A_hi
  50. push $r2 // B_hi
  51. push $r3 // tmp0
  52. push $r4 // tmp1
  53. shr b32 $r1 $r14 16
  54. shr b32 $r2 $r13 16
  55. clear b32 $r12
  56. clear b32 $r11
  57. // A_lo * B_lo
  58. mulu $r12 $r14 $r13
  59. // ( A_hi * B_lo ) << 16
  60. mulu $r3 $r1 $r13 // tmp0 = A_hi * B_lo
  61. mov b32 $r4 $r3
  62. and $r3 0xffff // tmp0 = tmp0_lo
  63. shl b32 $r3 16
  64. shr b32 $r4 16 // tmp1 = tmp0_hi
  65. add b32 $r12 $r3
  66. adc b32 $r11 $r4
  67. // ( A_lo * B_hi ) << 16
  68. mulu $r3 $r14 $r2 // tmp0 = A_lo * B_hi
  69. mov b32 $r4 $r3
  70. and $r3 0xffff // tmp0 = tmp0_lo
  71. shl b32 $r3 16
  72. shr b32 $r4 16 // tmp1 = tmp0_hi
  73. add b32 $r12 $r3
  74. adc b32 $r11 $r4
  75. // ( A_hi * B_hi ) << 32
  76. mulu $r3 $r1 $r2 // tmp0 = A_hi * B_hi
  77. add b32 $r11 $r3
  78. pop $r4
  79. pop $r3
  80. pop $r2
  81. pop $r1
  82. ret
  83. #endif