  1. /*
  2. * In-kernel vector facility support functions
  3. *
  4. * Copyright IBM Corp. 2015
  5. * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
  6. */
  7. #include <linux/kernel.h>
  8. #include <linux/cpu.h>
  9. #include <linux/sched.h>
  10. #include <asm/fpu/types.h>
  11. #include <asm/fpu/api.h>
  12. /*
  13. * Per-CPU variable to maintain FPU register ranges that are in use
  14. * by the kernel.
  15. */
  16. static DEFINE_PER_CPU(u32, kernel_fpu_state);
  17. #define KERNEL_FPU_STATE_MASK (KERNEL_FPU_MASK|KERNEL_FPC)
  18. void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags)
  19. {
  20. if (!__this_cpu_read(kernel_fpu_state)) {
  21. /*
  22. * Save user space FPU state and register contents. Multiple
  23. * calls because of interruptions do not matter and return
  24. * immediately. This also sets CIF_FPU to lazy restore FP/VX
  25. * register contents when returning to user space.
  26. */
  27. save_fpu_regs();
  28. }
  29. /* Update flags to use the vector facility for KERNEL_FPR */
  30. if (MACHINE_HAS_VX && (state->mask & KERNEL_FPR)) {
  31. flags |= KERNEL_VXR_LOW | KERNEL_FPC;
  32. flags &= ~KERNEL_FPR;
  33. }
  34. /* Save and update current kernel VX state */
  35. state->mask = __this_cpu_read(kernel_fpu_state);
  36. __this_cpu_or(kernel_fpu_state, flags & KERNEL_FPU_STATE_MASK);
  37. /*
  38. * If this is the first call to __kernel_fpu_begin(), no additional
  39. * work is required.
  40. */
  41. if (!(state->mask & KERNEL_FPU_STATE_MASK))
  42. return;
  43. /*
  44. * If KERNEL_FPR is still set, the vector facility is not available
  45. * and, thus, save floating-point control and registers only.
  46. */
  47. if (state->mask & KERNEL_FPR) {
  48. asm volatile("stfpc %0" : "=Q" (state->fpc));
  49. asm volatile("std 0,%0" : "=Q" (state->fprs[0]));
  50. asm volatile("std 1,%0" : "=Q" (state->fprs[1]));
  51. asm volatile("std 2,%0" : "=Q" (state->fprs[2]));
  52. asm volatile("std 3,%0" : "=Q" (state->fprs[3]));
  53. asm volatile("std 4,%0" : "=Q" (state->fprs[4]));
  54. asm volatile("std 5,%0" : "=Q" (state->fprs[5]));
  55. asm volatile("std 6,%0" : "=Q" (state->fprs[6]));
  56. asm volatile("std 7,%0" : "=Q" (state->fprs[7]));
  57. asm volatile("std 8,%0" : "=Q" (state->fprs[8]));
  58. asm volatile("std 9,%0" : "=Q" (state->fprs[9]));
  59. asm volatile("std 10,%0" : "=Q" (state->fprs[10]));
  60. asm volatile("std 11,%0" : "=Q" (state->fprs[11]));
  61. asm volatile("std 12,%0" : "=Q" (state->fprs[12]));
  62. asm volatile("std 13,%0" : "=Q" (state->fprs[13]));
  63. asm volatile("std 14,%0" : "=Q" (state->fprs[14]));
  64. asm volatile("std 15,%0" : "=Q" (state->fprs[15]));
  65. return;
  66. }
  67. /*
  68. * If this is a nested call to __kernel_fpu_begin(), check the saved
  69. * state mask to save and later restore the vector registers that
  70. * are already in use. Let's start with checking floating-point
  71. * controls.
  72. */
  73. if (state->mask & KERNEL_FPC)
  74. asm volatile("stfpc %0" : "=m" (state->fpc));
  75. /* Test and save vector registers */
  76. asm volatile (
  77. /*
  78. * Test if any vector register must be saved and, if so,
  79. * test if all register can be saved.
  80. */
  81. " tmll %[m],15\n" /* KERNEL_VXR_MASK */
  82. " jz 20f\n" /* no work -> done */
  83. " la 1,%[vxrs]\n" /* load save area */
  84. " jo 18f\n" /* -> save V0..V31 */
  85. /*
  86. * Test if V8..V23 can be saved at once... this speeds up
  87. * for KERNEL_fpu_MID only. Otherwise continue to split the
  88. * range of vector registers into two halves and test them
  89. * separately.
  90. */
  91. " tmll %[m],6\n" /* KERNEL_VXR_MID */
  92. " jo 17f\n" /* -> save V8..V23 */
  93. /* Test and save the first half of 16 vector registers */
  94. "1: tmll %[m],3\n" /* KERNEL_VXR_LOW */
  95. " jz 10f\n" /* -> KERNEL_VXR_HIGH */
  96. " jo 2f\n" /* 11 -> save V0..V15 */
  97. " brc 4,3f\n" /* 01 -> save V0..V7 */
  98. " brc 2,4f\n" /* 10 -> save V8..V15 */
  99. /* Test and save the second half of 16 vector registers */
  100. "10: tmll %[m],12\n" /* KERNEL_VXR_HIGH */
  101. " jo 19f\n" /* 11 -> save V16..V31 */
  102. " brc 4,11f\n" /* 01 -> save V16..V23 */
  103. " brc 2,12f\n" /* 10 -> save V24..V31 */
  104. " j 20f\n" /* 00 -> done */
  105. /*
  106. * Below are the vstm combinations to save multiple vector
  107. * registers at once.
  108. */
  109. "2: .word 0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */
  110. " j 10b\n" /* -> VXR_HIGH */
  111. "3: .word 0xe707,0x1000,0x003e\n" /* vstm 0,7,0(1) */
  112. " j 10b\n" /* -> VXR_HIGH */
  113. "4: .word 0xe78f,0x1080,0x003e\n" /* vstm 8,15,128(1) */
  114. " j 10b\n" /* -> VXR_HIGH */
  115. "\n"
  116. "11: .word 0xe707,0x1100,0x0c3e\n" /* vstm 16,23,256(1) */
  117. " j 20f\n" /* -> done */
  118. "12: .word 0xe78f,0x1180,0x0c3e\n" /* vstm 24,31,384(1) */
  119. " j 20f\n" /* -> done */
  120. "\n"
  121. "17: .word 0xe787,0x1080,0x043e\n" /* vstm 8,23,128(1) */
  122. " nill %[m],249\n" /* m &= ~VXR_MID */
  123. " j 1b\n" /* -> VXR_LOW */
  124. "\n"
  125. "18: .word 0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */
  126. "19: .word 0xe70f,0x1100,0x0c3e\n" /* vstm 16,31,256(1) */
  127. "20:"
  128. : [vxrs] "=Q" (*(struct vx_array *) &state->vxrs)
  129. : [m] "d" (state->mask)
  130. : "1", "cc");
  131. }
  132. EXPORT_SYMBOL(__kernel_fpu_begin);
/*
 * Release the registers grabbed by the matching __kernel_fpu_begin()
 * call: restore the register contents saved in @state and write the
 * previous usage mask (recorded in state->mask) back to the per-CPU
 * kernel_fpu_state variable.
 *
 * @state: save area filled in by __kernel_fpu_begin().
 */
void __kernel_fpu_end(struct kernel_fpu *state)
{
	/* Just update the per-CPU state if there is nothing to restore */
	if (!(state->mask & KERNEL_FPU_STATE_MASK))
		goto update_fpu_state;
	/*
	 * If KERNEL_FPR is specified, the vector facility is not available
	 * and, thus, restore floating-point control and registers only.
	 */
	if (state->mask & KERNEL_FPR) {
		asm volatile("lfpc %0" : : "Q" (state->fpc));
		asm volatile("ld 0,%0" : : "Q" (state->fprs[0]));
		asm volatile("ld 1,%0" : : "Q" (state->fprs[1]));
		asm volatile("ld 2,%0" : : "Q" (state->fprs[2]));
		asm volatile("ld 3,%0" : : "Q" (state->fprs[3]));
		asm volatile("ld 4,%0" : : "Q" (state->fprs[4]));
		asm volatile("ld 5,%0" : : "Q" (state->fprs[5]));
		asm volatile("ld 6,%0" : : "Q" (state->fprs[6]));
		asm volatile("ld 7,%0" : : "Q" (state->fprs[7]));
		asm volatile("ld 8,%0" : : "Q" (state->fprs[8]));
		asm volatile("ld 9,%0" : : "Q" (state->fprs[9]));
		asm volatile("ld 10,%0" : : "Q" (state->fprs[10]));
		asm volatile("ld 11,%0" : : "Q" (state->fprs[11]));
		asm volatile("ld 12,%0" : : "Q" (state->fprs[12]));
		asm volatile("ld 13,%0" : : "Q" (state->fprs[13]));
		asm volatile("ld 14,%0" : : "Q" (state->fprs[14]));
		asm volatile("ld 15,%0" : : "Q" (state->fprs[15]));
		goto update_fpu_state;
	}
	/* Test and restore floating-point controls */
	if (state->mask & KERNEL_FPC)
		asm volatile("lfpc %0" : : "Q" (state->fpc));
	/* Test and restore (load) vector registers */
	asm volatile (
		/*
		 * Test if any vector registers must be loaded and, if so,
		 * test if all registers can be loaded at once.
		 */
		" tmll %[m],15\n" /* KERNEL_VXR_MASK */
		" jz 20f\n" /* no work -> done */
		" la 1,%[vxrs]\n" /* load address of restore area */
		" jo 18f\n" /* -> load V0..V31 */
		/*
		 * Test if V8..V23 can be restored at once... this speeds up
		 * for KERNEL_VXR_MID only. Otherwise continue to split the
		 * range of vector registers into two halves and test them
		 * separately.
		 */
		" tmll %[m],6\n" /* KERNEL_VXR_MID */
		" jo 17f\n" /* -> load V8..V23 */
		/* Test and load the first half of 16 vector registers */
		"1: tmll %[m],3\n" /* KERNEL_VXR_LOW */
		" jz 10f\n" /* -> KERNEL_VXR_HIGH */
		" jo 2f\n" /* 11 -> load V0..V15 */
		" brc 4,3f\n" /* 01 -> load V0..V7 */
		" brc 2,4f\n" /* 10 -> load V8..V15 */
		/* Test and load the second half of 16 vector registers */
		"10: tmll %[m],12\n" /* KERNEL_VXR_HIGH */
		" jo 19f\n" /* 11 -> load V16..V31 */
		" brc 4,11f\n" /* 01 -> load V16..V23 */
		" brc 2,12f\n" /* 10 -> load V24..V31 */
		" j 20f\n" /* 00 -> done */
		/*
		 * Below are the vlm combinations to load multiple vector
		 * registers at once. The raw opcodes mirror the vstm
		 * encodings used in __kernel_fpu_begin().
		 */
		"2: .word 0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
		" j 10b\n" /* -> VXR_HIGH */
		"3: .word 0xe707,0x1000,0x0036\n" /* vlm 0,7,0(1) */
		" j 10b\n" /* -> VXR_HIGH */
		"4: .word 0xe78f,0x1080,0x0036\n" /* vlm 8,15,128(1) */
		" j 10b\n" /* -> VXR_HIGH */
		"\n"
		"11: .word 0xe707,0x1100,0x0c36\n" /* vlm 16,23,256(1) */
		" j 20f\n" /* -> done */
		"12: .word 0xe78f,0x1180,0x0c36\n" /* vlm 24,31,384(1) */
		" j 20f\n" /* -> done */
		"\n"
		"17: .word 0xe787,0x1080,0x0436\n" /* vlm 8,23,128(1) */
		" nill %[m],249\n" /* m &= ~VXR_MID */
		" j 1b\n" /* -> VXR_LOW */
		"\n"
		"18: .word 0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
		"19: .word 0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */
		"20:"
		:
		: [vxrs] "Q" (*(struct vx_array *) &state->vxrs),
		[m] "d" (state->mask)
		: "1", "cc");
update_fpu_state:
	/* Update current kernel VX state */
	__this_cpu_write(kernel_fpu_state, state->mask);
}
EXPORT_SYMBOL(__kernel_fpu_end);