nmi.c

/*
 * Machine check handler
 *
 * Copyright IBM Corp. 2000, 2009
 * Author(s): Ingo Adlung <adlung@de.ibm.com>,
 *	      Martin Schwidefsky <schwidefsky@de.ibm.com>,
 *	      Cornelia Huck <cornelia.huck@de.ibm.com>,
 *	      Heiko Carstens <heiko.carstens@de.ibm.com>,
 */

#include <linux/kernel_stat.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/hardirq.h>
#include <linux/time.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <asm/lowcore.h>
#include <asm/smp.h>
#include <asm/stp.h>
#include <asm/cputime.h>
#include <asm/nmi.h>
#include <asm/crw.h>
#include <asm/switch_to.h>
#include <asm/ctl_reg.h>

struct mcck_struct {
	unsigned int kill_task : 1;
	unsigned int channel_report : 1;
	unsigned int warning : 1;
	unsigned int stp_queue : 1;
	unsigned long mcck_code;
};

static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck);
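
/*
 * There is no way to recover from the reported damage: stop all other
 * CPUs and put this one into a disabled wait. The caller's return
 * address is passed to disabled_wait() so that the point of failure
 * can be identified from the stopped system.
 */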
static void s390_handle_damage(void)
{
	smp_send_stop();
	disabled_wait((unsigned long) __builtin_return_address(0));
	while (1);
}

/*
 * Main machine check handler function. Will be called with interrupts enabled
 * or disabled and machine checks enabled or disabled.
 */
void s390_handle_mcck(void)
{
	unsigned long flags;
	struct mcck_struct mcck;

	/*
	 * Disable machine checks and get the current state of accumulated
	 * machine checks. Afterwards delete the old state and enable machine
	 * checks again.
	 */
	local_irq_save(flags);
	local_mcck_disable();
	mcck = *this_cpu_ptr(&cpu_mcck);
	memset(this_cpu_ptr(&cpu_mcck), 0, sizeof(mcck));
	clear_cpu_flag(CIF_MCCK_PENDING);
	local_mcck_enable();
	local_irq_restore(flags);

	if (mcck.channel_report)
		crw_handle_channel_report();
	/*
	 * A warning may remain for a prolonged period on the bare iron
	 * (actually until the machine is powered off, or the problem is gone).
	 * So we just stop listening for the WARNING MCH and avoid continuously
	 * being interrupted. One caveat is, however, that we must do this per
	 * processor and cannot use the smp version of ctl_clear_bit().
	 * On VM we only get one interrupt per virtually presented machine
	 * check. Though one suffices, we may get one interrupt per (virtual) cpu.
	 */
	if (mcck.warning) {	/* WARNING pending ? */
		static int mchchk_wng_posted = 0;

		/* Use single cpu clear, as we cannot handle smp here. */
		__ctl_clear_bit(14, 24);	/* Disable WARNING MCH */
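		/*
		 * Notify the ctrl-alt-del pid (normally init) once with
		 * SIGPWR so that an orderly shutdown can be initiated.
		 */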
		if (xchg(&mchchk_wng_posted, 1) == 0)
			kill_cad_pid(SIGPWR, 1);
	}
	if (mcck.stp_queue)
		stp_queue_work();
	if (mcck.kill_task) {
		local_irq_enable();
		printk(KERN_EMERG "mcck: Terminating task because of machine "
		       "malfunction (code 0x%016lx).\n", mcck.mcck_code);
		printk(KERN_EMERG "mcck: task: %s, pid: %d.\n",
		       current->comm, current->pid);
		do_exit(SIGSEGV);
	}
}
EXPORT_SYMBOL_GPL(s390_handle_mcck);

/*
 * returns 0 if all registers could be validated
 * returns 1 otherwise
 */
static int notrace s390_validate_registers(union mci mci, int umode)
{
	int kill_task;
	u64 zero;
	void *fpt_save_area;
	struct mcesa *mcesa;

	kill_task = 0;
	zero = 0;
	if (!mci.gr) {
		/*
		 * General purpose registers couldn't be restored and have
		 * unknown contents. Stop system or terminate process.
		 */
		if (!umode)
			s390_handle_damage();
		kill_task = 1;
	}
	/* Validate control registers */
	if (!mci.cr) {
		/*
		 * Control registers have unknown contents.
		 * Can't recover and therefore stopping machine.
		 */
		s390_handle_damage();
	} else {
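		/*
		 * The saved control register contents are valid; reload
		 * them and purge the TLB to discard any translations that
		 * may have been formed before revalidation.
		 */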
		asm volatile(
			"	lctlg	0,15,0(%0)\n"
			"	ptlb\n"
			: : "a" (&S390_lowcore.cregs_save_area) : "memory");
	}
	if (!mci.fp) {
		/*
		 * Floating point registers can't be restored. If the
		 * kernel currently uses floating point registers the
		 * system is stopped. If the process has its floating
		 * point registers loaded it is terminated.
		 * Otherwise just revalidate the registers.
		 */
		if (S390_lowcore.fpu_flags & KERNEL_VXR_V0V7)
			s390_handle_damage();
		if (!test_cpu_flag(CIF_FPU))
			kill_task = 1;
	}
	fpt_save_area = &S390_lowcore.floating_pt_save_area;
	if (!mci.fc) {
		/*
		 * Floating point control register can't be restored.
		 * If the kernel currently uses the floating point
		 * registers and needs the FPC register the system is
		 * stopped. If the process has its floating point
		 * registers loaded it is terminated. Otherwise the
		 * FPC is just revalidated.
		 */
		if (S390_lowcore.fpu_flags & KERNEL_FPC)
			s390_handle_damage();
		asm volatile("lfpc %0" : : "Q" (zero));
		if (!test_cpu_flag(CIF_FPU))
			kill_task = 1;
	} else {
		asm volatile("lfpc %0"
			     : : "Q" (S390_lowcore.fpt_creg_save_area));
	}
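	/*
	 * The machine check extended save area (MCESA) holds the vector
	 * and guarded storage register save areas; its origin comes from
	 * the MCESAD field in the lowcore.
	 */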
	mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK);
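	/*
	 * With the vector facility installed the floating point registers
	 * are embedded in vector registers 0-15, so reloading the vector
	 * registers below also revalidates the floating point registers.
	 */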
	if (!MACHINE_HAS_VX) {
		/* Validate floating point registers */
		asm volatile(
			"	ld	0,0(%0)\n"
			"	ld	1,8(%0)\n"
			"	ld	2,16(%0)\n"
			"	ld	3,24(%0)\n"
			"	ld	4,32(%0)\n"
			"	ld	5,40(%0)\n"
			"	ld	6,48(%0)\n"
			"	ld	7,56(%0)\n"
			"	ld	8,64(%0)\n"
			"	ld	9,72(%0)\n"
			"	ld	10,80(%0)\n"
			"	ld	11,88(%0)\n"
			"	ld	12,96(%0)\n"
			"	ld	13,104(%0)\n"
			"	ld	14,112(%0)\n"
			"	ld	15,120(%0)\n"
			: : "a" (fpt_save_area) : "memory");
	} else {
		/* Validate vector registers */
		union ctlreg0 cr0;

		if (!mci.vr) {
			/*
			 * Vector registers can't be restored. If the kernel
			 * currently uses vector registers the system is
			 * stopped. If the process has its vector registers
			 * loaded it is terminated. Otherwise just revalidate
			 * the registers.
			 */
			if (S390_lowcore.fpu_flags & KERNEL_VXR)
				s390_handle_damage();
			if (!test_cpu_flag(CIF_FPU))
				kill_task = 1;
		}
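		/*
		 * Temporarily set the AFP and vector enablement bits in
		 * control register 0 so that the vector load instructions
		 * below can be executed, then restore the saved CR0.
		 */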
		cr0.val = S390_lowcore.cregs_save_area[0];
		cr0.afp = cr0.vx = 1;
		__ctl_load(cr0.val, 0, 0);
		asm volatile(
			"	la	1,%0\n"
			"	.word	0xe70f,0x1000,0x0036\n"	/* vlm 0,15,0(1) */
			"	.word	0xe70f,0x1100,0x0c36\n"	/* vlm 16,31,256(1) */
			: : "Q" (*(struct vx_array *) mcesa->vector_save_area)
			: "1");
		__ctl_load(S390_lowcore.cregs_save_area[0], 0, 0);
	}
	/* Validate access registers */
	asm volatile(
		"	lam	0,15,0(%0)"
		: : "a" (&S390_lowcore.access_regs_save_area));
	if (!mci.ar) {
		/*
		 * Access registers have unknown contents.
		 * Terminating task.
		 */
		kill_task = 1;
	}
	/* Validate guarded storage registers */
	if (MACHINE_HAS_GS && (S390_lowcore.cregs_save_area[2] & (1UL << 4))) {
		if (!mci.gs)
			/*
			 * Guarded storage register can't be restored and
			 * the current process uses guarded storage.
			 * It has to be terminated.
			 */
			kill_task = 1;
		else
			load_gs_cb((struct gs_cb *)
				   mcesa->guarded_storage_save_area);
	}
	/*
	 * We don't even try to validate the TOD register, since we simply
	 * can't write something sensible into that register.
	 */
	/*
	 * See if we can validate the TOD programmable register with its
	 * old contents (should be zero), otherwise set it to zero.
	 */
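	/* SCKPF sets the TOD clock programmable field from general register 0. */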
	if (!mci.pr)
		asm volatile(
			"	sr	0,0\n"
			"	sckpf"
			: : : "0", "cc");
	else
		asm volatile(
			"	l	0,%0\n"
			"	sckpf"
			: : "Q" (S390_lowcore.tod_progreg_save_area)
			: "0", "cc");
	/* Validate clock comparator register */
	set_clock_comparator(S390_lowcore.clock_comparator);
	/* Check if old PSW is valid */
	if (!mci.wp)
		/*
		 * Can't tell if we come from user or kernel mode
		 * -> stopping machine.
		 */
		s390_handle_damage();
	if (!mci.ms || !mci.pm || !mci.ia)
		kill_task = 1;

	return kill_task;
}

#define MAX_IPD_COUNT	29
#define MAX_IPD_TIME	(5 * 60 * USEC_PER_SEC) /* 5 minutes */

#define ED_STP_ISLAND	6	/* External damage STP island check */
#define ED_STP_SYNC	7	/* External damage STP sync check */

/*
 * Machine check handler. Decodes the machine check interruption code
 * stored in the lowcore and decides how to react to it.
 */
void notrace s390_do_machine_check(struct pt_regs *regs)
{
	static int ipd_count;
	static DEFINE_SPINLOCK(ipd_lock);
	static unsigned long long last_ipd;
	struct mcck_struct *mcck;
	unsigned long long tmp;
	union mci mci;

	nmi_enter();
	inc_irq_stat(NMI_NMI);
	mci.val = S390_lowcore.mcck_interruption_code;
	mcck = this_cpu_ptr(&cpu_mcck);
	if (mci.sd) {
		/* System damage -> stopping machine */
		s390_handle_damage();
	}
	if (mci.pd) {
		if (mci.b) {
			/* Processing backup -> verify if we can survive this */
			u64 z_mcic, o_mcic, t_mcic;
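
			/*
			 * z_mcic: bits that must be zero (damage conditions
			 * that cannot be survived here), o_mcic: validity
			 * bits that must all be one for the saved state to
			 * be usable.
			 */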
			z_mcic = (1ULL<<63 | 1ULL<<59 | 1ULL<<29);
			o_mcic = (1ULL<<43 | 1ULL<<42 | 1ULL<<41 | 1ULL<<40 |
				  1ULL<<36 | 1ULL<<35 | 1ULL<<34 | 1ULL<<32 |
				  1ULL<<30 | 1ULL<<21 | 1ULL<<20 | 1ULL<<17 |
				  1ULL<<16);
			t_mcic = mci.val;

			if (((t_mcic & z_mcic) != 0) ||
			    ((t_mcic & o_mcic) != o_mcic)) {
				s390_handle_damage();
			}

			/*
			 * Nullifying exigent condition, therefore we might
			 * retry this instruction.
			 */
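			/*
			 * Rate-limit the retries: the TOD clock difference is
			 * shifted right by 12 to convert it to microseconds
			 * (TOD clock bit 51 ticks once per microsecond). If
			 * MAX_IPD_COUNT such machine checks occur, each within
			 * MAX_IPD_TIME of the previous one, the machine is
			 * considered broken and stopped.
			 */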
			spin_lock(&ipd_lock);
			tmp = get_tod_clock();
			if (((tmp - last_ipd) >> 12) < MAX_IPD_TIME)
				ipd_count++;
			else
				ipd_count = 1;
			last_ipd = tmp;
			if (ipd_count == MAX_IPD_COUNT)
				s390_handle_damage();
			spin_unlock(&ipd_lock);
		} else {
			/* Processing damage -> stopping machine */
			s390_handle_damage();
		}
	}
	if (s390_validate_registers(mci, user_mode(regs))) {
		/*
		 * Couldn't restore all register contents for the
		 * user space process -> mark task for termination.
		 */
		mcck->kill_task = 1;
		mcck->mcck_code = mci.val;
		set_cpu_flag(CIF_MCCK_PENDING);
	}
	if (mci.cd) {
		/* Timing facility damage */
		s390_handle_damage();
	}
	if (mci.ed && mci.ec) {
		/* External damage */
		if (S390_lowcore.external_damage_code & (1U << ED_STP_SYNC))
			mcck->stp_queue |= stp_sync_check();
		if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND))
			mcck->stp_queue |= stp_island_check();
		if (mcck->stp_queue)
			set_cpu_flag(CIF_MCCK_PENDING);
	}
	if (mci.se)
		/* Storage error uncorrected */
		s390_handle_damage();
	if (mci.ke)
		/* Storage key-error uncorrected */
		s390_handle_damage();
	if (mci.ds && mci.fa)
		/* Storage degradation */
		s390_handle_damage();
	if (mci.cp) {
		/* Channel report word pending */
		mcck->channel_report = 1;
		set_cpu_flag(CIF_MCCK_PENDING);
	}
	if (mci.w) {
		/* Warning pending */
		mcck->warning = 1;
		set_cpu_flag(CIF_MCCK_PENDING);
	}
	nmi_exit();
}
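
/*
 * Several machine check conditions are reported only if the corresponding
 * subclass mask bit in control register 14 is set; enable the subclasses
 * the handler above knows how to process.
 */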
static int __init machine_check_init(void)
{
	ctl_set_bit(14, 25);	/* enable external damage MCH */
	ctl_set_bit(14, 27);	/* enable system recovery MCH */
	ctl_set_bit(14, 24);	/* enable warning MCH */
	return 0;
}
early_initcall(machine_check_init);