/* dev-mcelog.c — /dev/mcelog character device driver (9.3 KB) */
  1. /*
  2. * /dev/mcelog driver
  3. *
  4. * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
  5. * Rest from unknown author(s).
  6. * 2004 Andi Kleen. Rewrote most of it.
  7. * Copyright 2008 Intel Corporation
  8. * Author: Andi Kleen
  9. */
  10. #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  11. #include <linux/miscdevice.h>
  12. #include <linux/slab.h>
  13. #include <linux/kmod.h>
  14. #include <linux/poll.h>
  15. #include "mce-internal.h"
/* Notifier chain fed with records written to /dev/mcelog (error injection). */
static BLOCKING_NOTIFIER_HEAD(mce_injector_chain);

/* Serializes concurrent readers of /dev/mcelog. */
static DEFINE_MUTEX(mce_chrdev_read_mutex);

/* Path of the user-mode helper program; set through the sysfs 'trigger' attribute. */
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };

/*
 * Load mcelog.next with acquire semantics. Legal only inside
 * rcu_read_lock_sched() or with mce_chrdev_read_mutex held;
 * anything else trips the lockdep assertion.
 */
#define mce_log_get_idx_check(p) \
({ \
	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
			 !lockdep_is_held(&mce_chrdev_read_mutex), \
			 "suspicious mce_log_get_idx_check() usage"); \
	smp_load_acquire(&(p)); \
})
/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */
static struct mce_log_buffer mcelog = {
	.signature	= MCE_LOG_SIGNATURE,	/* lets userspace validate the buffer */
	.len		= MCE_LOG_LEN,		/* capacity in records */
	.recordlen	= sizeof(struct mce),	/* lets userspace handle ABI growth */
};
  37. static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
  38. /* User mode helper program triggered by machine check event */
  39. extern char mce_helper[128];
/*
 * Notifier callback: append one MCE record to the legacy mcelog ring
 * buffer and wake any /dev/mcelog readers.
 *
 * Runs lockless so it is usable from machine-check context: a slot is
 * claimed by advancing mcelog.next with cmpxchg(), the record is copied
 * in, and only then is ->finished set, so readers never see a partially
 * written record.
 */
static int dev_mce_log(struct notifier_block *nb, unsigned long val,
		       void *data)
{
	struct mce *mce = (struct mce *)data;
	unsigned int next, entry;

	wmb();
	for (;;) {
		entry = mce_log_get_idx_check(mcelog.next);
		for (;;) {
			/*
			 * When the buffer fills up discard new entries.
			 * Assume that the earlier errors are the more
			 * interesting ones:
			 */
			if (entry >= MCE_LOG_LEN) {
				set_bit(MCE_OVERFLOW,
					(unsigned long *)&mcelog.flags);
				return NOTIFY_OK;
			}
			/* Old left over entry. Skip: */
			if (mcelog.entry[entry].finished) {
				entry++;
				continue;
			}
			break;
		}
		smp_rmb();
		next = entry + 1;
		/* Claim slot 'entry'; retry if a concurrent logger won the race. */
		if (cmpxchg(&mcelog.next, entry, next) == entry)
			break;
	}
	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
	wmb();
	/* Publish: readers test ->finished before trusting the record. */
	mcelog.entry[entry].finished = 1;
	wmb();

	/* wake processes polling /dev/mcelog */
	wake_up_interruptible(&mce_chrdev_wait);

	return NOTIFY_OK;
}
/* Hooked into the MCE decode chain at MCE_PRIO_MCELOG priority. */
static struct notifier_block dev_mcelog_nb = {
	.notifier_call	= dev_mce_log,
	.priority	= MCE_PRIO_MCELOG,
};
/* Work item: spawn the configured user-mode helper outside MCE context. */
static void mce_do_trigger(struct work_struct *work)
{
	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
}

static DECLARE_WORK(mce_trigger_work, mce_do_trigger);

/* Schedule the helper, but only if one was configured via 'trigger'. */
void mce_work_trigger(void)
{
	if (mce_helper[0])
		schedule_work(&mce_trigger_work);
}
  93. static ssize_t
  94. show_trigger(struct device *s, struct device_attribute *attr, char *buf)
  95. {
  96. strcpy(buf, mce_helper);
  97. strcat(buf, "\n");
  98. return strlen(mce_helper) + 1;
  99. }
  100. static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
  101. const char *buf, size_t siz)
  102. {
  103. char *p;
  104. strncpy(mce_helper, buf, sizeof(mce_helper));
  105. mce_helper[sizeof(mce_helper)-1] = 0;
  106. p = strchr(mce_helper, '\n');
  107. if (p)
  108. *p = 0;
  109. return strlen(mce_helper) + !!p;
  110. }
  111. DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
/*
 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
 */
/* Protects the two open-state counters below. */
static DEFINE_SPINLOCK(mce_chrdev_state_lock);
static int mce_chrdev_open_count;	/* #times opened */
static int mce_chrdev_open_exclu;	/* already open exclusive? */
  118. static int mce_chrdev_open(struct inode *inode, struct file *file)
  119. {
  120. spin_lock(&mce_chrdev_state_lock);
  121. if (mce_chrdev_open_exclu ||
  122. (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
  123. spin_unlock(&mce_chrdev_state_lock);
  124. return -EBUSY;
  125. }
  126. if (file->f_flags & O_EXCL)
  127. mce_chrdev_open_exclu = 1;
  128. mce_chrdev_open_count++;
  129. spin_unlock(&mce_chrdev_state_lock);
  130. return nonseekable_open(inode, file);
  131. }
  132. static int mce_chrdev_release(struct inode *inode, struct file *file)
  133. {
  134. spin_lock(&mce_chrdev_state_lock);
  135. mce_chrdev_open_count--;
  136. mce_chrdev_open_exclu = 0;
  137. spin_unlock(&mce_chrdev_state_lock);
  138. return 0;
  139. }
  140. static void collect_tscs(void *data)
  141. {
  142. unsigned long *cpu_tsc = (unsigned long *)data;
  143. cpu_tsc[smp_processor_id()] = rdtsc();
  144. }
/* Set once the APEI ERST store is exhausted (or unusable) for this boot. */
static int mce_apei_read_done;

/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
/*
 * Copy one previous-boot record to *ubuf; on success *ubuf is advanced
 * by sizeof(struct mce) so the caller can detect that data was produced.
 * Returns 0 on success or when no record remains, negative errno on
 * error. The record is cleared from ERST after a successful copy.
 */
static int __mce_read_apei(char __user **ubuf, size_t usize)
{
	int rc;
	u64 record_id;
	struct mce m;

	if (usize < sizeof(struct mce))
		return -EINVAL;

	rc = apei_read_mce(&m, &record_id);
	/* Error or no more MCE record */
	if (rc <= 0) {
		mce_apei_read_done = 1;
		/*
		 * When ERST is disabled, mce_chrdev_read() should return
		 * "no record" instead of "no device."
		 */
		if (rc == -ENODEV)
			return 0;
		return rc;
	}
	rc = -EFAULT;
	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
		return rc;
	/*
	 * In fact, we should have cleared the record after that has
	 * been flushed to the disk or sent to network in
	 * /sbin/mcelog, but we have no interface to support that now,
	 * so just clear it to avoid duplication.
	 */
	rc = apei_clear_mce(record_id);
	if (rc) {
		mce_apei_read_done = 1;
		return rc;
	}
	/* Success: advance the caller's user-space cursor past the record. */
	*ubuf += sizeof(struct mce);

	return 0;
}
/*
 * read() for /dev/mcelog: first drain any previous-boot record from
 * APEI ERST, then copy out and clear the in-kernel mcelog buffer.
 * Only full-buffer reads at offset 0 are supported (usize must cover
 * MCE_LOG_LEN records).
 */
static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
			       size_t usize, loff_t *off)
{
	char __user *buf = ubuf;
	unsigned long *cpu_tsc;
	unsigned prev, next;
	int i, err;

	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
	if (!cpu_tsc)
		return -ENOMEM;

	mutex_lock(&mce_chrdev_read_mutex);

	if (!mce_apei_read_done) {
		err = __mce_read_apei(&buf, usize);
		/* Stop after an APEI error or after one APEI record. */
		if (err || buf != ubuf)
			goto out;
	}

	next = mce_log_get_idx_check(mcelog.next);

	/* Only supports full reads right now */
	err = -EINVAL;
	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
		goto out;

	err = 0;
	prev = 0;
	do {
		for (i = prev; i < next; i++) {
			unsigned long start = jiffies;
			struct mce *m = &mcelog.entry[i];

			/*
			 * A logger has claimed this slot but may not have
			 * finished copying yet; give it ~2 jiffies, then
			 * discard the half-written entry.
			 */
			while (!m->finished) {
				if (time_after_eq(jiffies, start + 2)) {
					memset(m, 0, sizeof(*m));
					goto timeout;
				}
				cpu_relax();
			}
			smp_rmb();
			/* copy_to_user() returns non-zero on fault; OR-accumulate. */
			err |= copy_to_user(buf, m, sizeof(*m));
			buf += sizeof(*m);
timeout:
			;
		}

		memset(mcelog.entry + prev, 0,
		       (next - prev) * sizeof(struct mce));
		prev = next;
		/* Reset the index; loop again if loggers raced it forward. */
		next = cmpxchg(&mcelog.next, prev, 0);
	} while (next != prev);

	synchronize_sched();

	/*
	 * Collect entries that were still getting written before the
	 * synchronize.
	 */
	on_each_cpu(collect_tscs, cpu_tsc, 1);

	for (i = next; i < MCE_LOG_LEN; i++) {
		struct mce *m = &mcelog.entry[i];

		/* Only take records stamped before each CPU's current TSC. */
		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
			err |= copy_to_user(buf, m, sizeof(*m));
			smp_rmb();
			buf += sizeof(*m);
			memset(m, 0, sizeof(*m));
		}
	}

	if (err)
		err = -EFAULT;

out:
	mutex_unlock(&mce_chrdev_read_mutex);
	kfree(cpu_tsc);

	/* Bytes copied on success, negative errno otherwise. */
	return err ? err : buf - ubuf;
}
  250. static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
  251. {
  252. poll_wait(file, &mce_chrdev_wait, wait);
  253. if (READ_ONCE(mcelog.next))
  254. return POLLIN | POLLRDNORM;
  255. if (!mce_apei_read_done && apei_check_mce())
  256. return POLLIN | POLLRDNORM;
  257. return 0;
  258. }
  259. static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
  260. unsigned long arg)
  261. {
  262. int __user *p = (int __user *)arg;
  263. if (!capable(CAP_SYS_ADMIN))
  264. return -EPERM;
  265. switch (cmd) {
  266. case MCE_GET_RECORD_LEN:
  267. return put_user(sizeof(struct mce), p);
  268. case MCE_GET_LOG_LEN:
  269. return put_user(MCE_LOG_LEN, p);
  270. case MCE_GETCLEAR_FLAGS: {
  271. unsigned flags;
  272. do {
  273. flags = mcelog.flags;
  274. } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
  275. return put_user(flags, p);
  276. }
  277. default:
  278. return -ENOTTY;
  279. }
  280. }
/*
 * Register a notifier on the injector chain; registered callbacks
 * receive every record written to /dev/mcelog (see mce_chrdev_write()).
 */
void mce_register_injector_chain(struct notifier_block *nb)
{
	blocking_notifier_chain_register(&mce_injector_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_register_injector_chain);

/* Remove a notifier previously added with mce_register_injector_chain(). */
void mce_unregister_injector_chain(struct notifier_block *nb)
{
	blocking_notifier_chain_unregister(&mce_injector_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_injector_chain);
/*
 * write() for /dev/mcelog: error injection. Copies a user-supplied
 * struct mce and hands it to the registered injector chain. Requires
 * CAP_SYS_ADMIN and MCE/MCA-capable hardware.
 */
static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
				size_t usize, loff_t *off)
{
	struct mce m;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	/*
	 * There are some cases where real MSR reads could slip
	 * through.
	 */
	if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA))
		return -EIO;

	/*
	 * Clamp to one record. NOTE(review): a short write leaves the tail
	 * of 'm' uninitialized stack data — confirm injectors tolerate it.
	 */
	if ((unsigned long)usize > sizeof(struct mce))
		usize = sizeof(struct mce);
	if (copy_from_user(&m, ubuf, usize))
		return -EFAULT;

	/* The CPU named in the record must exist and be online. */
	if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
		return -EINVAL;

	/*
	 * Need to give user space some time to set everything up,
	 * so do it a jiffie or two later everywhere.
	 * NOTE(review): schedule_timeout() is called without setting the
	 * task state first, so it returns without really sleeping —
	 * confirm whether schedule_timeout_interruptible() was intended.
	 */
	schedule_timeout(2);

	blocking_notifier_call_chain(&mce_injector_chain, 0, &m);

	return usize;
}
/* File operations backing /dev/mcelog. */
static const struct file_operations mce_chrdev_ops = {
	.open			= mce_chrdev_open,
	.release		= mce_chrdev_release,
	.read			= mce_chrdev_read,
	.write			= mce_chrdev_write,
	.poll			= mce_chrdev_poll,
	.unlocked_ioctl		= mce_chrdev_ioctl,
	.llseek			= no_llseek,	/* device is non-seekable */
};
  326. static struct miscdevice mce_chrdev_device = {
  327. MISC_MCELOG_MINOR,
  328. "mcelog",
  329. &mce_chrdev_ops,
  330. };
  331. static __init int dev_mcelog_init_device(void)
  332. {
  333. int err;
  334. /* register character device /dev/mcelog */
  335. err = misc_register(&mce_chrdev_device);
  336. if (err) {
  337. if (err == -EBUSY)
  338. /* Xen dom0 might have registered the device already. */
  339. pr_info("Unable to init device /dev/mcelog, already registered");
  340. else
  341. pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
  342. return err;
  343. }
  344. mce_register_decode_chain(&dev_mcelog_nb);
  345. return 0;
  346. }
  347. device_initcall_sync(dev_mcelog_init_device);