#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel.h>

#include <asm/mce.h>

#include "debugfs.h"

/*
 * RAS Correctable Errors Collector
 *
 * This is a simple gadget which collects correctable errors and counts their
 * occurrence per physical page address.
 *
 * We've opted for possibly the simplest data structure to collect those - an
 * array of the size of a memory page. It stores 512 u64's with the following
 * structure:
 *
 * [63 ... PFN ... 12 | 11 ... generation ... 10 | 9 ... count ... 0]
 *
 * The generation field occupies the two highest order bits of the count part
 * (bits 11:10) and is set to 11b on every insertion. During the course of
 * each entry's existence, the generation field gets decremented during spring
 * cleaning to 10b, then 01b and then 00b.
 *
 * This way we're employing the natural numeric ordering to make sure that
 * newly inserted/touched elements have higher 12-bit counts (which we've
 * manufactured) and thus iterating over the array initially won't kick out
 * those elements which were inserted last.
 *
 * Spring cleaning is what we do when we reach a certain number CLEAN_ELEMS of
 * elements entered into the array, during which, we're decaying all elements.
 * If, after decay, an element gets inserted again, its generation is set to
 * 11b to make sure it has a higher numerical count than other, older elements
 * and thus emulate an LRU-like behavior when deleting elements to free up
 * space in the page.
 *
 * When an element reaches its max count of count_threshold, we try to poison
 * it by assuming that errors triggered count_threshold times in a single page
 * are excessive and that page shouldn't be used anymore. count_threshold is
 * initialized to COUNT_MASK which is the maximum.
 *
 * That error event entry causes cec_add_elem() to return !0 value and thus
 * signal to its callers to log the error.
 *
 * To the question why we've chosen a page and moving elements around with
 * memmove(), it is because it is a very simple structure to handle and max
 * data movement is 4K which on highly optimized modern CPUs is almost
 * unnoticeable. We wanted to avoid the pointer traversal of more complex
 * structures like a linked list or some sort of a balancing search tree.
 *
 * Deleting an element takes O(n) but since it is only a single page, it
 * should be fast enough and it shouldn't happen all too often depending on
 * error patterns.
 */
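
/*
 * Worked example of the element encoding (illustrative only; it assumes 4K
 * pages, i.e. PAGE_SHIFT == 12, which makes COUNT_BITS == 10):
 *
 * a freshly inserted element for PFN 0x12345 is stored as
 *
 *	(0x12345 << 12) | (0x3 << 10) | 0x1 == 0x12345c01
 *
 * i.e. the PFN in bits 63:12, generation 11b in bits 11:10 and an error count
 * of 1 in bits 9:0. After one spring cleaning without another access, the
 * generation drops to 10b and the stored value becomes 0x12345801.
 */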

#undef pr_fmt
#define pr_fmt(fmt) "RAS: " fmt

/*
 * We use DECAY_BITS bits of PAGE_SHIFT bits for counting decay, i.e., how long
 * elements have stayed in the array without having been accessed again.
 */
#define DECAY_BITS		2
#define DECAY_MASK		((1ULL << DECAY_BITS) - 1)
#define MAX_ELEMS		(PAGE_SIZE / sizeof(u64))

/*
 * Threshold amount of inserted elements after which we start spring
 * cleaning.
 */
#define CLEAN_ELEMS		(MAX_ELEMS >> DECAY_BITS)

/* Bits which count the number of errors happened in this 4K page. */
#define COUNT_BITS		(PAGE_SHIFT - DECAY_BITS)
#define COUNT_MASK		((1ULL << COUNT_BITS) - 1)
#define FULL_COUNT_MASK		(PAGE_SIZE - 1)

/*
 * u64: [ 63 ... 12 | DECAY_BITS | COUNT_BITS ]
 */
#define PFN(e)			((e) >> PAGE_SHIFT)
#define DECAY(e)		(((e) >> COUNT_BITS) & DECAY_MASK)
#define COUNT(e)		((unsigned int)(e) & COUNT_MASK)
#define FULL_COUNT(e)		((e) & (PAGE_SIZE - 1))
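
/*
 * For reference, with the common 4K page size (an assumption for this
 * example, not something the definitions above require), these work out to:
 *
 *	MAX_ELEMS	= 4096 / 8	= 512 elements per page
 *	CLEAN_ELEMS	= 512 >> 2	= 128 insertions between spring cleanings
 *	COUNT_BITS	= 12 - 2	= 10
 *	COUNT_MASK	= 0x3ff		(max per-page error count of 1023)
 *	FULL_COUNT_MASK	= 0xfff
 */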

static struct ce_array {
	u64 *array;			/* container page */
	unsigned int n;			/* number of elements in the array */

	unsigned int decay_count;	/*
					 * number of element insertions/increments
					 * since the last spring cleaning.
					 */

	u64 pfns_poisoned;		/*
					 * number of PFNs which got poisoned.
					 */

	u64 ces_entered;		/*
					 * The number of correctable errors
					 * entered into the collector.
					 */

	u64 decays_done;		/*
					 * Times we did spring cleaning.
					 */

	union {
		struct {
			__u32	disabled : 1,	/* cmdline disabled */
				__resv   : 31;
		};
		__u32 flags;
	};
} ce_arr;

static DEFINE_MUTEX(ce_mutex);
static u64 dfs_pfn;

/* Amount of errors after which we offline */
static u64 count_threshold = COUNT_MASK;

/*
 * The timer "decays" element count each timer_interval which is 24hrs by
 * default.
 */
#define CEC_TIMER_DEFAULT_INTERVAL	(24 * 60 * 60)		/* 24 hrs */
#define CEC_TIMER_MIN_INTERVAL		 (1 * 60 * 60)		/* 1h */
#define CEC_TIMER_MAX_INTERVAL	   (30 * 24 * 60 * 60)		/* one month */
static struct timer_list cec_timer;
static u64 timer_interval = CEC_TIMER_DEFAULT_INTERVAL;

/*
 * Decrement decay value. We're using DECAY_BITS bits to denote decay of an
 * element in the array. On insertion and any access, it gets reset to max.
 */
static void do_spring_cleaning(struct ce_array *ca)
{
	int i;

	for (i = 0; i < ca->n; i++) {
		u8 decay = DECAY(ca->array[i]);

		if (!decay)
			continue;

		decay--;

		ca->array[i] &= ~(DECAY_MASK << COUNT_BITS);
		ca->array[i] |= (decay << COUNT_BITS);
	}
	ca->decay_count = 0;
	ca->decays_done++;
}

/*
 * @interval in seconds
 */
static void cec_mod_timer(struct timer_list *t, unsigned long interval)
{
	unsigned long iv;

	iv = interval * HZ + jiffies;

	mod_timer(t, round_jiffies(iv));
}

static void cec_timer_fn(unsigned long data)
{
	struct ce_array *ca = (struct ce_array *)data;

	do_spring_cleaning(ca);

	cec_mod_timer(&cec_timer, timer_interval);
}

/*
 * @to: index of the smallest element which is >= @pfn.
 *
 * Return the index of the pfn if found, otherwise negative value.
 */
static int __find_elem(struct ce_array *ca, u64 pfn, unsigned int *to)
{
	u64 this_pfn;
	int min = 0, max = ca->n;

	while (min < max) {
		int tmp = (max + min) >> 1;

		this_pfn = PFN(ca->array[tmp]);

		if (this_pfn < pfn)
			min = tmp + 1;
		else if (this_pfn > pfn)
			max = tmp;
		else {
			min = tmp;
			break;
		}
	}

	if (to)
		*to = min;

	/* @pfn is larger than all stored elements: don't read past the end. */
	if (min == ca->n)
		return -ENOKEY;

	this_pfn = PFN(ca->array[min]);

	if (this_pfn == pfn)
		return min;

	return -ENOKEY;
}
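
/*
 * Illustrative example of the lookup semantics (a sketch, assuming the array
 * holds exactly the PFNs 0x10, 0x20 and 0x30, i.e. ca->n == 3):
 *
 *	__find_elem(ca, 0x20, &to)	returns 1, to == 1 (exact hit)
 *	__find_elem(ca, 0x25, &to)	returns -ENOKEY, to == 2, i.e. the
 *					slot where 0x25 would be inserted
 *	__find_elem(ca, 0x40, &to)	returns -ENOKEY, to == 3 == ca->n,
 *					i.e. append at the end
 *
 * cec_add_elem() relies on @to being the insertion point when the PFN is not
 * found.
 */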

static int find_elem(struct ce_array *ca, u64 pfn, unsigned int *to)
{
	WARN_ON(!to);

	if (!ca->n) {
		*to = 0;
		return -ENOKEY;
	}
	return __find_elem(ca, pfn, to);
}

static void del_elem(struct ce_array *ca, int idx)
{
	/* Save us a function call when deleting the last element. */
	if (ca->n - (idx + 1))
		memmove((void *)&ca->array[idx],
			(void *)&ca->array[idx + 1],
			(ca->n - (idx + 1)) * sizeof(u64));

	ca->n--;
}

static u64 del_lru_elem_unlocked(struct ce_array *ca)
{
	unsigned int min = FULL_COUNT_MASK;
	int i, min_idx = 0;
	u64 pfn;

	for (i = 0; i < ca->n; i++) {
		unsigned int this = FULL_COUNT(ca->array[i]);

		if (min > this) {
			min = this;
			min_idx = i;
		}
	}

	/*
	 * Grab the PFN before del_elem() shifts the following elements over
	 * the deleted slot.
	 */
	pfn = PFN(ca->array[min_idx]);

	del_elem(ca, min_idx);

	return pfn;
}

/*
 * We return the 0th pfn in the error case under the assumption that it cannot
 * be poisoned and excessive CEs in there are a serious deal anyway.
 */
static u64 __maybe_unused del_lru_elem(void)
{
	struct ce_array *ca = &ce_arr;
	u64 pfn;

	if (!ca->n)
		return 0;

	mutex_lock(&ce_mutex);
	pfn = del_lru_elem_unlocked(ca);
	mutex_unlock(&ce_mutex);

	return pfn;
}

int cec_add_elem(u64 pfn)
{
	struct ce_array *ca = &ce_arr;
	unsigned int to;
	int count, ret = 0;

	/*
	 * We can be called very early on the identify_cpu() path where we are
	 * not initialized yet. We ignore the error for simplicity.
	 */
	if (!ce_arr.array || ce_arr.disabled)
		return -ENODEV;

	mutex_lock(&ce_mutex);

	ca->ces_entered++;

	if (ca->n == MAX_ELEMS)
		WARN_ON(!del_lru_elem_unlocked(ca));

	ret = find_elem(ca, pfn, &to);
	if (ret < 0) {
		/*
		 * Shift range [to-end] to make room for one more element.
		 */
		memmove((void *)&ca->array[to + 1],
			(void *)&ca->array[to],
			(ca->n - to) * sizeof(u64));

		ca->array[to] = (pfn << PAGE_SHIFT) |
				(DECAY_MASK << COUNT_BITS) | 1;
		ca->n++;

		ret = 0;

		goto decay;
	}

	count = COUNT(ca->array[to]);

	if (count < count_threshold) {
		ca->array[to] |= (DECAY_MASK << COUNT_BITS);
		ca->array[to]++;

		ret = 0;
	} else {
		u64 pfn = ca->array[to] >> PAGE_SHIFT;

		if (!pfn_valid(pfn)) {
			pr_warn("CEC: Invalid pfn: 0x%llx\n", pfn);
		} else {
			/* We have reached max count for this page, soft-offline it. */
			pr_err("Soft-offlining pfn: 0x%llx\n", pfn);
			memory_failure_queue(pfn, 0, MF_SOFT_OFFLINE);
			ca->pfns_poisoned++;
		}

		del_elem(ca, to);

		/*
		 * Return a >0 value to denote that we've reached the offlining
		 * threshold.
		 */
		ret = 1;

		goto unlock;
	}

decay:
	ca->decay_count++;

	if (ca->decay_count >= CLEAN_ELEMS)
		do_spring_cleaning(ca);

unlock:
	mutex_unlock(&ce_mutex);

	return ret;
}
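
/*
 * How callers are expected to use the return value of cec_add_elem()
 * (an illustrative summary, not taken from a real caller; the actual users
 * live in the MCE code which feeds error PFNs into the collector):
 *
 *	ret = cec_add_elem(pfn);
 *
 *	ret > 0:  count_threshold was reached and the page was queued for
 *		  soft-offlining - log the error.
 *	ret == 0: the error was collected and counted here - callers may
 *		  skip logging it.
 *	ret < 0:  the collector is disabled or not initialized yet - handle
 *		  and log the error as usual.
 */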

static int u64_get(void *data, u64 *val)
{
	*val = *(u64 *)data;

	return 0;
}

static int pfn_set(void *data, u64 val)
{
	*(u64 *)data = val;

	return cec_add_elem(val);
}

DEFINE_DEBUGFS_ATTRIBUTE(pfn_ops, u64_get, pfn_set, "0x%llx\n");

static int decay_interval_set(void *data, u64 val)
{
	if (val < CEC_TIMER_MIN_INTERVAL)
		return -EINVAL;

	if (val > CEC_TIMER_MAX_INTERVAL)
		return -EINVAL;

	*(u64 *)data   = val;
	timer_interval = val;

	cec_mod_timer(&cec_timer, timer_interval);
	return 0;
}

DEFINE_DEBUGFS_ATTRIBUTE(decay_interval_ops, u64_get, decay_interval_set, "%lld\n");

static int count_threshold_set(void *data, u64 val)
{
	if (val > COUNT_MASK)
		val = COUNT_MASK;

	*(u64 *)data    = val;
	count_threshold = val;

	return 0;
}

DEFINE_DEBUGFS_ATTRIBUTE(count_threshold_ops, u64_get, count_threshold_set, "%lld\n");

static int array_dump(struct seq_file *m, void *v)
{
	struct ce_array *ca = &ce_arr;
	u64 prev = 0;
	int i;

	mutex_lock(&ce_mutex);

	seq_printf(m, "{ n: %d\n", ca->n);

	for (i = 0; i < ca->n; i++) {
		u64 this = PFN(ca->array[i]);

		seq_printf(m, " %03d: [%016llx|%03llx]\n", i, this, FULL_COUNT(ca->array[i]));

		WARN_ON(prev > this);

		prev = this;
	}

	seq_printf(m, "}\n");

	seq_printf(m, "Stats:\nCEs: %llu\nofflined pages: %llu\n",
		   ca->ces_entered, ca->pfns_poisoned);

	seq_printf(m, "Flags: 0x%x\n", ca->flags);

	seq_printf(m, "Timer interval: %lld seconds\n", timer_interval);
	seq_printf(m, "Decays: %lld\n", ca->decays_done);

	seq_printf(m, "Action threshold: %lld\n", count_threshold);

	mutex_unlock(&ce_mutex);

	return 0;
}

static int array_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, array_dump, NULL);
}

static const struct file_operations array_ops = {
	.owner	 = THIS_MODULE,
	.open	 = array_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};

static int __init create_debugfs_nodes(void)
{
	struct dentry *d, *pfn, *decay, *count, *array;

	d = debugfs_create_dir("cec", ras_debugfs_dir);
	if (!d) {
		pr_warn("Error creating cec debugfs node!\n");
		return -1;
	}

	pfn = debugfs_create_file("pfn", S_IRUSR | S_IWUSR, d, &dfs_pfn, &pfn_ops);
	if (!pfn) {
		pr_warn("Error creating pfn debugfs node!\n");
		goto err;
	}

	array = debugfs_create_file("array", S_IRUSR, d, NULL, &array_ops);
	if (!array) {
		pr_warn("Error creating array debugfs node!\n");
		goto err;
	}

	decay = debugfs_create_file("decay_interval", S_IRUSR | S_IWUSR, d,
				    &timer_interval, &decay_interval_ops);
	if (!decay) {
		pr_warn("Error creating decay_interval debugfs node!\n");
		goto err;
	}

	count = debugfs_create_file("count_threshold", S_IRUSR | S_IWUSR, d,
				    &count_threshold, &count_threshold_ops);
	if (!count) {
		pr_warn("Error creating count_threshold debugfs node!\n");
		goto err;
	}

	return 0;

err:
	debugfs_remove_recursive(d);

	return 1;
}

void __init cec_init(void)
{
	if (ce_arr.disabled)
		return;

	ce_arr.array = (void *)get_zeroed_page(GFP_KERNEL);
	if (!ce_arr.array) {
		pr_err("Error allocating CE array page!\n");
		return;
	}

	if (create_debugfs_nodes())
		return;

	setup_timer(&cec_timer, cec_timer_fn, (unsigned long)&ce_arr);
	cec_mod_timer(&cec_timer, CEC_TIMER_DEFAULT_INTERVAL);

	pr_info("Correctable Errors collector initialized.\n");
}

int __init parse_cec_param(char *str)
{
	if (!str)
		return 0;

	if (*str == '=')
		str++;

	if (!strncmp(str, "cec_disable", strlen("cec_disable")))
		ce_arr.disabled = 1;
	else
		return 0;

	return 1;
}
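
/*
 * Usage note (a sketch, assuming this parser is wired up to the "ras=" boot
 * parameter handling in the RAS core): booting with
 *
 *	ras=cec_disable
 *
 * sets ce_arr.disabled and makes both cec_init() and cec_add_elem() bail out
 * early, effectively turning the collector off.
 */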