/*
 * Workingset detection
 *
 * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
 */

#include <linux/memcontrol.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>

/*
 *		Double CLOCK lists
 *
 * Per node, two clock lists are maintained for file pages: the
 * inactive and the active list. Freshly faulted pages start out at
 * the head of the inactive list and page reclaim scans pages from the
 * tail. Pages that are accessed multiple times on the inactive list
 * are promoted to the active list, to protect them from reclaim,
 * whereas active pages are demoted to the inactive list when the
 * active list grows too big.
 *
 *   fault ------------------------+
 *                                 |
 *              +--------------+   |            +-------------+
 *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+
 *              +--------------+                +-------------+    |
 *                     |                                           |
 *                     +-------------- promotion ------------------+
 *
 *
 *		Access frequency and refault distance
 *
 * A workload is thrashing when its pages are frequently used but they
 * are evicted from the inactive list every time before another access
 * would have promoted them to the active list.
 *
 * In cases where the average access distance between thrashing pages
 * is bigger than the size of memory there is nothing that can be
 * done - the thrashing set could never fit into memory under any
 * circumstance.
 *
 * However, the average access distance could be bigger than the
 * inactive list, yet smaller than the size of memory. In this case,
 * the set could fit into memory if it weren't for the currently
 * active pages - which may be used more, hopefully less frequently:
 *
 *      +-memory available to cache-+
 *      |                           |
 *      +-inactive------+-active----+
 *  a b | c d e f g h i | J K L M N |
 *      +---------------+-----------+
 *
 * It is prohibitively expensive to accurately track access frequency
 * of pages. But a reasonable approximation can be made to measure
 * thrashing on the inactive list, after which refaulting pages can be
 * activated optimistically to compete with the existing active pages.
 *
 * Approximating inactive page access frequency - Observations:
 *
 * 1. When a page is accessed for the first time, it is added to the
 *    head of the inactive list, slides every existing inactive page
 *    towards the tail by one slot, and pushes the current tail page
 *    out of memory.
 *
 * 2. When a page is accessed for the second time, it is promoted to
 *    the active list, shrinking the inactive list by one slot. This
 *    also slides all inactive pages that were faulted into the cache
 *    more recently than the activated page towards the tail of the
 *    inactive list.
 *
 * Thus:
 *
 * 1. The sum of evictions and activations between any two points in
 *    time indicates the minimum number of inactive pages accessed in
 *    between.
 *
 * 2. Moving one inactive page N page slots towards the tail of the
 *    list requires at least N inactive page accesses.
 *
 * Combining these:
 *
 * 1. When a page is finally evicted from memory, the number of
 *    inactive pages accessed while the page was in cache is at least
 *    the number of page slots on the inactive list.
 *
 * 2. In addition, measuring the sum of evictions and activations (E)
 *    at the time of a page's eviction, and comparing it to another
 *    reading (R) at the time the page faults back into memory tells
 *    the minimum number of accesses while the page was not cached.
 *    This is called the refault distance.
 *
 * Because the first access of the page was the fault and the second
 * access the refault, we combine the in-cache distance with the
 * out-of-cache distance to get the complete minimum access distance
 * of this page:
 *
 *	NR_inactive + (R - E)
 *
 * And knowing the minimum access distance of a page, we can easily
 * tell if the page would be able to stay in cache assuming all page
 * slots in the cache were available:
 *
 *   NR_inactive + (R - E) <= NR_inactive + NR_active
 *
 * which can be further simplified to
 *
 *   (R - E) <= NR_active
 *
 * Put into words, the refault distance (out-of-cache) can be seen as
 * a deficit in inactive list space (in-cache). If the inactive list
 * had (R - E) more page slots, the page would not have been evicted
 * in between accesses, but activated instead. And on a full system,
 * the only thing eating into inactive list space is active pages.
 *
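 * As a worked example (illustrative numbers, not taken from the code
 * or from any measurement): suppose NR_inactive = 1000 and
 * NR_active = 600. A page is evicted while the eviction/activation
 * counter reads E = 5000 and refaults when the counter reads
 * R = 5400. Its refault distance is R - E = 400 and its minimum
 * access distance is NR_inactive + 400 = 1400. Since
 * 1400 <= NR_inactive + NR_active = 1600, or equivalently
 * 400 <= NR_active, the page could have stayed in a cache of this
 * size, so it is activated on refault. Had the counter read R = 5700
 * instead, the distance of 700 would exceed NR_active and the page
 * would go back to the inactive list as usual.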
 *
 *
 *		Activating refaulting pages
 *
 * All that is known about the active list is that the pages have been
 * accessed more than once in the past. This means that at any given
 * time there is actually a good chance that pages on the active list
 * are no longer in active use.
 *
 * So when a refault distance of (R - E) is observed and there are at
 * least (R - E) active pages, the refaulting page is activated
 * optimistically in the hope that (R - E) active pages are actually
 * used less frequently than the refaulting page - or even not used at
 * all anymore.
 *
 * If this is wrong and demotion kicks in, the pages which are truly
 * used more frequently will be reactivated while the less frequently
 * used ones will be evicted from memory.
 *
 * But if this is right, the stale pages will be pushed out of memory
 * and the used pages get to stay in cache.
 *
 *
 *		Implementation
 *
 * For each node's file LRU lists, a counter for inactive evictions
 * and activations is maintained (node->inactive_age).
 *
 * On eviction, a snapshot of this counter (along with some bits to
 * identify the node) is stored in the now empty page cache radix tree
 * slot of the evicted page. This is called a shadow entry.
 *
 * On cache misses for which there are shadow entries, an eligible
 * refault distance will immediately activate the refaulting page.
 */

#define EVICTION_SHIFT	(RADIX_TREE_EXCEPTIONAL_ENTRY + \
			 NODES_SHIFT +	\
			 MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK	(~0UL >> EVICTION_SHIFT)

/*
 * Eviction timestamps need to be able to cover the full range of
 * actionable refaults. However, bits are tight in the radix tree
 * entry, and after storing the identifier for the lruvec there might
 * not be enough left to represent every single actionable refault. In
 * that case, we have to sacrifice granularity for distance, and group
 * evictions into coarser buckets by shaving off lower timestamp bits.
 */
static unsigned int bucket_order __read_mostly;

static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
{
	eviction >>= bucket_order;
	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
	eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
	eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);

	return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
}

static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
			  unsigned long *evictionp)
{
	unsigned long entry = (unsigned long)shadow;
	int memcgid, nid;

	entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
	nid = entry & ((1UL << NODES_SHIFT) - 1);
	entry >>= NODES_SHIFT;
	memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
	entry >>= MEM_CGROUP_ID_SHIFT;

	*memcgidp = memcgid;
	*pgdat = NODE_DATA(nid);
	*evictionp = entry << bucket_order;
}

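/*
 * Illustrative sketch, not part of the original file: a shadow entry
 * packed by pack_shadow() round-trips through unpack_shadow() with
 * only the low bucket_order bits of the eviction timestamp lost; the
 * memcg ID and node are recovered exactly (assuming CONFIG_MEMCG, so
 * that MEM_CGROUP_ID_SHIFT is non-zero). The values are made up.
 */
static void __maybe_unused shadow_pack_example(void)
{
	unsigned long eviction = 123456;	/* arbitrary counter snapshot */
	pg_data_t *pgdat = NODE_DATA(0);	/* assume node 0 is online */
	unsigned long unpacked_eviction;
	pg_data_t *unpacked_pgdat;
	int unpacked_memcgid;
	void *shadow;

	shadow = pack_shadow(42, pgdat, eviction);
	unpack_shadow(shadow, &unpacked_memcgid, &unpacked_pgdat,
		      &unpacked_eviction);

	/*
	 * Now unpacked_memcgid == 42, unpacked_pgdat == pgdat, and
	 * unpacked_eviction == eviction with the low bucket_order
	 * bits cleared.
	 */
}
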
/**
 * workingset_eviction - note the eviction of a page from memory
 * @mapping: address space the page was backing
 * @page: the page being evicted
 *
 * Returns a shadow entry to be stored in @mapping->page_tree in place
 * of the evicted @page so that a later refault can be detected.
 */
void *workingset_eviction(struct address_space *mapping, struct page *page)
{
	struct mem_cgroup *memcg = page_memcg(page);
	struct pglist_data *pgdat = page_pgdat(page);
	int memcgid = mem_cgroup_id(memcg);
	unsigned long eviction;
	struct lruvec *lruvec;

	/* Page is fully exclusive and pins page->mem_cgroup */
	VM_BUG_ON_PAGE(PageLRU(page), page);
	VM_BUG_ON_PAGE(page_count(page), page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	lruvec = mem_cgroup_lruvec(pgdat, memcg);
	eviction = atomic_long_inc_return(&lruvec->inactive_age);
	return pack_shadow(memcgid, pgdat, eviction);
}

/**
 * workingset_refault - evaluate the refault of a previously evicted page
 * @shadow: shadow entry of the evicted page
 *
 * Calculates and evaluates the refault distance of the previously
 * evicted page in the context of the node it was allocated in.
 *
 * Returns %true if the page should be activated, %false otherwise.
 */
bool workingset_refault(void *shadow)
{
	unsigned long refault_distance;
	unsigned long active_file;
	struct mem_cgroup *memcg;
	unsigned long eviction;
	struct lruvec *lruvec;
	unsigned long refault;
	struct pglist_data *pgdat;
	int memcgid;

	unpack_shadow(shadow, &memcgid, &pgdat, &eviction);

	rcu_read_lock();
	/*
	 * Look up the memcg associated with the stored ID. It might
	 * have been deleted since the page's eviction.
	 *
	 * Note that in rare events the ID could have been recycled
	 * for a new cgroup that refaults a shared page. This is
	 * impossible to tell from the available data. However, this
	 * should be a rare and limited disturbance, and activations
	 * are always speculative anyway. Ultimately, it's the aging
	 * algorithm's job to shake out the minimum access frequency
	 * for the active cache.
	 *
	 * XXX: On !CONFIG_MEMCG, this will always return NULL; it
	 * would be better if the root_mem_cgroup existed in all
	 * configurations instead.
	 */
	memcg = mem_cgroup_from_id(memcgid);
	if (!mem_cgroup_disabled() && !memcg) {
		rcu_read_unlock();
		return false;
	}
	lruvec = mem_cgroup_lruvec(pgdat, memcg);
	refault = atomic_long_read(&lruvec->inactive_age);
	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
	rcu_read_unlock();

	/*
	 * The unsigned subtraction here gives an accurate distance
	 * across inactive_age overflows in most cases.
	 *
	 * There is a special case: usually, shadow entries have a
	 * short lifetime and are either refaulted or reclaimed along
	 * with the inode before they get too old. But it is not
	 * impossible for the inactive_age to lap a shadow entry in
	 * the field, which can then result in a false small refault
	 * distance, leading to a false activation should this old
	 * entry actually refault again. However, earlier kernels used
	 * to deactivate unconditionally with *every* reclaim
	 * invocation for the longest time, so the occasional
	 * inappropriate activation leading to pressure on the active
	 * list is not a problem.
	 */
	refault_distance = (refault - eviction) & EVICTION_MASK;

	inc_node_state(pgdat, WORKINGSET_REFAULT);

	if (refault_distance <= active_file) {
		inc_node_state(pgdat, WORKINGSET_ACTIVATE);
		return true;
	}
	return false;
}

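/*
 * Worked example of the wrap-safe subtraction above, with a
 * hypothetical 32-bit EVICTION_MASK of 0xffffffff for readability:
 * an eviction snapshot of 0xfffffff0 and a later inactive_age
 * reading of 0x00000010 give
 *
 *	(0x00000010 - 0xfffffff0) & 0xffffffff == 0x20 == 32,
 *
 * the correct distance across the counter overflow.
 */
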
/**
 * workingset_activation - note a page activation
 * @page: page that is being activated
 */
void workingset_activation(struct page *page)
{
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	rcu_read_lock();
	/*
	 * Filter non-memcg pages here, e.g. unmap can call
	 * mark_page_accessed() on VDSO pages.
	 *
	 * XXX: See workingset_refault() - this should return
	 * root_mem_cgroup even for !CONFIG_MEMCG.
	 */
	memcg = page_memcg_rcu(page);
	if (!mem_cgroup_disabled() && !memcg)
		goto out;
	lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
	atomic_long_inc(&lruvec->inactive_age);
out:
	rcu_read_unlock();
}

/*
 * Shadow entries reflect the share of the working set that does not
 * fit into memory, so their number depends on the access pattern of
 * the workload. In most cases, they will refault or get reclaimed
 * along with the inode, but a (malicious) workload that streams
 * through files with a total size several times that of available
 * memory, while preventing the inodes from being reclaimed, can
 * create excessive amounts of shadow nodes. To keep a lid on this,
 * track shadow nodes and reclaim them when they grow way past the
 * point where they would still be useful.
 */

static struct list_lru shadow_nodes;

void workingset_update_node(struct radix_tree_node *node, void *private)
{
	struct address_space *mapping = private;

	/* Only regular page cache has shadow entries */
	if (dax_mapping(mapping) || shmem_mapping(mapping))
		return;

	/*
	 * Track non-empty nodes that contain only shadow entries;
	 * unlink those that contain pages or are being freed.
	 *
	 * Avoid acquiring the list_lru lock when the nodes are
	 * already where they should be. The list_empty() test is safe
	 * as node->private_list is protected by &mapping->tree_lock.
	 */
	if (node->count && node->count == node->exceptional) {
		if (list_empty(&node->private_list)) {
			node->private_data = mapping;
			list_lru_add(&shadow_nodes, &node->private_list);
		}
	} else {
		if (!list_empty(&node->private_list))
			list_lru_del(&shadow_nodes, &node->private_list);
	}
}

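/*
 * For illustration (hypothetical node states, following the test
 * above): a node with count == 3 and exceptional == 3 holds nothing
 * but shadow entries and is put on the shadow LRU; count == 3 with
 * exceptional == 2 means one slot still holds a page, so the node is
 * kept off (or taken off) the list; count == 0 means the node is
 * empty and about to be freed, so it is unlinked as well.
 */
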
static unsigned long count_shadow_nodes(struct shrinker *shrinker,
					struct shrink_control *sc)
{
	unsigned long max_nodes;
	unsigned long nodes;
	unsigned long cache;

	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
	local_irq_disable();
	nodes = list_lru_shrink_count(&shadow_nodes, sc);
	local_irq_enable();

	/*
	 * Approximate a reasonable limit for the radix tree nodes
	 * containing shadow entries. We don't need to keep more
	 * shadow entries than possible pages on the active list,
	 * since refault distances bigger than that are dismissed.
	 *
	 * The size of the active list converges toward 100% of
	 * overall page cache as memory grows, with only a tiny
	 * inactive list. Assume the total cache size for that.
	 *
	 * Nodes might be sparsely populated, with only one shadow
	 * entry in the extreme case. Obviously, we cannot keep one
	 * node for every eligible shadow entry, so compromise on a
	 * worst-case density of 1/8th. Below that, not all eligible
	 * refaults can be detected anymore.
	 *
	 * On 64-bit with 7 radix_tree_nodes per page and 64 slots
	 * each, this will reclaim shadow entries when they consume
	 * ~1.8% of available memory:
	 *
	 * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE
	 */
	if (sc->memcg) {
		cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
						     LRU_ALL_FILE);
	} else {
		cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
			node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
	}
	max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3);

	if (nodes <= max_nodes)
		return 0;
	return nodes - max_nodes;
}

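/*
 * Worked example of the limit above, assuming 64-bit, 4K pages,
 * RADIX_TREE_MAP_SHIFT == 6 (64 slots per node) and roughly 7 nodes
 * per page (~584 bytes each): cache >> (6 - 3) allows one node per 8
 * cache pages, i.e. at most 584 / 8 bytes of shadow nodes per 4096
 * byte cache page, or about 1.8% of the cache's memory footprint.
 */
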
static enum lru_status shadow_lru_isolate(struct list_head *item,
					  struct list_lru_one *lru,
					  spinlock_t *lru_lock,
					  void *arg)
{
	struct address_space *mapping;
	struct radix_tree_node *node;
	unsigned int i;
	int ret;

	/*
	 * Page cache insertions and deletions synchronously maintain
	 * the shadow node LRU under the mapping->tree_lock and the
	 * lru_lock. Because the page cache tree is emptied before
	 * the inode can be destroyed, holding the lru_lock pins any
	 * address_space that has radix tree nodes on the LRU.
	 *
	 * We can then safely transition to the mapping->tree_lock to
	 * pin only the address_space of the particular node we want
	 * to reclaim, take the node off-LRU, and drop the lru_lock.
	 */
	node = container_of(item, struct radix_tree_node, private_list);
	mapping = node->private_data;

	/* Coming from the list, invert the lock order */
	if (!spin_trylock(&mapping->tree_lock)) {
		spin_unlock(lru_lock);
		ret = LRU_RETRY;
		goto out;
	}

	list_lru_isolate(lru, item);
	spin_unlock(lru_lock);

	/*
	 * The nodes should only contain one or more shadow entries,
	 * no pages, so we expect to be able to remove them all and
	 * delete and free the empty node afterwards.
	 */
	if (WARN_ON_ONCE(!node->exceptional))
		goto out_invalid;
	if (WARN_ON_ONCE(node->count != node->exceptional))
		goto out_invalid;
	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
		if (node->slots[i]) {
			if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i])))
				goto out_invalid;
			if (WARN_ON_ONCE(!node->exceptional))
				goto out_invalid;
			if (WARN_ON_ONCE(!mapping->nrexceptional))
				goto out_invalid;
			node->slots[i] = NULL;
			node->exceptional--;
			node->count--;
			mapping->nrexceptional--;
		}
	}
	if (WARN_ON_ONCE(node->exceptional))
		goto out_invalid;
	inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
	__radix_tree_delete_node(&mapping->page_tree, node,
				 workingset_update_node, mapping);

out_invalid:
	spin_unlock(&mapping->tree_lock);
	ret = LRU_REMOVED_RETRY;
out:
	local_irq_enable();
	cond_resched();
	local_irq_disable();
	spin_lock(lru_lock);
	return ret;
}

static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
				       struct shrink_control *sc)
{
	unsigned long ret;

	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
	local_irq_disable();
	ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
	local_irq_enable();
	return ret;
}

static struct shrinker workingset_shadow_shrinker = {
	.count_objects = count_shadow_nodes,
	.scan_objects = scan_shadow_nodes,
	.seeks = DEFAULT_SEEKS,
	.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
};

/*
 * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
 * mapping->tree_lock.
 */
static struct lock_class_key shadow_nodes_key;

static int __init workingset_init(void)
{
	unsigned int timestamp_bits;
	unsigned int max_order;
	int ret;

	BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
	/*
	 * Calculate the eviction bucket size to cover the longest
	 * actionable refault distance, which is currently half of
	 * memory (totalram_pages/2). However, memory hotplug may add
	 * some more pages at runtime, so keep working with up to
	 * double the initial memory by using totalram_pages as-is.
	 */
	timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
	max_order = fls_long(totalram_pages - 1);
	if (max_order > timestamp_bits)
		bucket_order = max_order - timestamp_bits;
	pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
		timestamp_bits, max_order, bucket_order);

	ret = list_lru_init_key(&shadow_nodes, &shadow_nodes_key);
	if (ret)
		goto err;

	ret = register_shrinker(&workingset_shadow_shrinker);
	if (ret)
		goto err_list_lru;
	return 0;
err_list_lru:
	list_lru_destroy(&shadow_nodes);
err:
	return ret;
}
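
/*
 * Worked example of the bucket sizing above (hypothetical 64-bit
 * configuration, not taken from this file): with NODES_SHIFT == 6,
 * MEM_CGROUP_ID_SHIFT == 16 and two low bits reserved for the
 * exceptional-entry tag, EVICTION_SHIFT is 24 and timestamp_bits is
 * 40. Only a machine with more than 2^40 pages (4 PiB at 4K pages)
 * would make max_order exceed timestamp_bits, so bucket_order
 * normally remains 0 and no timestamp bits are shaved off.
 */
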
module_init(workingset_init);