@@ -2006,6 +2006,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
  * Both inactive lists should also be large enough that each inactive
  * page has a chance to be referenced again before it is reclaimed.
  *
+ * If that fails and refaulting is observed, the inactive list grows.
+ *
  * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
  * on this LRU, maintained by the pageout code. A zone->inactive_ratio
  * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
@@ -2022,12 +2024,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
  *    10TB     320        32GB
  */
 static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
-				 struct scan_control *sc, bool trace)
+				 struct mem_cgroup *memcg,
+				 struct scan_control *sc, bool actual_reclaim)
 {
-	unsigned long inactive_ratio;
-	unsigned long inactive, active;
-	enum lru_list inactive_lru = file * LRU_FILE;
 	enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+	enum lru_list inactive_lru = file * LRU_FILE;
+	unsigned long inactive, active;
+	unsigned long inactive_ratio;
+	unsigned long refaults;
 	unsigned long gb;
 
 	/*
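As a cross-check on the sizing table above: the ratio comes from inactive_ratio = int_sqrt(10 * gb), and the inactive share is total / (1 + ratio). A minimal userspace sketch, using floating-point sqrt() as a stand-in for the kernel's int_sqrt() (which floors the result):

	#include <math.h>
	#include <stdio.h>

	int main(void)
	{
		/* total memory in GB; 10240GB = 10TB, the last row of the table */
		unsigned long sizes[] = { 1, 10, 100, 1024, 10240 };

		for (int i = 0; i < 5; i++) {
			unsigned long gb = sizes[i];
			unsigned long ratio = gb ? (unsigned long)sqrt(10.0 * gb) : 1;

			/* inactive + active == total, active == ratio * inactive */
			printf("%7luGB  ratio %4lu  inactive ~%luMB\n",
			       gb, ratio, gb * 1024 / (1 + ratio));
		}
		return 0;
	}

For 10TB this prints a ratio of 320 and ~32665MB of inactive list, matching the "10TB / 320 / 32GB" row; for 1GB it prints a ratio of 3 and ~256MB, i.e. the 3:1 / 25% split described in the comment.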
@@ -2040,27 +2045,43 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 	inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
 	active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
 
-	gb = (inactive + active) >> (30 - PAGE_SHIFT);
-	if (gb)
-		inactive_ratio = int_sqrt(10 * gb);
+	if (memcg)
+		refaults = mem_cgroup_read_stat(memcg,
+						MEMCG_WORKINGSET_ACTIVATE);
 	else
-		inactive_ratio = 1;
+		refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
+
+	/*
+	 * When refaults are being observed, it means a new workingset
+	 * is being established. Disable active list protection to get
+	 * rid of the stale workingset quickly.
+	 */
+	if (file && actual_reclaim && lruvec->refaults != refaults) {
+		inactive_ratio = 0;
+	} else {
+		gb = (inactive + active) >> (30 - PAGE_SHIFT);
+		if (gb)
+			inactive_ratio = int_sqrt(10 * gb);
+		else
+			inactive_ratio = 1;
+	}
 
-	if (trace)
-		trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id,
-			sc->reclaim_idx,
-			lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
-			lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
-			inactive_ratio, file);
+	if (actual_reclaim)
+		trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
+			lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
+			lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
+			inactive_ratio, file);
 
 	return inactive * inactive_ratio < active;
 }
 
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
-				 struct lruvec *lruvec, struct scan_control *sc)
+				 struct lruvec *lruvec, struct mem_cgroup *memcg,
+				 struct scan_control *sc)
 {
 	if (is_active_lru(lru)) {
-		if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
+		if (inactive_list_is_low(lruvec, is_file_lru(lru),
+					 memcg, sc, true))
 			shrink_active_list(nr_to_scan, lruvec, sc, lru);
 		return 0;
 	}
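The effect of inactive_ratio = 0 is worth spelling out: the function returns inactive * inactive_ratio < active, so a zero ratio makes any nonempty active list register as "inactive is low", and shrink_list() then keeps calling shrink_active_list() until the refaulting stops. A standalone sketch of the predicate in isolation (illustrative only):

	#include <stdbool.h>
	#include <stdio.h>

	/* The final check in inactive_list_is_low(), lifted out. */
	static bool is_low(unsigned long inactive, unsigned long active,
			   unsigned long ratio)
	{
		return inactive * ratio < active;
	}

	int main(void)
	{
		printf("%d\n", is_low(100, 301, 3));	/* 1: inactive just under 25% */
		printf("%d\n", is_low(100, 299, 3));	/* 0: inactive list large enough */
		printf("%d\n", is_low(1000, 1, 0));	/* 1: ratio 0 always deactivates */
		return 0;
	}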
@@ -2169,7 +2190,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * lruvec even if it has plenty of old anonymous pages unless the
 	 * system is under heavy pressure.
 	 */
-	if (!inactive_list_is_low(lruvec, true, sc, false) &&
+	if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
 	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
 		scan_balance = SCAN_FILE;
 		goto out;
@@ -2320,7 +2341,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
 				nr[lru] -= nr_to_scan;
 
 				nr_reclaimed += shrink_list(lru, nr_to_scan,
-							    lruvec, sc);
+							    lruvec, memcg, sc);
 			}
 		}
 
@@ -2387,7 +2408,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
 	 * Even if we did not try to evict anon pages at all, we want to
 	 * rebalance the anon lru active/inactive ratio.
 	 */
-	if (inactive_list_is_low(lruvec, false, sc, true))
+	if (inactive_list_is_low(lruvec, false, memcg, sc, true))
 		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
 				   sc, LRU_ACTIVE_ANON);
 }
@@ -2703,6 +2724,26 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 	sc->gfp_mask = orig_mask;
 }
 
+static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
+{
+	struct mem_cgroup *memcg;
+
+	memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
+	do {
+		unsigned long refaults;
+		struct lruvec *lruvec;
+
+		if (memcg)
+			refaults = mem_cgroup_read_stat(memcg,
+						MEMCG_WORKINGSET_ACTIVATE);
+		else
+			refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
+
+		lruvec = mem_cgroup_lruvec(pgdat, memcg);
+		lruvec->refaults = refaults;
+	} while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
+}
+
 /*
  * This is the main entry point to direct page reclaim.
  *
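snapshot_refaults() is the other half of the detection in inactive_list_is_low(): the WORKINGSET_ACTIVATE counter only ever grows, so storing its value at the end of a reclaim cycle lets the next cycle answer "any refaults activated since last time?" with a single comparison, with no counter reset needed. (The lruvec->refaults field itself is added elsewhere in the patch, outside this excerpt.) A minimal model of the protocol, with illustrative names rather than kernel API:

	/* A monotonically growing event counter, standing in for the
	 * per-node/per-memcg WORKINGSET_ACTIVATE statistic. */
	static unsigned long activate_events;

	struct lruvec_model {
		unsigned long refaults;	/* snapshot from the last cycle */
	};

	/* inactive_list_is_low() side: true iff any events occurred
	 * since the last snapshot. */
	static int refaulting(struct lruvec_model *lruvec)
	{
		return lruvec->refaults != activate_events;
	}

	/* snapshot_refaults() side: run at the end of a reclaim cycle. */
	static void snapshot(struct lruvec_model *lruvec)
	{
		lruvec->refaults = activate_events;
	}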
@@ -2723,6 +2764,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 					  struct scan_control *sc)
 {
 	int initial_priority = sc->priority;
+	pg_data_t *last_pgdat;
+	struct zoneref *z;
+	struct zone *zone;
 retry:
 	delayacct_freepages_start();
 
@@ -2749,6 +2793,15 @@ retry:
 			sc->may_writepage = 1;
 	} while (--sc->priority >= 0);
 
+	last_pgdat = NULL;
+	for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
+					sc->nodemask) {
+		if (zone->zone_pgdat == last_pgdat)
+			continue;
+		last_pgdat = zone->zone_pgdat;
+		snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
+	}
+
 	delayacct_freepages_end();
 
 	if (sc->nr_reclaimed)
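The walk after the priority loop snapshots each node once: in a node-ordered zonelist, zones belonging to the same node are adjacent, so remembering the last pgdat visited is enough to skip duplicates (and a repeated call would be harmless anyway, since snapshot_refaults() just rewrites the same values). A toy model of the dedup pattern, with illustrative names rather than kernel API:

	#include <stdio.h>

	struct pgdat { int node_id; };
	struct zone { struct pgdat *pgdat; };

	int main(void)
	{
		struct pgdat n0 = { 0 }, n1 = { 1 };
		/* node-ordered zonelist: both of node 0's zones first */
		struct zone zonelist[] = { { &n0 }, { &n0 }, { &n1 } };
		struct pgdat *last = NULL;

		for (int i = 0; i < 3; i++) {
			if (zonelist[i].pgdat == last)
				continue;	/* same node as previous zone */
			last = zonelist[i].pgdat;
			printf("snapshot node %d\n", last->node_id);	/* 0, then 1 */
		}
		return 0;
	}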
@@ -3033,7 +3086,7 @@ static void age_active_anon(struct pglist_data *pgdat,
 	do {
 		struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
 
-		if (inactive_list_is_low(lruvec, false, sc, true))
+		if (inactive_list_is_low(lruvec, false, memcg, sc, true))
 			shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
 					   sc, LRU_ACTIVE_ANON);
 
@@ -3280,6 +3333,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		pgdat->kswapd_failures++;
 
 out:
+	snapshot_refaults(NULL, pgdat);
 	/*
 	 * Return the order kswapd stopped reclaiming at as
 	 * prepare_kswapd_sleep() takes it into account. If another caller
|