@@ -121,7 +121,7 @@
  * the only thing eating into inactive list space is active pages.
  *
  *
- *		Activating refaulting pages
+ *		Refaulting inactive pages
  *
  * All that is known about the active list is that the pages have been
  * accessed more than once in the past. This means that at any given
@@ -134,6 +134,10 @@
  * used less frequently than the refaulting page - or even not used at
  * all anymore.
  *
+ * That means if inactive cache is refaulting with a suitable refault
+ * distance, we assume the cache workingset is transitioning and put
+ * pressure on the current active list.
+ *
  * If this is wrong and demotion kicks in, the pages which are truly
  * used more frequently will be reactivated while the less frequently
  * used once will be evicted from memory.
@@ -141,6 +145,14 @@
  * But if this is right, the stale pages will be pushed out of memory
  * and the used pages get to stay in cache.
  *
+ *		Refaulting active pages
+ *
+ * If on the other hand the refaulting pages have recently been
+ * deactivated, it means that the active list is no longer protecting
+ * actively used cache from reclaim. The cache is NOT transitioning to
+ * a different workingset; the existing workingset is thrashing in the
+ * space allocated to the page cache.
+ *
  *
  *		Implementation
  *
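
To make the two refault classes described above concrete: a refault within a
small enough distance on a page that was never active signals a workingset
transition, while the same refault on a previously-active page signals
thrashing. A minimal standalone sketch of that classification (illustrative
only; the struct, enum, and function names here are invented, not kernel code):

#include <stdbool.h>

/* Simplified stand-ins for the state workingset_refault() consults. */
struct refault_info {
	unsigned long refault_distance;	/* from the shadow entry */
	unsigned long active_file;	/* size of the active file list */
	bool workingset;		/* was the page active before eviction? */
};

enum refault_class { REFAULT_COLD, REFAULT_TRANSITION, REFAULT_THRASHING };

static enum refault_class classify(const struct refault_info *ri)
{
	/* Too distant: couldn't have stayed resident even with all of
	 * memory given to the page cache. */
	if (ri->refault_distance > ri->active_file)
		return REFAULT_COLD;
	/* Previously active page coming back: the existing workingset
	 * is thrashing, not transitioning. */
	if (ri->workingset)
		return REFAULT_THRASHING;
	/* Inactive cache refaulting within reach: assume a transition. */
	return REFAULT_TRANSITION;
}

int main(void)
{
	struct refault_info ri = {
		.refault_distance = 100,
		.active_file = 500,
		.workingset = false,
	};
	return classify(&ri) == REFAULT_TRANSITION ? 0 : 1;
}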
@@ -156,8 +168,7 @@
  */

 #define EVICTION_SHIFT	(RADIX_TREE_EXCEPTIONAL_ENTRY +	\
-			 NODES_SHIFT +	\
-			 MEM_CGROUP_ID_SHIFT)
+			 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
 #define EVICTION_MASK	(~0UL >> EVICTION_SHIFT)

 /*
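
The new "1 +" term reserves one shadow-entry bit for the workingset flag
packed below, paid for with one bit of eviction-counter precision. A quick
standalone check of the remaining counter width, using assumed
(config-dependent, illustrative) values for the shift constants:

#include <stdio.h>

#define BITS_PER_LONG			64	/* 64-bit build assumed */
#define RADIX_TREE_EXCEPTIONAL_ENTRY	2	/* assumed value */
#define NODES_SHIFT			10	/* assumed CONFIG_NODES_SHIFT */
#define MEM_CGROUP_ID_SHIFT		16	/* assumed value */

#define EVICTION_SHIFT	(RADIX_TREE_EXCEPTIONAL_ENTRY +	\
			 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)

int main(void)
{
	/* With these values: 64 - (2 + 1 + 10 + 16) = 35 bits remain
	 * for the eviction counter before bucketing. */
	printf("eviction counter bits: %d\n", BITS_PER_LONG - EVICTION_SHIFT);
	return 0;
}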
@@ -170,23 +181,28 @@
  */
 static unsigned int bucket_order __read_mostly;

-static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
+static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
+			 bool workingset)
 {
 	eviction >>= bucket_order;
 	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
 	eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
+	eviction = (eviction << 1) | workingset;
 	eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);

 	return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
 }

 static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
-			  unsigned long *evictionp)
+			  unsigned long *evictionp, bool *workingsetp)
 {
 	unsigned long entry = (unsigned long)shadow;
 	int memcgid, nid;
+	bool workingset;

 	entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
+	workingset = entry & 1;
+	entry >>= 1;
 	nid = entry & ((1UL << NODES_SHIFT) - 1);
 	entry >>= NODES_SHIFT;
 	memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
@@ -195,6 +211,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
 	*memcgidp = memcgid;
 	*pgdat = NODE_DATA(nid);
 	*evictionp = entry << bucket_order;
+	*workingsetp = workingset;
 }

 /**
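
A standalone userspace model of the updated pack/unpack scheme, showing that
the new workingset bit roundtrips without disturbing the other fields (shift
values are assumed for illustration; bucket_order is taken as zero):

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define EXCEPTIONAL_SHIFT	2	/* assumed radix tree tag bits */
#define EXCEPTIONAL_ENTRY	2
#define NODES_SHIFT		10	/* assumed */
#define MEM_CGROUP_ID_SHIFT	16	/* assumed */

static void *pack_shadow(int memcgid, int nid, unsigned long eviction,
			 bool workingset)
{
	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
	eviction = (eviction << NODES_SHIFT) | nid;
	eviction = (eviction << 1) | workingset;
	eviction = (eviction << EXCEPTIONAL_SHIFT);
	return (void *)(eviction | EXCEPTIONAL_ENTRY);
}

static void unpack_shadow(void *shadow, int *memcgid, int *nid,
			  unsigned long *eviction, bool *workingset)
{
	unsigned long entry = (unsigned long)shadow;

	entry >>= EXCEPTIONAL_SHIFT;
	*workingset = entry & 1;
	entry >>= 1;
	*nid = entry & ((1UL << NODES_SHIFT) - 1);
	entry >>= NODES_SHIFT;
	*memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
	entry >>= MEM_CGROUP_ID_SHIFT;
	*eviction = entry;
}

int main(void)
{
	unsigned long eviction;
	int memcgid, nid;
	bool workingset;

	void *shadow = pack_shadow(42, 3, 123456, true);

	unpack_shadow(shadow, &memcgid, &nid, &eviction, &workingset);
	assert(memcgid == 42 && nid == 3);
	assert(eviction == 123456 && workingset);
	printf("shadow entry roundtrip ok\n");
	return 0;
}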
@@ -207,8 +224,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
  */
 void *workingset_eviction(struct address_space *mapping, struct page *page)
 {
-	struct mem_cgroup *memcg = page_memcg(page);
 	struct pglist_data *pgdat = page_pgdat(page);
+	struct mem_cgroup *memcg = page_memcg(page);
 	int memcgid = mem_cgroup_id(memcg);
 	unsigned long eviction;
 	struct lruvec *lruvec;
@@ -220,30 +237,30 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)

 	lruvec = mem_cgroup_lruvec(pgdat, memcg);
 	eviction = atomic_long_inc_return(&lruvec->inactive_age);
-	return pack_shadow(memcgid, pgdat, eviction);
+	return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
 }

 /**
  * workingset_refault - evaluate the refault of a previously evicted page
+ * @page: the freshly allocated replacement page
  * @shadow: shadow entry of the evicted page
  *
  * Calculates and evaluates the refault distance of the previously
  * evicted page in the context of the node it was allocated in.
- *
- * Returns %true if the page should be activated, %false otherwise.
  */
-bool workingset_refault(void *shadow)
+void workingset_refault(struct page *page, void *shadow)
 {
 	unsigned long refault_distance;
+	struct pglist_data *pgdat;
 	unsigned long active_file;
 	struct mem_cgroup *memcg;
 	unsigned long eviction;
 	struct lruvec *lruvec;
 	unsigned long refault;
-	struct pglist_data *pgdat;
+	bool workingset;
 	int memcgid;

-	unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
+	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);

 	rcu_read_lock();
 	/*
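
With the bool return type gone, the activation decision no longer flows back
to the caller; workingset_refault() marks the page itself. Schematically, a
call site would change along these lines (a hypothetical fragment derived from
the removed "Returns %true if the page should be activated" contract, not one
of this patch's hunks):

-	if (shadow && workingset_refault(shadow))
-		SetPageActive(page);
+	if (shadow)
+		workingset_refault(page, shadow);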
@@ -263,41 +280,51 @@ bool workingset_refault(void *shadow)
 	 * configurations instead.
 	 */
 	memcg = mem_cgroup_from_id(memcgid);
-	if (!mem_cgroup_disabled() && !memcg) {
-		rcu_read_unlock();
-		return false;
-	}
+	if (!mem_cgroup_disabled() && !memcg)
+		goto out;
 	lruvec = mem_cgroup_lruvec(pgdat, memcg);
 	refault = atomic_long_read(&lruvec->inactive_age);
 	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);

 	/*
-	 * The unsigned subtraction here gives an accurate distance
-	 * across inactive_age overflows in most cases.
+	 * Calculate the refault distance
 	 *
-	 * There is a special case: usually, shadow entries have a
-	 * short lifetime and are either refaulted or reclaimed along
-	 * with the inode before they get too old. But it is not
-	 * impossible for the inactive_age to lap a shadow entry in
-	 * the field, which can then can result in a false small
-	 * refault distance, leading to a false activation should this
-	 * old entry actually refault again. However, earlier kernels
-	 * used to deactivate unconditionally with *every* reclaim
-	 * invocation for the longest time, so the occasional
-	 * inappropriate activation leading to pressure on the active
-	 * list is not a problem.
+	 * The unsigned subtraction here gives an accurate distance
+	 * across inactive_age overflows in most cases. There is a
+	 * special case: usually, shadow entries have a short lifetime
+	 * and are either refaulted or reclaimed along with the inode
+	 * before they get too old. But it is not impossible for the
+	 * inactive_age to lap a shadow entry in the field, which can
+	 * then result in a false small refault distance, leading to a
+	 * false activation should this old entry actually refault
+	 * again. However, earlier kernels used to deactivate
+	 * unconditionally with *every* reclaim invocation for the
+	 * longest time, so the occasional inappropriate activation
+	 * leading to pressure on the active list is not a problem.
 	 */
 	refault_distance = (refault - eviction) & EVICTION_MASK;

 	inc_lruvec_state(lruvec, WORKINGSET_REFAULT);

-	if (refault_distance <= active_file) {
-		inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
-		rcu_read_unlock();
-		return true;
+	/*
+	 * Compare the distance to the existing workingset size. We
+	 * don't act on pages that couldn't stay resident even if all
+	 * the memory was available to the page cache.
+	 */
+	if (refault_distance > active_file)
+		goto out;
+
+	SetPageActive(page);
+	atomic_long_inc(&lruvec->inactive_age);
+	inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+
+	/* Page was active prior to eviction */
+	if (workingset) {
+		SetPageWorkingset(page);
+		inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
 	}
+out:
 	rcu_read_unlock();
-	return false;
 }

 /**
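
The comment above leans on unsigned wraparound for the distance calculation.
A standalone demonstration of why the subtraction stays accurate across an
inactive_age overflow (the kernel additionally masks with EVICTION_MASK to
discard the bits lost to packing):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	/* Eviction stamped just before the counter wraps around... */
	unsigned long eviction = ~0UL - 5;
	/* ...and the refault observed just after the wrap. */
	unsigned long refault = 10;

	/* Unsigned subtraction still yields the true distance: 6 ages
	 * before the wrap plus 10 after it. */
	unsigned long refault_distance = refault - eviction;

	assert(refault_distance == 16);
	printf("refault distance: %lu\n", refault_distance);
	return 0;
}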