@@ -89,7 +89,6 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
- MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
MEM_CGROUP_STAT_NSTATS,
};
@@ -135,7 +134,7 @@ struct mem_cgroup_reclaim_iter {
*/
struct mem_cgroup_per_zone {
struct lruvec lruvec;
- unsigned long count[NR_LRU_LISTS];
+ unsigned long lru_size[NR_LRU_LISTS];
struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
@@ -144,11 +143,9 @@ struct mem_cgroup_per_zone {
unsigned long long usage_in_excess;/* Set to the value by which */
/* the soft limit is exceeded*/
bool on_tree;
- struct mem_cgroup *mem; /* Back pointer, we cannot */
+ struct mem_cgroup *memcg; /* Back pointer, we cannot */
/* use container_of */
};
-/* Macro for accessing counter */
-#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
struct mem_cgroup_per_node {
struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
@@ -299,6 +296,12 @@ struct mem_cgroup {
* mem_cgroup ? And what type of charges should we move ?
*/
unsigned long move_charge_at_immigrate;
+ /*
+ * set > 0 if pages under this cgroup are moving to another cgroup.
+ */
+ atomic_t moving_account;
+ /* taken only while moving_account > 0 */
+ spinlock_t move_lock;
/*
* percpu counter.
*/
@@ -612,9 +615,9 @@ retry:
* we will add it back at the end of reclaim to its correct
* position in the tree.
*/
- __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
- if (!res_counter_soft_limit_excess(&mz->mem->res) ||
- !css_tryget(&mz->mem->css))
+ __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+ if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+ !css_tryget(&mz->memcg->css))
goto retry;
done:
return mz;
@@ -692,15 +695,19 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
- bool file, int nr_pages)
+ bool anon, int nr_pages)
{
preempt_disable();
- if (file)
- __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
+ /*
+ * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
+ * counted as CACHE even if it's on ANON LRU.
+ */
+ if (anon)
+ __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
nr_pages);
else
- __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
+ __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
nr_pages);
/* pagein of a big page is an event. So, ignore page size */
@@ -721,14 +728,14 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
unsigned int lru_mask)
{
struct mem_cgroup_per_zone *mz;
- enum lru_list l;
+ enum lru_list lru;
unsigned long ret = 0;
mz = mem_cgroup_zoneinfo(memcg, nid, zid);
- for_each_lru(l) {
- if (BIT(l) & lru_mask)
- ret += MEM_CGROUP_ZSTAT(mz, l);
+ for_each_lru(lru) {
+ if (BIT(lru) & lru_mask)
+ ret += mz->lru_size[lru];
}
return ret;
}
@@ -1077,7 +1084,7 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
mz = page_cgroup_zoneinfo(memcg, page);
/* compound_order() is stabilized through lru_lock */
- MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
+ mz->lru_size[lru] += 1 << compound_order(page);
return &mz->lruvec;
}
@@ -1105,8 +1112,8 @@ void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
VM_BUG_ON(!memcg);
mz = page_cgroup_zoneinfo(memcg, page);
/* huge page split is done under lru_lock. so, we have no races. */
- VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page)));
- MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
+ VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page)));
+ mz->lru_size[lru] -= 1 << compound_order(page);
}
void mem_cgroup_lru_del(struct page *page)
@@ -1285,40 +1292,48 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
return memcg->swappiness;
}
-static void mem_cgroup_start_move(struct mem_cgroup *memcg)
-{
- int cpu;
+/*
+ * memcg->moving_account is used for checking possibility that some thread is
+ * calling move_account(). When a thread on CPU-A starts moving pages under
+ * a memcg, other threads should check memcg->moving_account under
+ * rcu_read_lock(), like this:
+ *
+ *         CPU-A                              CPU-B
+ *                                            rcu_read_lock()
+ *         memcg->moving_account+1            if (memcg->moving_account)
+ *                                                take heavy locks.
+ *         synchronize_rcu()                  update something.
+ *                                            rcu_read_unlock()
+ *         start move here.
+ */
- get_online_cpus();
- spin_lock(&memcg->pcp_counter_lock);
- for_each_online_cpu(cpu)
- per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
- memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
- spin_unlock(&memcg->pcp_counter_lock);
- put_online_cpus();
+/* for quick checking without looking up memcg */
+atomic_t memcg_moving __read_mostly;
+static void mem_cgroup_start_move(struct mem_cgroup *memcg)
+{
+ atomic_inc(&memcg_moving);
+ atomic_inc(&memcg->moving_account);
synchronize_rcu();
}
static void mem_cgroup_end_move(struct mem_cgroup *memcg)
{
- int cpu;
-
- if (!memcg)
- return;
- get_online_cpus();
- spin_lock(&memcg->pcp_counter_lock);
- for_each_online_cpu(cpu)
- per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
- memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
- spin_unlock(&memcg->pcp_counter_lock);
- put_online_cpus();
+ /*
+ * Now, mem_cgroup_clear_mc() may call this function with NULL.
+ * We check NULL in callee rather than caller.
+ */
+ if (memcg) {
+ atomic_dec(&memcg_moving);
+ atomic_dec(&memcg->moving_account);
+ }
}
+
/*
* 2 routines for checking "mem" is under move_account() or not.
*
- * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used
- * for avoiding race in accounting. If true,
+ * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This
+ * is used for avoiding races in accounting. If true,
* pc->mem_cgroup may be overwritten.
*
* mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
@@ -1326,10 +1341,10 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
* waiting at high-memory pressure caused by "move".
*/
-static bool mem_cgroup_stealed(struct mem_cgroup *memcg)
+static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
{
VM_BUG_ON(!rcu_read_lock_held());
- return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
+ return atomic_read(&memcg->moving_account) > 0;
}
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
@@ -1370,6 +1385,24 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
return false;
}
+/*
+ * Take this lock when
+ * - code tries to modify a page's memcg while the page is USED.
+ * - code tries to modify page state accounting in a memcg.
+ * see mem_cgroup_stolen(), too.
+ */
+static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
+ unsigned long *flags)
+{
+ spin_lock_irqsave(&memcg->move_lock, *flags);
+}
+
+static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
+ unsigned long *flags)
+{
+ spin_unlock_irqrestore(&memcg->move_lock, *flags);
+}
+
/**
* mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
* @memcg: The memory cgroup that went over limit
@@ -1393,7 +1426,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
if (!memcg || !p)
return;
-
rcu_read_lock();
mem_cgrp = memcg->css.cgroup;
@@ -1772,22 +1804,22 @@ static DEFINE_SPINLOCK(memcg_oom_lock);
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
struct oom_wait_info {
- struct mem_cgroup *mem;
+ struct mem_cgroup *memcg;
wait_queue_t wait;
};
static int memcg_oom_wake_function(wait_queue_t *wait,
unsigned mode, int sync, void *arg)
{
- struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg,
- *oom_wait_memcg;
+ struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
+ struct mem_cgroup *oom_wait_memcg;
struct oom_wait_info *oom_wait_info;
oom_wait_info = container_of(wait, struct oom_wait_info, wait);
- oom_wait_memcg = oom_wait_info->mem;
+ oom_wait_memcg = oom_wait_info->memcg;
/*
- * Both of oom_wait_info->mem and wake_mem are stable under us.
+ * Both of oom_wait_info->memcg and wake_memcg are stable under us.
* Then we can use css_is_ancestor without taking care of RCU.
*/
if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
@@ -1811,12 +1843,12 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
/*
* try to call OOM killer. returns false if we should exit memory-reclaim loop.
*/
-bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
+bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
struct oom_wait_info owait;
bool locked, need_to_kill;
- owait.mem = memcg;
+ owait.memcg = memcg;
owait.wait.flags = 0;
owait.wait.func = memcg_oom_wake_function;
owait.wait.private = current;
@@ -1841,7 +1873,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
if (need_to_kill) {
finish_wait(&memcg_oom_waitq, &owait.wait);
- mem_cgroup_out_of_memory(memcg, mask);
+ mem_cgroup_out_of_memory(memcg, mask, order);
} else {
schedule();
finish_wait(&memcg_oom_waitq, &owait.wait);
@@ -1881,41 +1913,66 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
* by flags.
*
* Considering "move", this is the only case where we see a race. To make the race
- * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are
- * possibility of race condition. If there is, we take a lock.
+ * small, we check memcg->moving_account and detect the possibility of a race.
+ * If there is one, we take a lock.
*/
+void __mem_cgroup_begin_update_page_stat(struct page *page,
+ bool *locked, unsigned long *flags)
+{
+ struct mem_cgroup *memcg;
+ struct page_cgroup *pc;
+
+ pc = lookup_page_cgroup(page);
+again:
+ memcg = pc->mem_cgroup;
+ if (unlikely(!memcg || !PageCgroupUsed(pc)))
+ return;
+ /*
+ * If this memory cgroup is not under account moving, we don't
+ * need to take move_lock_mem_cgroup(). Because we already hold
+ * rcu_read_lock(), any calls to move_account will be delayed until
+ * rcu_read_unlock() if mem_cgroup_stolen() == true.
+ */
+ if (!mem_cgroup_stolen(memcg))
+ return;
+
+ move_lock_mem_cgroup(memcg, flags);
+ if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
+ move_unlock_mem_cgroup(memcg, flags);
+ goto again;
+ }
+ *locked = true;
+}
+
+void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
+{
+ struct page_cgroup *pc = lookup_page_cgroup(page);
+
+ /*
+ * It's guaranteed that pc->mem_cgroup never changes while
+ * lock is held because a routine that modifies pc->mem_cgroup
+ * should take move_lock_mem_cgroup().
+ */
+ move_unlock_mem_cgroup(pc->mem_cgroup, flags);
+}
+
void mem_cgroup_update_page_stat(struct page *page,
enum mem_cgroup_page_stat_item idx, int val)
{
struct mem_cgroup *memcg;
struct page_cgroup *pc = lookup_page_cgroup(page);
- bool need_unlock = false;
unsigned long uninitialized_var(flags);
if (mem_cgroup_disabled())
return;
- rcu_read_lock();
memcg = pc->mem_cgroup;
if (unlikely(!memcg || !PageCgroupUsed(pc)))
- goto out;
- /* pc->mem_cgroup is unstable ? */
- if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {
- /* take a lock against to access pc->mem_cgroup */
- move_lock_page_cgroup(pc, &flags);
- need_unlock = true;
- memcg = pc->mem_cgroup;
- if (!memcg || !PageCgroupUsed(pc))
- goto out;
- }
+ return;
switch (idx) {
case MEMCG_NR_FILE_MAPPED:
- if (val > 0)
- SetPageCgroupFileMapped(pc);
- else if (!page_mapped(page))
- ClearPageCgroupFileMapped(pc);
idx = MEM_CGROUP_STAT_FILE_MAPPED;
break;
default:
@@ -1923,14 +1980,7 @@ void mem_cgroup_update_page_stat(struct page *page,
}
this_cpu_add(memcg->stat->count[idx], val);
-
-out:
- if (unlikely(need_unlock))
- move_unlock_page_cgroup(pc, &flags);
- rcu_read_unlock();
- return;
}
-EXPORT_SYMBOL(mem_cgroup_update_page_stat);
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -2101,17 +2151,6 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
per_cpu(memcg->stat->events[i], cpu) = 0;
memcg->nocpu_base.events[i] += x;
}
- /* need to clear ON_MOVE value, works as a kind of lock. */
- per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
- spin_unlock(&memcg->pcp_counter_lock);
-}
-
-static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu)
-{
- int idx = MEM_CGROUP_ON_MOVE;
-
- spin_lock(&memcg->pcp_counter_lock);
- per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx];
spin_unlock(&memcg->pcp_counter_lock);
}
@@ -2123,11 +2162,8 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
struct memcg_stock_pcp *stock;
struct mem_cgroup *iter;
- if ((action == CPU_ONLINE)) {
- for_each_mem_cgroup(iter)
- synchronize_mem_cgroup_on_move(iter, cpu);
+ if (action == CPU_ONLINE)
return NOTIFY_OK;
- }
if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
return NOTIFY_OK;
@@ -2212,7 +2248,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
if (!oom_check)
return CHARGE_NOMEM;
/* check OOM */
- if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
+ if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
return CHARGE_OOM_DIE;
return CHARGE_RETRY;
@@ -2446,6 +2482,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
{
struct zone *uninitialized_var(zone);
bool was_on_lru = false;
+ bool anon;
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
@@ -2481,19 +2518,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
* See mem_cgroup_add_lru_list(), etc.
*/
smp_wmb();
- switch (ctype) {
- case MEM_CGROUP_CHARGE_TYPE_CACHE:
- case MEM_CGROUP_CHARGE_TYPE_SHMEM:
- SetPageCgroupCache(pc);
- SetPageCgroupUsed(pc);
- break;
- case MEM_CGROUP_CHARGE_TYPE_MAPPED:
- ClearPageCgroupCache(pc);
- SetPageCgroupUsed(pc);
- break;
- default:
- break;
- }
+ SetPageCgroupUsed(pc);
if (lrucare) {
if (was_on_lru) {
@@ -2504,7 +2529,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
spin_unlock_irq(&zone->lru_lock);
}
- mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
+ if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
+ anon = true;
+ else
+ anon = false;
+
+ mem_cgroup_charge_statistics(memcg, anon, nr_pages);
unlock_page_cgroup(pc);
/*
@@ -2517,8 +2547,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
- (1 << PCG_MIGRATION))
+#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION))
/*
* Because tail pages are not marked as "used", set it. We're under
* zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -2569,6 +2598,7 @@ static int mem_cgroup_move_account(struct page *page,
{
unsigned long flags;
int ret;
+ bool anon = PageAnon(page);
VM_BUG_ON(from == to);
VM_BUG_ON(PageLRU(page));
@@ -2588,23 +2618,23 @@ static int mem_cgroup_move_account(struct page *page,
if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
goto unlock;
- move_lock_page_cgroup(pc, &flags);
+ move_lock_mem_cgroup(from, &flags);
- if (PageCgroupFileMapped(pc)) {
+ if (!anon && page_mapped(page)) {
/* Update mapped_file data for mem_cgroup */
preempt_disable();
__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
preempt_enable();
}
- mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
+ mem_cgroup_charge_statistics(from, anon, -nr_pages);
if (uncharge)
/* This is not "cancel", but cancel_charge does all we need. */
__mem_cgroup_cancel_charge(from, nr_pages);
/* caller should have done css_get */
pc->mem_cgroup = to;
- mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
+ mem_cgroup_charge_statistics(to, anon, nr_pages);
/*
* We charge against "to" which may not have any tasks. Then, "to"
* can be under rmdir(). But in current implementation, caller of
@@ -2612,7 +2642,7 @@ static int mem_cgroup_move_account(struct page *page,
* guaranteed that "to" is never removed. So, we don't check rmdir
* status here.
*/
- move_unlock_page_cgroup(pc, &flags);
+ move_unlock_mem_cgroup(from, &flags);
ret = 0;
unlock:
unlock_page_cgroup(pc);
@@ -2914,7 +2944,6 @@ direct_uncharge:
res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
if (unlikely(batch->memcg != memcg))
memcg_oom_recover(memcg);
- return;
}
/*
@@ -2926,6 +2955,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
struct mem_cgroup *memcg = NULL;
unsigned int nr_pages = 1;
struct page_cgroup *pc;
+ bool anon;
if (mem_cgroup_disabled())
return NULL;
@@ -2951,8 +2981,17 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
if (!PageCgroupUsed(pc))
goto unlock_out;
+ anon = PageAnon(page);
+
switch (ctype) {
case MEM_CGROUP_CHARGE_TYPE_MAPPED:
+ /*
+ * Generally PageAnon tells if it's the anon statistics to be
+ * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
+ * used before the page reaches the stage of being marked PageAnon.
+ */
+ anon = true;
+ /* fallthrough */
case MEM_CGROUP_CHARGE_TYPE_DROP:
/* See mem_cgroup_prepare_migration() */
if (page_mapped(page) || PageCgroupMigration(pc))
@@ -2969,7 +3008,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
break;
}
- mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages);
+ mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
ClearPageCgroupUsed(pc);
/*
@@ -3276,6 +3315,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
{
struct page *used, *unused;
struct page_cgroup *pc;
+ bool anon;
if (!memcg)
return;
@@ -3297,8 +3337,10 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
lock_page_cgroup(pc);
ClearPageCgroupMigration(pc);
unlock_page_cgroup(pc);
-
- __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
+ anon = PageAnon(used);
+ __mem_cgroup_uncharge_common(unused,
+ anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
+ : MEM_CGROUP_CHARGE_TYPE_CACHE);
/*
* If a page is a file cache, radix-tree replacement is very atomic
@@ -3308,7 +3350,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
* and USED bit check in mem_cgroup_uncharge_page() will do enough
* check. (see prepare_charge() also)
*/
- if (PageAnon(used))
+ if (anon)
mem_cgroup_uncharge_page(used);
/*
* At migration, we may charge account against cgroup which has no
@@ -3338,7 +3380,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
/* fix accounting on old pages */
lock_page_cgroup(pc);
memcg = pc->mem_cgroup;
- mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1);
+ mem_cgroup_charge_statistics(memcg, false, -1);
ClearPageCgroupUsed(pc);
unlock_page_cgroup(pc);
@@ -3549,7 +3591,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
break;
nr_scanned = 0;
- reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone,
+ reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
gfp_mask, &nr_scanned);
nr_reclaimed += reclaimed;
*total_scanned += nr_scanned;
@@ -3576,13 +3618,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
next_mz =
__mem_cgroup_largest_soft_limit_node(mctz);
if (next_mz == mz)
- css_put(&next_mz->mem->css);
+ css_put(&next_mz->memcg->css);
else /* next_mz == NULL or other memcg */
break;
} while (1);
}
- __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
- excess = res_counter_soft_limit_excess(&mz->mem->res);
+ __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+ excess = res_counter_soft_limit_excess(&mz->memcg->res);
/*
* One school of thought says that we should not add
* back the node to the tree if reclaim returns 0.
@@ -3592,9 +3634,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
* term TODO.
*/
/* If excess == 0, no tree ops */
- __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
+ __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
spin_unlock(&mctz->lock);
- css_put(&mz->mem->css);
+ css_put(&mz->memcg->css);
loop++;
/*
* Could not reclaim anything and there are no more
@@ -3607,7 +3649,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
break;
} while (!nr_reclaimed);
if (next_mz)
- css_put(&next_mz->mem->css);
+ css_put(&next_mz->memcg->css);
return nr_reclaimed;
}
@@ -3629,7 +3671,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
mz = mem_cgroup_zoneinfo(memcg, node, zid);
list = &mz->lruvec.lists[lru];
- loop = MEM_CGROUP_ZSTAT(mz, lru);
+ loop = mz->lru_size[lru];
/* give some margin against EBUSY etc...*/
loop += 256;
busy = NULL;
@@ -3703,10 +3745,10 @@ move_account:
mem_cgroup_start_move(memcg);
for_each_node_state(node, N_HIGH_MEMORY) {
for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
- enum lru_list l;
- for_each_lru(l) {
+ enum lru_list lru;
+ for_each_lru(lru) {
ret = mem_cgroup_force_empty_list(memcg,
- node, zid, l);
+ node, zid, lru);
if (ret)
break;
}
@@ -3860,7 +3902,6 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
break;
default:
BUG();
- break;
}
return val;
}
@@ -3939,7 +3980,6 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
out:
*mem_limit = min_limit;
*memsw_limit = min_memsw_limit;
- return;
}
static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
@@ -4098,38 +4138,38 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
unsigned long node_nr;
struct cgroup *cont = m->private;
- struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL);
+ total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
seq_printf(m, "total=%lu", total_nr);
for_each_node_state(nid, N_HIGH_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL);
+ node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
seq_printf(m, " N%d=%lu", nid, node_nr);
}
seq_putc(m, '\n');
- file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE);
+ file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
seq_printf(m, "file=%lu", file_nr);
for_each_node_state(nid, N_HIGH_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+ node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
LRU_ALL_FILE);
seq_printf(m, " N%d=%lu", nid, node_nr);
}
seq_putc(m, '\n');
- anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON);
+ anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
seq_printf(m, "anon=%lu", anon_nr);
for_each_node_state(nid, N_HIGH_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+ node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
LRU_ALL_ANON);
seq_printf(m, " N%d=%lu", nid, node_nr);
}
seq_putc(m, '\n');
- unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE));
+ unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
seq_printf(m, "unevictable=%lu", unevictable_nr);
for_each_node_state(nid, N_HIGH_MEMORY) {
- node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+ node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
BIT(LRU_UNEVICTABLE));
seq_printf(m, " N%d=%lu", nid, node_nr);
}
@@ -4141,12 +4181,12 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
struct cgroup_map_cb *cb)
{
- struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
struct mcs_total_stat mystat;
int i;
memset(&mystat, 0, sizeof(mystat));
- mem_cgroup_get_local_stat(mem_cont, &mystat);
+ mem_cgroup_get_local_stat(memcg, &mystat);
for (i = 0; i < NR_MCS_STAT; i++) {
@@ -4158,14 +4198,14 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
/* Hierarchical information */
{
unsigned long long limit, memsw_limit;
- memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
+ memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
cb->fill(cb, "hierarchical_memory_limit", limit);
if (do_swap_account)
cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
}
memset(&mystat, 0, sizeof(mystat));
- mem_cgroup_get_total_stat(mem_cont, &mystat);
+ mem_cgroup_get_total_stat(memcg, &mystat);
for (i = 0; i < NR_MCS_STAT; i++) {
if (i == MCS_SWAP && !do_swap_account)
continue;
@@ -4181,7 +4221,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
for_each_online_node(nid)
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
- mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
+ mz = mem_cgroup_zoneinfo(memcg, nid, zid);
recent_rotated[0] +=
mz->reclaim_stat.recent_rotated[0];
@@ -4426,12 +4466,6 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
else
BUG();
- /*
- * Something went wrong if we trying to unregister a threshold
- * if we don't have thresholds
- */
- BUG_ON(!thresholds);
-
if (!thresholds->primary)
goto unlock;
@@ -4736,7 +4770,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup_per_zone *mz;
- enum lru_list l;
+ enum lru_list lru;
int zone, tmp = node;
/*
* This routine is called against possible nodes.
@@ -4754,11 +4788,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
- for_each_lru(l)
- INIT_LIST_HEAD(&mz->lruvec.lists[l]);
+ for_each_lru(lru)
+ INIT_LIST_HEAD(&mz->lruvec.lists[lru]);
mz->usage_in_excess = 0;
mz->on_tree = false;
- mz->mem = memcg;
+ mz->memcg = memcg;
}
memcg->info.nodeinfo[node] = pn;
return 0;
@@ -4771,29 +4805,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
static struct mem_cgroup *mem_cgroup_alloc(void)
{
- struct mem_cgroup *mem;
+ struct mem_cgroup *memcg;
int size = sizeof(struct mem_cgroup);
/* Can be very big if MAX_NUMNODES is very big */
if (size < PAGE_SIZE)
- mem = kzalloc(size, GFP_KERNEL);
+ memcg = kzalloc(size, GFP_KERNEL);
else
- mem = vzalloc(size);
+ memcg = vzalloc(size);
- if (!mem)
+ if (!memcg)
return NULL;
- mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
- if (!mem->stat)
+ memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
+ if (!memcg->stat)
goto out_free;
- spin_lock_init(&mem->pcp_counter_lock);
- return mem;
+ spin_lock_init(&memcg->pcp_counter_lock);
+ return memcg;
out_free:
if (size < PAGE_SIZE)
- kfree(mem);
+ kfree(memcg);
else
- vfree(mem);
+ vfree(memcg);
return NULL;
}
@@ -4981,6 +5015,7 @@ mem_cgroup_create(struct cgroup *cont)
atomic_set(&memcg->refcnt, 1);
memcg->move_charge_at_immigrate = 0;
mutex_init(&memcg->thresholds_lock);
+ spin_lock_init(&memcg->move_lock);
return &memcg->css;
free_out:
__mem_cgroup_free(memcg);
@@ -5075,7 +5110,7 @@ one_by_one:
}
/**
- * is_target_pte_for_mc - check a pte whether it is valid for move charge
+ * get_mctgt_type - get target type of moving charge
* @vma: the vma the pte to be checked belongs
* @addr: the address corresponding to the pte to be checked
* @ptent: the pte to be checked
@@ -5098,7 +5133,7 @@ union mc_target {
};
enum mc_target_type {
- MC_TARGET_NONE, /* not used */
+ MC_TARGET_NONE = 0,
MC_TARGET_PAGE,
MC_TARGET_SWAP,
};
@@ -5179,12 +5214,12 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
return page;
}
-static int is_target_pte_for_mc(struct vm_area_struct *vma,
+static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target)
{
struct page *page = NULL;
struct page_cgroup *pc;
- int ret = 0;
+ enum mc_target_type ret = MC_TARGET_NONE;
swp_entry_t ent = { .val = 0 };
if (pte_present(ptent))
@@ -5195,7 +5230,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
page = mc_handle_file_pte(vma, addr, ptent, &ent);
if (!page && !ent.val)
- return 0;
+ return ret;
if (page) {
pc = lookup_page_cgroup(page);
/*
@@ -5221,6 +5256,41 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
return ret;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * We don't consider swapping or file mapped pages because THP does not
+ * support them for now.
+ * Caller should make sure that pmd_trans_huge(pmd) is true.
+ */
+static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, union mc_target *target)
+{
+ struct page *page = NULL;
+ struct page_cgroup *pc;
+ enum mc_target_type ret = MC_TARGET_NONE;
+
+ page = pmd_page(pmd);
+ VM_BUG_ON(!page || !PageHead(page));
+ if (!move_anon())
+ return ret;
+ pc = lookup_page_cgroup(page);
+ if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
+ ret = MC_TARGET_PAGE;
+ if (target) {
+ get_page(page);
+ target->page = page;
+ }
+ }
+ return ret;
+}
+#else
+static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, union mc_target *target)
+{
+ return MC_TARGET_NONE;
+}
+#endif
+
static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
@@ -5229,11 +5299,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;
- split_huge_page_pmd(walk->mm, pmd);
+ if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
+ mc.precharge += HPAGE_PMD_NR;
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ return 0;
+ }
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
- if (is_target_pte_for_mc(vma, addr, *pte, NULL))
+ if (get_mctgt_type(vma, addr, *pte, NULL))
mc.precharge++; /* increment precharge temporarily */
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
@@ -5388,23 +5463,55 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
struct vm_area_struct *vma = walk->private;
pte_t *pte;
spinlock_t *ptl;
+ enum mc_target_type target_type;
+ union mc_target target;
+ struct page *page;
+ struct page_cgroup *pc;
+
+ /*
+ * We don't take compound_lock() here but no race with splitting thp
+ * happens because:
+ * - if pmd_trans_huge_lock() returns 1, the relevant thp is not
+ * under splitting, which means there's no concurrent thp split,
+ * - if another thread runs into split_huge_page() just after we
+ * entered this if-block, the thread must wait for page table lock
+ * to be unlocked in __split_huge_page_splitting(), where the main
+ * part of thp split is not executed yet.
+ */
+ if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (!mc.precharge) {
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ return 0;
+ }
+ target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
+ if (target_type == MC_TARGET_PAGE) {
+ page = target.page;
+ if (!isolate_lru_page(page)) {
+ pc = lookup_page_cgroup(page);
+ if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
+ pc, mc.from, mc.to,
+ false)) {
+ mc.precharge -= HPAGE_PMD_NR;
+ mc.moved_charge += HPAGE_PMD_NR;
+ }
+ putback_lru_page(page);
+ }
+ put_page(page);
+ }
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ return 0;
+ }
- split_huge_page_pmd(walk->mm, pmd);
retry:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
pte_t ptent = *(pte++);
- union mc_target target;
- int type;
- struct page *page;
- struct page_cgroup *pc;
swp_entry_t ent;
if (!mc.precharge)
break;
- type = is_target_pte_for_mc(vma, addr, ptent, &target);
- switch (type) {
+ switch (get_mctgt_type(vma, addr, ptent, &target)) {
case MC_TARGET_PAGE:
page = target.page;
if (isolate_lru_page(page))
@@ -5417,7 +5524,7 @@ retry:
mc.moved_charge++;
}
putback_lru_page(page);
-put: /* is_target_pte_for_mc() gets the page */
+put: /* get_mctgt_type() gets the page */
put_page(page);
break;
case MC_TARGET_SWAP:
|