@@ -25,7 +25,7 @@
  * GNU General Public License for more details.
  */
-#include <linux/res_counter.h>
+#include <linux/page_counter.h>
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
@@ -165,7 +165,7 @@ struct mem_cgroup_per_zone {
|
|
|
struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
|
|
|
|
|
|
struct rb_node tree_node; /* RB tree node */
|
|
|
- unsigned long long usage_in_excess;/* Set to the value by which */
|
|
|
+ unsigned long usage_in_excess;/* Set to the value by which */
|
|
|
/* the soft limit is exceeded*/
|
|
|
bool on_tree;
|
|
|
struct mem_cgroup *memcg; /* Back pointer, we cannot */
|
|
@@ -198,7 +198,7 @@ static struct mem_cgroup_tree soft_limit_tree __read_mostly;
|
|
|
|
|
|
struct mem_cgroup_threshold {
|
|
|
struct eventfd_ctx *eventfd;
|
|
|
- u64 threshold;
|
|
|
+ unsigned long threshold;
|
|
|
};
|
|
|
|
|
|
/* For threshold */
|
|
@@ -284,10 +284,13 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
|
|
|
*/
|
|
|
struct mem_cgroup {
|
|
|
struct cgroup_subsys_state css;
|
|
|
- /*
|
|
|
- * the counter to account for memory usage
|
|
|
- */
|
|
|
- struct res_counter res;
|
|
|
+
|
|
|
+ /* Accounted resources */
|
|
|
+ struct page_counter memory;
|
|
|
+ struct page_counter memsw;
|
|
|
+ struct page_counter kmem;
|
|
|
+
|
|
|
+ unsigned long soft_limit;
|
|
|
|
|
|
/* vmpressure notifications */
|
|
|
struct vmpressure vmpressure;
|
|
@@ -295,15 +298,6 @@ struct mem_cgroup {
|
|
|
/* css_online() has been completed */
|
|
|
int initialized;
|
|
|
|
|
|
- /*
|
|
|
- * the counter to account for mem+swap usage.
|
|
|
- */
|
|
|
- struct res_counter memsw;
|
|
|
-
|
|
|
- /*
|
|
|
- * the counter to account for kernel memory usage.
|
|
|
- */
|
|
|
- struct res_counter kmem;
|
|
|
/*
|
|
|
* Should the accounting and control be hierarchical, per subtree?
|
|
|
*/
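
The two hunks above replace the three res_counters (res, memsw, kmem) with page_counters named memory, memsw and kmem, plus a plain soft_limit field; the new counters account in pages rather than bytes. As a rough mental model only, not the kernel's lockless implementation in mm/page_counter.c, and with invented toy_* names, the fields this file relies on look roughly like this:

/*
 * Toy, single-threaded model of the page_counter fields used below.
 * The real counter lives in mm/page_counter.c and uses atomic ops;
 * everything here (toy_* names, TOY_COUNTER_MAX) is invented for
 * illustration.  All units are pages.
 */
#include <stdio.h>

#define TOY_COUNTER_MAX (~0UL)

struct toy_counter {
        unsigned long usage;            /* pages currently charged */
        unsigned long limit;            /* hard limit, in pages */
        unsigned long watermark;        /* historical peak of usage */
        unsigned long failcnt;          /* how many charges hit the limit */
        struct toy_counter *parent;     /* hierarchical parent, or NULL */
};

static void toy_counter_init(struct toy_counter *c, struct toy_counter *parent)
{
        c->usage = 0;
        c->limit = TOY_COUNTER_MAX;
        c->watermark = 0;
        c->failcnt = 0;
        c->parent = parent;
}

int main(void)
{
        struct toy_counter root, child;

        toy_counter_init(&root, NULL);
        toy_counter_init(&child, &root);
        printf("child starts unlimited: %lu\n", child.limit);
        return 0;
}

The parent pointer is what parent_mem_cgroup() walks in the hunks further down.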
@@ -650,7 +644,7 @@ static void disarm_kmem_keys(struct mem_cgroup *memcg)
|
|
|
* This check can't live in kmem destruction function,
|
|
|
* since the charges will outlive the cgroup
|
|
|
*/
|
|
|
- WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
|
|
|
+ WARN_ON(page_counter_read(&memcg->kmem));
|
|
|
}
|
|
|
#else
|
|
|
static void disarm_kmem_keys(struct mem_cgroup *memcg)
|
|
@@ -706,7 +700,7 @@ soft_limit_tree_from_page(struct page *page)
|
|
|
|
|
|
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
|
|
|
struct mem_cgroup_tree_per_zone *mctz,
|
|
|
- unsigned long long new_usage_in_excess)
|
|
|
+ unsigned long new_usage_in_excess)
|
|
|
{
|
|
|
struct rb_node **p = &mctz->rb_root.rb_node;
|
|
|
struct rb_node *parent = NULL;
|
|
@@ -755,10 +749,21 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
|
|
|
spin_unlock_irqrestore(&mctz->lock, flags);
|
|
|
}
|
|
|
|
|
|
+static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
|
|
|
+{
|
|
|
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
|
|
|
+ unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit);
|
|
|
+ unsigned long excess = 0;
|
|
|
+
|
|
|
+ if (nr_pages > soft_limit)
|
|
|
+ excess = nr_pages - soft_limit;
|
|
|
+
|
|
|
+ return excess;
|
|
|
+}
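
The new soft_limit_excess() helper replaces res_counter_soft_limit_excess() and works in pages, so the >> PAGE_SHIFT conversions at its callers disappear in later hunks. The explicit comparison before subtracting matters because both values are unsigned; a minimal standalone sketch of the same clamp, with made-up names:

#include <stdio.h>

static unsigned long excess_pages(unsigned long usage, unsigned long soft_limit)
{
        /* Unsigned subtraction would wrap if usage <= soft_limit. */
        return usage > soft_limit ? usage - soft_limit : 0;
}

int main(void)
{
        printf("%lu\n", excess_pages(300, 256));        /* 44 */
        printf("%lu\n", excess_pages(100, 256));        /* 0  */
        return 0;
}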
|
|
|
|
|
|
static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
|
|
|
{
|
|
|
- unsigned long long excess;
|
|
|
+ unsigned long excess;
|
|
|
struct mem_cgroup_per_zone *mz;
|
|
|
struct mem_cgroup_tree_per_zone *mctz;
|
|
|
|
|
@@ -769,7 +774,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
|
|
|
*/
|
|
|
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
|
|
|
mz = mem_cgroup_page_zoneinfo(memcg, page);
|
|
|
- excess = res_counter_soft_limit_excess(&memcg->res);
|
|
|
+ excess = soft_limit_excess(memcg);
|
|
|
/*
|
|
|
* We have to update the tree if mz is on RB-tree or
|
|
|
* mem is over its softlimit.
|
|
@@ -825,7 +830,7 @@ retry:
|
|
|
* position in the tree.
|
|
|
*/
|
|
|
__mem_cgroup_remove_exceeded(mz, mctz);
|
|
|
- if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
|
|
|
+ if (!soft_limit_excess(mz->memcg) ||
|
|
|
!css_tryget_online(&mz->memcg->css))
|
|
|
goto retry;
|
|
|
done:
|
|
@@ -1492,7 +1497,7 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
|
|
|
return inactive * inactive_ratio < active;
|
|
|
}
|
|
|
|
|
|
-#define mem_cgroup_from_res_counter(counter, member) \
|
|
|
+#define mem_cgroup_from_counter(counter, member) \
|
|
|
container_of(counter, struct mem_cgroup, member)
|
|
|
|
|
|
/**
|
|
@@ -1504,12 +1509,23 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
|
|
|
*/
|
|
|
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
|
|
|
{
|
|
|
- unsigned long long margin;
|
|
|
+ unsigned long margin = 0;
|
|
|
+ unsigned long count;
|
|
|
+ unsigned long limit;
|
|
|
|
|
|
- margin = res_counter_margin(&memcg->res);
|
|
|
- if (do_swap_account)
|
|
|
- margin = min(margin, res_counter_margin(&memcg->memsw));
|
|
|
- return margin >> PAGE_SHIFT;
|
|
|
+ count = page_counter_read(&memcg->memory);
|
|
|
+ limit = ACCESS_ONCE(memcg->memory.limit);
|
|
|
+ if (count < limit)
|
|
|
+ margin = limit - count;
|
|
|
+
|
|
|
+ if (do_swap_account) {
|
|
|
+ count = page_counter_read(&memcg->memsw);
|
|
|
+ limit = ACCESS_ONCE(memcg->memsw.limit);
|
|
|
+ if (count <= limit)
|
|
|
+ margin = min(margin, limit - count);
|
|
|
+ }
|
|
|
+
|
|
|
+ return margin;
|
|
|
}
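
mem_cgroup_margin() now computes headroom directly from the counters and returns pages, where the old version read byte-based res_counter margins and shifted by PAGE_SHIFT at the end; with swap accounting enabled the result is the smaller of the memory and memory+swap headrooms. A simplified sketch of the same arithmetic, with invented names and no ACCESS_ONCE():

#include <stdio.h>

static unsigned long headroom(unsigned long count, unsigned long limit)
{
        return count < limit ? limit - count : 0;
}

static unsigned long toy_margin(unsigned long mem_count, unsigned long mem_limit,
                                unsigned long memsw_count, unsigned long memsw_limit,
                                int do_swap_account)
{
        unsigned long margin = headroom(mem_count, mem_limit);

        if (do_swap_account) {
                unsigned long sw = headroom(memsw_count, memsw_limit);

                if (sw < margin)
                        margin = sw;
        }
        return margin;
}

int main(void)
{
        /* memory has 100 pages of room, memory+swap only 40 -> margin is 40 */
        printf("%lu\n", toy_margin(400, 500, 760, 800, 1));
        return 0;
}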
|
|
|
|
|
|
int mem_cgroup_swappiness(struct mem_cgroup *memcg)
|
|
@@ -1644,18 +1660,15 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
|
|
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
- pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
|
|
|
- res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
|
|
|
- res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
|
|
|
- res_counter_read_u64(&memcg->res, RES_FAILCNT));
|
|
|
- pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
|
|
|
- res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
|
|
|
- res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
|
|
|
- res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
|
|
|
- pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
|
|
|
- res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
|
|
|
- res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
|
|
|
- res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
|
|
|
+ pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
|
|
|
+ K((u64)page_counter_read(&memcg->memory)),
|
|
|
+ K((u64)memcg->memory.limit), memcg->memory.failcnt);
|
|
|
+ pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
|
|
|
+ K((u64)page_counter_read(&memcg->memsw)),
|
|
|
+ K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
|
|
|
+ pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
|
|
|
+ K((u64)page_counter_read(&memcg->kmem)),
|
|
|
+ K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
|
|
|
|
|
|
for_each_mem_cgroup_tree(iter, memcg) {
|
|
|
pr_info("Memory cgroup stats for ");
|
|
@@ -1695,28 +1708,17 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
|
|
|
/*
|
|
|
* Return the memory (and swap, if configured) limit for a memcg.
|
|
|
*/
|
|
|
-static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
|
|
|
+static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
|
|
|
{
|
|
|
- u64 limit;
|
|
|
+ unsigned long limit;
|
|
|
|
|
|
- limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
|
|
|
-
|
|
|
- /*
|
|
|
- * Do not consider swap space if we cannot swap due to swappiness
|
|
|
- */
|
|
|
+ limit = memcg->memory.limit;
|
|
|
if (mem_cgroup_swappiness(memcg)) {
|
|
|
- u64 memsw;
|
|
|
+ unsigned long memsw_limit;
|
|
|
|
|
|
- limit += total_swap_pages << PAGE_SHIFT;
|
|
|
- memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
|
|
|
-
|
|
|
- /*
|
|
|
- * If memsw is finite and limits the amount of swap space
|
|
|
- * available to this memcg, return that limit.
|
|
|
- */
|
|
|
- limit = min(limit, memsw);
|
|
|
+ memsw_limit = memcg->memsw.limit;
|
|
|
+ limit = min(limit + total_swap_pages, memsw_limit);
|
|
|
}
|
|
|
-
|
|
|
return limit;
|
|
|
}
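
mem_cgroup_get_limit() now returns a page count, which is why the caller in the next hunk drops its >> PAGE_SHIFT. When the cgroup may swap, the effective ceiling is the smaller of "memory limit plus all of swap" and the memsw limit. A standalone sketch of that calculation with hypothetical names and example numbers (4K pages assumed):

#include <stdio.h>

static unsigned long toy_oom_limit(unsigned long mem_limit,
                                   unsigned long memsw_limit,
                                   unsigned long total_swap_pages,
                                   int swappiness)
{
        unsigned long limit = mem_limit;

        if (swappiness) {
                unsigned long with_swap = mem_limit + total_swap_pages;

                limit = with_swap < memsw_limit ? with_swap : memsw_limit;
        }
        return limit;
}

int main(void)
{
        /* 512M memory limit, 768M memsw limit, 1G of swap, 4K pages */
        printf("%lu pages\n", toy_oom_limit(131072, 196608, 262144, 60));
        return 0;
}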
@@ -1740,7 +1742,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
|
|
}
|
|
|
|
|
|
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
|
|
|
- totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
|
|
|
+ totalpages = mem_cgroup_get_limit(memcg) ? : 1;
|
|
|
for_each_mem_cgroup_tree(iter, memcg) {
|
|
|
struct css_task_iter it;
|
|
|
struct task_struct *task;
|
|
@@ -1943,7 +1945,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
|
|
|
.priority = 0,
|
|
|
};
|
|
|
|
|
|
- excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
|
|
|
+ excess = soft_limit_excess(root_memcg);
|
|
|
|
|
|
while (1) {
|
|
|
victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
|
|
@@ -1974,7 +1976,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
|
|
|
total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
|
|
|
zone, &nr_scanned);
|
|
|
*total_scanned += nr_scanned;
|
|
|
- if (!res_counter_soft_limit_excess(&root_memcg->res))
|
|
|
+ if (!soft_limit_excess(root_memcg))
|
|
|
break;
|
|
|
}
|
|
|
mem_cgroup_iter_break(root_memcg, victim);
|
|
@@ -2316,33 +2318,31 @@ static DEFINE_MUTEX(percpu_charge_mutex);
|
|
|
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
|
|
|
{
|
|
|
struct memcg_stock_pcp *stock;
|
|
|
- bool ret = true;
|
|
|
+ bool ret = false;
|
|
|
|
|
|
if (nr_pages > CHARGE_BATCH)
|
|
|
- return false;
|
|
|
+ return ret;
|
|
|
|
|
|
stock = &get_cpu_var(memcg_stock);
|
|
|
- if (memcg == stock->cached && stock->nr_pages >= nr_pages)
|
|
|
+ if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
|
|
|
stock->nr_pages -= nr_pages;
|
|
|
- else /* need to call res_counter_charge */
|
|
|
- ret = false;
|
|
|
+ ret = true;
|
|
|
+ }
|
|
|
put_cpu_var(memcg_stock);
|
|
|
return ret;
|
|
|
}
|
|
|
|
|
|
/*
|
|
|
- * Returns stocks cached in percpu to res_counter and reset cached information.
|
|
|
+ * Returns stocks cached in percpu and reset cached information.
|
|
|
*/
|
|
|
static void drain_stock(struct memcg_stock_pcp *stock)
|
|
|
{
|
|
|
struct mem_cgroup *old = stock->cached;
|
|
|
|
|
|
if (stock->nr_pages) {
|
|
|
- unsigned long bytes = stock->nr_pages * PAGE_SIZE;
|
|
|
-
|
|
|
- res_counter_uncharge(&old->res, bytes);
|
|
|
+ page_counter_uncharge(&old->memory, stock->nr_pages);
|
|
|
if (do_swap_account)
|
|
|
- res_counter_uncharge(&old->memsw, bytes);
|
|
|
+ page_counter_uncharge(&old->memsw, stock->nr_pages);
|
|
|
stock->nr_pages = 0;
|
|
|
}
|
|
|
stock->cached = NULL;
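
consume_stock() keeps its behaviour but inverts the flag: ret starts out false and is only set when the cached per-CPU stock belongs to this memcg and holds enough pre-charged pages; drain_stock() then hands leftover pages straight back with page_counter_uncharge() instead of converting to bytes first. A single-threaded sketch of the fast path (toy_* names are invented; the real code pins the CPU with get_cpu_var()):

#include <stdbool.h>
#include <stdio.h>

#define TOY_BATCH 32U

struct toy_stock {
        void *cached;                   /* which group the pages belong to */
        unsigned int nr_pages;          /* pre-charged pages available */
};

static bool toy_consume_stock(struct toy_stock *stock, void *memcg,
                              unsigned int nr_pages)
{
        bool ret = false;

        if (nr_pages > TOY_BATCH)
                return ret;

        if (stock->cached == memcg && stock->nr_pages >= nr_pages) {
                stock->nr_pages -= nr_pages;
                ret = true;
        }
        return ret;
}

int main(void)
{
        int group;
        struct toy_stock stock = { .cached = &group, .nr_pages = 8 };

        printf("%d\n", toy_consume_stock(&stock, &group, 4));  /* 1 */
        printf("%u left\n", stock.nr_pages);                   /* 4 left */
        return 0;
}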
@@ -2371,7 +2371,7 @@ static void __init memcg_stock_init(void)
|
|
|
}
|
|
|
|
|
|
/*
|
|
|
- * Cache charges(val) which is from res_counter, to local per_cpu area.
|
|
|
+ * Cache charges(val) to local per_cpu area.
|
|
|
* This will be consumed by consume_stock() function, later.
|
|
|
*/
|
|
|
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
|
|
@@ -2431,8 +2431,7 @@ out:
|
|
|
/*
|
|
|
* Tries to drain stocked charges in other cpus. This function is asynchronous
|
|
|
* and just put a work per cpu for draining localy on each cpu. Caller can
|
|
|
- * expects some charges will be back to res_counter later but cannot wait for
|
|
|
- * it.
|
|
|
+ * expects some charges will be back later but cannot wait for it.
|
|
|
*/
|
|
|
static void drain_all_stock_async(struct mem_cgroup *root_memcg)
|
|
|
{
|
|
@@ -2506,9 +2505,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
|
|
|
unsigned int batch = max(CHARGE_BATCH, nr_pages);
|
|
|
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
|
|
|
struct mem_cgroup *mem_over_limit;
|
|
|
- struct res_counter *fail_res;
|
|
|
+ struct page_counter *counter;
|
|
|
unsigned long nr_reclaimed;
|
|
|
- unsigned long long size;
|
|
|
bool may_swap = true;
|
|
|
bool drained = false;
|
|
|
int ret = 0;
|
|
@@ -2519,16 +2517,15 @@ retry:
|
|
|
if (consume_stock(memcg, nr_pages))
|
|
|
goto done;
|
|
|
|
|
|
- size = batch * PAGE_SIZE;
|
|
|
if (!do_swap_account ||
|
|
|
- !res_counter_charge(&memcg->memsw, size, &fail_res)) {
|
|
|
- if (!res_counter_charge(&memcg->res, size, &fail_res))
|
|
|
+ !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
|
|
|
+ if (!page_counter_try_charge(&memcg->memory, batch, &counter))
|
|
|
goto done_restock;
|
|
|
if (do_swap_account)
|
|
|
- res_counter_uncharge(&memcg->memsw, size);
|
|
|
- mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
|
|
|
+ page_counter_uncharge(&memcg->memsw, batch);
|
|
|
+ mem_over_limit = mem_cgroup_from_counter(counter, memory);
|
|
|
} else {
|
|
|
- mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
|
|
|
+ mem_over_limit = mem_cgroup_from_counter(counter, memsw);
|
|
|
may_swap = false;
|
|
|
}
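
The charge order in try_charge() is preserved: memsw is tried first, then memory, and a failed memory charge rolls the memsw charge back; whichever counter refused identifies mem_over_limit via container_of() (the new mem_cgroup_from_counter). A toy model of just that ordering, not the kernel API and without batching, reclaim or OOM handling:

#include <stdbool.h>
#include <stdio.h>

struct toy_counter {
        unsigned long usage;
        unsigned long limit;
};

static bool toy_try_charge(struct toy_counter *c, unsigned long nr_pages)
{
        if (c->usage + nr_pages > c->limit)
                return false;           /* over limit, nothing charged */
        c->usage += nr_pages;
        return true;
}

static void toy_uncharge(struct toy_counter *c, unsigned long nr_pages)
{
        c->usage -= nr_pages;
}

/* Returns 0 on success, -1 if "memory" was the bottleneck, -2 for "memsw". */
static int toy_charge(struct toy_counter *memory, struct toy_counter *memsw,
                      unsigned long nr_pages, bool do_swap_account)
{
        if (do_swap_account && !toy_try_charge(memsw, nr_pages))
                return -2;
        if (!toy_try_charge(memory, nr_pages)) {
                if (do_swap_account)
                        toy_uncharge(memsw, nr_pages);  /* roll back */
                return -1;
        }
        return 0;
}

int main(void)
{
        struct toy_counter memory = { 90, 100 }, memsw = { 90, 200 };

        printf("%d\n", toy_charge(&memory, &memsw, 32, true)); /* -1 */
        printf("memsw usage %lu\n", memsw.usage);               /* still 90 */
        return 0;
}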
@@ -2611,32 +2608,12 @@ done:
|
|
|
|
|
|
static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
|
|
|
{
|
|
|
- unsigned long bytes = nr_pages * PAGE_SIZE;
|
|
|
-
|
|
|
if (mem_cgroup_is_root(memcg))
|
|
|
return;
|
|
|
|
|
|
- res_counter_uncharge(&memcg->res, bytes);
|
|
|
+ page_counter_uncharge(&memcg->memory, nr_pages);
|
|
|
if (do_swap_account)
|
|
|
- res_counter_uncharge(&memcg->memsw, bytes);
|
|
|
-}
|
|
|
-
|
|
|
-/*
|
|
|
- * Cancel chrages in this cgroup....doesn't propagate to parent cgroup.
|
|
|
- * This is useful when moving usage to parent cgroup.
|
|
|
- */
|
|
|
-static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
|
|
|
- unsigned int nr_pages)
|
|
|
-{
|
|
|
- unsigned long bytes = nr_pages * PAGE_SIZE;
|
|
|
-
|
|
|
- if (mem_cgroup_is_root(memcg))
|
|
|
- return;
|
|
|
-
|
|
|
- res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
|
|
|
- if (do_swap_account)
|
|
|
- res_counter_uncharge_until(&memcg->memsw,
|
|
|
- memcg->memsw.parent, bytes);
|
|
|
+ page_counter_uncharge(&memcg->memsw, nr_pages);
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -2760,8 +2737,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
|
|
|
unlock_page_lru(page, isolated);
|
|
|
}
|
|
|
|
|
|
-static DEFINE_MUTEX(set_limit_mutex);
|
|
|
-
|
|
|
#ifdef CONFIG_MEMCG_KMEM
|
|
|
/*
|
|
|
* The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
|
|
@@ -2804,16 +2779,17 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
|
|
|
}
|
|
|
#endif
|
|
|
|
|
|
-static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
|
|
|
+static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
|
|
|
+ unsigned long nr_pages)
|
|
|
{
|
|
|
- struct res_counter *fail_res;
|
|
|
+ struct page_counter *counter;
|
|
|
int ret = 0;
|
|
|
|
|
|
- ret = res_counter_charge(&memcg->kmem, size, &fail_res);
|
|
|
- if (ret)
|
|
|
+ ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
|
|
|
+ if (ret < 0)
|
|
|
return ret;
|
|
|
|
|
|
- ret = try_charge(memcg, gfp, size >> PAGE_SHIFT);
|
|
|
+ ret = try_charge(memcg, gfp, nr_pages);
|
|
|
if (ret == -EINTR) {
|
|
|
/*
|
|
|
* try_charge() chose to bypass to root due to OOM kill or
|
|
@@ -2830,25 +2806,25 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
|
|
|
* when the allocation triggers should have been already
|
|
|
* directed to the root cgroup in memcontrol.h
|
|
|
*/
|
|
|
- res_counter_charge_nofail(&memcg->res, size, &fail_res);
|
|
|
+ page_counter_charge(&memcg->memory, nr_pages);
|
|
|
if (do_swap_account)
|
|
|
- res_counter_charge_nofail(&memcg->memsw, size,
|
|
|
- &fail_res);
|
|
|
+ page_counter_charge(&memcg->memsw, nr_pages);
|
|
|
ret = 0;
|
|
|
} else if (ret)
|
|
|
- res_counter_uncharge(&memcg->kmem, size);
|
|
|
+ page_counter_uncharge(&memcg->kmem, nr_pages);
|
|
|
|
|
|
return ret;
|
|
|
}
|
|
|
|
|
|
-static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
|
|
|
+static void memcg_uncharge_kmem(struct mem_cgroup *memcg,
|
|
|
+ unsigned long nr_pages)
|
|
|
{
|
|
|
- res_counter_uncharge(&memcg->res, size);
|
|
|
+ page_counter_uncharge(&memcg->memory, nr_pages);
|
|
|
if (do_swap_account)
|
|
|
- res_counter_uncharge(&memcg->memsw, size);
|
|
|
+ page_counter_uncharge(&memcg->memsw, nr_pages);
|
|
|
|
|
|
/* Not down to 0 */
|
|
|
- if (res_counter_uncharge(&memcg->kmem, size))
|
|
|
+ if (page_counter_uncharge(&memcg->kmem, nr_pages))
|
|
|
return;
|
|
|
|
|
|
/*
|
|
@@ -3124,19 +3100,21 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
|
|
|
|
|
|
int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
|
|
|
{
|
|
|
+ unsigned int nr_pages = 1 << order;
|
|
|
int res;
|
|
|
|
|
|
- res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp,
|
|
|
- PAGE_SIZE << order);
|
|
|
+ res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
|
|
|
if (!res)
|
|
|
- atomic_add(1 << order, &cachep->memcg_params->nr_pages);
|
|
|
+ atomic_add(nr_pages, &cachep->memcg_params->nr_pages);
|
|
|
return res;
|
|
|
}
|
|
|
|
|
|
void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
|
|
|
{
|
|
|
- memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order);
|
|
|
- atomic_sub(1 << order, &cachep->memcg_params->nr_pages);
|
|
|
+ unsigned int nr_pages = 1 << order;
|
|
|
+
|
|
|
+ memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
|
|
|
+ atomic_sub(nr_pages, &cachep->memcg_params->nr_pages);
|
|
|
}
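
Since memcg_charge_kmem() and memcg_uncharge_kmem() now take page counts, the slab and page-allocation callers pass 1 << order instead of PAGE_SIZE << order. A trivial sketch of the unit change (TOY_PAGE_SIZE assumed to be 4K):

#include <stdio.h>

#define TOY_PAGE_SIZE 4096UL

int main(void)
{
        int order = 3;
        unsigned long nr_pages = 1UL << order;            /* what the counter gets now */
        unsigned long nr_bytes = TOY_PAGE_SIZE << order;  /* what res_counter used to get */

        printf("%lu pages, %lu bytes\n", nr_pages, nr_bytes);
        return 0;
}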
|
|
|
|
|
|
/*
|
|
@@ -3257,7 +3235,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
|
|
|
return true;
|
|
|
}
|
|
|
|
|
|
- ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
|
|
|
+ ret = memcg_charge_kmem(memcg, gfp, 1 << order);
|
|
|
if (!ret)
|
|
|
*_memcg = memcg;
|
|
|
|
|
@@ -3274,7 +3252,7 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
|
|
|
|
|
|
/* The page allocation failed. Revert */
|
|
|
if (!page) {
|
|
|
- memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
|
|
|
+ memcg_uncharge_kmem(memcg, 1 << order);
|
|
|
return;
|
|
|
}
|
|
|
/*
|
|
@@ -3307,7 +3285,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
|
|
|
return;
|
|
|
|
|
|
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
|
|
|
- memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
|
|
|
+ memcg_uncharge_kmem(memcg, 1 << order);
|
|
|
}
|
|
|
#else
|
|
|
static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
|
|
@@ -3485,8 +3463,12 @@ static int mem_cgroup_move_parent(struct page *page,
|
|
|
|
|
|
ret = mem_cgroup_move_account(page, nr_pages,
|
|
|
pc, child, parent);
|
|
|
- if (!ret)
|
|
|
- __mem_cgroup_cancel_local_charge(child, nr_pages);
|
|
|
+ if (!ret) {
|
|
|
+ /* Take charge off the local counters */
|
|
|
+ page_counter_cancel(&child->memory, nr_pages);
|
|
|
+ if (do_swap_account)
|
|
|
+ page_counter_cancel(&child->memsw, nr_pages);
|
|
|
+ }
|
|
|
|
|
|
if (nr_pages > 1)
|
|
|
compound_unlock_irqrestore(page, flags);
|
|
@@ -3516,7 +3498,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
|
|
|
*
|
|
|
* Returns 0 on success, -EINVAL on failure.
|
|
|
*
|
|
|
- * The caller must have charged to @to, IOW, called res_counter_charge() about
|
|
|
+ * The caller must have charged to @to, IOW, called page_counter_charge() about
|
|
|
* both res and memsw, and called css_get().
|
|
|
*/
|
|
|
static int mem_cgroup_move_swap_account(swp_entry_t entry,
|
|
@@ -3532,7 +3514,7 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
|
|
|
mem_cgroup_swap_statistics(to, true);
|
|
|
/*
|
|
|
* This function is only called from task migration context now.
|
|
|
- * It postpones res_counter and refcount handling till the end
|
|
|
+ * It postpones page_counter and refcount handling till the end
|
|
|
* of task migration(mem_cgroup_clear_mc()) for performance
|
|
|
* improvement. But we cannot postpone css_get(to) because if
|
|
|
* the process that has been moved to @to does swap-in, the
|
|
@@ -3590,60 +3572,57 @@ void mem_cgroup_print_bad_page(struct page *page)
|
|
|
}
|
|
|
#endif
|
|
|
|
|
|
+static DEFINE_MUTEX(memcg_limit_mutex);
|
|
|
+
|
|
|
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
|
|
|
- unsigned long long val)
|
|
|
+ unsigned long limit)
|
|
|
{
|
|
|
+ unsigned long curusage;
|
|
|
+ unsigned long oldusage;
|
|
|
+ bool enlarge = false;
|
|
|
int retry_count;
|
|
|
- int ret = 0;
|
|
|
- int children = mem_cgroup_count_children(memcg);
|
|
|
- u64 curusage, oldusage;
|
|
|
- int enlarge;
|
|
|
+ int ret;
|
|
|
|
|
|
/*
|
|
|
* For keeping hierarchical_reclaim simple, how long we should retry
|
|
|
* is depends on callers. We set our retry-count to be function
|
|
|
* of # of children which we should visit in this loop.
|
|
|
*/
|
|
|
- retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
|
|
|
+ retry_count = MEM_CGROUP_RECLAIM_RETRIES *
|
|
|
+ mem_cgroup_count_children(memcg);
|
|
|
|
|
|
- oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
|
|
|
+ oldusage = page_counter_read(&memcg->memory);
|
|
|
|
|
|
- enlarge = 0;
|
|
|
- while (retry_count) {
|
|
|
+ do {
|
|
|
if (signal_pending(current)) {
|
|
|
ret = -EINTR;
|
|
|
break;
|
|
|
}
|
|
|
- /*
|
|
|
- * Rather than hide all in some function, I do this in
|
|
|
- * open coded manner. You see what this really does.
|
|
|
- * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
|
|
|
- */
|
|
|
- mutex_lock(&set_limit_mutex);
|
|
|
- if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) {
|
|
|
+
|
|
|
+ mutex_lock(&memcg_limit_mutex);
|
|
|
+ if (limit > memcg->memsw.limit) {
|
|
|
+ mutex_unlock(&memcg_limit_mutex);
|
|
|
ret = -EINVAL;
|
|
|
- mutex_unlock(&set_limit_mutex);
|
|
|
break;
|
|
|
}
|
|
|
-
|
|
|
- if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val)
|
|
|
- enlarge = 1;
|
|
|
-
|
|
|
- ret = res_counter_set_limit(&memcg->res, val);
|
|
|
- mutex_unlock(&set_limit_mutex);
|
|
|
+ if (limit > memcg->memory.limit)
|
|
|
+ enlarge = true;
|
|
|
+ ret = page_counter_limit(&memcg->memory, limit);
|
|
|
+ mutex_unlock(&memcg_limit_mutex);
|
|
|
|
|
|
if (!ret)
|
|
|
break;
|
|
|
|
|
|
try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
|
|
|
|
|
|
- curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
|
|
|
+ curusage = page_counter_read(&memcg->memory);
|
|
|
/* Usage is reduced ? */
|
|
|
if (curusage >= oldusage)
|
|
|
retry_count--;
|
|
|
else
|
|
|
oldusage = curusage;
|
|
|
- }
|
|
|
+ } while (retry_count);
|
|
|
+
|
|
|
if (!ret && enlarge)
|
|
|
memcg_oom_recover(memcg);
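
The resize path keeps its shape: take the new memcg_limit_mutex (which replaces set_limit_mutex), refuse limits above the memsw limit, otherwise try to install the new limit and reclaim until usage fits or the retries run out. The skeleton below is a deliberately simplified, single-threaded sketch; the invariant check and the "only count a retry when usage did not shrink" detail of the real loop are omitted, and all names and helpers are invented:

#include <stdio.h>

static unsigned long usage = 120;               /* pages currently charged */

static int toy_set_limit(unsigned long *limit, unsigned long new_limit)
{
        if (usage > new_limit)
                return -1;                      /* still too big, caller must reclaim */
        *limit = new_limit;
        return 0;
}

static void toy_reclaim(void)
{
        if (usage >= 10)
                usage -= 10;                    /* pretend we freed 10 pages */
}

int main(void)
{
        unsigned long limit = ~0UL;
        int retries = 5;
        int ret;

        do {
                ret = toy_set_limit(&limit, 100);
                if (!ret)
                        break;
                toy_reclaim();
        } while (retries--);

        printf("ret=%d usage=%lu limit=%lu\n", ret, usage, limit);
        return 0;
}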
@@ -3651,52 +3630,53 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
|
|
|
}
|
|
|
|
|
|
static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
|
|
|
- unsigned long long val)
|
|
|
+ unsigned long limit)
|
|
|
{
|
|
|
+ unsigned long curusage;
|
|
|
+ unsigned long oldusage;
|
|
|
+ bool enlarge = false;
|
|
|
int retry_count;
|
|
|
- u64 oldusage, curusage;
|
|
|
- int children = mem_cgroup_count_children(memcg);
|
|
|
- int ret = -EBUSY;
|
|
|
- int enlarge = 0;
|
|
|
+ int ret;
|
|
|
|
|
|
/* see mem_cgroup_resize_res_limit */
|
|
|
- retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
|
|
|
- oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
|
|
|
- while (retry_count) {
|
|
|
+ retry_count = MEM_CGROUP_RECLAIM_RETRIES *
|
|
|
+ mem_cgroup_count_children(memcg);
|
|
|
+
|
|
|
+ oldusage = page_counter_read(&memcg->memsw);
|
|
|
+
|
|
|
+ do {
|
|
|
if (signal_pending(current)) {
|
|
|
ret = -EINTR;
|
|
|
break;
|
|
|
}
|
|
|
- /*
|
|
|
- * Rather than hide all in some function, I do this in
|
|
|
- * open coded manner. You see what this really does.
|
|
|
- * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
|
|
|
- */
|
|
|
- mutex_lock(&set_limit_mutex);
|
|
|
- if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) {
|
|
|
+
|
|
|
+ mutex_lock(&memcg_limit_mutex);
|
|
|
+ if (limit < memcg->memory.limit) {
|
|
|
+ mutex_unlock(&memcg_limit_mutex);
|
|
|
ret = -EINVAL;
|
|
|
- mutex_unlock(&set_limit_mutex);
|
|
|
break;
|
|
|
}
|
|
|
- if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val)
|
|
|
- enlarge = 1;
|
|
|
- ret = res_counter_set_limit(&memcg->memsw, val);
|
|
|
- mutex_unlock(&set_limit_mutex);
|
|
|
+ if (limit > memcg->memsw.limit)
|
|
|
+ enlarge = true;
|
|
|
+ ret = page_counter_limit(&memcg->memsw, limit);
|
|
|
+ mutex_unlock(&memcg_limit_mutex);
|
|
|
|
|
|
if (!ret)
|
|
|
break;
|
|
|
|
|
|
try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
|
|
|
|
|
|
- curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
|
|
|
+ curusage = page_counter_read(&memcg->memsw);
|
|
|
/* Usage is reduced ? */
|
|
|
if (curusage >= oldusage)
|
|
|
retry_count--;
|
|
|
else
|
|
|
oldusage = curusage;
|
|
|
- }
|
|
|
+ } while (retry_count);
|
|
|
+
|
|
|
if (!ret && enlarge)
|
|
|
memcg_oom_recover(memcg);
|
|
|
+
|
|
|
return ret;
|
|
|
}
|
|
|
|
|
@@ -3709,7 +3689,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
|
|
|
unsigned long reclaimed;
|
|
|
int loop = 0;
|
|
|
struct mem_cgroup_tree_per_zone *mctz;
|
|
|
- unsigned long long excess;
|
|
|
+ unsigned long excess;
|
|
|
unsigned long nr_scanned;
|
|
|
|
|
|
if (order > 0)
|
|
@@ -3763,7 +3743,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
|
|
|
} while (1);
|
|
|
}
|
|
|
__mem_cgroup_remove_exceeded(mz, mctz);
|
|
|
- excess = res_counter_soft_limit_excess(&mz->memcg->res);
|
|
|
+ excess = soft_limit_excess(mz->memcg);
|
|
|
/*
|
|
|
* One school of thought says that we should not add
|
|
|
* back the node to the tree if reclaim returns 0.
|
|
@@ -3856,7 +3836,6 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
|
|
|
static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
|
|
|
{
|
|
|
int node, zid;
|
|
|
- u64 usage;
|
|
|
|
|
|
do {
|
|
|
/* This is for making all *used* pages to be on LRU. */
|
|
@@ -3888,9 +3867,8 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
|
|
|
* right after the check. RES_USAGE should be safe as we always
|
|
|
* charge before adding to the LRU.
|
|
|
*/
|
|
|
- usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
|
|
|
- res_counter_read_u64(&memcg->kmem, RES_USAGE);
|
|
|
- } while (usage > 0);
|
|
|
+ } while (page_counter_read(&memcg->memory) -
|
|
|
+ page_counter_read(&memcg->kmem) > 0);
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -3930,7 +3908,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
|
|
|
/* we call try-to-free pages for make this cgroup empty */
|
|
|
lru_add_drain_all();
|
|
|
/* try to free all pages in this cgroup */
|
|
|
- while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
|
|
|
+ while (nr_retries && page_counter_read(&memcg->memory)) {
|
|
|
int progress;
|
|
|
|
|
|
if (signal_pending(current))
|
|
@@ -4001,8 +3979,8 @@ out:
|
|
|
return retval;
|
|
|
}
|
|
|
|
|
|
-static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
|
|
|
- enum mem_cgroup_stat_index idx)
|
|
|
+static unsigned long tree_stat(struct mem_cgroup *memcg,
|
|
|
+ enum mem_cgroup_stat_index idx)
|
|
|
{
|
|
|
struct mem_cgroup *iter;
|
|
|
long val = 0;
|
|
@@ -4020,55 +3998,72 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
|
|
|
{
|
|
|
u64 val;
|
|
|
|
|
|
- if (!mem_cgroup_is_root(memcg)) {
|
|
|
+ if (mem_cgroup_is_root(memcg)) {
|
|
|
+ val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
|
|
|
+ val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
|
|
|
+ if (swap)
|
|
|
+ val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
|
|
|
+ } else {
|
|
|
if (!swap)
|
|
|
- return res_counter_read_u64(&memcg->res, RES_USAGE);
|
|
|
+ val = page_counter_read(&memcg->memory);
|
|
|
else
|
|
|
- return res_counter_read_u64(&memcg->memsw, RES_USAGE);
|
|
|
+ val = page_counter_read(&memcg->memsw);
|
|
|
}
|
|
|
-
|
|
|
- /*
|
|
|
- * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
|
|
|
- * as well as in MEM_CGROUP_STAT_RSS_HUGE.
|
|
|
- */
|
|
|
- val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
|
|
|
- val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
|
|
|
-
|
|
|
- if (swap)
|
|
|
- val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
|
|
|
-
|
|
|
return val << PAGE_SHIFT;
|
|
|
}
|
|
|
|
|
|
+enum {
|
|
|
+ RES_USAGE,
|
|
|
+ RES_LIMIT,
|
|
|
+ RES_MAX_USAGE,
|
|
|
+ RES_FAILCNT,
|
|
|
+ RES_SOFT_LIMIT,
|
|
|
+};
|
|
|
|
|
|
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
|
|
|
struct cftype *cft)
|
|
|
{
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
|
|
- enum res_type type = MEMFILE_TYPE(cft->private);
|
|
|
- int name = MEMFILE_ATTR(cft->private);
|
|
|
+ struct page_counter *counter;
|
|
|
|
|
|
- switch (type) {
|
|
|
+ switch (MEMFILE_TYPE(cft->private)) {
|
|
|
case _MEM:
|
|
|
- if (name == RES_USAGE)
|
|
|
- return mem_cgroup_usage(memcg, false);
|
|
|
- return res_counter_read_u64(&memcg->res, name);
|
|
|
+ counter = &memcg->memory;
|
|
|
+ break;
|
|
|
case _MEMSWAP:
|
|
|
- if (name == RES_USAGE)
|
|
|
- return mem_cgroup_usage(memcg, true);
|
|
|
- return res_counter_read_u64(&memcg->memsw, name);
|
|
|
+ counter = &memcg->memsw;
|
|
|
+ break;
|
|
|
case _KMEM:
|
|
|
- return res_counter_read_u64(&memcg->kmem, name);
|
|
|
+ counter = &memcg->kmem;
|
|
|
break;
|
|
|
default:
|
|
|
BUG();
|
|
|
}
|
|
|
+
|
|
|
+ switch (MEMFILE_ATTR(cft->private)) {
|
|
|
+ case RES_USAGE:
|
|
|
+ if (counter == &memcg->memory)
|
|
|
+ return mem_cgroup_usage(memcg, false);
|
|
|
+ if (counter == &memcg->memsw)
|
|
|
+ return mem_cgroup_usage(memcg, true);
|
|
|
+ return (u64)page_counter_read(counter) * PAGE_SIZE;
|
|
|
+ case RES_LIMIT:
|
|
|
+ return (u64)counter->limit * PAGE_SIZE;
|
|
|
+ case RES_MAX_USAGE:
|
|
|
+ return (u64)counter->watermark * PAGE_SIZE;
|
|
|
+ case RES_FAILCNT:
|
|
|
+ return counter->failcnt;
|
|
|
+ case RES_SOFT_LIMIT:
|
|
|
+ return (u64)memcg->soft_limit * PAGE_SIZE;
|
|
|
+ default:
|
|
|
+ BUG();
|
|
|
+ }
|
|
|
}
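
mem_cgroup_read_u64() now dispatches first on the counter and then on the attribute, scaling page counts back to bytes for the user-visible files; the RES_* attribute names become a local enum since res_counter.h is gone. Casting to u64 before multiplying is what keeps a large page count from overflowing a 32-bit unsigned long, for example:

/* Pages are scaled to bytes for the cgroup files; the cast matters on
 * 32-bit, where unsigned long is 32 bits.  Sketch with made-up names. */
#include <stdio.h>
#include <stdint.h>

#define TOY_PAGE_SIZE 4096ULL

int main(void)
{
        unsigned long nr_pages = 2097152;       /* 8 GB worth of 4K pages */
        uint64_t bytes = (uint64_t)nr_pages * TOY_PAGE_SIZE;

        printf("%llu bytes\n", (unsigned long long)bytes);
        return 0;
}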
|
|
|
|
|
|
#ifdef CONFIG_MEMCG_KMEM
|
|
|
/* should be called with activate_kmem_mutex held */
|
|
|
static int __memcg_activate_kmem(struct mem_cgroup *memcg,
|
|
|
- unsigned long long limit)
|
|
|
+ unsigned long nr_pages)
|
|
|
{
|
|
|
int err = 0;
|
|
|
int memcg_id;
|
|
@@ -4115,7 +4110,7 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
|
|
|
* We couldn't have accounted to this cgroup, because it hasn't got the
|
|
|
* active bit set yet, so this should succeed.
|
|
|
*/
|
|
|
- err = res_counter_set_limit(&memcg->kmem, limit);
|
|
|
+ err = page_counter_limit(&memcg->kmem, nr_pages);
|
|
|
VM_BUG_ON(err);
|
|
|
|
|
|
static_key_slow_inc(&memcg_kmem_enabled_key);
|
|
@@ -4131,25 +4126,27 @@ out:
|
|
|
}
|
|
|
|
|
|
static int memcg_activate_kmem(struct mem_cgroup *memcg,
|
|
|
- unsigned long long limit)
|
|
|
+ unsigned long nr_pages)
|
|
|
{
|
|
|
int ret;
|
|
|
|
|
|
mutex_lock(&activate_kmem_mutex);
|
|
|
- ret = __memcg_activate_kmem(memcg, limit);
|
|
|
+ ret = __memcg_activate_kmem(memcg, nr_pages);
|
|
|
mutex_unlock(&activate_kmem_mutex);
|
|
|
return ret;
|
|
|
}
|
|
|
|
|
|
static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
|
|
|
- unsigned long long val)
|
|
|
+ unsigned long limit)
|
|
|
{
|
|
|
int ret;
|
|
|
|
|
|
+ mutex_lock(&memcg_limit_mutex);
|
|
|
if (!memcg_kmem_is_active(memcg))
|
|
|
- ret = memcg_activate_kmem(memcg, val);
|
|
|
+ ret = memcg_activate_kmem(memcg, limit);
|
|
|
else
|
|
|
- ret = res_counter_set_limit(&memcg->kmem, val);
|
|
|
+ ret = page_counter_limit(&memcg->kmem, limit);
|
|
|
+ mutex_unlock(&memcg_limit_mutex);
|
|
|
return ret;
|
|
|
}
|
|
|
|
|
@@ -4167,13 +4164,13 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
|
|
|
* after this point, because it has at least one child already.
|
|
|
*/
|
|
|
if (memcg_kmem_is_active(parent))
|
|
|
- ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
|
|
|
+ ret = __memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
|
|
|
mutex_unlock(&activate_kmem_mutex);
|
|
|
return ret;
|
|
|
}
|
|
|
#else
|
|
|
static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
|
|
|
- unsigned long long val)
|
|
|
+ unsigned long limit)
|
|
|
{
|
|
|
return -EINVAL;
|
|
|
}
|
|
@@ -4187,110 +4184,69 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
|
|
|
char *buf, size_t nbytes, loff_t off)
|
|
|
{
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
|
|
|
- enum res_type type;
|
|
|
- int name;
|
|
|
- unsigned long long val;
|
|
|
+ unsigned long nr_pages;
|
|
|
int ret;
|
|
|
|
|
|
buf = strstrip(buf);
|
|
|
- type = MEMFILE_TYPE(of_cft(of)->private);
|
|
|
- name = MEMFILE_ATTR(of_cft(of)->private);
|
|
|
+ ret = page_counter_memparse(buf, &nr_pages);
|
|
|
+ if (ret)
|
|
|
+ return ret;
|
|
|
|
|
|
- switch (name) {
|
|
|
+ switch (MEMFILE_ATTR(of_cft(of)->private)) {
|
|
|
case RES_LIMIT:
|
|
|
if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
|
|
|
ret = -EINVAL;
|
|
|
break;
|
|
|
}
|
|
|
- /* This function does all necessary parse...reuse it */
|
|
|
- ret = res_counter_memparse_write_strategy(buf, &val);
|
|
|
- if (ret)
|
|
|
+ switch (MEMFILE_TYPE(of_cft(of)->private)) {
|
|
|
+ case _MEM:
|
|
|
+ ret = mem_cgroup_resize_limit(memcg, nr_pages);
|
|
|
break;
|
|
|
- if (type == _MEM)
|
|
|
- ret = mem_cgroup_resize_limit(memcg, val);
|
|
|
- else if (type == _MEMSWAP)
|
|
|
- ret = mem_cgroup_resize_memsw_limit(memcg, val);
|
|
|
- else if (type == _KMEM)
|
|
|
- ret = memcg_update_kmem_limit(memcg, val);
|
|
|
- else
|
|
|
- return -EINVAL;
|
|
|
- break;
|
|
|
- case RES_SOFT_LIMIT:
|
|
|
- ret = res_counter_memparse_write_strategy(buf, &val);
|
|
|
- if (ret)
|
|
|
+ case _MEMSWAP:
|
|
|
+ ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
|
|
|
break;
|
|
|
- /*
|
|
|
- * For memsw, soft limits are hard to implement in terms
|
|
|
- * of semantics, for now, we support soft limits for
|
|
|
- * control without swap
|
|
|
- */
|
|
|
- if (type == _MEM)
|
|
|
- ret = res_counter_set_soft_limit(&memcg->res, val);
|
|
|
- else
|
|
|
- ret = -EINVAL;
|
|
|
+ case _KMEM:
|
|
|
+ ret = memcg_update_kmem_limit(memcg, nr_pages);
|
|
|
+ break;
|
|
|
+ }
|
|
|
break;
|
|
|
- default:
|
|
|
- ret = -EINVAL; /* should be BUG() ? */
|
|
|
+ case RES_SOFT_LIMIT:
|
|
|
+ memcg->soft_limit = nr_pages;
|
|
|
+ ret = 0;
|
|
|
break;
|
|
|
}
|
|
|
return ret ?: nbytes;
|
|
|
}
|
|
|
|
|
|
-static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
|
|
|
- unsigned long long *mem_limit, unsigned long long *memsw_limit)
|
|
|
-{
|
|
|
- unsigned long long min_limit, min_memsw_limit, tmp;
|
|
|
-
|
|
|
- min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
|
|
|
- min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
|
|
|
- if (!memcg->use_hierarchy)
|
|
|
- goto out;
|
|
|
-
|
|
|
- while (memcg->css.parent) {
|
|
|
- memcg = mem_cgroup_from_css(memcg->css.parent);
|
|
|
- if (!memcg->use_hierarchy)
|
|
|
- break;
|
|
|
- tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
|
|
|
- min_limit = min(min_limit, tmp);
|
|
|
- tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
|
|
|
- min_memsw_limit = min(min_memsw_limit, tmp);
|
|
|
- }
|
|
|
-out:
|
|
|
- *mem_limit = min_limit;
|
|
|
- *memsw_limit = min_memsw_limit;
|
|
|
-}
|
|
|
-
|
|
|
static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
|
|
|
size_t nbytes, loff_t off)
|
|
|
{
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
|
|
|
- int name;
|
|
|
- enum res_type type;
|
|
|
+ struct page_counter *counter;
|
|
|
|
|
|
- type = MEMFILE_TYPE(of_cft(of)->private);
|
|
|
- name = MEMFILE_ATTR(of_cft(of)->private);
|
|
|
+ switch (MEMFILE_TYPE(of_cft(of)->private)) {
|
|
|
+ case _MEM:
|
|
|
+ counter = &memcg->memory;
|
|
|
+ break;
|
|
|
+ case _MEMSWAP:
|
|
|
+ counter = &memcg->memsw;
|
|
|
+ break;
|
|
|
+ case _KMEM:
|
|
|
+ counter = &memcg->kmem;
|
|
|
+ break;
|
|
|
+ default:
|
|
|
+ BUG();
|
|
|
+ }
|
|
|
|
|
|
- switch (name) {
|
|
|
+ switch (MEMFILE_ATTR(of_cft(of)->private)) {
|
|
|
case RES_MAX_USAGE:
|
|
|
- if (type == _MEM)
|
|
|
- res_counter_reset_max(&memcg->res);
|
|
|
- else if (type == _MEMSWAP)
|
|
|
- res_counter_reset_max(&memcg->memsw);
|
|
|
- else if (type == _KMEM)
|
|
|
- res_counter_reset_max(&memcg->kmem);
|
|
|
- else
|
|
|
- return -EINVAL;
|
|
|
+ page_counter_reset_watermark(counter);
|
|
|
break;
|
|
|
case RES_FAILCNT:
|
|
|
- if (type == _MEM)
|
|
|
- res_counter_reset_failcnt(&memcg->res);
|
|
|
- else if (type == _MEMSWAP)
|
|
|
- res_counter_reset_failcnt(&memcg->memsw);
|
|
|
- else if (type == _KMEM)
|
|
|
- res_counter_reset_failcnt(&memcg->kmem);
|
|
|
- else
|
|
|
- return -EINVAL;
|
|
|
+ counter->failcnt = 0;
|
|
|
break;
|
|
|
+ default:
|
|
|
+ BUG();
|
|
|
}
|
|
|
|
|
|
return nbytes;
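
mem_cgroup_reset() now picks the counter once and resets the requested field directly: page_counter_reset_watermark() for max_usage_in_bytes and a plain failcnt = 0 for failcnt. A toy illustration of what the watermark tracks (the sketch assumes a reset simply pulls the recorded peak back down to the current usage):

#include <stdio.h>

struct toy_counter {
        unsigned long usage;
        unsigned long watermark;
        unsigned long failcnt;
};

static void toy_charge(struct toy_counter *c, unsigned long nr_pages)
{
        c->usage += nr_pages;
        if (c->usage > c->watermark)
                c->watermark = c->usage;        /* remember the historical peak */
}

int main(void)
{
        struct toy_counter c = { 0, 0, 0 };

        toy_charge(&c, 50);
        c.usage -= 30;
        toy_charge(&c, 10);
        printf("usage %lu, peak %lu\n", c.usage, c.watermark); /* 30, 50 */
        c.watermark = c.usage;          /* what a max-usage reset does here */
        printf("peak after reset %lu\n", c.watermark);          /* 30 */
        return 0;
}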
@@ -4387,6 +4343,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
|
|
|
static int memcg_stat_show(struct seq_file *m, void *v)
|
|
|
{
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
|
|
|
+ unsigned long memory, memsw;
|
|
|
struct mem_cgroup *mi;
|
|
|
unsigned int i;
|
|
|
|
|
@@ -4406,14 +4363,16 @@ static int memcg_stat_show(struct seq_file *m, void *v)
|
|
|
mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
|
|
|
|
|
|
/* Hierarchical information */
|
|
|
- {
|
|
|
- unsigned long long limit, memsw_limit;
|
|
|
- memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
|
|
|
- seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
|
|
|
- if (do_swap_account)
|
|
|
- seq_printf(m, "hierarchical_memsw_limit %llu\n",
|
|
|
- memsw_limit);
|
|
|
+ memory = memsw = PAGE_COUNTER_MAX;
|
|
|
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
|
|
|
+ memory = min(memory, mi->memory.limit);
|
|
|
+ memsw = min(memsw, mi->memsw.limit);
|
|
|
}
|
|
|
+ seq_printf(m, "hierarchical_memory_limit %llu\n",
|
|
|
+ (u64)memory * PAGE_SIZE);
|
|
|
+ if (do_swap_account)
|
|
|
+ seq_printf(m, "hierarchical_memsw_limit %llu\n",
|
|
|
+ (u64)memsw * PAGE_SIZE);
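
The removed memcg_get_hierarchical_limit() helper is folded into memcg_stat_show() as a plain walk up the parents, taking the minimum of each level's limit; the hierarchy linkage comes from the counters' parent pointers set up in page_counter_init(). A sketch of the same min-walk with invented names:

#include <stdio.h>

struct toy_group {
        unsigned long limit;            /* pages */
        struct toy_group *parent;
};

static unsigned long toy_hierarchical_limit(struct toy_group *g)
{
        unsigned long min_limit = ~0UL;

        for (; g; g = g->parent)
                if (g->limit < min_limit)
                        min_limit = g->limit;
        return min_limit;
}

int main(void)
{
        struct toy_group root = { 262144, NULL };       /* 1G of 4K pages */
        struct toy_group child = { 524288, &root };     /* 2G, capped by root */

        printf("%lu pages\n", toy_hierarchical_limit(&child)); /* 262144 */
        return 0;
}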
|
|
|
|
|
|
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
|
|
|
long long val = 0;
|
|
@@ -4497,7 +4456,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
|
|
|
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
|
|
|
{
|
|
|
struct mem_cgroup_threshold_ary *t;
|
|
|
- u64 usage;
|
|
|
+ unsigned long usage;
|
|
|
int i;
|
|
|
|
|
|
rcu_read_lock();
|
|
@@ -4596,10 +4555,11 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
|
|
|
{
|
|
|
struct mem_cgroup_thresholds *thresholds;
|
|
|
struct mem_cgroup_threshold_ary *new;
|
|
|
- u64 threshold, usage;
|
|
|
+ unsigned long threshold;
|
|
|
+ unsigned long usage;
|
|
|
int i, size, ret;
|
|
|
|
|
|
- ret = res_counter_memparse_write_strategy(args, &threshold);
|
|
|
+ ret = page_counter_memparse(args, &threshold);
|
|
|
if (ret)
|
|
|
return ret;
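
Threshold values (and, in the earlier mem_cgroup_write() hunk, limits) written from userspace are still given in bytes but are now parsed into a page count by page_counter_memparse(). The sketch below only approximates that: the k/m/g suffix handling and the division by the page size are assumptions about its behaviour, and the real helper also understands "-1" as "no limit":

#include <stdio.h>
#include <stdlib.h>

#define TOY_PAGE_SIZE 4096UL

static int toy_memparse(const char *buf, unsigned long *nr_pages)
{
        char *end;
        unsigned long long bytes = strtoull(buf, &end, 10);

        switch (*end) {
        case 'k': case 'K': bytes <<= 10; break;
        case 'm': case 'M': bytes <<= 20; break;
        case 'g': case 'G': bytes <<= 30; break;
        case '\0': break;
        default: return -1;
        }
        *nr_pages = bytes / TOY_PAGE_SIZE;
        return 0;
}

int main(void)
{
        unsigned long pages;

        if (!toy_memparse("512M", &pages))
                printf("%lu pages\n", pages);   /* 131072 */
        return 0;
}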
@@ -4689,7 +4649,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
|
|
|
{
|
|
|
struct mem_cgroup_thresholds *thresholds;
|
|
|
struct mem_cgroup_threshold_ary *new;
|
|
|
- u64 usage;
|
|
|
+ unsigned long usage;
|
|
|
int i, j, size;
|
|
|
|
|
|
mutex_lock(&memcg->thresholds_lock);
|
|
@@ -4883,7 +4843,7 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
|
|
|
|
|
|
memcg_kmem_mark_dead(memcg);
|
|
|
|
|
|
- if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
|
|
|
+ if (page_counter_read(&memcg->kmem))
|
|
|
return;
|
|
|
|
|
|
if (memcg_kmem_test_and_clear_dead(memcg))
|
|
@@ -5363,9 +5323,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
|
|
|
*/
|
|
|
struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
|
|
|
{
|
|
|
- if (!memcg->res.parent)
|
|
|
+ if (!memcg->memory.parent)
|
|
|
return NULL;
|
|
|
- return mem_cgroup_from_res_counter(memcg->res.parent, res);
|
|
|
+ return mem_cgroup_from_counter(memcg->memory.parent, memory);
|
|
|
}
|
|
|
EXPORT_SYMBOL(parent_mem_cgroup);
|
|
|
|
|
@@ -5410,9 +5370,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
|
|
|
/* root ? */
|
|
|
if (parent_css == NULL) {
|
|
|
root_mem_cgroup = memcg;
|
|
|
- res_counter_init(&memcg->res, NULL);
|
|
|
- res_counter_init(&memcg->memsw, NULL);
|
|
|
- res_counter_init(&memcg->kmem, NULL);
|
|
|
+ page_counter_init(&memcg->memory, NULL);
|
|
|
+ page_counter_init(&memcg->memsw, NULL);
|
|
|
+ page_counter_init(&memcg->kmem, NULL);
|
|
|
}
|
|
|
|
|
|
memcg->last_scanned_node = MAX_NUMNODES;
|
|
@@ -5451,18 +5411,18 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
|
|
|
memcg->swappiness = mem_cgroup_swappiness(parent);
|
|
|
|
|
|
if (parent->use_hierarchy) {
|
|
|
- res_counter_init(&memcg->res, &parent->res);
|
|
|
- res_counter_init(&memcg->memsw, &parent->memsw);
|
|
|
- res_counter_init(&memcg->kmem, &parent->kmem);
|
|
|
+ page_counter_init(&memcg->memory, &parent->memory);
|
|
|
+ page_counter_init(&memcg->memsw, &parent->memsw);
|
|
|
+ page_counter_init(&memcg->kmem, &parent->kmem);
|
|
|
|
|
|
/*
|
|
|
* No need to take a reference to the parent because cgroup
|
|
|
* core guarantees its existence.
|
|
|
*/
|
|
|
} else {
|
|
|
- res_counter_init(&memcg->res, NULL);
|
|
|
- res_counter_init(&memcg->memsw, NULL);
|
|
|
- res_counter_init(&memcg->kmem, NULL);
|
|
|
+ page_counter_init(&memcg->memory, NULL);
|
|
|
+ page_counter_init(&memcg->memsw, NULL);
|
|
|
+ page_counter_init(&memcg->kmem, NULL);
|
|
|
/*
|
|
|
* Deeper hierachy with use_hierarchy == false doesn't make
|
|
|
* much sense so let cgroup subsystem know about this
|
|
@@ -5544,7 +5504,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
|
|
|
/*
|
|
|
* XXX: css_offline() would be where we should reparent all
|
|
|
* memory to prepare the cgroup for destruction. However,
|
|
|
- * memcg does not do css_tryget_online() and res_counter charging
|
|
|
+ * memcg does not do css_tryget_online() and page_counter charging
|
|
|
* under the same RCU lock region, which means that charging
|
|
|
* could race with offlining. Offlining only happens to
|
|
|
* cgroups with no tasks in them but charges can show up
|
|
@@ -5564,7 +5524,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
|
|
|
* call_rcu()
|
|
|
* offline_css()
|
|
|
* reparent_charges()
|
|
|
- * res_counter_charge()
|
|
|
+ * page_counter_try_charge()
|
|
|
* css_put()
|
|
|
* css_free()
|
|
|
* pc->mem_cgroup = dead memcg
|
|
@@ -5599,10 +5559,10 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
|
|
|
{
|
|
|
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
|
|
|
|
|
- mem_cgroup_resize_limit(memcg, ULLONG_MAX);
|
|
|
- mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX);
|
|
|
- memcg_update_kmem_limit(memcg, ULLONG_MAX);
|
|
|
- res_counter_set_soft_limit(&memcg->res, ULLONG_MAX);
|
|
|
+ mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
|
|
|
+ mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
|
|
|
+ memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
|
|
|
+ memcg->soft_limit = 0;
|
|
|
}
|
|
|
|
|
|
#ifdef CONFIG_MMU
|
|
@@ -5916,19 +5876,18 @@ static void __mem_cgroup_clear_mc(void)
|
|
|
if (mc.moved_swap) {
|
|
|
/* uncharge swap account from the old cgroup */
|
|
|
if (!mem_cgroup_is_root(mc.from))
|
|
|
- res_counter_uncharge(&mc.from->memsw,
|
|
|
- PAGE_SIZE * mc.moved_swap);
|
|
|
-
|
|
|
- for (i = 0; i < mc.moved_swap; i++)
|
|
|
- css_put(&mc.from->css);
|
|
|
+ page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
|
|
|
|
|
|
/*
|
|
|
- * we charged both to->res and to->memsw, so we should
|
|
|
- * uncharge to->res.
|
|
|
+ * we charged both to->memory and to->memsw, so we
|
|
|
+ * should uncharge to->memory.
|
|
|
*/
|
|
|
if (!mem_cgroup_is_root(mc.to))
|
|
|
- res_counter_uncharge(&mc.to->res,
|
|
|
- PAGE_SIZE * mc.moved_swap);
|
|
|
+ page_counter_uncharge(&mc.to->memory, mc.moved_swap);
|
|
|
+
|
|
|
+ for (i = 0; i < mc.moved_swap; i++)
|
|
|
+ css_put(&mc.from->css);
|
|
|
+
|
|
|
/* we've already done css_get(mc.to) */
|
|
|
mc.moved_swap = 0;
|
|
|
}
|
|
@@ -6294,7 +6253,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
|
|
|
memcg = mem_cgroup_lookup(id);
|
|
|
if (memcg) {
|
|
|
if (!mem_cgroup_is_root(memcg))
|
|
|
- res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
|
|
|
+ page_counter_uncharge(&memcg->memsw, 1);
|
|
|
mem_cgroup_swap_statistics(memcg, false);
|
|
|
css_put(&memcg->css);
|
|
|
}
|
|
@@ -6460,11 +6419,9 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
|
|
|
|
|
|
if (!mem_cgroup_is_root(memcg)) {
|
|
|
if (nr_mem)
|
|
|
- res_counter_uncharge(&memcg->res,
|
|
|
- nr_mem * PAGE_SIZE);
|
|
|
+ page_counter_uncharge(&memcg->memory, nr_mem);
|
|
|
if (nr_memsw)
|
|
|
- res_counter_uncharge(&memcg->memsw,
|
|
|
- nr_memsw * PAGE_SIZE);
|
|
|
+ page_counter_uncharge(&memcg->memsw, nr_memsw);
|
|
|
memcg_oom_recover(memcg);
|
|
|
}