
memcg: oom kill disable and oom status

This adds a feature to disable the oom-killer for a memcg.  When it is
disabled, tasks under the memcg are not killed on OOM; instead they stop
(sleep) until the situation is resolved.

We now have an oom-notifier for memcg, and the world around the memcg is
not out of memory: a memcg's out-of-memory condition just means the memcg
hit its limit.  An administrator or a management daemon can therefore
recover the situation by

	- killing some processes
	- enlarging the limit, or adding more swap
	- migrating some tasks
	- removing file caches on tmpfs (difficult?)

Unlike with the oom-killer, you can gather enough information (with gcore,
ps, etc.) before killing any tasks.
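
For illustration, a management daemon's recovery could look roughly like
this (a sketch only; the mount point /cgroup/A, the 512M value, and <pid>
are placeholders, not part of this patch):

	# cat /cgroup/A/memory.oom_control              # check OOM status
	oom_kill_disable 1
	under_oom 1
	# cat /cgroup/A/tasks                           # list the (now sleeping) tasks
	# gcore -o /tmp/A-core <pid>                    # inspect one of them before deciding
	# echo 512M > /cgroup/A/memory.limit_in_bytes   # enlarge the limit; waiters are woken up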

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
KAMEZAWA Hiroyuki authored 15 years ago
Commit 3c11ecf448
2 changed files with 117 additions and 19 deletions:
  Documentation/cgroups/memory.txt  (+23, -0)
  mm/memcontrol.c                   (+94, -19)

Documentation/cgroups/memory.txt  (+23, -0)

@@ -493,6 +493,8 @@ It's applicable for root and non-root cgroup.
 
 10. OOM Control
 
+The memory.oom_control file is for OOM notification and other controls.
+
 Memory controler implements oom notifier using cgroup notification
 API (See cgroups.txt). It allows to register multiple oom notification
 delivery and gets notification when oom happens.
@@ -505,6 +507,27 @@ To register a notifier, application need:
 Application will be notifier through eventfd when oom happens.
 OOM notification doesn't work for root cgroup.
 
+You can disable the oom-killer by writing "1" to the memory.oom_control
+file, as:
+	# echo 1 > memory.oom_control
+
+This operation is allowed only for the top cgroup of a sub-hierarchy.
+If the oom-killer is disabled, tasks under the cgroup will hang/sleep
+on the memcg's oom-waitqueue when they request accountable memory.
+
+To let them run again, you have to relax the memcg's oom situation by
+	* enlarging the limit or reducing usage.
+To reduce usage,
+	* kill some tasks.
+	* move some tasks to another group with account migration.
+	* remove some file caches (on tmpfs?).
+
+Then, the stopped tasks will run again.
+
+On reading, the current OOM status is shown:
+	oom_kill_disable 0 or 1 (if 1, the oom-killer is disabled)
+	under_oom	 0 or 1 (if 1, the memcg is under OOM and tasks
+				 may be stopped)
 
 11. TODO
 

mm/memcontrol.c  (+94, -19)

@@ -214,6 +214,8 @@ struct mem_cgroup {
 	atomic_t	refcnt;
 
 	unsigned int	swappiness;
+	/* OOM-Killer disable */
+	int		oom_kill_disable;
 
 	/* set when res.limit == memsw.limit */
 	bool		memsw_is_minimum;
@@ -235,7 +237,6 @@ struct mem_cgroup {
 	 * mem_cgroup ? And what type of charges should we move ?
 	 */
 	unsigned long 	move_charge_at_immigrate;
-
 	/*
 	 * percpu counter.
 	 */
@@ -1342,20 +1343,26 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
 	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
 }
 
+static void memcg_oom_recover(struct mem_cgroup *mem)
+{
+	if (mem->oom_kill_disable && atomic_read(&mem->oom_lock))
+		memcg_wakeup_oom(mem);
+}
+
 /*
  * try to call OOM killer. returns false if we should exit memory-reclaim loop.
  */
 bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 {
 	struct oom_wait_info owait;
-	bool locked;
+	bool locked, need_to_kill;
 
 	owait.mem = mem;
 	owait.wait.flags = 0;
 	owait.wait.func = memcg_oom_wake_function;
 	owait.wait.private = current;
 	INIT_LIST_HEAD(&owait.wait.task_list);
-
+	need_to_kill = true;
 	/* At first, try to OOM lock hierarchy under mem.*/
 	mutex_lock(&memcg_oom_mutex);
 	locked = mem_cgroup_oom_lock(mem);
@@ -1364,15 +1371,17 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
 	 * under OOM is always welcomed, use TASK_KILLABLE here.
 	 */
-	if (!locked)
-		prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-	else
+	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+	if (!locked || mem->oom_kill_disable)
+		need_to_kill = false;
+	if (locked)
 		mem_cgroup_oom_notify(mem);
 	mutex_unlock(&memcg_oom_mutex);
 
-	if (locked)
+	if (need_to_kill) {
+		finish_wait(&memcg_oom_waitq, &owait.wait);
 		mem_cgroup_out_of_memory(mem, mask);
-	else {
+	} else {
 		schedule();
 		finish_wait(&memcg_oom_waitq, &owait.wait);
 	}
@@ -2162,15 +2171,6 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
 	/* If swapout, usage of swap doesn't decrease */
 	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
 		uncharge_memsw = false;
-	/*
-	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
-	 * In those cases, all pages freed continously can be expected to be in
-	 * the same cgroup and we have chance to coalesce uncharges.
-	 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
-	 * because we want to do uncharge as soon as possible.
-	 */
-	if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
-		goto direct_uncharge;
 
 	batch = &current->memcg_batch;
 	/*
@@ -2180,6 +2180,17 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
 	 */
 	if (!batch->memcg)
 		batch->memcg = mem;
+	/*
+	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
+	 * In those cases, all pages freed continously can be expected to be in
+	 * the same cgroup and we have chance to coalesce uncharges.
+	 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
+	 * because we want to do uncharge as soon as possible.
+	 */
+
+	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
+		goto direct_uncharge;
+
 	/*
 	 * In typical case, batch->memcg == mem. This means we can
 	 * merge a series of uncharges to an uncharge of res_counter.
@@ -2196,6 +2207,8 @@ direct_uncharge:
 	res_counter_uncharge(&mem->res, PAGE_SIZE);
 	if (uncharge_memsw)
 		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+	if (unlikely(batch->memcg != mem))
+		memcg_oom_recover(mem);
 	return;
 }
 
@@ -2332,6 +2345,7 @@ void mem_cgroup_uncharge_end(void)
 		res_counter_uncharge(&batch->memcg->res, batch->bytes);
 	if (batch->memsw_bytes)
 		res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
+	memcg_oom_recover(batch->memcg);
 	/* forget this pointer (for sanity check) */
 	batch->memcg = NULL;
 }
@@ -2568,10 +2582,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 				unsigned long long val)
 {
 	int retry_count;
-	u64 memswlimit;
+	u64 memswlimit, memlimit;
 	int ret = 0;
 	int children = mem_cgroup_count_children(memcg);
 	u64 curusage, oldusage;
+	int enlarge;
 
 	/*
 	 * For keeping hierarchical_reclaim simple, how long we should retry
@@ -2582,6 +2597,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 
 	oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
 
+	enlarge = 0;
 	while (retry_count) {
 		if (signal_pending(current)) {
 			ret = -EINTR;
@@ -2599,6 +2615,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 			mutex_unlock(&set_limit_mutex);
 			break;
 		}
+
+		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+		if (memlimit < val)
+			enlarge = 1;
+
 		ret = res_counter_set_limit(&memcg->res, val);
 		if (!ret) {
 			if (memswlimit == val)
@@ -2620,6 +2641,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 		else
 			oldusage = curusage;
 	}
+	if (!ret && enlarge)
+		memcg_oom_recover(memcg);
 
 	return ret;
 }
@@ -2628,9 +2651,10 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 					unsigned long long val)
 {
 	int retry_count;
-	u64 memlimit, oldusage, curusage;
+	u64 memlimit, memswlimit, oldusage, curusage;
 	int children = mem_cgroup_count_children(memcg);
 	int ret = -EBUSY;
+	int enlarge = 0;
 
 	/* see mem_cgroup_resize_res_limit */
  	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
@@ -2652,6 +2676,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 			mutex_unlock(&set_limit_mutex);
 			break;
 		}
+		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+		if (memswlimit < val)
+			enlarge = 1;
 		ret = res_counter_set_limit(&memcg->memsw, val);
 		if (!ret) {
 			if (memlimit == val)
@@ -2674,6 +2701,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 		else
 			oldusage = curusage;
 	}
+	if (!ret && enlarge)
+		memcg_oom_recover(memcg);
 	return ret;
 }
 
@@ -2865,6 +2894,7 @@ move_account:
 			if (ret)
 				break;
 		}
+		memcg_oom_recover(mem);
 		/* it seems parent cgroup doesn't have enough mem */
 		if (ret == -ENOMEM)
 			goto try_to_free;
@@ -3645,6 +3675,46 @@ static int mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
 	return 0;
 }
 
+static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
+	struct cftype *cft,  struct cgroup_map_cb *cb)
+{
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+
+	cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
+
+	if (atomic_read(&mem->oom_lock))
+		cb->fill(cb, "under_oom", 1);
+	else
+		cb->fill(cb, "under_oom", 0);
+	return 0;
+}
+
+/* Writable only for the top cgroup of a sub-hierarchy;
+ * see the use_hierarchy checks below. */
+static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
+	struct cftype *cft, u64 val)
+{
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
+	struct mem_cgroup *parent;
+
+	/* cannot set to root cgroup and only 0 and 1 are allowed */
+	if (!cgrp->parent || !((val == 0) || (val == 1)))
+		return -EINVAL;
+
+	parent = mem_cgroup_from_cont(cgrp->parent);
+
+	cgroup_lock();
+	/* oom-kill-disable is a flag for subhierarchy. */
+	if ((parent->use_hierarchy) ||
+	    (mem->use_hierarchy && !list_empty(&cgrp->children))) {
+		cgroup_unlock();
+		return -EINVAL;
+	}
+	mem->oom_kill_disable = val;
+	cgroup_unlock();
+	return 0;
+}
+
 static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
@@ -3702,6 +3772,8 @@ static struct cftype mem_cgroup_files[] = {
 	},
 	{
 		.name = "oom_control",
+		.read_map = mem_cgroup_oom_control_read,
+		.write_u64 = mem_cgroup_oom_control_write,
 		.register_event = mem_cgroup_oom_register_event,
 		.unregister_event = mem_cgroup_oom_unregister_event,
 		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
@@ -3943,6 +4015,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
 		mem->use_hierarchy = parent->use_hierarchy;
+		mem->oom_kill_disable = parent->oom_kill_disable;
 	}
 
 	if (parent && parent->use_hierarchy) {
@@ -4215,6 +4288,7 @@ static void mem_cgroup_clear_mc(void)
 	if (mc.precharge) {
 		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
 		mc.precharge = 0;
+		memcg_oom_recover(mc.to);
 	}
 	/*
 	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
@@ -4223,6 +4297,7 @@ static void mem_cgroup_clear_mc(void)
 	if (mc.moved_charge) {
 		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
 		mc.moved_charge = 0;
+		memcg_oom_recover(mc.from);
 	}
 	/* we must fixup refcnts and charges */
 	if (mc.moved_swap) {
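
As an illustration of the new oom_control write handler and the inheritance
added in mem_cgroup_create() above, a session could look roughly like this
(a sketch only; the mount point /cgroup and the group names A and B are
placeholders, not part of this patch):

	# mkdir /cgroup/A
	# echo 1 > /cgroup/A/memory.use_hierarchy
	# echo 1 > /cgroup/A/memory.oom_control      # allowed: A is the top of its sub-hierarchy
	# mkdir /cgroup/A/B
	# cat /cgroup/A/B/memory.oom_control         # the child inherits the setting
	oom_kill_disable 1
	under_oom 0
	# echo 0 > /cgroup/A/B/memory.oom_control    # fails with -EINVAL: B is not the hierarchy top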