9 년 전 · fb0dc5f129
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -7,7 +7,7 @@ This is the authoritative documentation on the design, interface and
 
				 conventions of cgroup v2.  It describes all userland-visible aspects
			
 
				 of cgroup including core and specific controller behaviors.  All
			
 
				 future changes must be reflected in this document.  Documentation for
			
 
				-v1 is available under Documentation/cgroup-legacy/.
			
 
				+v1 is available under Documentation/cgroup-v1/.
			
 
				 
			
 
				 CONTENTS
			
 
				 
			
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -127,6 +127,12 @@ struct cgroup_subsys_state {
 
				 	 */
			
 
				 	u64 serial_nr;
			
 
				 
			
 
				+	/*
			
 
				+	 * Incremented by online self and children.  Used to guarantee that
			
 
				+	 * parents are not offlined before their children.
			
 
				+	 */
			
 
				+	atomic_t online_cnt;
			
 
				+
			
 
				 	/* percpu_ref killing and RCU release */
			
 
				 	struct rcu_head rcu_head;
			
 
				 	struct work_struct destroy_work;
			
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -137,6 +137,8 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 
				 	task_unlock(current);
			
 
				 }
			
 
				 
			
 
				+extern void cpuset_post_attach_flush(void);
			
 
				+
			
 
				 #else /* !CONFIG_CPUSETS */
			
 
				 
			
 
				 static inline bool cpusets_enabled(void) { return false; }
			
@@ -243,6 +245,10 @@ static inline bool read_mems_allowed_retry(unsigned int seq)
 
				 	return false;
			
 
				 }
			
 
				 
			
 
				+static inline void cpuset_post_attach_flush(void)
			
 
				+{
			
 
				+}
			
 
				+
			
 
				 #endif /* !CONFIG_CPUSETS */
			
 
				 
			
 
				 #endif /* _LINUX_CPUSET_H */
			
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -58,6 +58,7 @@
 
				 #include <linux/kthread.h>
			
 
				 #include <linux/delay.h>
			
 
				 #include <linux/atomic.h>
			
 
				+#include <linux/cpuset.h>
			
 
				 #include <net/sock.h>
			
 
				 
			
 
				 /*
			
@@ -2739,6 +2740,7 @@ out_unlock_rcu:
 
				 out_unlock_threadgroup:
			
 
				 	percpu_up_write(&cgroup_threadgroup_rwsem);
			
 
				 	cgroup_kn_unlock(of->kn);
			
 
				+	cpuset_post_attach_flush();
			
 
				 	return ret ?: nbytes;
			
 
				 }
			
 
				 
			
@@ -4655,14 +4657,15 @@ static void css_free_work_fn(struct work_struct *work)
 
				 
			
 
				 	if (ss) {
			
 
				 		/* css free path */
			
 
				+		struct cgroup_subsys_state *parent = css->parent;
			
 
				 		int id = css->id;
			
 
				 
			
 
				-		if (css->parent)
			
 
				-			css_put(css->parent);
			
 
				-
			
 
				 		ss->css_free(css);
			
 
				 		cgroup_idr_remove(&ss->css_idr, id);
			
 
				 		cgroup_put(cgrp);
			
 
				+
			
 
				+		if (parent)
			
 
				+			css_put(parent);
			
 
				 	} else {
			
 
				 		/* cgroup free path */
			
 
				 		atomic_dec(&cgrp->root->nr_cgrps);
			
@@ -4758,6 +4761,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
 
				 	INIT_LIST_HEAD(&css->sibling);
			
 
				 	INIT_LIST_HEAD(&css->children);
			
 
				 	css->serial_nr = css_serial_nr_next++;
			
 
				+	atomic_set(&css->online_cnt, 0);
			
 
				 
			
 
				 	if (cgroup_parent(cgrp)) {
			
 
				 		css->parent = cgroup_css(cgroup_parent(cgrp), ss);
			
@@ -4780,6 +4784,10 @@ static int online_css(struct cgroup_subsys_state *css)
 
				 	if (!ret) {
			
 
				 		css->flags |= CSS_ONLINE;
			
 
				 		rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
			
 
				+
			
 
				+		atomic_inc(&css->online_cnt);
			
 
				+		if (css->parent)
			
 
				+			atomic_inc(&css->parent->online_cnt);
			
 
				 	}
			
 
				 	return ret;
			
 
				 }
			
@@ -5017,10 +5025,15 @@ static void css_killed_work_fn(struct work_struct *work)
 
				 		container_of(work, struct cgroup_subsys_state, destroy_work);
			
 
				 
			
 
				 	mutex_lock(&cgroup_mutex);
			
 
				-	offline_css(css);
			
 
				-	mutex_unlock(&cgroup_mutex);
			
 
				 
			
 
				-	css_put(css);
			
 
				+	do {
			
 
				+		offline_css(css);
			
 
				+		css_put(css);
			
 
				+		/* @css can't go away while we're holding cgroup_mutex */
			
 
				+		css = css->parent;
			
 
				+	} while (css && atomic_dec_and_test(&css->online_cnt));
			
 
				+
			
 
				+	mutex_unlock(&cgroup_mutex);
			
 
				 }
			
 
				 
			
 
				 /* css kill confirmation processing requires process context, bounce */
			
@@ -5029,8 +5042,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
 
				 	struct cgroup_subsys_state *css =
			
 
				 		container_of(ref, struct cgroup_subsys_state, refcnt);
			
 
				 
			
 
				-	INIT_WORK(&css->destroy_work, css_killed_work_fn);
			
 
				-	queue_work(cgroup_destroy_wq, &css->destroy_work);
			
 
				+	if (atomic_dec_and_test(&css->online_cnt)) {
			
 
				+		INIT_WORK(&css->destroy_work, css_killed_work_fn);
			
 
				+		queue_work(cgroup_destroy_wq, &css->destroy_work);
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 /**
			
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -287,6 +287,8 @@ static struct cpuset top_cpuset = {
 
				 static DEFINE_MUTEX(cpuset_mutex);
			
 
				 static DEFINE_SPINLOCK(callback_lock);
			
 
				 
			
 
				+static struct workqueue_struct *cpuset_migrate_mm_wq;
			
 
				+
			
 
				 /*
			
 
				  * CPU / memory hotplug is handled asynchronously.
			
 
				  */
			
@@ -972,31 +974,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * cpuset_migrate_mm
			
 
				- *
			
 
				- *    Migrate memory region from one set of nodes to another.
			
 
				- *
			
 
				- *    Temporarilly set tasks mems_allowed to target nodes of migration,
			
 
				- *    so that the migration code can allocate pages on these nodes.
			
 
				- *
			
 
				- *    While the mm_struct we are migrating is typically from some
			
 
				- *    other task, the task_struct mems_allowed that we are hacking
			
 
				- *    is for our current task, which must allocate new pages for that
			
 
				- *    migrating memory region.
			
 
				+ * Migrate memory region from one set of nodes to another.  This is
			
 
				+ * performed asynchronously as it can be called from process migration path
			
 
				+ * holding locks involved in process management.  All mm migrations are
			
 
				+ * performed in the queued order and can be waited for by flushing
			
 
				+ * cpuset_migrate_mm_wq.
			
 
				  */
			
 
				 
			
 
				+struct cpuset_migrate_mm_work {
			
 
				+	struct work_struct	work;
			
 
				+	struct mm_struct	*mm;
			
 
				+	nodemask_t		from;
			
 
				+	nodemask_t		to;
			
 
				+};
			
 
				+
			
 
				+static void cpuset_migrate_mm_workfn(struct work_struct *work)
			
 
				+{
			
 
				+	struct cpuset_migrate_mm_work *mwork =
			
 
				+		container_of(work, struct cpuset_migrate_mm_work, work);
			
 
				+
			
 
				+	/* on a wq worker, no need to worry about %current's mems_allowed */
			
 
				+	do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
			
 
				+	mmput(mwork->mm);
			
 
				+	kfree(mwork);
			
 
				+}
			
 
				+
			
 
				 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
			
 
				 							const nodemask_t *to)
			
 
				 {
			
 
				-	struct task_struct *tsk = current;
			
 
				-
			
 
				-	tsk->mems_allowed = *to;
			
 
				+	struct cpuset_migrate_mm_work *mwork;
			
 
				 
			
 
				-	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
			
 
				+	mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
			
 
				+	if (mwork) {
			
 
				+		mwork->mm = mm;
			
 
				+		mwork->from = *from;
			
 
				+		mwork->to = *to;
			
 
				+		INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
			
 
				+		queue_work(cpuset_migrate_mm_wq, &mwork->work);
			
 
				+	} else {
			
 
				+		mmput(mm);
			
 
				+	}
			
 
				+}
			
 
				 
			
 
				-	rcu_read_lock();
			
 
				-	guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
			
 
				-	rcu_read_unlock();
			
 
				+void cpuset_post_attach_flush(void)
			
 
				+{
			
 
				+	flush_workqueue(cpuset_migrate_mm_wq);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -1097,7 +1119,8 @@ static void update_tasks_nodemask(struct cpuset *cs)
 
				 		mpol_rebind_mm(mm, &cs->mems_allowed);
			
 
				 		if (migrate)
			
 
				 			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
			
 
				-		mmput(mm);
			
 
				+		else
			
 
				+			mmput(mm);
			
 
				 	}
			
 
				 	css_task_iter_end(&it);
			
 
				 
			
@@ -1545,11 +1568,11 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 
				 			 * @old_mems_allowed is the right nodesets that we
			
 
				 			 * migrate mm from.
			
 
				 			 */
			
 
				-			if (is_memory_migrate(cs)) {
			
 
				+			if (is_memory_migrate(cs))
			
 
				 				cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
			
 
				 						  &cpuset_attach_nodemask_to);
			
 
				-			}
			
 
				-			mmput(mm);
			
 
				+			else
			
 
				+				mmput(mm);
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -1714,6 +1737,7 @@ out_unlock:
 
				 	mutex_unlock(&cpuset_mutex);
			
 
				 	kernfs_unbreak_active_protection(of->kn);
			
 
				 	css_put(&cs->css);
			
 
				+	flush_workqueue(cpuset_migrate_mm_wq);
			
 
				 	return retval ?: nbytes;
			
 
				 }
			
 
				 
			
@@ -2359,6 +2383,9 @@ void __init cpuset_init_smp(void)
 
				 	top_cpuset.effective_mems = node_states[N_MEMORY];
			
 
				 
			
 
				 	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
			
 
				+
			
 
				+	cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
			
 
				+	BUG_ON(!cpuset_migrate_mm_wq);
			
 
				 }
			
 
				 
			
 
				 /**