11 gadi atpakaļ · 0e1d768f1b
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -658,10 +658,12 @@ struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css)
 
				  */
			
 
				 #ifdef CONFIG_PROVE_RCU
			
 
				 extern struct mutex cgroup_mutex;
			
 
				+extern struct rw_semaphore css_set_rwsem;
			
 
				 #define task_css_set_check(task, __c)					\
			
 
				 	rcu_dereference_check((task)->cgroups,				\
			
 
				-		lockdep_is_held(&(task)->alloc_lock) ||			\
			
 
				-		lockdep_is_held(&cgroup_mutex) || (__c))
			
 
				+		lockdep_is_held(&cgroup_mutex) ||			\
			
 
				+		lockdep_is_held(&css_set_rwsem) ||			\
			
 
				+		((task)->flags & PF_EXITING) || (__c))
			
 
				 #else
			
 
				 #define task_css_set_check(task, __c)					\
			
 
				 	rcu_dereference((task)->cgroups)
			
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -80,12 +80,21 @@ static DEFINE_MUTEX(cgroup_tree_mutex);
 
				 /*
			
 
				  * cgroup_mutex is the master lock.  Any modification to cgroup or its
			
 
				  * hierarchy must be performed while holding it.
			
 
				+ *
			
 
				+ * css_set_rwsem protects task->cgroups pointer, the list of css_set
			
 
				+ * objects, and the chain of tasks off each css_set.
			
 
				+ *
			
 
				+ * These locks are exported if CONFIG_PROVE_RCU so that accessors in
			
 
				+ * cgroup.h can use them for lockdep annotations.
			
 
				  */
			
 
				 #ifdef CONFIG_PROVE_RCU
			
 
				 DEFINE_MUTEX(cgroup_mutex);
			
 
				-EXPORT_SYMBOL_GPL(cgroup_mutex);	/* only for lockdep */
			
 
				+DECLARE_RWSEM(css_set_rwsem);
			
 
				+EXPORT_SYMBOL_GPL(cgroup_mutex);
			
 
				+EXPORT_SYMBOL_GPL(css_set_rwsem);
			
 
				 #else
			
 
				 static DEFINE_MUTEX(cgroup_mutex);
			
 
				+static DECLARE_RWSEM(css_set_rwsem);
			
 
				 #endif
			
 
				 
			
 
				 /*
			
@@ -338,12 +347,6 @@ struct cgrp_cset_link {
 
				 
			
 
				 static struct css_set init_css_set;
			
 
				 static struct cgrp_cset_link init_cgrp_cset_link;
			
 
				-
			
 
				-/*
			
 
				- * css_set_rwsem protects the list of css_set objects, and the chain of
			
 
				- * tasks off each css_set.
			
 
				- */
			
 
				-static DECLARE_RWSEM(css_set_rwsem);
			
 
				 static int css_set_count;
			
 
				 
			
 
				 /*
			
@@ -803,10 +806,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * There is one global cgroup mutex. We also require taking
			
 
				- * task_lock() when dereferencing a task's cgroup subsys pointers.
			
 
				- * See "The task_lock() exception", at the end of this comment.
			
 
				- *
			
 
				  * A task must hold cgroup_mutex to modify cgroups.
			
 
				  *
			
 
				  * Any task can increment and decrement the count field without lock.
			
@@ -836,18 +835,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 
				  * always has either children cgroups and/or using tasks.  So we don't
			
 
				  * need a special hack to ensure that top_cgroup cannot be deleted.
			
 
				  *
			
 
				- *	The task_lock() exception
			
 
				- *
			
 
				- * The need for this exception arises from the action of
			
 
				- * cgroup_attach_task(), which overwrites one task's cgroup pointer with
			
 
				- * another.  It does so using cgroup_mutex, however there are
			
 
				- * several performance critical places that need to reference
			
 
				- * task->cgroup without the expense of grabbing a system global
			
 
				- * mutex.  Therefore except as noted below, when dereferencing or, as
			
 
				- * in cgroup_attach_task(), modifying a task's cgroup pointer we use
			
 
				- * task_lock(), which acts on a spinlock (task->alloc_lock) already in
			
 
				- * the task_struct routinely used for such matters.
			
 
				- *
			
 
				  * P.S.  One more locking exception.  RCU is used to guard the
			
 
				  * update of a tasks cgroup pointer by cgroup_attach_task()
			
 
				  */
			
@@ -1329,8 +1316,6 @@ static void cgroup_enable_task_cg_lists(void)
 
				 	 */
			
 
				 	read_lock(&tasklist_lock);
			
 
				 	do_each_thread(g, p) {
			
 
				-		task_lock(p);
			
 
				-
			
 
				 		WARN_ON_ONCE(!list_empty(&p->cg_list) ||
			
 
				 			     task_css_set(p) != &init_css_set);
			
 
				 
			
@@ -1349,8 +1334,6 @@ static void cgroup_enable_task_cg_lists(void)
 
				 			get_css_set(cset);
			
 
				 		}
			
 
				 		spin_unlock_irq(&p->sighand->siglock);
			
 
				-
			
 
				-		task_unlock(p);
			
 
				 	} while_each_thread(g, p);
			
 
				 	read_unlock(&tasklist_lock);
			
 
				 out_unlock:
			
@@ -1743,11 +1726,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
 
				 	old_cset = task_css_set(tsk);
			
 
				 
			
 
				 	get_css_set(new_cset);
			
 
				-
			
 
				-	task_lock(tsk);
			
 
				 	rcu_assign_pointer(tsk->cgroups, new_cset);
			
 
				-	task_unlock(tsk);
			
 
				-
			
 
				 	list_move(&tsk->cg_list, &new_cset->mg_tasks);
			
 
				 
			
 
				 	/*
			
@@ -1999,8 +1978,7 @@ out_release_tset:
 
				  * @leader: the task or the leader of the threadgroup to be attached
			
 
				  * @threadgroup: attach the whole threadgroup?
			
 
				  *
			
 
				- * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
			
 
				- * task_lock of @tsk or each thread in the threadgroup individually in turn.
			
 
				+ * Call holding cgroup_mutex and threadgroup_lock of @leader.
			
 
				  */
			
 
				 static int cgroup_attach_task(struct cgroup *dst_cgrp,
			
 
				 			      struct task_struct *leader, bool threadgroup)
			
@@ -2034,7 +2012,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 
				 /*
			
 
				  * Find the task_struct of the task to attach by vpid and pass it along to the
			
 
				  * function to attach either it or all tasks in its threadgroup. Will lock
			
 
				- * cgroup_mutex and threadgroup; may take task_lock of task.
			
 
				+ * cgroup_mutex and threadgroup.
			
 
				  */
			
 
				 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
			
 
				 {
			
@@ -4155,12 +4133,6 @@ core_initcall(cgroup_wq_init);
 
				  * proc_cgroup_show()
			
 
				  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
			
 
				  *  - Used for /proc/<pid>/cgroup.
			
 
				- *  - No need to task_lock(tsk) on this tsk->cgroup reference, as it
			
 
				- *    doesn't really matter if tsk->cgroup changes after we read it,
			
 
				- *    and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
			
 
				- *    anyway.  No need to check that tsk->cgroup != NULL, thanks to
			
 
				- *    the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
			
 
				- *    cgroup to top_cgroup.
			
 
				  */
			
 
				 
			
 
				 /* TODO: Use a proper seq_file iterator */
			
@@ -4310,15 +4282,12 @@ void cgroup_post_fork(struct task_struct *child)
 
				 		struct css_set *cset;
			
 
				 
			
 
				 		down_write(&css_set_rwsem);
			
 
				-		cset = task_css_set_check(current,
			
 
				-					  lockdep_is_held(&css_set_rwsem));
			
 
				-		task_lock(child);
			
 
				+		cset = task_css_set(current);
			
 
				 		if (list_empty(&child->cg_list)) {
			
 
				 			rcu_assign_pointer(child->cgroups, cset);
			
 
				 			list_add(&child->cg_list, &cset->tasks);
			
 
				 			get_css_set(cset);
			
 
				 		}
			
 
				-		task_unlock(child);
			
 
				 		up_write(&css_set_rwsem);
			
 
				 	}
			
 
				 
			
@@ -4347,27 +4316,13 @@ void cgroup_post_fork(struct task_struct *child)
 
				  * use notify_on_release cgroups where very high task exit scaling
			
 
				  * is required on large systems.
			
 
				  *
			
 
				- * the_top_cgroup_hack:
			
 
				- *
			
 
				- *    Set the exiting tasks cgroup to the root cgroup (top_cgroup).
			
 
				- *
			
 
				- *    We call cgroup_exit() while the task is still competent to
			
 
				- *    handle notify_on_release(), then leave the task attached to the
			
 
				- *    root cgroup in each hierarchy for the remainder of its exit.
			
 
				- *
			
 
				- *    To do this properly, we would increment the reference count on
			
 
				- *    top_cgroup, and near the very end of the kernel/exit.c do_exit()
			
 
				- *    code we would add a second cgroup function call, to drop that
			
 
				- *    reference.  This would just create an unnecessary hot spot on
			
 
				- *    the top_cgroup reference count, to no avail.
			
 
				- *
			
 
				- *    Normally, holding a reference to a cgroup without bumping its
			
 
				- *    count is unsafe.   The cgroup could go away, or someone could
			
 
				- *    attach us to a different cgroup, decrementing the count on
			
 
				- *    the first cgroup that we never incremented.  But in this case,
			
 
				- *    top_cgroup isn't going away, and either task has PF_EXITING set,
			
 
				- *    which wards off any cgroup_attach_task() attempts, or task is a failed
			
 
				- *    fork, never visible to cgroup_attach_task.
			
 
				+ * We set the exiting tasks cgroup to the root cgroup (top_cgroup).  We
			
 
				+ * call cgroup_exit() while the task is still competent to handle
			
 
				+ * notify_on_release(), then leave the task attached to the root cgroup in
			
 
				+ * each hierarchy for the remainder of its exit.  No need to bother with
			
 
				+ * init_css_set refcnting.  init_css_set never goes away and we can't race
			
 
				+ * with migration path - either PF_EXITING is visible to migration path or
			
 
				+ * @tsk never got on the tasklist.
			
 
				  */
			
 
				 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
			
 
				 {
			
@@ -4377,20 +4332,17 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 
				 	int i;
			
 
				 
			
 
				 	/*
			
 
				-	 * Unlink from the css_set task list if necessary.  Optimistically
			
 
				-	 * check cg_list before taking css_set_rwsem.
			
 
				+	 * Unlink from @tsk from its css_set.  As migration path can't race
			
 
				+	 * with us, we can check cg_list without grabbing css_set_rwsem.
			
 
				 	 */
			
 
				 	if (!list_empty(&tsk->cg_list)) {
			
 
				 		down_write(&css_set_rwsem);
			
 
				-		if (!list_empty(&tsk->cg_list)) {
			
 
				-			list_del_init(&tsk->cg_list);
			
 
				-			put_cset = true;
			
 
				-		}
			
 
				+		list_del_init(&tsk->cg_list);
			
 
				 		up_write(&css_set_rwsem);
			
 
				+		put_cset = true;
			
 
				 	}
			
 
				 
			
 
				 	/* Reassign the task to the init_css_set. */
			
 
				-	task_lock(tsk);
			
 
				 	cset = task_css_set(tsk);
			
 
				 	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
			
 
				 
			
@@ -4405,7 +4357,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 
				 			}
			
 
				 		}
			
 
				 	}
			
 
				-	task_unlock(tsk);
			
 
				 
			
 
				 	if (put_cset)
			
 
				 		put_css_set(cset, true);