11 years ago · 9d755d33f0
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -160,8 +160,6 @@ struct cgroup {
 
				 	 */
			
 
				 	int populated_cnt;
			
 
				 
			
 
				-	atomic_t refcnt;
			
 
				-
			
 
				 	/*
			
 
				 	 * We link our 'sibling' struct into our parent's 'children'.
			
 
				 	 * Our children link their 'sibling' into our 'children'.
			
@@ -218,10 +216,6 @@ struct cgroup {
 
				 	struct list_head pidlists;
			
 
				 	struct mutex pidlist_mutex;
			
 
				 
			
 
				-	/* For css percpu_ref killing and RCU-protected deletion */
			
 
				-	struct rcu_head rcu_head;
			
 
				-	struct work_struct destroy_work;
			
 
				-
			
 
				 	/* used to wait for offlining of csses */
			
 
				 	wait_queue_head_t offline_waitq;
			
 
				 };
			
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -176,10 +176,12 @@ static int need_forkexit_callback __read_mostly;
 
				 static struct cftype cgroup_base_files[];
			
 
				 
			
 
				 static void cgroup_put(struct cgroup *cgrp);
			
 
				+static bool cgroup_has_live_children(struct cgroup *cgrp);
			
 
				 static int rebind_subsystems(struct cgroup_root *dst_root,
			
 
				 			     unsigned int ss_mask);
			
 
				 static int cgroup_destroy_locked(struct cgroup *cgrp);
			
 
				 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
			
 
				+static void css_release(struct percpu_ref *ref);
			
 
				 static void kill_css(struct cgroup_subsys_state *css);
			
 
				 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
			
 
				 			      bool is_add);
			
@@ -1008,62 +1010,15 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
 
				 	return mode;
			
 
				 }
			
 
				 
			
 
				-static void cgroup_free_fn(struct work_struct *work)
			
 
				-{
			
 
				-	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
			
 
				-
			
 
				-	atomic_dec(&cgrp->root->nr_cgrps);
			
 
				-	cgroup_pidlist_destroy_all(cgrp);
			
 
				-
			
 
				-	if (cgrp->parent) {
			
 
				-		/*
			
 
				-		 * We get a ref to the parent, and put the ref when this
			
 
				-		 * cgroup is being freed, so it's guaranteed that the
			
 
				-		 * parent won't be destroyed before its children.
			
 
				-		 */
			
 
				-		cgroup_put(cgrp->parent);
			
 
				-		kernfs_put(cgrp->kn);
			
 
				-		kfree(cgrp);
			
 
				-	} else {
			
 
				-		/*
			
 
				-		 * This is root cgroup's refcnt reaching zero, which
			
 
				-		 * indicates that the root should be released.
			
 
				-		 */
			
 
				-		cgroup_destroy_root(cgrp->root);
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-static void cgroup_free_rcu(struct rcu_head *head)
			
 
				-{
			
 
				-	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
			
 
				-
			
 
				-	INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
			
 
				-	queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
			
 
				-}
			
 
				-
			
 
				 static void cgroup_get(struct cgroup *cgrp)
			
 
				 {
			
 
				 	WARN_ON_ONCE(cgroup_is_dead(cgrp));
			
 
				-	WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
			
 
				-	atomic_inc(&cgrp->refcnt);
			
 
				+	css_get(&cgrp->self);
			
 
				 }
			
 
				 
			
 
				 static void cgroup_put(struct cgroup *cgrp)
			
 
				 {
			
 
				-	if (!atomic_dec_and_test(&cgrp->refcnt))
			
 
				-		return;
			
 
				-	if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
			
 
				-		return;
			
 
				-
			
 
				-	/* delete this cgroup from parent->children */
			
 
				-	mutex_lock(&cgroup_mutex);
			
 
				-	list_del_rcu(&cgrp->sibling);
			
 
				-	mutex_unlock(&cgroup_mutex);
			
 
				-
			
 
				-	cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
			
 
				-	cgrp->id = -1;
			
 
				-
			
 
				-	call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
			
 
				+	css_put(&cgrp->self);
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -1548,7 +1503,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 
				 	struct cgroup_subsys *ss;
			
 
				 	int ssid;
			
 
				 
			
 
				-	atomic_set(&cgrp->refcnt, 1);
			
 
				 	INIT_LIST_HEAD(&cgrp->sibling);
			
 
				 	INIT_LIST_HEAD(&cgrp->children);
			
 
				 	INIT_LIST_HEAD(&cgrp->cset_links);
			
@@ -1597,6 +1551,10 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
 
				 		goto out;
			
 
				 	root_cgrp->id = ret;
			
 
				 
			
 
				+	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
			
 
				+	if (ret)
			
 
				+		goto out;
			
 
				+
			
 
				 	/*
			
 
				 	 * We're accessing css_set_count without locking css_set_rwsem here,
			
 
				 	 * but that's OK - it can only be increased by someone holding
			
@@ -1605,11 +1563,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
 
				 	 */
			
 
				 	ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
			
 
				 	if (ret)
			
 
				-		goto out;
			
 
				+		goto cancel_ref;
			
 
				 
			
 
				 	ret = cgroup_init_root_id(root);
			
 
				 	if (ret)
			
 
				-		goto out;
			
 
				+		goto cancel_ref;
			
 
				 
			
 
				 	root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
			
 
				 					   KERNFS_ROOT_CREATE_DEACTIVATED,
			
@@ -1657,6 +1615,8 @@ destroy_root:
 
				 	root->kf_root = NULL;
			
 
				 exit_root_id:
			
 
				 	cgroup_exit_root_id(root);
			
 
				+cancel_ref:
			
 
				+	percpu_ref_cancel_init(&root_cgrp->self.refcnt);
			
 
				 out:
			
 
				 	free_cgrp_cset_links(&tmp_links);
			
 
				 	return ret;
			
@@ -1735,13 +1695,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 
				 		}
			
 
				 
			
 
				 		/*
			
 
				-		 * A root's lifetime is governed by its root cgroup.  Zero
			
 
				-		 * ref indicate that the root is being destroyed.  Wait for
			
 
				-		 * destruction to complete so that the subsystems are free.
			
 
				-		 * We can use wait_queue for the wait but this path is
			
 
				-		 * super cold.  Let's just sleep for a bit and retry.
			
 
				+		 * A root's lifetime is governed by its root cgroup.
			
 
				+		 * tryget_live failure indicates that the root is being
			
 
				+		 * destroyed.  Wait for destruction to complete so that the
			
 
				+		 * subsystems are free.  We can use wait_queue for the wait
			
 
				+		 * but this path is super cold.  Let's just sleep for a bit
			
 
				+		 * and retry.
			
 
				 		 */
			
 
				-		if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
			
 
				+		if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
			
 
				 			mutex_unlock(&cgroup_mutex);
			
 
				 			msleep(10);
			
 
				 			ret = restart_syscall();
			
@@ -1794,7 +1755,16 @@ static void cgroup_kill_sb(struct super_block *sb)
 
				 	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
			
 
				 	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
			
 
				 
			
 
				-	cgroup_put(&root->cgrp);
			
 
				+	/*
			
 
				+	 * If @root doesn't have any mounts or children, start killing it.
			
 
				+	 * This prevents new mounts by disabling percpu_ref_tryget_live().
			
 
				+	 * cgroup_mount() may wait for @root's release.
			
 
				+	 */
			
 
				+	if (cgroup_has_live_children(&root->cgrp))
			
 
				+		cgroup_put(&root->cgrp);
			
 
				+	else
			
 
				+		percpu_ref_kill(&root->cgrp.self.refcnt);
			
 
				+
			
 
				 	kernfs_kill_sb(sb);
			
 
				 }
			
 
				 
			
@@ -4110,11 +4080,37 @@ static void css_free_work_fn(struct work_struct *work)
 
				 		container_of(work, struct cgroup_subsys_state, destroy_work);
			
 
				 	struct cgroup *cgrp = css->cgroup;
			
 
				 
			
 
				-	if (css->parent)
			
 
				-		css_put(css->parent);
			
 
				+	if (css->ss) {
			
 
				+		/* css free path */
			
 
				+		if (css->parent)
			
 
				+			css_put(css->parent);
			
 
				 
			
 
				-	css->ss->css_free(css);
			
 
				-	cgroup_put(cgrp);
			
 
				+		css->ss->css_free(css);
			
 
				+		cgroup_put(cgrp);
			
 
				+	} else {
			
 
				+		/* cgroup free path */
			
 
				+		atomic_dec(&cgrp->root->nr_cgrps);
			
 
				+		cgroup_pidlist_destroy_all(cgrp);
			
 
				+
			
 
				+		if (cgrp->parent) {
			
 
				+			/*
			
 
				+			 * We get a ref to the parent, and put the ref when
			
 
				+			 * this cgroup is being freed, so it's guaranteed
			
 
				+			 * that the parent won't be destroyed before its
			
 
				+			 * children.
			
 
				+			 */
			
 
				+			cgroup_put(cgrp->parent);
			
 
				+			kernfs_put(cgrp->kn);
			
 
				+			kfree(cgrp);
			
 
				+		} else {
			
 
				+			/*
			
 
				+			 * This is root cgroup's refcnt reaching zero,
			
 
				+			 * which indicates that the root should be
			
 
				+			 * released.
			
 
				+			 */
			
 
				+			cgroup_destroy_root(cgrp->root);
			
 
				+		}
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 static void css_free_rcu_fn(struct rcu_head *rcu_head)
			
@@ -4131,8 +4127,20 @@ static void css_release_work_fn(struct work_struct *work)
 
				 	struct cgroup_subsys_state *css =
			
 
				 		container_of(work, struct cgroup_subsys_state, destroy_work);
			
 
				 	struct cgroup_subsys *ss = css->ss;
			
 
				+	struct cgroup *cgrp = css->cgroup;
			
 
				 
			
 
				-	cgroup_idr_remove(&ss->css_idr, css->id);
			
 
				+	if (ss) {
			
 
				+		/* css release path */
			
 
				+		cgroup_idr_remove(&ss->css_idr, css->id);
			
 
				+	} else {
			
 
				+		/* cgroup release path */
			
 
				+		mutex_lock(&cgroup_mutex);
			
 
				+		list_del_rcu(&cgrp->sibling);
			
 
				+		mutex_unlock(&cgroup_mutex);
			
 
				+
			
 
				+		cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
			
 
				+		cgrp->id = -1;
			
 
				+	}
			
 
				 
			
 
				 	call_rcu(&css->rcu_head, css_free_rcu_fn);
			
 
				 }
			
@@ -4285,6 +4293,10 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 
				 		goto out_unlock;
			
 
				 	}
			
 
				 
			
 
				+	ret = percpu_ref_init(&cgrp->self.refcnt, css_release);
			
 
				+	if (ret)
			
 
				+		goto out_free_cgrp;
			
 
				+
			
 
				 	/*
			
 
				 	 * Temporarily set the pointer to NULL, so idr_find() won't return
			
 
				 	 * a half-baked cgroup.
			
@@ -4292,7 +4304,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 
				 	cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
			
 
				 	if (cgrp->id < 0) {
			
 
				 		ret = -ENOMEM;
			
 
				-		goto out_free_cgrp;
			
 
				+		goto out_cancel_ref;
			
 
				 	}
			
 
				 
			
 
				 	init_cgroup_housekeeping(cgrp);
			
@@ -4365,6 +4377,8 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 
				 
			
 
				 out_free_id:
			
 
				 	cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
			
 
				+out_cancel_ref:
			
 
				+	percpu_ref_cancel_init(&cgrp->self.refcnt);
			
 
				 out_free_cgrp:
			
 
				 	kfree(cgrp);
			
 
				 out_unlock:
			
@@ -4521,7 +4535,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 
				 	check_for_release(cgrp->parent);
			
 
				 
			
 
				 	/* put the base reference */
			
 
				-	cgroup_put(cgrp);
			
 
				+	percpu_ref_kill(&cgrp->self.refcnt);
			
 
				 
			
 
				 	return 0;
			
 
				 };