8 years ago · 9ced560b82
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -149,6 +149,16 @@ during boot, before manual intervention is possible. To make testing
 
															 and experimenting easier, the kernel parameter cgroup_no_v1= allows
														
 
															 disabling controllers in v1 and make them always available in v2.
														
 
															+cgroup v2 currently supports the following mount options.
														
 
															+
														
 
															+  nsdelegate
														
 
															+
														
 
															+	Consider cgroup namespaces as delegation boundaries.  This
														
 
															+	option is system wide and can only be set on mount or modified
														
 
															+	through remount from the init namespace.  The mount option is
														
 
															+	ignored on non-init namespace mounts.  Please refer to the
														
 
															+	Delegation section for details.
														
 
															+
														
 
															 2-2. Organizing Processes
														
@@ -308,18 +318,27 @@ file.
 
															 2-5-1. Model of Delegation
														
 
															-A cgroup can be delegated to a less privileged user by granting write
														
 
															-access of the directory and its "cgroup.procs" file to the user.  Note
														
 
															-that resource control interface files in a given directory control the
														
 
															-distribution of the parent's resources and thus must not be delegated
														
 
															-along with the directory.
														
 
															-
														
 
															-Once delegated, the user can build sub-hierarchy under the directory,
														
 
															-organize processes as it sees fit and further distribute the resources
														
 
															-it received from the parent.  The limits and other settings of all
														
 
															-resource controllers are hierarchical and regardless of what happens
														
 
															-in the delegated sub-hierarchy, nothing can escape the resource
														
 
															-restrictions imposed by the parent.
														
 
															+A cgroup can be delegated in two ways.  First, to a less privileged
														
 
															+user by granting write access to the directory and its "cgroup.procs"
														
 
															+and "cgroup.subtree_control" files to the user.  Second, if the
														
 
															+"nsdelegate" mount option is set, automatically to a cgroup namespace
														
 
															+on namespace creation.
														
 
															+
														
 
															+Because the resource control interface files in a given directory
														
 
															+control the distribution of the parent's resources, the delegatee
														
 
															+shouldn't be allowed to write to them.  For the first method, this is
														
 
															+achieved by not granting access to these files.  For the second, the
														
 
															+kernel rejects writes to all files other than "cgroup.procs" and
														
 
															+"cgroup.subtree_control" on a namespace root from inside the
														
 
															+namespace.
														
 
															+
														
 
															+The end results are equivalent for both delegation types.  Once
														
 
															+delegated, the user can build a sub-hierarchy under the directory,
														
 
															+organize processes inside it as it sees fit and further distribute the
														
 
															+resources it received from the parent.  The limits and other settings
														
 
															+of all resource controllers are hierarchical and regardless of what
														
 
															+happens in the delegated sub-hierarchy, nothing can escape the
														
 
															+resource restrictions imposed by the parent.
														
 
															 Currently, cgroup doesn't impose any restrictions on the number of
														
 
															 cgroups in or nesting depth of a delegated sub-hierarchy; however,
														
@@ -329,10 +348,12 @@ this may be limited explicitly in the future.
 
															 2-5-2. Delegation Containment
														
 
															 A delegated sub-hierarchy is contained in the sense that processes
														
 
															-can't be moved into or out of the sub-hierarchy by the delegatee.  For
														
 
															-a process with a non-root euid to migrate a target process into a
														
 
															-cgroup by writing its PID to the "cgroup.procs" file, the following
														
 
															-conditions must be met.
														
 
															+can't be moved into or out of the sub-hierarchy by the delegatee.
														
 
															+
														
 
															+For delegations to a less privileged user, this is achieved by
														
 
															+requiring the following conditions for a process with a non-root euid
														
 
															+to migrate a target process into a cgroup by writing its PID to the
														
 
															+"cgroup.procs" file.
														
 
															 - The writer must have write access to the "cgroup.procs" file.
														
@@ -359,6 +380,11 @@ destination cgroup C00 is above the points of delegation and U0 would
 
															 not have write access to its "cgroup.procs" files and thus the write
														
 
															 will be denied with -EACCES.
														
 
															+For delegations to namespaces, containment is achieved by requiring
														
 
															+that both the source and destination cgroups are reachable from the
														
 
															+namespace of the process which is attempting the migration.  If either
														
 
															+is not reachable, the migration is rejected with -ENOENT.
														
 
															+
														
 
															 2-6. Guidelines
														
@@ -1413,7 +1439,7 @@ D. Deprecated v1 Core Features
 
															 - Multiple hierarchies including named ones are not supported.
														
 
															-- All mount options and remounting are not supported.
														
 
															+- None of the v1 mount options are supported.
														
 
															 - The "tasks" file is removed and "cgroup.procs" is not sorted.
														
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -67,12 +67,21 @@ enum {
 
															 enum {
														
 
															 	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
														
 
															 	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
														
 
															+
														
 
															+	/*
														
 
															+	 * Consider namespaces as delegation boundaries.  If this flag is
														
 
															+	 * set, controller specific interface files in a namespace root
														
 
															+	 * aren't writeable from inside the namespace.
														
 
															+	 */
														
 
															+	CGRP_ROOT_NS_DELEGATE	= (1 << 3),
														
 
															 };
														
 
															 /* cftype->flags */
														
 
															 enum {
														
 
															 	CFTYPE_ONLY_ON_ROOT	= (1 << 0),	/* only create on root cgrp */
														
 
															 	CFTYPE_NOT_ON_ROOT	= (1 << 1),	/* don't create on root cgrp */
														
 
															+	CFTYPE_NS_DELEGATABLE	= (1 << 2),	/* writeable beyond delegation boundaries */
														
 
															+
														
 
															 	CFTYPE_NO_PREFIX	= (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
														
 
															 	CFTYPE_WORLD_WRITABLE	= (1 << 4),	/* (DON'T USE FOR NEW FILES) S_IWUGO */
														
@@ -166,6 +175,9 @@ struct css_set {
 
															 	/* the default cgroup associated with this css_set */
														
 
															 	struct cgroup *dfl_cgrp;
														
 
															+	/* internal task count, protected by css_set_lock */
														
 
															+	int nr_tasks;
														
 
															+
														
 
															 	/*
														
 
															 	 * Lists running through all tasks using this cgroup group.
														
 
															 	 * mg_tasks lists tasks which belong to this cset but are in the
														
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -859,11 +859,14 @@ config CGROUP_BPF
 
															 	  inet sockets.
														
 
															 config CGROUP_DEBUG
														
 
															-	bool "Example controller"
														
 
															+	bool "Debug controller"
														
 
															 	default n
														
 
															+	depends on DEBUG_KERNEL
														
 
															 	help
														
 
															 	  This option enables a simple controller that exports
														
 
															-	  debugging information about the cgroups framework.
														
 
															+	  debugging information about the cgroups framework. This
														
 
															+	  controller is for control cgroup debugging only. Its
														
 
															+	  interfaces are not stable.
														
 
															 	  Say N.
														
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
 
															 obj-$(CONFIG_CGROUP_PIDS) += pids.o
														
 
															 obj-$(CONFIG_CGROUP_RDMA) += rdma.o
														
 
															 obj-$(CONFIG_CPUSETS) += cpuset.o
														
 
															+obj-$(CONFIG_CGROUP_DEBUG) += debug.o
														
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -192,6 +192,8 @@ int cgroup_rmdir(struct kernfs_node *kn);
 
															 int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
														
 
															 		     struct kernfs_root *kf_root);
														
 
															+int cgroup_task_count(const struct cgroup *cgrp);
														
 
															+
														
 
															 /*
														
 
															  * namespace.c
														
 
															  */
														
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -334,19 +334,15 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
 
															 /**
														
 
															  * cgroup_task_count - count the number of tasks in a cgroup.
														
 
															  * @cgrp: the cgroup in question
														
 
															- *
														
 
															- * Return the number of tasks in the cgroup.  The returned number can be
														
 
															- * higher than the actual number of tasks due to css_set references from
														
 
															- * namespace roots and temporary usages.
														
 
															  */
														
 
															-static int cgroup_task_count(const struct cgroup *cgrp)
														
 
															+int cgroup_task_count(const struct cgroup *cgrp)
														
 
															 {
														
 
															 	int count = 0;
														
 
															 	struct cgrp_cset_link *link;
														
 
															 	spin_lock_irq(&css_set_lock);
														
 
															 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
														
 
															-		count += refcount_read(&link->cset->refcount);
														
 
															+		count += link->cset->nr_tasks;
														
 
															 	spin_unlock_irq(&css_set_lock);
														
 
															 	return count;
														
 
															 }
														
@@ -1263,150 +1259,3 @@ static int __init cgroup_no_v1(char *str)
 
															 	return 1;
														
 
															 }
														
 
															 __setup("cgroup_no_v1=", cgroup_no_v1);
														
 
															-
														
 
															-
														
 
															-#ifdef CONFIG_CGROUP_DEBUG
														
 
															-static struct cgroup_subsys_state *
														
 
															-debug_css_alloc(struct cgroup_subsys_state *parent_css)
														
 
															-{
														
 
															-	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
														
 
															-
														
 
															-	if (!css)
														
 
															-		return ERR_PTR(-ENOMEM);
														
 
															-
														
 
															-	return css;
														
 
															-}
														
 
															-
														
 
															-static void debug_css_free(struct cgroup_subsys_state *css)
														
 
															-{
														
 
															-	kfree(css);
														
 
															-}
														
 
															-
														
 
															-static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
														
 
															-				struct cftype *cft)
														
 
															-{
														
 
															-	return cgroup_task_count(css->cgroup);
														
 
															-}
														
 
															-
														
 
															-static u64 current_css_set_read(struct cgroup_subsys_state *css,
														
 
															-				struct cftype *cft)
														
 
															-{
														
 
															-	return (u64)(unsigned long)current->cgroups;
														
 
															-}
														
 
															-
														
 
															-static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
														
 
															-					 struct cftype *cft)
														
 
															-{
														
 
															-	u64 count;
														
 
															-
														
 
															-	rcu_read_lock();
														
 
															-	count = refcount_read(&task_css_set(current)->refcount);
														
 
															-	rcu_read_unlock();
														
 
															-	return count;
														
 
															-}
														
 
															-
														
 
															-static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
														
 
															-{
														
 
															-	struct cgrp_cset_link *link;
														
 
															-	struct css_set *cset;
														
 
															-	char *name_buf;
														
 
															-
														
 
															-	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
														
 
															-	if (!name_buf)
														
 
															-		return -ENOMEM;
														
 
															-
														
 
															-	spin_lock_irq(&css_set_lock);
														
 
															-	rcu_read_lock();
														
 
															-	cset = rcu_dereference(current->cgroups);
														
 
															-	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
														
 
															-		struct cgroup *c = link->cgrp;
														
 
															-
														
 
															-		cgroup_name(c, name_buf, NAME_MAX + 1);
														
 
															-		seq_printf(seq, "Root %d group %s\n",
														
 
															-			   c->root->hierarchy_id, name_buf);
														
 
															-	}
														
 
															-	rcu_read_unlock();
														
 
															-	spin_unlock_irq(&css_set_lock);
														
 
															-	kfree(name_buf);
														
 
															-	return 0;
														
 
															-}
														
 
															-
														
 
															-#define MAX_TASKS_SHOWN_PER_CSS 25
														
 
															-static int cgroup_css_links_read(struct seq_file *seq, void *v)
														
 
															-{
														
 
															-	struct cgroup_subsys_state *css = seq_css(seq);
														
 
															-	struct cgrp_cset_link *link;
														
 
															-
														
 
															-	spin_lock_irq(&css_set_lock);
														
 
															-	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
														
 
															-		struct css_set *cset = link->cset;
														
 
															-		struct task_struct *task;
														
 
															-		int count = 0;
														
 
															-
														
 
															-		seq_printf(seq, "css_set %pK\n", cset);
														
 
															-
														
 
															-		list_for_each_entry(task, &cset->tasks, cg_list) {
														
 
															-			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
														
 
															-				goto overflow;
														
 
															-			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
														
 
															-		}
														
 
															-
														
 
															-		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
														
 
															-			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
														
 
															-				goto overflow;
														
 
															-			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
														
 
															-		}
														
 
															-		continue;
														
 
															-	overflow:
														
 
															-		seq_puts(seq, "  ...\n");
														
 
															-	}
														
 
															-	spin_unlock_irq(&css_set_lock);
														
 
															-	return 0;
														
 
															-}
														
 
															-
														
 
															-static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
														
 
															-{
														
 
															-	return (!cgroup_is_populated(css->cgroup) &&
														
 
															-		!css_has_online_children(&css->cgroup->self));
														
 
															-}
														
 
															-
														
 
															-static struct cftype debug_files[] =  {
														
 
															-	{
														
 
															-		.name = "taskcount",
														
 
															-		.read_u64 = debug_taskcount_read,
														
 
															-	},
														
 
															-
														
 
															-	{
														
 
															-		.name = "current_css_set",
														
 
															-		.read_u64 = current_css_set_read,
														
 
															-	},
														
 
															-
														
 
															-	{
														
 
															-		.name = "current_css_set_refcount",
														
 
															-		.read_u64 = current_css_set_refcount_read,
														
 
															-	},
														
 
															-
														
 
															-	{
														
 
															-		.name = "current_css_set_cg_links",
														
 
															-		.seq_show = current_css_set_cg_links_read,
														
 
															-	},
														
 
															-
														
 
															-	{
														
 
															-		.name = "cgroup_css_links",
														
 
															-		.seq_show = cgroup_css_links_read,
														
 
															-	},
														
 
															-
														
 
															-	{
														
 
															-		.name = "releasable",
														
 
															-		.read_u64 = releasable_read,
														
 
															-	},
														
 
															-
														
 
															-	{ }	/* terminate */
														
 
															-};
														
 
															-
														
 
															-struct cgroup_subsys debug_cgrp_subsys = {
														
 
															-	.css_alloc = debug_css_alloc,
														
 
															-	.css_free = debug_css_free,
														
 
															-	.legacy_cftypes = debug_files,
														
 
															-};
														
 
															-#endif /* CONFIG_CGROUP_DEBUG */
														
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -573,6 +573,11 @@ static int css_set_count	= 1;	/* 1 for init_css_set */
 
															 /**
														
 
															  * css_set_populated - does a css_set contain any tasks?
														
 
															  * @cset: target css_set
														
 
															+ *
														
 
															+ * css_set_populated() should be the same as !!cset->nr_tasks at steady
														
 
															+ * state. However, css_set_populated() can be called while a task is being
														
 
															+ * added to or removed from the linked list before the nr_tasks is
														
 
															+ * properly updated. Hence, we can't just look at ->nr_tasks here.
														
 
															  */
														
 
															 static bool css_set_populated(struct css_set *cset)
														
 
															 {
														
@@ -1542,10 +1547,56 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 
															 	return len;
														
 
															 }
														
 
															+static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
														
 
															+{
														
 
															+	char *token;
														
 
															+
														
 
															+	*root_flags = 0;
														
 
															+
														
 
															+	if (!data)
														
 
															+		return 0;
														
 
															+
														
 
															+	while ((token = strsep(&data, ",")) != NULL) {
														
 
															+		if (!strcmp(token, "nsdelegate")) {
														
 
															+			*root_flags |= CGRP_ROOT_NS_DELEGATE;
														
 
															+			continue;
														
 
															+		}
														
 
															+
														
 
															+		pr_err("cgroup2: unknown option \"%s\"\n", token);
														
 
															+		return -EINVAL;
														
 
															+	}
														
 
															+
														
 
															+	return 0;
														
 
															+}
														
 
															+
														
 
															+static void apply_cgroup_root_flags(unsigned int root_flags)
														
 
															+{
														
 
															+	if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
														
 
															+		if (root_flags & CGRP_ROOT_NS_DELEGATE)
														
 
															+			cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
														
 
															+		else
														
 
															+			cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
														
 
															+	}
														
 
															+}
														
 
															+
														
 
															+static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
														
 
															+{
														
 
															+	if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
														
 
															+		seq_puts(seq, ",nsdelegate");
														
 
															+	return 0;
														
 
															+}
														
 
															+
														
 
															 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
														
 
															 {
														
 
															-	pr_err("remount is not allowed\n");
														
 
															-	return -EINVAL;
														
 
															+	unsigned int root_flags;
														
 
															+	int ret;
														
 
															+
														
 
															+	ret = parse_cgroup_root_flags(data, &root_flags);
														
 
															+	if (ret)
														
 
															+		return ret;
														
 
															+
														
 
															+	apply_cgroup_root_flags(root_flags);
														
 
															+	return 0;
														
 
															 }
														
 
															 /*
														
@@ -1598,6 +1649,7 @@ static void cgroup_enable_task_cg_lists(void)
 
															 				css_set_update_populated(cset, true);
														
 
															 			list_add_tail(&p->cg_list, &cset->tasks);
														
 
															 			get_css_set(cset);
														
 
															+			cset->nr_tasks++;
														
 
															 		}
														
 
															 		spin_unlock(&p->sighand->siglock);
														
 
															 	} while_each_thread(g, p);
														
@@ -1784,6 +1836,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 
															 {
														
 
															 	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
														
 
															 	struct dentry *dentry;
														
 
															+	int ret;
														
 
															 	get_cgroup_ns(ns);
														
@@ -1801,16 +1854,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 
															 		cgroup_enable_task_cg_lists();
														
 
															 	if (fs_type == &cgroup2_fs_type) {
														
 
															-		if (data) {
														
 
															-			pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
														
 
															+		unsigned int root_flags;
														
 
															+
														
 
															+		ret = parse_cgroup_root_flags(data, &root_flags);
														
 
															+		if (ret) {
														
 
															 			put_cgroup_ns(ns);
														
 
															-			return ERR_PTR(-EINVAL);
														
 
															+			return ERR_PTR(ret);
														
 
															 		}
														
 
															+
														
 
															 		cgrp_dfl_visible = true;
														
 
															 		cgroup_get_live(&cgrp_dfl_root.cgrp);
														
 
															 		dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
														
 
															 					 CGROUP2_SUPER_MAGIC, ns);
														
 
															+		if (!IS_ERR(dentry))
														
 
															+			apply_cgroup_root_flags(root_flags);
														
 
															 	} else {
														
 
															 		dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
														
 
															 				       CGROUP_SUPER_MAGIC, ns);
														
@@ -2064,8 +2122,10 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 
															 			struct css_set *to_cset = cset->mg_dst_cset;
														
 
															 			get_css_set(to_cset);
														
 
															+			to_cset->nr_tasks++;
														
 
															 			css_set_move_task(task, from_cset, to_cset, true);
														
 
															 			put_css_set_locked(from_cset);
														
 
															+			from_cset->nr_tasks--;
														
 
															 		}
														
 
															 	}
														
 
															 	spin_unlock_irq(&css_set_lock);
														
@@ -2355,27 +2415,14 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 
															 					 struct cgroup *dst_cgrp,
														
 
															 					 struct kernfs_open_file *of)
														
 
															 {
														
 
															-	int ret = 0;
														
 
															-
														
 
															-	if (cgroup_on_dfl(dst_cgrp)) {
														
 
															-		struct super_block *sb = of->file->f_path.dentry->d_sb;
														
 
															-		struct cgroup *cgrp;
														
 
															-		struct inode *inode;
														
 
															-
														
 
															-		spin_lock_irq(&css_set_lock);
														
 
															-		cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
														
 
															-		spin_unlock_irq(&css_set_lock);
														
 
															-
														
 
															-		while (!cgroup_is_descendant(dst_cgrp, cgrp))
														
 
															-			cgrp = cgroup_parent(cgrp);
														
 
															+	struct super_block *sb = of->file->f_path.dentry->d_sb;
														
 
															+	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
														
 
															+	struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
														
 
															+	struct cgroup *src_cgrp, *com_cgrp;
														
 
															+	struct inode *inode;
														
 
															+	int ret;
														
 
															-		ret = -ENOMEM;
														
 
															-		inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
														
 
															-		if (inode) {
														
 
															-			ret = inode_permission(inode, MAY_WRITE);
														
 
															-			iput(inode);
														
 
															-		}
														
 
															-	} else {
														
 
															+	if (!cgroup_on_dfl(dst_cgrp)) {
														
 
															 		const struct cred *cred = current_cred();
														
 
															 		const struct cred *tcred = get_task_cred(task);
														
@@ -2383,14 +2430,47 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 
															 		 * even if we're attaching all tasks in the thread group,
														
 
															 		 * we only need to check permissions on one of them.
														
 
															 		 */
														
 
															-		if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
														
 
															-		    !uid_eq(cred->euid, tcred->uid) &&
														
 
															-		    !uid_eq(cred->euid, tcred->suid))
														
 
															+		if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
														
 
															+		    uid_eq(cred->euid, tcred->uid) ||
														
 
															+		    uid_eq(cred->euid, tcred->suid))
														
 
															+			ret = 0;
														
 
															+		else
														
 
															 			ret = -EACCES;
														
 
															+
														
 
															 		put_cred(tcred);
														
 
															+		return ret;
														
 
															 	}
														
 
															-	return ret;
														
 
															+	/* find the source cgroup */
														
 
															+	spin_lock_irq(&css_set_lock);
														
 
															+	src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
														
 
															+	spin_unlock_irq(&css_set_lock);
														
 
															+
														
 
															+	/* and the common ancestor */
														
 
															+	com_cgrp = src_cgrp;
														
 
															+	while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
														
 
															+		com_cgrp = cgroup_parent(com_cgrp);
														
 
															+
														
 
															+	/* %current should be authorized to migrate to the common ancestor */
														
 
															+	inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
														
 
															+	if (!inode)
														
 
															+		return -ENOMEM;
														
 
															+
														
 
															+	ret = inode_permission(inode, MAY_WRITE);
														
 
															+	iput(inode);
														
 
															+	if (ret)
														
 
															+		return ret;
														
 
															+
														
 
															+	/*
														
 
															+	 * If namespaces are delegation boundaries, %current must be able
														
 
															+	 * to see both source and destination cgroups from its namespace.
														
 
															+	 */
														
 
															+	if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
														
 
															+	    (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
														
 
															+	     !cgroup_is_descendant(dst_cgrp, root_cgrp)))
														
 
															+		return -ENOENT;
														
 
															+
														
 
															+	return 0;
														
 
															 }
														
 
															 /*
														
@@ -2954,11 +3034,23 @@ static void cgroup_file_release(struct kernfs_open_file *of)
 
															 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
														
 
															 				 size_t nbytes, loff_t off)
														
 
															 {
														
 
															+	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
														
 
															 	struct cgroup *cgrp = of->kn->parent->priv;
														
 
															 	struct cftype *cft = of->kn->priv;
														
 
															 	struct cgroup_subsys_state *css;
														
 
															 	int ret;
														
 
															+	/*
														
 
															+	 * If namespaces are delegation boundaries, disallow writes to
														
 
															+	 * files in a non-init namespace root from inside the namespace
														
 
															+	 * except for the files explicitly marked delegatable -
														
 
															+	 * cgroup.procs and cgroup.subtree_control.
														
 
															+	 */
														
 
															+	if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
														
 
															+	    !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
														
 
															+	    ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
														
 
															+		return -EPERM;
														
 
															+
														
 
															 	if (cft->write)
														
 
															 		return cft->write(of, buf, nbytes, off);
														
@@ -3792,6 +3884,7 @@ static int cgroup_procs_show(struct seq_file *s, void *v)
 
															 static struct cftype cgroup_base_files[] = {
														
 
															 	{
														
 
															 		.name = "cgroup.procs",
														
 
															+		.flags = CFTYPE_NS_DELEGATABLE,
														
 
															 		.file_offset = offsetof(struct cgroup, procs_file),
														
 
															 		.release = cgroup_procs_release,
														
 
															 		.seq_start = cgroup_procs_start,
														
@@ -3805,6 +3898,7 @@ static struct cftype cgroup_base_files[] = {
 
															 	},
														
 
															 	{
														
 
															 		.name = "cgroup.subtree_control",
														
 
															+		.flags = CFTYPE_NS_DELEGATABLE,
														
 
															 		.seq_show = cgroup_subtree_control_show,
														
 
															 		.write = cgroup_subtree_control_write,
														
 
															 	},
														
@@ -4393,6 +4487,7 @@ int cgroup_rmdir(struct kernfs_node *kn)
 
															 }
														
 
															 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
														
 
															+	.show_options		= cgroup_show_options,
														
 
															 	.remount_fs		= cgroup_remount,
														
 
															 	.mkdir			= cgroup_mkdir,
														
 
															 	.rmdir			= cgroup_rmdir,
														
@@ -4789,6 +4884,7 @@ void cgroup_post_fork(struct task_struct *child)
 
															 		cset = task_css_set(current);
														
 
															 		if (list_empty(&child->cg_list)) {
														
 
															 			get_css_set(cset);
														
 
															+			cset->nr_tasks++;
														
 
															 			css_set_move_task(child, NULL, cset, false);
														
 
															 		}
														
 
															 		spin_unlock_irq(&css_set_lock);
														
@@ -4838,6 +4934,7 @@ void cgroup_exit(struct task_struct *tsk)
 
															 	if (!list_empty(&tsk->cg_list)) {
														
 
															 		spin_lock_irq(&css_set_lock);
														
 
															 		css_set_move_task(tsk, cset, NULL, false);
														
 
															+		cset->nr_tasks--;
														
 
															 		spin_unlock_irq(&css_set_lock);
														
 
															 	} else {
														
 
															 		get_css_set(cset);
														
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -0,0 +1,357 @@
 
															+/*
														
 
															+ * Debug controller
														
 
															+ *
														
 
															+ * WARNING: This controller is for cgroup core debugging only.
														
 
															+ * Its interfaces are unstable and subject to changes at any time.
														
 
															+ */
														
 
															+#include <linux/ctype.h>
														
 
															+#include <linux/mm.h>
														
 
															+#include <linux/slab.h>
														
 
															+
														
 
															+#include "cgroup-internal.h"
														
 
															+
														
 
															+static struct cgroup_subsys_state *
														
 
															+debug_css_alloc(struct cgroup_subsys_state *parent_css)
														
 
															+{
														
 
															+	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
														
 
															+
														
 
															+	if (!css)
														
 
															+		return ERR_PTR(-ENOMEM);
														
 
															+
														
 
															+	return css;
														
 
															+}
														
 
															+
														
 
															+static void debug_css_free(struct cgroup_subsys_state *css)
														
 
															+{
														
 
															+	kfree(css);
														
 
															+}
														
 
															+
														
 
															+/*
														
 
															+ * debug_taskcount_read - return the number of tasks in a cgroup.
														
 
															+ * @css: the css whose cgroup is in question
														
 
															+ */
														
 
															+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
														
 
															+				struct cftype *cft)
														
 
															+{
														
 
															+	return cgroup_task_count(css->cgroup);
														
 
															+}
														
 
															+
														
 
															+static int current_css_set_read(struct seq_file *seq, void *v)
														
 
															+{
														
 
															+	struct kernfs_open_file *of = seq->private;
														
 
															+	struct css_set *cset;
														
 
															+	struct cgroup_subsys *ss;
														
 
															+	struct cgroup_subsys_state *css;
														
 
															+	int i, refcnt;
														
 
															+
														
 
															+	if (!cgroup_kn_lock_live(of->kn, false))
														
 
															+		return -ENODEV;
														
 
															+
														
 
															+	spin_lock_irq(&css_set_lock);
														
 
															+	rcu_read_lock();
														
 
															+	cset = rcu_dereference(current->cgroups);
														
 
															+	refcnt = refcount_read(&cset->refcount);
														
 
															+	seq_printf(seq, "css_set %pK %d", cset, refcnt);
														
 
															+	if (refcnt > cset->nr_tasks)
														
 
															+		seq_printf(seq, " +%d", refcnt - cset->nr_tasks);
														
 
															+	seq_puts(seq, "\n");
														
 
															+
														
 
															+	/*
														
 
															+	 * Print the css'es stored in the current css_set.
														
 
															+	 */
														
 
															+	for_each_subsys(ss, i) {
														
 
															+		css = cset->subsys[ss->id];
														
 
															+		if (!css)
														
 
															+			continue;
														
 
															+		seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name,
														
 
															+			  (unsigned long)css, css->id);
														
 
															+	}
														
 
															+	rcu_read_unlock();
														
 
															+	spin_unlock_irq(&css_set_lock);
														
 
															+	cgroup_kn_unlock(of->kn);
														
 
															+	return 0;
														
 
															+}
														
 
															+
														
 
															+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
														
 
															+					 struct cftype *cft)
														
 
															+{
														
 
															+	u64 count;
														
 
															+
														
 
															+	rcu_read_lock();
														
 
															+	count = refcount_read(&task_css_set(current)->refcount);
														
 
															+	rcu_read_unlock();
														
 
															+	return count;
														
 
															+}
														
 
															+
														
 
															+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
														
 
															+{
														
 
															+	struct cgrp_cset_link *link;
														
 
															+	struct css_set *cset;
														
 
															+	char *name_buf;
														
 
															+
														
 
															+	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
														
 
															+	if (!name_buf)
														
 
															+		return -ENOMEM;
														
 
															+
														
 
															+	spin_lock_irq(&css_set_lock);
														
 
															+	rcu_read_lock();
														
 
															+	cset = rcu_dereference(current->cgroups);
														
 
															+	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
														
 
															+		struct cgroup *c = link->cgrp;
														
 
															+
														
 
															+		cgroup_name(c, name_buf, NAME_MAX + 1);
														
 
															+		seq_printf(seq, "Root %d group %s\n",
														
 
															+			   c->root->hierarchy_id, name_buf);
														
 
															+	}
														
 
															+	rcu_read_unlock();
														
 
															+	spin_unlock_irq(&css_set_lock);
														
 
															+	kfree(name_buf);
														
 
															+	return 0;
														
 
															+}
														
 
															+
														
 
															+#define MAX_TASKS_SHOWN_PER_CSS 25
														
 
															+static int cgroup_css_links_read(struct seq_file *seq, void *v)
														
 
															+{
														
 
															+	struct cgroup_subsys_state *css = seq_css(seq);
														
 
															+	struct cgrp_cset_link *link;
														
 
															+	int dead_cnt = 0, extra_refs = 0;
														
 
															+
														
 
															+	spin_lock_irq(&css_set_lock);
														
 
															+	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
														
 
															+		struct css_set *cset = link->cset;
														
 
															+		struct task_struct *task;
														
 
															+		int count = 0;
														
 
															+		int refcnt = refcount_read(&cset->refcount);
														
 
															+
														
 
															+		seq_printf(seq, " %d", refcnt);
														
 
															+		if (refcnt - cset->nr_tasks > 0) {
														
 
															+			int extra = refcnt - cset->nr_tasks;
														
 
															+
														
 
															+			seq_printf(seq, " +%d", extra);
														
 
															+			/*
														
 
															+			 * Take out the one additional reference in
														
 
															+			 * init_css_set.
														
 
															+			 */
														
 
															+			if (cset == &init_css_set)
														
 
															+				extra--;
														
 
															+			extra_refs += extra;
														
 
															+		}
														
 
															+		seq_puts(seq, "\n");
														
 
															+
														
 
															+		list_for_each_entry(task, &cset->tasks, cg_list) {
														
 
															+			if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
														
 
															+				seq_printf(seq, "  task %d\n",
														
 
															+					   task_pid_vnr(task));
														
 
															+		}
														
 
															+
														
 
															+		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
														
 
															+			if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
														
 
															+				seq_printf(seq, "  task %d\n",
														
 
															+					   task_pid_vnr(task));
														
 
															+		}
														
 
															+		/* show # of overflowed tasks */
														
 
															+		if (count > MAX_TASKS_SHOWN_PER_CSS)
														
 
															+			seq_printf(seq, "  ... (%d)\n",
														
 
															+				   count - MAX_TASKS_SHOWN_PER_CSS);
														
 
															+
														
 
															+		if (cset->dead) {
														
 
															+			seq_puts(seq, "    [dead]\n");
														
 
															+			dead_cnt++;
														
 
															+		}
														
 
															+
														
 
															+		WARN_ON(count != cset->nr_tasks);
														
 
															+	}
														
 
															+	spin_unlock_irq(&css_set_lock);
														
 
															+
														
 
															+	if (!dead_cnt && !extra_refs)
														
 
															+		return 0;
														
 
															+
														
 
															+	seq_puts(seq, "\n");
														
 
															+	if (extra_refs)
														
 
															+		seq_printf(seq, "extra references = %d\n", extra_refs);
														
 
															+	if (dead_cnt)
														
 
															+		seq_printf(seq, "dead css_sets = %d\n", dead_cnt);
														
 
															+
														
 
															+	return 0;
														
 
															+}
														
 
															+
														
 
															+static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
														
 
															+{
														
 
															+	struct kernfs_open_file *of = seq->private;
														
 
															+	struct cgroup *cgrp;
														
 
															+	struct cgroup_subsys *ss;
														
 
															+	struct cgroup_subsys_state *css;
														
 
															+	char pbuf[16];
														
 
															+	int i;
														
 
															+
														
 
															+	cgrp = cgroup_kn_lock_live(of->kn, false);
														
 
															+	if (!cgrp)
														
 
															+		return -ENODEV;
														
 
															+
														
 
															+	for_each_subsys(ss, i) {
														
 
															+		css = rcu_dereference_check(cgrp->subsys[ss->id], true);
														
 
															+		if (!css)
														
 
															+			continue;
														
 
															+
														
 
															+		pbuf[0] = '\0';
														
 
															+
														
 
															+		/* Show the parent CSS if applicable */
														
 
															+		if (css->parent)
														
 
															+			snprintf(pbuf, sizeof(pbuf) - 1, " P=%d",
														
 
															+				 css->parent->id);
														
 
															+		seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name,
														
 
															+			  (unsigned long)css, css->id,
														
 
															+			  atomic_read(&css->online_cnt), pbuf);
														
 
															+	}
														
 
															+
														
 
															+	cgroup_kn_unlock(of->kn);
														
 
															+	return 0;
														
 
															+}
														
 
															+
														
 
															+static void cgroup_masks_read_one(struct seq_file *seq, const char *name,
														
 
															+				  u16 mask)
														
 
															+{
														
 
															+	struct cgroup_subsys *ss;
														
 
															+	int ssid;
														
 
															+	bool first = true;
														
 
															+
														
 
															+	seq_printf(seq, "%-17s: ", name);
														
 
															+	for_each_subsys(ss, ssid) {
														
 
															+		if (!(mask & (1 << ssid)))
														
 
															+			continue;
														
 
															+		if (!first)
														
 
															+			seq_puts(seq, ", ");
														
 
															+		seq_puts(seq, ss->name);
														
 
															+		first = false;
														
 
															+	}
														
 
															+	seq_putc(seq, '\n');
														
 
															+}
														
 
															+
														
 
															+static int cgroup_masks_read(struct seq_file *seq, void *v)
														
 
															+{
														
 
															+	struct kernfs_open_file *of = seq->private;
														
 
															+	struct cgroup *cgrp;
														
 
															+
														
 
															+	cgrp = cgroup_kn_lock_live(of->kn, false);
														
 
															+	if (!cgrp)
														
 
															+		return -ENODEV;
														
 
															+
														
 
															+	cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control);
														
 
															+	cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask);
														
 
															+
														
 
															+	cgroup_kn_unlock(of->kn);
														
 
															+	return 0;
														
 
															+}
														
 
															+
														
 
															+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
														
 
															+{
														
 
															+	return (!cgroup_is_populated(css->cgroup) &&
														
 
															+		!css_has_online_children(&css->cgroup->self));
														
 
															+}
														
 
															+
														
 
															+static struct cftype debug_legacy_files[] =  {
														
 
															+	{
														
 
															+		.name = "taskcount",
														
 
															+		.read_u64 = debug_taskcount_read,
														
 
															+	},
														
 
															+
														
 
															+	{
														
 
															+		.name = "current_css_set",
														
 
															+		.seq_show = current_css_set_read,
														
 
															+		.flags = CFTYPE_ONLY_ON_ROOT,
														
 
															+	},
														
 
															+
														
 
															+	{
														
 
															+		.name = "current_css_set_refcount",
														
 
															+		.read_u64 = current_css_set_refcount_read,
														
 
															+		.flags = CFTYPE_ONLY_ON_ROOT,
														
 
															+	},
														
 
															+
														
 
															+	{
														
 
															+		.name = "current_css_set_cg_links",
														
 
															+		.seq_show = current_css_set_cg_links_read,
														
 
															+		.flags = CFTYPE_ONLY_ON_ROOT,
														
 
															+	},
														
 
															+
														
 
															+	{
														
 
															+		.name = "cgroup_css_links",
														
 
															+		.seq_show = cgroup_css_links_read,
														
 
															+	},
														
 
															+
														
 
															+	{
														
 
															+		.name = "cgroup_subsys_states",
														
 
															+		.seq_show = cgroup_subsys_states_read,
														
 
															+	},
														
 
															+
														
 
															+	{
														
 
															+		.name = "cgroup_masks",
														
 
															+		.seq_show = cgroup_masks_read,
														
 
															+	},
														
 
															+
														
 
															+	{
														
 
															+		.name = "releasable",
														
 
															+		.read_u64 = releasable_read,
														
 
															+	},
														
 
															+
														
 
															+	{ }	/* terminate */
														
 
															+};
														
 
															+
														
 
															+static struct cftype debug_files[] =  {
														
 
															+	{
														
 
															+		.name = "taskcount",
														
 
															+		.read_u64 = debug_taskcount_read,
														
 
															+	},
														
 
															+
														
 
															+	{
														
 
															+		.name = "current_css_set",
														
 
															+		.seq_show = current_css_set_read,
														
 
															+		.flags = CFTYPE_ONLY_ON_ROOT,
														
 
															+	},
														
 
															+
														
 
															+	{
														
 
															+		.name = "current_css_set_refcount",
														
 
															+		.read_u64 = current_css_set_refcount_read,
														
 
															+		.flags = CFTYPE_ONLY_ON_ROOT,
														
 
															+	},
														
 
															+
														
 
															+	{
														
 
															+		.name = "current_css_set_cg_links",
														
 
															+		.seq_show = current_css_set_cg_links_read,
														
 
															+		.flags = CFTYPE_ONLY_ON_ROOT,
														
 
															+	},
														
 
															+
														
 
															+	{
														
 
															+		.name = "css_links",
														
 
															+		.seq_show = cgroup_css_links_read,
														
 
															+	},
														
 
															+
														
 
															+	{
														
 
															+		.name = "csses",
														
 
															+		.seq_show = cgroup_subsys_states_read,
														
 
															+	},
														
 
															+
														
 
															+	{
														
 
															+		.name = "masks",
														
 
															+		.seq_show = cgroup_masks_read,
														
 
															+	},
														
 
															+
														
 
															+	{ }	/* terminate */
														
 
															+};
														
 
															+
														
 
															+struct cgroup_subsys debug_cgrp_subsys = {
														
 
															+	.css_alloc	= debug_css_alloc,
														
 
															+	.css_free	= debug_css_free,
														
 
															+	.legacy_cftypes	= debug_legacy_files,
														
 
															+};
														
 
															+
														
 
															+/*
														
 
															+ * On v2, debug is an implicit controller enabled by the "cgroup_debug" boot
														
 
															+ * parameter.
														
 
															+ */
														
 
															+static int __init enable_cgroup_debug(char *str)
														
 
															+{
														
 
															+	debug_cgrp_subsys.dfl_cftypes = debug_files;
														
 
															+	debug_cgrp_subsys.implicit_on_dfl = true;
														
 
															+	return 1;
														
 
															+}
														
 
															+__setup("cgroup_debug", enable_cgroup_debug);