Browse Source

Merge branch 'for-4.13' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup changes from Tejun Heo:

 - Waiman made the debug controller work and a lot more useful on
   cgroup2

 - There were a couple issues with cgroup subtree delegation. The
   documentation on delegating to a non-root user was missing some part
   and cgroup namespace support wasn't factoring in delegation at all.
   The documentation is updated and the now there is a mount option to
   make cgroup namespace fit for delegation

* 'for-4.13' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: implement "nsdelegate" mount option
  cgroup: restructure cgroup_procs_write_permission()
  cgroup: "cgroup.subtree_control" should be writeable by delegatee
  cgroup: fix lockdep warning in debug controller
  cgroup: refactor cgroup_masks_read() in the debug controller
  cgroup: make debug an implicit controller on cgroup2
  cgroup: Make debug cgroup support v2 and thread mode
  cgroup: Make Kconfig prompt of debug cgroup more accurate
  cgroup: Move debug cgroup to its own file
  cgroup: Keep accurate count of tasks in each css_set
Linus Torvalds 8 years ago
parent
commit
9ced560b82

+ 43 - 17
Documentation/cgroup-v2.txt

@@ -149,6 +149,16 @@ during boot, before manual intervention is possible. To make testing
 and experimenting easier, the kernel parameter cgroup_no_v1= allows
 and experimenting easier, the kernel parameter cgroup_no_v1= allows
 disabling controllers in v1 and make them always available in v2.
 disabling controllers in v1 and make them always available in v2.
 
 
+cgroup v2 currently supports the following mount options.
+
+  nsdelegate
+
+	Consider cgroup namespaces as delegation boundaries.  This
+	option is system wide and can only be set on mount or modified
+	through remount from the init namespace.  The mount option is
+	ignored on non-init namespace mounts.  Please refer to the
+	Delegation section for details.
+
 
 
 2-2. Organizing Processes
 2-2. Organizing Processes
 
 
@@ -308,18 +318,27 @@ file.
 
 
 2-5-1. Model of Delegation
 2-5-1. Model of Delegation
 
 
-A cgroup can be delegated to a less privileged user by granting write
-access of the directory and its "cgroup.procs" file to the user.  Note
-that resource control interface files in a given directory control the
-distribution of the parent's resources and thus must not be delegated
-along with the directory.
-
-Once delegated, the user can build sub-hierarchy under the directory,
-organize processes as it sees fit and further distribute the resources
-it received from the parent.  The limits and other settings of all
-resource controllers are hierarchical and regardless of what happens
-in the delegated sub-hierarchy, nothing can escape the resource
-restrictions imposed by the parent.
+A cgroup can be delegated in two ways.  First, to a less privileged
+user by granting write access of the directory and its "cgroup.procs"
+and "cgroup.subtree_control" files to the user.  Second, if the
+"nsdelegate" mount option is set, automatically to a cgroup namespace
+on namespace creation.
+
+Because the resource control interface files in a given directory
+control the distribution of the parent's resources, the delegatee
+shouldn't be allowed to write to them.  For the first method, this is
+achieved by not granting access to these files.  For the second, the
+kernel rejects writes to all files other than "cgroup.procs" and
+"cgroup.subtree_control" on a namespace root from inside the
+namespace.
+
+The end results are equivalent for both delegation types.  Once
+delegated, the user can build sub-hierarchy under the directory,
+organize processes inside it as it sees fit and further distribute the
+resources it received from the parent.  The limits and other settings
+of all resource controllers are hierarchical and regardless of what
+happens in the delegated sub-hierarchy, nothing can escape the
+resource restrictions imposed by the parent.
 
 
 Currently, cgroup doesn't impose any restrictions on the number of
 Currently, cgroup doesn't impose any restrictions on the number of
 cgroups in or nesting depth of a delegated sub-hierarchy; however,
 cgroups in or nesting depth of a delegated sub-hierarchy; however,
@@ -329,10 +348,12 @@ this may be limited explicitly in the future.
 2-5-2. Delegation Containment
 2-5-2. Delegation Containment
 
 
 A delegated sub-hierarchy is contained in the sense that processes
 A delegated sub-hierarchy is contained in the sense that processes
-can't be moved into or out of the sub-hierarchy by the delegatee.  For
-a process with a non-root euid to migrate a target process into a
-cgroup by writing its PID to the "cgroup.procs" file, the following
-conditions must be met.
+can't be moved into or out of the sub-hierarchy by the delegatee.
+
+For delegations to a less privileged user, this is achieved by
+requiring the following conditions for a process with a non-root euid
+to migrate a target process into a cgroup by writing its PID to the
+"cgroup.procs" file.
 
 
 - The writer must have write access to the "cgroup.procs" file.
 - The writer must have write access to the "cgroup.procs" file.
 
 
@@ -359,6 +380,11 @@ destination cgroup C00 is above the points of delegation and U0 would
 not have write access to its "cgroup.procs" files and thus the write
 not have write access to its "cgroup.procs" files and thus the write
 will be denied with -EACCES.
 will be denied with -EACCES.
 
 
+For delegations to namespaces, containment is achieved by requiring
+that both the source and destination cgroups are reachable from the
+namespace of the process which is attempting the migration.  If either
+is not reachable, the migration is rejected with -ENOENT.
+
 
 
 2-6. Guidelines
 2-6. Guidelines
 
 
@@ -1413,7 +1439,7 @@ D. Deprecated v1 Core Features
 
 
 - Multiple hierarchies including named ones are not supported.
 - Multiple hierarchies including named ones are not supported.
 
 
-- All mount options and remounting are not supported.
+- All v1 mount options are not supported.
 
 
 - The "tasks" file is removed and "cgroup.procs" is not sorted.
 - The "tasks" file is removed and "cgroup.procs" is not sorted.
 
 

+ 12 - 0
include/linux/cgroup-defs.h

@@ -67,12 +67,21 @@ enum {
 enum {
 enum {
 	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
 	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
 	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
 	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
+
+	/*
+	 * Consider namespaces as delegation boundaries.  If this flag is
+	 * set, controller specific interface files in a namespace root
+	 * aren't writeable from inside the namespace.
+	 */
+	CGRP_ROOT_NS_DELEGATE	= (1 << 3),
 };
 };
 
 
 /* cftype->flags */
 /* cftype->flags */
 enum {
 enum {
 	CFTYPE_ONLY_ON_ROOT	= (1 << 0),	/* only create on root cgrp */
 	CFTYPE_ONLY_ON_ROOT	= (1 << 0),	/* only create on root cgrp */
 	CFTYPE_NOT_ON_ROOT	= (1 << 1),	/* don't create on root cgrp */
 	CFTYPE_NOT_ON_ROOT	= (1 << 1),	/* don't create on root cgrp */
+	CFTYPE_NS_DELEGATABLE	= (1 << 2),	/* writeable beyond delegation boundaries */
+
 	CFTYPE_NO_PREFIX	= (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
 	CFTYPE_NO_PREFIX	= (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
 	CFTYPE_WORLD_WRITABLE	= (1 << 4),	/* (DON'T USE FOR NEW FILES) S_IWUGO */
 	CFTYPE_WORLD_WRITABLE	= (1 << 4),	/* (DON'T USE FOR NEW FILES) S_IWUGO */
 
 
@@ -166,6 +175,9 @@ struct css_set {
 	/* the default cgroup associated with this css_set */
 	/* the default cgroup associated with this css_set */
 	struct cgroup *dfl_cgrp;
 	struct cgroup *dfl_cgrp;
 
 
+	/* internal task count, protected by css_set_lock */
+	int nr_tasks;
+
 	/*
 	/*
 	 * Lists running through all tasks using this cgroup group.
 	 * Lists running through all tasks using this cgroup group.
 	 * mg_tasks lists tasks which belong to this cset but are in the
 	 * mg_tasks lists tasks which belong to this cset but are in the

+ 5 - 2
init/Kconfig

@@ -859,11 +859,14 @@ config CGROUP_BPF
 	  inet sockets.
 	  inet sockets.
 
 
 config CGROUP_DEBUG
 config CGROUP_DEBUG
-	bool "Example controller"
+	bool "Debug controller"
 	default n
 	default n
+	depends on DEBUG_KERNEL
 	help
 	help
 	  This option enables a simple controller that exports
 	  This option enables a simple controller that exports
-	  debugging information about the cgroups framework.
+	  debugging information about the cgroups framework. This
+	  controller is for control cgroup debugging only. Its
+	  interfaces are not stable.
 
 
 	  Say N.
 	  Say N.
 
 

+ 1 - 0
kernel/cgroup/Makefile

@@ -4,3 +4,4 @@ obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
 obj-$(CONFIG_CGROUP_RDMA) += rdma.o
 obj-$(CONFIG_CGROUP_RDMA) += rdma.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUP_DEBUG) += debug.o

+ 2 - 0
kernel/cgroup/cgroup-internal.h

@@ -192,6 +192,8 @@ int cgroup_rmdir(struct kernfs_node *kn);
 int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 		     struct kernfs_root *kf_root);
 		     struct kernfs_root *kf_root);
 
 
+int cgroup_task_count(const struct cgroup *cgrp);
+
 /*
 /*
  * namespace.c
  * namespace.c
  */
  */

+ 2 - 153
kernel/cgroup/cgroup-v1.c

@@ -334,19 +334,15 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
 /**
 /**
  * cgroup_task_count - count the number of tasks in a cgroup.
  * cgroup_task_count - count the number of tasks in a cgroup.
  * @cgrp: the cgroup in question
  * @cgrp: the cgroup in question
- *
- * Return the number of tasks in the cgroup.  The returned number can be
- * higher than the actual number of tasks due to css_set references from
- * namespace roots and temporary usages.
  */
  */
-static int cgroup_task_count(const struct cgroup *cgrp)
+int cgroup_task_count(const struct cgroup *cgrp)
 {
 {
 	int count = 0;
 	int count = 0;
 	struct cgrp_cset_link *link;
 	struct cgrp_cset_link *link;
 
 
 	spin_lock_irq(&css_set_lock);
 	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
-		count += refcount_read(&link->cset->refcount);
+		count += link->cset->nr_tasks;
 	spin_unlock_irq(&css_set_lock);
 	spin_unlock_irq(&css_set_lock);
 	return count;
 	return count;
 }
 }
@@ -1263,150 +1259,3 @@ static int __init cgroup_no_v1(char *str)
 	return 1;
 	return 1;
 }
 }
 __setup("cgroup_no_v1=", cgroup_no_v1);
 __setup("cgroup_no_v1=", cgroup_no_v1);
-
-
-#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *
-debug_css_alloc(struct cgroup_subsys_state *parent_css)
-{
-	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
-
-	if (!css)
-		return ERR_PTR(-ENOMEM);
-
-	return css;
-}
-
-static void debug_css_free(struct cgroup_subsys_state *css)
-{
-	kfree(css);
-}
-
-static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
-				struct cftype *cft)
-{
-	return cgroup_task_count(css->cgroup);
-}
-
-static u64 current_css_set_read(struct cgroup_subsys_state *css,
-				struct cftype *cft)
-{
-	return (u64)(unsigned long)current->cgroups;
-}
-
-static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
-					 struct cftype *cft)
-{
-	u64 count;
-
-	rcu_read_lock();
-	count = refcount_read(&task_css_set(current)->refcount);
-	rcu_read_unlock();
-	return count;
-}
-
-static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
-{
-	struct cgrp_cset_link *link;
-	struct css_set *cset;
-	char *name_buf;
-
-	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
-	if (!name_buf)
-		return -ENOMEM;
-
-	spin_lock_irq(&css_set_lock);
-	rcu_read_lock();
-	cset = rcu_dereference(current->cgroups);
-	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
-		struct cgroup *c = link->cgrp;
-
-		cgroup_name(c, name_buf, NAME_MAX + 1);
-		seq_printf(seq, "Root %d group %s\n",
-			   c->root->hierarchy_id, name_buf);
-	}
-	rcu_read_unlock();
-	spin_unlock_irq(&css_set_lock);
-	kfree(name_buf);
-	return 0;
-}
-
-#define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct seq_file *seq, void *v)
-{
-	struct cgroup_subsys_state *css = seq_css(seq);
-	struct cgrp_cset_link *link;
-
-	spin_lock_irq(&css_set_lock);
-	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
-		struct css_set *cset = link->cset;
-		struct task_struct *task;
-		int count = 0;
-
-		seq_printf(seq, "css_set %pK\n", cset);
-
-		list_for_each_entry(task, &cset->tasks, cg_list) {
-			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
-				goto overflow;
-			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
-		}
-
-		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
-			if (count++ > MAX_TASKS_SHOWN_PER_CSS)
-				goto overflow;
-			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
-		}
-		continue;
-	overflow:
-		seq_puts(seq, "  ...\n");
-	}
-	spin_unlock_irq(&css_set_lock);
-	return 0;
-}
-
-static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
-{
-	return (!cgroup_is_populated(css->cgroup) &&
-		!css_has_online_children(&css->cgroup->self));
-}
-
-static struct cftype debug_files[] =  {
-	{
-		.name = "taskcount",
-		.read_u64 = debug_taskcount_read,
-	},
-
-	{
-		.name = "current_css_set",
-		.read_u64 = current_css_set_read,
-	},
-
-	{
-		.name = "current_css_set_refcount",
-		.read_u64 = current_css_set_refcount_read,
-	},
-
-	{
-		.name = "current_css_set_cg_links",
-		.seq_show = current_css_set_cg_links_read,
-	},
-
-	{
-		.name = "cgroup_css_links",
-		.seq_show = cgroup_css_links_read,
-	},
-
-	{
-		.name = "releasable",
-		.read_u64 = releasable_read,
-	},
-
-	{ }	/* terminate */
-};
-
-struct cgroup_subsys debug_cgrp_subsys = {
-	.css_alloc = debug_css_alloc,
-	.css_free = debug_css_free,
-	.legacy_cftypes = debug_files,
-};
-#endif /* CONFIG_CGROUP_DEBUG */

+ 126 - 29
kernel/cgroup/cgroup.c

@@ -573,6 +573,11 @@ static int css_set_count	= 1;	/* 1 for init_css_set */
 /**
 /**
  * css_set_populated - does a css_set contain any tasks?
  * css_set_populated - does a css_set contain any tasks?
  * @cset: target css_set
  * @cset: target css_set
+ *
+ * css_set_populated() should be the same as !!cset->nr_tasks at steady
+ * state. However, css_set_populated() can be called while a task is being
+ * added to or removed from the linked list before the nr_tasks is
+ * properly updated. Hence, we can't just look at ->nr_tasks here.
  */
  */
 static bool css_set_populated(struct css_set *cset)
 static bool css_set_populated(struct css_set *cset)
 {
 {
@@ -1542,10 +1547,56 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 	return len;
 	return len;
 }
 }
 
 
+static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
+{
+	char *token;
+
+	*root_flags = 0;
+
+	if (!data)
+		return 0;
+
+	while ((token = strsep(&data, ",")) != NULL) {
+		if (!strcmp(token, "nsdelegate")) {
+			*root_flags |= CGRP_ROOT_NS_DELEGATE;
+			continue;
+		}
+
+		pr_err("cgroup2: unknown option \"%s\"\n", token);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void apply_cgroup_root_flags(unsigned int root_flags)
+{
+	if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
+		if (root_flags & CGRP_ROOT_NS_DELEGATE)
+			cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
+		else
+			cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+	}
+}
+
+static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
+{
+	if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
+		seq_puts(seq, ",nsdelegate");
+	return 0;
+}
+
 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
 {
 {
-	pr_err("remount is not allowed\n");
-	return -EINVAL;
+	unsigned int root_flags;
+	int ret;
+
+	ret = parse_cgroup_root_flags(data, &root_flags);
+	if (ret)
+		return ret;
+
+	apply_cgroup_root_flags(root_flags);
+	return 0;
 }
 }
 
 
 /*
 /*
@@ -1598,6 +1649,7 @@ static void cgroup_enable_task_cg_lists(void)
 				css_set_update_populated(cset, true);
 				css_set_update_populated(cset, true);
 			list_add_tail(&p->cg_list, &cset->tasks);
 			list_add_tail(&p->cg_list, &cset->tasks);
 			get_css_set(cset);
 			get_css_set(cset);
+			cset->nr_tasks++;
 		}
 		}
 		spin_unlock(&p->sighand->siglock);
 		spin_unlock(&p->sighand->siglock);
 	} while_each_thread(g, p);
 	} while_each_thread(g, p);
@@ -1784,6 +1836,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 {
 {
 	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
 	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
 	struct dentry *dentry;
 	struct dentry *dentry;
+	int ret;
 
 
 	get_cgroup_ns(ns);
 	get_cgroup_ns(ns);
 
 
@@ -1801,16 +1854,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		cgroup_enable_task_cg_lists();
 		cgroup_enable_task_cg_lists();
 
 
 	if (fs_type == &cgroup2_fs_type) {
 	if (fs_type == &cgroup2_fs_type) {
-		if (data) {
-			pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+		unsigned int root_flags;
+
+		ret = parse_cgroup_root_flags(data, &root_flags);
+		if (ret) {
 			put_cgroup_ns(ns);
 			put_cgroup_ns(ns);
-			return ERR_PTR(-EINVAL);
+			return ERR_PTR(ret);
 		}
 		}
+
 		cgrp_dfl_visible = true;
 		cgrp_dfl_visible = true;
 		cgroup_get_live(&cgrp_dfl_root.cgrp);
 		cgroup_get_live(&cgrp_dfl_root.cgrp);
 
 
 		dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
 		dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
 					 CGROUP2_SUPER_MAGIC, ns);
 					 CGROUP2_SUPER_MAGIC, ns);
+		if (!IS_ERR(dentry))
+			apply_cgroup_root_flags(root_flags);
 	} else {
 	} else {
 		dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
 		dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
 				       CGROUP_SUPER_MAGIC, ns);
 				       CGROUP_SUPER_MAGIC, ns);
@@ -2064,8 +2122,10 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 			struct css_set *to_cset = cset->mg_dst_cset;
 			struct css_set *to_cset = cset->mg_dst_cset;
 
 
 			get_css_set(to_cset);
 			get_css_set(to_cset);
+			to_cset->nr_tasks++;
 			css_set_move_task(task, from_cset, to_cset, true);
 			css_set_move_task(task, from_cset, to_cset, true);
 			put_css_set_locked(from_cset);
 			put_css_set_locked(from_cset);
+			from_cset->nr_tasks--;
 		}
 		}
 	}
 	}
 	spin_unlock_irq(&css_set_lock);
 	spin_unlock_irq(&css_set_lock);
@@ -2355,27 +2415,14 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 					 struct cgroup *dst_cgrp,
 					 struct cgroup *dst_cgrp,
 					 struct kernfs_open_file *of)
 					 struct kernfs_open_file *of)
 {
 {
-	int ret = 0;
-
-	if (cgroup_on_dfl(dst_cgrp)) {
-		struct super_block *sb = of->file->f_path.dentry->d_sb;
-		struct cgroup *cgrp;
-		struct inode *inode;
-
-		spin_lock_irq(&css_set_lock);
-		cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
-		spin_unlock_irq(&css_set_lock);
-
-		while (!cgroup_is_descendant(dst_cgrp, cgrp))
-			cgrp = cgroup_parent(cgrp);
+	struct super_block *sb = of->file->f_path.dentry->d_sb;
+	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+	struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
+	struct cgroup *src_cgrp, *com_cgrp;
+	struct inode *inode;
+	int ret;
 
 
-		ret = -ENOMEM;
-		inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
-		if (inode) {
-			ret = inode_permission(inode, MAY_WRITE);
-			iput(inode);
-		}
-	} else {
+	if (!cgroup_on_dfl(dst_cgrp)) {
 		const struct cred *cred = current_cred();
 		const struct cred *cred = current_cred();
 		const struct cred *tcred = get_task_cred(task);
 		const struct cred *tcred = get_task_cred(task);
 
 
@@ -2383,14 +2430,47 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 		 * even if we're attaching all tasks in the thread group,
 		 * even if we're attaching all tasks in the thread group,
 		 * we only need to check permissions on one of them.
 		 * we only need to check permissions on one of them.
 		 */
 		 */
-		if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
-		    !uid_eq(cred->euid, tcred->uid) &&
-		    !uid_eq(cred->euid, tcred->suid))
+		if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
+		    uid_eq(cred->euid, tcred->uid) ||
+		    uid_eq(cred->euid, tcred->suid))
+			ret = 0;
+		else
 			ret = -EACCES;
 			ret = -EACCES;
+
 		put_cred(tcred);
 		put_cred(tcred);
+		return ret;
 	}
 	}
 
 
-	return ret;
+	/* find the source cgroup */
+	spin_lock_irq(&css_set_lock);
+	src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+	spin_unlock_irq(&css_set_lock);
+
+	/* and the common ancestor */
+	com_cgrp = src_cgrp;
+	while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
+		com_cgrp = cgroup_parent(com_cgrp);
+
+	/* %current should be authorized to migrate to the common ancestor */
+	inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
+	if (!inode)
+		return -ENOMEM;
+
+	ret = inode_permission(inode, MAY_WRITE);
+	iput(inode);
+	if (ret)
+		return ret;
+
+	/*
+	 * If namespaces are delegation boundaries, %current must be able
+	 * to see both source and destination cgroups from its namespace.
+	 */
+	if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
+	    (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
+	     !cgroup_is_descendant(dst_cgrp, root_cgrp)))
+		return -ENOENT;
+
+	return 0;
 }
 }
 
 
 /*
 /*
@@ -2954,11 +3034,23 @@ static void cgroup_file_release(struct kernfs_open_file *of)
 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
 				 size_t nbytes, loff_t off)
 				 size_t nbytes, loff_t off)
 {
 {
+	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
 	struct cgroup *cgrp = of->kn->parent->priv;
 	struct cgroup *cgrp = of->kn->parent->priv;
 	struct cftype *cft = of->kn->priv;
 	struct cftype *cft = of->kn->priv;
 	struct cgroup_subsys_state *css;
 	struct cgroup_subsys_state *css;
 	int ret;
 	int ret;
 
 
+	/*
+	 * If namespaces are delegation boundaries, disallow writes to
+	 * files in an non-init namespace root from inside the namespace
+	 * except for the files explicitly marked delegatable -
+	 * cgroup.procs and cgroup.subtree_control.
+	 */
+	if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
+	    !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
+	    ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
+		return -EPERM;
+
 	if (cft->write)
 	if (cft->write)
 		return cft->write(of, buf, nbytes, off);
 		return cft->write(of, buf, nbytes, off);
 
 
@@ -3792,6 +3884,7 @@ static int cgroup_procs_show(struct seq_file *s, void *v)
 static struct cftype cgroup_base_files[] = {
 static struct cftype cgroup_base_files[] = {
 	{
 	{
 		.name = "cgroup.procs",
 		.name = "cgroup.procs",
+		.flags = CFTYPE_NS_DELEGATABLE,
 		.file_offset = offsetof(struct cgroup, procs_file),
 		.file_offset = offsetof(struct cgroup, procs_file),
 		.release = cgroup_procs_release,
 		.release = cgroup_procs_release,
 		.seq_start = cgroup_procs_start,
 		.seq_start = cgroup_procs_start,
@@ -3805,6 +3898,7 @@ static struct cftype cgroup_base_files[] = {
 	},
 	},
 	{
 	{
 		.name = "cgroup.subtree_control",
 		.name = "cgroup.subtree_control",
+		.flags = CFTYPE_NS_DELEGATABLE,
 		.seq_show = cgroup_subtree_control_show,
 		.seq_show = cgroup_subtree_control_show,
 		.write = cgroup_subtree_control_write,
 		.write = cgroup_subtree_control_write,
 	},
 	},
@@ -4393,6 +4487,7 @@ int cgroup_rmdir(struct kernfs_node *kn)
 }
 }
 
 
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
+	.show_options		= cgroup_show_options,
 	.remount_fs		= cgroup_remount,
 	.remount_fs		= cgroup_remount,
 	.mkdir			= cgroup_mkdir,
 	.mkdir			= cgroup_mkdir,
 	.rmdir			= cgroup_rmdir,
 	.rmdir			= cgroup_rmdir,
@@ -4789,6 +4884,7 @@ void cgroup_post_fork(struct task_struct *child)
 		cset = task_css_set(current);
 		cset = task_css_set(current);
 		if (list_empty(&child->cg_list)) {
 		if (list_empty(&child->cg_list)) {
 			get_css_set(cset);
 			get_css_set(cset);
+			cset->nr_tasks++;
 			css_set_move_task(child, NULL, cset, false);
 			css_set_move_task(child, NULL, cset, false);
 		}
 		}
 		spin_unlock_irq(&css_set_lock);
 		spin_unlock_irq(&css_set_lock);
@@ -4838,6 +4934,7 @@ void cgroup_exit(struct task_struct *tsk)
 	if (!list_empty(&tsk->cg_list)) {
 	if (!list_empty(&tsk->cg_list)) {
 		spin_lock_irq(&css_set_lock);
 		spin_lock_irq(&css_set_lock);
 		css_set_move_task(tsk, cset, NULL, false);
 		css_set_move_task(tsk, cset, NULL, false);
+		cset->nr_tasks--;
 		spin_unlock_irq(&css_set_lock);
 		spin_unlock_irq(&css_set_lock);
 	} else {
 	} else {
 		get_css_set(cset);
 		get_css_set(cset);

+ 357 - 0
kernel/cgroup/debug.c

@@ -0,0 +1,357 @@
+/*
+ * Debug controller
+ *
+ * WARNING: This controller is for cgroup core debugging only.
+ * Its interfaces are unstable and subject to changes at any time.
+ */
+#include <linux/ctype.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include "cgroup-internal.h"
+
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+	if (!css)
+		return ERR_PTR(-ENOMEM);
+
+	return css;
+}
+
+static void debug_css_free(struct cgroup_subsys_state *css)
+{
+	kfree(css);
+}
+
+/*
+ * debug_taskcount_read - return the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ */
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+				struct cftype *cft)
+{
+	return cgroup_task_count(css->cgroup);
+}
+
+static int current_css_set_read(struct seq_file *seq, void *v)
+{
+	struct kernfs_open_file *of = seq->private;
+	struct css_set *cset;
+	struct cgroup_subsys *ss;
+	struct cgroup_subsys_state *css;
+	int i, refcnt;
+
+	if (!cgroup_kn_lock_live(of->kn, false))
+		return -ENODEV;
+
+	spin_lock_irq(&css_set_lock);
+	rcu_read_lock();
+	cset = rcu_dereference(current->cgroups);
+	refcnt = refcount_read(&cset->refcount);
+	seq_printf(seq, "css_set %pK %d", cset, refcnt);
+	if (refcnt > cset->nr_tasks)
+		seq_printf(seq, " +%d", refcnt - cset->nr_tasks);
+	seq_puts(seq, "\n");
+
+	/*
+	 * Print the css'es stored in the current css_set.
+	 */
+	for_each_subsys(ss, i) {
+		css = cset->subsys[ss->id];
+		if (!css)
+			continue;
+		seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name,
+			  (unsigned long)css, css->id);
+	}
+	rcu_read_unlock();
+	spin_unlock_irq(&css_set_lock);
+	cgroup_kn_unlock(of->kn);
+	return 0;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
+					 struct cftype *cft)
+{
+	u64 count;
+
+	rcu_read_lock();
+	count = refcount_read(&task_css_set(current)->refcount);
+	rcu_read_unlock();
+	return count;
+}
+
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
+{
+	struct cgrp_cset_link *link;
+	struct css_set *cset;
+	char *name_buf;
+
+	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+	if (!name_buf)
+		return -ENOMEM;
+
+	spin_lock_irq(&css_set_lock);
+	rcu_read_lock();
+	cset = rcu_dereference(current->cgroups);
+	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+		struct cgroup *c = link->cgrp;
+
+		cgroup_name(c, name_buf, NAME_MAX + 1);
+		seq_printf(seq, "Root %d group %s\n",
+			   c->root->hierarchy_id, name_buf);
+	}
+	rcu_read_unlock();
+	spin_unlock_irq(&css_set_lock);
+	kfree(name_buf);
+	return 0;
+}
+
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
+{
+	struct cgroup_subsys_state *css = seq_css(seq);
+	struct cgrp_cset_link *link;
+	int dead_cnt = 0, extra_refs = 0;
+
+	spin_lock_irq(&css_set_lock);
+	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
+		struct css_set *cset = link->cset;
+		struct task_struct *task;
+		int count = 0;
+		int refcnt = refcount_read(&cset->refcount);
+
+		seq_printf(seq, " %d", refcnt);
+		if (refcnt - cset->nr_tasks > 0) {
+			int extra = refcnt - cset->nr_tasks;
+
+			seq_printf(seq, " +%d", extra);
+			/*
+			 * Take out the one additional reference in
+			 * init_css_set.
+			 */
+			if (cset == &init_css_set)
+				extra--;
+			extra_refs += extra;
+		}
+		seq_puts(seq, "\n");
+
+		list_for_each_entry(task, &cset->tasks, cg_list) {
+			if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
+				seq_printf(seq, "  task %d\n",
+					   task_pid_vnr(task));
+		}
+
+		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+			if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
+				seq_printf(seq, "  task %d\n",
+					   task_pid_vnr(task));
+		}
+		/* show # of overflowed tasks */
+		if (count > MAX_TASKS_SHOWN_PER_CSS)
+			seq_printf(seq, "  ... (%d)\n",
+				   count - MAX_TASKS_SHOWN_PER_CSS);
+
+		if (cset->dead) {
+			seq_puts(seq, "    [dead]\n");
+			dead_cnt++;
+		}
+
+		WARN_ON(count != cset->nr_tasks);
+	}
+	spin_unlock_irq(&css_set_lock);
+
+	if (!dead_cnt && !extra_refs)
+		return 0;
+
+	seq_puts(seq, "\n");
+	if (extra_refs)
+		seq_printf(seq, "extra references = %d\n", extra_refs);
+	if (dead_cnt)
+		seq_printf(seq, "dead css_sets = %d\n", dead_cnt);
+
+	return 0;
+}
+
+static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
+{
+	struct kernfs_open_file *of = seq->private;
+	struct cgroup *cgrp;
+	struct cgroup_subsys *ss;
+	struct cgroup_subsys_state *css;
+	char pbuf[16];
+	int i;
+
+	cgrp = cgroup_kn_lock_live(of->kn, false);
+	if (!cgrp)
+		return -ENODEV;
+
+	for_each_subsys(ss, i) {
+		css = rcu_dereference_check(cgrp->subsys[ss->id], true);
+		if (!css)
+			continue;
+
+		pbuf[0] = '\0';
+
+		/* Show the parent CSS if applicable*/
+		if (css->parent)
+			snprintf(pbuf, sizeof(pbuf) - 1, " P=%d",
+				 css->parent->id);
+		seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name,
+			  (unsigned long)css, css->id,
+			  atomic_read(&css->online_cnt), pbuf);
+	}
+
+	cgroup_kn_unlock(of->kn);
+	return 0;
+}
+
+static void cgroup_masks_read_one(struct seq_file *seq, const char *name,
+				  u16 mask)
+{
+	struct cgroup_subsys *ss;
+	int ssid;
+	bool first = true;
+
+	seq_printf(seq, "%-17s: ", name);
+	for_each_subsys(ss, ssid) {
+		if (!(mask & (1 << ssid)))
+			continue;
+		if (!first)
+			seq_puts(seq, ", ");
+		seq_puts(seq, ss->name);
+		first = false;
+	}
+	seq_putc(seq, '\n');
+}
+
+static int cgroup_masks_read(struct seq_file *seq, void *v)
+{
+	struct kernfs_open_file *of = seq->private;
+	struct cgroup *cgrp;
+
+	cgrp = cgroup_kn_lock_live(of->kn, false);
+	if (!cgrp)
+		return -ENODEV;
+
+	cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control);
+	cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask);
+
+	cgroup_kn_unlock(of->kn);
+	return 0;
+}
+
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	return (!cgroup_is_populated(css->cgroup) &&
+		!css_has_online_children(&css->cgroup->self));
+}
+
+static struct cftype debug_legacy_files[] =  {
+	{
+		.name = "taskcount",
+		.read_u64 = debug_taskcount_read,
+	},
+
+	{
+		.name = "current_css_set",
+		.seq_show = current_css_set_read,
+		.flags = CFTYPE_ONLY_ON_ROOT,
+	},
+
+	{
+		.name = "current_css_set_refcount",
+		.read_u64 = current_css_set_refcount_read,
+		.flags = CFTYPE_ONLY_ON_ROOT,
+	},
+
+	{
+		.name = "current_css_set_cg_links",
+		.seq_show = current_css_set_cg_links_read,
+		.flags = CFTYPE_ONLY_ON_ROOT,
+	},
+
+	{
+		.name = "cgroup_css_links",
+		.seq_show = cgroup_css_links_read,
+	},
+
+	{
+		.name = "cgroup_subsys_states",
+		.seq_show = cgroup_subsys_states_read,
+	},
+
+	{
+		.name = "cgroup_masks",
+		.seq_show = cgroup_masks_read,
+	},
+
+	{
+		.name = "releasable",
+		.read_u64 = releasable_read,
+	},
+
+	{ }	/* terminate */
+};
+
+static struct cftype debug_files[] =  {
+	{
+		.name = "taskcount",
+		.read_u64 = debug_taskcount_read,
+	},
+
+	{
+		.name = "current_css_set",
+		.seq_show = current_css_set_read,
+		.flags = CFTYPE_ONLY_ON_ROOT,
+	},
+
+	{
+		.name = "current_css_set_refcount",
+		.read_u64 = current_css_set_refcount_read,
+		.flags = CFTYPE_ONLY_ON_ROOT,
+	},
+
+	{
+		.name = "current_css_set_cg_links",
+		.seq_show = current_css_set_cg_links_read,
+		.flags = CFTYPE_ONLY_ON_ROOT,
+	},
+
+	{
+		.name = "css_links",
+		.seq_show = cgroup_css_links_read,
+	},
+
+	{
+		.name = "csses",
+		.seq_show = cgroup_subsys_states_read,
+	},
+
+	{
+		.name = "masks",
+		.seq_show = cgroup_masks_read,
+	},
+
+	{ }	/* terminate */
+};
+
+struct cgroup_subsys debug_cgrp_subsys = {
+	.css_alloc	= debug_css_alloc,
+	.css_free	= debug_css_free,
+	.legacy_cftypes	= debug_legacy_files,
+};
+
+/*
+ * On v2, debug is an implicit controller enabled by "cgroup_debug" boot
+ * parameter.
+ */
+static int __init enable_cgroup_debug(char *str)
+{
+	debug_cgrp_subsys.dfl_cftypes = debug_files;
+	debug_cgrp_subsys.implicit_on_dfl = true;
+	return 1;
+}
+__setup("cgroup_debug", enable_cgroup_debug);