|
@@ -59,6 +59,9 @@
|
|
#include <linux/delay.h>
|
|
#include <linux/delay.h>
|
|
#include <linux/atomic.h>
|
|
#include <linux/atomic.h>
|
|
#include <linux/cpuset.h>
|
|
#include <linux/cpuset.h>
|
|
|
|
+#include <linux/proc_ns.h>
|
|
|
|
+#include <linux/nsproxy.h>
|
|
|
|
+#include <linux/proc_ns.h>
|
|
#include <net/sock.h>
|
|
#include <net/sock.h>
|
|
|
|
|
|
/*
|
|
/*
|
|
@@ -215,6 +218,15 @@ static u16 have_fork_callback __read_mostly;
|
|
static u16 have_exit_callback __read_mostly;
|
|
static u16 have_exit_callback __read_mostly;
|
|
static u16 have_free_callback __read_mostly;
|
|
static u16 have_free_callback __read_mostly;
|
|
|
|
|
|
|
|
+/*
+ * cgroup namespace for init task.
+ *
+ * Statically initialized. cgroup_mount() compares the caller's namespace
+ * against this object to decide whether the mount root has to be remapped,
+ * and task_cgroup_path() / cgroup_release_agent() pass it to
+ * cgroup_path_ns_locked() when formatting paths.
+ */
|
|
|
|
+struct cgroup_namespace init_cgroup_ns = {
|
|
|
|
+ .count = { .counter = 2, }, /* NOTE(review): starts at 2 rather than 1;
+                              * the reason is not visible here - confirm */
|
|
|
|
+ .user_ns = &init_user_ns, /* cgroup_init() takes a reference on this */
|
|
|
|
+ .ns.ops = &cgroupns_operations,
|
|
|
|
+ .ns.inum = PROC_CGROUP_INIT_INO,
|
|
|
|
+ .root_cset = &init_css_set, /* css_set whose cgroups act as the roots for
+                              * path lookups done through this namespace */
|
|
|
|
+};
|
|
|
|
+
|
|
/* Ditto for the can_fork callback. */
|
|
/* Ditto for the can_fork callback. */
|
|
static u16 have_canfork_callback __read_mostly;
|
|
static u16 have_canfork_callback __read_mostly;
|
|
|
|
|
|
@@ -2002,6 +2014,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
|
|
{
|
|
{
|
|
bool is_v2 = fs_type == &cgroup2_fs_type;
|
|
bool is_v2 = fs_type == &cgroup2_fs_type;
|
|
struct super_block *pinned_sb = NULL;
|
|
struct super_block *pinned_sb = NULL;
|
|
|
|
+ struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
|
|
struct cgroup_subsys *ss;
|
|
struct cgroup_subsys *ss;
|
|
struct cgroup_root *root;
|
|
struct cgroup_root *root;
|
|
struct cgroup_sb_opts opts;
|
|
struct cgroup_sb_opts opts;
|
|
@@ -2010,6 +2023,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
|
|
int i;
|
|
int i;
|
|
bool new_sb;
|
|
bool new_sb;
|
|
|
|
|
|
|
|
+ get_cgroup_ns(ns);
|
|
|
|
+
|
|
|
|
+ /* Check if the caller has permission to mount. */
|
|
|
|
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
|
|
|
|
+ put_cgroup_ns(ns);
|
|
|
|
+ return ERR_PTR(-EPERM);
|
|
|
|
+ }
|
|
|
|
+
|
|
/*
|
|
/*
|
|
* The first time anyone tries to mount a cgroup, enable the list
|
|
* The first time anyone tries to mount a cgroup, enable the list
|
|
* linking each css_set to its tasks and fix up all existing tasks.
|
|
* linking each css_set to its tasks and fix up all existing tasks.
|
|
@@ -2020,6 +2041,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
|
|
if (is_v2) {
|
|
if (is_v2) {
|
|
if (data) {
|
|
if (data) {
|
|
pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
|
|
pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
|
|
|
|
+ put_cgroup_ns(ns);
|
|
return ERR_PTR(-EINVAL);
|
|
return ERR_PTR(-EINVAL);
|
|
}
|
|
}
|
|
cgrp_dfl_visible = true;
|
|
cgrp_dfl_visible = true;
|
|
@@ -2125,6 +2147,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
|
|
goto out_unlock;
|
|
goto out_unlock;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ /*
|
|
|
|
+ * We know this subsystem has not yet been bound. Users in a non-init
|
|
|
|
+ * user namespace may only mount hierarchies with no bound subsystems,
|
|
|
|
+ * i.e. 'none,name=user1'
|
|
|
|
+ */
|
|
|
|
+ if (!opts.none && !capable(CAP_SYS_ADMIN)) {
|
|
|
|
+ ret = -EPERM;
|
|
|
|
+ goto out_unlock;
|
|
|
|
+ }
|
|
|
|
+
|
|
root = kzalloc(sizeof(*root), GFP_KERNEL);
|
|
root = kzalloc(sizeof(*root), GFP_KERNEL);
|
|
if (!root) {
|
|
if (!root) {
|
|
ret = -ENOMEM;
|
|
ret = -ENOMEM;
|
|
@@ -2143,12 +2175,37 @@ out_free:
|
|
kfree(opts.release_agent);
|
|
kfree(opts.release_agent);
|
|
kfree(opts.name);
|
|
kfree(opts.name);
|
|
|
|
|
|
- if (ret)
|
|
|
|
|
|
+ if (ret) {
|
|
|
|
+ put_cgroup_ns(ns);
|
|
return ERR_PTR(ret);
|
|
return ERR_PTR(ret);
|
|
|
|
+ }
|
|
out_mount:
|
|
out_mount:
|
|
dentry = kernfs_mount(fs_type, flags, root->kf_root,
|
|
dentry = kernfs_mount(fs_type, flags, root->kf_root,
|
|
is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
|
|
is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
|
|
&new_sb);
|
|
&new_sb);
|
|
|
|
+
|
|
|
|
+ /*
|
|
|
|
+ * In non-init cgroup namespace, instead of root cgroup's
|
|
|
|
+ * dentry, we return the dentry corresponding to the
|
|
|
|
+ * cgroup that ns->root_cset maps to in this hierarchy.
|
|
|
|
+ */
|
|
|
|
+ if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
|
|
|
|
+ struct dentry *nsdentry;
|
|
|
|
+ struct cgroup *cgrp;
|
|
|
|
+
|
|
|
|
+ mutex_lock(&cgroup_mutex);
|
|
|
|
+ spin_lock_bh(&css_set_lock);
|
|
|
|
+
|
|
|
|
+ cgrp = cset_cgroup_from_root(ns->root_cset, root);
|
|
|
|
+
|
|
|
|
+ spin_unlock_bh(&css_set_lock);
|
|
|
|
+ mutex_unlock(&cgroup_mutex);
|
|
|
|
+
|
|
|
|
+ nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
|
|
|
|
+ dput(dentry);
|
|
|
|
+ dentry = nsdentry;
|
|
|
|
+ }
|
|
|
|
+
|
|
if (IS_ERR(dentry) || !new_sb)
|
|
if (IS_ERR(dentry) || !new_sb)
|
|
cgroup_put(&root->cgrp);
|
|
cgroup_put(&root->cgrp);
|
|
|
|
|
|
@@ -2161,6 +2218,7 @@ out_mount:
|
|
deactivate_super(pinned_sb);
|
|
deactivate_super(pinned_sb);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ put_cgroup_ns(ns);
|
|
return dentry;
|
|
return dentry;
|
|
}
|
|
}
|
|
|
|
|
|
@@ -2189,14 +2247,45 @@ static struct file_system_type cgroup_fs_type = {
|
|
.name = "cgroup",
|
|
.name = "cgroup",
|
|
.mount = cgroup_mount,
|
|
.mount = cgroup_mount,
|
|
.kill_sb = cgroup_kill_sb,
|
|
.kill_sb = cgroup_kill_sb,
|
|
|
|
+ .fs_flags = FS_USERNS_MOUNT,
|
|
};
|
|
};
|
|
|
|
|
|
static struct file_system_type cgroup2_fs_type = {
|
|
static struct file_system_type cgroup2_fs_type = {
|
|
.name = "cgroup2",
|
|
.name = "cgroup2",
|
|
.mount = cgroup_mount,
|
|
.mount = cgroup_mount,
|
|
.kill_sb = cgroup_kill_sb,
|
|
.kill_sb = cgroup_kill_sb,
|
|
|
|
+ .fs_flags = FS_USERNS_MOUNT, /* NOTE(review): presumably allows mounting
+                               * from a non-init user namespace; the
+                               * ns_capable() check in cgroup_mount() is
+                               * then the permission gate - confirm */
|
|
};
|
|
};
|
|
|
|
|
|
|
|
+static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
|
|
|
|
+ struct cgroup_namespace *ns)
|
|
|
|
+{ /*
+   * Write the path of @cgrp into @buf (capacity @buflen), expressed
+   * relative to the cgroup that @ns->root_cset resolves to in @cgrp's
+   * hierarchy.
+   *
+   * Returns @buf on success, or NULL when kernfs_path_from_node() fails
+   * or reports a length that does not fit in @buflen.
+   *
+   * Locking: callers in this file hold css_set_lock around this call
+   * (cgroup_path_ns() additionally holds cgroup_mutex); the function
+   * takes no locks itself.
+   */
|
|
|
|
+ struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
|
|
|
|
+ int ret;
|
|
|
|
+
|
|
|
|
+ ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
|
|
|
|
+ if (ret < 0 || ret >= buflen) /* error, or (presumably) truncated output */
|
|
|
|
+ return NULL;
|
|
|
|
+ return buf;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
|
|
|
|
+ struct cgroup_namespace *ns)
|
|
|
|
+{ /*
+   * Locking wrapper around cgroup_path_ns_locked(): formats the path of
+   * @cgrp as seen from @ns into @buf (capacity @buflen) while holding
+   * cgroup_mutex and css_set_lock.
+   *
+   * Returns whatever cgroup_path_ns_locked() returns: @buf on success,
+   * NULL on failure.
+   */
|
|
|
|
+ char *ret;
|
|
|
|
+
|
|
|
|
+ mutex_lock(&cgroup_mutex);
|
|
|
|
+ spin_lock_bh(&css_set_lock);
|
|
|
|
+
|
|
|
|
+ ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
|
|
|
|
+
|
|
|
|
+ spin_unlock_bh(&css_set_lock); /* released in reverse acquisition order */
|
|
|
|
+ mutex_unlock(&cgroup_mutex);
|
|
|
|
+
|
|
|
|
+ return ret;
|
|
|
|
+}
|
|
|
|
+EXPORT_SYMBOL_GPL(cgroup_path_ns);
|
|
|
|
+
|
|
/**
|
|
/**
|
|
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
|
|
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
|
|
* @task: target task
|
|
* @task: target task
|
|
@@ -2224,7 +2313,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
|
|
|
|
|
|
if (root) {
|
|
if (root) {
|
|
cgrp = task_cgroup_from_root(task, root);
|
|
cgrp = task_cgroup_from_root(task, root);
|
|
- path = cgroup_path(cgrp, buf, buflen);
|
|
|
|
|
|
+ path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
|
|
} else {
|
|
} else {
|
|
/* if no hierarchy exists, everyone is in "/" */
|
|
/* if no hierarchy exists, everyone is in "/" */
|
|
if (strlcpy(buf, "/", buflen) < buflen)
|
|
if (strlcpy(buf, "/", buflen) < buflen)
|
|
@@ -5450,6 +5539,8 @@ int __init cgroup_init(void)
|
|
BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
|
|
BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
|
|
BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
|
|
BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
|
|
|
|
|
|
|
|
+ get_user_ns(init_cgroup_ns.user_ns);
|
|
|
|
+
|
|
mutex_lock(&cgroup_mutex);
|
|
mutex_lock(&cgroup_mutex);
|
|
|
|
|
|
/*
|
|
/*
|
|
@@ -5601,7 +5692,8 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
|
|
* " (deleted)" is appended to the cgroup path.
|
|
* " (deleted)" is appended to the cgroup path.
|
|
*/
|
|
*/
|
|
if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
|
|
if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
|
|
- path = cgroup_path(cgrp, buf, PATH_MAX);
|
|
|
|
|
|
+ path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
|
|
|
|
+ current->nsproxy->cgroup_ns);
|
|
if (!path) {
|
|
if (!path) {
|
|
retval = -ENAMETOOLONG;
|
|
retval = -ENAMETOOLONG;
|
|
goto out_unlock;
|
|
goto out_unlock;
|
|
@@ -5886,7 +5978,9 @@ static void cgroup_release_agent(struct work_struct *work)
|
|
if (!pathbuf || !agentbuf)
|
|
if (!pathbuf || !agentbuf)
|
|
goto out;
|
|
goto out;
|
|
|
|
|
|
- path = cgroup_path(cgrp, pathbuf, PATH_MAX);
|
|
|
|
|
|
+ spin_lock_bh(&css_set_lock);
|
|
|
|
+ path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
|
|
|
|
+ spin_unlock_bh(&css_set_lock);
|
|
if (!path)
|
|
if (!path)
|
|
goto out;
|
|
goto out;
|
|
|
|
|
|
@@ -6098,6 +6192,133 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
|
|
|
|
|
|
#endif /* CONFIG_SOCK_CGROUP_DATA */
|
|
#endif /* CONFIG_SOCK_CGROUP_DATA */
|
|
|
|
|
|
|
|
+/* cgroup namespaces */
|
|
|
|
+
|
|
|
|
+static struct cgroup_namespace *alloc_cgroup_ns(void)
|
|
|
|
+{ /*
+   * Allocate a cgroup_namespace with kzalloc(), give it an inode number
+   * via ns_alloc_inum(), set its reference count to 1 and attach
+   * cgroupns_operations.
+   *
+   * user_ns and root_cset are not set here; copy_cgroup_ns() fills them
+   * in after this returns.
+   *
+   * Returns the new namespace, ERR_PTR(-ENOMEM) if the allocation fails,
+   * or ERR_PTR() of the ns_alloc_inum() error code.
+   */
|
|
|
|
+ struct cgroup_namespace *new_ns;
|
|
|
|
+ int ret;
|
|
|
|
+
|
|
|
|
+ new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
|
|
|
|
+ if (!new_ns)
|
|
|
|
+ return ERR_PTR(-ENOMEM);
|
|
|
|
+ ret = ns_alloc_inum(&new_ns->ns);
|
|
|
|
+ if (ret) {
|
|
|
|
+ kfree(new_ns); /* nothing else is attached to it yet */
|
|
|
|
+ return ERR_PTR(ret);
|
|
|
|
+ }
|
|
|
|
+ atomic_set(&new_ns->count, 1); /* the caller owns the initial reference */
|
|
|
|
+ new_ns->ns.ops = &cgroupns_operations;
|
|
|
|
+ return new_ns;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+void free_cgroup_ns(struct cgroup_namespace *ns)
|
|
|
|
+{ /*
+   * Tear down @ns: drop the references it holds on its root css_set and
+   * its user namespace, release its inode number and free the memory.
+   *
+   * NOTE(review): presumably reached from put_cgroup_ns() when the last
+   * reference goes away; that helper is not visible here - confirm.
+   */
|
|
|
|
+ put_css_set(ns->root_cset);
|
|
|
|
+ put_user_ns(ns->user_ns);
|
|
|
|
+ ns_free_inum(&ns->ns);
|
|
|
|
+ kfree(ns);
|
|
|
|
+}
|
|
|
|
+EXPORT_SYMBOL(free_cgroup_ns);
|
|
|
|
+
|
|
|
|
+struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
|
|
|
|
+ struct user_namespace *user_ns,
|
|
|
|
+ struct cgroup_namespace *old_ns)
|
|
|
|
+{ /*
+   * Produce the cgroup namespace to use for the given clone @flags.
+   *
+   * Without CLONE_NEWCGROUP: takes an extra reference on @old_ns and
+   * returns it.
+   *
+   * With CLONE_NEWCGROUP: requires CAP_SYS_ADMIN in @user_ns, then
+   * creates a namespace whose root_cset is the css_set of current and
+   * whose user_ns is @user_ns (a reference is taken on each).
+   *
+   * Returns the namespace, ERR_PTR(-EPERM) if the capability check
+   * fails, or the ERR_PTR() from alloc_cgroup_ns(). @old_ns must not be
+   * NULL (BUG_ON).
+   */
|
|
|
|
+ struct cgroup_namespace *new_ns;
|
|
|
|
+ struct css_set *cset;
|
|
|
|
+
|
|
|
|
+ BUG_ON(!old_ns);
|
|
|
|
+
|
|
|
|
+ if (!(flags & CLONE_NEWCGROUP)) {
|
|
|
|
+ get_cgroup_ns(old_ns);
|
|
|
|
+ return old_ns;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /* Allow only sysadmin to create cgroup namespace. */
|
|
|
|
+ if (!ns_capable(user_ns, CAP_SYS_ADMIN))
|
|
|
|
+ return ERR_PTR(-EPERM);
|
|
|
|
+
|
|
|
|
+ mutex_lock(&cgroup_mutex);
|
|
|
|
+ spin_lock_bh(&css_set_lock);
|
|
|
|
+
|
|
|
|
+ cset = task_css_set(current);
|
|
|
|
+ get_css_set(cset); /* reference is taken while both locks are held */
|
|
|
|
+
|
|
|
|
+ spin_unlock_bh(&css_set_lock);
|
|
|
|
+ mutex_unlock(&cgroup_mutex);
|
|
|
|
+
|
|
|
|
+ new_ns = alloc_cgroup_ns(); /* called only after the locks are dropped */
|
|
|
|
+ if (IS_ERR(new_ns)) {
|
|
|
|
+ put_css_set(cset); /* undo the reference taken above */
|
|
|
|
+ return new_ns;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ new_ns->user_ns = get_user_ns(user_ns);
|
|
|
|
+ new_ns->root_cset = cset; /* the reference taken above now belongs to new_ns */
|
|
|
|
+
|
|
|
|
+ return new_ns;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
|
|
|
|
+{ /* Map an embedded ns_common back to its containing cgroup_namespace. */
|
|
|
|
+ return container_of(ns, struct cgroup_namespace, ns);
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
|
|
|
|
+{ /*
+   * .install hook of cgroupns_operations: make @nsproxy use the cgroup
+   * namespace that embeds @ns.
+   *
+   * Returns -EPERM unless the caller has CAP_SYS_ADMIN both in its
+   * current user namespace and in the target namespace's user_ns;
+   * otherwise returns 0.
+   */
|
|
|
|
+ struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
|
|
|
|
+
|
|
|
|
+ if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
|
|
|
|
+ !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
|
|
|
|
+ return -EPERM;
|
|
|
|
+
|
|
|
|
+ /* Don't need to do anything if we are attaching to our own cgroupns. */
|
|
|
|
+ if (cgroup_ns == nsproxy->cgroup_ns)
|
|
|
|
+ return 0;
|
|
|
|
+
|
|
|
|
+ get_cgroup_ns(cgroup_ns); /* take the new reference before dropping the old one */
|
|
|
|
+ put_cgroup_ns(nsproxy->cgroup_ns);
|
|
|
|
+ nsproxy->cgroup_ns = cgroup_ns;
|
|
|
|
+
|
|
|
|
+ return 0;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static struct ns_common *cgroupns_get(struct task_struct *task)
|
|
|
|
+{ /*
+   * .get hook of cgroupns_operations: return the ns_common of @task's
+   * cgroup namespace with a reference taken on the namespace, or NULL
+   * when @task has no nsproxy. The nsproxy is read under task_lock().
+   */
|
|
|
|
+ struct cgroup_namespace *ns = NULL;
|
|
|
|
+ struct nsproxy *nsproxy;
|
|
|
|
+
|
|
|
|
+ task_lock(task);
|
|
|
|
+ nsproxy = task->nsproxy;
|
|
|
|
+ if (nsproxy) {
|
|
|
|
+ ns = nsproxy->cgroup_ns;
|
|
|
|
+ get_cgroup_ns(ns);
|
|
|
|
+ }
|
|
|
|
+ task_unlock(task);
|
|
|
|
+
|
|
|
|
+ return ns ? &ns->ns : NULL;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static void cgroupns_put(struct ns_common *ns)
|
|
|
|
+{ /* .put hook of cgroupns_operations: drop one reference on the owning cgroup namespace. */
|
|
|
|
+ put_cgroup_ns(to_cg_ns(ns));
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+const struct proc_ns_operations cgroupns_operations = { /*
+   * proc_ns operations table for cgroup namespaces. It is installed as
+   * ns.ops on init_cgroup_ns and on every namespace created by
+   * alloc_cgroup_ns().
+   */
|
|
|
|
+ .name = "cgroup",
|
|
|
|
+ .type = CLONE_NEWCGROUP,
|
|
|
|
+ .get = cgroupns_get,
|
|
|
|
+ .put = cgroupns_put,
|
|
|
|
+ .install = cgroupns_install,
|
|
|
|
+};
|
|
|
|
+
|
|
|
|
+static __init int cgroup_namespaces_init(void)
|
|
|
|
+{ /* Registered as a subsys_initcall; currently does no work and always returns 0. */
|
|
|
|
+ return 0;
|
|
|
|
+}
|
|
|
|
+subsys_initcall(cgroup_namespaces_init);
|
|
|
|
+
|
|
#ifdef CONFIG_CGROUP_DEBUG
|
|
#ifdef CONFIG_CGROUP_DEBUG
|
|
static struct cgroup_subsys_state *
|
|
static struct cgroup_subsys_state *
|
|
debug_css_alloc(struct cgroup_subsys_state *parent_css)
|
|
debug_css_alloc(struct cgroup_subsys_state *parent_css)
|