@@ -162,6 +162,9 @@ static u16 cgrp_dfl_inhibit_ss_mask;
/* some controllers are implicitly enabled on the default hierarchy */
static u16 cgrp_dfl_implicit_ss_mask;

+/* some controllers can be threaded on the default hierarchy */
+static u16 cgrp_dfl_threaded_ss_mask;
+
/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
@@ -316,13 +319,87 @@ static void cgroup_idr_remove(struct idr *idr, int id)
spin_unlock_bh(&cgroup_idr_lock);
}

-static struct cgroup *cgroup_parent(struct cgroup *cgrp)
+static bool cgroup_has_tasks(struct cgroup *cgrp)
{
- struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+ return cgrp->nr_populated_csets;
+}

- if (parent_css)
- return container_of(parent_css, struct cgroup, self);
- return NULL;
+bool cgroup_is_threaded(struct cgroup *cgrp)
+{
+ return cgrp->dom_cgrp != cgrp;
+}
+
+/* can @cgrp host both domain and threaded children? */
+static bool cgroup_is_mixable(struct cgroup *cgrp)
+{
+ /*
+ * Root isn't under domain level resource control exempting it from
+ * the no-internal-process constraint, so it can serve as a thread
+ * root and a parent of resource domains at the same time.
+ */
+ return !cgroup_parent(cgrp);
+}
+
+/* can @cgrp become a thread root? should always be true for a thread root */
+static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
+{
+ /* mixables don't care */
+ if (cgroup_is_mixable(cgrp))
+ return true;
+
+ /* domain roots can't be nested under threaded */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* can only have either domain or threaded children */
+ if (cgrp->nr_populated_domain_children)
+ return false;
+
+ /* and no domain controllers can be enabled */
+ if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
+ return false;
+
+ return true;
+}
+
+/* is @cgrp root of a threaded subtree? */
+bool cgroup_is_thread_root(struct cgroup *cgrp)
+{
+ /* thread root should be a domain */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* a domain w/ threaded children is a thread root */
+ if (cgrp->nr_threaded_children)
+ return true;
+
+ /*
+ * A domain which has tasks and explicit threaded controllers
+ * enabled is a thread root.
+ */
+ if (cgroup_has_tasks(cgrp) &&
+ (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
+ return true;
+
+ return false;
+}
+
+/* a domain which isn't connected to the root w/o breakage can't be used */
+static bool cgroup_is_valid_domain(struct cgroup *cgrp)
+{
+ /* the cgroup itself can be a thread root */
+ if (cgroup_is_threaded(cgrp))
+ return false;
+
+ /* but the ancestors can't be unless mixable */
+ while ((cgrp = cgroup_parent(cgrp))) {
+ if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
+ return false;
+ if (cgroup_is_threaded(cgrp))
+ return false;
+ }
+
+ return true;
}

/* subsystems visibly enabled on a cgroup */
@@ -331,8 +408,14 @@ static u16 cgroup_control(struct cgroup *cgrp)
struct cgroup *parent = cgroup_parent(cgrp);
u16 root_ss_mask = cgrp->root->subsys_mask;

- if (parent)
- return parent->subtree_control;
+ if (parent) {
+ u16 ss_mask = parent->subtree_control;
+
+ /* threaded cgroups can only have threaded controllers */
+ if (cgroup_is_threaded(cgrp))
+ ss_mask &= cgrp_dfl_threaded_ss_mask;
+ return ss_mask;
+ }

if (cgroup_on_dfl(cgrp))
root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
@@ -345,8 +428,14 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);

- if (parent)
- return parent->subtree_ss_mask;
+ if (parent) {
+ u16 ss_mask = parent->subtree_ss_mask;
+
+ /* threaded cgroups can only have threaded controllers */
+ if (cgroup_is_threaded(cgrp))
+ ss_mask &= cgrp_dfl_threaded_ss_mask;
+ return ss_mask;
+ }

return cgrp->root->subsys_mask;
}
@@ -436,22 +525,12 @@ out_unlock:
return css;
}

-static void __maybe_unused cgroup_get(struct cgroup *cgrp)
-{
- css_get(&cgrp->self);
-}
-
static void cgroup_get_live(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
css_get(&cgrp->self);
}

-static bool cgroup_tryget(struct cgroup *cgrp)
-{
- return css_tryget(&cgrp->self);
-}
-
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
@@ -560,9 +639,11 @@ EXPORT_SYMBOL_GPL(of_css);
*/
struct css_set init_css_set = {
.refcount = REFCOUNT_INIT(1),
+ .dom_cset = &init_css_set,
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
+ .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
@@ -570,6 +651,11 @@ struct css_set init_css_set = {

static int css_set_count = 1; /* 1 for init_css_set */

+static bool css_set_threaded(struct css_set *cset)
+{
+ return cset->dom_cset != cset;
+}
+
/**
* css_set_populated - does a css_set contain any tasks?
* @cset: target css_set
@@ -587,39 +673,48 @@ static bool css_set_populated(struct css_set *cset)
}

/**
- * cgroup_update_populated - updated populated count of a cgroup
+ * cgroup_update_populated - update the populated count of a cgroup
* @cgrp: the target cgroup
* @populated: inc or dec populated count
*
* One of the css_sets associated with @cgrp is either getting its first
- * task or losing the last. Update @cgrp->populated_cnt accordingly. The
- * count is propagated towards root so that a given cgroup's populated_cnt
- * is zero iff the cgroup and all its descendants don't contain any tasks.
+ * task or losing the last. Update @cgrp->nr_populated_* accordingly. The
+ * count is propagated towards root so that a given cgroup's
+ * nr_populated_children is zero iff none of its descendants contain any
+ * tasks.
*
- * @cgrp's interface file "cgroup.populated" is zero if
- * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
- * changes from or to zero, userland is notified that the content of the
- * interface file has changed. This can be used to detect when @cgrp and
- * its descendants become populated or empty.
+ * @cgrp's interface file "cgroup.populated" is zero if both
+ * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
+ * 1 otherwise. When the sum changes from or to zero, userland is notified
+ * that the content of the interface file has changed. This can be used to
+ * detect when @cgrp and its descendants become populated or empty.
*/
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
+ struct cgroup *child = NULL;
+ int adj = populated ? 1 : -1;
+
lockdep_assert_held(&css_set_lock);

do {
- bool trigger;
+ bool was_populated = cgroup_is_populated(cgrp);

- if (populated)
- trigger = !cgrp->populated_cnt++;
- else
- trigger = !--cgrp->populated_cnt;
+ if (!child) {
+ cgrp->nr_populated_csets += adj;
+ } else {
+ if (cgroup_is_threaded(child))
+ cgrp->nr_populated_threaded_children += adj;
+ else
+ cgrp->nr_populated_domain_children += adj;
+ }

- if (!trigger)
+ if (was_populated == cgroup_is_populated(cgrp))
break;

cgroup1_check_for_release(cgrp);
cgroup_file_notify(&cgrp->events_file);

+ child = cgrp;
cgrp = cgroup_parent(cgrp);
} while (cgrp);
}
@@ -630,7 +725,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
* @populated: whether @cset is populated or depopulated
*
* @cset is either getting the first task or losing the last. Update the
- * ->populated_cnt of all associated cgroups accordingly.
+ * populated counters of all associated cgroups accordingly.
*/
static void css_set_update_populated(struct css_set *cset, bool populated)
{
@@ -653,7 +748,7 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
* css_set, @from_cset can be NULL. If @task is being disassociated
* instead of moved, @to_cset can be NULL.
*
- * This function automatically handles populated_cnt updates and
+ * This function automatically handles populated counter updates and
* css_task_iter adjustments but the caller is responsible for managing
* @from_cset and @to_cset's reference counts.
*/
@@ -737,6 +832,8 @@ void put_css_set_locked(struct css_set *cset)
if (!refcount_dec_and_test(&cset->refcount))
return;

+ WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
+
/* This css_set is dead. unlink it and release cgroup and css refs */
for_each_subsys(ss, ssid) {
list_del(&cset->e_cset_node[ssid]);
@@ -753,6 +850,11 @@ void put_css_set_locked(struct css_set *cset)
kfree(link);
}

+ if (css_set_threaded(cset)) {
+ list_del(&cset->threaded_csets_node);
+ put_css_set_locked(cset->dom_cset);
+ }
+
kfree_rcu(cset, rcu_head);
}
@@ -771,6 +873,7 @@ static bool compare_css_sets(struct css_set *cset,
struct cgroup *new_cgrp,
struct cgroup_subsys_state *template[])
{
+ struct cgroup *new_dfl_cgrp;
struct list_head *l1, *l2;

/*
@@ -781,6 +884,16 @@ static bool compare_css_sets(struct css_set *cset,
if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
return false;

+
+ /* @cset's domain should match the default cgroup's */
+ if (cgroup_on_dfl(new_cgrp))
+ new_dfl_cgrp = new_cgrp;
+ else
+ new_dfl_cgrp = old_cset->dfl_cgrp;
+
+ if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
+ return false;
+
/*
* Compare cgroup pointers in order to distinguish between
* different cgroups in hierarchies. As different cgroups may
@@ -988,9 +1101,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
}

refcount_set(&cset->refcount, 1);
+ cset->dom_cset = cset;
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
INIT_LIST_HEAD(&cset->task_iters);
+ INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
INIT_LIST_HEAD(&cset->cgrp_links);
INIT_LIST_HEAD(&cset->mg_preload_node);
@@ -1028,6 +1143,28 @@ static struct css_set *find_css_set(struct css_set *old_cset,

spin_unlock_irq(&css_set_lock);

+ /*
+ * If @cset should be threaded, look up the matching dom_cset and
+ * link them up. We first fully initialize @cset then look for the
+ * dom_cset. It's simpler this way and safe as @cset is guaranteed
+ * to stay empty until we return.
+ */
+ if (cgroup_is_threaded(cset->dfl_cgrp)) {
+ struct css_set *dcset;
+
+ dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
+ if (!dcset) {
+ put_css_set(cset);
+ return NULL;
+ }
+
+ spin_lock_irq(&css_set_lock);
+ cset->dom_cset = dcset;
+ list_add_tail(&cset->threaded_csets_node,
+ &dcset->threaded_csets);
+ spin_unlock_irq(&css_set_lock);
+ }
+
return cset;
}
@@ -1155,6 +1292,8 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,

if (cset == &init_css_set) {
res = &root->cgrp;
+ } else if (root == &cgrp_dfl_root) {
+ res = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
@@ -1670,6 +1809,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
mutex_init(&cgrp->pidlist_mutex);
cgrp->self.cgroup = cgrp;
cgrp->self.flags |= CSS_ONLINE;
+ cgrp->dom_cgrp = cgrp;
+ cgrp->max_descendants = INT_MAX;
+ cgrp->max_depth = INT_MAX;

for_each_subsys(ss, ssid)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
@@ -2172,17 +2314,40 @@ out_release_tset:
}

/**
- * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
+ * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
* @dst_cgrp: destination cgroup to test
*
- * On the default hierarchy, except for the root, subtree_control must be
- * zero for migration destination cgroups with tasks so that child cgroups
- * don't compete against tasks.
+ * On the default hierarchy, except for the mixable, (possible) thread root
+ * and threaded cgroups, subtree_control must be zero for migration
+ * destination cgroups with tasks so that child cgroups don't compete
+ * against tasks.
*/
-bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
{
- return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
- !dst_cgrp->subtree_control;
+ /* v1 doesn't have any restriction */
+ if (!cgroup_on_dfl(dst_cgrp))
+ return 0;
+
+ /* verify @dst_cgrp can host resources */
+ if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /* mixables don't care */
+ if (cgroup_is_mixable(dst_cgrp))
+ return 0;
+
+ /*
+ * If @dst_cgrp is already or can become a thread root or is
+ * threaded, it doesn't matter.
+ */
+ if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
+ return 0;
+
+ /* apply no-internal-process constraint */
+ if (dst_cgrp->subtree_control)
+ return -EBUSY;
+
+ return 0;
}

/**
@@ -2387,8 +2552,9 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
|
|
|
struct task_struct *task;
|
|
|
int ret;
|
|
|
|
|
|
- if (!cgroup_may_migrate_to(dst_cgrp))
|
|
|
- return -EBUSY;
|
|
|
+ ret = cgroup_migrate_vet_dst(dst_cgrp);
|
|
|
+ if (ret)
|
|
|
+ return ret;
|
|
|
|
|
|
/* look up all src csets */
|
|
|
spin_lock_irq(&css_set_lock);
|
|
@@ -2415,96 +2581,23 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
|
|
|
return ret;
|
|
|
}
|
|
|
|
|
|
-static int cgroup_procs_write_permission(struct task_struct *task,
|
|
|
- struct cgroup *dst_cgrp,
|
|
|
- struct kernfs_open_file *of)
|
|
|
-{
|
|
|
- struct super_block *sb = of->file->f_path.dentry->d_sb;
|
|
|
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
|
|
|
- struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
|
|
|
- struct cgroup *src_cgrp, *com_cgrp;
|
|
|
- struct inode *inode;
|
|
|
- int ret;
|
|
|
-
|
|
|
- if (!cgroup_on_dfl(dst_cgrp)) {
|
|
|
- const struct cred *cred = current_cred();
|
|
|
- const struct cred *tcred = get_task_cred(task);
|
|
|
-
|
|
|
- /*
|
|
|
- * even if we're attaching all tasks in the thread group,
|
|
|
- * we only need to check permissions on one of them.
|
|
|
- */
|
|
|
- if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
|
|
|
- uid_eq(cred->euid, tcred->uid) ||
|
|
|
- uid_eq(cred->euid, tcred->suid))
|
|
|
- ret = 0;
|
|
|
- else
|
|
|
- ret = -EACCES;
|
|
|
-
|
|
|
- put_cred(tcred);
|
|
|
- return ret;
|
|
|
- }
|
|
|
-
|
|
|
- /* find the source cgroup */
|
|
|
- spin_lock_irq(&css_set_lock);
|
|
|
- src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
|
|
|
- spin_unlock_irq(&css_set_lock);
|
|
|
-
|
|
|
- /* and the common ancestor */
|
|
|
- com_cgrp = src_cgrp;
|
|
|
- while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
|
|
|
- com_cgrp = cgroup_parent(com_cgrp);
|
|
|
-
|
|
|
- /* %current should be authorized to migrate to the common ancestor */
|
|
|
- inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
|
|
|
- if (!inode)
|
|
|
- return -ENOMEM;
|
|
|
-
|
|
|
- ret = inode_permission(inode, MAY_WRITE);
|
|
|
- iput(inode);
|
|
|
- if (ret)
|
|
|
- return ret;
|
|
|
-
|
|
|
- /*
|
|
|
- * If namespaces are delegation boundaries, %current must be able
|
|
|
- * to see both source and destination cgroups from its namespace.
|
|
|
- */
|
|
|
- if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
|
|
|
- (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
|
|
|
- !cgroup_is_descendant(dst_cgrp, root_cgrp)))
|
|
|
- return -ENOENT;
|
|
|
-
|
|
|
- return 0;
|
|
|
-}
|
|
|
-
|
|
|
-/*
|
|
|
- * Find the task_struct of the task to attach by vpid and pass it along to the
|
|
|
- * function to attach either it or all tasks in its threadgroup. Will lock
|
|
|
- * cgroup_mutex and threadgroup.
|
|
|
- */
|
|
|
-ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
|
|
|
- size_t nbytes, loff_t off, bool threadgroup)
|
|
|
+struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup)
|
|
|
+ __acquires(&cgroup_threadgroup_rwsem)
|
|
|
{
|
|
|
struct task_struct *tsk;
|
|
|
- struct cgroup_subsys *ss;
|
|
|
- struct cgroup *cgrp;
|
|
|
pid_t pid;
|
|
|
- int ssid, ret;
|
|
|
|
|
|
if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
|
|
|
- return -EINVAL;
|
|
|
-
|
|
|
- cgrp = cgroup_kn_lock_live(of->kn, false);
|
|
|
- if (!cgrp)
|
|
|
- return -ENODEV;
|
|
|
+ return ERR_PTR(-EINVAL);
|
|
|
|
|
|
percpu_down_write(&cgroup_threadgroup_rwsem);
|
|
|
+
|
|
|
rcu_read_lock();
|
|
|
if (pid) {
|
|
|
tsk = find_task_by_vpid(pid);
|
|
|
if (!tsk) {
|
|
|
- ret = -ESRCH;
|
|
|
- goto out_unlock_rcu;
|
|
|
+ tsk = ERR_PTR(-ESRCH);
|
|
|
+ goto out_unlock_threadgroup;
|
|
|
}
|
|
|
} else {
|
|
|
tsk = current;
|
|
@@ -2520,35 +2613,33 @@ ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
|
|
|
* cgroup with no rt_runtime allocated. Just say no.
|
|
|
*/
|
|
|
if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
|
|
|
- ret = -EINVAL;
|
|
|
- goto out_unlock_rcu;
|
|
|
+ tsk = ERR_PTR(-EINVAL);
|
|
|
+ goto out_unlock_threadgroup;
|
|
|
}
|
|
|
|
|
|
get_task_struct(tsk);
|
|
|
+ goto out_unlock_rcu;
|
|
|
+
|
|
|
+out_unlock_threadgroup:
|
|
|
+ percpu_up_write(&cgroup_threadgroup_rwsem);
|
|
|
+out_unlock_rcu:
|
|
|
rcu_read_unlock();
|
|
|
+ return tsk;
|
|
|
+}
|
|
|
|
|
|
- ret = cgroup_procs_write_permission(tsk, cgrp, of);
|
|
|
- if (!ret)
|
|
|
- ret = cgroup_attach_task(cgrp, tsk, threadgroup);
|
|
|
+void cgroup_procs_write_finish(struct task_struct *task)
|
|
|
+ __releases(&cgroup_threadgroup_rwsem)
|
|
|
+{
|
|
|
+ struct cgroup_subsys *ss;
|
|
|
+ int ssid;
|
|
|
|
|
|
- put_task_struct(tsk);
|
|
|
- goto out_unlock_threadgroup;
|
|
|
+ /* release reference from cgroup_procs_write_start() */
|
|
|
+ put_task_struct(task);
|
|
|
|
|
|
-out_unlock_rcu:
|
|
|
- rcu_read_unlock();
|
|
|
-out_unlock_threadgroup:
|
|
|
percpu_up_write(&cgroup_threadgroup_rwsem);
|
|
|
for_each_subsys(ss, ssid)
|
|
|
if (ss->post_attach)
|
|
|
ss->post_attach();
|
|
|
- cgroup_kn_unlock(of->kn);
|
|
|
- return ret ?: nbytes;
|
|
|
-}
|
|
|
-
|
|
|
-ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
|
|
|
- loff_t off)
|
|
|
-{
|
|
|
- return __cgroup_procs_write(of, buf, nbytes, off, true);
|
|
|
}
|
|
|
|
|
|
static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
|
|
@@ -2891,6 +2982,46 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
cgroup_apply_control_disable(cgrp);
}

+static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
+{
+ u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
+
+ /* if nothing is getting enabled, nothing to worry about */
+ if (!enable)
+ return 0;
+
+ /* can @cgrp host any resources? */
+ if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /* mixables don't care */
+ if (cgroup_is_mixable(cgrp))
+ return 0;
+
+ if (domain_enable) {
+ /* can't enable domain controllers inside a thread subtree */
+ if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+ return -EOPNOTSUPP;
+ } else {
+ /*
+ * Threaded controllers can handle internal competitions
+ * and are always allowed inside a (prospective) thread
+ * subtree.
+ */
+ if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
+ return 0;
+ }
+
+ /*
+ * Controllers can't be enabled for a cgroup with tasks to avoid
+ * child cgroups competing against tasks.
+ */
+ if (cgroup_has_tasks(cgrp))
+ return -EBUSY;
+
+ return 0;
+}
+
/* change the enabled child controllers for a cgroup in the default hierarchy */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
@@ -2966,33 +3097,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
|
|
|
goto out_unlock;
|
|
|
}
|
|
|
|
|
|
- /*
|
|
|
- * Except for the root, subtree_control must be zero for a cgroup
|
|
|
- * with tasks so that child cgroups don't compete against tasks.
|
|
|
- */
|
|
|
- if (enable && cgroup_parent(cgrp)) {
|
|
|
- struct cgrp_cset_link *link;
|
|
|
-
|
|
|
- /*
|
|
|
- * Because namespaces pin csets too, @cgrp->cset_links
|
|
|
- * might not be empty even when @cgrp is empty. Walk and
|
|
|
- * verify each cset.
|
|
|
- */
|
|
|
- spin_lock_irq(&css_set_lock);
|
|
|
-
|
|
|
- ret = 0;
|
|
|
- list_for_each_entry(link, &cgrp->cset_links, cset_link) {
|
|
|
- if (css_set_populated(link->cset)) {
|
|
|
- ret = -EBUSY;
|
|
|
- break;
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- spin_unlock_irq(&css_set_lock);
|
|
|
-
|
|
|
- if (ret)
|
|
|
- goto out_unlock;
|
|
|
- }
|
|
|
+ ret = cgroup_vet_subtree_control_enable(cgrp, enable);
|
|
|
+ if (ret)
|
|
|
+ goto out_unlock;
|
|
|
|
|
|
/* save and update control masks and prepare csses */
|
|
|
cgroup_save_control(cgrp);
|
|
@@ -3011,6 +3118,172 @@ out_unlock:
return ret ?: nbytes;
}

+/**
+ * cgroup_enable_threaded - make @cgrp threaded
+ * @cgrp: the target cgroup
+ *
+ * Called when "threaded" is written to the cgroup.type interface file and
+ * tries to make @cgrp threaded and join the parent's resource domain.
+ * This function is never called on the root cgroup as cgroup.type doesn't
+ * exist on it.
+ */
+static int cgroup_enable_threaded(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup *dom_cgrp = parent->dom_cgrp;
+ int ret;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* noop if already threaded */
+ if (cgroup_is_threaded(cgrp))
+ return 0;
+
+ /* we're joining the parent's domain, ensure its validity */
+ if (!cgroup_is_valid_domain(dom_cgrp) ||
+ !cgroup_can_be_thread_root(dom_cgrp))
+ return -EOPNOTSUPP;
+
+ /*
+ * The following shouldn't cause actual migrations and should
+ * always succeed.
+ */
+ cgroup_save_control(cgrp);
+
+ cgrp->dom_cgrp = dom_cgrp;
+ ret = cgroup_apply_control(cgrp);
+ if (!ret)
+ parent->nr_threaded_children++;
+ else
+ cgrp->dom_cgrp = cgrp;
+
+ cgroup_finalize_control(cgrp, ret);
+ return ret;
+}
+
+static int cgroup_type_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ if (cgroup_is_threaded(cgrp))
+ seq_puts(seq, "threaded\n");
+ else if (!cgroup_is_valid_domain(cgrp))
+ seq_puts(seq, "domain invalid\n");
+ else if (cgroup_is_thread_root(cgrp))
+ seq_puts(seq, "domain threaded\n");
+ else
+ seq_puts(seq, "domain\n");
+
+ return 0;
+}
+
+static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ int ret;
+
+ /* only switching to threaded mode is supported */
+ if (strcmp(strstrip(buf), "threaded"))
+ return -EINVAL;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ /* threaded can only be enabled */
+ ret = cgroup_enable_threaded(cgrp);
+
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+}
|
|
|
+
|
|
|
+static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
|
|
|
+{
|
|
|
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
|
|
|
+ int descendants = READ_ONCE(cgrp->max_descendants);
|
|
|
+
|
|
|
+ if (descendants == INT_MAX)
|
|
|
+ seq_puts(seq, "max\n");
|
|
|
+ else
|
|
|
+ seq_printf(seq, "%d\n", descendants);
|
|
|
+
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
+static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
|
|
|
+ char *buf, size_t nbytes, loff_t off)
|
|
|
+{
|
|
|
+ struct cgroup *cgrp;
|
|
|
+ int descendants;
|
|
|
+ ssize_t ret;
|
|
|
+
|
|
|
+ buf = strstrip(buf);
|
|
|
+ if (!strcmp(buf, "max")) {
|
|
|
+ descendants = INT_MAX;
|
|
|
+ } else {
|
|
|
+ ret = kstrtoint(buf, 0, &descendants);
|
|
|
+ if (ret)
|
|
|
+ return ret;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (descendants < 0)
|
|
|
+ return -ERANGE;
|
|
|
+
|
|
|
+ cgrp = cgroup_kn_lock_live(of->kn, false);
|
|
|
+ if (!cgrp)
|
|
|
+ return -ENOENT;
|
|
|
+
|
|
|
+ cgrp->max_descendants = descendants;
|
|
|
+
|
|
|
+ cgroup_kn_unlock(of->kn);
|
|
|
+
|
|
|
+ return nbytes;
|
|
|
+}
|
|
|
+
|
|
|
+static int cgroup_max_depth_show(struct seq_file *seq, void *v)
|
|
|
+{
|
|
|
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
|
|
|
+ int depth = READ_ONCE(cgrp->max_depth);
|
|
|
+
|
|
|
+ if (depth == INT_MAX)
|
|
|
+ seq_puts(seq, "max\n");
|
|
|
+ else
|
|
|
+ seq_printf(seq, "%d\n", depth);
|
|
|
+
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
+static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
|
|
|
+ char *buf, size_t nbytes, loff_t off)
|
|
|
+{
|
|
|
+ struct cgroup *cgrp;
|
|
|
+ ssize_t ret;
|
|
|
+ int depth;
|
|
|
+
|
|
|
+ buf = strstrip(buf);
|
|
|
+ if (!strcmp(buf, "max")) {
|
|
|
+ depth = INT_MAX;
|
|
|
+ } else {
|
|
|
+ ret = kstrtoint(buf, 0, &depth);
|
|
|
+ if (ret)
|
|
|
+ return ret;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (depth < 0)
|
|
|
+ return -ERANGE;
|
|
|
+
|
|
|
+ cgrp = cgroup_kn_lock_live(of->kn, false);
|
|
|
+ if (!cgrp)
|
|
|
+ return -ENOENT;
|
|
|
+
|
|
|
+ cgrp->max_depth = depth;
|
|
|
+
|
|
|
+ cgroup_kn_unlock(of->kn);
|
|
|
+
|
|
|
+ return nbytes;
|
|
|
+}
|
|
|
+
|
|
|
static int cgroup_events_show(struct seq_file *seq, void *v)
|
|
|
{
|
|
|
seq_printf(seq, "populated %d\n",
|
|
@@ -3018,6 +3291,18 @@ static int cgroup_events_show(struct seq_file *seq, void *v)
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
+static int cgroup_stat_show(struct seq_file *seq, void *v)
|
|
|
+{
|
|
|
+ struct cgroup *cgroup = seq_css(seq)->cgroup;
|
|
|
+
|
|
|
+ seq_printf(seq, "nr_descendants %d\n",
|
|
|
+ cgroup->nr_descendants);
|
|
|
+ seq_printf(seq, "nr_dying_descendants %d\n",
|
|
|
+ cgroup->nr_dying_descendants);
|
|
|
+
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
static int cgroup_file_open(struct kernfs_open_file *of)
|
|
|
{
|
|
|
struct cftype *cft = of->kn->priv;
|
|
@@ -3234,7 +3519,6 @@ restart:
|
|
|
|
|
|
static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
|
|
|
{
|
|
|
- LIST_HEAD(pending);
|
|
|
struct cgroup_subsys *ss = cfts[0].ss;
|
|
|
struct cgroup *root = &ss->root->cgrp;
|
|
|
struct cgroup_subsys_state *css;
|
|
@@ -3659,6 +3943,58 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
|
|
|
return ret;
|
|
|
}
|
|
|
|
|
|
+static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
|
|
|
+{
|
|
|
+ struct list_head *l;
|
|
|
+ struct cgrp_cset_link *link;
|
|
|
+ struct css_set *cset;
|
|
|
+
|
|
|
+ lockdep_assert_held(&css_set_lock);
|
|
|
+
|
|
|
+ /* find the next threaded cset */
|
|
|
+ if (it->tcset_pos) {
|
|
|
+ l = it->tcset_pos->next;
|
|
|
+
|
|
|
+ if (l != it->tcset_head) {
|
|
|
+ it->tcset_pos = l;
|
|
|
+ return container_of(l, struct css_set,
|
|
|
+ threaded_csets_node);
|
|
|
+ }
|
|
|
+
|
|
|
+ it->tcset_pos = NULL;
|
|
|
+ }
|
|
|
+
|
|
|
+ /* find the next cset */
|
|
|
+ l = it->cset_pos;
|
|
|
+ l = l->next;
|
|
|
+ if (l == it->cset_head) {
|
|
|
+ it->cset_pos = NULL;
|
|
|
+ return NULL;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (it->ss) {
|
|
|
+ cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
|
|
|
+ } else {
|
|
|
+ link = list_entry(l, struct cgrp_cset_link, cset_link);
|
|
|
+ cset = link->cset;
|
|
|
+ }
|
|
|
+
|
|
|
+ it->cset_pos = l;
|
|
|
+
|
|
|
+ /* initialize threaded css_set walking */
|
|
|
+ if (it->flags & CSS_TASK_ITER_THREADED) {
|
|
|
+ if (it->cur_dcset)
|
|
|
+ put_css_set_locked(it->cur_dcset);
|
|
|
+ it->cur_dcset = cset;
|
|
|
+ get_css_set(cset);
|
|
|
+
|
|
|
+ it->tcset_head = &cset->threaded_csets;
|
|
|
+ it->tcset_pos = &cset->threaded_csets;
|
|
|
+ }
|
|
|
+
|
|
|
+ return cset;
|
|
|
+}
|
|
|
+
|
|
|
/**
|
|
|
* css_task_iter_advance_css_set - advance a task itererator to the next css_set
|
|
|
* @it: the iterator to advance
|
|
@@ -3667,32 +4003,19 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
|
|
|
*/
|
|
|
static void css_task_iter_advance_css_set(struct css_task_iter *it)
|
|
|
{
|
|
|
- struct list_head *l = it->cset_pos;
|
|
|
- struct cgrp_cset_link *link;
|
|
|
struct css_set *cset;
|
|
|
|
|
|
lockdep_assert_held(&css_set_lock);
|
|
|
|
|
|
/* Advance to the next non-empty css_set */
|
|
|
do {
|
|
|
- l = l->next;
|
|
|
- if (l == it->cset_head) {
|
|
|
- it->cset_pos = NULL;
|
|
|
+ cset = css_task_iter_next_css_set(it);
|
|
|
+ if (!cset) {
|
|
|
it->task_pos = NULL;
|
|
|
return;
|
|
|
}
|
|
|
-
|
|
|
- if (it->ss) {
|
|
|
- cset = container_of(l, struct css_set,
|
|
|
- e_cset_node[it->ss->id]);
|
|
|
- } else {
|
|
|
- link = list_entry(l, struct cgrp_cset_link, cset_link);
|
|
|
- cset = link->cset;
|
|
|
- }
|
|
|
} while (!css_set_populated(cset));
|
|
|
|
|
|
- it->cset_pos = l;
|
|
|
-
|
|
|
if (!list_empty(&cset->tasks))
|
|
|
it->task_pos = cset->tasks.next;
|
|
|
else
|
|
@@ -3732,6 +4055,7 @@ static void css_task_iter_advance(struct css_task_iter *it)
|
|
|
lockdep_assert_held(&css_set_lock);
|
|
|
WARN_ON_ONCE(!l);
|
|
|
|
|
|
+repeat:
|
|
|
/*
|
|
|
* Advance iterator to find next entry. cset->tasks is consumed
|
|
|
* first and then ->mg_tasks. After ->mg_tasks, we move onto the
|
|
@@ -3746,11 +4070,18 @@ static void css_task_iter_advance(struct css_task_iter *it)
|
|
|
css_task_iter_advance_css_set(it);
|
|
|
else
|
|
|
it->task_pos = l;
|
|
|
+
|
|
|
+ /* if PROCS, skip over tasks which aren't group leaders */
|
|
|
+ if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
|
|
|
+ !thread_group_leader(list_entry(it->task_pos, struct task_struct,
|
|
|
+ cg_list)))
|
|
|
+ goto repeat;
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
* css_task_iter_start - initiate task iteration
|
|
|
* @css: the css to walk tasks of
|
|
|
+ * @flags: CSS_TASK_ITER_* flags
|
|
|
* @it: the task iterator to use
|
|
|
*
|
|
|
* Initiate iteration through the tasks of @css. The caller can call
|
|
@@ -3758,7 +4089,7 @@ static void css_task_iter_advance(struct css_task_iter *it)
|
|
|
* returns NULL. On completion of iteration, css_task_iter_end() must be
|
|
|
* called.
|
|
|
*/
|
|
|
-void css_task_iter_start(struct cgroup_subsys_state *css,
|
|
|
+void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
|
|
|
struct css_task_iter *it)
|
|
|
{
|
|
|
/* no one should try to iterate before mounting cgroups */
|
|
@@ -3769,6 +4100,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
|
|
|
spin_lock_irq(&css_set_lock);
|
|
|
|
|
|
it->ss = css->ss;
|
|
|
+ it->flags = flags;
|
|
|
|
|
|
if (it->ss)
|
|
|
it->cset_pos = &css->cgroup->e_csets[css->ss->id];
|
|
@@ -3826,6 +4158,9 @@ void css_task_iter_end(struct css_task_iter *it)
|
|
|
spin_unlock_irq(&css_set_lock);
|
|
|
}
|
|
|
|
|
|
+ if (it->cur_dcset)
|
|
|
+ put_css_set(it->cur_dcset);
|
|
|
+
|
|
|
if (it->cur_task)
|
|
|
put_task_struct(it->cur_task);
|
|
|
}
|
|
@@ -3842,16 +4177,12 @@ static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
|
|
|
{
|
|
|
struct kernfs_open_file *of = s->private;
|
|
|
struct css_task_iter *it = of->priv;
|
|
|
- struct task_struct *task;
|
|
|
|
|
|
- do {
|
|
|
- task = css_task_iter_next(it);
|
|
|
- } while (task && !thread_group_leader(task));
|
|
|
-
|
|
|
- return task;
|
|
|
+ return css_task_iter_next(it);
|
|
|
}
|
|
|
|
|
|
-static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
|
|
|
+static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
|
|
|
+ unsigned int iter_flags)
|
|
|
{
|
|
|
struct kernfs_open_file *of = s->private;
|
|
|
struct cgroup *cgrp = seq_css(s)->cgroup;
|
|
@@ -3869,23 +4200,168 @@ static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
|
|
|
if (!it)
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
of->priv = it;
|
|
|
- css_task_iter_start(&cgrp->self, it);
|
|
|
+ css_task_iter_start(&cgrp->self, iter_flags, it);
|
|
|
} else if (!(*pos)++) {
|
|
|
css_task_iter_end(it);
|
|
|
- css_task_iter_start(&cgrp->self, it);
|
|
|
+ css_task_iter_start(&cgrp->self, iter_flags, it);
|
|
|
}
|
|
|
|
|
|
return cgroup_procs_next(s, NULL, NULL);
|
|
|
}
|
|
|
|
|
|
+static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
|
|
|
+{
|
|
|
+ struct cgroup *cgrp = seq_css(s)->cgroup;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * All processes of a threaded subtree belong to the domain cgroup
|
|
|
+ * of the subtree. Only threads can be distributed across the
|
|
|
+ * subtree. Reject reads on cgroup.procs in the subtree proper.
|
|
|
+ * They're always empty anyway.
|
|
|
+ */
|
|
|
+ if (cgroup_is_threaded(cgrp))
|
|
|
+ return ERR_PTR(-EOPNOTSUPP);
|
|
|
+
|
|
|
+ return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
|
|
|
+ CSS_TASK_ITER_THREADED);
|
|
|
+}
|
|
|
+
|
|
|
static int cgroup_procs_show(struct seq_file *s, void *v)
|
|
|
{
|
|
|
- seq_printf(s, "%d\n", task_tgid_vnr(v));
|
|
|
+ seq_printf(s, "%d\n", task_pid_vnr(v));
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
+static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
|
|
|
+ struct cgroup *dst_cgrp,
|
|
|
+ struct super_block *sb)
|
|
|
+{
|
|
|
+ struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
|
|
|
+ struct cgroup *com_cgrp = src_cgrp;
|
|
|
+ struct inode *inode;
|
|
|
+ int ret;
|
|
|
+
|
|
|
+ lockdep_assert_held(&cgroup_mutex);
|
|
|
+
|
|
|
+ /* find the common ancestor */
|
|
|
+ while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
|
|
|
+ com_cgrp = cgroup_parent(com_cgrp);
|
|
|
+
|
|
|
+ /* %current should be authorized to migrate to the common ancestor */
|
|
|
+ inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
|
|
|
+ if (!inode)
|
|
|
+ return -ENOMEM;
|
|
|
+
|
|
|
+ ret = inode_permission(inode, MAY_WRITE);
|
|
|
+ iput(inode);
|
|
|
+ if (ret)
|
|
|
+ return ret;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * If namespaces are delegation boundaries, %current must be able
|
|
|
+ * to see both source and destination cgroups from its namespace.
|
|
|
+ */
|
|
|
+ if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
|
|
|
+ (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
|
|
|
+ !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
|
|
|
+ return -ENOENT;
|
|
|
+
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
|
|
|
+ char *buf, size_t nbytes, loff_t off)
|
|
|
+{
|
|
|
+ struct cgroup *src_cgrp, *dst_cgrp;
|
|
|
+ struct task_struct *task;
|
|
|
+ ssize_t ret;
|
|
|
+
|
|
|
+ dst_cgrp = cgroup_kn_lock_live(of->kn, false);
|
|
|
+ if (!dst_cgrp)
|
|
|
+ return -ENODEV;
|
|
|
+
|
|
|
+ task = cgroup_procs_write_start(buf, true);
|
|
|
+ ret = PTR_ERR_OR_ZERO(task);
|
|
|
+ if (ret)
|
|
|
+ goto out_unlock;
|
|
|
+
|
|
|
+ /* find the source cgroup */
|
|
|
+ spin_lock_irq(&css_set_lock);
|
|
|
+ src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
|
|
|
+ spin_unlock_irq(&css_set_lock);
|
|
|
+
|
|
|
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
|
|
|
+ of->file->f_path.dentry->d_sb);
|
|
|
+ if (ret)
|
|
|
+ goto out_finish;
|
|
|
+
|
|
|
+ ret = cgroup_attach_task(dst_cgrp, task, true);
|
|
|
+
|
|
|
+out_finish:
|
|
|
+ cgroup_procs_write_finish(task);
|
|
|
+out_unlock:
|
|
|
+ cgroup_kn_unlock(of->kn);
|
|
|
+
|
|
|
+ return ret ?: nbytes;
|
|
|
+}
|
|
|
+
|
|
|
+static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
|
|
|
+{
|
|
|
+ return __cgroup_procs_start(s, pos, 0);
|
|
|
+}
|
|
|
+
|
|
|
+static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
|
|
|
+ char *buf, size_t nbytes, loff_t off)
|
|
|
+{
|
|
|
+ struct cgroup *src_cgrp, *dst_cgrp;
|
|
|
+ struct task_struct *task;
|
|
|
+ ssize_t ret;
|
|
|
+
|
|
|
+ buf = strstrip(buf);
|
|
|
+
|
|
|
+ dst_cgrp = cgroup_kn_lock_live(of->kn, false);
|
|
|
+ if (!dst_cgrp)
|
|
|
+ return -ENODEV;
|
|
|
+
|
|
|
+ task = cgroup_procs_write_start(buf, false);
|
|
|
+ ret = PTR_ERR_OR_ZERO(task);
|
|
|
+ if (ret)
|
|
|
+ goto out_unlock;
|
|
|
+
|
|
|
+ /* find the source cgroup */
|
|
|
+ spin_lock_irq(&css_set_lock);
|
|
|
+ src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
|
|
|
+ spin_unlock_irq(&css_set_lock);
|
|
|
+
|
|
|
+ /* thread migrations follow the cgroup.procs delegation rule */
|
|
|
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
|
|
|
+ of->file->f_path.dentry->d_sb);
|
|
|
+ if (ret)
|
|
|
+ goto out_finish;
|
|
|
+
|
|
|
+ /* and must be contained in the same domain */
|
|
|
+ ret = -EOPNOTSUPP;
|
|
|
+ if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
|
|
|
+ goto out_finish;
|
|
|
+
|
|
|
+ ret = cgroup_attach_task(dst_cgrp, task, false);
|
|
|
+
|
|
|
+out_finish:
|
|
|
+ cgroup_procs_write_finish(task);
|
|
|
+out_unlock:
|
|
|
+ cgroup_kn_unlock(of->kn);
|
|
|
+
|
|
|
+ return ret ?: nbytes;
|
|
|
+}
|
|
|
+
|
|
|
/* cgroup core interface files for the default hierarchy */
|
|
|
static struct cftype cgroup_base_files[] = {
|
|
|
+ {
|
|
|
+ .name = "cgroup.type",
|
|
|
+ .flags = CFTYPE_NOT_ON_ROOT,
|
|
|
+ .seq_show = cgroup_type_show,
|
|
|
+ .write = cgroup_type_write,
|
|
|
+ },
|
|
|
{
|
|
|
.name = "cgroup.procs",
|
|
|
.flags = CFTYPE_NS_DELEGATABLE,
|
|
@@ -3896,6 +4372,14 @@ static struct cftype cgroup_base_files[] = {
|
|
|
.seq_show = cgroup_procs_show,
|
|
|
.write = cgroup_procs_write,
|
|
|
},
|
|
|
+ {
|
|
|
+ .name = "cgroup.threads",
|
|
|
+ .release = cgroup_procs_release,
|
|
|
+ .seq_start = cgroup_threads_start,
|
|
|
+ .seq_next = cgroup_procs_next,
|
|
|
+ .seq_show = cgroup_procs_show,
|
|
|
+ .write = cgroup_threads_write,
|
|
|
+ },
|
|
|
{
|
|
|
.name = "cgroup.controllers",
|
|
|
.seq_show = cgroup_controllers_show,
|
|
@@ -3912,6 +4396,20 @@ static struct cftype cgroup_base_files[] = {
|
|
|
.file_offset = offsetof(struct cgroup, events_file),
|
|
|
.seq_show = cgroup_events_show,
|
|
|
},
|
|
|
+ {
|
|
|
+ .name = "cgroup.max.descendants",
|
|
|
+ .seq_show = cgroup_max_descendants_show,
|
|
|
+ .write = cgroup_max_descendants_write,
|
|
|
+ },
|
|
|
+ {
|
|
|
+ .name = "cgroup.max.depth",
|
|
|
+ .seq_show = cgroup_max_depth_show,
|
|
|
+ .write = cgroup_max_depth_write,
|
|
|
+ },
|
|
|
+ {
|
|
|
+ .name = "cgroup.stat",
|
|
|
+ .seq_show = cgroup_stat_show,
|
|
|
+ },
|
|
|
{ } /* terminate */
|
|
|
};
|
|
|
|
|
@@ -4011,9 +4509,15 @@ static void css_release_work_fn(struct work_struct *work)
|
|
|
if (ss->css_released)
|
|
|
ss->css_released(css);
|
|
|
} else {
|
|
|
+ struct cgroup *tcgrp;
|
|
|
+
|
|
|
/* cgroup release path */
|
|
|
trace_cgroup_release(cgrp);
|
|
|
|
|
|
+ for (tcgrp = cgroup_parent(cgrp); tcgrp;
|
|
|
+ tcgrp = cgroup_parent(tcgrp))
|
|
|
+ tcgrp->nr_dying_descendants--;
|
|
|
+
|
|
|
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
|
|
|
cgrp->id = -1;
|
|
|
|
|
@@ -4209,9 +4713,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
|
|
|
cgrp->root = root;
|
|
|
cgrp->level = level;
|
|
|
|
|
|
- for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
|
|
|
+ for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
|
|
|
cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
|
|
|
|
|
|
+ if (tcgrp != cgrp)
|
|
|
+ tcgrp->nr_descendants++;
|
|
|
+ }
|
|
|
+
|
|
|
if (notify_on_release(parent))
|
|
|
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
|
|
|
|
|
@@ -4252,6 +4760,29 @@ out_free_cgrp:
|
|
|
return ERR_PTR(ret);
|
|
|
}
|
|
|
|
|
|
+static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
|
|
|
+{
|
|
|
+ struct cgroup *cgroup;
|
|
|
+ int ret = false;
|
|
|
+ int level = 1;
|
|
|
+
|
|
|
+ lockdep_assert_held(&cgroup_mutex);
|
|
|
+
|
|
|
+ for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
|
|
|
+ if (cgroup->nr_descendants >= cgroup->max_descendants)
|
|
|
+ goto fail;
|
|
|
+
|
|
|
+ if (level > cgroup->max_depth)
|
|
|
+ goto fail;
|
|
|
+
|
|
|
+ level++;
|
|
|
+ }
|
|
|
+
|
|
|
+ ret = true;
|
|
|
+fail:
|
|
|
+ return ret;
|
|
|
+}
|
|
|
+
|
|
|
int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
|
|
|
{
|
|
|
struct cgroup *parent, *cgrp;
|
|
@@ -4266,6 +4797,11 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
|
|
|
if (!parent)
|
|
|
return -ENODEV;
|
|
|
|
|
|
+ if (!cgroup_check_hierarchy_limits(parent)) {
|
|
|
+ ret = -EAGAIN;
|
|
|
+ goto out_unlock;
|
|
|
+ }
|
|
|
+
|
|
|
cgrp = cgroup_create(parent);
|
|
|
if (IS_ERR(cgrp)) {
|
|
|
ret = PTR_ERR(cgrp);
|
|
@@ -4417,6 +4953,7 @@ static void kill_css(struct cgroup_subsys_state *css)
static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
+ struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *css;
struct cgrp_cset_link *link;
int ssid;
@@ -4461,7 +4998,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
*/
kernfs_remove(cgrp->kn);

- cgroup1_check_for_release(cgroup_parent(cgrp));
+ if (parent && cgroup_is_threaded(cgrp))
+ parent->nr_threaded_children--;
+
+ for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
+ tcgrp->nr_descendants--;
+ tcgrp->nr_dying_descendants++;
+ }
+
+ cgroup1_check_for_release(parent);

/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
@@ -4656,11 +5201,17 @@ int __init cgroup_init(void)

cgrp_dfl_root.subsys_mask |= 1 << ss->id;

+ /* implicit controllers must be threaded too */
+ WARN_ON(ss->implicit_on_dfl && !ss->threaded);
+
if (ss->implicit_on_dfl)
cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
else if (!ss->dfl_cftypes)
cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;

+ if (ss->threaded)
+ cgrp_dfl_threaded_ss_mask |= 1 << ss->id;
+
if (ss->dfl_cftypes == ss->legacy_cftypes) {
WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
} else {