@@ -76,8 +76,34 @@ struct cpuset {
struct cgroup_subsys_state css;

unsigned long flags; /* "unsigned long" so bitops work */
- cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
- nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
+
+ /*
+ * On default hierarchy:
+ *
+ * The user-configured masks can only be changed by writing to
+ * cpuset.cpus and cpuset.mems, and won't be limited by the
+ * parent masks.
+ *
+ * The effective masks are the real masks that apply to the tasks
+ * in the cpuset. They may be changed if the configured masks are
+ * changed or hotplug happens.
+ *
+ * effective_mask == configured_mask & parent's effective_mask,
+ * and if it ends up empty, it will inherit the parent's mask.
+ *
+ *
+ * On legacy hierarchy:
+ *
+ * The user-configured masks are always the same as the effective masks.
+ */
+
+ /* user-configured CPUs and Memory Nodes allowed to tasks */
+ cpumask_var_t cpus_allowed;
+ nodemask_t mems_allowed;
+
+ /* effective CPUs and Memory Nodes allowed to tasks */
+ cpumask_var_t effective_cpus;
+ nodemask_t effective_mems;

/*
* This is old Memory Nodes tasks took on.
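
For illustration only (not part of the patch), the mask rule described in the new comment can be sketched in plain C. The helper name and the use of a flat unsigned long in place of cpumask_var_t are assumptions made for the example:

#include <assert.h>
#include <stdio.h>

/* Illustrative only: model a cpumask as a plain bitmask of CPUs 0..63. */
typedef unsigned long mask_t;

/*
 * effective = configured & parent_effective; if that intersection is empty,
 * fall back to the parent's effective mask, mirroring the comment above.
 */
static mask_t effective_mask(mask_t configured, mask_t parent_effective)
{
        mask_t eff = configured & parent_effective;

        return eff ? eff : parent_effective;
}

int main(void)
{
        mask_t parent_eff = 0x0f;       /* CPUs 0-3 effective in the parent */

        /* Child configured to CPUs 2-5: only 2-3 are effective. */
        assert(effective_mask(0x3c, parent_eff) == 0x0c);

        /* Child configured to CPUs 6-7: empty intersection, inherit parent. */
        assert(effective_mask(0xc0, parent_eff) == 0x0f);

        printf("effective mask rule holds\n");
        return 0;
}
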
@@ -307,9 +333,9 @@ static struct file_system_type cpuset_fs_type = {
*/
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
- while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
+ while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
cs = parent_cs(cs);
- cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
+ cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}

/*
@@ -325,9 +351,9 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
- while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
+ while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
cs = parent_cs(cs);
- nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
+ nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}

/*
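
As an aside, the ancestor walk in guarantee_online_cpus()/guarantee_online_mems() can be modelled with a small standalone sketch. The node table below is hypothetical; termination relies on the root's effective mask tracking the online set, as top_cpuset's does:

#include <assert.h>
#include <stdio.h>

/*
 * Illustrative only: each node's effective mask is a plain bitmask;
 * parent[0] == 0 marks the root, whose effective mask is assumed to
 * intersect the online set (as top_cpuset's effective_cpus does).
 */
#define NNODES 4

static const int parent[NNODES]              = { 0, 0, 1, 2 };
static const unsigned long effective[NNODES] = { 0x0f, 0x0c, 0x30, 0x00 };

static unsigned long online_subset(int node, unsigned long online)
{
        /* Walk up until some ancestor's effective mask intersects online. */
        while (!(effective[node] & online))
                node = parent[node];
        return effective[node] & online;
}

int main(void)
{
        unsigned long online = 0x0f;    /* CPUs 0-3 online */

        assert(online_subset(1, online) == 0x0c); /* node 1 intersects directly */
        assert(online_subset(3, online) == 0x0c); /* falls back via node 2 to node 1 */

        printf("ancestor walk ok\n");
        return 0;
}
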
@@ -376,13 +402,20 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
if (!trial)
return NULL;

- if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
- kfree(trial);
- return NULL;
- }
- cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+ if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
+ goto free_cs;
+ if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
+ goto free_cpus;
+
+ cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+ cpumask_copy(trial->effective_cpus, cs->effective_cpus);
return trial;
+
+free_cpus:
+ free_cpumask_var(trial->cpus_allowed);
+free_cs:
+ kfree(trial);
+ return NULL;
}

/**
@@ -391,6 +424,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
*/
static void free_trial_cpuset(struct cpuset *trial)
{
+ free_cpumask_var(trial->effective_cpus);
free_cpumask_var(trial->cpus_allowed);
kfree(trial);
}
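
The goto-based unwinding introduced above (and reused later in cpuset_css_alloc()) follows the usual kernel error-path idiom. A standalone sketch of the same idiom, modelled with malloc()/free() and hypothetical names so it compiles on its own:

#include <stdlib.h>
#include <string.h>

/* Illustrative only: two allocations, unwound in reverse order on failure. */
struct trial {
        unsigned long *cpus;
        unsigned long *effective_cpus;
};

static struct trial *alloc_trial(const struct trial *src, size_t words)
{
        struct trial *t = malloc(sizeof(*t));

        if (!t)
                return NULL;

        t->cpus = malloc(words * sizeof(*t->cpus));
        if (!t->cpus)
                goto free_t;
        t->effective_cpus = malloc(words * sizeof(*t->effective_cpus));
        if (!t->effective_cpus)
                goto free_cpus;

        memcpy(t->cpus, src->cpus, words * sizeof(*t->cpus));
        memcpy(t->effective_cpus, src->effective_cpus, words * sizeof(*t->effective_cpus));
        return t;

free_cpus:
        free(t->cpus);
free_t:
        free(t);
        return NULL;
}

int main(void)
{
        unsigned long c = 0xff, e = 0x0f;
        struct trial src = { &c, &e };
        struct trial *t = alloc_trial(&src, 1);

        if (t) {
                free(t->effective_cpus);
                free(t->cpus);
                free(t);
        }
        return 0;
}
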
@@ -436,9 +470,9 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)

par = parent_cs(cur);

- /* We must be a subset of our parent cpuset */
+ /* On legacy hierarchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
- if (!is_cpuset_subset(trial, par))
+ if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
goto out;

/*
@@ -480,11 +514,11 @@ out:
#ifdef CONFIG_SMP
/*
* Helper routine for generate_sched_domains().
- * Do cpusets a, b have overlapping cpus_allowed masks?
+ * Do cpusets a, b have overlapping effective cpus_allowed masks?
*/
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
- return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
+ return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}

static void
@@ -601,7 +635,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
*dattr = SD_ATTR_INIT;
update_domain_attr_tree(dattr, &top_cpuset);
}
- cpumask_copy(doms[0], top_cpuset.cpus_allowed);
+ cpumask_copy(doms[0], top_cpuset.effective_cpus);

goto done;
}
@@ -705,7 +739,7 @@ restart:
struct cpuset *b = csa[j];

if (apn == b->pn) {
- cpumask_or(dp, dp, b->cpus_allowed);
+ cpumask_or(dp, dp, b->effective_cpus);
if (dattr)
update_domain_attr_tree(dattr + nslot, b);

@@ -757,7 +791,7 @@ static void rebuild_sched_domains_locked(void)
* passing doms with offlined cpu to partition_sched_domains().
* Anyways, hotplug work item will rebuild sched domains.
*/
- if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
+ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
goto out;

/* Generate domain masks and attrs */
@@ -781,45 +815,6 @@ void rebuild_sched_domains(void)
mutex_unlock(&cpuset_mutex);
}

-/*
- * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
- * @cs: the cpuset in interest
- *
- * A cpuset's effective cpumask is the cpumask of the nearest ancestor
- * with non-empty cpus. We use effective cpumask whenever:
- * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
- * if the cpuset they reside in has no cpus)
- * - we want to retrieve task_cs(tsk)'s cpus_allowed.
- *
- * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
- * exception. See comments there.
- */
-static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
-{
- while (cpumask_empty(cs->cpus_allowed))
- cs = parent_cs(cs);
- return cs;
-}
-
-/*
- * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
- * @cs: the cpuset in interest
- *
- * A cpuset's effective nodemask is the nodemask of the nearest ancestor
- * with non-empty memss. We use effective nodemask whenever:
- * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
- * if the cpuset they reside in has no mems)
- * - we want to retrieve task_cs(tsk)'s mems_allowed.
- *
- * Called with cpuset_mutex held.
- */
-static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
-{
- while (nodes_empty(cs->mems_allowed))
- cs = parent_cs(cs);
- return cs;
-}
-
/**
* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -830,53 +825,80 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
*/
static void update_tasks_cpumask(struct cpuset *cs)
{
- struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
struct css_task_iter it;
struct task_struct *task;

css_task_iter_start(&cs->css, &it);
while ((task = css_task_iter_next(&it)))
- set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
+ set_cpus_allowed_ptr(task, cs->effective_cpus);
css_task_iter_end(&it);
}

/*
- * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
- * @root_cs: the root cpuset of the hierarchy
- * @update_root: update root cpuset or not?
+ * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_cpus: temp variable for calculating new effective_cpus
+ *
+ * When the configured cpumask is changed, the effective cpumasks of this cpuset
+ * and all its descendants need to be updated.
*
- * This will update cpumasks of tasks in @root_cs and all other empty cpusets
- * which take on cpumask of @root_cs.
+ * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
*
* Called with cpuset_mutex held
*/
-static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
+static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
+ bool need_rebuild_sched_domains = false;

rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- if (cp == root_cs) {
- if (!update_root)
- continue;
- } else {
- /* skip the whole subtree if @cp have some CPU */
- if (!cpumask_empty(cp->cpus_allowed)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ struct cpuset *parent = parent_cs(cp);
+
+ cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
+
+ /*
+ * If it becomes empty, inherit the effective mask of the
+ * parent, which is guaranteed to have some CPUs.
+ */
+ if (cpumask_empty(new_cpus))
+ cpumask_copy(new_cpus, parent->effective_cpus);
+
+ /* Skip the whole subtree if the cpumask remains the same. */
+ if (cpumask_equal(new_cpus, cp->effective_cpus)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
}
+
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();

+ mutex_lock(&callback_mutex);
+ cpumask_copy(cp->effective_cpus, new_cpus);
+ mutex_unlock(&callback_mutex);
+
+ WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+ !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
+
update_tasks_cpumask(cp);

+ /*
+ * If the effective cpumask of any non-empty cpuset is changed,
+ * we need to rebuild sched domains.
+ */
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ is_sched_load_balance(cp))
+ need_rebuild_sched_domains = true;
+
rcu_read_lock();
css_put(&cp->css);
}
rcu_read_unlock();
+
+ if (need_rebuild_sched_domains)
+ rebuild_sched_domains_locked();
}

/**
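
The new update_cpumasks_hier() propagates effective masks top-down and prunes any subtree whose effective mask does not change. A simplified, self-contained model of that propagation (recursive rather than css-iterator based; masks as plain bitmasks; all names hypothetical):

#include <stdio.h>

/*
 * Illustrative only: effective = configured & parent's effective, falling
 * back to the parent's mask when empty; an unchanged node prunes its subtree,
 * which stands in for css_rightmost_descendant() in the kernel walk.
 */
#define NNODES 4

static unsigned long configured[NNODES] = { 0xff, 0x3c, 0x30, 0xc0 };
static unsigned long effective[NNODES]  = { 0xff, 0x3c, 0x30, 0x3c };
static const int first_child[NNODES]    = { 1, 2, -1, -1 };
static const int next_sibling[NNODES]   = { -1, -1, 3, -1 };

static void propagate(int node, unsigned long parent_effective)
{
        unsigned long new_eff = configured[node] & parent_effective;
        int child;

        if (!new_eff)                           /* empty: inherit the parent's mask */
                new_eff = parent_effective;

        if (new_eff == effective[node])         /* unchanged: prune this subtree */
                return;

        effective[node] = new_eff;
        for (child = first_child[node]; child != -1; child = next_sibling[child])
                propagate(child, new_eff);
}

int main(void)
{
        int i;

        /* Pretend the root's configuration shrank to CPUs 0-3. */
        configured[0] = 0x0f;
        propagate(0, 0x0f);

        for (i = 0; i < NNODES; i++)
                printf("node %d: effective %#lx\n", i, effective[i]);
        return 0;
}
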
@@ -889,7 +911,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
int retval;
- int is_load_balanced;

/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
@@ -908,7 +929,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;

- if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
+ if (!cpumask_subset(trialcs->cpus_allowed,
+ top_cpuset.cpus_allowed))
return -EINVAL;
}

@@ -920,16 +942,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;

- is_load_balanced = is_sched_load_balance(trialcs);
-
mutex_lock(&callback_mutex);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
mutex_unlock(&callback_mutex);

- update_tasks_cpumask_hier(cs, true);
-
- if (is_load_balanced)
- rebuild_sched_domains_locked();
+ /* use trialcs->cpus_allowed as a temp variable */
+ update_cpumasks_hier(cs, trialcs->cpus_allowed);
return 0;
}

@@ -951,15 +969,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to)
{
struct task_struct *tsk = current;
- struct cpuset *mems_cs;

tsk->mems_allowed = *to;

do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);

rcu_read_lock();
- mems_cs = effective_nodemask_cpuset(task_cs(tsk));
- guarantee_online_mems(mems_cs, &tsk->mems_allowed);
+ guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
rcu_read_unlock();
}

@@ -1028,13 +1044,12 @@ static void *cpuset_being_rebound;
static void update_tasks_nodemask(struct cpuset *cs)
{
static nodemask_t newmems; /* protected by cpuset_mutex */
- struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
struct css_task_iter it;
struct task_struct *task;

cpuset_being_rebound = cs; /* causes mpol_dup() rebind */

- guarantee_online_mems(mems_cs, &newmems);
+ guarantee_online_mems(cs, &newmems);

/*
* The mpol_rebind_mm() call takes mmap_sem, which we couldn't
@@ -1077,36 +1092,52 @@ static void update_tasks_nodemask(struct cpuset *cs)
}

/*
- * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
- * @cs: the root cpuset of the hierarchy
- * @update_root: update the root cpuset or not?
+ * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_mems: a temp variable for calculating new effective_mems
*
- * This will update nodemasks of tasks in @root_cs and all other empty cpusets
- * which take on nodemask of @root_cs.
+ * When the configured nodemask is changed, the effective nodemasks of this
+ * cpuset and all its descendants need to be updated.
+ *
+ * On legacy hierarchy, effective_mems will be the same as mems_allowed.
*
* Called with cpuset_mutex held
*/
-static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
+static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;

rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- if (cp == root_cs) {
- if (!update_root)
- continue;
- } else {
- /* skip the whole subtree if @cp have some CPU */
- if (!nodes_empty(cp->mems_allowed)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ struct cpuset *parent = parent_cs(cp);
+
+ nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
+
+ /*
+ * If it becomes empty, inherit the effective mask of the
+ * parent, which is guaranteed to have some MEMs.
+ */
+ if (nodes_empty(*new_mems))
+ *new_mems = parent->effective_mems;
+
+ /* Skip the whole subtree if the nodemask remains the same. */
+ if (nodes_equal(*new_mems, cp->effective_mems)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
}
+
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();

+ mutex_lock(&callback_mutex);
+ cp->effective_mems = *new_mems;
+ mutex_unlock(&callback_mutex);
+
+ WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+ !nodes_equal(cp->mems_allowed, cp->effective_mems));
+
update_tasks_nodemask(cp);

rcu_read_lock();
@@ -1156,8 +1187,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
goto done;

if (!nodes_subset(trialcs->mems_allowed,
- node_states[N_MEMORY])) {
- retval = -EINVAL;
+ top_cpuset.mems_allowed)) {
+ retval = -EINVAL;
goto done;
}
}
@@ -1174,7 +1205,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
cs->mems_allowed = trialcs->mems_allowed;
mutex_unlock(&callback_mutex);

- update_tasks_nodemask_hier(cs, true);
+ /* use trialcs->mems_allowed as a temp variable */
+ update_nodemasks_hier(cs, &trialcs->mems_allowed);
done:
return retval;
}
@@ -1389,12 +1421,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,

mutex_lock(&cpuset_mutex);

- /*
- * We allow to move tasks into an empty cpuset if sane_behavior
- * flag is set.
- */
+ /* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
- if (!cgroup_sane_behavior(css->cgroup) &&
+ if (!cgroup_on_dfl(css->cgroup) &&
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;

@@ -1452,8 +1481,6 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
struct task_struct *leader = cgroup_taskset_first(tset);
struct cpuset *cs = css_cs(css);
struct cpuset *oldcs = cpuset_attach_old_cs;
- struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
- struct cpuset *mems_cs = effective_nodemask_cpuset(cs);

mutex_lock(&cpuset_mutex);

@@ -1461,9 +1488,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
if (cs == &top_cpuset)
cpumask_copy(cpus_attach, cpu_possible_mask);
else
- guarantee_online_cpus(cpus_cs, cpus_attach);
+ guarantee_online_cpus(cs, cpus_attach);

- guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);

cgroup_taskset_for_each(task, tset) {
/*
@@ -1480,11 +1507,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
* Change mm, possibly for multiple threads in a threadgroup. This is
* expensive and may sleep.
*/
- cpuset_attach_nodemask_to = cs->mems_allowed;
+ cpuset_attach_nodemask_to = cs->effective_mems;
mm = get_task_mm(leader);
if (mm) {
- struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
-
mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);

/*
@@ -1495,7 +1520,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
* mm from.
*/
if (is_memory_migrate(cs)) {
- cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
+ cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
&cpuset_attach_nodemask_to);
}
mmput(mm);
@@ -1516,6 +1541,8 @@ typedef enum {
FILE_MEMORY_MIGRATE,
FILE_CPULIST,
FILE_MEMLIST,
+ FILE_EFFECTIVE_CPULIST,
+ FILE_EFFECTIVE_MEMLIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
FILE_MEM_HARDWALL,
@@ -1694,6 +1721,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
case FILE_MEMLIST:
s += nodelist_scnprintf(s, count, cs->mems_allowed);
break;
+ case FILE_EFFECTIVE_CPULIST:
+ s += cpulist_scnprintf(s, count, cs->effective_cpus);
+ break;
+ case FILE_EFFECTIVE_MEMLIST:
+ s += nodelist_scnprintf(s, count, cs->effective_mems);
+ break;
default:
ret = -EINVAL;
goto out_unlock;
@@ -1778,6 +1811,18 @@ static struct cftype files[] = {
.private = FILE_MEMLIST,
},

+ {
+ .name = "effective_cpus",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_CPULIST,
+ },
+
+ {
+ .name = "effective_mems",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_MEMLIST,
+ },
+
{
.name = "cpu_exclusive",
.read_u64 = cpuset_read_u64,
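
The two new files have no write handler, so they are read-only. A minimal userspace sketch of reading one of them; the mount point and group name below are assumptions, not something defined by this patch:

#include <stdio.h>

int main(void)
{
        /* Assumed legacy-hierarchy mount and an existing group "mygroup". */
        const char *path = "/sys/fs/cgroup/cpuset/mygroup/cpuset.effective_cpus";
        char buf[256];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);
                return 1;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("effective_cpus: %s", buf);
        fclose(f);
        return 0;
}
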
@@ -1869,18 +1914,26 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
- if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
- kfree(cs);
- return ERR_PTR(-ENOMEM);
- }
+ if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
+ goto free_cs;
+ if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
+ goto free_cpus;

set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
+ cpumask_clear(cs->effective_cpus);
+ nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;

return &cs->css;
+
+free_cpus:
+ free_cpumask_var(cs->cpus_allowed);
+free_cs:
+ kfree(cs);
+ return ERR_PTR(-ENOMEM);
}

static int cpuset_css_online(struct cgroup_subsys_state *css)
@@ -1903,6 +1956,13 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)

cpuset_inc();

+ mutex_lock(&callback_mutex);
+ if (cgroup_on_dfl(cs->css.cgroup)) {
+ cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+ cs->effective_mems = parent->effective_mems;
+ }
+ mutex_unlock(&callback_mutex);
+
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
goto out_unlock;
@@ -1962,20 +2022,40 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);

+ free_cpumask_var(cs->effective_cpus);
free_cpumask_var(cs->cpus_allowed);
kfree(cs);
}

+static void cpuset_bind(struct cgroup_subsys_state *root_css)
+{
+ mutex_lock(&cpuset_mutex);
+ mutex_lock(&callback_mutex);
+
+ if (cgroup_on_dfl(root_css->cgroup)) {
+ cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
+ top_cpuset.mems_allowed = node_possible_map;
+ } else {
+ cpumask_copy(top_cpuset.cpus_allowed,
+ top_cpuset.effective_cpus);
+ top_cpuset.mems_allowed = top_cpuset.effective_mems;
+ }
+
+ mutex_unlock(&callback_mutex);
+ mutex_unlock(&cpuset_mutex);
+}
+
struct cgroup_subsys cpuset_cgrp_subsys = {
- .css_alloc = cpuset_css_alloc,
- .css_online = cpuset_css_online,
- .css_offline = cpuset_css_offline,
- .css_free = cpuset_css_free,
- .can_attach = cpuset_can_attach,
- .cancel_attach = cpuset_cancel_attach,
- .attach = cpuset_attach,
- .base_cftypes = files,
- .early_init = 1,
+ .css_alloc      = cpuset_css_alloc,
+ .css_online     = cpuset_css_online,
+ .css_offline    = cpuset_css_offline,
+ .css_free       = cpuset_css_free,
+ .can_attach     = cpuset_can_attach,
+ .cancel_attach  = cpuset_cancel_attach,
+ .attach         = cpuset_attach,
+ .bind           = cpuset_bind,
+ .legacy_cftypes = files,
+ .early_init     = 1,
};

/**
@@ -1990,9 +2070,13 @@ int __init cpuset_init(void)

if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
BUG();
+ if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
+ BUG();

cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
+ cpumask_setall(top_cpuset.effective_cpus);
+ nodes_setall(top_cpuset.effective_mems);

fmeter_init(&top_cpuset.fmeter);
set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
@@ -2035,6 +2119,66 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
}
}

+static void
+hotplug_update_tasks_legacy(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated)
+{
+ bool is_empty;
+
+ mutex_lock(&callback_mutex);
+ cpumask_copy(cs->cpus_allowed, new_cpus);
+ cpumask_copy(cs->effective_cpus, new_cpus);
+ cs->mems_allowed = *new_mems;
+ cs->effective_mems = *new_mems;
+ mutex_unlock(&callback_mutex);
+
+ /*
+ * Don't call update_tasks_cpumask() if the cpuset becomes empty,
+ * as the tasks will be migrated to an ancestor.
+ */
+ if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
+ update_tasks_cpumask(cs);
+ if (mems_updated && !nodes_empty(cs->mems_allowed))
+ update_tasks_nodemask(cs);
+
+ is_empty = cpumask_empty(cs->cpus_allowed) ||
+ nodes_empty(cs->mems_allowed);
+
+ mutex_unlock(&cpuset_mutex);
+
+ /*
+ * Move tasks to the nearest ancestor with execution resources.
+ * This is a full cgroup operation which will also call back into
+ * cpuset. Should be done outside any lock.
+ */
+ if (is_empty)
+ remove_tasks_in_empty_cpuset(cs);
+
+ mutex_lock(&cpuset_mutex);
+}
+
+static void
+hotplug_update_tasks(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated)
+{
+ if (cpumask_empty(new_cpus))
+ cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
+ if (nodes_empty(*new_mems))
+ *new_mems = parent_cs(cs)->effective_mems;
+
+ mutex_lock(&callback_mutex);
+ cpumask_copy(cs->effective_cpus, new_cpus);
+ cs->effective_mems = *new_mems;
+ mutex_unlock(&callback_mutex);
+
+ if (cpus_updated)
+ update_tasks_cpumask(cs);
+ if (mems_updated)
+ update_tasks_nodemask(cs);
+}
+
/**
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
* @cs: cpuset in interest
@@ -2045,11 +2189,10 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
*/
static void cpuset_hotplug_update_tasks(struct cpuset *cs)
{
- static cpumask_t off_cpus;
- static nodemask_t off_mems;
- bool is_empty;
- bool sane = cgroup_sane_behavior(cs->css.cgroup);
-
+ static cpumask_t new_cpus;
+ static nodemask_t new_mems;
+ bool cpus_updated;
+ bool mems_updated;
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);

@@ -2064,51 +2207,20 @@ retry:
goto retry;
}

- cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
- nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
-
- mutex_lock(&callback_mutex);
- cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
- mutex_unlock(&callback_mutex);
-
- /*
- * If sane_behavior flag is set, we need to update tasks' cpumask
- * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
- * call update_tasks_cpumask() if the cpuset becomes empty, as
- * the tasks in it will be migrated to an ancestor.
- */
- if ((sane && cpumask_empty(cs->cpus_allowed)) ||
- (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
- update_tasks_cpumask(cs);
+ cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
+ nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);

- mutex_lock(&callback_mutex);
- nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
- mutex_unlock(&callback_mutex);
-
- /*
- * If sane_behavior flag is set, we need to update tasks' nodemask
- * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
- * call update_tasks_nodemask() if the cpuset becomes empty, as
- * the tasks in it will be migratd to an ancestor.
- */
- if ((sane && nodes_empty(cs->mems_allowed)) ||
- (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
- update_tasks_nodemask(cs);
+ cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
+ mems_updated = !nodes_equal(new_mems, cs->effective_mems);

- is_empty = cpumask_empty(cs->cpus_allowed) ||
- nodes_empty(cs->mems_allowed);
+ if (cgroup_on_dfl(cs->css.cgroup))
+ hotplug_update_tasks(cs, &new_cpus, &new_mems,
+ cpus_updated, mems_updated);
+ else
+ hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
+ cpus_updated, mems_updated);

mutex_unlock(&cpuset_mutex);
-
- /*
- * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
- *
- * Otherwise move tasks to the nearest ancestor with execution
- * resources. This is full cgroup operation which will
- * also call back into cpuset. Should be done outside any lock.
- */
- if (!sane && is_empty)
- remove_tasks_in_empty_cpuset(cs);
}

/**
@@ -2132,6 +2244,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
+ bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);

mutex_lock(&cpuset_mutex);

@@ -2139,13 +2252,15 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
cpumask_copy(&new_cpus, cpu_active_mask);
new_mems = node_states[N_MEMORY];

- cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
- mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+ cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
+ mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);

/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
mutex_lock(&callback_mutex);
- cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ if (!on_dfl)
+ cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
mutex_unlock(&callback_mutex);
/* we don't mess with cpumasks of tasks in top_cpuset */
}
@@ -2153,7 +2268,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
/* synchronize mems_allowed to N_MEMORY */
if (mems_updated) {
mutex_lock(&callback_mutex);
- top_cpuset.mems_allowed = new_mems;
+ if (!on_dfl)
+ top_cpuset.mems_allowed = new_mems;
+ top_cpuset.effective_mems = new_mems;
mutex_unlock(&callback_mutex);
update_tasks_nodemask(&top_cpuset);
}
@@ -2228,6 +2345,9 @@ void __init cpuset_init_smp(void)
top_cpuset.mems_allowed = node_states[N_MEMORY];
top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;

+ cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
+ top_cpuset.effective_mems = node_states[N_MEMORY];
+
register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
}

@@ -2244,23 +2364,17 @@ void __init cpuset_init_smp(void)

void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
- struct cpuset *cpus_cs;
-
mutex_lock(&callback_mutex);
rcu_read_lock();
- cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
- guarantee_online_cpus(cpus_cs, pmask);
+ guarantee_online_cpus(task_cs(tsk), pmask);
rcu_read_unlock();
mutex_unlock(&callback_mutex);
}

void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
- struct cpuset *cpus_cs;
-
rcu_read_lock();
- cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
- do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
+ do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
rcu_read_unlock();

/*
@@ -2299,13 +2413,11 @@ void cpuset_init_current_mems_allowed(void)

nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
- struct cpuset *mems_cs;
nodemask_t mask;

mutex_lock(&callback_mutex);
rcu_read_lock();
- mems_cs = effective_nodemask_cpuset(task_cs(tsk));
- guarantee_online_mems(mems_cs, &mask);
+ guarantee_online_mems(task_cs(tsk), &mask);
rcu_read_unlock();
mutex_unlock(&callback_mutex);
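
A standalone sketch of the behavioural difference implemented in the hotplug path above: on the legacy hierarchy hotplug rewrites the configured masks (so the cpuset may become empty), while on the default hierarchy only the effective mask follows hotplug and falls back to the parent. Here still_allowed stands in for the configured mask intersected with the parent's effective mask after hotplug; all names are hypothetical:

#include <stdio.h>

struct cs {
        unsigned long cpus_allowed;     /* user-configured */
        unsigned long effective_cpus;
};

static void hotplug_legacy(struct cs *cs, unsigned long still_allowed)
{
        /* Legacy: configured and effective masks are both overwritten. */
        cs->cpus_allowed = still_allowed;
        cs->effective_cpus = still_allowed;     /* may end up empty */
}

static void hotplug_dfl(struct cs *cs, unsigned long still_allowed,
                        unsigned long parent_effective)
{
        /* Default hierarchy: only the effective mask changes. */
        cs->effective_cpus = still_allowed ? still_allowed : parent_effective;
        /* cs->cpus_allowed is left as the user configured it. */
}

int main(void)
{
        struct cs legacy = { 0x30, 0x30 }, dfl = { 0x30, 0x30 };

        /* CPUs 4-5 go offline: nothing of the configured mask remains. */
        hotplug_legacy(&legacy, 0x00);
        hotplug_dfl(&dfl, 0x00, 0x0f);

        printf("legacy: allowed %#lx effective %#lx\n",
               legacy.cpus_allowed, legacy.effective_cpus);
        printf("dfl:    allowed %#lx effective %#lx\n",
               dfl.cpus_allowed, dfl.effective_cpus);
        return 0;
}
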