@@ -248,34 +248,34 @@ static struct cpuset top_cpuset = {
 		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
 
 /*
- * There are two global mutexes guarding cpuset structures - cpuset_mutex
- * and callback_mutex. The latter may nest inside the former. We also
- * require taking task_lock() when dereferencing a task's cpuset pointer.
- * See "The task_lock() exception", at the end of this comment.
+ * There are two global locks guarding cpuset structures - cpuset_mutex and
+ * callback_lock. We also require taking task_lock() when dereferencing a
+ * task's cpuset pointer. See "The task_lock() exception", at the end of this
+ * comment.
  *
- * A task must hold both mutexes to modify cpusets. If a task holds
+ * A task must hold both locks to modify cpusets. If a task holds
  * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
- * is the only task able to also acquire callback_mutex and be able to
+ * is the only task able to also acquire callback_lock and be able to
  * modify cpusets. It can perform various checks on the cpuset structure
  * first, knowing nothing will change. It can also allocate memory while
  * just holding cpuset_mutex. While it is performing these checks, various
- * callback routines can briefly acquire callback_mutex to query cpusets.
- * Once it is ready to make the changes, it takes callback_mutex, blocking
+ * callback routines can briefly acquire callback_lock to query cpusets.
+ * Once it is ready to make the changes, it takes callback_lock, blocking
  * everyone else.
  *
  * Calls to the kernel memory allocator can not be made while holding
- * callback_mutex, as that would risk double tripping on callback_mutex
+ * callback_lock, as that would risk double tripping on callback_lock
  * from one of the callbacks into the cpuset code from within
  * __alloc_pages().
  *
- * If a task is only holding callback_mutex, then it has read-only
+ * If a task is only holding callback_lock, then it has read-only
  * access to cpusets.
  *
  * Now, the task_struct fields mems_allowed and mempolicy may be changed
  * by other task, we use alloc_lock in the task_struct fields to protect
  * them.
  *
- * The cpuset_common_file_read() handlers only hold callback_mutex across
+ * The cpuset_common_file_read() handlers only hold callback_lock across
  * small pieces of code, such as when reading out possibly multi-word
  * cpumasks and nodemasks.
  *
@@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
  */
 
 static DEFINE_MUTEX(cpuset_mutex);
-static DEFINE_MUTEX(callback_mutex);
+static DEFINE_SPINLOCK(callback_lock);
 
 /*
  * CPU / memory hotplug is handled asynchronously.
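
[Editor's note, not part of the patch: a minimal sketch of the locking pattern the
comment above describes, assuming it sits in kernel/cpuset.c next to the definitions
of cpuset_mutex, callback_lock and struct cpuset; the function name is hypothetical.]

	/* Writer side: slow validation under cpuset_mutex, short update under callback_lock. */
	static void example_write_cpus(struct cpuset *cs, const struct cpumask *new_cpus)
	{
		mutex_lock(&cpuset_mutex);		/* may sleep; serializes all modifiers */

		/* checks and memory allocation happen here with only cpuset_mutex held */

		spin_lock_irq(&callback_lock);		/* brief, non-sleeping critical section */
		cpumask_copy(cs->cpus_allowed, new_cpus);
		spin_unlock_irq(&callback_lock);

		mutex_unlock(&cpuset_mutex);
	}
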
@@ -329,7 +329,7 @@ static struct file_system_type cpuset_fs_type = {
  * One way or another, we guarantee to return some non-empty subset
  * of cpu_online_mask.
  *
- * Call with callback_mutex held.
+ * Call with callback_lock or cpuset_mutex held.
  */
 static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
 {
@@ -347,7 +347,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
  * One way or another, we guarantee to return some non-empty subset
  * of node_states[N_MEMORY].
  *
- * Call with callback_mutex held.
+ * Call with callback_lock or cpuset_mutex held.
  */
 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
 {
@@ -359,7 +359,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
 /*
  * update task's spread flag if cpuset's page/slab spread flag is set
  *
- * Called with callback_mutex/cpuset_mutex held
+ * Call with callback_lock or cpuset_mutex held.
  */
 static void cpuset_update_task_spread_flag(struct cpuset *cs,
 					struct task_struct *tsk)
@@ -886,9 +886,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 			continue;
 		rcu_read_unlock();
 
-		mutex_lock(&callback_mutex);
+		spin_lock_irq(&callback_lock);
 		cpumask_copy(cp->effective_cpus, new_cpus);
-		mutex_unlock(&callback_mutex);
+		spin_unlock_irq(&callback_lock);
 
 		WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
 			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -953,9 +953,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	if (retval < 0)
 		return retval;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 
 	/* use trialcs->cpus_allowed as a temp variable */
 	update_cpumasks_hier(cs, trialcs->cpus_allowed);
@@ -1142,9 +1142,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
 			continue;
 		rcu_read_unlock();
 
-		mutex_lock(&callback_mutex);
+		spin_lock_irq(&callback_lock);
 		cp->effective_mems = *new_mems;
-		mutex_unlock(&callback_mutex);
+		spin_unlock_irq(&callback_lock);
 
 		WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
 			!nodes_equal(cp->mems_allowed, cp->effective_mems));
@@ -1165,7 +1165,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
  * mempolicies and if the cpuset is marked 'memory_migrate',
  * migrate the tasks pages to the new memory.
  *
- * Call with cpuset_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_lock during call.
  * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
  * lock each such tasks mm->mmap_sem, scan its vma's and rebind
  * their mempolicies to the cpusets new mems_allowed.
@@ -1212,9 +1212,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 	if (retval < 0)
 		goto done;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	cs->mems_allowed = trialcs->mems_allowed;
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 
 	/* use trialcs->mems_allowed as a temp variable */
 	update_nodemasks_hier(cs, &cs->mems_allowed);
@@ -1305,9 +1305,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
 			|| (is_spread_page(cs) != is_spread_page(trialcs)));
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	cs->flags = trialcs->flags;
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 
 	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
 		rebuild_sched_domains_locked();
@@ -1714,7 +1714,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 	count = seq_get_buf(sf, &buf);
 	s = buf;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 
 	switch (type) {
 	case FILE_CPULIST:
@@ -1741,7 +1741,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 		seq_commit(sf, -1);
 	}
 out_unlock:
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 	return ret;
 }
 
@@ -1958,12 +1958,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 
 	cpuset_inc();
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	if (cgroup_on_dfl(cs->css.cgroup)) {
 		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
 		cs->effective_mems = parent->effective_mems;
 	}
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 
 	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
 		goto out_unlock;
@@ -1990,10 +1990,10 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	}
 	rcu_read_unlock();
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	cs->mems_allowed = parent->mems_allowed;
 	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
 	return 0;
@@ -2032,7 +2032,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
 static void cpuset_bind(struct cgroup_subsys_state *root_css)
 {
 	mutex_lock(&cpuset_mutex);
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 
 	if (cgroup_on_dfl(root_css->cgroup)) {
 		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
@@ -2043,7 +2043,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 		top_cpuset.mems_allowed = top_cpuset.effective_mems;
 	}
 
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 	mutex_unlock(&cpuset_mutex);
 }
 
@@ -2128,12 +2128,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 {
 	bool is_empty;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->cpus_allowed, new_cpus);
 	cpumask_copy(cs->effective_cpus, new_cpus);
 	cs->mems_allowed = *new_mems;
 	cs->effective_mems = *new_mems;
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 
 	/*
 	 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
@@ -2170,10 +2170,10 @@ hotplug_update_tasks(struct cpuset *cs,
 	if (nodes_empty(*new_mems))
 		*new_mems = parent_cs(cs)->effective_mems;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->effective_cpus, new_cpus);
 	cs->effective_mems = *new_mems;
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 
 	if (cpus_updated)
 		update_tasks_cpumask(cs);
@@ -2259,21 +2259,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 
 	/* synchronize cpus_allowed to cpu_active_mask */
 	if (cpus_updated) {
-		mutex_lock(&callback_mutex);
+		spin_lock_irq(&callback_lock);
 		if (!on_dfl)
 			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
 		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
-		mutex_unlock(&callback_mutex);
+		spin_unlock_irq(&callback_lock);
 		/* we don't mess with cpumasks of tasks in top_cpuset */
 	}
 
 	/* synchronize mems_allowed to N_MEMORY */
 	if (mems_updated) {
-		mutex_lock(&callback_mutex);
+		spin_lock_irq(&callback_lock);
 		if (!on_dfl)
 			top_cpuset.mems_allowed = new_mems;
 		top_cpuset.effective_mems = new_mems;
-		mutex_unlock(&callback_mutex);
+		spin_unlock_irq(&callback_lock);
 		update_tasks_nodemask(&top_cpuset);
 	}
 
@@ -2366,11 +2366,13 @@ void __init cpuset_init_smp(void)
 
 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 {
-	mutex_lock(&callback_mutex);
+	unsigned long flags;
+
+	spin_lock_irqsave(&callback_lock, flags);
 	rcu_read_lock();
 	guarantee_online_cpus(task_cs(tsk), pmask);
 	rcu_read_unlock();
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irqrestore(&callback_lock, flags);
 }
 
 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
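
[Editor's note, not part of the patch: the read-side helpers above use
spin_lock_irqsave()/spin_unlock_irqrestore() rather than spin_lock_irq(), so they
remain safe to call from contexts where interrupts may already be disabled. A
hypothetical caller, assuming process context and the cpuset_cpus_allowed()
helper shown above:]

	static int example_first_allowed_cpu(struct task_struct *tsk)
	{
		cpumask_var_t mask;
		int cpu;

		if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
			return -ENOMEM;

		cpuset_cpus_allowed(tsk, mask);	/* fills a non-empty subset of cpu_online_mask */
		cpu = cpumask_first(mask);	/* therefore guaranteed to find a set bit */

		free_cpumask_var(mask);
		return cpu;
	}
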
@@ -2416,12 +2418,13 @@ void cpuset_init_current_mems_allowed(void)
 nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
 {
 	nodemask_t mask;
+	unsigned long flags;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irqsave(&callback_lock, flags);
 	rcu_read_lock();
 	guarantee_online_mems(task_cs(tsk), &mask);
 	rcu_read_unlock();
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irqrestore(&callback_lock, flags);
 
 	return mask;
 }
@@ -2440,7 +2443,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 /*
  * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
  * mem_hardwall ancestor to the specified cpuset. Call holding
- * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
+ * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
  * (an unusual configuration), then returns the root cpuset.
  */
 static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
@@ -2451,7 +2454,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
 }
 
 /**
- * cpuset_node_allowed_softwall - Can we allocate on a memory node?
+ * cpuset_node_allowed - Can we allocate on a memory node?
  * @node: is this an allowed node?
  * @gfp_mask: memory allocation flags
  *
@@ -2463,13 +2466,6 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
  * flag, yes.
  * Otherwise, no.
  *
- * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
- * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
- * might sleep, and might allow a node from an enclosing cpuset.
- *
- * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
- * cpusets, and never sleeps.
- *
  * The __GFP_THISNODE placement logic is really handled elsewhere,
  * by forcibly using a zonelist starting at a specified node, and by
  * (in get_page_from_freelist()) refusing to consider the zones for
@@ -2482,13 +2478,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
  * GFP_KERNEL allocations are not so marked, so can escape to the
  * nearest enclosing hardwalled ancestor cpuset.
  *
- * Scanning up parent cpusets requires callback_mutex. The
+ * Scanning up parent cpusets requires callback_lock. The
  * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
  * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
  * current tasks mems_allowed came up empty on the first pass over
  * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
- * cpuset are short of memory, might require taking the callback_mutex
- * mutex.
+ * cpuset are short of memory, might require taking the callback_lock.
  *
  * The first call here from mm/page_alloc:get_page_from_freelist()
  * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
@@ -2505,20 +2500,15 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
  *	TIF_MEMDIE   - any node ok
  *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
  *	GFP_USER     - only nodes in current tasks mems allowed ok.
- *
- * Rule:
- *    Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
- *    pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
- *    the code that might scan up ancestor cpusets and sleep.
  */
-int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
+int __cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
 	struct cpuset *cs;		/* current cpuset ancestors */
 	int allowed;			/* is allocation in zone z allowed? */
+	unsigned long flags;
 
 	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
 		return 1;
-	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
 	if (node_isset(node, current->mems_allowed))
 		return 1;
 	/*
@@ -2534,55 +2524,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
 		return 1;
 
 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
-	mutex_lock(&callback_mutex);
+	spin_lock_irqsave(&callback_lock, flags);
 
 	rcu_read_lock();
 	cs = nearest_hardwall_ancestor(task_cs(current));
 	allowed = node_isset(node, cs->mems_allowed);
 	rcu_read_unlock();
 
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irqrestore(&callback_lock, flags);
 	return allowed;
 }
 
-/*
- * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
- * @node: is this an allowed node?
- * @gfp_mask: memory allocation flags
- *
- * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
- * set, yes, we can always allocate. If node is in our task's mems_allowed,
- * yes. If the task has been OOM killed and has access to memory reserves as
- * specified by the TIF_MEMDIE flag, yes.
- * Otherwise, no.
- *
- * The __GFP_THISNODE placement logic is really handled elsewhere,
- * by forcibly using a zonelist starting at a specified node, and by
- * (in get_page_from_freelist()) refusing to consider the zones for
- * any node on the zonelist except the first. By the time any such
- * calls get to this routine, we should just shut up and say 'yes'.
- *
- * Unlike the cpuset_node_allowed_softwall() variant, above,
- * this variant requires that the node be in the current task's
- * mems_allowed or that we're in interrupt. It does not scan up the
- * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
- * It never sleeps.
- */
-int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
-{
-	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
-		return 1;
-	if (node_isset(node, current->mems_allowed))
-		return 1;
-	/*
-	 * Allow tasks that have access to memory reserves because they have
-	 * been OOM killed to get memory anywhere.
-	 */
-	if (unlikely(test_thread_flag(TIF_MEMDIE)))
-		return 1;
-	return 0;
-}
-
 /**
  * cpuset_mem_spread_node() - On which node to begin search for a file page
  * cpuset_slab_spread_node() - On which node to begin search for a slab page