@@ -1582,9 +1582,16 @@ balance:
 	 * One idle CPU per node is evaluated for a task numa move.
 	 * Call select_idle_sibling to maybe find a better one.
 	 */
-	if (!cur)
+	if (!cur) {
+		/*
+		 * select_idle_sibling() uses a per-cpu cpumask that
+		 * can be used from IRQ context.
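+		 *
+		 * Disabling IRQs keeps a wakeup taken in IRQ context on this
+		 * CPU from reusing that mask while the scan below runs.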
+		 */
+		local_irq_disable();
 		env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
 						   env->dst_cpu);
+		local_irq_enable();
+	}
 
 assign:
 	task_numa_assign(env, cur, imp);
@@ -4616,6 +4623,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 }
 
 #ifdef CONFIG_SMP
+
+/* Working cpumask for: load_balance, load_balance_newidle. */
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
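+/* Scratch mask for select_idle_core(); see select_idle_sibling() below. */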
+DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * per rq 'load' array crap; XXX kill this.
@@ -5280,65 +5292,231 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 }
 
 /*
- * Try and locate an idle CPU in the sched_domain.
+ * Implement a for_each_cpu() variant that starts the scan at a given cpu
+ * (@start), and wraps around.
+ *
+ * This is used to scan for idle CPUs; such that not all CPUs looking for an
+ * idle CPU find the same CPU. The down-side is that tasks tend to cycle
+ * through the LLC domain.
+ *
+ * tbench in particular is found to be sensitive to this.
+ */
+
+static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
+{
+	int next;
+
+again:
+	next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
+
+	if (*wrapped) {
+		if (next >= start)
+			return nr_cpumask_bits;
+	} else {
+		if (next >= nr_cpumask_bits) {
+			*wrapped = 1;
+			n = -1;
+			goto again;
+		}
+	}
+
+	return next;
+}
+
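+/*
+ * Example: with all eight CPUs of an 8-CPU mask set and @start == 5, the
+ * iteration below visits 5, 6, 7, 0, 1, 2, 3, 4.
+ */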
+#define for_each_cpu_wrap(cpu, mask, start, wrap)				\
+	for ((wrap) = 0, (cpu) = (start)-1;					\
+		(cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)),	\
+		(cpu) < nr_cpumask_bits; )
+
+#ifdef CONFIG_SCHED_SMT
+
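+/*
+ * The has_idle_cores flag is a single lockless hint per LLC, stored in the
+ * shared sched_domain_shared structure; hence the READ_ONCE()/WRITE_ONCE()
+ * accessors below, and the @def fallback when no shared state is available.
+ */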
+static inline void set_idle_cores(int cpu, int val)
+{
+	struct sched_domain_shared *sds;
+
+	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+	if (sds)
+		WRITE_ONCE(sds->has_idle_cores, val);
+}
+
+static inline bool test_idle_cores(int cpu, bool def)
+{
+	struct sched_domain_shared *sds;
+
+	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+	if (sds)
+		return READ_ONCE(sds->has_idle_cores);
+
+	return def;
+}
+
+/*
+ * Scans the local SMT mask to see if the entire core is idle, and records this
+ * information in sd_llc_shared->has_idle_cores.
+ *
+ * Since SMT siblings share all cache levels, inspecting this limited remote
+ * state should be fairly cheap.
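+ *
+ * Presumably this is called when a CPU enters the idle path, so the hint is
+ * re-armed once its whole core has gone idle.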
+ */
+void update_idle_core(struct rq *rq)
+{
+	int core = cpu_of(rq);
+	int cpu;
+
+	rcu_read_lock();
+	if (test_idle_cores(core, true))
+		goto unlock;
+
+	for_each_cpu(cpu, cpu_smt_mask(core)) {
+		if (cpu == core)
+			continue;
+
+		if (!idle_cpu(cpu))
+			goto unlock;
+	}
+
+	set_idle_cores(core, 1);
+unlock:
+	rcu_read_unlock();
+}
+
+/*
+ * Scan the entire LLC domain for idle cores; this dynamically switches off if
+ * there are no idle cores left in the system; tracked through
+ * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
+ */
+static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+{
+	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+	int core, cpu, wrap;
+
+	if (!test_idle_cores(target, false))
+		return -1;
+
+	cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
+
+	for_each_cpu_wrap(core, cpus, target, wrap) {
+		bool idle = true;
+
+		for_each_cpu(cpu, cpu_smt_mask(core)) {
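+			/* Clear all siblings so the wrap loop never revisits this core. */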
+			cpumask_clear_cpu(cpu, cpus);
+			if (!idle_cpu(cpu))
+				idle = false;
+		}
+
+		if (idle)
+			return core;
+	}
+
+	/*
+	 * Failed to find an idle core; stop looking for one.
+	 */
+	set_idle_cores(target, 0);
+
+	return -1;
+}
+
+/*
+ * Scan the local SMT mask for idle CPUs.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+	int cpu;
+
+	for_each_cpu(cpu, cpu_smt_mask(target)) {
+		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+			continue;
+		if (idle_cpu(cpu))
+			return cpu;
+	}
+
+	return -1;
+}
+
+#else /* CONFIG_SCHED_SMT */
+
+static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
+{
+	return -1;
+}
+
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+{
+	return -1;
+}
+
+#endif /* CONFIG_SCHED_SMT */
+
+/*
+ * Scan the LLC domain for idle CPUs; this is dynamically regulated by
+ * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
+ * average idle time for this rq (as found in rq->avg_idle).
+ */
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
+{
+	struct sched_domain *this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
+	u64 avg_idle = this_rq()->avg_idle;
+	u64 avg_cost = this_sd->avg_scan_cost;
+	u64 time, cost;
+	s64 delta;
+	int cpu, wrap;
+
+	/*
+	 * Due to large variance we need a large fuzz factor; hackbench in
+	 * particular is sensitive here.
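+	 *
+	 * E.g. with rq->avg_idle at 512us, the scan below is attempted only
+	 * while the average scan cost stays below roughly 1us.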
+	 */
+	if ((avg_idle / 512) < avg_cost)
+		return -1;
+
+	time = local_clock();
+
+	for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
+		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+			continue;
+		if (idle_cpu(cpu))
+			break;
+	}
+
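+	/*
+	 * Fold this scan's cost into avg_scan_cost as a running average with
+	 * weight 1/8: avg += (time - avg) / 8.
+	 */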
+	time = local_clock() - time;
+	cost = this_sd->avg_scan_cost;
+	delta = (s64)(time - cost) / 8;
+	this_sd->avg_scan_cost += delta;
+
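+	/*
+	 * If nothing idle was found, @cpu ends up >= nr_cpumask_bits here; the
+	 * caller's "(unsigned)i < nr_cpumask_bits" check treats that as failure.
+	 */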
+	return cpu;
+}
+
+/*
+ * Try and locate an idle core/thread in the LLC cache domain.
  */
 static int select_idle_sibling(struct task_struct *p, int prev, int target)
 {
 	struct sched_domain *sd;
-	struct sched_group *sg;
+	int i;
 
 	if (idle_cpu(target))
 		return target;
 
 	/*
-	 * If the prevous cpu is cache affine and idle, don't be stupid.
+	 * If the previous cpu is cache affine and idle, don't be stupid.
 	 */
 	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
 		return prev;
 
-	/*
-	 * Otherwise, iterate the domains and find an eligible idle cpu.
-	 *
-	 * A completely idle sched group at higher domains is more
-	 * desirable than an idle group at a lower level, because lower
-	 * domains have smaller groups and usually share hardware
-	 * resources which causes tasks to contend on them, e.g. x86
-	 * hyperthread siblings in the lowest domain (SMT) can contend
-	 * on the shared cpu pipeline.
-	 *
-	 * However, while we prefer idle groups at higher domains
-	 * finding an idle cpu at the lowest domain is still better than
-	 * returning 'target', which we've already established, isn't
-	 * idle.
-	 */
 	sd = rcu_dereference(per_cpu(sd_llc, target));
-	for_each_lower_domain(sd) {
-		sg = sd->groups;
-		do {
-			int i;
+	if (!sd)
+		return target;
 
-			if (!cpumask_intersects(sched_group_cpus(sg),
-						tsk_cpus_allowed(p)))
-				goto next;
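+	/*
+	 * Preference order: a fully idle core anywhere in the LLC, then any
+	 * idle CPU in the LLC, and finally an idle SMT sibling of the target.
+	 */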
+	i = select_idle_core(p, sd, target);
+	if ((unsigned)i < nr_cpumask_bits)
+		return i;
 
-			/* Ensure the entire group is idle */
-			for_each_cpu(i, sched_group_cpus(sg)) {
-				if (i == target || !idle_cpu(i))
-					goto next;
-			}
+	i = select_idle_cpu(p, sd, target);
+	if ((unsigned)i < nr_cpumask_bits)
+		return i;
+
+	i = select_idle_smt(p, sd, target);
+	if ((unsigned)i < nr_cpumask_bits)
+		return i;
 
-			/*
-			 * It doesn't matter which cpu we pick, the
-			 * whole group is idle.
-			 */
-			target = cpumask_first_and(sched_group_cpus(sg),
-					tsk_cpus_allowed(p));
-			goto done;
-next:
-			sg = sg->next;
-		} while (sg != sd->groups);
-	}
-done:
 	return target;
 }
@@ -7397,9 +7575,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
  */
 #define MAX_PINNED_INTERVAL 512
 
-/* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-
 static int need_active_balance(struct lb_env *env)
 {
 	struct sched_domain *sd = env->sd;