@@ -2658,59 +2658,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 	}
 }
 
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- */
-static inline bool numa_wake_affine(struct sched_domain *sd,
-				    struct task_struct *p, int this_cpu,
-				    int prev_cpu, int sync)
-{
-	struct numa_stats prev_load, this_load;
-	s64 this_eff_load, prev_eff_load;
-
-	update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
-	update_numa_stats(&this_load, cpu_to_node(this_cpu));
-
-	/*
-	 * If sync wakeup then subtract the (maximum possible)
-	 * effect of the currently running task from the load
-	 * of the current CPU:
-	 */
-	if (sync) {
-		unsigned long current_load = task_h_load(current);
-
-		if (this_load.load > current_load)
-			this_load.load -= current_load;
-		else
-			this_load.load = 0;
-	}
-
-	/*
-	 * In low-load situations, where this_cpu's node is idle due to the
-	 * sync cause above having dropped this_load.load to 0, move the task.
-	 * Moving to an idle socket will not create a bad imbalance.
-	 *
-	 * Otherwise check if the nodes are near enough in load to allow this
-	 * task to be woken on this_cpu's node.
-	 */
-	if (this_load.load > 0) {
-		unsigned long task_load = task_h_load(p);
-
-		this_eff_load = 100;
-		this_eff_load *= prev_load.compute_capacity;
-
-		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-		prev_eff_load *= this_load.compute_capacity;
-
-		this_eff_load *= this_load.load + task_load;
-		prev_eff_load *= prev_load.load - task_load;
-
-		return this_eff_load <= prev_eff_load;
-	}
-
-	return true;
-}
 #else
 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 {
@@ -2724,14 +2671,6 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 {
 }
 
-#ifdef CONFIG_SMP
-static inline bool numa_wake_affine(struct sched_domain *sd,
-				    struct task_struct *p, int this_cpu,
-				    int prev_cpu, int sync)
-{
-	return true;
-}
-#endif /* !SMP */
 #endif /* CONFIG_NUMA_BALANCING */
 
 static void
@@ -5428,20 +5367,143 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }
 
+struct llc_stats {
+	unsigned long	nr_running;
+	unsigned long	load;
+	unsigned long	capacity;
+	int		has_capacity;
+};
+
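+/*
+ * These stats are published by update_sd_lb_stats() using WRITE_ONCE()
+ * and read here without further synchronization, so the three fields
+ * may come from different updates.
+ */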
+static bool get_llc_stats(struct llc_stats *stats, int cpu)
+{
+	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+
+	if (!sds)
+		return false;
+
+	stats->nr_running	= READ_ONCE(sds->nr_running);
+	stats->load		= READ_ONCE(sds->load);
+	stats->capacity		= READ_ONCE(sds->capacity);
+	stats->has_capacity	= stats->nr_running < per_cpu(sd_llc_size, cpu);
+
+	return true;
+}
+
+/*
+ * Can a task be moved from prev_cpu to this_cpu without causing a load
+ * imbalance that would trigger the load balancer?
+ *
+ * Since we're running on 'stale' values, we might in fact create an imbalance,
+ * but recomputing these values is expensive, as that'd mean iterating 2 cache
+ * domains' worth of CPUs.
+ */
+static bool
+wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
+		int this_cpu, int prev_cpu, int sync)
+{
+	struct llc_stats prev_stats, this_stats;
+	s64 this_eff_load, prev_eff_load;
+	unsigned long task_load;
+
+	if (!get_llc_stats(&prev_stats, prev_cpu) ||
+	    !get_llc_stats(&this_stats, this_cpu))
+		return false;
+
+	/*
+	 * If sync wakeup then subtract the (maximum possible)
+	 * effect of the currently running task from the load
+	 * of the current LLC.
+	 */
+	if (sync) {
+		unsigned long current_load = task_h_load(current);
+
+		/* in this case load hits 0 and this LLC is considered 'idle' */
+		if (current_load > this_stats.load)
+			return true;
+
+		this_stats.load -= current_load;
+	}
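+
+	/*
+	 * Illustrative numbers: a sync waker with task_h_load(current) = 1024
+	 * on an LLC with load 1536 leaves this_stats.load at 512 below.
+	 */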
+
+	/*
+	 * The has_capacity stuff is not SMT aware, but by trying to balance
+	 * the nr_running on both ends we try and fill the domain at equal
+	 * rates, thereby first consuming cores before siblings.
+	 */
+
+	/* if the old cache has capacity, stay there */
+	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
+		return false;
+
+	/* if this cache has capacity, come here */
+	if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1)
+		return true;
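+
+	/*
+	 * Illustrative numbers for the two checks above: with 8 CPUs per LLC,
+	 * 5 tasks running on prev and 6 on this, prev has capacity and is no
+	 * busier, so the first check keeps the task where it was.
+	 */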
+
+	/*
+	 * Check to see if we can move the load without causing too much
+	 * imbalance.
+	 */
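+	/*
+	 * Each side below is scaled by the other LLC's capacity, making this
+	 * a comparison of load/capacity ratios in which prev gets a margin of
+	 * half of imbalance_pct. Illustrative numbers: with equal capacities
+	 * (which then cancel), imbalance_pct = 125, prev_stats.load = 2048
+	 * and task_load = 512:
+	 *
+	 *   this_eff_load ~ 100 * (this_stats.load + 512)
+	 *   prev_eff_load ~ 112 * (2048 - 512)
+	 *
+	 * so the move is allowed as long as this_stats.load <= 1208.
+	 */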
+	task_load = task_h_load(p);
+
+	this_eff_load = 100;
+	this_eff_load *= prev_stats.capacity;
+
+	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+	prev_eff_load *= this_stats.capacity;
+
+	this_eff_load *= this_stats.load + task_load;
+	prev_eff_load *= prev_stats.load - task_load;
+
+	return this_eff_load <= prev_eff_load;
+}
+
 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
 	int this_cpu = smp_processor_id();
-	bool affine = false;
+	bool affine;
 
 	/*
-	 * Common case: CPUs are in the same socket, and select_idle_sibling()
-	 * will do its thing regardless of what we return:
+	 * Default to no affine wakeups; wake_affine() should not effect a task
+	 * placement the load-balancer feels inclined to undo. The conservative
+	 * option is therefore to not move tasks when they wake up.
 	 */
-	if (cpus_share_cache(prev_cpu, this_cpu))
-		affine = true;
-	else
-		affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
+	affine = false;
+
+	/*
+	 * If the wakeup is across cache domains, try to evaluate whether
+	 * movement makes sense, otherwise rely on select_idle_sibling() to do
+	 * placement inside the cache domain.
+	 */
+	if (!cpus_share_cache(prev_cpu, this_cpu))
+		affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
 
 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
 	if (affine) {
@@ -7121,6 +7155,7 @@ struct sg_lb_stats {
 struct sd_lb_stats {
 	struct sched_group *busiest;	/* Busiest group in this sd */
 	struct sched_group *local;	/* Local group in this sd */
+	unsigned long total_running;
 	unsigned long total_load;	/* Total load of all groups in sd */
 	unsigned long total_capacity;	/* Total capacity of all groups in sd */
 	unsigned long avg_load;	/* Average load across all groups in sd */
@@ -7140,6 +7175,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
 	*sds = (struct sd_lb_stats){
 		.busiest = NULL,
 		.local = NULL,
+		.total_running = 0UL,
 		.total_load = 0UL,
 		.total_capacity = 0UL,
 		.busiest_stat = {
@@ -7575,6 +7611,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
+	struct sched_domain_shared *shared = env->sd->shared;
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
@@ -7631,6 +7668,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 
 next_group:
 	/* Now, start updating sd_lb_stats */
+	sds->total_running += sgs->sum_nr_running;
 	sds->total_load += sgs->group_load;
 	sds->total_capacity += sgs->group_capacity;
 
@@ -7646,6 +7684,21 @@ next_group:
 		env->dst_rq->rd->overload = overload;
 	}
 
+	if (!shared)
+		return;
+
+	/*
+	 * Since these are sums over groups they can contain some CPUs
+	 * multiple times for the NUMA domains.
+	 *
+	 * Currently only wake_affine_llc() and find_busiest_group() use
+	 * these numbers, and only the latter is affected by this problem.
+	 *
+	 * XXX fix that.
+	 */
+	WRITE_ONCE(shared->nr_running, sds->total_running);
+	WRITE_ONCE(shared->load, sds->total_load);
+	WRITE_ONCE(shared->capacity, sds->total_capacity);
 }
 
 /**
@@ -7875,6 +7928,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	if (!sds.busiest || busiest->sum_nr_running == 0)
 		goto out_balanced;
 
+	/* XXX broken for overlapping NUMA groups */
 	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
 						/ sds.total_capacity;
 