@@ -5356,91 +5356,62 @@ static int wake_wide(struct task_struct *p)
 	return 1;
 }

-struct llc_stats {
-	unsigned long	nr_running;
-	unsigned long	load;
-	unsigned long	capacity;
-	int		has_capacity;
-};
+/*
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
+ *
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
+ *			will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ *			  scheduling latency of the CPUs. This seems to work
+ *			  for the overloaded case.
+ */

-static bool get_llc_stats(struct llc_stats *stats, int cpu)
+static bool
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+		 int this_cpu, int prev_cpu, int sync)
 {
-	struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-
-	if (!sds)
-		return false;
+	if (idle_cpu(this_cpu))
+		return true;

-	stats->nr_running	= READ_ONCE(sds->nr_running);
-	stats->load		= READ_ONCE(sds->load);
-	stats->capacity		= READ_ONCE(sds->capacity);
-	stats->has_capacity	= stats->nr_running < per_cpu(sd_llc_size, cpu);
+	if (sync && cpu_rq(this_cpu)->nr_running == 1)
+		return true;

-	return true;
+	return false;
 }

-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- *
- * Since we're running on 'stale' values, we might in fact create an imbalance
- * but recomputing these values is expensive, as that'd mean iteration 2 cache
- * domains worth of CPUs.
- */
 static bool
-wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
-		int this_cpu, int prev_cpu, int sync)
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
+		   int this_cpu, int prev_cpu, int sync)
 {
-	struct llc_stats prev_stats, this_stats;
 	s64 this_eff_load, prev_eff_load;
 	unsigned long task_load;

-	if (!get_llc_stats(&prev_stats, prev_cpu) ||
-	    !get_llc_stats(&this_stats, this_cpu))
-		return false;
+	this_eff_load = target_load(this_cpu, sd->wake_idx);
+	prev_eff_load = source_load(prev_cpu, sd->wake_idx);

-	/*
-	 * If sync wakeup then subtract the (maximum possible)
-	 * effect of the currently running task from the load
-	 * of the current LLC.
-	 */
 	if (sync) {
 		unsigned long current_load = task_h_load(current);

-		/* in this case load hits 0 and this LLC is considered 'idle' */
-		if (current_load > this_stats.load)
+		if (current_load > this_eff_load)
 			return true;

-		this_stats.load -= current_load;
+		this_eff_load -= current_load;
 	}

-	/*
-	 * The has_capacity stuff is not SMT aware, but by trying to balance
-	 * the nr_running on both ends we try and fill the domain at equal
-	 * rates, thereby first consuming cores before siblings.
-	 */
-
-	/* if the old cache has capacity, stay there */
-	if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
-		return false;
-
-	/* if this cache has capacity, come here */
-	if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
-		return true;
-
-	/*
-	 * Check to see if we can move the load without causing too much
-	 * imbalance.
-	 */
 	task_load = task_h_load(p);

-	this_eff_load = 100;
-	this_eff_load *= prev_stats.capacity;
-
-	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
-	prev_eff_load *= this_stats.capacity;
+	this_eff_load += task_load;
+	if (sched_feat(WA_BIAS))
+		this_eff_load *= 100;
+	this_eff_load *= capacity_of(prev_cpu);

-	this_eff_load *= this_stats.load + task_load;
-	prev_eff_load *= prev_stats.load - task_load;
+	prev_eff_load -= task_load;
+	if (sched_feat(WA_BIAS))
+		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+	prev_eff_load *= capacity_of(this_cpu);

 	return this_eff_load <= prev_eff_load;
 }
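
To make the new decision concrete, here is a small stand-alone sketch of the wake_affine_weight() arithmetic with made-up numbers. All values are illustrative, WA_BIAS is assumed to be enabled, and none of this is part of the patch itself:

/*
 * Illustration only: feeds assumed load/capacity figures into the same
 * cross-multiplied comparison wake_affine_weight() performs above.
 */
#include <stdio.h>

int main(void)
{
	long long this_eff_load = 1024;	/* assumed target_load(this_cpu, wake_idx) */
	long long prev_eff_load = 2048;	/* assumed source_load(prev_cpu, wake_idx) */
	long long task_load     = 512;	/* assumed task_h_load(p) */
	long long this_capacity = 1024;	/* assumed capacity_of(this_cpu) */
	long long prev_capacity = 1024;	/* assumed capacity_of(prev_cpu) */
	int imbalance_pct       = 125;	/* assumed sd->imbalance_pct */

	/* Waking-CPU side: add the task, apply the flat WA_BIAS factor,
	 * and scale by the *other* CPU's capacity. */
	this_eff_load += task_load;
	this_eff_load *= 100;
	this_eff_load *= prev_capacity;

	/* Previous-CPU side: remove the task and apply the larger,
	 * imbalance_pct-based factor. */
	prev_eff_load -= task_load;
	prev_eff_load *= 100 + (imbalance_pct - 100) / 2;
	prev_eff_load *= this_capacity;

	/* Same test as the kernel function: pull the task to the waking
	 * CPU only if that side ends up no more loaded than prev_cpu. */
	printf("this_eff_load=%lld prev_eff_load=%lld -> %s\n",
	       this_eff_load, prev_eff_load,
	       this_eff_load <= prev_eff_load ? "wake affine" : "stay on prev_cpu");
	return 0;
}

Multiplying each side by the opposite CPU's capacity compares the two load/capacity ratios without a division, and the imbalance_pct margin on the prev_cpu side tolerates a modest imbalance in favour of running the task next to its waker.
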
@@ -5449,22 +5420,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 		       int prev_cpu, int sync)
 {
 	int this_cpu = smp_processor_id();
-	bool affine;
+	bool affine = false;

-	/*
-	 * Default to no affine wakeups; wake_affine() should not effect a task
-	 * placement the load-balancer feels inclined to undo. The conservative
-	 * option is therefore to not move tasks when they wake up.
-	 */
-	affine = false;
+	if (sched_feat(WA_IDLE) && !affine)
+		affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);

-	/*
-	 * If the wakeup is across cache domains, try to evaluate if movement
-	 * makes sense, otherwise rely on select_idle_siblings() to do
-	 * placement inside the cache domain.
-	 */
-	if (!cpus_share_cache(prev_cpu, this_cpu))
-		affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
+	if (sched_feat(WA_WEIGHT) && !affine)
+		affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);

 	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
 	if (affine) {
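
The WA_IDLE and WA_WEIGHT gates above (and WA_BIAS inside wake_affine_weight()) put each heuristic behind a scheduler feature bit so it can be switched off independently. The features.h hunk is not part of this excerpt; presumably it adds the usual SCHED_FEAT() entries, along the lines of:

/*
 * Sketch of the expected kernel/sched/features.h additions (not shown in this
 * excerpt). With CONFIG_SCHED_DEBUG, each bit can be flipped at runtime via
 * /sys/kernel/debug/sched_features.
 */
SCHED_FEAT(WA_IDLE, true)
SCHED_FEAT(WA_WEIGHT, true)
SCHED_FEAT(WA_BIAS, true)

Both calls are additionally guarded by !affine, so wake_affine_weight() is only consulted when wake_affine_idle() has not already chosen the waking CPU.
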
@@ -7600,7 +7562,6 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
  */
 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
 {
-	struct sched_domain_shared *shared = env->sd->shared;
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
@@ -7672,22 +7633,6 @@ next_group:
 		if (env->dst_rq->rd->overload != overload)
 			env->dst_rq->rd->overload = overload;
 	}
-
-	if (!shared)
-		return;
-
-	/*
-	 * Since these are sums over groups they can contain some CPUs
-	 * multiple times for the NUMA domains.
-	 *
-	 * Currently only wake_affine_llc() and find_busiest_group()
-	 * uses these numbers, only the last is affected by this problem.
-	 *
-	 * XXX fix that.
-	 */
-	WRITE_ONCE(shared->nr_running, sds->total_running);
-	WRITE_ONCE(shared->load, sds->total_load);
-	WRITE_ONCE(shared->capacity, sds->total_capacity);
 }

 /**
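
The WRITE_ONCE() stores dropped from update_sd_lb_stats() were the producer half of the lockless LLC-wide snapshot that the (also removed) get_llc_stats() consumed with READ_ONCE(); once the reader is gone the writer is dead code, which is why both sides leave in the same patch. Below is a stand-alone illustration of that pairing; the structure and values are hypothetical, and the macros merely mimic the kernel's single-copy-atomic accessors:

#include <stdio.h>

/* Userspace stand-ins for the kernel's READ_ONCE()/WRITE_ONCE(). */
#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))

/* Hypothetical shared snapshot, akin to the fields get_llc_stats() read. */
static struct { unsigned long nr_running, load, capacity; } shared;

int main(void)
{
	/* Producer side: what the deleted update_sd_lb_stats() lines did. */
	WRITE_ONCE(shared.nr_running, 3);
	WRITE_ONCE(shared.load, 2048);
	WRITE_ONCE(shared.capacity, 4096);

	/* Consumer side: what the deleted get_llc_stats() did. */
	printf("nr_running=%lu load=%lu capacity=%lu\n",
	       READ_ONCE(shared.nr_running),
	       READ_ONCE(shared.load),
	       READ_ONCE(shared.capacity));
	return 0;
}
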
@@ -8097,6 +8042,13 @@ static int should_we_balance(struct lb_env *env)
 	struct sched_group *sg = env->sd->groups;
 	int cpu, balance_cpu = -1;

+	/*
+	 * Ensure the balancing environment is consistent; can happen
+	 * when the softirq triggers 'during' hotplug.
+	 */
+	if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
+		return 0;
+
 	/*
 	 * In the newly idle case, we will allow all the cpu's
 	 * to do the newly idle load balance.