|
@@ -670,6 +670,7 @@ static int select_idle_sibling(struct task_struct *p, int cpu);
|
|
static unsigned long task_h_load(struct task_struct *p);
|
|
static unsigned long task_h_load(struct task_struct *p);
|
|
|
|
|
|
static inline void __update_task_entity_contrib(struct sched_entity *se);
|
|
static inline void __update_task_entity_contrib(struct sched_entity *se);
|
|
|
|
+static inline void __update_task_entity_utilization(struct sched_entity *se);
|
|
|
|
|
|
/* Give new task start runnable values to heavy its load in infant time */
|
|
/* Give new task start runnable values to heavy its load in infant time */
|
|
void init_task_runnable_average(struct task_struct *p)
|
|
void init_task_runnable_average(struct task_struct *p)
|
|
@@ -677,9 +678,10 @@ void init_task_runnable_average(struct task_struct *p)
|
|
u32 slice;
|
|
u32 slice;
|
|
|
|
|
|
slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
|
|
slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
|
|
- p->se.avg.runnable_avg_sum = slice;
|
|
|
|
- p->se.avg.runnable_avg_period = slice;
|
|
|
|
|
|
+ p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice;
|
|
|
|
+ p->se.avg.avg_period = slice;
|
|
__update_task_entity_contrib(&p->se);
|
|
__update_task_entity_contrib(&p->se);
|
|
|
|
+ __update_task_entity_utilization(&p->se);
|
|
}
|
|
}
|
|
#else
|
|
#else
|
|
void init_task_runnable_average(struct task_struct *p)
|
|
void init_task_runnable_average(struct task_struct *p)
|
|
@@ -1196,9 +1198,11 @@ static void task_numa_assign(struct task_numa_env *env,
|
|
static bool load_too_imbalanced(long src_load, long dst_load,
|
|
static bool load_too_imbalanced(long src_load, long dst_load,
|
|
struct task_numa_env *env)
|
|
struct task_numa_env *env)
|
|
{
|
|
{
|
|
- long imb, old_imb;
|
|
|
|
- long orig_src_load, orig_dst_load;
|
|
|
|
long src_capacity, dst_capacity;
|
|
long src_capacity, dst_capacity;
|
|
|
|
+ long orig_src_load;
|
|
|
|
+ long load_a, load_b;
|
|
|
|
+ long moved_load;
|
|
|
|
+ long imb;
|
|
|
|
|
|
/*
|
|
/*
|
|
* The load is corrected for the CPU capacity available on each node.
|
|
* The load is corrected for the CPU capacity available on each node.
|
|
@@ -1211,30 +1215,39 @@ static bool load_too_imbalanced(long src_load, long dst_load,
|
|
dst_capacity = env->dst_stats.compute_capacity;
|
|
dst_capacity = env->dst_stats.compute_capacity;
|
|
|
|
|
|
/* We care about the slope of the imbalance, not the direction. */
|
|
/* We care about the slope of the imbalance, not the direction. */
|
|
- if (dst_load < src_load)
|
|
|
|
- swap(dst_load, src_load);
|
|
|
|
|
|
+ load_a = dst_load;
|
|
|
|
+ load_b = src_load;
|
|
|
|
+ if (load_a < load_b)
|
|
|
|
+ swap(load_a, load_b);
|
|
|
|
|
|
/* Is the difference below the threshold? */
|
|
/* Is the difference below the threshold? */
|
|
- imb = dst_load * src_capacity * 100 -
|
|
|
|
- src_load * dst_capacity * env->imbalance_pct;
|
|
|
|
|
|
+ imb = load_a * src_capacity * 100 -
|
|
|
|
+ load_b * dst_capacity * env->imbalance_pct;
|
|
if (imb <= 0)
|
|
if (imb <= 0)
|
|
return false;
|
|
return false;
|
|
|
|
|
|
/*
|
|
/*
|
|
* The imbalance is above the allowed threshold.
|
|
* The imbalance is above the allowed threshold.
|
|
- * Compare it with the old imbalance.
|
|
|
|
|
|
+ * Allow a move that brings us closer to a balanced situation,
|
|
|
|
+ * without moving things past the point of balance.
|
|
*/
|
|
*/
|
|
orig_src_load = env->src_stats.load;
|
|
orig_src_load = env->src_stats.load;
|
|
- orig_dst_load = env->dst_stats.load;
|
|
|
|
|
|
|
|
- if (orig_dst_load < orig_src_load)
|
|
|
|
- swap(orig_dst_load, orig_src_load);
|
|
|
|
-
|
|
|
|
- old_imb = orig_dst_load * src_capacity * 100 -
|
|
|
|
- orig_src_load * dst_capacity * env->imbalance_pct;
|
|
|
|
|
|
+ /*
|
|
|
|
+ * In a task swap, there will be one load moving from src to dst,
|
|
|
|
+ * and another moving back. This is the net sum of both moves.
|
|
|
|
+ * A simple task move will always have a positive value.
|
|
|
|
+ * Allow the move if it brings the system closer to a balanced
|
|
|
|
+ * situation, without crossing over the balance point.
|
|
|
|
+ */
|
|
|
|
+ moved_load = orig_src_load - src_load;
|
|
|
|
|
|
- /* Would this change make things worse? */
|
|
|
|
- return (imb > old_imb);
|
|
|
|
|
|
+ if (moved_load > 0)
|
|
|
|
+ /* Moving src -> dst. Did we overshoot balance? */
|
|
|
|
+ return src_load * dst_capacity < dst_load * src_capacity;
|
|
|
|
+ else
|
|
|
|
+ /* Moving dst -> src. Did we overshoot balance? */
|
|
|
|
+ return dst_load * src_capacity < src_load * dst_capacity;
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
/*
|
|
@@ -1675,7 +1688,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
|
|
*period = now - p->last_task_numa_placement;
|
|
*period = now - p->last_task_numa_placement;
|
|
} else {
|
|
} else {
|
|
delta = p->se.avg.runnable_avg_sum;
|
|
delta = p->se.avg.runnable_avg_sum;
|
|
- *period = p->se.avg.runnable_avg_period;
|
|
|
|
|
|
+ *period = p->se.avg.avg_period;
|
|
}
|
|
}
|
|
|
|
|
|
p->last_sum_exec_runtime = runtime;
|
|
p->last_sum_exec_runtime = runtime;
|
|
@@ -1765,6 +1778,8 @@ static int preferred_group_nid(struct task_struct *p, int nid)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
/* Next round, evaluate the nodes within max_group. */
|
|
/* Next round, evaluate the nodes within max_group. */
|
|
|
|
+ if (!max_faults)
|
|
|
|
+ break;
|
|
nodes = max_group;
|
|
nodes = max_group;
|
|
}
|
|
}
|
|
return nid;
|
|
return nid;
|
|
@@ -2503,13 +2518,15 @@ static u32 __compute_runnable_contrib(u64 n)
|
|
* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
|
|
* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
|
|
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
|
|
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
|
|
*/
|
|
*/
|
|
-static __always_inline int __update_entity_runnable_avg(u64 now,
|
|
|
|
|
|
+static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
|
|
struct sched_avg *sa,
|
|
struct sched_avg *sa,
|
|
- int runnable)
|
|
|
|
|
|
+ int runnable,
|
|
|
|
+ int running)
|
|
{
|
|
{
|
|
u64 delta, periods;
|
|
u64 delta, periods;
|
|
u32 runnable_contrib;
|
|
u32 runnable_contrib;
|
|
int delta_w, decayed = 0;
|
|
int delta_w, decayed = 0;
|
|
|
|
+ unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
|
|
|
|
|
|
delta = now - sa->last_runnable_update;
|
|
delta = now - sa->last_runnable_update;
|
|
/*
|
|
/*
|
|
@@ -2531,7 +2548,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
|
|
sa->last_runnable_update = now;
|
|
sa->last_runnable_update = now;
|
|
|
|
|
|
/* delta_w is the amount already accumulated against our next period */
|
|
/* delta_w is the amount already accumulated against our next period */
|
|
- delta_w = sa->runnable_avg_period % 1024;
|
|
|
|
|
|
+ delta_w = sa->avg_period % 1024;
|
|
if (delta + delta_w >= 1024) {
|
|
if (delta + delta_w >= 1024) {
|
|
/* period roll-over */
|
|
/* period roll-over */
|
|
decayed = 1;
|
|
decayed = 1;
|
|
@@ -2544,7 +2561,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
|
|
delta_w = 1024 - delta_w;
|
|
delta_w = 1024 - delta_w;
|
|
if (runnable)
|
|
if (runnable)
|
|
sa->runnable_avg_sum += delta_w;
|
|
sa->runnable_avg_sum += delta_w;
|
|
- sa->runnable_avg_period += delta_w;
|
|
|
|
|
|
+ if (running)
|
|
|
|
+ sa->running_avg_sum += delta_w * scale_freq
|
|
|
|
+ >> SCHED_CAPACITY_SHIFT;
|
|
|
|
+ sa->avg_period += delta_w;
|
|
|
|
|
|
delta -= delta_w;
|
|
delta -= delta_w;
|
|
|
|
|
|
@@ -2554,20 +2574,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
|
|
|
|
|
|
sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
|
|
sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
|
|
periods + 1);
|
|
periods + 1);
|
|
- sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
|
|
|
|
|
|
+ sa->running_avg_sum = decay_load(sa->running_avg_sum,
|
|
|
|
+ periods + 1);
|
|
|
|
+ sa->avg_period = decay_load(sa->avg_period,
|
|
periods + 1);
|
|
periods + 1);
|
|
|
|
|
|
/* Efficiently calculate \sum (1..n_period) 1024*y^i */
|
|
/* Efficiently calculate \sum (1..n_period) 1024*y^i */
|
|
runnable_contrib = __compute_runnable_contrib(periods);
|
|
runnable_contrib = __compute_runnable_contrib(periods);
|
|
if (runnable)
|
|
if (runnable)
|
|
sa->runnable_avg_sum += runnable_contrib;
|
|
sa->runnable_avg_sum += runnable_contrib;
|
|
- sa->runnable_avg_period += runnable_contrib;
|
|
|
|
|
|
+ if (running)
|
|
|
|
+ sa->running_avg_sum += runnable_contrib * scale_freq
|
|
|
|
+ >> SCHED_CAPACITY_SHIFT;
|
|
|
|
+ sa->avg_period += runnable_contrib;
|
|
}
|
|
}
|
|
|
|
|
|
/* Remainder of delta accrued against u_0` */
|
|
/* Remainder of delta accrued against u_0` */
|
|
if (runnable)
|
|
if (runnable)
|
|
sa->runnable_avg_sum += delta;
|
|
sa->runnable_avg_sum += delta;
|
|
- sa->runnable_avg_period += delta;
|
|
|
|
|
|
+ if (running)
|
|
|
|
+ sa->running_avg_sum += delta * scale_freq
|
|
|
|
+ >> SCHED_CAPACITY_SHIFT;
|
|
|
|
+ sa->avg_period += delta;
|
|
|
|
|
|
return decayed;
|
|
return decayed;
|
|
}
|
|
}
|
|
@@ -2584,6 +2612,8 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
|
|
return 0;
|
|
return 0;
|
|
|
|
|
|
se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
|
|
se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
|
|
|
|
+ se->avg.utilization_avg_contrib =
|
|
|
|
+ decay_load(se->avg.utilization_avg_contrib, decays);
|
|
|
|
|
|
return decays;
|
|
return decays;
|
|
}
|
|
}
|
|
@@ -2619,7 +2649,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
|
|
|
|
|
|
/* The fraction of a cpu used by this cfs_rq */
|
|
/* The fraction of a cpu used by this cfs_rq */
|
|
contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
|
|
contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
|
|
- sa->runnable_avg_period + 1);
|
|
|
|
|
|
+ sa->avg_period + 1);
|
|
contrib -= cfs_rq->tg_runnable_contrib;
|
|
contrib -= cfs_rq->tg_runnable_contrib;
|
|
|
|
|
|
if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
|
|
if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
|
|
@@ -2672,7 +2702,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
|
|
|
|
|
|
static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
|
|
static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
|
|
{
|
|
{
|
|
- __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
|
|
|
|
|
|
+ __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg,
|
|
|
|
+ runnable, runnable);
|
|
__update_tg_runnable_avg(&rq->avg, &rq->cfs);
|
|
__update_tg_runnable_avg(&rq->avg, &rq->cfs);
|
|
}
|
|
}
|
|
#else /* CONFIG_FAIR_GROUP_SCHED */
|
|
#else /* CONFIG_FAIR_GROUP_SCHED */
|
|
@@ -2690,7 +2721,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
|
|
|
|
|
|
/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
|
|
/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
|
|
contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
|
|
contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
|
|
- contrib /= (se->avg.runnable_avg_period + 1);
|
|
|
|
|
|
+ contrib /= (se->avg.avg_period + 1);
|
|
se->avg.load_avg_contrib = scale_load(contrib);
|
|
se->avg.load_avg_contrib = scale_load(contrib);
|
|
}
|
|
}
|
|
|
|
|
|
@@ -2709,6 +2740,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
|
|
return se->avg.load_avg_contrib - old_contrib;
|
|
return se->avg.load_avg_contrib - old_contrib;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+
|
|
|
|
+static inline void __update_task_entity_utilization(struct sched_entity *se)
|
|
|
|
+{
|
|
|
|
+ u32 contrib;
|
|
|
|
+
|
|
|
|
+ /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
|
|
|
|
+ contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE);
|
|
|
|
+ contrib /= (se->avg.avg_period + 1);
|
|
|
|
+ se->avg.utilization_avg_contrib = scale_load(contrib);
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static long __update_entity_utilization_avg_contrib(struct sched_entity *se)
|
|
|
|
+{
|
|
|
|
+ long old_contrib = se->avg.utilization_avg_contrib;
|
|
|
|
+
|
|
|
|
+ if (entity_is_task(se))
|
|
|
|
+ __update_task_entity_utilization(se);
|
|
|
|
+ else
|
|
|
|
+ se->avg.utilization_avg_contrib =
|
|
|
|
+ group_cfs_rq(se)->utilization_load_avg;
|
|
|
|
+
|
|
|
|
+ return se->avg.utilization_avg_contrib - old_contrib;
|
|
|
|
+}
|
|
|
|
+
|
|
static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
|
|
static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
|
|
long load_contrib)
|
|
long load_contrib)
|
|
{
|
|
{
|
|
@@ -2725,7 +2780,8 @@ static inline void update_entity_load_avg(struct sched_entity *se,
|
|
int update_cfs_rq)
|
|
int update_cfs_rq)
|
|
{
|
|
{
|
|
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
- long contrib_delta;
|
|
|
|
|
|
+ long contrib_delta, utilization_delta;
|
|
|
|
+ int cpu = cpu_of(rq_of(cfs_rq));
|
|
u64 now;
|
|
u64 now;
|
|
|
|
|
|
/*
|
|
/*
|
|
@@ -2737,18 +2793,22 @@ static inline void update_entity_load_avg(struct sched_entity *se,
|
|
else
|
|
else
|
|
now = cfs_rq_clock_task(group_cfs_rq(se));
|
|
now = cfs_rq_clock_task(group_cfs_rq(se));
|
|
|
|
|
|
- if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
|
|
|
|
|
|
+ if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq,
|
|
|
|
+ cfs_rq->curr == se))
|
|
return;
|
|
return;
|
|
|
|
|
|
contrib_delta = __update_entity_load_avg_contrib(se);
|
|
contrib_delta = __update_entity_load_avg_contrib(se);
|
|
|
|
+ utilization_delta = __update_entity_utilization_avg_contrib(se);
|
|
|
|
|
|
if (!update_cfs_rq)
|
|
if (!update_cfs_rq)
|
|
return;
|
|
return;
|
|
|
|
|
|
- if (se->on_rq)
|
|
|
|
|
|
+ if (se->on_rq) {
|
|
cfs_rq->runnable_load_avg += contrib_delta;
|
|
cfs_rq->runnable_load_avg += contrib_delta;
|
|
- else
|
|
|
|
|
|
+ cfs_rq->utilization_load_avg += utilization_delta;
|
|
|
|
+ } else {
|
|
subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
|
|
subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
|
|
|
|
+ }
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
/*
|
|
@@ -2823,6 +2883,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
|
|
}
|
|
}
|
|
|
|
|
|
cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
|
|
cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
|
|
|
|
+ cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib;
|
|
/* we force update consideration on load-balancer moves */
|
|
/* we force update consideration on load-balancer moves */
|
|
update_cfs_rq_blocked_load(cfs_rq, !wakeup);
|
|
update_cfs_rq_blocked_load(cfs_rq, !wakeup);
|
|
}
|
|
}
|
|
@@ -2841,6 +2902,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
|
|
update_cfs_rq_blocked_load(cfs_rq, !sleep);
|
|
update_cfs_rq_blocked_load(cfs_rq, !sleep);
|
|
|
|
|
|
cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
|
|
cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
|
|
|
|
+ cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib;
|
|
if (sleep) {
|
|
if (sleep) {
|
|
cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
|
|
cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
|
|
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
|
|
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
|
|
@@ -3178,6 +3240,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
*/
|
|
*/
|
|
update_stats_wait_end(cfs_rq, se);
|
|
update_stats_wait_end(cfs_rq, se);
|
|
__dequeue_entity(cfs_rq, se);
|
|
__dequeue_entity(cfs_rq, se);
|
|
|
|
+ update_entity_load_avg(se, 1);
|
|
}
|
|
}
|
|
|
|
|
|
update_stats_curr_start(cfs_rq, se);
|
|
update_stats_curr_start(cfs_rq, se);
|
|
@@ -4304,6 +4367,11 @@ static unsigned long capacity_of(int cpu)
|
|
return cpu_rq(cpu)->cpu_capacity;
|
|
return cpu_rq(cpu)->cpu_capacity;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+static unsigned long capacity_orig_of(int cpu)
|
|
|
|
+{
|
|
|
|
+ return cpu_rq(cpu)->cpu_capacity_orig;
|
|
|
|
+}
|
|
|
|
+
|
|
static unsigned long cpu_avg_load_per_task(int cpu)
|
|
static unsigned long cpu_avg_load_per_task(int cpu)
|
|
{
|
|
{
|
|
struct rq *rq = cpu_rq(cpu);
|
|
struct rq *rq = cpu_rq(cpu);
|
|
@@ -4717,6 +4785,33 @@ next:
|
|
done:
|
|
done:
|
|
return target;
|
|
return target;
|
|
}
|
|
}
|
|
|
|
+/*
|
|
|
|
+ * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
|
|
|
|
+ * tasks. The unit of the return value must be the one of capacity so we can
|
|
|
|
+ * compare the usage with the capacity of the CPU that is available for CFS
|
|
|
|
+ * tasks (i.e. cpu_capacity).
|
|
|
|
+ * cfs.utilization_load_avg is the sum of running time of runnable tasks on a
|
|
|
|
+ * CPU. It represents the amount of utilization of a CPU in the range
|
|
|
|
+ * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full
|
|
|
|
+ * capacity of the CPU because it's about the running time on this CPU.
|
|
|
|
+ * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE
|
|
|
|
+ * because of unfortunate rounding in avg_period and running_avg_sum or just
|
|
|
|
+ * after migrating tasks until the average stabilizes with the new running
|
|
|
|
+ * time. So we need to check that the usage stays within the range
|
|
|
|
+ * [0..cpu_capacity_orig] and cap if necessary.
|
|
|
|
+ * Without capping the usage, a group could be seen as overloaded (CPU0 usage
|
|
|
|
+ * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
|
|
|
|
+ */
|
|
|
|
+static int get_cpu_usage(int cpu)
|
|
|
|
+{
|
|
|
|
+ unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
|
|
|
|
+ unsigned long capacity = capacity_orig_of(cpu);
|
|
|
|
+
|
|
|
|
+ if (usage >= SCHED_LOAD_SCALE)
|
|
|
|
+ return capacity;
|
|
|
|
+
|
|
|
|
+ return (usage * capacity) >> SCHED_LOAD_SHIFT;
|
|
|
|
+}
|
|
|
|
|
|
/*
|
|
/*
|
|
* select_task_rq_fair: Select target runqueue for the waking task in domains
|
|
* select_task_rq_fair: Select target runqueue for the waking task in domains
|
|
@@ -5843,12 +5938,12 @@ struct sg_lb_stats {
|
|
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
|
|
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
|
|
unsigned long load_per_task;
|
|
unsigned long load_per_task;
|
|
unsigned long group_capacity;
|
|
unsigned long group_capacity;
|
|
|
|
+ unsigned long group_usage; /* Total usage of the group */
|
|
unsigned int sum_nr_running; /* Nr tasks running in the group */
|
|
unsigned int sum_nr_running; /* Nr tasks running in the group */
|
|
- unsigned int group_capacity_factor;
|
|
|
|
unsigned int idle_cpus;
|
|
unsigned int idle_cpus;
|
|
unsigned int group_weight;
|
|
unsigned int group_weight;
|
|
enum group_type group_type;
|
|
enum group_type group_type;
|
|
- int group_has_free_capacity;
|
|
|
|
|
|
+ int group_no_capacity;
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
unsigned int nr_numa_running;
|
|
unsigned int nr_numa_running;
|
|
unsigned int nr_preferred_running;
|
|
unsigned int nr_preferred_running;
|
|
@@ -5919,16 +6014,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
|
|
return load_idx;
|
|
return load_idx;
|
|
}
|
|
}
|
|
|
|
|
|
-static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
|
|
|
|
-{
|
|
|
|
- return SCHED_CAPACITY_SCALE;
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
|
|
|
|
-{
|
|
|
|
- return default_scale_capacity(sd, cpu);
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
|
|
static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
|
|
{
|
|
{
|
|
if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
|
|
if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
|
|
@@ -5945,7 +6030,7 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
|
|
static unsigned long scale_rt_capacity(int cpu)
|
|
static unsigned long scale_rt_capacity(int cpu)
|
|
{
|
|
{
|
|
struct rq *rq = cpu_rq(cpu);
|
|
struct rq *rq = cpu_rq(cpu);
|
|
- u64 total, available, age_stamp, avg;
|
|
|
|
|
|
+ u64 total, used, age_stamp, avg;
|
|
s64 delta;
|
|
s64 delta;
|
|
|
|
|
|
/*
|
|
/*
|
|
@@ -5961,19 +6046,12 @@ static unsigned long scale_rt_capacity(int cpu)
|
|
|
|
|
|
total = sched_avg_period() + delta;
|
|
total = sched_avg_period() + delta;
|
|
|
|
|
|
- if (unlikely(total < avg)) {
|
|
|
|
- /* Ensures that capacity won't end up being negative */
|
|
|
|
- available = 0;
|
|
|
|
- } else {
|
|
|
|
- available = total - avg;
|
|
|
|
- }
|
|
|
|
|
|
+ used = div_u64(avg, total);
|
|
|
|
|
|
- if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
|
|
|
|
- total = SCHED_CAPACITY_SCALE;
|
|
|
|
|
|
+ if (likely(used < SCHED_CAPACITY_SCALE))
|
|
|
|
+ return SCHED_CAPACITY_SCALE - used;
|
|
|
|
|
|
- total >>= SCHED_CAPACITY_SHIFT;
|
|
|
|
-
|
|
|
|
- return div_u64(available, total);
|
|
|
|
|
|
+ return 1;
|
|
}
|
|
}
|
|
|
|
|
|
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
|
|
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
|
|
@@ -5988,14 +6066,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
|
|
|
|
|
|
capacity >>= SCHED_CAPACITY_SHIFT;
|
|
capacity >>= SCHED_CAPACITY_SHIFT;
|
|
|
|
|
|
- sdg->sgc->capacity_orig = capacity;
|
|
|
|
-
|
|
|
|
- if (sched_feat(ARCH_CAPACITY))
|
|
|
|
- capacity *= arch_scale_freq_capacity(sd, cpu);
|
|
|
|
- else
|
|
|
|
- capacity *= default_scale_capacity(sd, cpu);
|
|
|
|
-
|
|
|
|
- capacity >>= SCHED_CAPACITY_SHIFT;
|
|
|
|
|
|
+ cpu_rq(cpu)->cpu_capacity_orig = capacity;
|
|
|
|
|
|
capacity *= scale_rt_capacity(cpu);
|
|
capacity *= scale_rt_capacity(cpu);
|
|
capacity >>= SCHED_CAPACITY_SHIFT;
|
|
capacity >>= SCHED_CAPACITY_SHIFT;
|
|
@@ -6011,7 +6082,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
|
|
{
|
|
{
|
|
struct sched_domain *child = sd->child;
|
|
struct sched_domain *child = sd->child;
|
|
struct sched_group *group, *sdg = sd->groups;
|
|
struct sched_group *group, *sdg = sd->groups;
|
|
- unsigned long capacity, capacity_orig;
|
|
|
|
|
|
+ unsigned long capacity;
|
|
unsigned long interval;
|
|
unsigned long interval;
|
|
|
|
|
|
interval = msecs_to_jiffies(sd->balance_interval);
|
|
interval = msecs_to_jiffies(sd->balance_interval);
|
|
@@ -6023,7 +6094,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
|
|
return;
|
|
return;
|
|
}
|
|
}
|
|
|
|
|
|
- capacity_orig = capacity = 0;
|
|
|
|
|
|
+ capacity = 0;
|
|
|
|
|
|
if (child->flags & SD_OVERLAP) {
|
|
if (child->flags & SD_OVERLAP) {
|
|
/*
|
|
/*
|
|
@@ -6043,19 +6114,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
|
|
* Use capacity_of(), which is set irrespective of domains
|
|
* Use capacity_of(), which is set irrespective of domains
|
|
* in update_cpu_capacity().
|
|
* in update_cpu_capacity().
|
|
*
|
|
*
|
|
- * This avoids capacity/capacity_orig from being 0 and
|
|
|
|
|
|
+ * This prevents capacity from being 0 and
|
|
* causing divide-by-zero issues on boot.
|
|
* causing divide-by-zero issues on boot.
|
|
- *
|
|
|
|
- * Runtime updates will correct capacity_orig.
|
|
|
|
*/
|
|
*/
|
|
if (unlikely(!rq->sd)) {
|
|
if (unlikely(!rq->sd)) {
|
|
- capacity_orig += capacity_of(cpu);
|
|
|
|
capacity += capacity_of(cpu);
|
|
capacity += capacity_of(cpu);
|
|
continue;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
|
|
sgc = rq->sd->groups->sgc;
|
|
sgc = rq->sd->groups->sgc;
|
|
- capacity_orig += sgc->capacity_orig;
|
|
|
|
capacity += sgc->capacity;
|
|
capacity += sgc->capacity;
|
|
}
|
|
}
|
|
} else {
|
|
} else {
|
|
@@ -6066,39 +6133,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
|
|
|
|
|
|
group = child->groups;
|
|
group = child->groups;
|
|
do {
|
|
do {
|
|
- capacity_orig += group->sgc->capacity_orig;
|
|
|
|
capacity += group->sgc->capacity;
|
|
capacity += group->sgc->capacity;
|
|
group = group->next;
|
|
group = group->next;
|
|
} while (group != child->groups);
|
|
} while (group != child->groups);
|
|
}
|
|
}
|
|
|
|
|
|
- sdg->sgc->capacity_orig = capacity_orig;
|
|
|
|
sdg->sgc->capacity = capacity;
|
|
sdg->sgc->capacity = capacity;
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
/*
|
|
- * Try and fix up capacity for tiny siblings, this is needed when
|
|
|
|
- * things like SD_ASYM_PACKING need f_b_g to select another sibling
|
|
|
|
- * which on its own isn't powerful enough.
|
|
|
|
- *
|
|
|
|
- * See update_sd_pick_busiest() and check_asym_packing().
|
|
|
|
|
|
+ * Check whether the capacity of the rq has been noticeably reduced by side
|
|
|
|
+ * activity. The imbalance_pct is used for the threshold.
|
|
|
|
+ * Return true if the capacity is reduced
|
|
*/
|
|
*/
|
|
static inline int
|
|
static inline int
|
|
-fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
|
|
|
|
|
|
+check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
|
|
{
|
|
{
|
|
- /*
|
|
|
|
- * Only siblings can have significantly less than SCHED_CAPACITY_SCALE
|
|
|
|
- */
|
|
|
|
- if (!(sd->flags & SD_SHARE_CPUCAPACITY))
|
|
|
|
- return 0;
|
|
|
|
-
|
|
|
|
- /*
|
|
|
|
- * If ~90% of the cpu_capacity is still there, we're good.
|
|
|
|
- */
|
|
|
|
- if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
|
|
|
|
- return 1;
|
|
|
|
-
|
|
|
|
- return 0;
|
|
|
|
|
|
+ return ((rq->cpu_capacity * sd->imbalance_pct) <
|
|
|
|
+ (rq->cpu_capacity_orig * 100));
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
/*
|
|
@@ -6136,37 +6188,56 @@ static inline int sg_imbalanced(struct sched_group *group)
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
/*
|
|
- * Compute the group capacity factor.
|
|
|
|
- *
|
|
|
|
- * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
|
|
|
|
- * first dividing out the smt factor and computing the actual number of cores
|
|
|
|
- * and limit unit capacity with that.
|
|
|
|
|
|
+ * group_has_capacity returns true if the group has spare capacity that could
|
|
|
|
+ * be used by some tasks.
|
|
|
|
+ * We consider that a group has spare capacity if the number of tasks is
|
|
|
|
+ * smaller than the number of CPUs or if the usage is lower than the available
|
|
|
|
+ * capacity for CFS tasks.
|
|
|
|
+ * For the latter, we use a threshold to stabilize the state, to take into
|
|
|
|
+ * account the variance of the tasks' load and to return true if the available
|
|
|
|
+ * capacity is meaningful for the load balancer.
|
|
|
|
+ * As an example, an available capacity of 1% can appear but it doesn't bring
|
|
|
|
+ * any benefit for the load balance.
|
|
*/
|
|
*/
|
|
-static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)
|
|
|
|
|
|
+static inline bool
|
|
|
|
+group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
|
|
{
|
|
{
|
|
- unsigned int capacity_factor, smt, cpus;
|
|
|
|
- unsigned int capacity, capacity_orig;
|
|
|
|
|
|
+ if (sgs->sum_nr_running < sgs->group_weight)
|
|
|
|
+ return true;
|
|
|
|
|
|
- capacity = group->sgc->capacity;
|
|
|
|
- capacity_orig = group->sgc->capacity_orig;
|
|
|
|
- cpus = group->group_weight;
|
|
|
|
|
|
+ if ((sgs->group_capacity * 100) >
|
|
|
|
+ (sgs->group_usage * env->sd->imbalance_pct))
|
|
|
|
+ return true;
|
|
|
|
|
|
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
|
|
|
|
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
|
|
|
|
- capacity_factor = cpus / smt; /* cores */
|
|
|
|
|
|
+ return false;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+/*
|
|
|
|
+ * group_is_overloaded returns true if the group has more tasks than it can
|
|
|
|
+ * handle.
|
|
|
|
+ * group_is_overloaded is not equal to !group_has_capacity because a group
|
|
|
|
+ * with the exact right number of tasks, has no more spare capacity but is not
|
|
|
|
+ * overloaded so both group_has_capacity and group_is_overloaded return
|
|
|
|
+ * false.
|
|
|
|
+ */
|
|
|
|
+static inline bool
|
|
|
|
+group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
|
|
|
|
+{
|
|
|
|
+ if (sgs->sum_nr_running <= sgs->group_weight)
|
|
|
|
+ return false;
|
|
|
|
|
|
- capacity_factor = min_t(unsigned,
|
|
|
|
- capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));
|
|
|
|
- if (!capacity_factor)
|
|
|
|
- capacity_factor = fix_small_capacity(env->sd, group);
|
|
|
|
|
|
+ if ((sgs->group_capacity * 100) <
|
|
|
|
+ (sgs->group_usage * env->sd->imbalance_pct))
|
|
|
|
+ return true;
|
|
|
|
|
|
- return capacity_factor;
|
|
|
|
|
|
+ return false;
|
|
}
|
|
}
|
|
|
|
|
|
-static enum group_type
|
|
|
|
-group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
|
|
|
|
|
|
+static enum group_type group_classify(struct lb_env *env,
|
|
|
|
+ struct sched_group *group,
|
|
|
|
+ struct sg_lb_stats *sgs)
|
|
{
|
|
{
|
|
- if (sgs->sum_nr_running > sgs->group_capacity_factor)
|
|
|
|
|
|
+ if (sgs->group_no_capacity)
|
|
return group_overloaded;
|
|
return group_overloaded;
|
|
|
|
|
|
if (sg_imbalanced(group))
|
|
if (sg_imbalanced(group))
|
|
@@ -6204,6 +6275,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
|
|
load = source_load(i, load_idx);
|
|
load = source_load(i, load_idx);
|
|
|
|
|
|
sgs->group_load += load;
|
|
sgs->group_load += load;
|
|
|
|
+ sgs->group_usage += get_cpu_usage(i);
|
|
sgs->sum_nr_running += rq->cfs.h_nr_running;
|
|
sgs->sum_nr_running += rq->cfs.h_nr_running;
|
|
|
|
|
|
if (rq->nr_running > 1)
|
|
if (rq->nr_running > 1)
|
|
@@ -6226,11 +6298,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
|
|
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
|
|
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
|
|
|
|
|
|
sgs->group_weight = group->group_weight;
|
|
sgs->group_weight = group->group_weight;
|
|
- sgs->group_capacity_factor = sg_capacity_factor(env, group);
|
|
|
|
- sgs->group_type = group_classify(group, sgs);
|
|
|
|
|
|
|
|
- if (sgs->group_capacity_factor > sgs->sum_nr_running)
|
|
|
|
- sgs->group_has_free_capacity = 1;
|
|
|
|
|
|
+ sgs->group_no_capacity = group_is_overloaded(env, sgs);
|
|
|
|
+ sgs->group_type = group_classify(env, group, sgs);
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
/**
|
|
@@ -6352,18 +6422,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
|
|
|
|
|
|
/*
|
|
/*
|
|
* In case the child domain prefers tasks go to siblings
|
|
* In case the child domain prefers tasks go to siblings
|
|
- * first, lower the sg capacity factor to one so that we'll try
|
|
|
|
|
|
+ * first, lower the sg capacity so that we'll try
|
|
* and move all the excess tasks away. We lower the capacity
|
|
* and move all the excess tasks away. We lower the capacity
|
|
* of a group only if the local group has the capacity to fit
|
|
* of a group only if the local group has the capacity to fit
|
|
- * these excess tasks, i.e. nr_running < group_capacity_factor. The
|
|
|
|
- * extra check prevents the case where you always pull from the
|
|
|
|
- * heaviest group when it is already under-utilized (possible
|
|
|
|
- * with a large weight task outweighs the tasks on the system).
|
|
|
|
|
|
+ * these excess tasks. The extra check prevents the case where
|
|
|
|
+ * you always pull from the heaviest group when it is already
|
|
|
|
+ * under-utilized (possible when a large weight task outweighs
|
|
|
|
+ * the tasks on the system).
|
|
*/
|
|
*/
|
|
if (prefer_sibling && sds->local &&
|
|
if (prefer_sibling && sds->local &&
|
|
- sds->local_stat.group_has_free_capacity) {
|
|
|
|
- sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
|
|
|
|
- sgs->group_type = group_classify(sg, sgs);
|
|
|
|
|
|
+ group_has_capacity(env, &sds->local_stat) &&
|
|
|
|
+ (sgs->sum_nr_running > 1)) {
|
|
|
|
+ sgs->group_no_capacity = 1;
|
|
|
|
+ sgs->group_type = group_overloaded;
|
|
}
|
|
}
|
|
|
|
|
|
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
|
|
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
|
|
@@ -6543,11 +6614,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
|
|
*/
|
|
*/
|
|
if (busiest->group_type == group_overloaded &&
|
|
if (busiest->group_type == group_overloaded &&
|
|
local->group_type == group_overloaded) {
|
|
local->group_type == group_overloaded) {
|
|
- load_above_capacity =
|
|
|
|
- (busiest->sum_nr_running - busiest->group_capacity_factor);
|
|
|
|
-
|
|
|
|
- load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);
|
|
|
|
- load_above_capacity /= busiest->group_capacity;
|
|
|
|
|
|
+ load_above_capacity = busiest->sum_nr_running *
|
|
|
|
+ SCHED_LOAD_SCALE;
|
|
|
|
+ if (load_above_capacity > busiest->group_capacity)
|
|
|
|
+ load_above_capacity -= busiest->group_capacity;
|
|
|
|
+ else
|
|
|
|
+ load_above_capacity = ~0UL;
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
/*
|
|
@@ -6610,6 +6682,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
|
|
local = &sds.local_stat;
|
|
local = &sds.local_stat;
|
|
busiest = &sds.busiest_stat;
|
|
busiest = &sds.busiest_stat;
|
|
|
|
|
|
|
|
+ /* ASYM feature bypasses nice load balance check */
|
|
if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
|
|
if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
|
|
check_asym_packing(env, &sds))
|
|
check_asym_packing(env, &sds))
|
|
return sds.busiest;
|
|
return sds.busiest;
|
|
@@ -6630,8 +6703,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
|
|
goto force_balance;
|
|
goto force_balance;
|
|
|
|
|
|
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
|
|
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
|
|
- if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
|
|
|
|
- !busiest->group_has_free_capacity)
|
|
|
|
|
|
+ if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
|
|
|
|
+ busiest->group_no_capacity)
|
|
goto force_balance;
|
|
goto force_balance;
|
|
|
|
|
|
/*
|
|
/*
|
|
@@ -6690,7 +6763,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
|
|
int i;
|
|
int i;
|
|
|
|
|
|
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
|
|
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
|
|
- unsigned long capacity, capacity_factor, wl;
|
|
|
|
|
|
+ unsigned long capacity, wl;
|
|
enum fbq_type rt;
|
|
enum fbq_type rt;
|
|
|
|
|
|
rq = cpu_rq(i);
|
|
rq = cpu_rq(i);
|
|
@@ -6719,9 +6792,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
|
|
continue;
|
|
continue;
|
|
|
|
|
|
capacity = capacity_of(i);
|
|
capacity = capacity_of(i);
|
|
- capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
|
|
|
|
- if (!capacity_factor)
|
|
|
|
- capacity_factor = fix_small_capacity(env->sd, group);
|
|
|
|
|
|
|
|
wl = weighted_cpuload(i);
|
|
wl = weighted_cpuload(i);
|
|
|
|
|
|
@@ -6729,7 +6799,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
|
|
* When comparing with imbalance, use weighted_cpuload()
|
|
* When comparing with imbalance, use weighted_cpuload()
|
|
* which is not scaled with the cpu capacity.
|
|
* which is not scaled with the cpu capacity.
|
|
*/
|
|
*/
|
|
- if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)
|
|
|
|
|
|
+
|
|
|
|
+ if (rq->nr_running == 1 && wl > env->imbalance &&
|
|
|
|
+ !check_cpu_capacity(rq, env->sd))
|
|
continue;
|
|
continue;
|
|
|
|
|
|
/*
|
|
/*
|
|
@@ -6777,6 +6849,19 @@ static int need_active_balance(struct lb_env *env)
|
|
return 1;
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ /*
|
|
|
|
+ * The dst_cpu is idle and the src_cpu has only 1 CFS task.
|
|
|
|
+ * It's worth migrating the task if the src_cpu's capacity is reduced
|
|
|
|
+ * because of other sched_class or IRQs if more capacity stays
|
|
|
|
+ * available on dst_cpu.
|
|
|
|
+ */
|
|
|
|
+ if ((env->idle != CPU_NOT_IDLE) &&
|
|
|
|
+ (env->src_rq->cfs.h_nr_running == 1)) {
|
|
|
|
+ if ((check_cpu_capacity(env->src_rq, sd)) &&
|
|
|
|
+ (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
|
|
|
|
+ return 1;
|
|
|
|
+ }
|
|
|
|
+
|
|
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
|
|
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
|
|
}
|
|
}
|
|
|
|
|
|
@@ -6876,6 +6961,9 @@ redo:
|
|
|
|
|
|
schedstat_add(sd, lb_imbalance[idle], env.imbalance);
|
|
schedstat_add(sd, lb_imbalance[idle], env.imbalance);
|
|
|
|
|
|
|
|
+ env.src_cpu = busiest->cpu;
|
|
|
|
+ env.src_rq = busiest;
|
|
|
|
+
|
|
ld_moved = 0;
|
|
ld_moved = 0;
|
|
if (busiest->nr_running > 1) {
|
|
if (busiest->nr_running > 1) {
|
|
/*
|
|
/*
|
|
@@ -6885,8 +6973,6 @@ redo:
|
|
* correctly treated as an imbalance.
|
|
* correctly treated as an imbalance.
|
|
*/
|
|
*/
|
|
env.flags |= LBF_ALL_PINNED;
|
|
env.flags |= LBF_ALL_PINNED;
|
|
- env.src_cpu = busiest->cpu;
|
|
|
|
- env.src_rq = busiest;
|
|
|
|
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
|
|
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
|
|
|
|
|
|
more_balance:
|
|
more_balance:
|
|
@@ -7586,22 +7672,25 @@ end:
|
|
|
|
|
|
/*
|
|
/*
|
|
* Current heuristic for kicking the idle load balancer in the presence
|
|
* Current heuristic for kicking the idle load balancer in the presence
|
|
- * of an idle cpu is the system.
|
|
|
|
|
|
+ * of an idle cpu in the system.
|
|
* - This rq has more than one task.
|
|
* - This rq has more than one task.
|
|
- * - At any scheduler domain level, this cpu's scheduler group has multiple
|
|
|
|
- * busy cpu's exceeding the group's capacity.
|
|
|
|
|
|
+ * - This rq has at least one CFS task and the capacity of the CPU is
|
|
|
|
+ * significantly reduced because of RT tasks or IRQs.
|
|
|
|
+ * - At parent of LLC scheduler domain level, this cpu's scheduler group has
|
|
|
|
+ * multiple busy cpus.
|
|
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
|
|
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
|
|
* domain span are idle.
|
|
* domain span are idle.
|
|
*/
|
|
*/
|
|
-static inline int nohz_kick_needed(struct rq *rq)
|
|
|
|
|
|
+static inline bool nohz_kick_needed(struct rq *rq)
|
|
{
|
|
{
|
|
unsigned long now = jiffies;
|
|
unsigned long now = jiffies;
|
|
struct sched_domain *sd;
|
|
struct sched_domain *sd;
|
|
struct sched_group_capacity *sgc;
|
|
struct sched_group_capacity *sgc;
|
|
int nr_busy, cpu = rq->cpu;
|
|
int nr_busy, cpu = rq->cpu;
|
|
|
|
+ bool kick = false;
|
|
|
|
|
|
if (unlikely(rq->idle_balance))
|
|
if (unlikely(rq->idle_balance))
|
|
- return 0;
|
|
|
|
|
|
+ return false;
|
|
|
|
|
|
/*
|
|
/*
|
|
* We may be recently in ticked or tickless idle mode. At the first
|
|
* We may be recently in ticked or tickless idle mode. At the first
|
|
@@ -7615,38 +7704,46 @@ static inline int nohz_kick_needed(struct rq *rq)
|
|
* balancing.
|
|
* balancing.
|
|
*/
|
|
*/
|
|
if (likely(!atomic_read(&nohz.nr_cpus)))
|
|
if (likely(!atomic_read(&nohz.nr_cpus)))
|
|
- return 0;
|
|
|
|
|
|
+ return false;
|
|
|
|
|
|
if (time_before(now, nohz.next_balance))
|
|
if (time_before(now, nohz.next_balance))
|
|
- return 0;
|
|
|
|
|
|
+ return false;
|
|
|
|
|
|
if (rq->nr_running >= 2)
|
|
if (rq->nr_running >= 2)
|
|
- goto need_kick;
|
|
|
|
|
|
+ return true;
|
|
|
|
|
|
rcu_read_lock();
|
|
rcu_read_lock();
|
|
sd = rcu_dereference(per_cpu(sd_busy, cpu));
|
|
sd = rcu_dereference(per_cpu(sd_busy, cpu));
|
|
-
|
|
|
|
if (sd) {
|
|
if (sd) {
|
|
sgc = sd->groups->sgc;
|
|
sgc = sd->groups->sgc;
|
|
nr_busy = atomic_read(&sgc->nr_busy_cpus);
|
|
nr_busy = atomic_read(&sgc->nr_busy_cpus);
|
|
|
|
|
|
- if (nr_busy > 1)
|
|
|
|
- goto need_kick_unlock;
|
|
|
|
|
|
+ if (nr_busy > 1) {
|
|
|
|
+ kick = true;
|
|
|
|
+ goto unlock;
|
|
|
|
+ }
|
|
|
|
+
|
|
}
|
|
}
|
|
|
|
|
|
- sd = rcu_dereference(per_cpu(sd_asym, cpu));
|
|
|
|
|
|
+ sd = rcu_dereference(rq->sd);
|
|
|
|
+ if (sd) {
|
|
|
|
+ if ((rq->cfs.h_nr_running >= 1) &&
|
|
|
|
+ check_cpu_capacity(rq, sd)) {
|
|
|
|
+ kick = true;
|
|
|
|
+ goto unlock;
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
|
|
|
|
+ sd = rcu_dereference(per_cpu(sd_asym, cpu));
|
|
if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
|
|
if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
|
|
- sched_domain_span(sd)) < cpu))
|
|
|
|
- goto need_kick_unlock;
|
|
|
|
-
|
|
|
|
- rcu_read_unlock();
|
|
|
|
- return 0;
|
|
|
|
|
|
+ sched_domain_span(sd)) < cpu)) {
|
|
|
|
+ kick = true;
|
|
|
|
+ goto unlock;
|
|
|
|
+ }
|
|
|
|
|
|
-need_kick_unlock:
|
|
|
|
|
|
+unlock:
|
|
rcu_read_unlock();
|
|
rcu_read_unlock();
|
|
-need_kick:
|
|
|
|
- return 1;
|
|
|
|
|
|
+ return kick;
|
|
}
|
|
}
|
|
#else
|
|
#else
|
|
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
|
|
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
|
|
@@ -7662,14 +7759,16 @@ static void run_rebalance_domains(struct softirq_action *h)
|
|
enum cpu_idle_type idle = this_rq->idle_balance ?
|
|
enum cpu_idle_type idle = this_rq->idle_balance ?
|
|
CPU_IDLE : CPU_NOT_IDLE;
|
|
CPU_IDLE : CPU_NOT_IDLE;
|
|
|
|
|
|
- rebalance_domains(this_rq, idle);
|
|
|
|
-
|
|
|
|
/*
|
|
/*
|
|
* If this cpu has a pending nohz_balance_kick, then do the
|
|
* If this cpu has a pending nohz_balance_kick, then do the
|
|
* balancing on behalf of the other idle cpus whose ticks are
|
|
* balancing on behalf of the other idle cpus whose ticks are
|
|
- * stopped.
|
|
|
|
|
|
+ * stopped. Do nohz_idle_balance *before* rebalance_domains to
|
|
|
|
+ * give the idle cpus a chance to load balance. Else we may
|
|
|
|
+ * load balance only within the local sched_domain hierarchy
|
|
|
|
+ * and abort nohz_idle_balance altogether if we pull some load.
|
|
*/
|
|
*/
|
|
nohz_idle_balance(this_rq, idle);
|
|
nohz_idle_balance(this_rq, idle);
|
|
|
|
+ rebalance_domains(this_rq, idle);
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
/*
|