@@ -693,6 +693,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
 static unsigned long task_h_load(struct task_struct *p);
+static unsigned long capacity_of(int cpu);
 
 /* Give new sched_entity start runnable values to heavy its load in infant time */
 void init_entity_runnable_average(struct sched_entity *se)
@@ -1456,7 +1457,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 static unsigned long weighted_cpuload(struct rq *rq);
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
-static unsigned long capacity_of(int cpu);
 
 /* Cached statistics for all CPUs within a node */
 struct numa_stats {
@@ -1464,8 +1464,6 @@ struct numa_stats {
 
 	/* Total compute capacity of CPUs on a node */
 	unsigned long compute_capacity;
-
-	unsigned int nr_running;
 };
 
 /*
@@ -1473,36 +1471,16 @@ struct numa_stats {
  */
 static void update_numa_stats(struct numa_stats *ns, int nid)
 {
-	int smt, cpu, cpus = 0;
-	unsigned long capacity;
+	int cpu;
 
 	memset(ns, 0, sizeof(*ns));
 	for_each_cpu(cpu, cpumask_of_node(nid)) {
 		struct rq *rq = cpu_rq(cpu);
 
-		ns->nr_running += rq->nr_running;
 		ns->load += weighted_cpuload(rq);
 		ns->compute_capacity += capacity_of(cpu);
-
-		cpus++;
 	}
 
-	/*
-	 * If we raced with hotplug and there are no CPUs left in our mask
-	 * the @ns structure is NULL'ed and task_numa_compare() will
-	 * not find this node attractive.
-	 *
-	 * We'll detect a huge imbalance and bail there.
-	 */
-	if (!cpus)
-		return;
-
-	/* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
-	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
-	capacity = cpus / smt; /* cores */
-
-	capacity = min_t(unsigned, capacity,
-		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
 }
 
 struct task_numa_env {
@@ -3723,6 +3701,29 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
 	WRITE_ONCE(p->se.avg.util_est, ue);
 }
 
+static inline int task_fits_capacity(struct task_struct *p, long capacity)
+{
+	return capacity * 1024 > task_util_est(p) * capacity_margin;
+}
+
+static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
+{
+	if (!static_branch_unlikely(&sched_asym_cpucapacity))
+		return;
+
+	if (!p) {
+		rq->misfit_task_load = 0;
+		return;
+	}
+
+	if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
+		rq->misfit_task_load = 0;
+		return;
+	}
+
+	rq->misfit_task_load = task_h_load(p);
+}
+
 #else /* CONFIG_SMP */
 
 #define UPDATE_TG	0x0
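
Note on the arithmetic above: task_fits_capacity() keeps roughly 20% headroom. Assuming capacity_margin is at its default of 1280, the test "capacity * 1024 > util * 1280" requires the task's estimated utilization to stay below about 80% of the CPU's capacity. A minimal user-space sketch of the same comparison, using made-up utilization and capacity values that are not part of the patch:

#include <stdio.h>

/* Same comparison as task_fits_capacity(), with capacity_margin assumed to be 1280 (~20%). */
static int fits_capacity(unsigned long util, unsigned long capacity)
{
	return capacity * 1024 > util * 1280;
}

int main(void)
{
	/* util 430 on a LITTLE CPU of capacity 512: 512*1024 = 524288 <= 430*1280 = 550400 -> misfit */
	printf("util 430, cap  512: %s\n", fits_capacity(430, 512) ? "fits" : "misfit");
	/* the same task on a big CPU of capacity 1024: 1024*1024 = 1048576 > 550400 -> fits */
	printf("util 430, cap 1024: %s\n", fits_capacity(430, 1024) ? "fits" : "misfit");
	return 0;
}

When the current task does not fit, update_misfit_status() records its task_h_load() in rq->misfit_task_load; the load-balance hunks below use that value both as a trigger and as the minimum imbalance to move.
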
@@ -3752,6 +3753,7 @@ util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
 static inline void
 util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
 		 bool task_sleep) {}
+static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
 
 #endif /* CONFIG_SMP */
 
@@ -6280,6 +6282,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 {
 	long min_cap, max_cap;
 
+	if (!static_branch_unlikely(&sched_asym_cpucapacity))
+		return 0;
+
 	min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
 	max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
 
@@ -6290,7 +6295,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 	/* Bring task utilization in sync with prev_cpu */
 	sync_entity_load_avg(&p->se);
 
-	return min_cap * 1024 < task_util(p) * capacity_margin;
+	return !task_fits_capacity(p, min_cap);
 }
 
 /*
@@ -6709,9 +6714,12 @@ done: __maybe_unused;
 	if (hrtick_enabled(rq))
 		hrtick_start_fair(rq, p);
 
+	update_misfit_status(p, rq);
+
 	return p;
 
 idle:
+	update_misfit_status(NULL, rq);
 	new_tasks = idle_balance(rq, rf);
 
 	/*
@@ -6917,6 +6925,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 enum fbq_type { regular, remote, all };
 
+enum group_type {
+	group_other = 0,
+	group_misfit_task,
+	group_imbalanced,
+	group_overloaded,
+};
+
 #define LBF_ALL_PINNED	0x01
 #define LBF_NEED_BREAK	0x02
 #define LBF_DST_PINNED	0x04
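
The position of group_misfit_task in the enum is deliberate: update_sd_pick_busiest() compares group_type values numerically, so a group carrying a misfit task outranks group_other but still yields to group_imbalanced and group_overloaded. A standalone sketch of that ordering check, using hypothetical group states that are not part of the patch:

#include <stdio.h>

enum group_type {
	group_other = 0,
	group_misfit_task,
	group_imbalanced,
	group_overloaded,
};

/* Mirrors the first test in update_sd_pick_busiest(): a higher group_type wins. */
static const char *pick(enum group_type busiest, enum group_type candidate)
{
	return candidate > busiest ? "candidate becomes busiest" : "keep current busiest";
}

int main(void)
{
	printf("misfit vs other:      %s\n", pick(group_other, group_misfit_task));
	printf("misfit vs overloaded: %s\n", pick(group_overloaded, group_misfit_task));
	return 0;
}
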
@@ -6947,6 +6962,7 @@ struct lb_env {
 	unsigned int		loop_max;
 
 	enum fbq_type		fbq_type;
+	enum group_type		src_grp_type;
 	struct list_head	tasks;
 };
 
@@ -7327,7 +7343,7 @@ static inline bool others_have_blocked(struct rq *rq)
 	if (READ_ONCE(rq->avg_dl.util_avg))
 		return true;
 
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
 	if (READ_ONCE(rq->avg_irq.util_avg))
 		return true;
 #endif
@@ -7490,12 +7506,6 @@ static unsigned long task_h_load(struct task_struct *p)
 
 /********** Helpers for find_busiest_group ************************/
 
-enum group_type {
-	group_other = 0,
-	group_imbalanced,
-	group_overloaded,
-};
-
 /*
  * sg_lb_stats - stats of a sched_group required for load_balancing
  */
@@ -7511,6 +7521,7 @@ struct sg_lb_stats {
 	unsigned int group_weight;
 	enum group_type group_type;
 	int group_no_capacity;
+	unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int nr_numa_running;
 	unsigned int nr_preferred_running;
@@ -7619,13 +7630,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 	cpu_rq(cpu)->cpu_capacity = capacity;
 	sdg->sgc->capacity = capacity;
 	sdg->sgc->min_capacity = capacity;
+	sdg->sgc->max_capacity = capacity;
 }
 
 void update_group_capacity(struct sched_domain *sd, int cpu)
 {
 	struct sched_domain *child = sd->child;
 	struct sched_group *group, *sdg = sd->groups;
-	unsigned long capacity, min_capacity;
+	unsigned long capacity, min_capacity, max_capacity;
 	unsigned long interval;
 
 	interval = msecs_to_jiffies(sd->balance_interval);
@@ -7639,6 +7651,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 
 	capacity = 0;
 	min_capacity = ULONG_MAX;
+	max_capacity = 0;
 
 	if (child->flags & SD_OVERLAP) {
 		/*
@@ -7669,6 +7682,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 			}
 
 			min_capacity = min(capacity, min_capacity);
+			max_capacity = max(capacity, max_capacity);
 		}
 	} else {
 		/*
@@ -7682,12 +7696,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 
 			capacity += sgc->capacity;
 			min_capacity = min(sgc->min_capacity, min_capacity);
+			max_capacity = max(sgc->max_capacity, max_capacity);
 			group = group->next;
 		} while (group != child->groups);
 	}
 
 	sdg->sgc->capacity = capacity;
 	sdg->sgc->min_capacity = min_capacity;
+	sdg->sgc->max_capacity = max_capacity;
 }
 
 /*
@@ -7783,16 +7799,27 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
 }
 
 /*
- * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
  * per-CPU capacity than sched_group ref.
  */
 static inline bool
-group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
 {
 	return sg->sgc->min_capacity * capacity_margin <
 						ref->sgc->min_capacity * 1024;
 }
 
+/*
+ * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-CPU capacity_orig than sched_group ref.
+ */
+static inline bool
+group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+	return sg->sgc->max_capacity * capacity_margin <
+						ref->sgc->max_capacity * 1024;
+}
+
 static inline enum
 group_type group_classify(struct sched_group *group,
 			  struct sg_lb_stats *sgs)
@@ -7803,6 +7830,9 @@ group_type group_classify(struct sched_group *group,
 	if (sg_imbalanced(group))
 		return group_imbalanced;
 
+	if (sgs->group_misfit_task_load)
+		return group_misfit_task;
+
 	return group_other;
 }
 
@@ -7835,7 +7865,7 @@ static bool update_nohz_stats(struct rq *rq, bool force)
  * @load_idx: Load index of sched_domain of this_cpu for load calc.
  * @local_group: Does group contain this_cpu.
  * @sgs: variable to hold the statistics for this group.
- * @overload: Indicate more than one runnable task for any CPU.
+ * @overload: Indicate pullable load (e.g. >1 runnable task).
  */
 static inline void update_sg_lb_stats(struct lb_env *env,
 			struct sched_group *group, int load_idx,
@@ -7877,6 +7907,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		 */
 		if (!nr_running && idle_cpu(i))
 			sgs->idle_cpus++;
+
+		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
+		    sgs->group_misfit_task_load < rq->misfit_task_load) {
+			sgs->group_misfit_task_load = rq->misfit_task_load;
+			*overload = 1;
+		}
 	}
 
 	/* Adjust by relative CPU capacity of the group */
@@ -7912,6 +7948,17 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 {
 	struct sg_lb_stats *busiest = &sds->busiest_stat;
 
+	/*
+	 * Don't try to pull misfit tasks we can't help.
+	 * We can use max_capacity here as reduction in capacity on some
+	 * CPUs in the group should either be possible to resolve
+	 * internally or be covered by avg_load imbalance (eventually).
+	 */
+	if (sgs->group_type == group_misfit_task &&
+	    (!group_smaller_max_cpu_capacity(sg, sds->local) ||
+	     !group_has_capacity(env, &sds->local_stat)))
+		return false;
+
 	if (sgs->group_type > busiest->group_type)
 		return true;
 
@@ -7931,7 +7978,14 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	 * power/energy consequences are not considered.
 	 */
 	if (sgs->sum_nr_running <= sgs->group_weight &&
-	    group_smaller_cpu_capacity(sds->local, sg))
+	    group_smaller_min_cpu_capacity(sds->local, sg))
+		return false;
+
+	/*
+	 * If we have more than one misfit sg go with the biggest misfit.
+	 */
+	if (sgs->group_type == group_misfit_task &&
+	    sgs->group_misfit_task_load < busiest->group_misfit_task_load)
 		return false;
 
 asym_packing:
@@ -8002,11 +8056,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
 	struct sg_lb_stats tmp_sgs;
-	int load_idx, prefer_sibling = 0;
+	int load_idx;
 	bool overload = false;
-
-	if (child && child->flags & SD_PREFER_SIBLING)
-		prefer_sibling = 1;
+	bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
 
 #ifdef CONFIG_NO_HZ_COMMON
 	if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
@@ -8080,8 +8132,8 @@ next_group:
 
 	if (!env->sd->parent) {
 		/* update overload indicator if we are at root domain */
-		if (env->dst_rq->rd->overload != overload)
-			env->dst_rq->rd->overload = overload;
+		if (READ_ONCE(env->dst_rq->rd->overload) != overload)
+			WRITE_ONCE(env->dst_rq->rd->overload, overload);
 	}
 }
 
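
root_domain::overload is now written with WRITE_ONCE() because it is read locklessly, for example in the idle_balance() hunk near the end of this patch; the READ_ONCE()/WRITE_ONCE() pair keeps the compiler from tearing, caching or re-reading the flag. A reduced sketch of the pattern, with simplified stand-ins for the kernel macros rather than the kernel implementation itself:

#include <stdio.h>

#define WRITE_ONCE(x, val)	(*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))

static int overload;

/* Writer side: only store when the value actually changes, as in update_sd_lb_stats(). */
static void set_overload(int new_val)
{
	if (READ_ONCE(overload) != new_val)
		WRITE_ONCE(overload, new_val);
}

/* Lockless reader side, as in idle_balance(). */
static int overloaded(void)
{
	return READ_ONCE(overload);
}

int main(void)
{
	set_overload(1);
	printf("overloaded: %d\n", overloaded());
	return 0;
}
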
@@ -8231,8 +8283,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 * factors in sg capacity and sgs with smaller group_type are
 	 * skipped when updating the busiest sg:
 	 */
-	if (busiest->avg_load <= sds->avg_load ||
-	    local->avg_load >= sds->avg_load) {
+	if (busiest->group_type != group_misfit_task &&
+	    (busiest->avg_load <= sds->avg_load ||
+	     local->avg_load >= sds->avg_load)) {
 		env->imbalance = 0;
 		return fix_small_imbalance(env, sds);
 	}
@@ -8266,6 +8319,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 		(sds->avg_load - local->avg_load) * local->group_capacity
 	) / SCHED_CAPACITY_SCALE;
 
+	/* Boost imbalance to allow misfit task to be balanced. */
+	if (busiest->group_type == group_misfit_task) {
+		env->imbalance = max_t(long, env->imbalance,
+				       busiest->group_misfit_task_load);
+	}
+
 	/*
 	 * if *imbalance is less than the average load per runnable task
 	 * there is no guarantee that any tasks will be moved so we'll have
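
Worked example of the boost above, with hypothetical numbers: if the avg_load computation yields env->imbalance = 120 but the misfit task's task_h_load() is 600, max_t() raises the imbalance to 600 so detach_tasks() is permitted to move that single large task instead of giving up on a "small" imbalance. In sketch form:

#include <stdio.h>

#define max_t(type, a, b)	((type)(a) > (type)(b) ? (type)(a) : (type)(b))

int main(void)
{
	long imbalance = 120;			/* hypothetical result of the avg_load arithmetic */
	long group_misfit_task_load = 600;	/* hypothetical task_h_load() of the misfit task */

	/* Boost imbalance to allow the misfit task to be balanced. */
	imbalance = max_t(long, imbalance, group_misfit_task_load);
	printf("imbalance = %ld\n", imbalance);	/* prints 600 */
	return 0;
}
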
@@ -8332,6 +8391,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	    busiest->group_no_capacity)
 		goto force_balance;
 
+	/* Misfit tasks should be dealt with regardless of the avg load */
+	if (busiest->group_type == group_misfit_task)
+		goto force_balance;
+
 	/*
 	 * If the local group is busier than the selected busiest group
 	 * don't try and pull any tasks.
@@ -8369,6 +8432,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 
 force_balance:
 	/* Looks like there is an imbalance. Compute it */
+	env->src_grp_type = busiest->group_type;
 	calculate_imbalance(env, &sds);
 	return env->imbalance ? sds.busiest : NULL;
 
@@ -8416,8 +8480,32 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		if (rt > env->fbq_type)
 			continue;
 
+		/*
+		 * For ASYM_CPUCAPACITY domains with misfit tasks we simply
+		 * seek the "biggest" misfit task.
+		 */
+		if (env->src_grp_type == group_misfit_task) {
+			if (rq->misfit_task_load > busiest_load) {
+				busiest_load = rq->misfit_task_load;
+				busiest = rq;
+			}
+
+			continue;
+		}
+
 		capacity = capacity_of(i);
 
+		/*
+		 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
+		 * eventually lead to active_balancing high->low capacity.
+		 * Higher per-CPU capacity is considered better than balancing
+		 * average load.
+		 */
+		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
+		    capacity_of(env->dst_cpu) < capacity &&
+		    rq->nr_running == 1)
+			continue;
+
 		wl = weighted_cpuload(rq);
 
 		/*
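
With env->src_grp_type == group_misfit_task, the queue scan above ignores weighted load entirely and simply keeps the runqueue carrying the largest misfit_task_load, while the SD_ASYM_CPUCAPACITY check below it refuses to pull a lone task down to a lower-capacity destination. A compact sketch of the "biggest misfit" selection, over a hypothetical set of runqueues that is not part of the patch:

#include <stdio.h>

struct rq_sketch { int cpu; unsigned long misfit_task_load; };

int main(void)
{
	struct rq_sketch rqs[] = { {4, 0}, {5, 350}, {6, 600}, {7, 420} };
	struct rq_sketch *busiest = NULL;
	unsigned long busiest_load = 0;

	for (int i = 0; i < 4; i++) {
		/* Same rule as the hunk above: keep the largest misfit load. */
		if (rqs[i].misfit_task_load > busiest_load) {
			busiest_load = rqs[i].misfit_task_load;
			busiest = &rqs[i];
		}
	}
	printf("busiest: cpu%d (misfit load %lu)\n", busiest->cpu, busiest_load);	/* cpu6 */
	return 0;
}
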
@@ -8485,6 +8573,9 @@ static int need_active_balance(struct lb_env *env)
 			return 1;
 	}
 
+	if (env->src_grp_type == group_misfit_task)
+		return 1;
+
 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
 
@@ -9127,7 +9218,7 @@ static void nohz_balancer_kick(struct rq *rq)
 	if (time_before(now, nohz.next_balance))
 		goto out;
 
-	if (rq->nr_running >= 2) {
+	if (rq->nr_running >= 2 || rq->misfit_task_load) {
 		flags = NOHZ_KICK_MASK;
 		goto out;
 	}
@@ -9496,7 +9587,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
 	rq_unpin_lock(this_rq, rf);
 
 	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
-	    !this_rq->rd->overload) {
+	    !READ_ONCE(this_rq->rd->overload)) {
 
 		rcu_read_lock();
 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
@@ -9658,6 +9749,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 
 	if (static_branch_unlikely(&sched_numa_balancing))
 		task_tick_numa(rq, curr);
+
+	update_misfit_status(curr, rq);
 }
 
 /*